RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
SynthonSpace.h
Go to the documentation of this file.
1//
2// Copyright (C) David Cosgrove 2024.
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#ifndef RDKIT_SYNTHONSPACE_H
11#define RDKIT_SYNTHONSPACE_H
12
13/*! \file SynthonSpace.h
14
15 \brief contains a class for searching combinatorial libraries in
16 Synthon format such as Enamine REAL.
17
18 \b Note that this functionality is experimental and the API may change
19 in future releases.
20*/
21
22#include <map>
23#include <string>
24#include <vector>
25
26#include <boost/dynamic_bitset.hpp>
27
28#include <RDGeneral/export.h>
32
33namespace RDKit {
34class ROMol;
35
36namespace SynthonSpaceSearch {
37
38// This is the maximum number of connectors that we can deal with at the moment.
39// In reality, there may be fewer than this. However, the key limit is in
40// the symbols used for the connectors in Enamine REAL etc.
41const std::vector<std::string> CONNECTOR_SYMBOLS{"[U]", "[Np]", "[Pu]", "[Am]"};
42constexpr unsigned int MAX_CONNECTOR_NUM{4};
43
45 int maxBondSplits{MAX_CONNECTOR_NUM}; // The maximum number of bonds to break
46 // in the query. It should be no more
47 // than the maximum number of connector
48 // types in the SynthonSpace. At
49 // present this is 4. Specifying more
50 // than that will not matter as it will
51 // be reduced to 4. Likewise, values
52 // lower than 1 will be increased to 1.
53 std::uint64_t maxNumFrags{
54 100000}; // The maximum number of fragments the query can
55 // be broken into. Big molecules will create huge
56 // numbers of fragments that may cause excessive
57 // memory use. If the number of fragments hits this number,
58 // fragmentation stops and the search results will likely be
59 // incomplete.
60 std::int64_t maxHits{1000}; // The maximum number of hits to return. Use
61 // -1 for no maximum.
62 std::int64_t hitStart{0}; // Sequence number of hit to start from. So that
63 // you can return the next N hits of a search
64 // having already obtained N-1.
65 bool randomSample{false}; // If true, returns a random sample of the
66 // hits, up to maxHits in number.
67 int randomSeed{-1}; // Seed for random-number generator. -1 means use
68 // a random seed (std::random_device).
69 bool buildHits{true}; // If false, reports the maximum number of hits that
70 // the search could produce, but doesn't return them.
71 int numRandomSweeps{10}; // The random sampling doesn't always produce the
72 // required number of hits in 1 go. This parameter
73 // controls how many loops it makes to try and get
74 // the hits before giving up.
75 double similarityCutoff{0.5}; // Similarity cutoff for returning hits by
76 // fingerprint similarity. The default is
77 // appropriate for a Morgan fingerprint of
78 // radius=2, it may need changing for other
79 // fingerprint types.
80 double fragSimilarityAdjuster{
81 0.1}; // Similarity values for fragments are generally low
82 // due to low bit densities. For the fragment
83 // matching, reduce the similarity cutoff
84 // by this amount. A higher number will give slower search
85 // times, a lower number will give faster searches at the
86 // risk of missing some hits. The value you use should have
87 // a positive correlation with your FOMO.
88 double approxSimilarityAdjuster{
89 0.1}; // The fingerprint search uses an approximate similarity method
90 // before building a product and doing a final check. The
91 // similarityCutoff is reduced by this value for the approximate
92 // check. A lower value will give faster run times at the
93 // risk of missing some hits. The value you use should have a
94 // positive correlation with your FOMO. The default is
95 // appropriate for Morgan fingerprints. With RDKit fingerprints,
96 // 0.05 is adequate, and higher than that has been seen to
97 // produce long run times.
98 std::uint64_t timeOut{600}; // Maximum number of seconds to spend on a single
99 // search. 0 means no maximum.
100};
101
102// Holds the information about a set of hits. The molecules can be built
103// by making all combinations of synthons, one taken from each synthon set.
106 SynthonSpaceHitSet(const std::string &id,
107 const std::vector<std::vector<size_t>> &stu)
108 : reactionId(id), synthonsToUse(stu) {
109 numHits = std::accumulate(  // product of the synthon list sizes
110 synthonsToUse.begin(), synthonsToUse.end(), size_t(1),
111 [](const size_t prevRes, const std::vector<size_t> &s2) -> size_t {
112 return prevRes * s2.size();  // running product stays in size_t, no int narrowing
113 });
114 }
115 std::string reactionId;
116 std::vector<std::vector<size_t>> synthonsToUse;
117 size_t numHits{0};
118};
119
121 public:
122 // Create an empty SynthonSpace. Fill it with readTextFile() or readDBFile().
123 explicit SynthonSpace() = default;
124 SynthonSpace(const SynthonSpace &other) = delete;
125 SynthonSpace &operator=(const SynthonSpace &other) = delete;
126 // Get the number of different reactions in the SynthonSpace.
127 /*!
128 *
129 * @return the number of entries in the reaction map, as a size_t
130 */
131 size_t getNumReactions() const { return d_reactions.size(); }
132 const std::map<std::string, std::unique_ptr<SynthonSet>> &getReactions()
133 const {
134 return d_reactions;  // handed back by const reference: no copy is made
135 }
136
137 // Get the total number of products that the SynthonSpace could produce.
138 /*!
139 *
140 * @return std::int64_t
141 */
142 std::int64_t getNumProducts() const;
143
144 std::string getSynthonFingerprintType() const { return d_fpType; }  // returns a copy of d_fpType
145
146 // Perform a substructure search with the given query molecule across
147 // the synthonspace library. Duplicate SMILES strings produced by different
148 // reactions will be returned.
149 /*!
150 *
151 * @param query : query molecule
152 * @param params : (optional) settings for the search
153 * @return : the hits as a SearchResults object.
154 */
156 const ROMol &query,
158
159 // Perform a fingerprint similarity search with the given query molecule
160 // across the synthonspace library. Duplicate SMILES strings produced by
161 // different reactions will be returned.
162 /*!
163 *
164 * @param query : query molecule
165 * @param fpGen: a FingerprintGenerator object that will provide the
166 * fingerprints for the similarity calculation
167 * @param params : (optional) settings for the search
168 * @return : the hits as a SearchResults object.
169 */
171 const ROMol &query, const FingerprintGenerator<std::uint64_t> &fpGen,
173
174 /*!
175 *
176 * @param inFilename: name of the file containing the synthon-based library.
177 *
178 * The original format is:
179 * all lines are tab-separated
180 * first line:SMILES synton_id synton# reaction_id
181 * Note the spelling "synton" from the original paper/example file.
182 * Subsequent lines have a single reagent e.g.
183 * OCC([U])=NN=[Np] 1-1 0 triazole-1
184 * C1CCCC1N([Pu])[U] 2-1 1 triazole-1
185 * CC1CCN(C1)C(=[Np])[Pu] 3-1 2 triazole-1
186 *
187 * Other acceptable formats are as above, but with a 5th column "release":
188 * SMILES synton_id synton# reaction_id release
189 *
190 * or a comma-separated equivalent of the first format:
191 * SMILES,synton_id,synton_role,reaction_id
192 * where the 3rd column is named differently but has the same meaning.
193 * The formatting of the first 2 formats has been relaxed such that any
194 * whitespace may be used as the field separator.
195 *
196 * Attachment points are U, Np, Pu and Am for up to 4 synthons per reaction.
197 * A product is created by taking a synthon from each synton# value and
198 * combining them: matching trans-uranic connector atoms are removed and
199 * replaced with a direct bond of the appropriate type.
200 * A more (for RDKit) conventional connection flag of isotope labelled
201 * dummy atoms is also accepted ([1*] etc.).
202 * Throws a std::runtime_error if it doesn't think the format is correct,
203 * which it does by checking that the first line is as above and subsequent
204 * lines have appropriate number of fields.
205 */
206 void readTextFile(const std::string &inFilename);
207
208 // Writes to/reads from a binary DB File in our format.
209 /*!
210 *
211 * @param outFilename: the name of the file to write.
212 */
213 void writeDBFile(const std::string &outFilename) const;
214 /*!
215 *
216 * @param inFilename: the name of the file to read.
217 */
218 void readDBFile(const std::string &inFilename);
219
220 // Write a summary of the SynthonSpace to given stream.
221 /*!
222 *
223 * @param os: stream
224 */
225 void summarise(std::ostream &os) const;
226
227 // Writes the enumerated library to file in SMILES format (1 compound
228 // per line, SMILES name).
229 /*!
230 @param outFilename: name of the file to write
231 */
232 void writeEnumeratedFile(const std::string &outFilename) const;
233
234 bool hasFingerprints() const;
235 // Create the fingerprints for the synthons ready for fingerprint searches.
236 // Will be done by the fingerprint search if not done ahead of time.
239
241 // Create the add and subtract fingerprints for the SynthonSets.
242 // Will be done by the fingerprint search if not done ahead of time.
245
246 private:
247 std::string d_fileName;
248 std::map<std::string, std::unique_ptr<SynthonSet>> d_reactions;
249
250 // For the similarity search, this records the generator used for
251 // creating synthon fingerprints that are read from a binary file.
252 std::string d_fpType;
253};
254
255} // namespace SynthonSpaceSearch
256} // namespace RDKit
257
258#endif // RDKIT_SYNTHONSPACE_H
class that generates same fingerprint style for different output formats
std::string getSynthonFingerprintType() const
const std::map< std::string, std::unique_ptr< SynthonSet > > & getReactions() const
void readTextFile(const std::string &inFilename)
void summarise(std::ostream &os) const
SynthonSpace & operator=(const SynthonSpace &other)=delete
SearchResults substructureSearch(const ROMol &query, const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
SynthonSpace(const SynthonSpace &other)=delete
void writeEnumeratedFile(const std::string &outFilename) const
void writeDBFile(const std::string &outFilename) const
SearchResults fingerprintSearch(const ROMol &query, const FingerprintGenerator< std::uint64_t > &fpGen, const SynthonSpaceSearchParams &params=SynthonSpaceSearchParams())
void readDBFile(const std::string &inFilename)
void buildAddAndSubstractFingerprints(const FingerprintGenerator< std::uint64_t > &fpGen)
void buildSynthonFingerprints(const FingerprintGenerator< std::uint64_t > &fpGen)
#define RDKIT_SYNTHONSPACESEARCH_EXPORT
Definition export.h:545
constexpr unsigned int MAX_CONNECTOR_NUM
const std::vector< std::string > CONNECTOR_SYMBOLS
Std stuff.
std::vector< std::vector< size_t > > synthonsToUse
SynthonSpaceHitSet(const std::string &id, const std::vector< std::vector< size_t > > &stu)