RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
MolSupplier.v1API.h
Go to the documentation of this file.
1//
2// Copyright (C) 2024 greg landrum and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#ifndef RD_MOLSUPPLIER_v1_H
11#define RD_MOLSUPPLIER_v1_H
12
13namespace RDKit {
14inline namespace v1 {
15/*!
16//
17// Here are a couple of ways one can interact with MolSuppliers:
18//
19// 1) Lazy (ForwardIterator):
20// while(!supplier.atEnd()){
21// ROMol *mol = supplier.next();
22// if(mol){
23// do something;
24// }
25// }
26// 2) Random Access:
27// for(int i=0;i<supplier.length();i++){
28// ROMol *mol = supplier[i];
29// if(mol){
30// do something;
31// }
32// }
33//
34//
35*/
37 // this is an abstract base class to supply molecules one at a time
38 public:
40 virtual ~MolSupplier() {}
41 void init() {
42 if (dp_supplier) {
43 dp_supplier->init();
44 }
45 }
46 void reset() {
47 if (dp_supplier) {
48 dp_supplier->reset();
49 }
50 }
51
52 bool atEnd() {
53 if (dp_supplier) {
54 return dp_supplier->atEnd();
55 }
56 return true;
57 }
59 PRECONDITION(dp_supplier, "no supplier");
60 return dp_supplier->next().release();
61 }
62
63 virtual void close() {
64 if (dp_supplier) {
65 dp_supplier->close();
66 }
67 }
68
69 private:
70 // disable automatic copy constructors and assignment operators
71 // for this class and its subclasses. They will likely be
72 // carrying around stream pointers and copying those is a recipe
73 // for disaster.
74 MolSupplier(const MolSupplier &);
75 MolSupplier &operator=(const MolSupplier &);
76
77 protected:
78 std::unique_ptr<v2::FileParsers::MolSupplier> dp_supplier;
79};
80
81//! \brief a supplier from an SD file that only reads forward
83 /*************************************************************************
84 * A lazy mol supplier from a SD file.
85 * - When new molecules are read using "next" their positions in the file are
86 *noted.
87 ***********************************************************************************/
88 public:
91
// Builds a forward-only SD supplier on top of `inStream`: the three flags are
// copied into `params` and forwarded, together with `inStream` and
// `takeOwnership`, to a newly allocated v2::FileParsers::ForwardSDMolSupplier.
// Note that strictParsing defaults to false in this constructor.
// NOTE(review): the declaration of `params` is not visible in this listing
// (the embedded line numbers jump from 95 to 97) - confirm its type against
// the real header.
92 explicit ForwardSDMolSupplier(std::istream *inStream,
93 bool takeOwnership = true, bool sanitize = true,
94 bool removeHs = true,
95 bool strictParsing = false) {
97 params.sanitize = sanitize;
98 params.removeHs = removeHs;
99 params.strictParsing = strictParsing;
// the wrapped v2 supplier is stored in the base-class dp_supplier pointer
100 dp_supplier.reset(new v2::FileParsers::ForwardSDMolSupplier(
101 inStream, takeOwnership, params));
102 };
103
105
106 void setProcessPropertyLists(bool val) {
107 PRECONDITION(dp_supplier, "no supplier");
108 static_cast<ContainedType *>(dp_supplier.get())
109 ->setProcessPropertyLists(val);
110 }
112 if (dp_supplier) {
113 return static_cast<ContainedType *>(dp_supplier.get())
114 ->getProcessPropertyLists();
115 }
116 return false;
117 }
118
119 bool getEOFHitOnRead() const {
120 if (dp_supplier) {
121 return static_cast<ContainedType *>(dp_supplier.get())->getEOFHitOnRead();
122 }
123 return false;
124 }
125};
126
127//! \brief a lazy supplier from an SD file
129 /*************************************************************************
130 * A lazy mol supplier from a SD file.
131 * - When new molecules are read using "next" their positions in the file are
132 *noted.
133 * - A call to the "length" will automatically parse the entire file and
134 *cache all the mol
135 * block positions
136 * - [] operator is used to access a molecule at "idx", calling next
137 *following this will result
138 * in the next molecule after "idx"
139 ***********************************************************************************/
140
141 public:
143 SDMolSupplier() { dp_supplier.reset(new ContainedType()); }
144
145 /*!
146 * \param fileName - the name of the SD file
147 * \param sanitize - if true sanitize the molecule before returning it
148 * \param removeHs - if true remove Hs from the molecule before returning it
149 * (triggers sanitization)
150 * \param strictParsing - if set to false, the parser is more lax about
151 * correctness
152 * of the contents.
153 */
// Copies the three flags into `params` and wraps a newly allocated
// v2::FileParsers::SDMolSupplier constructed from `fileName` and `params`.
// NOTE(review): the declaration of `params` is not visible in this listing
// (the embedded line numbers jump from 155 to 157) - confirm its type against
// the real header.
154 explicit SDMolSupplier(const std::string &fileName, bool sanitize = true,
155 bool removeHs = true, bool strictParsing = true) {
157 params.sanitize = sanitize;
158 params.removeHs = removeHs;
159 params.strictParsing = strictParsing;
160 dp_supplier.reset(new v2::FileParsers::SDMolSupplier(fileName, params));
161 }
162
// Stream-based counterpart of the file-name constructor: the flags are copied
// into `params`, which is forwarded with `inStream` and `takeOwnership` to a
// newly allocated v2::FileParsers::SDMolSupplier.
// NOTE(review): the declaration of `params` is not visible in this listing
// (the embedded line numbers jump from 165 to 167) - confirm its type against
// the real header.
163 explicit SDMolSupplier(std::istream *inStream, bool takeOwnership = true,
164 bool sanitize = true, bool removeHs = true,
165 bool strictParsing = true) {
167 params.sanitize = sanitize;
168 params.removeHs = removeHs;
169 params.strictParsing = strictParsing;
170 dp_supplier.reset(
171 new v2::FileParsers::SDMolSupplier(inStream, takeOwnership, params));
172 }
173
174 void moveTo(unsigned int idx) {
175 PRECONDITION(dp_supplier, "no supplier");
176 static_cast<ContainedType *>(dp_supplier.get())->moveTo(idx);
177 }
178 ROMol *operator[](unsigned int idx) {
179 PRECONDITION(dp_supplier, "no supplier");
180 return static_cast<ContainedType *>(dp_supplier.get())
181 ->operator[](idx)
182 .release();
183 }
184 /*! \brief returns the text block for a particular item
185 *
186 * \param idx - which item to return
187 */
188 std::string getItemText(unsigned int idx) {
189 PRECONDITION(dp_supplier, "no supplier");
190 return static_cast<ContainedType *>(dp_supplier.get())->getItemText(idx);
191 }
192 unsigned int length() {
193 PRECONDITION(dp_supplier, "no supplier");
194 return static_cast<ContainedType *>(dp_supplier.get())->length();
195 }
// Hands `text` to the wrapped supplier's setData() together with `params`,
// after copying `sanitize` and `removeHs` into it. strictParsing is not
// touched by this overload. A supplier must be present (PRECONDITION).
// NOTE(review): the declaration of `params` is not visible in this listing
// (the embedded line numbers jump from 198 to 200) - confirm its type against
// the real header.
196 void setData(const std::string &text, bool sanitize = true,
197 bool removeHs = true) {
198 PRECONDITION(dp_supplier, "no supplier");
200 params.sanitize = sanitize;
201 params.removeHs = removeHs;
202 static_cast<ContainedType *>(dp_supplier.get())->setData(text, params);
203 }
204 void setData(const std::string &text, bool sanitize, bool removeHs,
205 bool strictParsing) {
207 params.sanitize = sanitize;
208 params.removeHs = removeHs;
209 params.strictParsing = strictParsing;
210 static_cast<ContainedType *>(dp_supplier.get())->setData(text, params);
211 }
212 /*! Resets our internal state and sets the indices of molecules in the stream.
213 * The client should be *very* careful about calling this method, as it's
214 *trivial
215 * to end up with a completely useless supplier.
216 *
217 * \param locs - the vector of stream positions.
218 *
219 * Note that this can be used not only to make reading selected molecules
220 *from a
221 * large SD file much faster, but it can also allow subsetting an SD file or
222 * rearranging the order of the molecules.
223 */
224 void setStreamIndices(const std::vector<std::streampos> &locs) {
225 PRECONDITION(dp_supplier, "no supplier");
226 static_cast<ContainedType *>(dp_supplier.get())->setStreamIndices(locs);
227 }
228};
229
230//! lazy file parser for Smiles tables
232 /**************************************************************************
233 * Lazy file parser for Smiles table file, similar to the lazy SD
234 * file parser above
235 * - As and when new molecules are read using "next", their
236 * positions in the file are noted.
237 * - A call to the "length" will automatically parse the entire
238 * file and cache all the mol block positions
239 * - [] operator is used to access a molecule at "idx", calling
240 * next following this will result in the next molecule after
241 * "idx"
242 ***************************************************************************/
243 public:
245 /*!
246 * \param fileName - the name of smiles table file
247 * \param delimiter - delimiting characters between records on each
248 * line NOTE that this is not a string, the tokenizer looks for
249 * the individual characters in delimiter, not the full string
250 * itself. So the default delimiter: " \t", means " " or "\t".
251 * \param smilesColumn - column number for the SMILES string (defaults
252 * to the first column)
253 * \param nameColumn - column number for the molecule name (defaults to
254 * the second column) If set to -1 we assume that no name is
255 * available for the molecule and the name is defaulted to the
256 * smiles string
257 * \param titleLine - if true, the first line is assumed to list the
258 * names of properties in order, separated by 'delimiter'. It is
259 * also assumed that the 'SMILES' column and the 'name' column
260 * are not specified here. If false, no title line is assumed
261 * and the properties are recorded as "columnX", where "X" is
262 * the column number
263 * \param sanitize - if true sanitize the molecule before returning it
264 */
// Copies the table-layout arguments into `params` (the sanitize flag goes to
// params.parseParameters) and wraps a newly allocated
// v2::FileParsers::SmilesMolSupplier constructed from `fileName` and `params`.
// NOTE(review): the declaration of `params` is not visible in this listing
// (the embedded line numbers jump from 268 to 270) - confirm its type against
// the real header.
265 explicit SmilesMolSupplier(const std::string &fileName,
266 const std::string &delimiter = " \t",
267 int smilesColumn = 0, int nameColumn = 1,
268 bool titleLine = true, bool sanitize = true) {
270 params.delimiter = delimiter;
271 params.smilesColumn = smilesColumn;
272 params.nameColumn = nameColumn;
273 params.titleLine = titleLine;
274 params.parseParameters.sanitize = sanitize;
275 dp_supplier.reset(new v2::FileParsers::SmilesMolSupplier(fileName, params));
276 }
// Stream-based counterpart of the file-name constructor: the same fields of
// `params` are filled in and forwarded with `inStream` and `takeOwnership` to
// a newly allocated v2::FileParsers::SmilesMolSupplier.
// NOTE(review): the declaration of `params` is not visible in this listing
// (the embedded line numbers jump from 280 to 282) - confirm its type against
// the real header.
277 explicit SmilesMolSupplier(std::istream *inStream, bool takeOwnership = true,
278 const std::string &delimiter = " \t",
279 int smilesColumn = 0, int nameColumn = 1,
280 bool titleLine = true, bool sanitize = true) {
282 params.delimiter = delimiter;
283 params.smilesColumn = smilesColumn;
284 params.nameColumn = nameColumn;
285 params.titleLine = titleLine;
286 params.parseParameters.sanitize = sanitize;
287 dp_supplier.reset(new v2::FileParsers::SmilesMolSupplier(
288 inStream, takeOwnership, params));
289 }
290 SmilesMolSupplier() { dp_supplier.reset(new ContainedType()); }
291
// Hands `text` to the wrapped supplier's setData() together with `params`,
// after copying the table-layout arguments into it (the sanitize flag goes to
// params.parseParameters). A supplier must be present (PRECONDITION).
// Note: the default delimiter here is a single space, whereas both
// constructors default to " \t".
// NOTE(review): the declaration of `params` is not visible in this listing
// (the embedded line numbers jump from 295 to 297) - confirm its type against
// the real header.
292 void setData(const std::string &text, const std::string &delimiter = " ",
293 int smilesColumn = 0, int nameColumn = 1, bool titleLine = true,
294 bool sanitize = true) {
295 PRECONDITION(dp_supplier, "no supplier");
297 params.delimiter = delimiter;
298 params.smilesColumn = smilesColumn;
299 params.nameColumn = nameColumn;
300 params.titleLine = titleLine;
301 params.parseParameters.sanitize = sanitize;
302 static_cast<ContainedType *>(dp_supplier.get())->setData(text, params);
303 }
304 void moveTo(unsigned int idx) {
305 PRECONDITION(dp_supplier, "no supplier");
306 static_cast<ContainedType *>(dp_supplier.get())->moveTo(idx);
307 }
308 ROMol *operator[](unsigned int idx) {
309 PRECONDITION(dp_supplier, "no supplier");
310 return static_cast<ContainedType *>(dp_supplier.get())
311 ->operator[](idx)
312 .release();
313 }
314 /*! \brief returns the text block for a particular item
315 *
316 * \param idx - which item to return
317 */
318 std::string getItemText(unsigned int idx) {
319 PRECONDITION(dp_supplier, "no supplier");
320 return static_cast<ContainedType *>(dp_supplier.get())->getItemText(idx);
321 }
322 unsigned int length() {
323 PRECONDITION(dp_supplier, "no supplier")
324 return static_cast<ContainedType *>(dp_supplier.get())->length();
325 }
326};
327
328//! lazy file parser for TDT files
330 /**************************************************************************
331 * Lazy file parser for TDT files, similar to the lazy SD
332 * file parser above
333 * - As and when new molecules are read using "next", their
334 * positions in the file are noted.
335 * - A call to the "length" will automatically parse the entire
336 * file and cache all the mol block positions
337 * - [] operator is used to access a molecule at "idx", calling
338 * next following this will result in the next molecule after
339 * "idx"
340 ***************************************************************************/
341 public:
343 /*!
344 * \param fileName - the name of the TDT file
345 * \param nameRecord - property name for the molecule name.
346 * If empty (the default), the name defaults to be empty
347 * \param confId2D - if >=0 and 2D coordinates are provided, the 2D
348 * structure (depiction) in the input will be read into the
349 * corresponding conformer id.
350 * \param confId3D - if >=0 and 3D coordinates are provided, the 3D
351 * structure (depiction) in the input will be read into the
352 * corresponding conformer id.
353 * \param sanitize - if true sanitize the molecule before returning it
354 */
// Copies the record name and conformer ids into `params` (the sanitize flag
// goes to params.parseParameters) and wraps a newly allocated
// v2::FileParsers::TDTMolSupplier constructed from `fileName` and `params`.
// NOTE(review): the declaration of `params` is not visible in this listing
// (the embedded line numbers jump from 357 to 359) - confirm its type against
// the real header.
355 explicit TDTMolSupplier(const std::string &fileName,
356 const std::string &nameRecord = "", int confId2D = -1,
357 int confId3D = 0, bool sanitize = true) {
359 params.nameRecord = nameRecord;
360 params.confId2D = confId2D;
361 params.confId3D = confId3D;
362 params.parseParameters.sanitize = sanitize;
363 dp_supplier.reset(new v2::FileParsers::TDTMolSupplier(fileName, params));
364 }
// Stream-based counterpart of the file-name constructor: the same fields of
// `params` are filled in and forwarded with `inStream` and `takeOwnership` to
// a newly allocated v2::FileParsers::TDTMolSupplier.
// NOTE(review): the declaration of `params` is not visible in this listing
// (the embedded line numbers jump from 367 to 369) - confirm its type against
// the real header.
365 explicit TDTMolSupplier(std::istream *inStream, bool takeOwnership = true,
366 const std::string &nameRecord = "", int confId2D = -1,
367 int confId3D = 0, bool sanitize = true) {
369 params.nameRecord = nameRecord;
370 params.confId2D = confId2D;
371 params.confId3D = confId3D;
372 params.parseParameters.sanitize = sanitize;
373 dp_supplier.reset(
374 new v2::FileParsers::TDTMolSupplier(inStream, takeOwnership, params));
375 }
376 TDTMolSupplier() { dp_supplier.reset(new ContainedType()); }
// Hands `text` to the wrapped supplier's setData() together with `params`,
// after copying the record name and conformer ids into it (the sanitize flag
// goes to params.parseParameters). A supplier must be present (PRECONDITION).
// NOTE(review): the declaration of `params` is not visible in this listing
// (the embedded line numbers jump from 379 to 381) - confirm its type against
// the real header.
377 void setData(const std::string &text, const std::string &nameRecord = "",
378 int confId2D = -1, int confId3D = 0, bool sanitize = true) {
379 PRECONDITION(dp_supplier, "no supplier");
381 params.nameRecord = nameRecord;
382 params.confId2D = confId2D;
383 params.confId3D = confId3D;
384 params.parseParameters.sanitize = sanitize;
385 static_cast<ContainedType *>(dp_supplier.get())->setData(text, params);
386 }
387 void moveTo(unsigned int idx) {
388 PRECONDITION(dp_supplier, "no supplier");
389 static_cast<ContainedType *>(dp_supplier.get())->moveTo(idx);
390 }
391 ROMol *operator[](unsigned int idx) {
392 PRECONDITION(dp_supplier, "no supplier");
393 return static_cast<ContainedType *>(dp_supplier.get())
394 ->operator[](idx)
395 .release();
396 }
397 /*! \brief returns the text block for a particular item
398 *
399 * \param idx - which item to return
400 */
401 std::string getItemText(unsigned int idx) {
402 PRECONDITION(dp_supplier, "no supplier");
403 return static_cast<ContainedType *>(dp_supplier.get())->getItemText(idx);
404 }
405 unsigned int length() {
406 PRECONDITION(dp_supplier, "no supplier");
407 return static_cast<ContainedType *>(dp_supplier.get())->length();
408 }
409};
410
411#ifdef RDK_BUILD_MAEPARSER_SUPPORT
412//! lazy file parser for MAE files
414 /**
415 * Due to maeparser's shared_ptr<istream> Reader interface, MaeMolSupplier
416 * always requires taking ownership of the istream ptr, as the shared ptr will
417 * always clear it upon destruction.
418 */
419
420 public:
421 using ContainedType = v2::FileParsers::MaeMolSupplier;
422 MaeMolSupplier() { dp_supplier.reset(new ContainedType()); }
423
424 explicit MaeMolSupplier(std::shared_ptr<std::istream> inStream,
425 bool sanitize = true, bool removeHs = true) {
426 v2::FileParsers::MaeMolSupplierParams params;
427 params.sanitize = sanitize;
428 params.removeHs = removeHs;
429 dp_supplier.reset(new ContainedType(inStream, params));
430 }
431
432 explicit MaeMolSupplier(std::istream *inStream, bool takeOwnership = true,
433 bool sanitize = true, bool removeHs = true) {
434 v2::FileParsers::MaeMolSupplierParams params;
435 params.sanitize = sanitize;
436 params.removeHs = removeHs;
437 dp_supplier.reset(new ContainedType(inStream, takeOwnership, params));
438 }
439
440 explicit MaeMolSupplier(const std::string &fname, bool sanitize = true,
441 bool removeHs = true) {
442 v2::FileParsers::MaeMolSupplierParams params;
443 params.sanitize = sanitize;
444 params.removeHs = removeHs;
445 dp_supplier.reset(new ContainedType(fname, params));
446 }
447 void moveTo(unsigned int idx) {
448 PRECONDITION(dp_supplier, "no supplier");
449 static_cast<ContainedType *>(dp_supplier.get())->moveTo(idx);
450 }
451 RWMol *operator[](unsigned int idx) {
452 PRECONDITION(dp_supplier, "no supplier");
453 return static_cast<ContainedType *>(dp_supplier.get())
454 ->operator[](idx)
455 .release();
456 }
457 unsigned int length() {
458 PRECONDITION(dp_supplier, "no supplier");
459 return static_cast<ContainedType *>(dp_supplier.get())->length();
460 }
461
462 void setData(const std::string &text, bool sanitize = true,
463 bool removeHs = true) {
464 PRECONDITION(dp_supplier, "no supplier");
465 v2::FileParsers::MaeMolSupplierParams params;
466 params.sanitize = sanitize;
467 params.removeHs = removeHs;
468 static_cast<ContainedType *>(dp_supplier.get())->setData(text, params);
469 }
470};
471#endif // RDK_BUILD_MAEPARSER_SUPPORT
472
473#if 0
474
475//! This class is still a bit experimental and the public API may change
476//! in future releases.
478 : public MultithreadedMolSupplier {
479 public:
480 explicit MultithreadedSDMolSupplier(
481 const std::string &fileName, bool sanitize = true, bool removeHs = true,
482 bool strictParsing = true, unsigned int numWriterThreads = 1,
483 size_t sizeInputQueue = 5, size_t sizeOutputQueue = 5);
484
485 explicit MultithreadedSDMolSupplier(
486 std::istream *inStream, bool takeOwnership = true, bool sanitize = true,
487 bool removeHs = true, bool strictParsing = true,
488 unsigned int numWriterThreads = 1, size_t sizeInputQueue = 5,
489 size_t sizeOutputQueue = 5);
490
491 MultithreadedSDMolSupplier();
492 ~MultithreadedSDMolSupplier() override;
493 void init() override {}
494
495 void checkForEnd();
496 bool getEnd() const override;
497 void setProcessPropertyLists(bool val) { df_processPropertyLists = val; }
498 bool getProcessPropertyLists() const { return df_processPropertyLists; }
499 bool getEOFHitOnRead() const { return df_eofHitOnRead; }
500
501 //! reads next record and returns whether or not EOF was hit
502 bool extractNextRecord(std::string &record, unsigned int &lineNum,
503 unsigned int &index) override;
504 void readMolProps(RWMol *mol, std::istringstream &inStream);
505 //! parses the record and returns the resulting molecule
506 RWMol *processMoleculeRecord(const std::string &record,
507 unsigned int lineNum) override;
508
509 private:
510 void initFromSettings(bool takeOwnership, bool sanitize, bool removeHs,
511 bool strictParsing, unsigned int numWriterThreads,
512 size_t sizeInputQueue, size_t sizeOutputQueue);
513
514 private:
515 bool df_end = false; //!< have we reached the end of the file?
516 int d_line = 0; //!< line number we are currently on
517 bool df_sanitize = true, df_removeHs = true, df_strictParsing = true;
518 bool df_processPropertyLists = true;
519 bool df_eofHitOnRead = false;
520 unsigned int d_currentRecordId = 1; //!< current record id
521};
522
523//! This class is still a bit experimental and the public API may change
524//! in future releases.
526 : public MultithreadedMolSupplier {
527 public:
528 explicit MultithreadedSmilesMolSupplier(
529 const std::string &fileName, const std::string &delimiter = " \t",
530 int smilesColumn = 0, int nameColumn = 1, bool titleLine = true,
531 bool sanitize = true, unsigned int numWriterThreads = 1,
532 size_t sizeInputQueue = 5, size_t sizeOutputQueue = 5);
533
534 explicit MultithreadedSmilesMolSupplier(
535 std::istream *inStream, bool takeOwnership = true,
536 const std::string &delimiter = " \t", int smilesColumn = 0,
537 int nameColumn = 1, bool titleLine = true, bool sanitize = true,
538 unsigned int numWriterThreads = 1, size_t sizeInputQueue = 5,
539 size_t sizeOutputQueue = 5);
540 MultithreadedSmilesMolSupplier();
541 ~MultithreadedSmilesMolSupplier() override;
542
543 void init() override {}
544 //! returns df_end
545 bool getEnd() const override;
546 //! reads and processes the title line
547 void processTitleLine();
548 //! reads next record and returns whether or not EOF was hit
549 bool extractNextRecord(std::string &record, unsigned int &lineNum,
550 unsigned int &index) override;
551 //! parses the record and returns the resulting molecule
552 ROMol *processMoleculeRecord(const std::string &record,
553 unsigned int lineNum) override;
554
555 private:
556 void initFromSettings(bool takeOwnership, const std::string &delimiter,
557 int smilesColumn, int nameColumn, bool titleLine,
558 bool sanitize, unsigned int numWriterThreads,
559 size_t sizeInputQueue, size_t sizeOutputQueue);
560
561 private:
562 bool df_end = false; //!< have we reached the end of the file?
563 int d_line = 0; //!< line number we are currently on
564 std::string d_delim; //!< the delimiter string
565 bool df_sanitize = true; //!< sanitize molecules before returning them?
566 STR_VECT d_props; //!< vector of property names
567 bool df_title = true; //!< do we have a title line?
568 int d_smi = 0; //!< column id for the smile string
569 int d_name = 1; //!< column id for the name
570 unsigned int d_currentRecordId = 1; //!< current record id
571};
572
573#endif
574} // namespace v1
575} // namespace RDKit
576
577#endif
#define PRECONDITION(expr, mess)
Definition Invariant.h:109
ForwardSDMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, bool strictParsing=false)
std::unique_ptr< v2::FileParsers::MolSupplier > dp_supplier
ROMol * operator[](unsigned int idx)
void moveTo(unsigned int idx)
SDMolSupplier(std::istream *inStream, bool takeOwnership=true, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
void setStreamIndices(const std::vector< std::streampos > &locs)
void setData(const std::string &text, bool sanitize=true, bool removeHs=true)
std::string getItemText(unsigned int idx)
returns the text block for a particular item
void setData(const std::string &text, bool sanitize, bool removeHs, bool strictParsing)
SDMolSupplier(const std::string &fileName, bool sanitize=true, bool removeHs=true, bool strictParsing=true)
lazy file parser for Smiles tables
SmilesMolSupplier(std::istream *inStream, bool takeOwnership=true, const std::string &delimiter=" \t", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
void moveTo(unsigned int idx)
void setData(const std::string &text, const std::string &delimiter=" ", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
ROMol * operator[](unsigned int idx)
SmilesMolSupplier(const std::string &fileName, const std::string &delimiter=" \t", int smilesColumn=0, int nameColumn=1, bool titleLine=true, bool sanitize=true)
std::string getItemText(unsigned int idx)
returns the text block for a particular item
lazy file parser for TDT files
std::string getItemText(unsigned int idx)
returns the text block for a particular item
TDTMolSupplier(const std::string &fileName, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
ROMol * operator[](unsigned int idx)
void setData(const std::string &text, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
TDTMolSupplier(std::istream *inStream, bool takeOwnership=true, const std::string &nameRecord="", int confId2D=-1, int confId3D=0, bool sanitize=true)
void moveTo(unsigned int idx)
lazy file parser for Smiles tables
lazy file parser for TDT files
#define RDKIT_FILEPARSERS_EXPORT
Definition export.h:161
RDKIT_GRAPHMOL_EXPORT ROMol * removeHs(const ROMol &mol, bool implicitOnly=false, bool updateExplicitCount=false, bool sanitize=true)
returns a copy of a molecule with hydrogens removed
Std stuff.
std::vector< std::string > STR_VECT
Definition Dict.h:29
bool rdvalue_is(const RDValue_cast_t)
v2::SmilesParse::SmilesParserParams parseParameters
v2::SmilesParse::SmilesParserParams parseParameters