RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
MolSupplier.h
Go to the documentation of this file.
1//
2// Copyright (C) 2002-2024 greg landrum and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_MOLSUPPLIER_H
12#define RD_MOLSUPPLIER_H
13
14#include <RDGeneral/types.h>
15
16#include <string>
17#include <string_view>
18#include <list>
19#include <memory>
20#include <vector>
21#include <iostream>
22#include <fstream>
23#include <GraphMol/ROMol.h>
25#include "FileParsers.h"
27
28#ifdef RDK_BUILD_MAEPARSER_SUPPORT
29namespace schrodinger {
30namespace mae {
31class Reader;
32class Block;
33} // namespace mae
34} // namespace schrodinger
35#endif // RDK_BUILD_MAEPARSER_SUPPORT
36
37namespace RDKit {
38RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig);
39
40namespace v2 {
41namespace FileParsers {
42/*!
43//
44// Here are a couple of ways one can interact with MolSuppliers:
45//
46// 1) Lazy (ForwardIterator):
47// while(!supplier.atEnd()){
48// std::unique_ptr<RWMol> mol = supplier.next();
49// if(mol){
50// do something;
51// }
52// }
53// 2) Random Access:
54// for(unsigned int i=0;i<supplier.length();i++){
55// std::unique_ptr<RWMol> mol = supplier[i];
56// if(mol){
57// do something;
58// }
59// }
60//
61//
62*/
64 // this is an abstract base class to supply molecules one at a time
65 public:
67 virtual ~MolSupplier() {}
68 virtual void init() = 0;
69 virtual void reset() = 0;
70 virtual bool atEnd() = 0;
71 virtual std::unique_ptr<RWMol> next() = 0;
72
73 virtual void close() {  // releases the input stream held by this supplier
74 if (df_owner) {
75 delete dp_inStream;  // the stream is only destroyed if we own it
76 df_owner = false;
77 }
78 dp_inStream = nullptr;  // the pointer is dropped whether or not we owned it
79 }
80
81 private:
82 // disable automatic copy constructors and assignment operators
83 // for this class and its subclasses. They will likely be
84 // carrying around stream pointers and copying those is a recipe
85 // for disaster.
86 MolSupplier(const MolSupplier &);
87 MolSupplier &operator=(const MolSupplier &);
88
89 protected:
90 // stream to read the molecules from:
91 std::istream *dp_inStream = nullptr;
92 // do we own dp_inStream?
93 bool df_owner = false;
94 // Opens the file \c filename for reading and verifies that it can be read
95 // from. Throws a BadFileException if the file cannot be opened or if there
96 // is nothing to read from it. The caller owns the resulting stream.
97 std::istream *openAndCheckStream(const std::string &filename) {
98 // NOTE: the file is opened in binary mode to work around a bug in
99 // VC++ 6.0:
100 // the function "tellg" does not work correctly if we do not open it this
101 // way.
102 // Jan 2009: Confirmed that this is still the case in visual studio 2008
103 std::ifstream *strm =
104 new std::ifstream(filename.c_str(), std::ios_base::binary);
105 if ((!(*strm)) || strm->bad()) {  // the open itself failed
106 std::ostringstream errout;
107 errout << "Bad input file " << filename;
108 delete strm;  // don't leak the stream when throwing
109 throw BadFileException(errout.str());
110 }
111 strm->peek();  // look at the first character without consuming it
112 if (strm->bad() || strm->eof()) {  // read error, or nothing to read
113 std::ostringstream errout;
114 errout << "Invalid input file " << filename;
115 delete strm;  // don't leak the stream when throwing
116 throw BadFileException(errout.str());
117 }
118 return static_cast<std::istream *>(strm);  // ownership passes to the caller
119 }
120};
121
122// \brief a supplier from an SD file that only reads forward:
124 /*************************************************************************
125 * A lazy mol supplier from a SD file.
126 * - When new molecules are read using "next" their positions in the file are
127 *noted.
128 ***********************************************************************************/
129 public:
131
133 std::istream *inStream, bool takeOwnership = true,
134 const MolFileParserParams &params = MolFileParserParams());
135
136 ~ForwardSDMolSupplier() override { close(); }
137
138 void init() override;
139 void reset() override;
140 std::unique_ptr<RWMol> next() override;
141 bool atEnd() override;
142
143 void setProcessPropertyLists(bool val) { df_processPropertyLists = val; }
144 bool getProcessPropertyLists() const { return df_processPropertyLists; }
145
146 bool getEOFHitOnRead() const { return df_eofHitOnRead; }
147
148 protected:
149 virtual void checkForEnd();
150 std::unique_ptr<RWMol> _next();
151 virtual void readMolProps(ROMol &);
152 bool df_end = false;
153 int d_line = 0; // line number we are currently on
155 bool df_processPropertyLists = true;
156 bool df_eofHitOnRead = false;
157};
158// \brief a lazy supplier from an SD file
160 /*************************************************************************
161 * A lazy mol supplier from a SD file.
162 * - When new molecules are read using "next" their positions in the file are
163 *noted.
164 * - A call to the "length" will automatically parse the entire file and
165 *cache all the mol
166 * block positions
167 * - [] operator is used to access a molecule at "idx", calling next
168 *following this will result
169 * in the next molecule after "idx"
170 ***********************************************************************************/
171
172 public:
173 SDMolSupplier() { init(); }
174
175 /*!
176 * \param fileName - the name of the SD file
177 * \param sanitize - if true sanitize the molecule before returning it
178 * \param removeHs - if true remove Hs from the molecule before returning it
179 * (triggers sanitization)
180 * \param strictParsing - if set to false, the parser is more lax about
181 * correctness
182 * of the contents.
183 */
185 const std::string &fileName,
186 const MolFileParserParams &params = MolFileParserParams());
187
189 std::istream *inStream, bool takeOwnership = true,
190 const MolFileParserParams &params = MolFileParserParams());
191
192 ~SDMolSupplier() override { close(); }
193 void init() override;
194 void reset() override;
195 std::unique_ptr<RWMol> next() override;
196 bool atEnd() override;
197 void moveTo(unsigned int idx);
198 std::unique_ptr<RWMol> operator[](unsigned int idx);
199 /*! \brief returns the text block for a particular item
200 *
201 * \param idx - which item to return
202 */
203 std::string getItemText(unsigned int idx);
204 unsigned int length();
205 void setData(const std::string &text);
206 void setData(const std::string &text, const MolFileParserParams &params);
207
208 /*! Resets our internal state and sets the indices of molecules in the stream.
209 * The client should be *very* careful about calling this method, as it's
210 *trivial
211 * to end up with a completely useless supplier.
212 *
213 * \param locs - the vector of stream positions.
214 *
215 * Note that this can be used not only to make reading selected molecules
216 *from a
217 * large SD file much faster, but it can also allow subsetting an SD file or
218 * rearranging the order of the molecules.
219 */
220 void setStreamIndices(const std::vector<std::streampos> &locs);
221
222 private:
223 void checkForEnd() override;
224 int d_len = 0; // total number of mol blocks in the file (starts at 0 here; NOTE(review): this comment used to say -1, confirm what init() sets)
225 int d_last = 0; // the molecule we are ready to read
226 std::vector<std::streampos> d_molpos;
227};
228
230 std::string delimiter = " \t";
232 int nameColumn = 1;
233 bool titleLine = true;
234 v2::SmilesParse::SmilesParserParams parseParameters = {
235 true, // sanitize
236 false, // allowCXSMILES
237 true, // strictCXSMILES
238 false, // parseName
239 true, // removeHs
240 false, // skipCleanup
241 false, // debugParse
242 {} // replacements
243 };
244};
245
246//! lazy file parser for Smiles tables
248 /**************************************************************************
249 * Lazy file parser for Smiles table file, similar to the lazy SD
250 * file parser above
251 * - As and when new molecules are read using "next" their
252 * positions in the file are noted.
253 * - A call to the "length" will automatically parse the entire
254 * file and cache all the mol block positions
255 * - [] operator is used to access a molecule at "idx", calling
256 * next following this will result in the next molecule after
257 * "idx"
258 ***************************************************************************/
259 public:
260 /*!
261 * \param fileName - the name of smiles table file
262 * \param delimiter - delimiting characters between records on each
263 * line. NOTE that this is not a string, the tokenizer looks for
264 * the individual characters in delimiter, not the full string
265 * itself. So the default delimiter: " \t", means " " or "\t".
266 * \param smilesColumn - column number for the SMILES string (defaults
267 * to the first column)
268 * \param nameColumn - column number for the molecule name (defaults to
269 * the second column) If set to -1 we assume that no name is
270 * available for the molecule and the name is defaulted to the
271 * smiles string
272 * \param titleLine - if true, the first line is assumed to list the
273 * names of properties in order separated by 'delimiter'. It is
274 * also assumed that the 'SMILES' column and the 'name' column
275 * are not specified here. If false, no title line is assumed
276 * and the properties are recorded as "columnX" where "X" is
277 * the column number.
278 * \param sanitize - if true sanitize the molecule before returning it
279 */
281 const std::string &fileName,
285 std::istream *inStream, bool takeOwnership = true,
287
288 ~SmilesMolSupplier() override { close(); }
289 void setData(const std::string &text, const SmilesMolSupplierParams &params =
291 void init() override;
292 void reset() override;
293 std::unique_ptr<RWMol> next() override;
294 bool atEnd() override;
295 void moveTo(unsigned int idx);
296 std::unique_ptr<RWMol> operator[](unsigned int idx);
297 /*! \brief returns the text block for a particular item
298 *
299 * \param idx - which item to return
300 */
301 std::string getItemText(unsigned int idx);
302 unsigned int length();
303
304 private:
305 std::unique_ptr<RWMol> processLine(std::string inLine);
306 void processTitleLine();
307 std::string nextLine();
308 long int skipComments();
309 void checkForEnd();
310
311 bool df_end = false; // have we reached the end of the file?
312 long d_len = 0; // total number of smiles in the file
313 long d_next = 0; // the molecule we are ready to read
314 size_t d_line = 0; // line number we are currently on
316 std::vector<std::streampos>
317 d_molpos; // vector of positions in the file for molecules
318 std::vector<int> d_lineNums;
319 STR_VECT d_props; // vector of property names
320};
321
323 std::string nameRecord = "";
324 int confId2D = -1;
325 int confId3D = -1;
326 v2::SmilesParse::SmilesParserParams parseParameters = {
327 true, // sanitize
328 false, // allowCXSMILES
329 true, // strictCXSMILES
330 false, // parseName
331 true, // removeHs
332 false, // skipCleanup
333 false, // debugParse
334 {} // replacements
335 };
336};
337
338//! lazy file parser for TDT files
340 /**************************************************************************
341 * Lazy file parser for TDT files, similar to the lazy SD
342 * file parser above
343 * - As and when new molecules are read using "next" their
344 * positions in the file are noted.
345 * - A call to the "length" will automatically parse the entire
346 * file and cache all the mol block positions
347 * - [] operator is used to access a molecule at "idx", calling
348 * next following this will result in the next molecule after
349 * "idx"
350 ***************************************************************************/
351 public:
352 /*!
353 * \param fileName - the name of the TDT file
354 * \param nameRecord - property name for the molecule name.
355 * If empty (the default), the name defaults to be empty
356 * \param confId2D - if >=0 and 2D coordinates are provided, the 2D
357 * structure (depiction) in the input will be read into the
358 * corresponding conformer id.
359 * \param confId3D - if >=0 and 3D coordinates are provided, the 3D
360 * structure (depiction) in the input will be read into the
361 * corresponding conformer id.
362 * \param sanitize - if true sanitize the molecule before returning it
363 */
365 const std::string &fileName,
368 std::istream *inStream, bool takeOwnership = true,
371 ~TDTMolSupplier() override { close(); }
372 void setData(const std::string &text,
374 void init() override;
375 void reset() override;
376 std::unique_ptr<RWMol> next() override;
377 bool atEnd() override;
378 void moveTo(unsigned int idx);
379 std::unique_ptr<RWMol> operator[](unsigned int idx);
380 /*! \brief returns the text block for a particular item
381 *
382 * \param idx - which item to return
383 */
384 std::string getItemText(unsigned int idx);
385 unsigned int length();
386
387 private:
388 bool advanceToNextRecord();
389 void checkForEnd();
390 std::unique_ptr<RWMol> parseMol(std::string inLine);
391
392 bool df_end = false; // have we reached the end of the file?
393 int d_len = 0; // total number of mols in the file
394 int d_last = 0; // the molecule we are ready to read
395 int d_line = 0; // line number we are currently on
396 std::vector<std::streampos>
397 d_molpos; // vector of positions in the file for molecules
398 TDTMolSupplierParams d_params;
399};
400
401#ifdef RDK_BUILD_MAEPARSER_SUPPORT
403 bool sanitize = true;
404 bool removeHs = true;
405};
406//! lazy file parser for MAE files
408 /**
409 * Due to maeparser's shared_ptr<istream> Reader interface, MaeMolSupplier
410 * always requires taking ownership of the istream ptr, as the shared ptr will
411 * always clear it upon destruction.
412 */
413
414 public:
415 MaeMolSupplier() {}
416
417 explicit MaeMolSupplier(
418 std::shared_ptr<std::istream> inStream,
419 const MaeMolSupplierParams &params = MaeMolSupplierParams());
420
421 explicit MaeMolSupplier(
422 std::istream *inStream, bool takeOwnership = true,
423 const MaeMolSupplierParams &params = MaeMolSupplierParams());
424
425 explicit MaeMolSupplier(
426 const std::string &fname,
427 const MaeMolSupplierParams &params = MaeMolSupplierParams());
428
429 ~MaeMolSupplier() override {}
430
431 void init() override;
432 void reset() override;
433 std::unique_ptr<RWMol> next() override;
434 bool atEnd() override;
435 void moveTo(unsigned int idx);
436 std::unique_ptr<RWMol> operator[](unsigned int idx);
437 unsigned int length();
438
439 void close() override { dp_sInStream.reset(); }  // gives up this supplier's reference to the shared input stream
440
441 void setData(const std::string &text,
442 const MaeMolSupplierParams &params = MaeMolSupplierParams());
443
444 private:
445 void moveToNextBlock();
446
447 protected:
448 MaeMolSupplierParams d_params;
449 std::shared_ptr<schrodinger::mae::Reader> d_reader;
450 std::shared_ptr<schrodinger::mae::Block> d_next_struct;
451 std::shared_ptr<std::istream> dp_sInStream;  // held as a shared_ptr; close() resets it
452 std::string d_stored_exc;
453 unsigned d_position = 0;  // initialized in-class: the inline default constructor sets nothing
454 unsigned d_length = 0;  // initialized in-class: the inline default constructor sets nothing
455};
456#endif // RDK_BUILD_MAEPARSER_SUPPORT
457
458} // namespace FileParsers
459} // namespace v2
460} // namespace RDKit
461
462#include "MolSupplier.v1API.h"
463
464#endif
Defines the primary molecule class ROMol as well as associated typedefs.
used by various file parsing classes to indicate a bad file
ForwardSDMolSupplier(std::istream *inStream, bool takeOwnership=true, const MolFileParserParams &params=MolFileParserParams())
std::unique_ptr< RWMol > next() override
std::istream * openAndCheckStream(const std::string &filename)
Definition MolSupplier.h:97
virtual std::unique_ptr< RWMol > next()=0
void setStreamIndices(const std::vector< std::streampos > &locs)
void setData(const std::string &text)
std::unique_ptr< RWMol > next() override
SDMolSupplier(std::istream *inStream, bool takeOwnership=true, const MolFileParserParams &params=MolFileParserParams())
std::string getItemText(unsigned int idx)
returns the text block for a particular item
void setData(const std::string &text, const MolFileParserParams &params)
SDMolSupplier(const std::string &fileName, const MolFileParserParams &params=MolFileParserParams())
std::unique_ptr< RWMol > operator[](unsigned int idx)
lazy file parser for Smiles tables
std::string getItemText(unsigned int idx)
returns the text block for a particular item
std::unique_ptr< RWMol > next() override
SmilesMolSupplier(std::istream *inStream, bool takeOwnership=true, const SmilesMolSupplierParams &params=SmilesMolSupplierParams())
std::unique_ptr< RWMol > operator[](unsigned int idx)
SmilesMolSupplier(const std::string &fileName, const SmilesMolSupplierParams &params=SmilesMolSupplierParams())
void setData(const std::string &text, const SmilesMolSupplierParams &params=SmilesMolSupplierParams())
lazy file parser for TDT files
std::unique_ptr< RWMol > operator[](unsigned int idx)
void setData(const std::string &text, const TDTMolSupplierParams &params=TDTMolSupplierParams())
TDTMolSupplier(const std::string &fileName, const TDTMolSupplierParams &params=TDTMolSupplierParams())
std::string getItemText(unsigned int idx)
returns the text block for a particular item
TDTMolSupplier(std::istream *inStream, bool takeOwnership=true, const TDTMolSupplierParams &params=TDTMolSupplierParams())
std::unique_ptr< RWMol > next() override
#define RDKIT_FILEPARSERS_EXPORT
Definition export.h:161
Std stuff.
RDKIT_FILEPARSERS_EXPORT std::string strip(const std::string &orig)
std::vector< std::string > STR_VECT
Definition Dict.h:29
bool rdvalue_is(const RDValue_cast_t)
v2::SmilesParse::SmilesParserParams parseParameters
v2::SmilesParse::SmilesParserParams parseParameters