RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
FileParserUtils.h
Go to the documentation of this file.
1//
2// Copyright (C) 2010-2022 Greg Landrum and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef RD_FILEPARSERUTILS_H
12#define RD_FILEPARSERUTILS_H
13
14#include <string>
15#include <iostream>
17#include <boost/lexical_cast.hpp>
18#include <boost/algorithm/string.hpp>
19#include <boost/format.hpp>
21#include "FileParsers.h"
22#include <string_view>
23
24namespace RDKit {
25class RWMol;
26class Conformer;
27
28namespace FileParserUtils {
//! Returns a view of \c orig with leading and trailing characters removed.
/*!
  \param orig        the text to trim; the result refers to the same
                     underlying characters, so it is only valid while they are
  \param stripChars  the set of characters to remove from both ends
                     (defaults to space, tab, CR and LF)

  \return the trimmed view, or an empty view if \c orig is empty or consists
          solely of characters from \c stripChars
*/
RDKIT_FILEPARSERS_EXPORT inline std::string_view strip(
    std::string_view orig, std::string stripChars = " \t\r\n") {
  const auto firstKept = orig.find_first_not_of(stripChars);
  if (firstKept == std::string_view::npos) {
    // nothing survives the trim
    return std::string_view("");
  }
  // a kept character exists, so find_last_not_of cannot return npos here
  const auto lastKept = orig.find_last_not_of(stripChars);
  return orig.substr(firstKept, lastKept + 1 - firstKept);
}
41
42template <typename T>
43T stripSpacesAndCast(std::string_view input, bool acceptSpaces = false) {
44 auto trimmed = strip(input, " ");
45 if (acceptSpaces && trimmed.empty()) {
46 return 0;
47 } else {
48 return boost::lexical_cast<T>(trimmed);
49 }
50}
//! \overload
//! std::string convenience form. The view is built from \c input.c_str(), so
//! its length is determined by the first NUL character in the buffer.
template <typename T>
T stripSpacesAndCast(const std::string &input, bool acceptSpaces = false) {
  const std::string_view asView(input.c_str());
  return stripSpacesAndCast<T>(asView, acceptSpaces);
}
//! \name Text-to-number converters
//! Each converter is declared for both std::string and std::string_view
//! input. Note that \c acceptSpaces defaults to \c true here, whereas the
//! stripSpacesAndCast() templates above default it to \c false.
//! (Definitions live outside this header.)
//! @{
RDKIT_FILEPARSERS_EXPORT int toInt(const std::string &input,
                                   bool acceptSpaces = true);
RDKIT_FILEPARSERS_EXPORT unsigned int toUnsigned(const std::string &input,
                                                 bool acceptSpaces = true);
RDKIT_FILEPARSERS_EXPORT double toDouble(const std::string &input,
                                         bool acceptSpaces = true);
// The string_view forms take the view by value. A top-level const on a
// by-value parameter is not part of the function's type, so it is omitted
// uniformly from these declarations.
RDKIT_FILEPARSERS_EXPORT int toInt(std::string_view input,
                                   bool acceptSpaces = true);
RDKIT_FILEPARSERS_EXPORT unsigned int toUnsigned(std::string_view input,
                                                 bool acceptSpaces = true);
RDKIT_FILEPARSERS_EXPORT double toDouble(std::string_view input,
                                         bool acceptSpaces = true);
//! @}
67
68// gets a V3000 CTAB for a molecule
70 const ROMol &tmol, const boost::dynamic_bitset<> &wasAromatic,
71 int confId = -1, unsigned int precision = 6);
72//! \overload
73inline std::string getV3000CTAB(const ROMol &tmol, int confId = -1,
74 unsigned int precision = 6) {
75 boost::dynamic_bitset<> wasAromatic(tmol.getNumBonds());
76 return getV3000CTAB(tmol, wasAromatic, confId, precision);
77};
78// reads a line from an MDL v3K CTAB
80 unsigned int &line);
81
82// nAtoms and nBonds are ignored on input, set on output
84 std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf,
85 bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds,
86 bool strictParsing = true, bool expectMEND = true);
87
88// nAtoms and nBonds are used
90 std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf,
91 bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds,
92 bool strictParsing = true);
93
94//! finishes up the processing (sanitization, etc.) of a molecule read from
95//! CTAB
99//! \overload
101 bool sanitize, bool removeHs) {
103 ps.sanitize = sanitize;
104 ps.removeHs = removeHs;
106}
107
108//! Deprecated, please use QueryOps::replaceAtomWithQueryAtom instead
110
111//! applies a particular property to the atoms as an atom property list
// Reads the string-valued molecule property `pn`, splits it into tokens on
// space/tab/newline, and walks the tokens one per atom, converting each to T.
//
//   mol                 molecule whose property `pn` is read
//   pn                  name of the molecule property holding the list
//   prefix              leading part of `pn`; the per-atom property name
//                       (`atompn`) is `pn` with the first prefix.size()
//                       characters removed
//   missingValueMarker  token that means "no value for this atom"
//
// NOTE(review): this listing is missing original lines 122, 143 and 145
// (the log-stream openers and the statement that consumes `apv`). Presumably
// line 143 stores `apv` on atom `atomid` under `atompn` - confirm against the
// real header.
112template <typename T>
113void applyMolListPropToAtoms(ROMol &mol, const std::string &pn,
114 const std::string &prefix,
115 const std::string &missingValueMarker = "n/a") {
116 std::string atompn = pn.substr(prefix.size());
117 std::string strVect = mol.getProp<std::string>(pn);
118 std::vector<std::string> tokens;
119 boost::split(tokens, strVect, boost::is_any_of(" \t\n"),
120 boost::token_compress_on);
// Fewer tokens than atoms: the list is rejected as a whole.
121 if (tokens.size() < mol.getNumAtoms()) {
123 << "Property list " << pn << " too short, only " << tokens.size()
124 << " elements found. Ignoring it." << std::endl;
125 return;
126 }
127 std::string mv = missingValueMarker;
128 size_t first_token = 0;
// Optional header token: when there is exactly one token more than there are
// atoms and the first token is wrapped in [...], the text between the
// brackets replaces the missing-value marker and the token is skipped.
// NOTE(review): front()/back() are called without checking that tokens[0] is
// non-empty; an empty first token (e.g. an empty property value on a molecule
// with no atoms) would reach this - verify.
129 if (tokens.size() == mol.getNumAtoms() + 1 && tokens[0].front() == '[' &&
130 tokens[0].back() == ']') {
131 mv = std::string(tokens[0].begin() + 1, tokens[0].end() - 1);
132 first_token = 1;
133 }
// An empty marker is only warned about; processing continues.
134 if (mv.empty()) {
135 BOOST_LOG(rdWarningLog) << "Missing value marker for property " << pn
136 << " is empty." << std::endl;
137 }
// NOTE(review): the loop runs over every remaining token, and only a lower
// bound on tokens.size() was checked above, so `atomid` can exceed the atom
// count when the list is longer than expected - confirm how the missing
// line 143 handles that.
138 for (size_t i = first_token; i < tokens.size(); ++i) {
// Tokens equal to the marker are skipped (no value for that atom).
139 if (tokens[i] != mv) {
140 unsigned int atomid = i - first_token;
141 try {
142 T apv = boost::lexical_cast<T>(tokens[i]);
// A token that cannot be converted to T is reported and skipped; the
// remaining tokens are still processed.
144 } catch (const boost::bad_lexical_cast &) {
146 << "Value " << tokens[i] << " for property " << pn << " of atom "
147 << atomid << " can not be parsed. Ignoring it." << std::endl;
148 }
149 }
150 }
151}
152
153//! applies all properties matching a particular prefix as an atom property
154//! list
// Walks every property name returned by mol.getPropList() and selects the
// ones that begin with `prefix` and are strictly longer than it (so a name
// equal to the bare prefix is not selected).
//
// NOTE(review): original line 160 - the body of the `if` - is missing from
// this listing. Presumably it forwards the selected name, `prefix` and
// `missingValueMarker` to applyMolListPropToAtoms<T>; confirm against the
// real header.
155template <typename T>
156void applyMolListPropsToAtoms(ROMol &mol, const std::string &prefix,
157 const std::string missingValueMarker = "n/a") {
// `auto pn` takes each property name by value.
158 for (auto pn : mol.getPropList()) {
// find(prefix) == 0 means "starts with prefix".
159 if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
161 }
162 }
163}
164static const std::string atomPropPrefix = "atom.";
165//! if the property name matches our rules for atom property lists, we'll
166//! apply it to the atoms
168 ROMol &mol, const std::string pn,
169 const std::string &missingValueMarker = "n/a") {
170 if (pn.find(atomPropPrefix) == 0 && pn.length() > atomPropPrefix.length()) {
171 std::string prefix = atomPropPrefix + "prop.";
172 if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
174 } else {
175 prefix = atomPropPrefix + "iprop.";
176 if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
179 } else {
180 prefix = atomPropPrefix + "dprop.";
181 if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
183 } else {
184 prefix = atomPropPrefix + "bprop.";
185 if (pn.find(prefix) == 0 && pn.length() > prefix.length()) {
187 }
188 }
189 }
190 }
191 }
192}
193//! loops over all properties and applies the ones that match the rules for
194//! atom property lists to the atoms
196 ROMol &mol, const std::string &missingValueMarker = "n/a") {
197 for (auto pn : mol.getPropList()) {
199 }
200}
201template <typename T>
202std::string getAtomPropertyList(ROMol &mol, const std::string &atomPropName,
203 std::string missingValueMarker = "",
204 unsigned int lineSize = 190) {
205 std::string res;
206 std::string propVal;
207 if (!missingValueMarker.empty()) {
208 propVal += boost::str(boost::format("[%s] ") % missingValueMarker);
209 } else {
210 missingValueMarker = "n/a";
211 }
212 for (const auto &atom : mol.atoms()) {
213 std::string apVal = missingValueMarker;
214 if (atom->hasProp(atomPropName)) {
215 T tVal = atom->getProp<T>(atomPropName);
216 apVal = boost::lexical_cast<std::string>(tVal);
217 // seems like this should work, but it doesn't:
218 // atom->getProp(atomPropName,apVal);
219 }
220 if (propVal.length() + apVal.length() + 1 >= lineSize) {
221 // remove trailing space:
222 propVal.pop_back();
223 res += propVal + "\n";
224 propVal = "";
225 }
226 propVal += apVal + " ";
227 }
228 if (!propVal.empty()) {
229 // remove the trailing space:
230 propVal.pop_back();
231 res += propVal;
232 }
233 return res;
234}
236 ROMol &mol, const std::string &atomPropName,
237 const std::string &missingValueMarker = "", unsigned int lineSize = 190) {
238 std::string molPropName = "atom.iprop." + atomPropName;
242}
244 ROMol &mol, const std::string &atomPropName,
245 const std::string &missingValueMarker = "", unsigned int lineSize = 190) {
246 std::string molPropName = "atom.dprop." + atomPropName;
249 lineSize));
250}
252 ROMol &mol, const std::string &atomPropName,
253 const std::string &missingValueMarker = "", unsigned int lineSize = 190) {
254 std::string molPropName = "atom.bprop." + atomPropName;
257 lineSize));
258}
260 ROMol &mol, const std::string &atomPropName,
261 const std::string &missingValueMarker = "", unsigned int lineSize = 190) {
262 std::string molPropName = "atom.prop." + atomPropName;
266}
267
269
270} // namespace FileParserUtils
271} // namespace RDKit
272
273#endif
#define BOOST_LOG(__arg__)
Definition RDLog.h:110
RDKIT_RDGENERAL_EXPORT RDLogger rdWarningLog
The class for representing atoms.
Definition Atom.h:75
The class for representing 2D or 3D conformation of a molecule.
Definition Conformer.h:46
void getProp(const std::string &key, T &res) const
allows retrieval of a particular property value
Definition RDProps.h:107
void setProp(const std::string &key, T val, bool computed=false) const
sets a property value
Definition RDProps.h:77
STR_VECT getPropList(bool includePrivate=true, bool includeComputed=true) const
returns a list with the names of our properties
Definition RDProps.h:45
Atom * getAtomWithIdx(unsigned int idx)
returns a pointer to a particular Atom
unsigned int getNumAtoms() const
returns our number of atoms
Definition ROMol.h:421
CXXAtomIterator< MolGraph, Atom * > atoms()
C++11 Range iterator.
Definition ROMol.h:277
RWMol is a molecule class that is intended to be edited.
Definition RWMol.h:32
#define RDKIT_FILEPARSERS_EXPORT
Definition export.h:161
void processMolPropertyList(ROMol &mol, const std::string pn, const std::string &missingValueMarker="n/a")
RDKIT_FILEPARSERS_EXPORT void moveAdditionalPropertiesToSGroups(RWMol &mol)
RDKIT_FILEPARSERS_EXPORT std::string getV3000CTAB(const ROMol &tmol, const boost::dynamic_bitset<> &wasAromatic, int confId=-1, unsigned int precision=6)
void createAtomDoublePropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=190)
RDKIT_FILEPARSERS_EXPORT double toDouble(const std::string &input, bool acceptSpaces=true)
void createAtomIntPropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=190)
RDKIT_FILEPARSERS_EXPORT int toInt(const std::string &input, bool acceptSpaces=true)
RDKIT_FILEPARSERS_EXPORT Atom * replaceAtomWithQueryAtom(RWMol *mol, Atom *atom)
Deprecated, please use QueryOps::replaceAtomWithQueryAtom instead.
T stripSpacesAndCast(std::string_view input, bool acceptSpaces=false)
void createAtomStringPropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=190)
void applyMolListPropToAtoms(ROMol &mol, const std::string &pn, const std::string &prefix, const std::string &missingValueMarker="n/a")
applies a particular property to the atoms as an atom property list
std::string getAtomPropertyList(ROMol &mol, const std::string &atomPropName, std::string missingValueMarker="", unsigned int lineSize=190)
RDKIT_FILEPARSERS_EXPORT void finishMolProcessing(RWMol *res, bool chiralityPossible, const v2::FileParsers::MolFileParserParams &ps)
RDKIT_FILEPARSERS_EXPORT std::string_view strip(std::string_view orig, std::string stripChars=" \t\r\n")
void applyMolListPropsToAtoms(ROMol &mol, const std::string &prefix, const std::string missingValueMarker="n/a")
RDKIT_FILEPARSERS_EXPORT bool ParseV3000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf, bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds, bool strictParsing=true, bool expectMEND=true)
void processMolPropertyLists(ROMol &mol, const std::string &missingValueMarker="n/a")
RDKIT_FILEPARSERS_EXPORT bool ParseV2000CTAB(std::istream *inStream, unsigned int &line, RWMol *mol, Conformer *&conf, bool &chiralityPossible, unsigned int &nAtoms, unsigned int &nBonds, bool strictParsing=true)
static const std::string atomPropPrefix
RDKIT_FILEPARSERS_EXPORT std::string getV3000Line(std::istream *inStream, unsigned int &line)
RDKIT_FILEPARSERS_EXPORT unsigned int toUnsigned(const std::string &input, bool acceptSpaces=true)
void createAtomBoolPropertyList(ROMol &mol, const std::string &atomPropName, const std::string &missingValueMarker="", unsigned int lineSize=190)
Std stuff.
bool rdvalue_is(const RDValue_cast_t)