RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
GeneralFileReader.h
Go to the documentation of this file.
1//
2// Copyright (C) 2020 Shrey Aryan
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#ifndef GENERAL_FILE_READER_H
11#define GENERAL_FILE_READER_H
13#include <RDStreams/streams.h>
14
15#include <boost/algorithm/string.hpp>
16#include <iostream>
17#include <memory>
18#include <string>
19#include <vector>
20
21#include "MolSupplier.h"
24
25namespace RDKit {
26namespace FileParsers = v2::FileParsers;
27namespace GeneralMolSupplier {
//! Options read by getSupplier() when it configures the supplier it creates.
/*!
   Every field has a default, so a default-constructed SupplierOptions can be
   passed directly.
*/
struct SupplierOptions {
  bool takeOwnership = true;
  //! forwarded to the parser parameters of every supported format
  bool sanitize = true;
  //! forwarded to the sdf and mae parser parameters
  bool removeHs = true;
  //! forwarded to the sdf parser parameters
  bool strictParsing = true;

  //! settings for the delimited-text formats (smi, csv, txt, tsv)
  std::string delimiter = "\t";
  int smilesColumn = 0;
  int nameColumn = 1;
  bool titleLine = true;

  //! settings for the tdt format
  std::string nameRecord = "";
  int confId2D = -1;
  int confId3D = 0;

  //! requested number of writer threads; only read by getSupplier() when the
  //! RDKit is built with RDK_BUILD_THREADSAFE_SSS, where values greater than
  //! one (after getNumThreadsToUse()) select a multithreaded supplier
  unsigned int numWriterThreads = 0;
};
//! currently supported file formats (matched as case-insensitive file
//! extensions by determineFormat())
const std::vector<std::string> supportedFileFormats{
    "sdf", "mae", "maegz", "sdfgz", "smi", "csv", "txt", "tsv", "tdt"};
//! currently supported compression formats
const std::vector<std::string> supportedCompressionFormats{"gz"};
50
51//! given file path determines the file and compression format
52//! returns true on success, otherwise false
53//! Note: Error handeling is done in the getSupplier method
54
55inline void determineFormat(const std::string path, std::string &fileFormat,
56 std::string &compressionFormat) {
57 //! filename without compression format
58 std::string basename;
59 //! Special case maegz.
60 //! NOTE: also supporting case-insensitive filesystems
61 if (boost::algorithm::iends_with(path, ".maegz")) {
62 fileFormat = "mae";
63 compressionFormat = "gz";
64 return;
65 } else if (boost::algorithm::iends_with(path, ".sdfgz")) {
66 fileFormat = "sdf";
67 compressionFormat = "gz";
68 return;
69 } else if (boost::algorithm::iends_with(path, ".gz")) {
70 compressionFormat = "gz";
71 basename = path.substr(0, path.size() - 3);
72 } else if (boost::algorithm::iends_with(path, ".zst") ||
73 boost::algorithm::iends_with(path, ".bz2") ||
74 boost::algorithm::iends_with(path, ".7z")) {
75 throw BadFileException(
76 "Unsupported compression extension (.zst, .bz2, .7z) given path: " +
77 path);
78 } else {
79 basename = path;
81 }
82 for (auto const &suffix : supportedFileFormats) {
83 if (boost::algorithm::iends_with(basename, "." + suffix)) {
85 return;
86 }
87 }
88 throw BadFileException(
89 "Unsupported structure or compression extension given path: " + path);
90}
91
92//! returns a new MolSupplier object based on the file name instantiated
93//! with the relevant options provided in the SupplierOptions struct
94/*!
95 <b>Note:</b>
96 - the caller owns the memory and therefore the pointer must be deleted
97*/
98
99inline std::unique_ptr<FileParsers::MolSupplier> getSupplier(
100 const std::string &path, const struct SupplierOptions &opt) {
101 std::string fileFormat = "";
102 std::string compressionFormat = "";
103 //! get the file and compression format form the path
105
106 std::istream *strm;
107 if (compressionFormat.empty()) {
108 strm = new std::ifstream(path.c_str(), std::ios::in | std::ios::binary);
109 } else {
110#ifdef RDK_USE_BOOST_IOSTREAMS
111 strm = new gzstream(path);
112#else
113 throw BadFileException(
114 "compressed files are only supported if the RDKit is built with boost::iostreams support");
115#endif
116 }
117
118 if ((!(*strm)) || strm->bad()) {
119 std::ostringstream errout;
120 errout << "Bad input file " << path;
121 delete strm;
122 throw BadFileException(errout.str());
123 }
124 strm->peek();
125 if (strm->bad() || strm->eof()) {
126 std::ostringstream errout;
127 errout << "Invalid input file " << path;
128 delete strm;
129 throw BadFileException(errout.str());
130 }
131
132#ifdef RDK_BUILD_THREADSAFE_SSS
133 FileParsers::MultithreadedMolSupplier::Parameters params;
134 params.numWriterThreads = getNumThreadsToUse(opt.numWriterThreads);
135#endif
136 //! Dispatch to the appropriate supplier
137 if (fileFormat == "sdf") {
139 parseParams.sanitize = opt.sanitize;
140 parseParams.removeHs = opt.removeHs;
141 parseParams.strictParsing = opt.strictParsing;
142#ifdef RDK_BUILD_THREADSAFE_SSS
143 if (params.numWriterThreads > 1) {
144 return std::make_unique<FileParsers::MultithreadedSDMolSupplier>(
145 strm, true, params, parseParams);
146 }
147#endif
148 return std::make_unique<FileParsers::ForwardSDMolSupplier>(strm, true,
150 }
151
152 else if (fileFormat == "smi" || fileFormat == "csv" || fileFormat == "txt" ||
153 fileFormat == "tsv") {
155 parseParams.delimiter = opt.delimiter;
156 parseParams.smilesColumn = opt.smilesColumn;
157 parseParams.nameColumn = opt.nameColumn;
158 parseParams.titleLine = opt.titleLine;
159 parseParams.parseParameters.sanitize = opt.sanitize;
160#ifdef RDK_BUILD_THREADSAFE_SSS
161 if (params.numWriterThreads > 1) {
162 return std::make_unique<FileParsers::MultithreadedSmilesMolSupplier>(
163 strm, true, params, parseParams);
164 }
165#endif
166 return std::make_unique<FileParsers::SmilesMolSupplier>(strm, true,
168 }
169#ifdef RDK_BUILD_MAEPARSER_SUPPORT
170 else if (fileFormat == "mae") {
171 FileParsers::MaeMolSupplierParams parseParams;
172 parseParams.sanitize = opt.sanitize;
173 parseParams.removeHs = opt.removeHs;
174 return std::make_unique<FileParsers::MaeMolSupplier>(strm, true,
176 }
177#endif
178 else if (fileFormat == "tdt") {
180 parseParams.nameRecord = opt.nameRecord;
181 parseParams.confId2D = opt.confId2D;
182 parseParams.confId3D = opt.confId3D;
183 parseParams.parseParameters.sanitize = opt.sanitize;
184 return std::make_unique<FileParsers::TDTMolSupplier>(strm, true,
186 }
187 throw BadFileException("Unsupported file format: " + fileFormat);
188}
189
190} // namespace GeneralMolSupplier
191} // namespace RDKit
192#endif
used by various file parsing classes to indicate a bad file
const std::vector< std::string > supportedCompressionFormats
current supported compression formats
const std::vector< std::string > supportedFileFormats
current supported file formats
void determineFormat(const std::string path, std::string &fileFormat, std::string &compressionFormat)
std::unique_ptr< FileParsers::MolSupplier > getSupplier(const std::string &path, const struct SupplierOptions &opt)
Std stuff.
bool rdvalue_is(const RDValue_cast_t)
unsigned int getNumThreadsToUse(int target)
Definition RDThreads.h:37