RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
SparseIntVect.h
Go to the documentation of this file.
1//
2// Copyright (C) 2007-2024 Greg Landrum and other RDKit contributors
3//
4// @@ All Rights Reserved @@
5// This file is part of the RDKit.
6// The contents are covered by the terms of the BSD license
7// which is included in the file license.txt, found at the root
8// of the RDKit source tree.
9//
10#include <RDGeneral/export.h>
11#ifndef __RD_SPARSE_INT_VECT_20070921__
12#define __RD_SPARSE_INT_VECT_20070921__
13
14#include <map>
15#include <string>
16#include <RDGeneral/Invariant.h>
17#include <sstream>
19#include <RDGeneral/StreamOps.h>
20#include <cstdint>
21#include <limits>
22
24 0x0001; //!< version number to use in pickles
25namespace RDKit {
26//! a class for efficiently storing sparse vectors of ints
27template <typename IndexType>
29 public:
30 typedef std::map<IndexType, int> StorageType;
31
32 SparseIntVect() : d_length(0) {}
33
34 //! initialize with a particular length
35 SparseIntVect(IndexType length) : d_length(length) {}
36
37 //! Copy constructor
39 d_length = other.d_length;
40 d_data.clear();
41 d_data.insert(other.d_data.begin(), other.d_data.end());
42 }
43
44 //! constructor from a pickle
45 SparseIntVect(const std::string &pkl) {
46 initFromText(pkl.c_str(), pkl.size());
47 }
48 //! constructor from a pickle
49 SparseIntVect(const char *pkl, const unsigned int len) {
50 initFromText(pkl, len);
51 }
52
54 if (this == &other) {
55 return *this;
56 }
57 d_length = other.d_length;
58 d_data.clear();
59 d_data.insert(other.d_data.begin(), other.d_data.end());
60 return *this;
61 }
62
63 //! destructor (doesn't need to do anything)
64 ~SparseIntVect() = default;
65
66#ifdef __clang__
67#pragma clang diagnostic push
68#pragma clang diagnostic ignored "-Wtautological-compare"
69#elif (defined(__GNUC__) || defined(__GNUG__)) && \
70 (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 1))
71#if (__GNUC__ > 4 || __GNUC_MINOR__ > 5)
72#pragma GCC diagnostic push
73#endif
74#pragma GCC diagnostic ignored "-Wtype-limits"
75#endif
76 //! return the value at an index
77 int getVal(IndexType idx) const {
78 if (!checkIndex(idx)) {
79 throw IndexErrorException(static_cast<int>(idx));
80 }
81 int res = 0;
82 typename StorageType::const_iterator iter = d_data.find(idx);
83 if (iter != d_data.end()) {
84 res = iter->second;
85 }
86 return res;
87 }
88
89 //! set the value at an index
90 void setVal(IndexType idx, int val) {
91 if (!checkIndex(idx)) {
92 throw IndexErrorException(static_cast<int>(idx));
93 }
94 if (val != 0) {
95 d_data[idx] = val;
96 } else {
97 d_data.erase(idx);
98 }
99 }
100#ifdef __clang__
101#pragma clang diagnostic pop
102#elif (defined(__GNUC__) || defined(__GNUG__)) && \
103 (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ > 5))
104#pragma GCC diagnostic pop
105#endif
106 //! support indexing using []
107 int operator[](IndexType idx) const { return getVal(idx); }
108
109 //! returns the length
110 IndexType getLength() const { return d_length; }
111
112 //! returns the sum of all the elements in the vect
113 //! the doAbs argument toggles summing the absolute values of the elements
114 int getTotalVal(bool doAbs = false) const {
115 int res = 0;
116 typename StorageType::const_iterator iter;
117 for (iter = d_data.begin(); iter != d_data.end(); ++iter) {
118 if (!doAbs) {
119 res += iter->second;
120 } else {
121 res += abs(iter->second);
122 }
123 }
124 return res;
125 }
126 //! returns the length
127 unsigned int size() const { return getLength(); }
128
129 //! returns our nonzero elements as a map(IndexType->int)
130 const StorageType &getNonzeroElements() const { return d_data; }
131
132 //! this is a "fuzzy" intesection, the final value
133 //! of each element is equal to the minimum from
134 //! the two vects.
136 if (other.d_length != d_length) {
137 throw ValueErrorException("SparseIntVect size mismatch");
138 }
139
140 typename StorageType::iterator iter = d_data.begin();
141 typename StorageType::const_iterator oIter = other.d_data.begin();
142 while (iter != d_data.end()) {
143 // we're relying on the fact that the maps are sorted:
144 while (oIter != other.d_data.end() && oIter->first < iter->first) {
145 ++oIter;
146 }
147 if (oIter != other.d_data.end() && oIter->first == iter->first) {
148 // found it:
149 if (oIter->second < iter->second) {
150 iter->second = oIter->second;
151 }
152 ++oIter;
153 ++iter;
154 } else {
155 // not there; our value is zero, which means
156 // we should remove this value:
157 typename StorageType::iterator tmpIter = iter;
158 ++tmpIter;
159 d_data.erase(iter);
160 iter = tmpIter;
161 }
162 }
163 return *this;
164 }
166 const SparseIntVect<IndexType> &other) const {
168 return res &= other;
169 }
170
171 //! this is a "fuzzy" union, the final value
172 //! of each element is equal to the maximum from
173 //! the two vects.
175 if (other.d_length != d_length) {
176 throw ValueErrorException("SparseIntVect size mismatch");
177 }
178
179 typename StorageType::iterator iter = d_data.begin();
180 typename StorageType::const_iterator oIter = other.d_data.begin();
181 while (iter != d_data.end()) {
182 // we're relying on the fact that the maps are sorted:
183 while (oIter != other.d_data.end() && oIter->first < iter->first) {
184 d_data[oIter->first] = oIter->second;
185 ++oIter;
186 }
187 if (oIter != other.d_data.end() && oIter->first == iter->first) {
188 // found it:
189 if (oIter->second > iter->second) {
190 iter->second = oIter->second;
191 }
192 ++oIter;
193 }
194 ++iter;
195 }
196 // finish up the other vect:
197 while (oIter != other.d_data.end()) {
198 d_data[oIter->first] = oIter->second;
199 ++oIter;
200 }
201 return *this;
202 }
204 const SparseIntVect<IndexType> &other) const {
206 return res |= other;
207 }
208
210 if (other.d_length != d_length) {
211 throw ValueErrorException("SparseIntVect size mismatch");
212 }
213 typename StorageType::iterator iter = d_data.begin();
214 typename StorageType::const_iterator oIter = other.d_data.begin();
215 while (oIter != other.d_data.end()) {
216 while (iter != d_data.end() && iter->first < oIter->first) {
217 ++iter;
218 }
219 if (iter != d_data.end() && oIter->first == iter->first) {
220 // found it:
221 iter->second += oIter->second;
222 if (!iter->second) {
223 typename StorageType::iterator tIter = iter;
224 ++tIter;
225 d_data.erase(iter);
226 iter = tIter;
227 } else {
228 ++iter;
229 }
230 } else {
231 d_data[oIter->first] = oIter->second;
232 }
233 ++oIter;
234 }
235 return *this;
236 }
238 const SparseIntVect<IndexType> &other) const {
240 return res += other;
241 }
242
244 if (other.d_length != d_length) {
245 throw ValueErrorException("SparseIntVect size mismatch");
246 }
247 typename StorageType::iterator iter = d_data.begin();
248 typename StorageType::const_iterator oIter = other.d_data.begin();
249 while (oIter != other.d_data.end()) {
250 while (iter != d_data.end() && iter->first < oIter->first) {
251 ++iter;
252 }
253 if (iter != d_data.end() && oIter->first == iter->first) {
254 // found it:
255 iter->second -= oIter->second;
256 if (!iter->second) {
257 typename StorageType::iterator tIter = iter;
258 ++tIter;
259 d_data.erase(iter);
260 iter = tIter;
261 } else {
262 ++iter;
263 }
264 } else {
265 d_data[oIter->first] = -oIter->second;
266 }
267 ++oIter;
268 }
269 return *this;
270 }
272 const SparseIntVect<IndexType> &other) const {
274 return res -= other;
275 }
277 typename StorageType::iterator iter = d_data.begin();
278 while (iter != d_data.end()) {
279 iter->second *= v;
280 ++iter;
281 }
282 return *this;
283 }
286 return res *= v;
287 }
289 typename StorageType::iterator iter = d_data.begin();
290 while (iter != d_data.end()) {
291 iter->second /= v;
292 ++iter;
293 }
294 return *this;
295 }
298 return res /= v;
299 }
301 typename StorageType::iterator iter = d_data.begin();
302 while (iter != d_data.end()) {
303 iter->second += v;
304 ++iter;
305 }
306 return *this;
307 }
310 return res += v;
311 }
313 typename StorageType::iterator iter = d_data.begin();
314 while (iter != d_data.end()) {
315 iter->second -= v;
316 ++iter;
317 }
318 return *this;
319 }
322 return res -= v;
323 }
324
325 bool operator==(const SparseIntVect<IndexType> &v2) const {
326 if (d_length != v2.d_length) {
327 return false;
328 }
329 return d_data == v2.d_data;
330 }
331 bool operator!=(const SparseIntVect<IndexType> &v2) const {
332 return !(*this == v2);
333 }
334
335 //! returns a binary string representation (pickle)
336 std::string toString() const {
337 std::stringstream ss(std::ios_base::binary | std::ios_base::out |
338 std::ios_base::in);
339 std::uint32_t tInt;
342 tInt = sizeof(IndexType);
344 streamWrite(ss, d_length);
345 IndexType nEntries = d_data.size();
347
348 typename StorageType::const_iterator iter = d_data.begin();
349 while (iter != d_data.end()) {
350 streamWrite(ss, iter->first);
351 std::int32_t tInt = iter->second;
353 ++iter;
354 }
355 return ss.str();
356 }
357
358 void fromString(const std::string &txt) {
359 initFromText(txt.c_str(), txt.length());
360 }
361
362 private:
363 IndexType d_length;
364 StorageType d_data;
365
366 void initFromText(const char *pkl, const unsigned int len) {
367 d_data.clear();
368 std::stringstream ss(std::ios_base::binary | std::ios_base::out |
369 std::ios_base::in);
370 ss.write(pkl, len);
371
372 std::uint32_t vers;
374 if (vers == 0x0001) {
375 std::uint32_t tInt;
377 if (tInt > sizeof(IndexType)) {
379 "IndexType cannot accommodate index size in SparseIntVect pickle");
380 }
381 switch (tInt) {
382 case sizeof(char):
383 readVals<unsigned char>(ss);
384 break;
385 case sizeof(std::int32_t):
386 readVals<std::uint32_t>(ss);
387 break;
388 case sizeof(boost::int64_t):
389 readVals<boost::uint64_t>(ss);
390 break;
391 default:
392 throw ValueErrorException("unreadable format");
393 }
394 } else {
395 throw ValueErrorException("bad version in SparseIntVect pickle");
396 }
397 }
398 template <typename T>
399 void readVals(std::stringstream &ss) {
400 PRECONDITION(sizeof(T) <= sizeof(IndexType), "invalid size");
401 T tVal;
403 d_length = tVal;
404 T nEntries;
406 for (T i = 0; i < nEntries; ++i) {
408 std::int32_t val;
409 streamRead(ss, val);
410 d_data[tVal] = val;
411 }
412 }
413 bool checkIndex(IndexType idx) const {
414 if (idx < 0 || idx > d_length ||
415 (idx == d_length && d_length < std::numeric_limits<IndexType>::max())) {
416 return false;
417 }
418 return true;
419 }
420};
421
422template <typename IndexType, typename SequenceType>
424 const SequenceType &seq) {
425 typename SequenceType::const_iterator seqIt;
426 for (seqIt = seq.begin(); seqIt != seq.end(); ++seqIt) {
427 // EFF: probably not the most efficient approach
428 IndexType idx = *seqIt;
429 vect.setVal(idx, vect.getVal(idx) + 1);
430 }
431}
432
433namespace {
434template <typename IndexType>
436 const SparseIntVect<IndexType> &v2, double &v1Sum,
437 double &v2Sum, double &andSum) {
438 if (v1.getLength() != v2.getLength()) {
439 throw ValueErrorException("SparseIntVect size mismatch");
440 }
441 v1Sum = v2Sum = andSum = 0.0;
442 // we're doing : (v1&v2).getTotalVal(), but w/o generating
443 // the other vector:
444 typename SparseIntVect<IndexType>::StorageType::const_iterator iter1, iter2;
445 iter1 = v1.getNonzeroElements().begin();
446 if (iter1 != v1.getNonzeroElements().end()) {
447 v1Sum += abs(iter1->second);
448 }
449 iter2 = v2.getNonzeroElements().begin();
450 if (iter2 != v2.getNonzeroElements().end()) {
451 v2Sum += abs(iter2->second);
452 }
453 while (iter1 != v1.getNonzeroElements().end()) {
454 while (iter2 != v2.getNonzeroElements().end() &&
455 iter2->first < iter1->first) {
456 ++iter2;
457 if (iter2 != v2.getNonzeroElements().end()) {
458 v2Sum += abs(iter2->second);
459 }
460 }
461 if (iter2 != v2.getNonzeroElements().end()) {
462 if (iter2->first == iter1->first) {
463 if (abs(iter2->second) < abs(iter1->second)) {
464 andSum += abs(iter2->second);
465 } else {
466 andSum += abs(iter1->second);
467 }
468 ++iter2;
469 if (iter2 != v2.getNonzeroElements().end()) {
470 v2Sum += abs(iter2->second);
471 }
472 }
473 ++iter1;
474 if (iter1 != v1.getNonzeroElements().end()) {
475 v1Sum += abs(iter1->second);
476 }
477 } else {
478 break;
479 }
480 }
481 if (iter1 != v1.getNonzeroElements().end()) {
482 ++iter1;
483 while (iter1 != v1.getNonzeroElements().end()) {
484 v1Sum += abs(iter1->second);
485 ++iter1;
486 }
487 }
488 if (iter2 != v2.getNonzeroElements().end()) {
489 ++iter2;
490 while (iter2 != v2.getNonzeroElements().end()) {
491 v2Sum += abs(iter2->second);
492 ++iter2;
493 }
494 }
495}
496} // namespace
497
498template <typename IndexType>
500 const SparseIntVect<IndexType> &v2,
501 bool returnDistance = false, double bounds = 0.0) {
502 if (v1.getLength() != v2.getLength()) {
503 throw ValueErrorException("SparseIntVect size mismatch");
504 }
505 double v1Sum = 0.0;
506 double v2Sum = 0.0;
507 if (!returnDistance && bounds > 0.0) {
508 v1Sum = v1.getTotalVal(true);
509 v2Sum = v2.getTotalVal(true);
510 double denom = v1Sum + v2Sum;
511 if (fabs(denom) < 1e-6) {
512 // no need to worry about returnDistance here
513 return 0.0;
514 }
515 double minV = v1Sum < v2Sum ? v1Sum : v2Sum;
516 if (2. * minV / denom < bounds) {
517 return 0.0;
518 }
519 v1Sum = 0.0;
520 v2Sum = 0.0;
521 }
522
523 double numer = 0.0;
524
525 calcVectParams(v1, v2, v1Sum, v2Sum, numer);
526
527 double denom = v1Sum + v2Sum;
528 double sim;
529 if (fabs(denom) < 1e-6) {
530 sim = 0.0;
531 } else {
532 sim = 2. * numer / denom;
533 }
534 if (returnDistance) {
535 sim = 1. - sim;
536 }
537 // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
538 return sim;
539}
540
541template <typename IndexType>
543 const SparseIntVect<IndexType> &v2, double a, double b,
544 bool returnDistance = false, double bounds = 0.0) {
546 if (v1.getLength() != v2.getLength()) {
547 throw ValueErrorException("SparseIntVect size mismatch");
548 }
549 double v1Sum = 0.0;
550 double v2Sum = 0.0;
551 double andSum = 0.0;
552
553 calcVectParams(v1, v2, v1Sum, v2Sum, andSum);
554
555 double denom = a * v1Sum + b * v2Sum + (1 - a - b) * andSum;
556 double sim;
557
558 if (fabs(denom) < 1e-6) {
559 sim = 0.0;
560 } else {
561 sim = andSum / denom;
562 }
563 if (returnDistance) {
564 sim = 1. - sim;
565 }
566 // std::cerr<<" "<<v1Sum<<" "<<v2Sum<<" " << numer << " " << sim <<std::endl;
567 return sim;
568}
569
570template <typename IndexType>
572 const SparseIntVect<IndexType> &v2,
573 bool returnDistance = false, double bounds = 0.0) {
574 return TverskySimilarity(v1, v2, 1.0, 1.0, returnDistance, bounds);
575}
576} // namespace RDKit
577
578#endif
#define RDUNUSED_PARAM(x)
Definition Invariant.h:196
#define PRECONDITION(expr, mess)
Definition Invariant.h:109
const int ci_SPARSEINTVECT_VERSION
version number to use in pickles
Class to allow us to throw an IndexError from C++ and have it make it back to Python.
Definition Exceptions.h:20
a class for efficiently storing sparse vectors of ints
const StorageType & getNonzeroElements() const
returns our nonzero elements as a map(IndexType->int)
SparseIntVect< IndexType > & operator&=(const SparseIntVect< IndexType > &other)
SparseIntVect(IndexType length)
initialize with a particular length
SparseIntVect< IndexType > & operator-=(int v)
unsigned int size() const
returns the length
SparseIntVect< IndexType > & operator+(int v)
SparseIntVect< IndexType > & operator-(int v)
SparseIntVect< IndexType > & operator/(int v)
bool operator==(const SparseIntVect< IndexType > &v2) const
SparseIntVect(const SparseIntVect< IndexType > &other)
Copy constructor.
~SparseIntVect()=default
destructor (doesn't need to do anything)
const SparseIntVect< IndexType > operator&(const SparseIntVect< IndexType > &other) const
SparseIntVect< IndexType > & operator+=(int v)
SparseIntVect(const char *pkl, const unsigned int len)
constructor from a pickle
int operator[](IndexType idx) const
support indexing using []
void fromString(const std::string &txt)
void setVal(IndexType idx, int val)
set the value at an index
SparseIntVect< IndexType > & operator|=(const SparseIntVect< IndexType > &other)
std::string toString() const
returns a binary string representation (pickle)
int getTotalVal(bool doAbs=false) const
SparseIntVect< IndexType > & operator-=(const SparseIntVect< IndexType > &other)
const SparseIntVect< IndexType > operator+(const SparseIntVect< IndexType > &other) const
std::map< IndexType, int > StorageType
SparseIntVect & operator=(const SparseIntVect< IndexType > &other)
bool operator!=(const SparseIntVect< IndexType > &v2) const
SparseIntVect< IndexType > & operator*=(int v)
SparseIntVect< IndexType > & operator+=(const SparseIntVect< IndexType > &other)
SparseIntVect< IndexType > & operator*(int v)
SparseIntVect(const std::string &pkl)
constructor from a pickle
SparseIntVect< IndexType > & operator/=(int v)
const SparseIntVect< IndexType > operator-(const SparseIntVect< IndexType > &other) const
IndexType getLength() const
returns the length
int getVal(IndexType idx) const
return the value at an index
const SparseIntVect< IndexType > operator|(const SparseIntVect< IndexType > &other) const
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition Exceptions.h:40
Std stuff.
bool rdvalue_is(const RDValue_cast_t)
double TverskySimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, double a, double b, bool returnDistance=false, double bounds=0.0)
void updateFromSequence(SparseIntVect< IndexType > &vect, const SequenceType &seq)
double TanimotoSimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, bool returnDistance=false, double bounds=0.0)
double DiceSimilarity(const SparseIntVect< IndexType > &v1, const SparseIntVect< IndexType > &v2, bool returnDistance=false, double bounds=0.0)
void streamRead(std::istream &ss, T &loc)
does a binary read of an object from a stream
Definition StreamOps.h:283
void streamWrite(std::ostream &ss, const T &val)
does a binary write of an object to a stream
Definition StreamOps.h:261
Definition RDLog.h:25