RDKit
Open-source cheminformatics and machine learning.
Loading...
Searching...
No Matches
MaxMinPicker.h
Go to the documentation of this file.
1//
2// Copyright (C) 2003-2007 Greg Landrum and Rational Discovery LLC
3// Copyright (C) 2017 Greg Landrum and NextMove Software
4//
5// @@ All Rights Reserved @@
6// This file is part of the RDKit.
7// The contents are covered by the terms of the BSD license
8// which is included in the file license.txt, found at the root
9// of the RDKit source tree.
10//
11#include <RDGeneral/export.h>
12#ifndef RD_MAXMINPICKER_H
13#define RD_MAXMINPICKER_H
14
15#include <RDGeneral/types.h>
16#include <RDGeneral/utils.h>
17#include <RDGeneral/Invariant.h>
18#include <RDGeneral/RDLog.h>
20#include <cstdlib>
21#include "DistPicker.h"
22#include <boost/random.hpp>
23#include <random>
24
25namespace RDPickers {
26
27/*! \brief Implements the MaxMin algorithm for picking a subset of item from a
28 *pool
29 *
30 * This class inherits from the DistPicker and implements a specific picking
31 *strategy
32 * aimed at diversity. See documentation for "pick()" member function for the
33 *algorithm details
34 */
36 public:
37 /*! \brief Default Constructor
38 *
39 */
41
42 /*! \brief Contains the implementation for a lazy MaxMin diversity picker
43 *
44 * See the documentation for the pick() method for details about the algorithm
45 *
46 * \param func - a function (or functor) taking two unsigned ints as
47 *arguments
48 * and returning the distance (as a double) between those two
49 *elements.
50 * \param poolSize - the size of the pool to pick the items from. It is
51 *assumed that the
52 * distance matrix above contains the right number of elements;
53 *i.e.
54 * poolSize*(poolSize-1)
55 * \param pickSize - the number items to pick from pool (<= poolSize)
56 * \param firstPicks - (optional)the first items in the pick list
57 * \param seed - (optional) seed for the random number generator.
58 * If this is <0 the generator will be seeded with a
59 * random number.
60 */
61 template <typename T>
62 RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
63 unsigned int pickSize) const;
64
65 template <typename T>
66 RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
67 unsigned int pickSize,
68 const RDKit::INT_VECT &firstPicks,
69 int seed = -1) const;
70
71 template <typename T>
72 RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize,
73 unsigned int pickSize,
74 const RDKit::INT_VECT &firstPicks, int seed,
75 double &threshold) const;
76
77 /*! \brief Contains the implementation for the MaxMin diversity picker
78 *
79 * Here is how the picking algorithm works, refer to
80 * Ashton, M. et. al., Quant. Struct.-Act. Relat., 21 (2002), 598-604
81 * for more detail:
82 *
83 * A subset of k items is to be selected from a pool containing N molecules.
84 * Then the MaxMin method is as follows:
85 * -# Initialise Subset with some appropriately chosen seed
86 * compound and set x = 1.
87 * -# For each of the N-x remaining compounds in Dataset
88 * calculate its dissimilarity with each of the x compounds in
89 * Subset and retain the smallest of these x dissimilarities for
90 * each compound in Dataset.
91 * -# Select the molecule from Dataset with the largest value
92 * for the smallest dissimilarity calculated in Step 2 and
93 * transfer it to Subset.
94 * -# Set x = x + 1 and return to Step 2 if x < k.
95 *
96 *
97 *
98 * \param distMat - distance matrix - a vector of double. It is assumed that
99 *only the
100 * lower triangle element of the matrix are supplied in a 1D
101 *array\n
102 * \param poolSize - the size of the pool to pick the items from. It is
103 *assumed that the
104 * distance matrix above contains the right number of elements;
105 *i.e.
106 * poolSize*(poolSize-1) \n
107 * \param pickSize - the number items to pick from pool (<= poolSize)
108 * \param firstPicks - indices of the items used to seed the pick set.
109 * \param seed - (optional) seed for the random number generator
110 * If this is <0 the generator will be seeded with a
111 * random number.
112 */
113 RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
114 unsigned int pickSize, RDKit::INT_VECT firstPicks,
115 int seed = -1) const {
116 CHECK_INVARIANT(distMat, "Invalid Distance Matrix");
117 if (!poolSize) {
118 throw ValueErrorException("empty pool to pick from");
119 }
120 if (poolSize < pickSize) {
121 throw ValueErrorException("pickSize cannot be larger than the poolSize");
122 }
123 distmatFunctor functor(distMat);
124 return this->lazyPick(functor, poolSize, pickSize, firstPicks, seed);
125 }
126
127 /*! \overload */
128 RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize,
129 unsigned int pickSize) const override {
131 return pick(distMat, poolSize, pickSize, iv);
132 }
133};
134
136 double dist_bound; // distance to closest reference
137 unsigned int picks; // number of references considered
138 unsigned int next; // singly linked list of candidates
139};
140
141// we implement this here in order to allow arbitrary functors without link
142// errors
143template <typename T>
144RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
145 unsigned int pickSize,
146 const RDKit::INT_VECT &firstPicks,
147 int seed, double &threshold) const {
148 if (!poolSize) {
149 throw ValueErrorException("empty pool to pick from");
150 }
151
152 if (poolSize < pickSize) {
153 throw ValueErrorException("pickSize cannot be larger than the poolSize");
154 }
155
156 RDKit::INT_VECT picks;
157
158 unsigned int memsize = (unsigned int)(poolSize * sizeof(MaxMinPickInfo));
159 MaxMinPickInfo *pinfo = new MaxMinPickInfo[memsize];
160 memset(pinfo, 0, memsize);
161
162 picks.reserve(pickSize);
163 unsigned int picked = 0; // picks.size()
164 unsigned int pick = 0;
165
166 // pick the first entry
167 if (firstPicks.empty()) {
168 // get a seeded random number generator:
169 typedef boost::mt19937 rng_type;
170 typedef boost::uniform_int<> distrib_type;
171 typedef boost::variate_generator<rng_type &, distrib_type> source_type;
172 rng_type generator;
173 distrib_type dist(0, poolSize - 1);
174 if (seed >= 0) {
175 generator.seed(static_cast<rng_type::result_type>(seed));
176 } else {
177 generator.seed(std::random_device()());
178 }
179 source_type randomSource(generator, dist);
180 pick = randomSource();
181 // add the pick to the picks
182 picks.push_back(pick);
183 // and remove it from the pool
184 pinfo[pick].picks = 1;
185 picked = 1;
186
187 } else {
188 for (RDKit::INT_VECT::const_iterator pIdx = firstPicks.begin();
189 pIdx != firstPicks.end(); ++pIdx) {
190 pick = static_cast<unsigned int>(*pIdx);
191 if (pick >= poolSize) {
192 delete[] pinfo;
193 throw ValueErrorException("pick index was larger than the poolSize");
194 }
195 picks.push_back(pick);
196 pinfo[pick].picks = 1;
197 picked++;
198 }
199 }
200
201 if (picked >= pickSize) {
202 threshold = -1.0;
203 delete[] pinfo;
204 return picks;
205 }
206
207 unsigned int pool_list = 0;
208 unsigned int *prev = &pool_list;
209 // enter the pool into a list so that we can pick out of it easily
210 for (unsigned int i = 0; i < poolSize; i++) {
211 if (pinfo[i].picks == 0) {
212 *prev = i;
213 prev = &pinfo[i].next;
214 }
215 }
216 *prev = 0;
217
218 unsigned int poolIdx;
219 unsigned int pickIdx;
220
221 // First pass initialize dist_bound
222 prev = &pool_list;
223 pickIdx = picks[0];
224 do {
225 poolIdx = *prev;
226 pinfo[poolIdx].dist_bound = func(poolIdx, pickIdx);
227 pinfo[poolIdx].picks = 1;
228 prev = &pinfo[poolIdx].next;
229 } while (*prev != 0);
230
231 // now pick 1 compound at a time
232 double maxOFmin = -1.0;
233 double tmpThreshold = -1.0;
234 while (picked < pickSize) {
235 unsigned int *pick_prev = nullptr;
236 maxOFmin = -1.0;
237 prev = &pool_list;
238 do {
239 poolIdx = *prev;
240 double minTOi = pinfo[poolIdx].dist_bound;
241 if (minTOi > maxOFmin) {
242 unsigned int pi = pinfo[poolIdx].picks;
243 while (pi < picked) {
244 unsigned int picki = picks[pi];
245 CHECK_INVARIANT(poolIdx != picki, "pool index != pick index");
246 double dist = func(poolIdx, picki);
247 pi++;
248 if (dist <= minTOi) {
249 minTOi = dist;
250 if (minTOi <= maxOFmin) {
251 break;
252 }
253 }
254 }
255 pinfo[poolIdx].dist_bound = minTOi;
256 pinfo[poolIdx].picks = pi;
257 if (minTOi > maxOFmin) {
258 maxOFmin = minTOi;
259 pick_prev = prev;
260 pick = poolIdx;
261 }
262 }
263 prev = &pinfo[poolIdx].next;
264 } while (*prev != 0);
265
266 // if the current distance is closer then threshold, we're done
267 if (maxOFmin <= threshold && threshold >= 0.0) {
268 break;
269 }
270 tmpThreshold = maxOFmin;
271 // now add the new pick to picks and remove it from the pool
272 *pick_prev = pinfo[pick].next;
273 picks.push_back(pick);
274 picked++;
275 }
276
277 threshold = tmpThreshold;
278 delete[] pinfo;
279 return picks;
280}
281
282template <typename T>
283RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
284 unsigned int pickSize,
285 const RDKit::INT_VECT &firstPicks,
286 int seed) const {
287 double threshold = -1.0;
288 return MaxMinPicker::lazyPick(func, poolSize, pickSize, firstPicks, seed,
289 threshold);
290}
291
292template <typename T>
293RDKit::INT_VECT MaxMinPicker::lazyPick(T &func, unsigned int poolSize,
294 unsigned int pickSize) const {
295 RDKit::INT_VECT firstPicks;
296 double threshold = -1.0;
297 int seed = -1;
298 return MaxMinPicker::lazyPick(func, poolSize, pickSize, firstPicks, seed,
299 threshold);
300}
301}; // namespace RDPickers
302
303#endif
#define CHECK_INVARIANT(expr, mess)
Definition Invariant.h:101
Abstract base class to do perform item picking (typically molecules) using a distance matrix.
Definition DistPicker.h:46
Implements the MaxMin algorithm for picking a subset of item from a pool.
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize, RDKit::INT_VECT firstPicks, int seed=-1) const
Contains the implementation for the MaxMin diversity picker.
RDKit::INT_VECT lazyPick(T &func, unsigned int poolSize, unsigned int pickSize) const
Contains the implementation for a lazy MaxMin diversity picker.
MaxMinPicker()
Default Constructor.
RDKit::INT_VECT pick(const double *distMat, unsigned int poolSize, unsigned int pickSize) const override
Class to allow us to throw a ValueError from C++ and have it make it back to Python.
Definition Exceptions.h:40
#define RDKIT_SIMDIVPICKERS_EXPORT
Definition export.h:489
std::vector< int > INT_VECT
Definition types.h:289