npstat is hosted by Hepforge, IPPP Durham
NPStat  5.10.0
buildInterpolatedHelpers.hh
1 #ifndef NPSTAT_BUILDINTERPOLATEDHELPERS_HH_
2 #define NPSTAT_BUILDINTERPOLATEDHELPERS_HH_
3 
4 //======================================================================
5 // buildInterpolatedHelpers.hh
6 //
7 // This is an internal header which is subject to change without notice.
8 // Application code should never directly use classes and functions
9 // declared or defined in this header.
10 //
11 // Author: I. Volobouev
12 //
13 // July 2015
14 //======================================================================
15 
16 #include <vector>
17 #include <utility>
18 #include <climits>
19 #include <iostream>
20 #include <ctime>
21 #include <cstdio>
22 #include <cassert>
23 
24 #include "geners/CPP11_auto_ptr.hh"
25 
26 #include "npstat/nm/AbsVisitor.hh"
27 #include "npstat/nm/BoxND.hh"
28 #include "npstat/nm/KDTree.hh"
34 #include "npstat/nm/GridAxis.hh"
35 
40 
41 namespace npstat {
42  namespace Private {
43  // Number of bandwidth values needed so that the weight of
44  // the distribution outside of this number of bandwidth values
45  // can be neglected
46  double symbetaNumSigmas(int symbetaPower, unsigned dim);
47 
48  double symbetaBandwidthInsideUnitBox(
49  int symbetaPower, double nEffToNRatio,
50  const double* coords, unsigned dim);
51 
52  double symbetaEffRatioInsideUnitBox(
53  int symbetaPower, double bandwidth,
54  const double* coords, unsigned dim);
55 
56  template <class Point>
57  class CdfWeightAssigner : public AbsVisitor<Point,double>
58  {
59  public:
60  typedef typename Point::value_type Real;
61  typedef std::pair<const Point*, double> WeightedPointPtr;
62  typedef std::vector<WeightedPointPtr> WeightedPtrVec;
63 
64  inline CdfWeightAssigner(
65  const std::vector<SampleAccumulator<Real> >& predSamples,
66  const unsigned* dimPredictors,
67  const double* coords, const unsigned nCoords,
68  const int symbetaPow)
69  : predSamples_(predSamples),
70  dimPredictors_(dimPredictors),
71  coords_(coords),
72  symbetaPow_(symbetaPow),
73  gauss_(0.0, 1.0),
74  symbeta_(0.0, 1.0, symbetaPow > 0 ? symbetaPow : 0),
75  h_(0.0),
76  wsum_(0.0L),
77  wsumsq_(0.0L),
78  nVizited_(0),
79  nZeroWeights_(0)
80  {
81  assert(nCoords == predSamples_.size());
82  }
83 
84  inline virtual ~CdfWeightAssigner() {}
85 
86  inline void setBandwidth(const double h)
87  {
88  assert(h > 0.0);
89  h_ = h;
90  }
91 
92  inline void clear()
93  {
94  ptrVec_.clear();
95  h_ = 0.0;
96  wsum_ = 0.0L;
97  wsumsq_ = 0.0L;
98  nVizited_ = 0;
99  nZeroWeights_ = 0;
100  }
101 
102  inline void process(const Point& point)
103  {
104  assert(h_ > 0.0);
105  const npstat::AbsDistribution1D& g = symbetaPow_ < 0 ?
106  dynamic_cast<const npstat::AbsDistribution1D&>(gauss_) :
107  dynamic_cast<const npstat::AbsDistribution1D&>(symbeta_);
108  const unsigned ndim = predSamples_.size();
109  double wprod = 1.0;
110  for (unsigned ipred=0; ipred<ndim; ++ipred)
111  {
112  const unsigned idim = dimPredictors_[ipred];
113  const double cdf = predSamples_[ipred].cdf(point[idim]);
114  const double w = g.density((cdf - coords_[ipred])/h_)/h_;
115  wprod *= w;
116  }
117  if (wprod > 0.0)
118  {
119  ptrVec_.emplace_back(&point, wprod);
120  wsum_ += wprod;
121  wsumsq_ += wprod*wprod;
122  }
123  else
124  ++nZeroWeights_;
125  ++nVizited_;
126  }
127 
128  inline double result()
129  {
130  if (wsumsq_ > 0.0L)
131  return wsum_*wsum_/wsumsq_;
132  else
133  return 0.0;
134  }
135 
136  inline const WeightedPtrVec& getPoints() const {return ptrVec_;}
137  inline unsigned getVisitCount() const {return nVizited_;}
138  inline unsigned getNZeroWeights() const {return nZeroWeights_;}
139 
140  private:
141  const std::vector<SampleAccumulator<Real> >& predSamples_;
142  const unsigned* dimPredictors_;
143  const double* coords_;
144  WeightedPtrVec ptrVec_;
145  int symbetaPow_;
146  Gauss1D gauss_;
147  SymmetricBeta1D symbeta_;
148  double h_;
149  long double wsum_;
150  long double wsumsq_;
151  unsigned nVizited_;
152  unsigned nZeroWeights_;
153  };
154 
155  template <typename Real>
156  inline void fillCoordBoxFromBandwidth(
157  const std::vector<SampleAccumulator<Real> >& predSamples,
158  const double* cdfValues, const unsigned nCdfValues,
159  const double h, BoxND<double>* box)
160  {
161  assert(h > 0.0);
162 
163  box->clear();
164  box->reserve(nCdfValues);
165  for (unsigned i=0; i<nCdfValues; ++i)
166  {
167  double cdfmin = cdfValues[i] - h;
168  if (cdfmin < 0.0)
169  cdfmin = 0.0;
170  double cdfmax = cdfValues[i] + h;
171  if (cdfmax > 1.0)
172  cdfmax = 1.0;
173  double xmin = predSamples[i].quantile(cdfmin);
174  double xmax = predSamples[i].quantile(cdfmax);
175  const double range = xmax - xmin;
176  if (cdfValues[i] - h < 0.0)
177  xmin -= 0.01*range;
178  if (cdfValues[i] + h > 1.0)
179  xmax += 0.01*range;
180  box->emplace_back(xmin, xmax);
181  }
182  }
183 
184  template <class Point>
185  class PointCountByCdfBandwidth : public Functor1<double,double>
186  {
187  public:
188  typedef typename Point::value_type Real;
189 
191  const KDTree<Point>& kdtree,
192  const std::vector<SampleAccumulator<Real> >& predSamples,
193  const double* cdfValues,
194  const unsigned nCdfValues)
195  : kdtree_(kdtree),
196  predSamples_(predSamples),
197  cdfValues_(cdfValues),
198  nCdfValues_(nCdfValues)
199  {
200  assert(cdfValues_);
201  assert(nCdfValues_);
202  assert(predSamples_.size() == nCdfValues_);
203  }
204 
205  inline virtual ~PointCountByCdfBandwidth() {}
206 
207  inline double operator()(const double& h) const
208  {
209  fillCoordBoxFromBandwidth(predSamples_, cdfValues_,
210  nCdfValues_, h, &box_);
211  return kdtree_.nInBox(box_);
212  }
213 
214  private:
215  const KDTree<Point>& kdtree_;
216  const std::vector<SampleAccumulator<Real> >& predSamples_;
217  const double* cdfValues_;
218  mutable BoxND<double> box_;
219  unsigned nCdfValues_;
220  };
221 
222  template <
223  class Point,
224  class ResultDistro,
225  template <class> class BuilderBase = AbsCompositeDistroBuilder
226  >
228  {
229  typedef typename BuilderBase<Point>::result_type CompDistro;
230  typedef typename BuilderBase<Point>::WeightedPtrVec WeightedPtrVec;
231 
232  inline CPP11_auto_ptr<ResultDistro> construct(
233  const std::vector<GridAxis>& predAxes,
234  const unsigned nResponseVars, const bool mode) const
235  {
236  return CPP11_auto_ptr<ResultDistro>(
237  new ResultDistro(predAxes, nResponseVars, mode));
238  }
239 
240  inline CompDistro* callBuilder(
241  const BuilderBase<Point>& builder, const unsigned long uniqueId,
242  const double* predictorCoords, const unsigned nPredictors,
243  const BoxND<double>& predictorBox, const WeightedPtrVec& data,
244  const unsigned* dimsToUse, const unsigned nDimsToUse) const
245  {
246  return builder.buildWeighted(
247  uniqueId, predictorCoords, nPredictors,
248  predictorBox, data, dimsToUse, nDimsToUse);
249  }
250  };
251 
252  template <class Point, class ResultDistro>
253  struct DistroBuilderAdapter<Point, ResultDistro, AbsDistro1DBuilder>
254  {
256  typedef typename AbsDistro1DBuilder<Point>::WeightedPtrVec WeightedPtrVec;
257 
258  inline CPP11_auto_ptr<ResultDistro> construct(
259  const std::vector<GridAxis>& predAxes,
260  const unsigned /* nResponseVars */, const bool mode) const
261  {
262  return CPP11_auto_ptr<ResultDistro>(new ResultDistro(predAxes, mode));
263  }
264 
265  inline CompDistro* callBuilder(
266  const AbsDistro1DBuilder<Point>& builder, const unsigned long uniqueId,
267  const double* predictorCoords, const unsigned nPredictors,
268  const BoxND<double>& predictorBox, const WeightedPtrVec& data,
269  const unsigned* dimsToUse, const unsigned /* nDimsToUse */) const
270  {
271  return builder.buildWeighted(
272  uniqueId, predictorCoords, nPredictors,
273  predictorBox, data, dimsToUse[0]);
274  }
275  };
276 
277  template <
278  class Point,
279  class ResultDistro,
280  template <class> class BuilderBase
281  >
282  CPP11_auto_ptr<ResultDistro> buildInterpolatedHelper(
283  const std::vector<Point>& data,
284  const unsigned* dimPredictors, const unsigned nPredictors,
285  const std::string* predictorNames,
286  const unsigned* predictorNumBins, const int predictorSymbetaPower,
287  const double effectiveEventsPerBin, const bool stretchPredKernels,
288  const unsigned* dimResponses, const unsigned nResponseVars,
289  const BuilderBase<Point>& builder,
290  const bool interpolateSwitch, const unsigned reportFrequency)
291  {
292  typedef typename Point::value_type Real;
293  typedef typename BuilderBase<Point>::result_type CompDistro;
294  const unsigned dim_size = PointDimensionality<Point>::dim_size;
295 
296  // Some hardwired parameters
297  const unsigned maxdim = 32;
298  const double bandwidthTol = 1.0e-7;
299  const double initScaleStep = 0.02;
300 
301  // Check that the input arguments are reasonable
302  const unsigned long dataLen = data.size();
303  if (!dataLen) throw std::invalid_argument(
304  "In npstat::Private::buildInterpolatedHelper: no data points");
305  if (!nPredictors) throw std::invalid_argument(
306  "In npstat::Private::buildInterpolatedHelper: no predictors");
307  if (nPredictors >= maxdim) throw std::invalid_argument(
308  "In npstat::Private::buildInterpolatedHelper: "
309  "too many predictors");
310  if (nResponseVars >= maxdim) throw std::invalid_argument(
311  "In npstat::Private::buildInterpolatedHelper: "
312  "too many response variables");
313  if (effectiveEventsPerBin >= static_cast<double>(dataLen))
314  throw std::invalid_argument(
315  "In npstat::Private::buildInterpolatedHelper: "
316  "too many effective events requested per bin");
317 
318  assert(dimPredictors);
319  assert(predictorNumBins);
320  assert(dimResponses);
321 
322  for (unsigned i=0; i<nPredictors; ++i)
323  if (!predictorNumBins[i]) throw std::invalid_argument(
324  "In npstat::Private::buildInterpolatedHelper: "
325  "zero predictor bins in one of the dimensions");
326 
327  // Check correctness of the dimension numbers
328  {
329  unsigned dims[2*maxdim];
330  for (unsigned i=0; i<nPredictors; ++i)
331  dims[i] = dimPredictors[i];
332  for (unsigned i=0; i<nResponseVars; ++i)
333  dims[i+nPredictors] = dimResponses[i];
334  unsigned dmax = 0;
335  const unsigned ndims = nPredictors + nResponseVars;
336  for (unsigned i=0; i<ndims; ++i)
337  if (dims[i] > dmax)
338  dmax = dims[i];
339  if (dmax >= dim_size) throw std::invalid_argument(
340  "In npstat::Private::buildInterpolatedHelper: "
341  "dimension number out of range");
342  if (!areAllElementsUnique(&dims[0], &dims[0]+ndims))
343  throw std::invalid_argument(
344  "In npstat::Private::buildInterpolatedHelper: "
345  "duplicate dimensions detected");
346  }
347 
348  // We will need empirical quantile values for all predictors
349  std::vector<SampleAccumulator<Real> > predSamples(nPredictors);
350  for (unsigned ipred=0; ipred<nPredictors; ++ipred)
351  {
352  SampleAccumulator<Real>& acc(predSamples[ipred]);
353  acc.reserve(dataLen);
354  const unsigned idim = dimPredictors[ipred];
355  for (unsigned long ipt=0; ipt<dataLen; ++ipt)
356  acc.accumulate(data[ipt][idim]);
357  }
358 
359  // Grid axes for the GridInterpolatedDistribution
360  std::vector<GridAxis> predAxes;
361  predAxes.reserve(nPredictors);
362  {
363  std::vector<double> gridCoords;
364  for (unsigned ipred=0; ipred<nPredictors; ++ipred)
365  {
366  const unsigned nbins = predictorNumBins[ipred];
367  gridCoords.clear();
368  gridCoords.reserve(nbins);
369  for (unsigned ibin=0; ibin<nbins; ++ibin)
370  {
371  const double q = (ibin + 0.5)/nbins;
372  const double x = predSamples[ipred].quantile(q);
373  gridCoords.push_back(x);
374  }
375  if (predictorNames)
376  predAxes.emplace_back(
377  gridCoords, predictorNames[ipred].c_str());
378  else
379  predAxes.emplace_back(gridCoords);
380  }
381  }
382 
383  // We can now initialize the resulting distribution
384  DistroBuilderAdapter<Point, ResultDistro, BuilderBase> adap;
385  CPP11_auto_ptr<ResultDistro> distro = adap.construct(
386  predAxes, nResponseVars, interpolateSwitch);
387 
388  // To get the requested number of effective points we will need
389  // the tree of data points
390  const KDTree<Point> kdtree(data, dimPredictors, nPredictors);
391 
392  // See what kind of cdf bandwidth one would need to use in order
393  // to obtain the requested effective number of points in the center
394  // of the predictor distribution (in the assumption of uniform
395  // copula)
396  double predictorCdfs[maxdim];
397  for (unsigned idim=0; idim<nPredictors; ++idim)
398  predictorCdfs[idim] = 0.5;
399  const double centralBw = symbetaBandwidthInsideUnitBox(
400  predictorSymbetaPower, effectiveEventsPerBin/dataLen,
401  predictorCdfs, nPredictors);
402 
403  // Objects that will be needed inside the cycle over
404  // the predictor grid
405  PointCountByCdfBandwidth<Point> pointCounter(
406  kdtree, predSamples, predictorCdfs, nPredictors);
407  BoxND<double> pBox;
408  CdfWeightAssigner<Point> wAssign(predSamples, dimPredictors,
409  predictorCdfs, nPredictors,
410  predictorSymbetaPower);
411  unsigned gridCell[maxdim];
412  double predCoords[maxdim];
413 
414  // Scan over the points in the predictor space
415  for (BoxNDScanner<double> scan(BoxND<double>::unitBox(nPredictors),
416  predictorNumBins, nPredictors);
417  scan.isValid(); ++scan)
418  {
419  scan.getCoords(predictorCdfs, nPredictors);
420 
421  // Find the bandwidth to use for weighting the points
422  double bwGuess = centralBw, realBw = 0.0;
423  if (stretchPredKernels)
424  {
425  // Figure out the bandwidth needed for the requested
426  // symbeta in the assumption of uniform copula
427  bwGuess = symbetaBandwidthInsideUnitBox(
428  predictorSymbetaPower, effectiveEventsPerBin/dataLen,
429  predictorCdfs, nPredictors);
430 
431  // Figure out the bandwidth needed for the flat kernel
432  // in the assumption of uniform copula
433  const double bw0 = symbetaBandwidthInsideUnitBox(
434  0, effectiveEventsPerBin/dataLen,
435  predictorCdfs, nPredictors);
436 
437  // Figure out the actual bandwidth needed for the flat
438  // kernel to get the requested effective number of events
439  double realBw0 = 0.0;
440  const bool status = findRootInLogSpace(
441  pointCounter, effectiveEventsPerBin, bw0, bandwidthTol,
442  &realBw0, initScaleStep);
443  assert(status);
444 
445  // Scale the bandwidth to get the "real" bandwidth
446  realBw = bwGuess*realBw0/bw0;
447  }
448  else
449  {
450  // Figure out the effective number of events we would
451  // get with this bandwidth for this cdf point in the
452  // assumption of uniform copula
453  const double ratioExp = symbetaEffRatioInsideUnitBox(
454  predictorSymbetaPower, bwGuess,
455  predictorCdfs, nPredictors);
456 
457  // Figure out the bandwidth needed for the flat kernel
458  // to get that number of events in the assumption of
459  // uniform copula
460  const double bw0 = symbetaBandwidthInsideUnitBox(
461  0, ratioExp, predictorCdfs, nPredictors);
462 
463  // Figure out the actual bandwidth needed for the flat
464  // kernel to get this effective number of events
465  double realBw0 = 0.0;
466  const bool status = findRootInLogSpace(
467  pointCounter, ratioExp*dataLen, bw0, bandwidthTol,
468  &realBw0, initScaleStep);
469  assert(status);
470 
471  // Scale the bandwidth to get the "real" bandwidth
472  realBw = bwGuess*realBw0/bw0;
473  }
474 
475  // Now, weight the points according to the bandwidth found
476  wAssign.clear();
477  wAssign.setBandwidth(realBw);
478  const double nBw = symbetaNumSigmas(predictorSymbetaPower,
479  nPredictors);
480  fillCoordBoxFromBandwidth(predSamples, predictorCdfs,
481  nPredictors, nBw*realBw, &pBox);
482  kdtree.visitInBox(wAssign, pBox);
483  const double nEff = wAssign.result();
484  assert(nEff > 0.0);
485 
486  // Get coordinates of the cell center in the predictor space
487  scan.getIndex(gridCell, nPredictors);
488  for (unsigned ipred=0; ipred<nPredictors; ++ipred)
489  predCoords[ipred] = predAxes[ipred].coordinate(gridCell[ipred]);
490 
491  // Build the distribution for this grid point
492  const unsigned long scanCycle = scan.state();
493  CompDistro* cdnd = adap.callBuilder(
494  builder, scanCycle, predCoords, nPredictors, pBox,
495  wAssign.getPoints(), dimResponses, nResponseVars);
496  distro->setGridDistro(gridCell, nPredictors, cdnd);
497 
498  // Print some info about the processed point
499  if (reportFrequency)
500  if (scanCycle % reportFrequency == 0)
501  {
502  // Generate a time stamp
503  struct tm *current;
504  time_t now;
505  time(&now);
506  current = localtime(&now);
507  char timestamp[10];
508  sprintf(timestamp, "%02i:%02i:%02i", current->tm_hour,
509  current->tm_min, current->tm_sec);
510 
511  // Other things we want to print
512  const unsigned long nCycles = scan.maxState();
513 
514  // Print a message
515  std::cout << timestamp << " "
516  << "In npstat::Private::buildInterpolatedHelper:"
517  << "\n Processed cell " << scanCycle+1
518  << '/' << nCycles
519  << ", bwGuess = " << bwGuess
520  << ", bw = " << realBw
521  << ", Neff = " << nEff << ".\n Visited "
522  << wAssign.getVisitCount() << " points, "
523  << wAssign.getNZeroWeights() << " unused."
524  << std::endl;
525  }
526  }
527 
528  return distro;
529  }
530  }
531 }
532 
533 #endif // NPSTAT_BUILDINTERPOLATEDHELPERS_HH_
Interface definition for classes which build composite distrubutions.
Interface definition for classes which reconstruct univariate distrubutions from data samples.
Interface for piecemeal processing of a data collection.
Iteration over uniformly spaced coordinates inside a multidimensional box.
Template to represent rectangles, boxes, and hyperboxes.
A number of useful 1-d continuous statistical distributions.
Non-uniformly spaced coordinate sets for use in constructing rectangular grids.
k-d tree template
Compile-time dimensionality detector for classes like std::array.
Accumulates a sample and calculates various descriptive statistics.
Interface definitions and concrete simple functors for a variety of functor-based calculations.
A simple O(n^2) template function for checking if all elements in a container are unique....
Definition: AbsCompositeDistroBuilder.hh:30
Definition: AbsDistro1DBuilder.hh:28
virtual AbsDistribution1D * buildWeighted(unsigned long uniqueId, const double *predictorCoords, unsigned nPredictors, const BoxND< double > &predictorBox, const WeightedPtrVec &data, unsigned responseDimToUse) const
Definition: KDTree.hh:37
Definition: buildInterpolatedHelpers.hh:58
void process(const Point &point)
Definition: buildInterpolatedHelpers.hh:102
void clear()
Definition: buildInterpolatedHelpers.hh:92
double result()
Definition: buildInterpolatedHelpers.hh:128
Definition: buildInterpolatedHelpers.hh:186
Definition: SampleAccumulator.hh:37
Root finding in the log space by interval division.
Definition: AbsArrayProjector.hh:14
bool findRootInLogSpace(const Functor1< Result, Arg1 > &f, const Result &rhs, const Arg1 &x0, double tol, Arg1 *x, double logstep=0.5)
Definition: AbsDistribution1D.hh:31
virtual double density(double x) const =0
Definition: AbsVisitor.hh:20
static BoxND unitBox(unsigned long ndim)
Definition: SimpleFunctors.hh:58
Definition: PointDimensionality.hh:20
Definition: buildInterpolatedHelpers.hh:228