1 #ifndef NPSTAT_BUILDINTERPOLATEDHELPERS_HH_
2 #define NPSTAT_BUILDINTERPOLATEDHELPERS_HH_
24 #include "geners/CPP11_auto_ptr.hh"
46 double symbetaNumSigmas(
int symbetaPower,
unsigned dim);
48 double symbetaBandwidthInsideUnitBox(
49 int symbetaPower,
double nEffToNRatio,
50 const double* coords,
unsigned dim);
52 double symbetaEffRatioInsideUnitBox(
53 int symbetaPower,
double bandwidth,
54 const double* coords,
unsigned dim);
56 template <
class Po
int>
60 typedef typename Point::value_type Real;
61 typedef std::pair<const Point*, double> WeightedPointPtr;
62 typedef std::vector<WeightedPointPtr> WeightedPtrVec;
66 const unsigned* dimPredictors,
67 const double* coords,
const unsigned nCoords,
69 : predSamples_(predSamples),
70 dimPredictors_(dimPredictors),
72 symbetaPow_(symbetaPow),
74 symbeta_(0.0, 1.0, symbetaPow > 0 ? symbetaPow : 0),
81 assert(nCoords == predSamples_.size());
86 inline void setBandwidth(
const double h)
108 const unsigned ndim = predSamples_.size();
110 for (
unsigned ipred=0; ipred<ndim; ++ipred)
112 const unsigned idim = dimPredictors_[ipred];
113 const double cdf = predSamples_[ipred].cdf(point[idim]);
114 const double w = g.
density((cdf - coords_[ipred])/h_)/h_;
119 ptrVec_.emplace_back(&point, wprod);
121 wsumsq_ += wprod*wprod;
131 return wsum_*wsum_/wsumsq_;
136 inline const WeightedPtrVec& getPoints()
const {
return ptrVec_;}
137 inline unsigned getVisitCount()
const {
return nVizited_;}
138 inline unsigned getNZeroWeights()
const {
return nZeroWeights_;}
141 const std::vector<SampleAccumulator<Real> >& predSamples_;
142 const unsigned* dimPredictors_;
143 const double* coords_;
144 WeightedPtrVec ptrVec_;
147 SymmetricBeta1D symbeta_;
152 unsigned nZeroWeights_;
155 template <
typename Real>
156 inline void fillCoordBoxFromBandwidth(
157 const std::vector<SampleAccumulator<Real> >& predSamples,
158 const double* cdfValues,
const unsigned nCdfValues,
159 const double h, BoxND<double>* box)
164 box->reserve(nCdfValues);
165 for (
unsigned i=0; i<nCdfValues; ++i)
167 double cdfmin = cdfValues[i] - h;
170 double cdfmax = cdfValues[i] + h;
173 double xmin = predSamples[i].quantile(cdfmin);
174 double xmax = predSamples[i].quantile(cdfmax);
175 const double range = xmax - xmin;
176 if (cdfValues[i] - h < 0.0)
178 if (cdfValues[i] + h > 1.0)
180 box->emplace_back(xmin, xmax);
184 template <
class Po
int>
188 typedef typename Point::value_type Real;
193 const double* cdfValues,
194 const unsigned nCdfValues)
196 predSamples_(predSamples),
197 cdfValues_(cdfValues),
198 nCdfValues_(nCdfValues)
202 assert(predSamples_.size() == nCdfValues_);
207 inline double operator()(
const double& h)
const
209 fillCoordBoxFromBandwidth(predSamples_, cdfValues_,
210 nCdfValues_, h, &box_);
211 return kdtree_.nInBox(box_);
216 const std::vector<SampleAccumulator<Real> >& predSamples_;
217 const double* cdfValues_;
219 unsigned nCdfValues_;
229 typedef typename BuilderBase<Point>::result_type CompDistro;
230 typedef typename BuilderBase<Point>::WeightedPtrVec WeightedPtrVec;
232 inline CPP11_auto_ptr<ResultDistro> construct(
233 const std::vector<GridAxis>& predAxes,
234 const unsigned nResponseVars,
const bool mode)
const
236 return CPP11_auto_ptr<ResultDistro>(
237 new ResultDistro(predAxes, nResponseVars, mode));
240 inline CompDistro* callBuilder(
241 const BuilderBase<Point>& builder,
const unsigned long uniqueId,
242 const double* predictorCoords,
const unsigned nPredictors,
243 const BoxND<double>& predictorBox,
const WeightedPtrVec& data,
244 const unsigned* dimsToUse,
const unsigned nDimsToUse)
const
246 return builder.buildWeighted(
247 uniqueId, predictorCoords, nPredictors,
248 predictorBox, data, dimsToUse, nDimsToUse);
252 template <
class Po
int,
class ResultDistro>
256 typedef typename AbsDistro1DBuilder<Point>::WeightedPtrVec WeightedPtrVec;
258 inline CPP11_auto_ptr<ResultDistro> construct(
259 const std::vector<GridAxis>& predAxes,
260 const unsigned ,
const bool mode)
const
262 return CPP11_auto_ptr<ResultDistro>(
new ResultDistro(predAxes, mode));
267 const double* predictorCoords,
const unsigned nPredictors,
268 const BoxND<double>& predictorBox,
const WeightedPtrVec& data,
269 const unsigned* dimsToUse,
const unsigned )
const
272 uniqueId, predictorCoords, nPredictors,
273 predictorBox, data, dimsToUse[0]);
280 template <
class>
class BuilderBase
282 CPP11_auto_ptr<ResultDistro> buildInterpolatedHelper(
283 const std::vector<Point>& data,
284 const unsigned* dimPredictors,
const unsigned nPredictors,
285 const std::string* predictorNames,
286 const unsigned* predictorNumBins,
const int predictorSymbetaPower,
287 const double effectiveEventsPerBin,
const bool stretchPredKernels,
288 const unsigned* dimResponses,
const unsigned nResponseVars,
289 const BuilderBase<Point>& builder,
290 const bool interpolateSwitch,
const unsigned reportFrequency)
292 typedef typename Point::value_type Real;
293 typedef typename BuilderBase<Point>::result_type CompDistro;
297 const unsigned maxdim = 32;
298 const double bandwidthTol = 1.0e-7;
299 const double initScaleStep = 0.02;
302 const unsigned long dataLen = data.size();
303 if (!dataLen)
throw std::invalid_argument(
304 "In npstat::Private::buildInterpolatedHelper: no data points");
305 if (!nPredictors)
throw std::invalid_argument(
306 "In npstat::Private::buildInterpolatedHelper: no predictors");
307 if (nPredictors >= maxdim)
throw std::invalid_argument(
308 "In npstat::Private::buildInterpolatedHelper: "
309 "too many predictors");
310 if (nResponseVars >= maxdim)
throw std::invalid_argument(
311 "In npstat::Private::buildInterpolatedHelper: "
312 "too many response variables");
313 if (effectiveEventsPerBin >=
static_cast<double>(dataLen))
314 throw std::invalid_argument(
315 "In npstat::Private::buildInterpolatedHelper: "
316 "too many effective events requested per bin");
318 assert(dimPredictors);
319 assert(predictorNumBins);
320 assert(dimResponses);
322 for (
unsigned i=0; i<nPredictors; ++i)
323 if (!predictorNumBins[i])
throw std::invalid_argument(
324 "In npstat::Private::buildInterpolatedHelper: "
325 "zero predictor bins in one of the dimensions");
329 unsigned dims[2*maxdim];
330 for (
unsigned i=0; i<nPredictors; ++i)
331 dims[i] = dimPredictors[i];
332 for (
unsigned i=0; i<nResponseVars; ++i)
333 dims[i+nPredictors] = dimResponses[i];
335 const unsigned ndims = nPredictors + nResponseVars;
336 for (
unsigned i=0; i<ndims; ++i)
339 if (dmax >= dim_size)
throw std::invalid_argument(
340 "In npstat::Private::buildInterpolatedHelper: "
341 "dimension number out of range");
342 if (!areAllElementsUnique(&dims[0], &dims[0]+ndims))
343 throw std::invalid_argument(
344 "In npstat::Private::buildInterpolatedHelper: "
345 "duplicate dimensions detected");
349 std::vector<SampleAccumulator<Real> > predSamples(nPredictors);
350 for (
unsigned ipred=0; ipred<nPredictors; ++ipred)
353 acc.reserve(dataLen);
354 const unsigned idim = dimPredictors[ipred];
355 for (
unsigned long ipt=0; ipt<dataLen; ++ipt)
356 acc.accumulate(data[ipt][idim]);
360 std::vector<GridAxis> predAxes;
361 predAxes.reserve(nPredictors);
363 std::vector<double> gridCoords;
364 for (
unsigned ipred=0; ipred<nPredictors; ++ipred)
366 const unsigned nbins = predictorNumBins[ipred];
368 gridCoords.reserve(nbins);
369 for (
unsigned ibin=0; ibin<nbins; ++ibin)
371 const double q = (ibin + 0.5)/nbins;
372 const double x = predSamples[ipred].quantile(q);
373 gridCoords.push_back(x);
376 predAxes.emplace_back(
377 gridCoords, predictorNames[ipred].c_str());
379 predAxes.emplace_back(gridCoords);
384 DistroBuilderAdapter<Point, ResultDistro, BuilderBase> adap;
385 CPP11_auto_ptr<ResultDistro> distro = adap.construct(
386 predAxes, nResponseVars, interpolateSwitch);
390 const KDTree<Point> kdtree(data, dimPredictors, nPredictors);
396 double predictorCdfs[maxdim];
397 for (
unsigned idim=0; idim<nPredictors; ++idim)
398 predictorCdfs[idim] = 0.5;
399 const double centralBw = symbetaBandwidthInsideUnitBox(
400 predictorSymbetaPower, effectiveEventsPerBin/dataLen,
401 predictorCdfs, nPredictors);
405 PointCountByCdfBandwidth<Point> pointCounter(
406 kdtree, predSamples, predictorCdfs, nPredictors);
408 CdfWeightAssigner<Point> wAssign(predSamples, dimPredictors,
409 predictorCdfs, nPredictors,
410 predictorSymbetaPower);
411 unsigned gridCell[maxdim];
412 double predCoords[maxdim];
416 predictorNumBins, nPredictors);
417 scan.isValid(); ++scan)
419 scan.getCoords(predictorCdfs, nPredictors);
422 double bwGuess = centralBw, realBw = 0.0;
423 if (stretchPredKernels)
427 bwGuess = symbetaBandwidthInsideUnitBox(
428 predictorSymbetaPower, effectiveEventsPerBin/dataLen,
429 predictorCdfs, nPredictors);
433 const double bw0 = symbetaBandwidthInsideUnitBox(
434 0, effectiveEventsPerBin/dataLen,
435 predictorCdfs, nPredictors);
439 double realBw0 = 0.0;
441 pointCounter, effectiveEventsPerBin, bw0, bandwidthTol,
442 &realBw0, initScaleStep);
446 realBw = bwGuess*realBw0/bw0;
453 const double ratioExp = symbetaEffRatioInsideUnitBox(
454 predictorSymbetaPower, bwGuess,
455 predictorCdfs, nPredictors);
460 const double bw0 = symbetaBandwidthInsideUnitBox(
461 0, ratioExp, predictorCdfs, nPredictors);
465 double realBw0 = 0.0;
467 pointCounter, ratioExp*dataLen, bw0, bandwidthTol,
468 &realBw0, initScaleStep);
472 realBw = bwGuess*realBw0/bw0;
477 wAssign.setBandwidth(realBw);
478 const double nBw = symbetaNumSigmas(predictorSymbetaPower,
480 fillCoordBoxFromBandwidth(predSamples, predictorCdfs,
481 nPredictors, nBw*realBw, &pBox);
482 kdtree.visitInBox(wAssign, pBox);
483 const double nEff = wAssign.result();
487 scan.getIndex(gridCell, nPredictors);
488 for (
unsigned ipred=0; ipred<nPredictors; ++ipred)
489 predCoords[ipred] = predAxes[ipred].coordinate(gridCell[ipred]);
492 const unsigned long scanCycle = scan.state();
493 CompDistro* cdnd = adap.callBuilder(
494 builder, scanCycle, predCoords, nPredictors, pBox,
495 wAssign.getPoints(), dimResponses, nResponseVars);
496 distro->setGridDistro(gridCell, nPredictors, cdnd);
500 if (scanCycle % reportFrequency == 0)
506 current = localtime(&now);
508 sprintf(timestamp,
"%02i:%02i:%02i", current->tm_hour,
509 current->tm_min, current->tm_sec);
512 const unsigned long nCycles = scan.maxState();
515 std::cout << timestamp <<
" "
516 <<
"In npstat::Private::buildInterpolatedHelper:"
517 <<
"\n Processed cell " << scanCycle+1
519 <<
", bwGuess = " << bwGuess
520 <<
", bw = " << realBw
521 <<
", Neff = " << nEff <<
".\n Visited "
522 << wAssign.getVisitCount() <<
" points, "
523 << wAssign.getNZeroWeights() <<
" unused."
Interface definition for classes which build composite distrubutions.
Interface definition for classes which reconstruct univariate distrubutions from data samples.
Interface for piecemeal processing of a data collection.
Iteration over uniformly spaced coordinates inside a multidimensional box.
Template to represent rectangles, boxes, and hyperboxes.
A number of useful 1-d continuous statistical distributions.
Non-uniformly spaced coordinate sets for use in constructing rectangular grids.
Compile-time dimensionality detector for classes like std::array.
Accumulates a sample and calculates various descriptive statistics.
Interface definitions and concrete simple functors for a variety of functor-based calculations.
A simple O(n^2) template function for checking if all elements in a container are unique....
Definition: AbsCompositeDistroBuilder.hh:30
Definition: AbsDistro1DBuilder.hh:28
virtual AbsDistribution1D * buildWeighted(unsigned long uniqueId, const double *predictorCoords, unsigned nPredictors, const BoxND< double > &predictorBox, const WeightedPtrVec &data, unsigned responseDimToUse) const
Definition: buildInterpolatedHelpers.hh:58
void process(const Point &point)
Definition: buildInterpolatedHelpers.hh:102
void clear()
Definition: buildInterpolatedHelpers.hh:92
double result()
Definition: buildInterpolatedHelpers.hh:128
Definition: buildInterpolatedHelpers.hh:186
Definition: SampleAccumulator.hh:37
Root finding in the log space by interval division.
Definition: AbsArrayProjector.hh:14
bool findRootInLogSpace(const Functor1< Result, Arg1 > &f, const Result &rhs, const Arg1 &x0, double tol, Arg1 *x, double logstep=0.5)
Definition: AbsDistribution1D.hh:31
virtual double density(double x) const =0
Definition: AbsVisitor.hh:20
static BoxND unitBox(unsigned long ndim)
Definition: SimpleFunctors.hh:58
Definition: PointDimensionality.hh:20
Definition: buildInterpolatedHelpers.hh:228