Index: trunk/npstat/stat/arrayStats.icc =================================================================== --- trunk/npstat/stat/arrayStats.icc (revision 587) +++ trunk/npstat/stat/arrayStats.icc (revision 588) @@ -1,513 +1,567 @@ #include #include #include #include #include #include "npstat/nm/BoxNDScanner.hh" #include "npstat/rng/permutation.hh" namespace npstat { + template + long double arrayMoment(const Numeric* arr, const unsigned long sz, + const long double center, const unsigned order) + { + if (!sz) throw std::invalid_argument("In npstat::arrayMoment: " + "can not process arrays with zero elements"); + assert(arr); + + long double sum = 0.0L; + switch (order) + { + case 0U: + return 1.0L; + + case 1U: + for (unsigned long i=0; i(arr[i]) - center); + break; + + case 2U: + for (unsigned long i=0; i(arr[i]) - center; + sum += delta*delta; + } + break; + + case 3U: + for (unsigned long i=0; i(arr[i]) - center; + sum += delta*delta*delta; + } + break; + + case 4U: + for (unsigned long i=0; i(arr[i]) - center; + const long double tmp = delta*delta; + sum += tmp*tmp; + } + break; + + default: + for (unsigned long i=0; i(arr[i]) - center; + sum += powl(delta, order); + } + } + return sum/sz; + } + template inline void arrayStats(const Data* arr, const unsigned long sz, double* pmean, double* pstdev, double* pskew, double* pkurt) { if (pmean || pstdev || pskew || pkurt) { if (!sz) throw std::invalid_argument("In npstat::arrayStats: " "can not process arrays with zero elements"); assert(arr); if (pstdev) *pstdev = 0.0; if (pskew) *pskew = 0.0; if (pkurt) *pkurt = 3.0; if (sz == 1UL) { if (pmean) *pmean = static_cast(arr[0]); } else { long double sm = 0.0L; for (unsigned long i=0; i(mean); const bool b = (pskew && sz > 2UL) || (pkurt && sz > 3UL); if (pstdev || b) { long double sumsq = 0.0L, skewsum = 0.0L, kursum = 0.0L; for (unsigned long i=0; i 0.0L) { if (pstdev) *pstdev = static_cast(sqrtl(K2)); if (pskew && sz > 2UL) { const long double K3 = sz/NM1/(sz - 2UL)*skewsum; *pskew = static_cast(K3/powl(K2,1.5L)); } if (pkurt && sz > 3UL) { const long double g2 = kursum*sz/sumsq/sumsq-3.0L; *pkurt = static_cast(3.0L + NM1/(sz-2UL)/(sz-3UL)*((sz+1UL)*g2 + 6.0L)); } } } } } } template inline void arrayCoordMean(const Array& a, const BoxND& limits, double* mean, const unsigned storageLength) { const unsigned lenMean = a.rank(); if (lenMean) { long double meanAcc[CHAR_BIT*sizeof(unsigned long)] = {0.0L,}; if (lenMean > CHAR_BIT*sizeof(unsigned long)) throw std::out_of_range("In npstat::arrayCoordMean: " "array dimensionality is too large"); assert(mean); if (storageLength < lenMean) throw std::invalid_argument("In npstat::arrayCoordMean: " "result buffer is too small"); long double weightSum = 0.0L; for (BoxNDScanner scanner(limits, a.shape()); scanner.isValid(); ++scanner) { scanner.getCoords(mean, lenMean); const long double v = a.linearValue(scanner.state()); weightSum += v; for (unsigned i=0; i(meanAcc[i]/weightSum); } } template inline void arrayCoordCovariance(const Array& a, const BoxND& limits, Matrix* covmat) { const unsigned aRank = a.rank(); if (aRank) { assert(covmat); if (covmat->nRows() != aRank) throw std::invalid_argument("In npstat::arrayCoordCovariance: " "incompatible number of rows in the result matrix"); if (covmat->nColumns() != aRank) throw std::invalid_argument("In npstat::arrayCoordCovariance: " "incompatible number of columns in the result matrix"); double mean[CHAR_BIT*sizeof(unsigned long)]; arrayCoordMean(a, limits, mean, aRank); long double weightSum = 0.0L, weightSumSq = 0.0L; const unsigned matLen = aRank*(aRank+1U)/2; std::vector ldbuf(matLen, 0.0L); long double* acc = &ldbuf[0]; double coords[CHAR_BIT*sizeof(unsigned long)]; for (BoxNDScanner scanner(limits, a.shape()); scanner.isValid(); ++scanner) { scanner.getCoords(coords, aRank); const long double v = a.linearValue(scanner.state()); weightSum += v; weightSumSq += v*v; unsigned ilin = 0; for (unsigned i=0; i(covmat->data()); const unsigned len = covmat->length(); for (unsigned i=0; i( acc[ilin]/weightSum*(effM1 + 1.0L)/effM1); (*covmat)[i][j] = cov; (*covmat)[j][i] = cov; } } } } // The formulae used below assume that the effective number // of events is given by the ratio of squared sum of weights // to sum of weights squared. Once the effective number of events // is known, it is used with the standard population estimates // consistent with SAS (see, for example, the article "Comparing // measures of sample skewness and kurtosis", by D.N. Joanes // and C.A. Gill, The Statistician, Vol 47, pp 183-189 (1998). template inline void arrayShape1D(const Array& a, const double xmin, const double xmax, double* pmean, double* pstdev, double* pskewness, double* pkurtosis) { typedef typename Array::value_type Numeric; if (a.rank() != 1U) throw std::invalid_argument( "In npstat::arrayShape1D: array dimensionality must be 1"); if (xmin >= xmax) throw std::invalid_argument( "In npstat::arrayShape1D: invalid interval specification"); if (pmean || pstdev || pskewness || pkurtosis) { const Numeric* data = a.data(); const Numeric zero = static_cast(0); const unsigned long nbins = a.length(); const double binWidth = (xmax - xmin)/nbins; long double wsum = 0.0L, xsum = 0.0L; for (unsigned long i=0; i(xsum/wsum); if (pmean) *pmean = mean; if (pstdev || pskewness || pkurtosis) { long double wsumsq = 0.0L, varsum = 0.0L; const double shiftedMin = xmin - mean; for (unsigned long i=0; i(sqrtl(K2)); if (pskewness || pkurtosis) { long double skewsum = 0.0L, kursum = 0.0L; for (unsigned long i=0; i 0.0L) { const long double m3 = skewsum/wsum; const long double K3 = effN*effN/effM1/effM2*m3; *pskewness = static_cast(K3/powl(K2,1.5L)); } else *pskewness = 0.0; } if (pkurtosis) { const long double effM3 = effM2 - 1.0L; if (effM3 > 0.0L) { const long double m4 = kursum/wsum; const long double g2 = m4/m2/m2 - 3.0L; *pkurtosis = static_cast(3.0L + effM1/effM2/effM3*((effN+1.0L)*g2 + 6.0L)); } else *pkurtosis = 3.0; } } } } } template inline void arrayQuantiles1D(const Numeric* data, const unsigned long len, const double xmin, const double xmax, const double* qvalues, double* quantiles, const unsigned nqvalues) { if (nqvalues == 0U) return; if (!len) throw std::invalid_argument("In npstat::arrayQuantiles1D: " "can not process arrays with zero elements"); assert(data); if (xmin > xmax) throw std::invalid_argument("In npstat::arrayQuantiles1D: " "invalid interval specification"); assert(qvalues); assert(quantiles); for (unsigned i=0; i 1.0) throw std::domain_error("In npstat::arrayQuantiles1D: " "cdf argument outside of [0, 1] interval"); if (xmin == xmax) { for (unsigned i=0; i zero) { if (!positiveElementFound) { positiveElementFound = true; idlo = id; } sum += value; idhi = id; } } if (!positiveElementFound) throw std::invalid_argument( "In npstat::arrayQuantiles1D: all weights are zero"); const double xlo = xmin + idlo*step; const double xhi = idhi == len - 1UL ? xmax : xmin + (idhi+1UL)*step; const unsigned long lenm1 = len - 1UL; unsigned long scanbin = idlo; long double sumbelow = 0.0L; long double sumabove = data[idlo]; for (unsigned i=0; i= sumabove && scanbin < lenm1) { sumbelow = sumabove; sumabove += data[++scanbin]; } const double dbin = (sumneeded - sumbelow)/data[scanbin]; double qval = xmin + (scanbin + dbin)*step; if (qval > xmax) qval = xmax; quantiles[i] = qval; } } } template inline double arrayEntropy(const Numeric* p, const unsigned long len, const bool normalize) { const Numeric zero = Numeric(); bool havePositive = false; long double entropy = 0.0L; long double sum = 1.0L; if (normalize) { sum = std::accumulate(p, p+len, 0.0L); if (sum <= 0.0L) throw std::invalid_argument("In npstat::arrayEntropy: sum of" " array elements is not positive"); } for (unsigned long i=0; i zero) { havePositive = true; const long double prob = p[i]; entropy -= prob*std::log(prob/sum); } } if (!havePositive) throw std::invalid_argument("In npstat::arrayEntropy: there are no" " positive array elements"); entropy /= sum; return entropy/len; } // C++ does not support partial template specialization for // functions, only for classes namespace Private { template struct PoissonLogLikelihood { inline static double calculate(const Real* means, const Numeric* counts, const unsigned long len) { long double sum = 0.0L; for (unsigned long i=0; i(counts[i])); if (dc < 0.0) throw std::invalid_argument( "In npstat::poissonLogLikelihood: " "this function can not handle negative counts"); if (dc > static_cast(ULONG_MAX)) throw std::invalid_argument( "In npstat::poissonLogLikelihood: " "count is too large"); const unsigned long cnt = static_cast(dc); const double m = means[i]; if (cnt) { if (m <= 0.0) throw std::invalid_argument( "In npstat::poissonLogLikelihood: " "non-positive mean encountered with counts present"); sum += (cnt*log(m) - logfactorial(cnt) - m); } else sum -= m; } return sum; } }; template struct PoissonLogLikelihood { inline static double calculate(const Real* means, const unsigned* counts, const unsigned long len) { long double sum = 0.0L; for (unsigned long i=0; i struct PoissonLogLikelihood { inline static double calculate(const Real* means, const unsigned long* counts, const unsigned long len) { long double sum = 0.0L; for (unsigned long i=0; i inline double poissonLogLikelihood(const Real* means, const Numeric* counts, const unsigned long len) { double logli = 0.0; if (len) { assert(means); assert(counts); logli = Private::PoissonLogLikelihood::calculate( means, counts, len); } return logli; } } Index: trunk/npstat/stat/arrayStats.hh =================================================================== --- trunk/npstat/stat/arrayStats.hh (revision 587) +++ trunk/npstat/stat/arrayStats.hh (revision 588) @@ -1,111 +1,116 @@ #ifndef NPSTAT_ARRAYSTATS_HH_ #define NPSTAT_ARRAYSTATS_HH_ /*! // \file arrayStats.hh // // \brief Various descriptive statistics for 1-d and multidimensional arrays // // Author: I. Volobouev // // July 2010 */ #include "npstat/nm/BoxND.hh" #include "npstat/nm/Matrix.hh" namespace npstat { /** // This function estimates population mean, standard // deviation, skewness, and kurtosis. The array must have at least // one element. A numerically sound two-pass algorithm is used. // // Any of the output pointers can be specified as NULL if the // corresponding quantity is not needed (this might speed the code up). */ template void arrayStats(const Numeric* arr, unsigned long arrLen, double* mean, double* stdev, double* skewness=0, double* kurtosis=0); + /** Function for calculating sample moments w.r.t. some point */ + template + long double arrayMoment(const Numeric* arr, unsigned long arrLen, + long double center, unsigned order); + /** // Calculate the mean along each coordinate using array values as // weights. The coordinate ranges are defined by the given box, // and the array points are assumed to be in the centers of the bins, // like in a histogram. The results are stored in the "mean" array // whose length (given by lengthMean) must not be smaller than // the rank of the array. // // For this function, the array dimensionality can not exceed // CHAR_BIT*sizeof(unsigned long) which is normally 64 on 64-bit machines. */ template void arrayCoordMean(const Array& a, const BoxND& limits, double* mean, unsigned lengthMean); /** // Calculate the array covariance matrix. The "limits" argument has // the same meaning as in the arrayCoordMean function. */ template void arrayCoordCovariance(const Array& a, const BoxND& limits, Matrix* covarianceMatrix); /** // One-dimensional mean, variance, skewness, and kurtosis // using array values as weights. Any of the argument pointers // can be NULL in which case the corresponding quantity is not // calculated. The code estimates population shape quantities // rather than sample quantities (so that it can be best used // with equidistantly binned histograms and such). */ template void arrayShape1D(const Array& a, double xmin, double xmax, double* mean, double* stdev, double* skewness, double* kurtosis); /** // Quantiles for 1-d arrays in which array values are used as weights. // Essentially, the values are treated as histogram bin contents. // All input "qvalues" should be between 0 and 1. The results are // returned in the corresponding elements of the "quantiles" array. // The function will work faster if "qvalues" are sorted in the // increasing order. // // If you expect to call this function more than once using the same // data, it is likely that you can obtain the same information more // efficiently by creating the "BinnedDensity1D" object with its // interpolationDegree argument set to 0 and then using its "quantile" // method. */ template void arrayQuantiles1D(const Numeric* data, unsigned long len, double xmin, double xmax, const double* qvalues, double* quantiles, unsigned nqvalues); /** // This function returns negative sum of p_i ln(p_i) over array // elements p_i, divided by the total number of array elements. // All elements must be non-negative, and there must be at least // one positive element present. Useful for calculating entropies // of distributions represented on a grid. For example, mutual // information of two variables is just the entropy of their copula. */ template double arrayEntropy(const Numeric* p, unsigned long len, bool normalize = false); /** // This function assumes that Poisson distribution parameters are // given in the "means" array while the array "counts" contains // corresponding observations. "len" is the length of both "means" // and "counts" arrays. */ template double poissonLogLikelihood(const Real* means, const Numeric* counts, unsigned long len); } #include "npstat/stat/arrayStats.icc" #endif // NPSTAT_ARRAYSTATS_HH_ Index: trunk/tests/test_StatAccumulator.cc =================================================================== --- trunk/tests/test_StatAccumulator.cc (revision 587) +++ trunk/tests/test_StatAccumulator.cc (revision 588) @@ -1,175 +1,196 @@ #include +#include #include "UnitTest++.h" #include "test_utils.hh" #include "TestAccumulator.hh" #include "npstat/stat/StatAccumulator.hh" +#include "npstat/stat/SampleAccumulator.hh" #include "npstat/stat/BinSummary.hh" +#include "npstat/stat/arrayStats.hh" using namespace std; using namespace npstat; namespace { + TEST(arrayMoment) + { + const long double eps = 1.0e-16L; + const unsigned maxpoints = 33; + + SampleAccumulator acc; + for (unsigned i=0; i