Index: trunk/examples/Python/lorpe_1d_unbinned_cv.py =================================================================== --- trunk/examples/Python/lorpe_1d_unbinned_cv.py (revision 788) +++ trunk/examples/Python/lorpe_1d_unbinned_cv.py (revision 789) @@ -1,146 +1,136 @@ #!/usr/bin/env python3 """ This example illustrates how to perform LOrPE on unbinned samples. The bandwidth is chosen by cross-validation (CV). """ __author__="Igor Volobouev (i.volobouev@ttu.edu)" -__version__="1.0" -__date__ ="June 22 2022" +__version__="1.1" +__date__ ="June 23 2022" # Perform necessary imports import math import npstat as ns import numpy as np from operator import itemgetter from npstat_utils import HistoND import matplotlib.pyplot as plt # Some parameters for the script sampleSize = 500 # Number of points to generate (sample size) xmin = 0.0 # Left edge of the sample histogram xmax = 5.0 # Right edge of the sample histogram numBins = 50 # Number of bins for the sample histogram symbetaPower = 4 # Power parameter of the symmetric beta weight function # w.r.t. which the orthogonal polynomial system will # be generated. Use -1 to employ a Gaussian instead. nDiscrete = 1000 # Number of intervals for displaying the results. normalize = True # Do we want to normalize the density estimate? # Here, this only ensures that the integral is 1 # and does not check for negative densities. nIntegSplits = 50 # Number of intervals to use for various quadratures. nIntegPoints = 4 # Number of Gauss-Legendre integration points to use # on each interval for various quadratures. nBwToTry = 50 # Number of bandwidth values to scan. # Choose boundary handling method. Only two methods are currently # supported for unbinned LOrPE: # # "BM_TRUNCATE" -- Just truncate the weight function at the boundary. # # "BM_FOLD" -- Reflect (fold) the weight function at the boundary and # add to the non-reflected part. # # It is not obvious a priori which boundary handling method will perform # best for any particular case, so it might be useful to try different ones. bh = ns.BoundaryHandling("BM_TRUNCATE") # Filter degrees to try in cross-validation filterDegreesToTry = np.linspace(0.0, 3.0, 7) # Here, we will use the exponential distribution truncated to [xmin, xmax] # in order to generate the random sample d1 = ns.TruncatedDistribution1D(ns.Exponential1D(0.0, 1.0), xmin, xmax) # Generate a random sample according to this density rng = ns.MersenneTwister() sample = d1.generate(rng, sampleSize) # Histogram for displaying the sample h1 = HistoND("int", ns.HistoAxis(numBins, xmin, xmax, "X"), "Sample Histogram", "Counts") h1.fill(sample, 1) # Function for finding the best filter degree and bandwidth by simple # scanning. It returns a tuple (bestDegree, bestBandwidth, bestCV). def bestDegreeAndBandwidth(cvFunctor, sampleScale, filterDegreesToTry, nBwToTry, which): symbetaPower = cvFunctor.symbetaPower() sampleSize = cvFunctor.effectiveSampleSize() scanResults = [] for filterDegree in filterDegreesToTry: # Select a reasonable plug-in bandwidth bw0 = ns.approxAmisePluginBwGauss(filterDegree, sampleSize, sampleScale) if symbetaPower >= 0: bw0 *= ns.approxSymbetaBandwidthRatio(symbetaPower, filterDegree) # Scan bandwidth values around the plug-in value. Of course, # the bandwidth range is not necessarily optimal here. bwmin = bw0/4.0 bwmax = bw0*4.0 print(which, "scanning bandwidth range {:.4f} to {:.4f} for degree {:.1f}".format( bwmin, bwmax, filterDegree)) bwset = np.array(ns.EquidistantInLogSpace(bwmin, bwmax, nBwToTry), dtype=np.double) cvValues = [cvFunctor(filterDegree, bw) for bw in bwset] # Choose the best value for this filter degree. # The function "findScanMaximum1D" will fit # a parabola around the maximum value. bestCV = ns.findScanMaximum1D(np.log(bwset), cvValues) bestH = math.exp(bestCV.location()) s = "best bandwidth for degree {:.1f} is {:.4f}".format(filterDegree, bestH) if bestCV.isOnTheBoundary(): s += " (on the boundary)" print(which, s) scanResults.append((filterDegree, bestH, bestCV.value())) return max(scanResults, key=itemgetter(2)) # Determine a reasonable sample scale acc = ns.DoubleSampleAccumulator() acc.accumulate(sample) robustScale = acc.sigmaRange() -# Perform least squares CV -lscv = ns.LOrPE1DSimpleLSCVFunctor(symbetaPower, xmin, xmax, bh, nIntegSplits, - nIntegPoints, normalize, sample) -bestDegLSCV, bestHLSCV, bestValueLSCV = bestDegreeAndBandwidth( - lscv, robustScale, filterDegreesToTry, nBwToTry, "LSCV:") -iseLSCV = lscv.integratedSquaredError(d1, bestDegLSCV, bestHLSCV) -print("\nBest LSCV filter degree is {:.1f}, bandwidth is {:.4f}, ISE = {:.4g}\n".format( - bestDegLSCV, bestHLSCV, iseLSCV)) - -# Perform regularized likelihood CV, using default regularization parameter -rlcv = ns.LOrPE1DSimpleRLCVFunctor(symbetaPower, xmin, xmax, bh, nIntegSplits, - nIntegPoints, normalize, sample) -bestDegRLCV, bestHRLCV, bestValueRLCV = bestDegreeAndBandwidth( - rlcv, robustScale, filterDegreesToTry, nBwToTry, "RLCV:") -iseRLCV = rlcv.integratedSquaredError(d1, bestDegRLCV, bestHRLCV) -print("\nBest RLCV filter degree is {:.1f}, bandwidth is {:.4f}, ISE = {:.4g}\n".format( - bestDegRLCV, bestHRLCV, iseRLCV)) - # Plot the sample histogram xh, yh = ns.histoOutline1D(h1) plt.plot(xh, yh, 'k') # Overlay the original density. Scale all densities so that # their plots are compatible with the histogram height. area = h1.integral() xdens, ydens = ns.scanDensity1D(d1, xmin, xmax, nDiscrete+1) plt.plot(xdens, ydens*area, 'g', label='True density') -# Overlay the unbinned LOrPE results -x1, y1 = ns.scanDoubleFunctor1(lscv.densityFunctor(bestDegLSCV, bestHLSCV), - xmin, xmax, nDiscrete+1) -plt.plot(x1, y1*area, 'r', label='LOrPE LSCV') - -x2, y2 = ns.scanDoubleFunctor1(rlcv.densityFunctor(bestDegRLCV, bestHRLCV), - xmin, xmax, nDiscrete+1) -plt.plot(x2, y2*area, 'b', label='LOrPE RLCV') +# Run the cross-validation and overlay LOrPE results +colors = ('r', 'b' ) +labels = ('RLCV', 'LSCV' ) +functors = (ns.LOrPE1DSimpleRLCVFunctor, ns.LOrPE1DSimpleLSCVFunctor) +for color, l, fcn in zip(colors, labels, functors): + # Perform the cross-validation + cv = fcn(symbetaPower, xmin, xmax, bh, nIntegSplits, nIntegPoints, normalize, sample) + bestDeg, bestH, bestCVValue = bestDegreeAndBandwidth( + cv, robustScale, filterDegreesToTry, nBwToTry, l + ":") + ise = cv.integratedSquaredError(d1, bestDeg, bestH) + print("\nBest {} filter degree is {:.1f}, bandwidth is {:.4f}, ISE = {:.4g}\n".format( + l, bestDeg, bestH, ise)) + # Overlay the result + x, y = ns.scanDoubleFunctor1(cv.densityFunctor(bestDeg, bestH), + xmin, xmax, nDiscrete+1) + plt.plot(x, y*area, color, label=('LOrPE ' + l)) # Add labels and comments plt.xlabel('X') plt.ylabel('Density estimate') plt.title('Unbinned LOrPE Density Estimation with Cross-Validation') plt.legend() # Display the window plt.show()