Index: trunk/examples/Python/lorpe_1d_features.py =================================================================== --- trunk/examples/Python/lorpe_1d_features.py (revision 787) +++ trunk/examples/Python/lorpe_1d_features.py (revision 788) @@ -1,299 +1,299 @@ #!/usr/bin/env python3 """ This example illustrates how to perform LOrPE on unbinned samples, including the following features: -- Weighted samples -- Variable bandwidth -- Local cross-validation """ __author__="Igor Volobouev (i.volobouev@ttu.edu)" -__version__="1.0" +__version__="1.1" __date__ ="June 23 2022" # Perform necessary imports import math import npstat as ns import numpy as np from operator import itemgetter from npstat_utils import HistoND import matplotlib.pyplot as plt # Some parameters for the script sampleSize = 1000 # Number of points to generate (sample size) xmin = 0.0 # Left edge of the sample histogram xmax = 5.0 # Right edge of the sample histogram numBins = 50 # Number of bins for the sample histogram symbetaPower = 4 # Power parameter of the symmetric beta weight function # w.r.t. which the orthogonal polynomial system will # be generated. Use -1 to employ a Gaussian instead. nDiscrete = 1000 # Number of intervals for displaying the results. normalize = True # Do we want to normalize the density estimate? # Here, this only ensures that the integral is 1 # and does not check for negative densities. nIntegSplits = 50 # Number of intervals to use for various quadratures. nIntegPoints = 4 # Number of Gauss-Legendre integration points to use # on each interval for various quadratures. nBwToTry = 50 # Number of bandwidth values to scan. # Choose boundary handling method. Only two methods are currently # supported for unbinned LOrPE: # # "BM_TRUNCATE" -- Just truncate the weight function at the boundary. # # "BM_FOLD" -- Reflect (fold) the weight function at the boundary and # add to the non-reflected part. # # It is not obvious a priori which boundary handling method will perform # best for any particular case, so it might be useful to try different ones. bh = ns.BoundaryHandling("BM_TRUNCATE") # Generate a weighted sample of points on [xmin, xmax]. Here, we # use a flat distribution to generate the points and some other # distribution (a mixture of exponential and Gaussian) to assign # the weights. d1 = ns.Uniform1D(xmin, xmax - xmin) rng = ns.MersenneTwister() sampleCoords = d1.generate(rng, sampleSize) truncatedGauss = ns.TruncatedDistribution1D(ns.Gauss1D(2.0, 0.2), xmin, xmax) truncatedExponential = ns.TruncatedDistribution1D(ns.Exponential1D(0.0, 1.0), xmin, xmax) weightDistro = ns.DistributionMix1D() weightDistro.add(truncatedGauss, 0.3) weightDistro.add(truncatedExponential, 0.7) weights = [weightDistro.density(x) for x in sampleCoords] weightedSample = list(zip(sampleCoords, weights)) # Histogram for displaying the weighted sample. Note that # the type of bins is double rather than integer. h1 = HistoND("double", ns.HistoAxis(numBins, xmin, xmax, "X"), "Sample Histogram", "Weight") h1.fill(weightedSample) # The two most general NPStat functions which create objects performing # unbinned LOrPE with cross-validation are called "LOrPE1DLSCVFunctor" # and "LOrPE1DRLCVFunctor". The former performs least squares # cross-validation (LSCV) and the latter regularized likelihood # cross-validation (RLCV). The function signatures are very similar. # The only difference is that "LOrPE1DRLCVFunctor" can take an extra # parameter playing the role of regularization strength. If the default # value of that parameter (0.5) is acceptable, the functions have identical # signatures. # # We will do the following here. Only non-negative values of regularization # parameter "rlcvAlpha" make sense for RLCV. So, if the parameter below is # negative, we use LSCV, otherwise we use RLCV. Try different values of # rlcvAlpha and see what happens. rlcvAlpha = -1.0 if rlcvAlpha < 0.0: useRLCV = False lorpeFunctor = ns.LOrPE1DLSCVFunctor else: useRLCV = True lorpeFunctor = ns.LOrPE1DRLCVFunctor # Create the object which performs cross-validation. The function signature # looks as follows (see also C++ header file LOrPE1DCV.hh): # # LOrPE1DLSCVFunctor(symbetaPower, # leftBoundary, # rightBoundary, # bh, # bandwidthFunctor, # localizingWeight, # localizingWeightXmin, # localizingWeightXmax, # nIntegIntervals, # nIntegPoints, # normalizeLOrPEEstimate, # sample) # # The following arguments must be provided: # # symbetaPower is the "power", p, of the symmetric Beta function # used to generate the smoothing kernels. It must # be an integer. If it is non-negative, the # function will look like (1 - x^2)^p. If it is # negative, the standard normal density is used # instead, truncated to the interval [-10, 10]. # I typically use either -1 or 4 for this parameter. # # leftBoundary is the assumed left boundary of the support # of the density we are trying to estimate. # # rightBoundary is the assumed right boundary of the support # of the density we are trying to estimate. # # bh is the boundary handling method. Currently the # kernel extending beyond the support boundary # can be handled by either truncating or folding # (i.e., mirroring). # # bandwidthFunctor This function specifies how the LOrPE bandwidth # depends on the coordinate. The value of this # function will be multiplied by the overall # scale, "bwFactor", in order to determine the # actual location-dependent bandwidth value. # bandwidthFunctor has to be ns.PyFunctor1 wrapping # a python callable. bandwidthFunctor can also be # just a python float (typically 1.0) if variable # bandwidth is not desired. Hopefully, this explanation # will be more clear from the examples. # # localizingWeight This function specifies how cross-validation # is localized. It needs to be either ns.PyFunctor1 # wrapping a python callable or ns.DensityFunctor1D # object (density function of any 1-d probability # distribution implemented by NPStat). It can also be # just a python float (typically 1.0) if # cross-validation localization is not desired. # # localizingWeightXmin The cross-validation will only utilize sample # localizingWeightXmax points inside the interval # [localizingWeightXmin, localizingWeightXmax]. # If you want to use global cross-validation, # set localizingWeightXmin to leftBoundary # (or smaller) and localizingWeightXmax to # rightBoundary (or larger). # # nIntegIntervals These parameters specify how various quadratures # nIntegPoints are performed. "nIntegPoints" is the number of # points for the Gauss-Legendre quadrature, and # "nIntegIntervals" is the number of intervals # into which the support of the estimated density # is split (Gauss-Legendre quadrature is performed # separately on each interval). Note that # Gauss-Legendre quadratures with "nIntegPoints" # points exactly integrate polynomials up to degree # 2*nIntegPoints - 1. If you expect that your # density estimate is going to be very smooth # (this can be expected for negative symbetaPower # values), use larger nIntegPoints and smaller # nIntegIntervals. # # normalizeLOrPEEstimate If this parameter is "True", the code will # automatically normalize the LOrPE density # estimate area to 1 using numerical integration # (see parameters nIntegIntervals and nIntegPoints). # Note that this does not address the potential # problem of negative density values. # # sample The sample of points for which density estimate # is constructed. These can be either unweighted, # in which case "sample" is typically a list of # doubles, or weighted, in which case "sample" # is typically a list of two-element tuples. In # these tuples, the first element is the coordinate # and the second element is the weight. # # LOrPE cross-validation objects can also be created by functions with # simpler signatures. Use "LOrPE1DGlobLSCVFunctor" if local cross-validation # is not needed and "LOrPE1DSimpleLSCVFunctor" if neither local # cross-validation nor location-dependent bandwidth is desired. Similar # finctions exist for RLCV. See the C++ header file LOrPE1DCV.hh for # exact signatures. # The example callable to use for bandwidth dependence on the coordinate class QuadraticBandwidth: def __init__(self, a, x0, y0): self.a = a self.x0 = x0 self.y0 = y0 assert self.y0 > 0.0 assert self.a >= 0.0 def __call__(self, x): delta = x - self.x0 return self.a*delta*delta + self.y0 bandwidthCallable = QuadraticBandwidth(1.0, 2.0, 1.0) bandwidthFunctor = ns.PyFunctor1(bandwidthCallable) # You can also try to uncomment the following in order # to have no bandwidth dependence on the coordinate: # # bandwidthFunctor = 1.0 # Example distribution to use to localize the weight localizingDistro = ns.Gauss1D(2.0, 0.5) localizingWeight = ns.DensityFunctor1D(localizingDistro) localizingWeightXmin = localizingDistro.quantile(0.0) localizingWeightXmax = localizingDistro.quantile(1.0) # Create the LOrPE cross-validation object cv = lorpeFunctor(symbetaPower, xmin, xmax, bh, bandwidthFunctor, localizingWeight, localizingWeightXmin, localizingWeightXmax, nIntegSplits, nIntegPoints, normalize, weightedSample) # Check if we use RLCV and set the regularization parameter # if this is the case if useRLCV: cv.setRlcvAlpha(rlcvAlpha) # For simplicity, we will fix the filter degree here to 0 # and scan the bandwidth without optimizing the degree. # This corresponds to using KDE with boundary correction. # One can, of course, also optimize the degree. See example # script "lorpe_1d_unbinned_cv.py" for this. filterDegree = 0 bwmin = 0.02 bwmax = 1.0 bandwidthFactors = np.array(ns.EquidistantInLogSpace(bwmin, bwmax, nBwToTry), dtype=np.double) cvValues = [cv(filterDegree, bw) for bw in bandwidthFactors] # Find the best bandwidth factor (in log space). # The function "findScanMaximum1D" will fit # a parabola around the maximum value. bestCV = ns.findScanMaximum1D(np.log(bandwidthFactors), cvValues) bestBWFactor = math.exp(bestCV.location()) print("Best bandwidth factor for degree {:.1f} is {:.4f}".format( filterDegree, bestBWFactor)) # Show the bandwidth scan results. Kill the plot window # to proceed beyond this point. plt.plot(bandwidthFactors, cvValues) plt.xscale('log') plt.xlabel('Bandwidth Factor') plt.ylabel('CV Function') plt.title('Cross-Validation Function Dependence on Bandwidth') plt.show() # Plot the dependence of the bandwidth on the coordinate xcoords = np.linspace(xmin, xmax, nDiscrete+1) -bw = [bestBWFactor*bandwidthCallable(x) for x in xcoords] +bw = [bestBWFactor*cv.bandwidthCurve(x) for x in xcoords] plt.plot(xcoords, bw) plt.xlabel('X') plt.ylabel('Bandwidth') plt.title('Bandwidth Dependence on Coordinate') plt.show() # Plot the sample histogram xh, yh = ns.histoOutline1D(h1) plt.plot(xh, yh, 'k') # Overlay the original density. Scale all densities so that # their plots are compatible with the histogram height. area = h1.integral() xdens, ydens = ns.scanDensity1D(weightDistro, xmin, xmax, nDiscrete+1) plt.plot(xdens, ydens*area, 'g', label='True density') # Overlay the unbinned LOrPE results x1, y1 = ns.scanDoubleFunctor1(cv.densityFunctor(filterDegree, bestBWFactor), xmin, xmax, nDiscrete+1) label = "LOrPE " if useRLCV: label += "RLCV" else: label += "LSCV" plt.plot(x1, y1*area, 'r', label=label) # Add labels and comments plt.xlabel('X') plt.ylabel('Density estimate') plt.title('LOrPE with Local Cross-Validation') plt.legend() # Display the window plt.show()