Page MenuHomeHEPForge

No OneTemporary

Index: trunk/npstat/swig/npstat_density_estimation_1d.py
===================================================================
--- trunk/npstat/swig/npstat_density_estimation_1d.py (revision 628)
+++ trunk/npstat/swig/npstat_density_estimation_1d.py (revision 629)
@@ -1,458 +1,468 @@
"""
This module provides a number of useful wrapper classes and functions
related to 1-d density estimation
"""
__author__ = "Igor Volobouev (i.volobouev@ttu.edu)"
__version__ = "1.1"
__date__ = "Sep 20 2019"
import npstat
import numpy as np
from npstat_utils import histoBinType
def histoPluginBandwidth1D(h, sampleSize, deg, symbetaPower):
if h.dim() != 1:
raise ValueError("Expected 1-d histogram as an argument")
assert deg >= 0.0, "Filter degree can not be negative"
if sampleSize <= 0.0:
sampleSize = h.nFillsInRange()
qvalues = (0.15865525393145705, 0.84134474606854295)
quantiles = npstat.histoQuantiles(h, 0, qvalues)
sigmaEstimate = (quantiles[1] - quantiles[0])/2.0
pluginBw = npstat.approxAmisePluginBwGauss(deg, sampleSize, sigmaEstimate)
if symbetaPower >= 0.0:
pluginBw *= npstat.approxSymbetaBandwidthRatio(symbetaPower, deg);
return pluginBw
def BandwidthCVLeastSquares1D(typename, *args):
"""
Least squares cross-validation helper class
"""
funcs = {'unsigned char' : npstat.UCharBandwidthCVLeastSquares1D,
'int' : npstat.IntBandwidthCVLeastSquares1D,
'long long' : npstat.LLongBandwidthCVLeastSquares1D,
'float' : npstat.FloatBandwidthCVLeastSquares1D,
'double' : npstat.DoubleBandwidthCVLeastSquares1D}
return funcs[typename](*args)
def BandwidthCVPseudoLogli1D(typename, *args):
"""
Pseudo-likelihood cross-validation helper class
"""
funcs = {'unsigned char' : npstat.UCharBandwidthCVPseudoLogli1D,
'int' : npstat.IntBandwidthCVPseudoLogli1D,
'long long' : npstat.LLongBandwidthCVPseudoLogli1D,
'float' : npstat.FloatBandwidthCVPseudoLogli1D,
'double' : npstat.DoubleBandwidthCVPseudoLogli1D}
return funcs[typename](*args)
class _BinnedCVCalc1D:
def __init__(self, effectiveSampleSize, weightedSample):
self._effectiveSampleSize = effectiveSampleSize
self._weightedSample = weightedSample
def __call__(self, h, result, filter):
calc = self._makeCalc(h)
if self._weightedSample is not None:
return calc.cvWeightedSample(h, self._weightedSample, result, filter)
elif self._effectiveSampleSize is not None:
return calc.cvWeighted(h, self._effectiveSampleSize, result, filter)
else:
return calc.cv(h, result, filter)
def _makeCalc(self, h):
raise NotImplementedError
class BinnedRLCVCalc1D(_BinnedCVCalc1D):
def __init__(self, regParameter, effectiveSampleSize=None, weightedSample=None):
assert regParameter >= 0.0, "RLCV regularisation parameter can not be negative"
_BinnedCVCalc1D.__init__(self, effectiveSampleSize, weightedSample)
self._regParameter = regParameter
def _makeCalc(self, h):
return BandwidthCVPseudoLogli1D(histoBinType(h), self._regParameter)
class BinnedLSCVCalc1D(_BinnedCVCalc1D):
def __init__(self, effectiveSampleSize=None, weightedSample=None):
_BinnedCVCalc1D.__init__(self, effectiveSampleSize, weightedSample)
def _makeCalc(self, h):
return BandwidthCVLeastSquares1D(histoBinType(h))
class _DiscreteDEBase1D:
"""
Base class for 1-d density estimation on regular grids
"""
def __init__(self, prototypeHisto, i_allowNegativeDensities):
if prototypeHisto.dim() != 1:
raise ValueError("Expected 1-d histogram as a prototype")
if not prototypeHisto.isUniformlyBinned():
raise ValueError("Expected uniformy binned histogram as a prototype")
self._allowNegative = i_allowNegativeDensities
axis = prototypeHisto.axis(0)
self._nBins = axis.nBins()
self._binWidth = axis.binWidth(0)
def expectedNBins(self):
return self._nBins
def expectedBinWidth(self):
return self._binWidth
def allowNegativeDensities(self, b):
self._allowNegative = b
def areNegativeDensitiesAllowed(self):
return self._allowNegative
def isCompatible(self, h):
if h.dim() != 1:
return False
if not h.isUniformlyBinned():
return False
axis = h.axis(0)
return self._nBins == axis.nBins() and self._binWidth == axis.binWidth(0)
def validateData(self, h):
if not self.isCompatible(h):
raise ValueError("Incompatible input histogram")
def setData(self, h):
raise NotImplementedError
def isDataSet(self):
raise NotImplementedError
def estimateDataDensity(self, filterDegree, bandwidth=-1.0,
symbetaPower=-1, rememberFilter=True):
raise NotImplementedError
def estimateDensity(self, h, filterDegree, bandwidth=-1.0,
symbetaPower=-1, rememberFilter=True):
assert filterDegree >= 0.0, "Filter degree can not be negative"
self.setData(h)
return self.estimateDataDensity(filterDegree, bandwidth,
symbetaPower, rememberFilter)
def addFilter(self, filterDegree, bandwidth=-1.0, symbetaPower=-1):
raise NotImplementedError
def nFiltersAdded(self):
raise NotImplementedError
def clearFilters(self):
raise NotImplementedError
def bestFilterByCV(self, h, cvCalc):
raise NotImplementedError
class FFTKDE1D(_DiscreteDEBase1D):
"""
Fast 1-d KDE on regular grids utilizing FFT for convolutions
"""
def __init__(self, prototypeHisto, allowNegativeResults=True,
useMirroring=True):
_DiscreteDEBase1D.__init__(self, prototypeHisto, allowNegativeResults)
self._mirror = useMirroring
self._engine = npstat.ConvolutionEngine1D(2*self._nBins)
self._kernelTable = dict()
self._kernelProperties = list()
self._directId = 2147483647
self._directKernelKey = (0.0, -1.0, 0.0)
# Add more constructor parameters if you want to
# change the settings used below
self._bh = npstat.BoundaryHandling("BM_TRUNCATE")
self._gaussKernelNSigmas = 12.0
def _makeKernel(self, filterDegree, bandwidth, symbetaPower):
assert filterDegree >= 0.0, "Filter degree can not be negative"
assert bandwidth > 0.0, "Kernel bandwidth must be positive"
if symbetaPower < 0.0:
distro = npstat.TruncatedGauss1D(0.0, bandwidth, self._gaussKernelNSigmas)
else:
distro = npstat.SymmetricBeta1D(0.0, bandwidth, symbetaPower)
builder = npstat.getBoundaryFilter1DBuilder(self._bh, distro, self._binWidth)
taper = npstat.contDegreeTaper(filterDegree)
kernelCenter = self._nBins - 1
pf = builder.makeFilter(taper, len(taper)-1, kernelCenter, 2*self._nBins)
- selfContrib = pf.valueAt(pf.peakPosition())
- return (npstat.polyFilter1DToNumpy(pf, kernelCenter, 2*self._nBins), selfContrib)
+ selfContribution = pf.valueAt(pf.peakPosition())
+ return (npstat.polyFilter1DToNumpy(pf, kernelCenter, 2*self._nBins),
+ selfContribution)
def addFilter(self, filterDegree, bandwidth, symbetaPower):
"""
Add a kernel for future use in cross-validation
"""
assert filterDegree >= 0.0, "Filter degree can not be negative"
key = (filterDegree, bandwidth, symbetaPower)
if key not in self._kernelTable:
id = len(self._kernelProperties)
assert id < self._directId, "Too many kernels"
k, sc = self._makeKernel(filterDegree, bandwidth, symbetaPower)
self._engine.depositFilter(id, k, self._nBins+1)
self._kernelTable[key] = id
self._kernelProperties.append((filterDegree, bandwidth, symbetaPower, sc))
return self._kernelTable[key]
def nFiltersAdded(self):
sz = len(self._kernelProperties)
assert sz == self._engine.filterBankSize(), "Filter counting failure"
return sz
def clearFilters(self):
self._kernelTable = dict()
self._kernelProperties = list()
self._engine.clearFilterBank();
self._directKernelKey = (0.0, -1.0, 0.0)
def _getKernelId(self, filterDegree, bandwidth, symbetaPower, addKern):
assert filterDegree >= 0.0, "Filter degree can not be negative"
assert bandwidth > 0.0, "Kernel bandwidth must be positive"
if addKern:
return self.addFilter(filterDegree, bandwidth, symbetaPower)
key = (filterDegree, bandwidth, symbetaPower)
if key in self._kernelTable:
return self._kernelTable[key]
if key != self._directKernelKey:
k, sc = self._makeKernel(filterDegree, bandwidth, symbetaPower)
self._engine.depositFilter(self._directId, k, self._nBins+1)
self._directKernelKey = key
return self._directId
def setData(self, h):
self.validateData(h)
self._engine.setFilter(npstat.doubleHistoBins1D(h, True, self._mirror))
def isDataSet(self):
return self._engine.isFilterSet()
def _buildResult(self, id, makeNonNegative):
arr = self._engine.convolveFilterAndDeposit(id, makeNonNegative)
result = np.resize(arr, self._nBins)
# The result we have now is normalized to the histogram area
# in case of mirroring, and potentially has some leakage
# if there is no mirroring. Renormalize everything to 1.
area = np.sum(result)*self._binWidth
assert area > 0.0, "Result can not be normalized. "\
"Was the input histogram filled?"
result /= area
return result
def estimateDataDensity(self, filterDegree, bandwidth, symbetaPower, addKern=True):
assert self._engine.isFilterSet(), "Data is not set"
assert filterDegree >= 0.0, "Filter degree can not be negative"
id = self._getKernelId(filterDegree, bandwidth, symbetaPower, addKern)
makeNonNegative = (filterDegree == 0.0 or (not self._allowNegative))
return self._buildResult(id, makeNonNegative)
def bestFilterByCV(self, h, cvCalc):
"""
Choose the best kernel among kernels added previsously utilizing
cross-validation
"""
assert self.nFiltersAdded() > 0, "No kernels added"
self.setData(h)
bestCVValue = -1.7e308
bestId = -1
bestResult = None
for id, properties in enumerate(self._kernelProperties):
filterDegree, bandwidth, symbetaPower, sc = properties
makeNonNegative = (filterDegree == 0.0 or (not self._allowNegative))
result = self._buildResult(id, makeNonNegative)
dummyFilter = npstat.ContribConstant1D(sc, h.nBins())
cvValue = cvCalc(h, result, dummyFilter)
if cvValue > bestCVValue:
bestCVValue = cvValue
bestId = id
bestResult = result
assert bestId >= 0, "Kernel optimization scan failed"
filterDegree, bandwidth, symbetaPower, sc = self._kernelProperties[bestId]
key = (filterDegree, bandwidth, symbetaPower)
return (bestCVValue, key, bestResult)
class LOrPE1D(_DiscreteDEBase1D):
"""
Density estimation by LOrPE on regular 1-d grids
"""
def __init__(self, prototypeHisto, allowNegativeResults=True, bh = None):
_DiscreteDEBase1D.__init__(self, prototypeHisto, allowNegativeResults)
self._bh = bh
if bh is None:
self._bh = npstat.BoundaryHandling("BM_FOLD")
self._data = None
self._filterTable = dict()
def setData(self, h):
self.validateData(h)
self._data = npstat.doubleHistoBins1D(h)
def isDataSet(self):
return self._data is not None
def _makeFilter(self, filterDegree, bandwidth, symbetaPower):
assert filterDegree >= 0.0, "Filter degree can not be negative"
return npstat.symbetaLOrPEFilter1D(
symbetaPower, bandwidth, filterDegree,
self._nBins, self._binWidth, self._bh)
def estimateDataDensity(self, filterDegree, bandwidth,
symbetaPower, rememberFilter=True):
assert self.isDataSet(), "Data is not set"
assert filterDegree >= 0.0, "Filter degree can not be negative"
key = (filterDegree, bandwidth, symbetaPower)
if key in self._filterTable:
filt = self._filterTable[key]
elif rememberFilter:
filt = self.addFilter(filterDegree, bandwidth, symbetaPower)
else:
filt = self._makeFilter(filterDegree, bandwidth, symbetaPower)
makeNonNegative = (filterDegree == 0.0 or (not self._allowNegative))
result = filt.filter(self._data, makeNonNegative)
area = np.sum(result)*self._binWidth
assert area > 0.0, "Result can not be normalized. "\
"Was the input histogram filled?"
result /= area
return result
def addFilter(self, filterDegree, bandwidth, symbetaPower):
assert filterDegree >= 0.0, "Filter degree can not be negative"
key = (filterDegree, bandwidth, symbetaPower)
if key in self._filterTable:
filter = self._filterTable[key]
else:
filter = self._makeFilter(filterDegree, bandwidth, symbetaPower)
self._filterTable[key] = filter
return filter
def nFiltersAdded(self):
return len(self._filterTable)
def clearFilters(self):
self._filterTable = dict()
def bestFilterByCV(self, h, cvCalc):
assert self.nFiltersAdded() > 0, "No filters added"
self.setData(h)
bestCVValue = -1.7e308
bestKey = (0.0, -1.0, 0.0)
bestResult = None
buffer = np.empty((self._nBins,), dtype=np.double)
for key, filt in self._filterTable.items():
filterDegree, bandwidth, symbetaPower = key
makeNonNegative = (filterDegree == 0.0 or (not self._allowNegative))
filt.filterIntoBuffer(self._data, buffer, makeNonNegative)
area = np.sum(buffer)*self._binWidth
assert area > 0.0, "Result can not be normalized. "\
"Was the input histogram filled?"
buffer /= area
cvValue = cvCalc(h, buffer, filt)
if cvValue > bestCVValue:
bestCVValue = cvValue
bestKey = key
bestResult = np.copy(buffer)
assert bestResult is not None, "Filter optimization scan failed"
return (bestCVValue, bestKey, bestResult)
class DiscreteOSDE1D(_DiscreteDEBase1D):
"""
OSDE on regular 1-d grids
"""
- def __init__(self, prototypeHisto, allowNegativeResults=True, maxdeg=50):
+ def __init__(self, prototypeHisto, allowNegativeResults=True, maxdeg=50,
+ weightFunction=None):
_DiscreteDEBase1D.__init__(self, prototypeHisto, allowNegativeResults)
assert maxdeg >= 0, "Filter degree can not be negative"
self._data = None
self._filterTable = dict()
- self._maxdeg = max(self._nBins-1, maxdeg)
- self._poly = npstat.OrthoPoly1D(self._maxdeg, np.ones(self._maxdeg+1), self._binWidth)
+ self._maxdeg = min(self._nBins-1, maxdeg)
+ wF = weightFunction
+ if wF is None:
+ wF = np.ones(self._nBins)
+ else:
+ assert len(wF) == self._nBins, "Incompatible number of weights"
+ self._poly = npstat.OrthoPoly1D(self._maxdeg, wF, self._binWidth)
+
+ def maxDegree(self):
+ return self._maxdeg
def setData(self, h):
self.validateData(h)
self._data = np.empty((self._maxdeg+1,), dtype=np.double)
self._poly.calculateCoeffs(npstat.doubleHistoBins1D(h), self._data)
def isDataSet(self):
return self._data is not None
def _makeFilter(self, filterDegree):
assert filterDegree >= 0.0, "Filter degree can not be negative"
taper = npstat.contDegreeTaper(filterDegree)
if len(taper) > self._maxdeg+1:
raise ValueError("Filter degree is too large")
coeffs = np.array(taper, dtype=np.double)
response = np.empty((self._nBins,), dtype=np.double)
self._poly.globalFilterDiag(coeffs, response)
contrib = npstat.ContribVector1D(response)
return (coeffs, contrib)
def estimateDataDensity(self, filterDegree, rememberFilter=True):
assert self.isDataSet(), "Data is not set"
assert filterDegree >= 0.0, "Filter degree can not be negative"
key = filterDegree
if key in self._filterTable:
taper, contrib = self._filterTable[key]
elif rememberFilter:
taper, contrib = self.addFilter(filterDegree)
else:
taper, contrib = self._makeFilter(filterDegree)
coeffs = np.array([a*b for a,b in zip(taper,self._data)], dtype=np.double)
result = np.empty((self._nBins,), dtype=np.double)
self._poly.allSeries(coeffs, result, not self._allowNegative)
area = np.sum(result)*self._binWidth
assert area > 0.0, "Result can not be normalized. "\
"Was the input histogram filled?"
result /= area
return result
def estimateDensity(self, h, filterDegree, rememberFilter=True):
assert filterDegree >= 0.0, "Filter degree can not be negative"
self.setData(h)
return self.estimateDataDensity(filterDegree, rememberFilter)
def addFilter(self, filterDegree):
assert filterDegree >= 0.0, "Filter degree can not be negative"
key = filterDegree
if key in self._filterTable:
filter = self._filterTable[key]
else:
filter = self._makeFilter(filterDegree)
self._filterTable[key] = filter
return filter
def nFiltersAdded(self):
return len(self._filterTable)
def clearFilters(self):
self._filterTable = dict()
def bestFilterByCV(self, h, cvCalc):
assert self.nFiltersAdded() > 0, "No filters added"
self.setData(h)
bestCVValue = -1.7e308
bestKey = -1.0
bestResult = None
buffer = np.empty((self._nBins,), dtype=np.double)
for key, filt in self._filterTable.items():
taper, contrib = filt
coeffs = np.array([a*b for a,b in zip(taper,self._data)], dtype=np.double)
self._poly.allSeries(coeffs, buffer, not self._allowNegative)
area = np.sum(buffer)*self._binWidth
assert area > 0.0, "Result can not be normalized. "\
"Was the input histogram filled?"
buffer /= area
cvValue = cvCalc(h, buffer, contrib)
if cvValue > bestCVValue:
bestCVValue = cvValue
bestKey = key
bestResult = np.copy(buffer)
assert bestResult is not None, "Filter optimization scan failed"
return (bestCVValue, bestKey, bestResult)
Index: trunk/examples/Python/lorpe_1d_cv.py
===================================================================
--- trunk/examples/Python/lorpe_1d_cv.py (revision 628)
+++ trunk/examples/Python/lorpe_1d_cv.py (revision 629)
@@ -1,120 +1,119 @@
#!/usr/bin/env python3
"""
This example illistrates how to choose LOrPE filter settings
in 1-d by cross-validation
"""
__author__="Igor Volobouev (i.volobouev@ttu.edu)"
__version__="1.0"
__date__ ="Sep 19 2019"
# Perform necessary imports
import npstat as ns
from npstat_utils import HistoND
from npstat_density_estimation_1d import LOrPE1D, FFTKDE1D, \
histoPluginBandwidth1D, BinnedLSCVCalc1D, BinnedRLCVCalc1D
import matplotlib.pyplot as plt
import numpy as np
import math
# Some parameters for the script
sampleSize = 1000 # Number of points to generate (sample size)
xmin = 0.0 # Left edge of the sample histogram
xmax = 5.0 # Right edge of the sample histogram
numBins = 100 # Number of bins for the sample histogram
nDiscrete = 500 # Number of cells for sample discretization.
# Should be large enough so that the cell width
# is substantially smaller than any physical scale.
symbetaPower = 4 # Power parameter of the symmetric beta kernel.
# Specify -1 to use a Gaussian kernel instead.
pluginDegree = 2 # Degree of polynomial used to generate a simple
# plug-in estimate for comparison with cross-validation
# Here, we will use the exponential distribution truncated to [xmin, xmax]
# in order to generate the random sample
d1 = ns.TruncatedDistribution1D(ns.Exponential1D(0.0, 1.0), xmin, xmax)
# Generate a random sample according to this density
rng = ns.MersenneTwister()
sample = d1.generate(rng, sampleSize)
# Histogram for displaying the sample
h1 = HistoND("int", ns.HistoAxis(numBins, xmin, xmax, "X"),
"Sample Histogram", "Counts")
any(h1.fill(x, 1) for x in sample)
# Histogram for sample discretization
h2 = HistoND("int", ns.HistoAxis(nDiscrete, xmin, xmax, "X"),
"LOrPE Discretization Histogram", "Counts")
any(h2.fill(x, 1) for x in sample)
# Object which will perform LOrPE
smoother = LOrPE1D(h2)
# Add a simple density estimate with a plug-in bandwidth
bw0 = histoPluginBandwidth1D(h2, sampleSize, pluginDegree, symbetaPower)
print("Plug-in bandwidth is {:.4f} for degree {}".format(bw0, pluginDegree))
yPlugin = smoother.estimateDensity(h2, pluginDegree, bw0, symbetaPower)
# Add vanilla KDE for comparison (without mirroring and high order kernels)
kde = FFTKDE1D(h2, False, False)
bwkde = histoPluginBandwidth1D(h2, sampleSize, 0, symbetaPower)
yKDE = kde.estimateDensity(h2, 0, bwkde, symbetaPower)
print("KDE bandwidth is {:.4f}".format(bwkde))
# Add a number of filters with different filter degrees and bandwidth
# settings for cross-validation calculations
for filterDegree in range(1, 7):
bw0 = histoPluginBandwidth1D(h2, sampleSize, filterDegree, symbetaPower)
print("Building degree {} filters, plug-in bandwidth is {:.4f}".format(
filterDegree, bw0))
for bw in np.logspace(math.log10(bw0/4.0), math.log10(bw0*4.0), 50):
smoother.addFilter(filterDegree, bw, symbetaPower)
# Run the least squares cross-validation (LSCV)
lscvCalc = BinnedLSCVCalc1D()
bestLSCV, paramsLSCV, yLSCV = smoother.bestFilterByCV(h2, lscvCalc)
lscvDegree, lscvBandwidth, lscvSymbetaPower = paramsLSCV
print("LSCV: bandwidth is {:.4f}, degree is {}".format(lscvBandwidth, lscvDegree))
# Run the regularized likelihood cross-validation (RLCV)
regularisationParameter = 0.5
rlcvCalc = BinnedRLCVCalc1D(regularisationParameter)
bestRLCV, paramsRLCV, yRLCV = smoother.bestFilterByCV(h2, rlcvCalc)
rlcvDegree, rlcvBandwidth, rlcvSymbetaPower = paramsRLCV
print("RLCV: bandwidth is {:.4f}, degree is {}".format(rlcvBandwidth, rlcvDegree))
# Create Matplotlib plot window
fig = plt.figure()
ax = fig.add_subplot(111)
# Plot the sample histogram
xh, yh = ns.histoOutline1D(h1)
ax.plot(xh, yh, 'k')
# Overlay the original density. Scale all densities so that their
# plots are compatible with the histogram height.
verticalScale = h1.binVolume()*sampleSize
xdens, ydens = ns.scanDensity1D(d1, xmin, xmax, nDiscrete+1)
ax.plot(xdens, ydens*verticalScale, 'g', label='True density')
# Overlay the KDE
coords = h2.axis(0).binCenters()
ax.plot(coords, yKDE*verticalScale, 'm', label="KDE")
# Overlay the LOrPE results with various bandwidth values
-bandwidths = (bw0, rlcvBandwidth, lscvBandwidth)
colors = ('c', 'r', 'b' )
labels = ('Plug-in', 'RLCV', 'LSCV' )
results = (yPlugin, yRLCV, yLSCV )
-for bw, color, l, y in zip(bandwidths, colors, labels, results):
+for color, l, y in zip(colors, labels, results):
ax.plot(coords, y*verticalScale, color, label="{} estimate".format(l))
# Add labels and comments
plt.xlabel('X')
plt.ylabel('Density estimate')
plt.title('LOrPE with Cross-Validation')
plt.legend()
# Display the window
plt.show()
Index: trunk/examples/Python/discrete_osde_1d.py
===================================================================
--- trunk/examples/Python/discrete_osde_1d.py (revision 0)
+++ trunk/examples/Python/discrete_osde_1d.py (revision 629)
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""
+This example illistrates how to perform univariate orthogonal series
+density estimation (OSDE) on equdistant grids using NPStat
+"""
+
+__author__="Igor Volobouev (i.volobouev@ttu.edu)"
+__version__="1.0"
+__date__ ="Sep 20 2019"
+
+# Perform necessary imports
+import npstat as ns
+from npstat_utils import HistoND
+from npstat_density_estimation_1d import DiscreteOSDE1D, \
+ BinnedLSCVCalc1D, BinnedRLCVCalc1D
+import matplotlib.pyplot as plt
+import numpy as np
+
+# Some parameters for the script
+sampleSize = 500 # Number of points to generate (sample size)
+xmin = -3.5 # Left edge of the sample histogram
+xmax = 3.5 # Right edge of the sample histogram
+numBins = 70 # Number of bins for the sample histogram
+nDiscrete = 700 # Number of cells for sample discretization
+filterDegree = 10 # Initial guess for the OSDE degree
+
+# We will use a 1-d Gassian mixture to generate the random sample.
+# See the "brute_force_kde_1d.py" script for more comments on this.
+component1 = ns.GaussianMixtureEntry(3.0, -1.0, 0.5)
+component2 = ns.GaussianMixtureEntry(7.0, 1.0, 0.5)
+mixture = ns.GaussianMixture1D(0.0, 1.0, (component1, component2))
+
+# Generate a random sample according to this density
+rng = ns.MersenneTwister()
+sample = mixture.generate(rng, sampleSize)
+
+# Histogram for displaying the sample
+h1 = HistoND("int", ns.HistoAxis(numBins, xmin, xmax, "X"),
+ "Sample Histogram", "Counts")
+any(h1.fill(x, 1) for x in sample)
+
+# Histogram for sample discretization
+h2 = HistoND("int", ns.HistoAxis(nDiscrete, xmin, xmax, "X"),
+ "OSDE Discretization Histogram", "Counts")
+any(h2.fill(x, 1) for x in sample)
+
+# Object which will perform OSDE
+smoother = DiscreteOSDE1D(h2)
+
+# Initial guesstimate
+yFirst = smoother.estimateDensity(h2, filterDegree)
+print("Initial guess OSDE degree is ", filterDegree)
+
+# Add a number of filters with different degrees for cross-validation
+# calculations. Note that degrees do not have to be integer.
+for degree in np.linspace(2.0, 40.0, 77):
+ smoother.addFilter(degree)
+
+# Run the least squares cross-validation (LSCV)
+lscvCalc = BinnedLSCVCalc1D()
+bestLSCV, lscvFilterDegree, yLSCV = smoother.bestFilterByCV(h2, lscvCalc)
+print("LSCV OSDE degree is ", lscvFilterDegree)
+
+# Run the regularized likelihood cross-validation (RLCV)
+regularisationParameter = 0.5
+rlcvCalc = BinnedRLCVCalc1D(regularisationParameter)
+bestRLCV, rlcvFilterDegree, yRLCV = smoother.bestFilterByCV(h2, rlcvCalc)
+print("RLCV OSDE degree is ", rlcvFilterDegree)
+
+# We are done with density estimation at this point. The rest of the script
+# is just plotting results. To begin with, create a Matplotlib plot window.
+fig = plt.figure()
+ax = fig.add_subplot(111)
+
+# Plot the sample histogram
+xh, yh = ns.histoOutline1D(h1)
+ax.plot(xh, yh, 'k')
+
+# Overlay the original density. Scale all densities so that their
+# plots are compatible with the histogram height.
+verticalScale = h1.binVolume()*sampleSize
+xdens, ydens = ns.scanDensity1D(mixture, xmin, xmax, nDiscrete+1)
+ax.plot(xdens, ydens*verticalScale, 'g', label='True density')
+
+# Overlay the OSDE results with various degrees
+xcoords = h2.axis(0).binCenters()
+colors = ('c', 'r', 'b' )
+labels = ('Guess', 'RLCV', 'LSCV' )
+results = (yFirst, yRLCV, yLSCV )
+for color, l, y in zip(colors, labels, results):
+ ax.plot(xcoords, y*verticalScale, color, label="{} estimate".format(l))
+
+# Set the x range of the plot to that of the sample histogram
+ax.set_xlim(h1.axis(0).min(), h1.axis(0).max())
+
+# Add labels and comments
+plt.xlabel('X')
+plt.ylabel('Density estimate')
+plt.title('Legendre Basis OSDE')
+plt.legend(loc=2)
+
+# Display the window
+plt.show()
Property changes on: trunk/examples/Python/discrete_osde_1d.py
___________________________________________________________________
Added: svn:executable
## -0,0 +1 ##
+*
\ No newline at end of property
Index: trunk/examples/Python/kde_1d_by_fft.py
===================================================================
--- trunk/examples/Python/kde_1d_by_fft.py (revision 628)
+++ trunk/examples/Python/kde_1d_by_fft.py (revision 629)
@@ -1,120 +1,119 @@
#!/usr/bin/env python3
"""
This example illistrates how to perform constant-bandwidth univariate
kernel density estimation (KDE) utilizing fast Fourier transforms (FFT)
with the help of the NPStat package.
"""
__author__="Igor Volobouev (i.volobouev@ttu.edu)"
__version__="1.0"
__date__ ="Sep 19 2019"
# Perform necessary imports
import npstat as ns
from npstat_utils import HistoND
from npstat_density_estimation_1d import FFTKDE1D, histoPluginBandwidth1D, \
BinnedLSCVCalc1D, BinnedRLCVCalc1D
import matplotlib.pyplot as plt
import numpy as np
import math
# Some parameters for the script
sampleSize = 500 # Number of points to generate (sample size)
xmin = -3.5 # Left edge of the sample histogram
xmax = 3.5 # Right edge of the sample histogram
numBins = 70 # Number of bins for the sample histogram
nDiscrete = 1024 # Number of cells for sample discretization.
# Should be a power of 2 (this results in the
# most efficient application of FFT).
symbetaPower = -1 # Power parameter of the symmetric beta kernel.
# Specify -1 to use a Gaussian kernel instead.
filterDegree = 2 # Degree of polynomial used to generate high-order kernels.
# In general, even numbers make the most sense here.
# Kernel order will then be filterDegree + 2. The
# kernel will be a bona-fide density (i.e., non-negative
# everywhere) only if filterDegree is 0.
# We will use a 1-d Gassian mixture to generate the random sample.
# See the "brute_force_kde_1d.py" script for more comments on this.
component1 = ns.GaussianMixtureEntry(3.0, -1.0, 0.5)
component2 = ns.GaussianMixtureEntry(7.0, 1.0, 0.5)
mixture = ns.GaussianMixture1D(0.0, 1.0, (component1, component2))
# Generate a random sample according to this density
rng = ns.MersenneTwister()
sample = mixture.generate(rng, sampleSize)
# Histogram for displaying the sample
h1 = HistoND("int", ns.HistoAxis(numBins, xmin, xmax, "X"),
"Sample Histogram", "Counts")
any(h1.fill(x, 1) for x in sample)
# Histogram for sample discretization. Note that the axis range
# is increased, to catch all possible tails. Such an increase may
# or may not make sense, depending on application.
h2 = HistoND("int", ns.HistoAxis(nDiscrete, 1.5*xmin, 1.5*xmax, "X"),
"KDE Discretization Histogram", "Counts")
any(h2.fill(x, 1) for x in sample)
# Object which will perform KDE
smoother = FFTKDE1D(h2)
# Make an initial guess for the KDE bandwidth. This guess will use
# a robust measure of the sample scale, estimated from the histogram.
bw0 = histoPluginBandwidth1D(h2, sampleSize, filterDegree, symbetaPower)
print("Plug-in bandwidth is {:.4f}".format(bw0))
yPlugin = smoother.estimateDensity(h2, filterDegree, bw0, symbetaPower)
# Add a number of kernels with different bandwidth values
# for cross-validation calculations
for bw in np.logspace(math.log10(bw0/4.0), math.log10(bw0*2.0), 100):
smoother.addFilter(filterDegree, bw, symbetaPower)
# Run the least squares cross-validation (LSCV)
lscvCalc = BinnedLSCVCalc1D()
bestLSCV, paramsLSCV, yLSCV = smoother.bestFilterByCV(h2, lscvCalc)
lscvFilterDegree, lscvBandwidth, lscvSymbetaPower = paramsLSCV
print("LSCV bandwidth is {:.4f}".format(lscvBandwidth))
# Run the regularized likelihood cross-validation (RLCV)
regularisationParameter = 0.5
rlcvCalc = BinnedRLCVCalc1D(regularisationParameter)
bestRLCV, paramsRLCV, yRLCV = smoother.bestFilterByCV(h2, rlcvCalc)
rlcvFilterDegree, rlcvBandwidth, rlcvSymbetaPower = paramsRLCV
print("RLCV bandwidth is {:.4f}".format(rlcvBandwidth))
# We are done with density estimation at this point. The rest of the script
# is just plotting results. To begin with, create a Matplotlib plot window.
fig = plt.figure()
ax = fig.add_subplot(111)
# Plot the sample histogram
xh, yh = ns.histoOutline1D(h1)
ax.plot(xh, yh, 'k')
# Overlay the original density. Scale all densities so that their
# plots are compatible with the histogram height.
verticalScale = h1.binVolume()*sampleSize
xdens, ydens = ns.scanDensity1D(mixture, xmin, xmax, nDiscrete+1)
ax.plot(xdens, ydens*verticalScale, 'g', label='True density')
# Overlay the KDE results with various bandwidth values
xkde = h2.axis(0).binCenters()
-bandwidths = (bw0, rlcvBandwidth, lscvBandwidth)
colors = ('c', 'r', 'b' )
labels = ('Plug-in', 'RLCV', 'LSCV' )
results = (yPlugin, yRLCV, yLSCV )
-for bw, color, l, y in zip(bandwidths, colors, labels, results):
+for color, l, y in zip(colors, labels, results):
ax.plot(xkde, y*verticalScale, color, label="{} estimate".format(l))
# Set the x range of the plot to that of the sample histogram
ax.set_xlim(h1.axis(0).min(), h1.axis(0).max())
# Add labels and comments
plt.xlabel('X')
plt.ylabel('Density estimate')
plt.title('KDE by FFT')
plt.legend(loc=2)
# Display the window
plt.show()

File Metadata

Mime Type
text/x-diff
Expires
Wed, May 14, 11:04 AM (21 h, 45 s)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
5111337
Default Alt Text
(33 KB)

Event Timeline