diff --git a/debinningCore/debinning.py b/debinningCore/debinning.py index 208677e..552888c 100644 --- a/debinningCore/debinning.py +++ b/debinningCore/debinning.py @@ -1,386 +1,405 @@ ''' debinning.py The debinning calculation, wrapped in a class, debinningCalculator. Copyright (C) 2015 Abram Krislock This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.txt. ''' ### Imports from __future__ import division import numpy as np import bisect import os from time import time from string import Template # Find the debinning main directory for remaining import paths debinningDir = os.getcwd() while debinningDir.rfind('/') > debinningDir.rfind('debinning'): debinningDir = debinningDir[:debinningDir.rfind('/')] if debinningDir not in os.sys.path: os.sys.path.insert(0, debinningDir) import debinningMath.orderStatistics as oStat from debinningCore import settings if settings.useGPU: #### NOTE: This python script uses GPU programming! I programmed my GPU using CUDA, since my #### video card is NVIDIA GeForce GT 730M. If your video card is different, this script #### may require modification. If your video card is not NVIDIA, it might not work at all! import pycuda.driver as cuda import pycuda.autoinit from pycuda.compiler import SourceModule class debinningCalculator(object): ''' Calculates the debinning function from a sample and a background pdf. Mandatory arguments for initialization: gen: A debinningMath.DataGenContainer instance, which provides the signal + background data for the debinning function calculation. fit: A debinningMath.DataGenContainer instance, which provides the "fit function" pdf and cdf. (It does not need to have data). nEx: The "expected" number of events if the fit distribution is good. NOTE: The domains you give to the functions gen and fit will be changed when calculating the invertible covariance matrix. ''' def __init__(self, gen, fit, nEx): # Set data generators self.nEx = nEx self.gen = gen self.fit = fit self.fit.changeDomain() self.gen.changeDomain(self.fit.x) def resetData(self, hardReset=False): ''' Retains most of the debinning function data, which can be reused so long as the "fit function" remains the same. Asks for a new data sample from the signal + background data generator. [If hardReset is True, also deletes all debinning function data.] ''' if 'dataToX' in dir(self): del self.dataToX if 'sumGdotG_j' in dir(self): del self.sumGdotG_j if 'debinData' in dir(self): del self.debinData if hardReset and 'vecGxdotVecGy_xy' in dir(self): del self.vecGxdotVecGy_xy if hardReset and 'mu_x' in dir(self): del self.mu_x if hardReset and 'V_xy' in dir(self): del self.V_xy if hardReset and 'sigmaSq_x' in dir(self): del self.sigmaSq_x if hardReset and 'invV_xy' in dir(self): del self.invV_xy self.gen.generate() def mapDataToX(self, dindex): ''' Given an index pointing to a value in the data sample, returns an index pointing to the closest value in the x domain. 
''' # bisect_left(x, datum) locates the left-most entry in x which is >= datum xindex = bisect.bisect_left(self.gen.x, self.gen.data[dindex]) if (xindex > 0 and self.gen.x[xindex] - self.gen.data[dindex] >= self.gen.data[dindex] - self.gen.x[xindex-1]): return xindex - 1 return xindex def calculateGdotG(self): ''' Calculates the normalized scalar product of two vectors: "vecG_x dot vecG_y / (n * nEx)" for all x and y. ''' # Data generators must have the same domain assert all(self.gen.x == self.fit.x) # Check if the calculation has already been performed for this data if 'vecGxdotVecGy_xy' in dir(self): return # Time the calculation if settings.timing: t = time() #### G vector calculations: GPU and non-GPU versions #### if settings.useGPU: #### GPU version #### # Use the GPU to calculate ALL parallelizable portions of the calculation: if self.nEx % 4: print 'The GPU thread blocks prefer the "number of events" to be a' print 'multiple of 4. Expect problems.' if len(self.gen.x) % 16: print 'The GPU thread blocks which use the x-domain prefer it to have' print 'length which is a multiple of 16. Expect problems.' # GPU template: nKr array... lognKrModTemplate = Template(""" __global__ void lognKr(double *lognKr_r) { const int idr = threadIdx.x + blockDim.x * blockIdx.x; const int r = idr+1 < ${nEx}-idr ? idr+1 : ${nEx}-idr; lognKr_r[idr] = log(${nEx}.); for (double i=${nEx}-1; i>${nEx}-r; i-=1) lognKr_r[idr] += log(i); for (double i=1; i<r; i+=1) lognKr_r[idr] -= log(i); } """) lognKrMod = SourceModule(lognKrModTemplate.substitute(nEx=self.nEx)) # GPU template: vecG dot vecG... vecGxdotVecGyModTemplate = Template(""" __global__ void vecGxdotVecGy(double *vecGxdotVecGy_xy, double *bgF, double *lognKr_r) { const int idx = threadIdx.x + blockDim.x * blockIdx.x; const int idy = threadIdx.y + blockDim.y * blockIdx.y; const int idxy = idx*${lenX} + idy; vecGxdotVecGy_xy[idxy] = 0; if (idy > idx) return; if (bgF[idx] <= 0 || bgF[idy] >= 1) { vecGxdotVecGy_xy[idxy] += ${nEx}. / ${nData}.; } else if (bgF[idy] <= 0) { if (bgF[idx] < 1) vecGxdotVecGy_xy[idxy] += ${nEx}. / ${nData}. * pow(1 - bgF[idx], ${nEx}-1); } else if (bgF[idx] >= 1) { vecGxdotVecGy_xy[idxy] += ${nEx}. / ${nData}. * pow(bgF[idy], ${nEx}-1); } else { for(int idr=0; idr<${nEx}; idr++) { vecGxdotVecGy_xy[idxy] += exp(2.*lognKr_r[idr] - log(${nEx}.) - log(${nData}.) + idr * log(bgF[idx]) + idr * log(bgF[idy]) + (${nEx} - idr - 1) * log(1 - bgF[idx]) + (${nEx} - idr - 1) * log(1 - bgF[idy])); } } } """) vecGxdotVecGyMod = SourceModule(vecGxdotVecGyModTemplate.substitute( nEx=self.nEx, nData=self.gen.n, lenX=len(self.gen.x))) # The GPU will deliver the results to these empty numpy arrays lognKr_r = np.empty(self.nEx) self.vecGxdotVecGy_xy = np.empty([len(self.gen.x), len(self.gen.x)]) # Allocate memory on the GPU for vecG calculations lognKr_r_gpu = cuda.mem_alloc(lognKr_r.nbytes) vecGxdotVecGy_xy_gpu = cuda.mem_alloc(self.vecGxdotVecGy_xy.nbytes) bgF_gpu = cuda.mem_alloc(self.fit.F.nbytes) # Transfer "fit function" to the GPU cuda.memcpy_htod(bgF_gpu, self.fit.F) # Get a reference to the GPU module function and do the calculation lognKr_func = lognKrMod.get_function('lognKr') lognKr_func(lognKr_r_gpu, block=(4, 1, 1), grid=(self.nEx // 4, 1)) vecGxdotVecGy_func = vecGxdotVecGyMod.get_function('vecGxdotVecGy') vecGxdotVecGy_func(vecGxdotVecGy_xy_gpu, bgF_gpu, lognKr_r_gpu, block=(16, 16, 1), grid=(len(self.gen.x) // 16, len(self.gen.x) // 16)) # Get the results back from the GPU ... 
only the dot product matters cuda.memcpy_dtoh(self.vecGxdotVecGy_xy, vecGxdotVecGy_xy_gpu) # Fix the upper triangle of vecGxdotVecGy_xy for i in xrange(len(self.gen.x)): for j in xrange(i+1, len(self.gen.x)): self.vecGxdotVecGy_xy[i,j] = self.vecGxdotVecGy_xy[j,i] else: #### non-GPU version #### def vecG_rth(xindex, rindex): r = rindex+1 if self.fit.F[xindex] == 0 or (1 - self.fit.F[xindex]) == 0: return 0 else: return np.e ** (oStat.lognKr(self.nEx, r) + (r-1) * np.log(self.fit.F[xindex]) + (self.nEx-r) * np.log(1 - self.fit.F[xindex])) vecG_xr = np.array([[vecG_rth(xi, ri) for ri in xrange(self.nEx)] for xi in xrange(len(self.gen.x))]) self.vecGxdotVecGy_xy = np.array([[np.dot(vecG_xr[xi], vecG_xr[yi]) for yi in xrange(len(self.gen.x))] for xi in xrange(len(self.gen.x))]) self.vecGxdotVecGy_xy /= (self.nEx * self.gen.n) # Report the calculation timing if settings.timing: print 'vecG calculations in %f seconds' % (time() - t) def calculateDebinning(self): ''' Calculates the debinning function for a given data sample, storing the result in "self.debinData". ''' # Data generators must have the same domain assert all(self.gen.x == self.fit.x) # Check if the calculation has already been performed for this data if all(['sumGdotG_j' in dir(self), 'debinData' in dir(self)]): return # Checks for necessary 'input' calculations if not 'vecGxdotVecGy_xy' in dir(self): self.calculateGdotG() if not 'dataToX' in dir(self): self.dataToX = np.array([self.mapDataToX(j) for j in xrange(len(self.gen.data))]) # Calculate the debinning function self.sumGdotG_j = self.vecGxdotVecGy_xy.take(self.dataToX, axis=1).sum(axis=1) self.debinData = self.fit.f * self.sumGdotG_j def calculateExpectations(self, goodFit=False): ''' Calculates mean ("self.mu_x"), variance ("self.sigmaSq_x"), and covariance ("self.V_xy") for the debinning function. If goodFit=True, assumes that the "fit function" is a good fit for the data when calculating the mean and covariance. If goodFit=False, calculates the mean and covariance using the debinning function itself as an approximation of the true data PDF. ''' # Data generators must have the same domain assert all(self.gen.x == self.fit.x) # Check if the calculation has already been performed for this data if all(['mu_x' in dir(self), 'V_xy' in dir(self), 'sigmaSq_x' in dir(self)]): return # Time the calculation if settings.timing: t = time() #### Expectation calculations: GPU and non-GPU versions #### if settings.useGPU: #### GPU version #### # Use the GPU to calculate ALL parallelizable portions of the calculation: if self.nEx % 4: print 'The GPU thread blocks prefer the "number of events" to be a' print 'multiple of 4. Expect problems.' if len(self.gen.x) % 16: print 'The GPU thread blocks which use the x-domain prefer it to have' print 'length which is a multiple of 16. Expect problems.' # GPU template: The mean, mu_x... if not goodFit: meanTemplate = Template(""" __global__ void mean(double *mu_x, double *vecGxdotVecGy_xy, double *f, double *bgf) { const int idx = threadIdx.x + blockDim.x * blockIdx.x; double current = 0.; double previous = 0.; mu_x[idx] = 0.; if (bgf[idx] == 0) return; for (int idz = 0; idz < ${lenX}; idz++) { const int idxz = idx*${lenX} + idz; // Ready the previous and current integrands. previous = current; current = vecGxdotVecGy_xy[idxz] * f[idz]; // Use the previous and current integrands to calculate the // integral using the trapezoid rule. 
if (idz > 0) mu_x[idx] += 0.5 * ${deltaX} * (current + previous); } mu_x[idx] *= bgf[idx] * ${nData}; } """) meanMod = SourceModule(meanTemplate.substitute(nData=self.gen.n, lenX=len(self.gen.x), deltaX=self.gen.x[1]-self.gen.x[0])) # GPU template: The covariance matrix, V_xy... covarianceTemplate = Template(""" __global__ void covariance(double *V_xy, double *vecGxdotVecGy_xy, double *f, double *bgf, double *mu_x) { const int idx = threadIdx.x + blockDim.x * blockIdx.x; const int idy = threadIdx.y + blockDim.y * blockIdx.y; const int idxy = idx*${lenX} + idy; double current = 0.; double previous = 0.; V_xy[idxy] = 0.; if (bgf[idx] == 0 || bgf[idy] == 0) return; for (int idz = 0; idz < ${lenX}; idz++) { const int idxz = idx*${lenX} + idz; const int idyz = idy*${lenX} + idz; // Ready the previous and current integrands... previous = current; current = vecGxdotVecGy_xy[idxz] * vecGxdotVecGy_xy[idyz] * f[idz]; // Use the previous and current integrands to calculate the // integral using the trapezoid rule. if (idz > 0) V_xy[idxy] += 0.5 * ${deltaX} * (current + previous); } // Finish up the covariance calculation. V_xy[idxy] *= bgf[idx] * bgf[idy] * ${nData}; V_xy[idxy] -= mu_x[idx] * mu_x[idy] / ${nData}; } """) covarianceMod = SourceModule(covarianceTemplate.substitute(nData=self.gen.n, lenX=len(self.gen.x), deltaX=self.gen.x[1]-self.gen.x[0])) # The GPU will deliver the results to these empty numpy arrays if not goodFit: self.mu_x = np.empty(len(self.gen.x)) else: self.mu_x = self.fit.f.copy() self.V_xy = np.empty((len(self.gen.x), len(self.gen.x))) # Allocate memory on the GPU for expectation value calculations mu_x_gpu = cuda.mem_alloc(self.mu_x.nbytes) V_xy_gpu = cuda.mem_alloc(self.V_xy.nbytes) vecGxdotVecGy_xy_gpu = cuda.mem_alloc(self.vecGxdotVecGy_xy.nbytes) f_gpu = cuda.mem_alloc(self.gen.f.nbytes) bgf_gpu = cuda.mem_alloc(self.fit.f.nbytes) # Transfer data to the GPU cuda.memcpy_htod(vecGxdotVecGy_xy_gpu, self.vecGxdotVecGy_xy) if not goodFit: cuda.memcpy_htod(f_gpu, self.debinData) else: cuda.memcpy_htod(f_gpu, self.fit.f) cuda.memcpy_htod(mu_x_gpu, self.mu_x) cuda.memcpy_htod(bgf_gpu, self.fit.f) # Get a reference to the GPU module function and do the calculation if not goodFit: mean_func = meanMod.get_function('mean') mean_func(mu_x_gpu, vecGxdotVecGy_xy_gpu, f_gpu, bgf_gpu, block=(16,1,1), grid=(len(self.gen.x) // 16, 1)) covariance_func = covarianceMod.get_function('covariance') covariance_func(V_xy_gpu, vecGxdotVecGy_xy_gpu, f_gpu, bgf_gpu, mu_x_gpu, block=(16,16,1), grid=(len(self.gen.x) // 16, len(self.gen.x) // 16)) # Get the results back from the GPU if not goodFit: cuda.memcpy_dtoh(self.mu_x, mu_x_gpu) cuda.memcpy_dtoh(self.V_xy, V_xy_gpu) else: #### non-GPU version #### dx = self.gen.x[1] - self.gen.x[0] if not goodFit: self.mu_x = np.zeros(len(self.gen.x)) else: self.mu_x = self.fit.f.copy() self.V_xy = np.zeros((len(self.gen.x), len(self.gen.x))) inner_x = {} # Use the integration operators A and AT from orderStatistics (oStat) if not goodFit: # Mean: for i in xrange(len(self.gen.x)): self.mu_x[i] += oStat.A(self.vecGxdotVecGy_xy[i] * self.debinData, dx)[-1] self.mu_x[i] *= self.fit.f[i] * self.gen.n # Covariance: for i in xrange(len(self.gen.x)): for j in xrange(i+1): self.V_xy[i, j] += oStat.A(self.debinData * self.vecGxdotVecGy_xy[i] * self.vecGxdotVecGy_xy[j], dx)[-1] self.V_xy[i, j] *= self.fit.f[i] * self.fit.f[j] * self.gen.n self.V_xy[i, j] -= self.mu_x[i] * self.mu_x[j] / self.gen.n if j < i: self.V_xy[j, i] += self.V_xy[i, j] # Calculate the variance as the 
diagonal of the covariance self.sigmaSq_x = np.diag(self.V_xy).copy() # Calculate the inverse covariance matrix # - adjust inverse calculation for vicious round-off errors if settings.invertCovariance: self.invV_xy = np.linalg.inv(self.V_xy * self.gen.n**2) / (self.gen.n**2) # Report the calculation timing if settings.timing: print 'mean and covariance calculations in %f seconds' % (time() - t) + + def resetExpectations(self): + ''' Clears the expectation values for recalculation. ''' + if 'mu_x' in dir(self): + del self.mu_x + if 'V_xy' in dir(self): + del self.V_xy + if 'sigmaSq_x' in dir(self): + del self.sigmaSq_x + if 'invV_xy' in dir(self): + del self.invV_xy + + def saveExpectations(self, saveDict): + ''' Copies the calculated expectation values into the user-given dict. ''' + saveDict['mu_x'] = self.mu_x.copy() + saveDict['V_xy'] = self.V_xy.copy() + saveDict['sigmaSq_x'] = self.sigmaSq_x.copy() + if settings.invertCovariance: + saveDict['invV_xy'] = self.invV_xy.copy() diff --git a/debinningMath/sampleFunctions.py b/debinningMath/sampleFunctions.py index 73c52b7..084ae5f 100644 --- a/debinningMath/sampleFunctions.py +++ b/debinningMath/sampleFunctions.py @@ -1,182 +1,204 @@ ''' sampleFunctions.py Some sample probability distributions, inspired by high energy phenomenology. Copyright (C) 2014, 2015 Abram Krislock This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.txt. ''' import numpy as np import bisect from debinningMath.bezierSupport import bezierParTuple as bezPar ###### Sample Probability Distribution Functions ###### def endpoint(x, par): ''' A probability distribution function for an "easy endpoint" scenario. WARNING: This probability distribution is not normalized. ''' if type(x) != np.ndarray: raise TypeError('Please input x as a numpy array') # par = (SG intersect, SG scale, SG zero1, SG zero2) if par == None: par = (110., 1.5, 50., 140.) linearPart = np.concatenate(( par[1] * x[x < par[0]] - par[2], -(par[1]*par[0] - par[2])/(par[3] - par[0])*(x[x >= par[0]] - par[3]) )) # make all entries of linearPart non-negative np.clip(linearPart, 0., np.inf, linearPart) return linearPart def endpointPlusBG(x, par): ''' A probability distribution function for an "easy endpoint" scenario. WARNING: This probability distribution is not normalized. ''' if type(x) != np.ndarray: raise TypeError('Please input x as a numpy array') # par = (BG scale, BG zero1, BG zero2, SG intersect, SG scale, SG zero1, SG zero2) if par == None: par = (0.00006, 10., 200., 110., 1.5, 50., 140.) 
cubicPart = par[0] * (x - par[1]) * (x - par[2])**2 # make all entries of cubicPart non-negative np.clip(cubicPart, 0., np.inf, cubicPart) linearPart = np.concatenate(( par[4] * x[x < par[3]] - par[5], -(par[4]*par[3] - par[5])/(par[6] - par[3])*(x[x >= par[3]] - par[6]) )) # make all entries of linearPart non-negative np.clip(linearPart, 0., np.inf, linearPart) return cubicPart + linearPart def endpointBG(x, par): ''' A background probability distribution function for endpoint scenarios. WARNING: This probability distribution is not normalized. ''' if type(x) != np.ndarray: raise TypeError('Please input x as a numpy array') # par = (BG scale, BG zero1, BG zero2) if par == None: par = (0.00006, 10., 200.) cubicPart = par[0] * (x - par[1]) * (x - par[2])**2 # make all entries of cubicPart non-negative np.clip(cubicPart, 0., np.inf, cubicPart) return cubicPart +def theLine(x, par): + ''' A probability distribution function for a "line excess" scenario. + WARNING: This probability distribution is not normalized. + ''' + if type(x) != np.ndarray: + raise TypeError('Please input x as a numpy array') + # par = (SG scale, SG mean, SG sigma) + if par == None: + par = (60., 130., 4.) + return par[0] * np.exp(-(x - par[1])**2 / (2 * par[2]**2)) + def theLinePlusBG(x, par): ''' A probability distribution function for a "line excess" scenario. WARNING: This probability distribution is not normalized. ''' if type(x) != np.ndarray: raise TypeError('Please input x as a numpy array') # par = (BG scale, BG slope, SG scale, SG mean, SG sigma) if par == None: par = (1000., -0.02, 60., 130., 4.) return (par[0] * np.exp(par[1] * x) + par[2] * np.exp(-(x - par[3])**2 / (2 * par[4]**2))) +def broadExcess(x, par): + ''' A probability distribution function for a "broad excess" scenario. + WARNING: This probability distribution is not normalized. + ''' + if type(x) != np.ndarray: + raise TypeError('Please input x as a numpy array') + # par = (SG scale, SG mean, SG sigma) + if par == None: + par = (130., 90., 30.) + return (par[0] * np.exp(-(x - par[1])**2 / (2 * par[2]**2))) + def broadExcessPlusBG(x, par): ''' A probability distribution function for a "broad excess" scenario. WARNING: This probability distribution is not normalized. ''' if type(x) != np.ndarray: raise TypeError('Please input x as a numpy array') # par = (BG scale, BG slope, SG scale, SG mean, SG sigma) if par == None: par = (1000., -0.02, 130., 90., 30.) return (par[0] * np.exp(par[1] * x) + par[2] * np.exp(-(x - par[3])**2 / (2 * par[4]**2))) def exponentialBG(x, par): ''' A background probability distribution function for excess scenarios. WARNING: This probability distribution is not normalized. ''' if type(x) != np.ndarray: raise TypeError('Please input x as a numpy array') # par = (BG scale, BG slope) if par == None: par = (1000., -0.02) return (par[0] * np.exp(par[1] * x)) def gaussianPlusFlat(x, par): ''' A duplicate of the function used in Meerkat's OneDimAdaptiveKernel demo. WARNING: This probability distribution is not normalized. ''' if type(x) != np.ndarray: raise TypeError('Please input x as a numpy array') # par = (Flat value, gaussian scale, gaussian sigma) if par == None: par = (1., 4., 0.1) return np.abs((np.abs(x) < 1) * (par[0] + par[1] * np.exp(-0.5 * x**2 / par[2]**2))) def flat(x, par): ''' The background for gaussianPlusFlat. WARNING: This probability distribution is not normalized. 
''' if type(x) != np.ndarray: raise TypeError('Please input x as a numpy array') # par = (Flat value,) if par == None: par = (1.,) return np.abs((np.abs(x) < 1) * par[0]) # This bezierCurves function mainly helps me to make a nifty logo. :) def bezierCurves(x, par): ''' Returns the y value corresponding to x for the piecewise Bezier curve given by par, which must be a list of 'bezierParTuple's sorted by their x0 values. It is assumed that x0 does not repeat between different tuples, and that the Bezier curves are single-valued functions of x. ''' if type(par) != list or type(par[0]) != bezPar: raise TypeError('par must be a list of bezierParTuples!!') if type(x) in (np.ndarray, list, tuple): return np.array([par[bisect.bisect(par, value) - 1].getBezierY(value) for value in x]) else: return par[bisect.bisect(par, x) - 1].getBezierY(x) ###### FITTING FUNCTIONS ###### # These "Kink" functions are used primarily in debinningCore/vsHistogram.py def theKink(x, par): # par = (kink location, slope before kink, slope after kink, kink intersect) if par == None: par = (140., -2., -0.1, 10.) if type(x) == np.ndarray: return np.concatenate(( par[1] * (x[x < par[0]] - par[0]) + par[3], par[2] * (x[x >= par[0]] - par[0]) + par[3] )) else: return (par[1] * (x - par[0]) + par[3] if x < par[0] else par[2] * (x - par[0]) + par[3]) def theKink_BinnedLikelihood(binContents, binEdges, fitBins, fitPars): # Bob Cousins says, "How did you do the fit? Integral of the bins? Proper likelihood?" # Alex Read says, "Integrating the bin shouldn't matter. You're fitting with straight lines." #### Right, but the bin containing the kink is special. x = 0.5*np.array([binEdges[i] + binEdges[i+1] for i in fitBins]) ikink = 0 xL = 0 xR = 0 if not fitPars[0] in binEdges: ikink = bisect.bisect_left(binEdges, fitPars[0]) - 1 if ikink in fitBins: xR = binEdges[ikink+1] xL = binEdges[ikink] ikink = np.abs(x - fitPars[0]).argmin() assert abs(0.5*(xL + xR) - x[ikink]) < 10**(-8) y = theKink(x, fitPars) if ikink * xR * xL != 0: # Correction for the bin which contains xkink: y[ikink] = (fitPars[3] + 0.5 * ((xR - fitPars[0])**2 * fitPars[2] - (fitPars[0] - xL)**2 * fitPars[1]) / (xR - xL)) y = y.clip(10**(-10), y.max()) logy = np.log(y) return sum(binContents[fitBins] * logy) - sum(y) diff --git a/examples/debinningDemo.py b/examples/debinningDemo.py index 9a74411..7bd18e5 100644 --- a/examples/debinningDemo.py +++ b/examples/debinningDemo.py @@ -1,81 +1,132 @@ ''' debinningDemo.py Demonstration of the debinning calculation. Copyright (C) 2015 Abram Krislock This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.txt. 
''' ### Imports from matplotlib import pyplot as plt import numpy as np import os debinningDir = os.getcwd() while debinningDir.rfind('/') > debinningDir.rfind('debinning'): debinningDir = debinningDir[:debinningDir.rfind('/')] if debinningDir not in os.sys.path: os.sys.path.insert(0, debinningDir) from debinningCore import settings settings.useGPU = True import debinningCore.debinning as debinning import debinningMath.orderStatistics as oStat import debinningMath.dataGen as dataGen import debinningMath.sampleFunctions as funcs -def debinEasyEndpoint(): - fig, ax1 = plt.subplots(figsize=(11,7), facecolor='#ffffff') - gen = dataGen.DataGenContainer(funcs.endpointPlusBG) - fit = dataGen.DataGenContainer(funcs.endpointBG) - debinCalc = debinning.debinningCalculator(gen, fit, gen.n) - debinCalc.resetData() +def debinDemonstration(): + fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(30,10), facecolor='#ffffff') + + mypoisson = dataGen.MyRandom('mypoisson') + dataGen.randomGenerator.setSeed('someseed') + alpha = 0.02 + nEx = 10000 + +# gen = dataGen.DataGenContainer(funcs.endpointPlusBG) +# fit = dataGen.DataGenContainer(funcs.endpointBG) + fit = dataGen.DataGenContainer(funcs.exponentialBG, + domain=np.linspace(0., 200., 256)) + gen = dataGen.DataPlusBGContainer(funcs.theLine, funcs.exponentialBG, + alpha, nPoints=nEx, + domain=np.linspace(0., 200., 256)) + debinCalc = debinning.debinningCalculator(gen, fit, nEx) x = debinCalc.fit.x dx = x[1] - x[0] + debinCalc.gen.n = int(round(nEx / (1. - alpha))) + debinCalc.resetData() debinCalc.calculateDebinning() + + estimatedUncertainties = {} + goodFitUncertainties = {} debinCalc.calculateExpectations() + debinCalc.saveExpectations(estimatedUncertainties) + debinCalc.resetExpectations() + debinCalc.calculateExpectations(True) + debinCalc.saveExpectations(goodFitUncertainties) bgf_plot = fit.f_unnormed / gen.F_unnormed[-1] ax1.plot(x, gen.f, '--', color='#0088ff', alpha=0.6, linewidth=3) ax1.plot(x, bgf_plot, '-.', color='#0088ff', alpha=0.6, linewidth=3) - ax1.hist(gen.data, bins=np.arange(0, 201, 5), alpha=0.4, color='#66a61e', normed=True) + ax1.hist(gen.data, bins=np.arange(0, 201, 5), alpha=0.4, color='#66a61e', + normed=True) ax1.fill_between(x, debinCalc.debinData + np.sqrt(debinCalc.sigmaSq_x), debinCalc.debinData - np.sqrt(debinCalc.sigmaSq_x), color='#964a01', alpha = 0.2) ax1.plot(x, debinCalc.debinData, '#964a01', linewidth=3) ax1.set_xticks(np.arange(0, 201, 50)) ax1.set_xticks(np.arange(0, 201, 10), minor=True) - ax1.set_xticklabels([r'$%i$' % int(num) for num in np.arange(0, 201, 50)], fontsize=20) + ax1.set_xticklabels([r'$%i$' % int(num) for num in np.arange(0, 201, 50)], + fontsize=20) ax1.set_ylim(bottom=-0.0001, top=0.0201) ax1.set_yticks(np.arange(0, 0.02, 0.005)) - ax1.set_yticklabels([r'$%0.3f$' % num for num in np.arange(0, 0.02, 0.005)], fontsize=20) + ax1.set_yticklabels([r'$%0.3f$' % num for num in np.arange(0, 0.02, 0.005)], + fontsize=20) ax1.set_yticks(np.arange(0, 0.02, 0.005/5), minor=True) ax1.set_xlabel('Physical Observable', labelpad=5, fontsize=22) ax1.set_ylabel('Probability', labelpad=5, fontsize=22) ax1.tick_params(length=10, width=1.2, labelsize=22, zorder=10, pad=10) ax1.tick_params(which='minor', length=5, width=1.2, zorder=11) ax1.minorticks_on() +# Covariance matrix investigation: + debinCalc.convolve_est = np.dot(estimatedUncertainties['V_xy'], + debinCalc.debinData) + debinCalc.convolve_goodFit = np.dot(goodFitUncertainties['V_xy'], + debinCalc.debinData) + ax2.plot(x, debinCalc.convolve_est, alpha=0.6, 
color='#000000', + linewidth=3) + ax2.plot(x, debinCalc.convolve_goodFit, alpha=0.6, color='#964a01', + linewidth=3) + + ax2.set_ylim(bottom=-0.0000001, top=0.0000001) + ax2.set_xticks(np.arange(0, 201, 50)) + ax2.set_xticks(np.arange(0, 201, 10), minor=True) + ax2.set_xticklabels([r'$%i$' % int(num) for num in np.arange(0, 201, 50)], + fontsize=20) + + ax2.set_xlabel('Physical Observable Coordinate', labelpad=5, fontsize=22) + ax2.set_ylabel('Covariance Convolved debinning', labelpad=5, fontsize=22) + + ax2.tick_params(length=10, width=1.2, labelsize=22, zorder=10, pad=10) + ax2.tick_params(which='minor', length=5, width=1.2, zorder=11) + ax2.minorticks_on() + +# convolveCovariance integrals: + print 'For convolve_est, the total integral is:' + print oStat.A(debinCalc.convolve_est, dx)[-1] + print 'For convolve_goodFit, the total integral is:' + print oStat.A(debinCalc.convolve_goodFit, dx)[-1] if __name__ == '__main__': - print 'Now running the debinning demonstration for the endpointPlusBG function' +# print 'Now running the debinning demonstration for endpointPlusBG function' + print 'Now running the debinning demonstration for theLinePlusBG function' plt.ion() - debinEasyEndpoint() + debinDemonstration() print 'Done.\n\n' diff --git a/examples/discriminationMC.py b/examples/discriminationMC.py index b36cedd..e1a8a49 100644 --- a/examples/discriminationMC.py +++ b/examples/discriminationMC.py @@ -1,194 +1,198 @@ ''' discriminationMC.py Demonstration of the discriminating power of the debinning calculation. Copyright (C) 2015 Abram Krislock This program is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program. If not, see http://www.gnu.org/licenses/gpl-3.0.txt. 
''' ### Imports from __future__ import division from matplotlib import pyplot as plt import numpy as np from collections import OrderedDict as oDict import bisect import os debinningDir = os.getcwd() while debinningDir.rfind('/') > debinningDir.rfind('debinning'): debinningDir = debinningDir[:debinningDir.rfind('/')] if debinningDir not in os.sys.path: os.sys.path.insert(0, debinningDir) from debinningCore import settings settings.useGPU = True -settings.invertCovariance = True import debinningCore.debinning as debinning import debinningMath.orderStatistics as oStat import debinningMath.dataGen as dataGen import debinningMath.sampleFunctions as funcs def iseDiscrimination(): - fig, ((ax1,ax2),(ax3,ax4)) = plt.subplots(nrows=2, ncols=2, figsize=(22,14), - facecolor='#ffffff') - bggen = dataGen.DataGenContainer(funcs.endpointBG, + bggen = dataGen.DataGenContainer(funcs.exponentialBG, domain=np.linspace(0., 200., 256)) mypoisson = dataGen.MyRandom('mypoisson') dataGen.randomGenerator.setSeed('someseed') - nExpected = [100, 200, 500, 1000] - axes = {100:ax1, 200:ax2, 500:ax3, 1000:ax4} - alpha = oDict((('zero',0.), ('one',0.01), ('three',0.03), ('seven',0.07))) -# alpha = oDict((('zero',0.), ('one',0.01), ('two',0.02), ('three',0.03), -# ('four',0.04), ('five',0.05), ('six',0.06), ('seven',0.07), -# ('eight',0.08), ('nine',0.09))) + nExpected = [1000] + alpha = oDict((('zero',0.), ('one',0.01), ('two',0.02), ('three',0.03))) + colors = oDict((('zero','#000000'), ('one','#ff0000'), ('two','#8800ff'), ('three','#4444ff'))) debinChisq = {} - debinCovarChisq = {} + debinConvolutionDiff = {} debinChisqMod = {} - debinCovarChisqMod = {} neg2Loglike = {} + neg2Logpoisson = {} for nEx in nExpected: + fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(33,14), + facecolor='#ffffff') for key in alpha: - print 'For nExpected = ' + str(nEx) + ', alpha = ' + key + ' %' - gen = dataGen.DataPlusBGContainer(funcs.endpoint, funcs.endpointBG, + print 'For nExpected = ' + str(nEx) + ', alpha = ' + str(alpha[key]) + gen = dataGen.DataPlusBGContainer(funcs.theLine, funcs.exponentialBG, alpha[key], nPoints=nEx, domain=np.linspace(0., 200., 256)) debinCalc = debinning.debinningCalculator(gen, bggen, nEx) dx = debinCalc.fit.x[1] - debinCalc.fit.x[0] debinChisq[key + str(nEx)] = [] - debinCovarChisq[key + str(nEx)] = [] + debinConvolutionDiff[key + str(nEx)] = [] debinChisqMod[key + str(nEx)] = [] - debinCovarChisqMod[key + str(nEx)] = [] neg2Loglike[key + str(nEx)] = [] + neg2Logpoisson[key + str(nEx)] = [] for i in xrange(10000): n = -1 while n < nEx: n = mypoisson.poisson(nEx / (1. 
- alpha[key])) # NOTE: "True" n for data generation, nEx for debinning calculation debinCalc.gen.n = n debinCalc.resetData() debinCalc.gen.n = nEx debinCalc.calculateDebinning() - # NOTE: The bg with nEx events is assumed to be a "good fit" - debinCalc.calculateExpectations(True) # <= send in True for goodFit + estimatedUncertainties = {} + goodFitUncertainties = {} + # The uncertainties are estimated using the debinning function itself + debinCalc.calculateExpectations() + debinCalc.saveExpectations(estimatedUncertainties) + debinCalc.resetExpectations() + # The uncertainties are calculated assuming a good fit + debinCalc.calculateExpectations(True) + debinCalc.saveExpectations(goodFitUncertainties) + + convolutionDiff = np.dot(goodFitUncertainties['V_xy'], + debinCalc.debinData) + convolutionDiff -= np.dot(estimatedUncertainties['V_xy'], + debinCalc.debinData) difference = debinCalc.debinData - debinCalc.fit.f differMod = n*debinCalc.debinData - nEx*debinCalc.fit.f + debinChisq[key + str(nEx)].append( - np.sum(difference**2 / debinCalc.sigmaSq_x) - -2.* (n * np.log(nEx) - nEx - oStat.logfact(n)) ) - debinCovarChisq[key + str(nEx)].append( - np.dot(difference, np.dot(debinCalc.invV_xy, difference)) - -2.* (n * np.log(nEx) - nEx - oStat.logfact(n)) ) + np.sum(difference**2 / debinCalc.sigmaSq_x) ) + debinConvolutionDiff[key + str(nEx)].append(convolutionDiff.sum()) debinChisqMod[key + str(nEx)].append( np.sum(differMod**2 / debinCalc.sigmaSq_x) / nEx ) - debinCovarChisqMod[key + str(nEx)].append( - np.dot(differMod, np.dot(debinCalc.invV_xy, differMod)) / nEx ) neg2Loglike[key + str(nEx)].append( - -2. * (np.log(debinCalc.fit.f.take(debinCalc.dataToX)).sum() - + n * np.log(nEx) - nEx - oStat.logfact(n)) ) + -2. * (np.log(debinCalc.fit.f.take(debinCalc.dataToX)).sum()) + -2. * (n * np.log(nEx) - nEx - oStat.logfact(n)) ) + # The pure poisson factor hardly matters at all.... + neg2Logpoisson[key + str(nEx)].append( + -2. * (n * np.log(nEx) - nEx - oStat.logfact(n)) ) # Calculate empirical distribution functions for power calculation - neg2Loglike[key + str(nEx)] = np.sort(neg2Loglike[key + str(nEx)]) debinChisq[key + str(nEx)] = np.sort(debinChisq[key + str(nEx)]) - debinCovarChisq[key + str(nEx)] = np.sort(debinCovarChisq[key + str(nEx)]) + debinConvolutionDiff[key + str(nEx)] = np.sort(debinConvolutionDiff[key + str(nEx)]) debinChisqMod[key + str(nEx)] = np.sort(debinChisqMod[key + str(nEx)]) - debinCovarChisqMod[key + str(nEx)] = np.sort(debinCovarChisqMod[key + str(nEx)]) + neg2Loglike[key + str(nEx)] = np.sort(neg2Loglike[key + str(nEx)]) + neg2Logpoisson[key + str(nEx)] = np.sort(neg2Logpoisson[key + str(nEx)]) # Power and p-value calculations if key == 'zero': # Once the data is sorted, the 95th percentile corresponds to the 95th # percentile of the indices. 10000 * 0.95 = 9500 (minus one). - e95Loglike = neg2Loglike[key + str(nEx)][9499] e95Chisq = debinChisq[key + str(nEx)][9499] - e95CovarChisq = debinCovarChisq[key + str(nEx)][9499] + e95ConvolutionDiff = debinConvolutionDiff[key + str(nEx)][9499] e95ChisqMod = debinChisqMod[key + str(nEx)][9499] - e95CovarChisqMod = debinCovarChisqMod[key + str(nEx)][9499] + e95Loglike = neg2Loglike[key + str(nEx)][9499] + e95Logpoisson = neg2Logpoisson[key + str(nEx)][9499] else: - try: - # bisect_right(data, m) locates the right-most entry in data which is <= m - beta = bisect.bisect_right(neg2Loglike[key + str(nEx)], e95Loglike) / 10000 - print ' -2 log like power at 95% exclusion = ' + str(1. 
- beta) except IndexError: - print ' WTF for -2 log like: ' + str(e95Loglike) - print - print neg2Loglike[key + str(nEx)] - print - print + # bisect_right(data, m) locates the right-most entry in data which is <= m try: beta = bisect.bisect_right(debinChisq[key + str(nEx)], e95Chisq) / 10000 print ' debinning chisq power at 95% exclusion = ' + str(1. - beta) except IndexError: print ' WTF for debinning: ' + str(e95Chisq) print print debinChisq[key + str(nEx)] print print try: - beta = bisect.bisect_right(debinCovarChisq[key + str(nEx)], e95CovarChisq) / 10000 - print ' debinning CovarChisq power at 95% exclusion = ' + str(1. - beta) + beta = bisect.bisect_right(debinConvolutionDiff[key + str(nEx)], e95ConvolutionDiff) / 10000 + print ' debinning Convolution Diff power at 95% exclusion = ' + str(1. - beta) except IndexError: - print ' WTF for debinning: ' + str(e95CovarChisq) + print ' WTF for debinning: ' + str(e95ConvolutionDiff) print - print debinCovarChisq[key + str(nEx)] + print debinConvolutionDiff[key + str(nEx)] print print try: beta = bisect.bisect_right(debinChisqMod[key + str(nEx)], e95ChisqMod) / 10000 print ' debinning chisqMod power at 95% exclusion = ' + str(1. - beta) except IndexError: print ' WTF for debinning: ' + str(e95ChisqMod) print print debinChisqMod[key + str(nEx)] print print try: - beta = bisect.bisect_right(debinCovarChisqMod[key + str(nEx)], e95CovarChisqMod) / 10000 - print ' debinning CovarChisqMod power at 95% exclusion = ' + str(1. - beta) + beta = bisect.bisect_right(neg2Loglike[key + str(nEx)], e95Loglike) / 10000 + print ' -2 log like power at 95% exclusion = ' + str(1. - beta) + except IndexError: + print ' WTF for -2 log like: ' + str(e95Loglike) + print + print neg2Loglike[key + str(nEx)] + print + print + try: + beta = bisect.bisect_right(neg2Logpoisson[key + str(nEx)], e95Logpoisson) / 10000 + print ' poisson only power at 95% exclusion = ' + str(1. 
- beta) except IndexError: - print ' WTF for debinning: ' + str(e95CovarChisqMod) + print ' WTF for -2 log like: ' + str(e95Logpoisson) print - print debinCovarChisqMod[key + str(nEx)] + print neg2Logpoisson[key + str(nEx)] print print print - axes[nEx].hist(debinCovarChisq[key + str(nEx)], bins=51, alpha=0.4, - label=r'$\alpha = %f$' % alpha[key]) - axes[nEx].set_xlabel('Covariant Chisq, nEx=' + str(nEx), labelpad=5, fontsize=22) -''' - if nEx == 1000: - ax1.hist(debinChisq[key + str(nEx)], bins=51, alpha=0.4, - label=r'$\alpha = %f$' % alpha[key]) - ax1.set_xlabel('Chisq, nEx=1000', labelpad=5, fontsize=22) - ax2.hist(debinCovarChisq[key + str(nEx)], bins=51, alpha=0.4, - label=r'$\alpha = %f$' % alpha[key]) - ax2.set_xlabel('CovarChisq, nEx=1000', labelpad=5, fontsize=22) - ax3.hist(debinChisqMod[key + str(nEx)], bins=51, alpha=0.4, - label=r'$\alpha = %f$' % alpha[key]) - ax3.set_xlabel('ChisqMod, nEx=1000', labelpad=5, fontsize=22) - ax4.hist(debinCovarChisqMod[key + str(nEx)], bins=51, alpha=0.4, - label=r'$\alpha = %f$' % alpha[key]) - ax4.set_xlabel('CovarChisqMod, nEx=1000', labelpad=5, fontsize=22) -''' + axes[0][0].hist(debinChisq[key + str(nEx)], bins=51, alpha=0.4, + label=r'$\alpha = %f$' % alpha[key], color=colors[key]) + axes[0][0].set_xlabel('Chisq', labelpad=5, fontsize=22) + axes[1][0].hist(debinConvolutionDiff[key + str(nEx)], bins=51, alpha=0.4, + label=r'$\alpha = %f$' % alpha[key], color=colors[key]) + axes[1][0].set_xlabel('Convolution Diff Integral', labelpad=5, fontsize=22) + axes[0][1].hist(debinChisqMod[key + str(nEx)], bins=51, alpha=0.4, + label=r'$\alpha = %f$' % alpha[key], color=colors[key]) + axes[0][1].set_xlabel('Chisq Modified', labelpad=5, fontsize=22) + axes[0][2].hist(neg2Loglike[key + str(nEx)], bins=51, alpha=0.4, + label=r'$\alpha = %f$' % alpha[key], color=colors[key]) + axes[0][2].set_xlabel('-2 Log Like', labelpad=5, fontsize=22) + axes[1][2].hist(neg2Logpoisson[key + str(nEx)], bins=51, alpha=0.4, + label=r'$\alpha = %f$' % alpha[key], color=colors[key]) + axes[1][2].set_xlabel('-2 Log Like (poisson only)', labelpad=5, fontsize=22) if __name__ == '__main__': - print 'Now running the discrimination demonstration for the endpointBG function' + print 'Now running the discrimination demonstration for the theLinePlusBG function' plt.ion() iseDiscrimination() print 'Done.\n\n'
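For reference, here is a minimal, self-contained sketch of what the non-GPU branch of calculateGdotG and calculateDebinning compute. It is not part of the patch: it assumes a normalized exponential "fit" pdf with rate 0.02, uses scipy.special.gammaln as a stand-in for oStat.lognKr, and skips the nearest-neighbor rounding of mapDataToX; every name in it is illustrative.

from __future__ import division, print_function
import numpy as np
from scipy.special import gammaln

def lognKr(n, r):
    # log(n! / ((r-1)! (n-r)!)): the r-th order statistic coefficient,
    # matching the series summed by the lognKr CUDA kernel above.
    return gammaln(n + 1) - gammaln(r) - gammaln(n - r + 1)

x = np.linspace(0., 200., 256)
f = 0.02 * np.exp(-0.02 * x)      # "fit function" pdf (exponentialBG shape)
F = 1. - np.exp(-0.02 * x)        # ... and its cdf
nEx = 100                         # "expected" number of events

# vecG_xr[i, r-1]: pdf of the r-th order statistic of nEx draws from F,
# divided by f(x_i), exactly what vecG_rth computes in calculateGdotG.
r = np.arange(1, nEx + 1)
logF = np.log(np.clip(F, 1e-300, 1.))
log1mF = np.log(np.clip(1. - F, 1e-300, 1.))
vecG_xr = np.exp(lognKr(nEx, r)[None, :] + (r - 1)[None, :] * logF[:, None]
                 + (nEx - r)[None, :] * log1mF[:, None])

# Binomial-theorem sanity check: the order statistic pdfs sum to nEx * f(x),
# so each row of vecG_xr (which has the f(x) divided out) must sum to nEx.
assert np.allclose(vecG_xr.sum(axis=1), nEx)

# Normalized Gram matrix and debinning estimate for a toy sample drawn
# from the fit pdf itself by inverse-transform sampling.
np.random.seed(0)
data = -np.log(1. - np.random.rand(nEx)) / 0.02
dataToX = np.searchsorted(x, data.clip(x[0], x[-1]))
vecGxdotVecGy_xy = vecG_xr.dot(vecG_xr.T) / (nEx * len(data))
debinData = f * vecGxdotVecGy_xy[:, dataToX].sum(axis=1)
print(np.trapz(debinData, x), np.trapz(f, x))   # should be close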
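Continuing that sketch, the following reproduces the trapezoid-rule mean and covariance of the non-GPU branch of calculateExpectations with goodFit=False (the goodFit=True variant integrates against f instead of debinData), again as an illustration under the same assumptions, with all names carried over from the block above:

# Trapezoid-rule quadrature weights over the x grid
w = np.full(len(x), x[1] - x[0])
w[0] *= 0.5
w[-1] *= 0.5

# mu_x[i] = f_fit(x_i) * n * integral of vecGdotG(x_i, z) * fhat(z) dz
mu_x = f * len(data) * vecGxdotVecGy_xy.dot(w * debinData)

# V_xy[i,j] = f_i f_j n * integral of G(i,z) G(j,z) fhat(z) dz - mu_i mu_j / n
V_xy = (np.outer(f, f) * len(data)
        * (vecGxdotVecGy_xy * (w * debinData)).dot(vecGxdotVecGy_xy.T))
V_xy -= np.outer(mu_x, mu_x) / len(data)
sigmaSq_x = np.diag(V_xy)

# The "convolution diff" statistic in discriminationMC.py is the summed
# difference of V_xy.dot(debinData) between the goodFit and estimated cases.
convolve_est = V_xy.dot(debinData)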