
diff --git a/bin/prof2-tune b/bin/prof2-tune
--- a/bin/prof2-tune
+++ b/bin/prof2-tune
@@ -1,390 +1,397 @@
#! /usr/bin/env python
"""\
%prog <refdir> [<ipolfile>=ipol.dat] [<runsdir>=<refdir>/../mc] [opts]
Use the interpolations stored in <ipolfile> to find optimised parameters with
the reference histograms found in the <refdir> as the optimisation target.
The <runsdir> is used to calculate the maximum error value seen for each bin,
to regularise interpolated errors which could otherwise blow up, leading to
an unrepresentatively small chi2 (and hence a misleading fit result) outside the sampled ranges.
+The weight file syntax is derived from YODA path syntax, and allows selecting bin
+ranges either by physical value or by bin number, e.g.
+ /path/parts/to/histo weight
+ /path/parts/to/histo@xmin:xmax weight
+ /path/parts/to/histo#nmin:nmax weight
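+For example (hypothetical analysis paths), giving extra weight to one histogram
+and selecting ranges by x-value and by bin index on two others:
+ /ATLAS_2014_I1234567/d01-x01-y01 2.0
+ /ATLAS_2014_I1234567/d02-x01-y01@0.0:50.0 1.0
+ /ATLAS_2014_I1234567/d03-x01-y01#0:9 1.0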
+
+
TODO:
* Handle run combination file/string (write a hash of the run list into the ipol filename?)
"""
import optparse, os, sys
op = optparse.OptionParser(usage=__doc__)
op.add_option("--wfile", dest="WFILE", default=None, help="Path to a weight file to specify unequal chi2 weights of each bin in the fit (default: %default)")
op.add_option("--output", dest="OUTPUT", default="tunes", help="Prefix for outputs (default: %default)")
op.add_option("--limits", dest="LIMITS", default=None, help="Simple text file with parameter limits and fixed parameters")
op.add_option("--scan", dest="SCAN", default=None, type=int, help="Perform a brute force scan of SCAN points to find minimiser start point")
op.add_option("--limit-errs", dest="USE_RUNSDIR", action="store_true", default=False, help="Re-read the runsdir to regularise error ipols")
op.add_option("-g", "--gradient", dest="GRADIENT", action="store_true", default=False, help="Run minimisation with analytic gradient (EXPERIMENTAL!)")
op.add_option("-v", "--debug", dest="DEBUG", action="store_true", default=False, help="Turn on some debug messages")
op.add_option("-q", "--quiet", dest="QUIET", action="store_true", default=False, help="Turn off messages")
op.add_option("-s", "--strategy", dest="STRATEGY", default=1, type=int, help="Set Minuit strategy [0 fast, 1 default, 2 slow]")
op.add_option("--filter", dest="FILTER", action="store_true", default=False, help="Filter out data bins that have 0 error")
opts, args = op.parse_args()
## Get mandatory arguments
if len(args) < 1:
print "Argument missing... exiting\n\n"
op.print_usage()
sys.exit(1)
REFDIR = args[0]
IFILE = "ipol.dat"
RUNSDIR = os.path.join(REFDIR, "..", "mc")
if len(args) >= 2:
IFILE = args[1]
if len(args) >= 3:
RUNSDIR = args[2]
# TODO: ipol fit limits are in the ipol datfile... automatically use them / warn if result is outside?
## Load Professor and show the standard banner
import professor2 as prof
if not opts.QUIET:
print prof.logo
## Read interpolated and reference histos, and run data
IHISTOS, METADATA = prof.read_ipoldata(IFILE)
DHISTOS = prof.read_all_histos(REFDIR)
## Try to read run histos and extract maximum errors
MAXERRDICT = None
if opts.USE_RUNSDIR:
try:
_, RUNHISTOS = prof.read_all_rundata(RUNSDIR, None) #< don't care about reading params files
MAXERRDICT = prof.find_maxerrs(RUNHISTOS)
except:
print "Could not read run data for error regularisation -- chi2 may be unstable"
## Weight file parsing
matchers = prof.read_pointmatchers(opts.WFILE) if opts.WFILE else None
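## NB. read_pointmatchers presumably returns a mapping of path/bin-range matcher
## objects to weight strings; all the code below relies on is that each matcher
## offers match_path(histopath) and match_pos(bin), and that the weight string
## parses as a float.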
## Find things available in both ipol and ref data, and in the weight file if there is one
available = []
for ihn in sorted(IHISTOS.keys()):
## Set default bin weights
for ib in IHISTOS[ihn].bins:
ib.w = 1.0
## Find user-specified bin weights if there was a weight file
if matchers is not None:
## Find matches
pathmatch_matchers = [(m,wstr) for m,wstr in matchers.iteritems() if m.match_path(ihn)]
## Ditch histos not listed in the weight file
if not pathmatch_matchers:
del IHISTOS[ihn]
continue
## Attach fit weights to the ibins, setting to zero if there's no position match
for ib in IHISTOS[ihn].bins:
posmatch_matchers = [(m,wstr) for (m,wstr) in pathmatch_matchers if m.match_pos(ib)]
ib.w = float(posmatch_matchers[-1][1]) if posmatch_matchers else 0 #< NB. using last match
for rhn in DHISTOS.keys():
if ihn in rhn: #< TODO: short for rhn = "/REF/"+ihn ?
# TODO: we should eliminate this potential mismatch of ref and MC hnames
available.append([ihn,rhn])
break #< TODO: ok?
# else:
# print "Could not find %s"%ihn
## Prepare lists of ibins and dbins
IBINS, DBINS, MAXERRS, FILTERED = [], [], [], []
BINDICES={} # Allows for more helpful error messages in case of prof.StatError
for a in available:
# TODO: print out the available observables
if len(IHISTOS[a[0]].bins) != len(DHISTOS[a[1]].bins):
print "Inconsistency discovered between data bins and parametrised bins:"
print "Removing histogram", a[0]
del IHISTOS[a[0]]
del DHISTOS[a[1]]
else:
BINDICES[a[0]] = []#range(len(IBINS), len(IBINS) + len(IHISTOS[a[0]])) # This is for debugging
for nb in xrange(len(IHISTOS[a[0]].bins)):
if opts.FILTER and DHISTOS[a[1]].bins[nb].err == 0:
FILTERED.append(1)
continue
if IHISTOS[a[0]].bins[nb].w > 0:
IBINS.append(IHISTOS[a[0]].bins[nb])
DBINS.append(DHISTOS[a[1]].bins[nb])
BINDICES[a[0]].append(len(IBINS) - 1) #< 0-based index of the bin just appended
if MAXERRDICT:
MAXERRS.extend(MAXERRDICT[a[0]])
if not MAXERRS:
MAXERRS = None
if opts.DEBUG:
print "DEBUG: filtered %i bins due to zero data error"%len(FILTERED)
## Sanity checks
assert len(IBINS) == len(DBINS)
if not IBINS:
print "No bins ..., exiting"
sys.exit(1)
assert MAXERRS is None or len(IBINS) == len(MAXERRS)
def simpleGoF(params):
"""
Very straightforward goodness-of-fit measure
"""
chi2 = 0.0
for num, ibin in enumerate(IBINS):
## Weight is attached to the ipol bin (default set to 1.0 above)
w = ibin.w
if w == 0:
continue
## Get ipol & ref bin values and compute their difference
ival = ibin.val(params)
dval = DBINS[num].val
diff = dval - ival
## Data error
err2 = DBINS[num].err**2
## Plus interpolation error added in quadrature
maxierr = MAXERRS[num] if MAXERRS else None
err2 += ibin.err(params, emax=maxierr)**2
# TODO: compute asymm error for appropriate deviation direction cf. sum([e**2 for e in ibin.ierrs])
if not err2:
culprit = ""
i_culprit = -1
for k, v in BINDICES.iteritems():
if num in v:
culprit = k
i_culprit = v.index(num)
raise prof.StatError("Zero uncertainty on a bin being used in the fit -- cannot compute a reasonable GoF!\n\tObservable: %s\n\t%s %f +- %f\n\t%s\nSee the weight syntax in the documentation, or use the --filter CL arg to remove bins with zero data error automatically" % (culprit, ibin, ival, ibin.err(params, emax=maxierr), DBINS[num]))
# TODO: should we square w too, so it penalised deviations _linearly_?
chi2 += w * diff**2 / err2
return chi2
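## Worked example of one bin's contribution (illustrative numbers): with w = 1,
## data d = 1.0 +- 0.1, and ipol value f(p) = 0.8 with zero ipol error, the term
## is (1.0 - 0.8)**2 / 0.1**2 = 4.0.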
def simpleGoFGradient(params):
"""
Very straightforward goodness-of-fit measure
"""
dchi2 = [0 for x in params]
for num, ibin in enumerate(IBINS):
## Weight is attached to the ipol bin (default set to 1.0 above)
w = ibin.w
if w == 0:
continue
## Get ipol & ref bin values and compute their difference
ival = ibin.val(params)
dval = DBINS[num].val
diff = dval - ival
## Data error
err2 = DBINS[num].err**2
## Plus interpolation error added in quadrature
maxierr = MAXERRS[num] if MAXERRS else None
err2 += ibin.err(params, emax=maxierr)**2
# TODO: compute asymm error for appropriate deviation direction cf. sum([e**2 for e in ibin.ierrs])
if not err2:
raise prof.StatError("Zero uncertainty on a bin being used in the fit -- cannot compute a reasonable GoF")
# TODO: should we square w too, so it penalised deviations _linearly_?
igrad = ibin.grad(params)
for p in xrange(len(params)):
dchi2[p] += 2 * w * diff * igrad[p] / err2
N=sum(dchi2)
return [x/N for x in dchi2]
## Take parameter names directly from ifile, or fallback
PNAMES = METADATA["ParamNames"].split()
if not PNAMES:
PNAMES = ["A%03i" % i for i in xrange(int(METADATA["Dimension"]))]
## Function definition wrapper
funcdef = prof.mk_fitfunc("simpleGoF", PNAMES, "profGoF")
exec funcdef in locals()
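## (mk_fitfunc presumably generates source for a thin wrapper whose signature
## exposes the parameter names -- as Minuit requires -- and which forwards to
## simpleGoF; the exec binds it here as profGoF.)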
if opts.DEBUG:
print "Built GoF wrapper from:\n '%s'" % funcdef
try:
from iminuit import Minuit
except ImportError, e:
print "Unable to import iminuit, exiting", e
print "Try installing iminuit with pip: pip install iminuit --user"
import sys
sys.exit(1)
if not opts.QUIET:
print "\n"
print 66*"*"
print "* Using iminuit, please visit https://github.com/iminuit/iminuit *"
print 66*"*"
print "\n"
## Ignition
## Dictionary fitarg for iminuit
FARG=dict()
## Initial conditions --- use pos = center of hypercube, and step = range/10
# TODO: Optionally make an initial brute force scan to choose the Minuit starting point, using prof.scangrid
pmins = [float(x) for x in METADATA["MinParamVals"].split()]
pmaxs = [float(x) for x in METADATA["MaxParamVals"].split()]
assert len(pmins) == len(pmaxs)
pmids = [(pmins[i] + pmaxs[i])/2. for i in xrange(len(pmins))]
pranges = [(pmaxs[i] - pmins[i]) for i in xrange(len(pmins))]
if opts.SCAN is not None:
npoints_per_dim=opts.SCAN
print "Scanning %i points"%(npoints_per_dim**len(pmins))
setup=[]
for num, p in enumerate(PNAMES):
setup.append((p, npoints_per_dim, pmins[num], pmaxs[num]))
grid=prof.scangrid(*setup)
winner = grid.next()
winner_v = simpleGoF([x[1] for x in winner])
for num, g in enumerate(grid):
currV=simpleGoF([x[1] for x in g])
if currV < winner_v:
winner=g
winner_v=currV
if ((num+1)%100 ==0):
print "%i/%i complete"%(num+1, npoints_per_dim**len(pmins))
# This sets the start point
#print "Using startpoint:"
for i, aname in enumerate(PNAMES):
assert(aname==winner[i][0])
pmids[i] = winner[i][1]
#print "%s = %f"%(aname, pmids[i])
for i, aname in enumerate(PNAMES):
FARG[aname] = pmids[i]
FARG['error_%s'%aname] = pranges[i] / 10.
## Fix parameters, set limits (with pname translation)
limits, fixed = prof.read_limitsandfixed(opts.LIMITS)
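## A plausible limits-file layout (the precise syntax is defined by
## prof.read_limitsandfixed; this sketch is illustrative only):
##   PAR1  0.5  2.0    #< two numbers: limit range
##   PAR2  1.3         #< one number: fixed value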
for i, pname in enumerate(PNAMES):
if pname in limits.keys():
FARG['limit_%s'%pname] = limits[pname]
if pname in fixed.keys():
if not opts.QUIET:
print "Fixing", pname, "= %f" % fixed[pname]
FARG[pname] = fixed[pname]
FARG['fix_%s' % pname] = True
# TODO: errordef as CL params?
PRINTLEVEL = 0 if opts.QUIET else 1
if opts.GRADIENT:
graddef = prof.mk_fitfunc("simpleGoFGradient", PNAMES, "myGrad")
exec graddef in locals()
minuit = Minuit(profGoF, grad_fcn=myGrad, errordef=1, print_level=PRINTLEVEL, forced_parameters=PNAMES, **FARG)
else:
minuit = Minuit(profGoF, errordef=1, print_level=PRINTLEVEL, forced_parameters=PNAMES, **FARG)
minuit.strategy = opts.STRATEGY
import time
start_time = time.time()
## Lift off
minuit.migrad()
print("Minimisation finished after %s seconds" % (time.time() - start_time))
## Now process the result:
## Goodness of fit
if not opts.QUIET:
chi2 = minuit.fval
ndof = len(DBINS) - (len(PNAMES) - len(fixed.keys()))
print "'chi2': %.2f --- Ndf : %i --- ratio : %.2f" % (chi2, ndof, chi2/ndof)
## Check if result is in validity range
result = [minuit.values[p] for p in PNAMES]
rok, rng = prof.is_inrange(result, pmins, pmaxs)
if not rok:
msg="Warning --- parameters are outside the validity of the parametrisation:"
for i in rng:
msg+="\n %s=%f ! in [%f,%f]"%(PNAMES[i], result[i], pmins[i], pmaxs[i])
msg+= "\n You might want to impose limits (--limits) on those parameters."
if not opts.QUIET:
print msg
# Max number of characters in any of parameter names --- for formatting (ljust)
LMAX=max([len(p) for p in PNAMES])
## Write out result
with open("%s_results.txt" % opts.OUTPUT,"w") as f:
## Meta info
f.write("# ProfVersion: %s\n" % prof.version())
f.write("# Date: %s\n" % prof.mk_timestamp())
f.write("# InterpolationFile: %s\n" % os.path.abspath(IFILE))
f.write("# DataDirectory: %s\n" % os.path.abspath(REFDIR))
## Limits
lstring = ""
for p in PNAMES:
if limits.has_key(p):
lstring += "\n#\t%s\t%f %f" % (p.ljust(LMAX), limits[p][0], limits[p][1])
f.write("#\n# Limits: %s" % lstring)
# Fixed parameters
fstring = ""
for p in PNAMES:
if fixed.has_key(p):
fstring += "\n#\t%s\t%f" % (p.ljust(LMAX), fixed[p])
f.write("\n#\n# Fixed: %s\n" % fstring)
f.write("#\n# Minimisation result:\n#\n")
## The tuned parameter values
for i, p in enumerate(PNAMES):
f.write("%s\t%f\n" % (p.ljust(LMAX), minuit.values[PNAMES[i]]))
# Correlation matrix --- if params are fixed the covariance is not defined
# The keys of minuit.covariance are tuples of strings
f.write("#\n# Correlation matrix:\n#\n")
CNAMES = list(set(k[0] for k in minuit.covariance.keys()))
from math import sqrt
for i in PNAMES:
s="# %s"%i.ljust(LMAX)
for j in PNAMES:
if i in CNAMES and j in CNAMES:
s += " %.2f" % (minuit.covariance[(i,j)]/(sqrt(minuit.covariance[(i,i)])*sqrt(minuit.covariance[(j,j)])))
else:
s+= " ---"
f.write(s+"\n")
# Weights --- dump them all at the end
f.write("#\n#\n# Weights used\n#\n")
#
if matchers is None:
for k in IHISTOS.keys():
f.write("# %s\t1.0\n"%k)
else:
with open(opts.WFILE) as g:
for line in g:
l=line.strip()
if len(l)==0 or l.startswith("#"):
continue
f.write("# %s\n"%l)
## Write out ipolhistos
try:
import yoda
result = [minuit.values[name] for name in PNAMES]
scatters=[IHISTOS[k].toDataHisto(result).toScatter2D() for k in sorted(IHISTOS.keys())]
yoda.writeYODA(scatters, "%s_ipolhistos.yoda" % opts.OUTPUT)
except ImportError:
print "Unable to import yoda, not writing out ipolhistos"
diff --git a/pyext/professor2/dataio.py b/pyext/professor2/dataio.py
--- a/pyext/professor2/dataio.py
+++ b/pyext/professor2/dataio.py
@@ -1,219 +1,219 @@
# -*- python -*-
from professor2.histos import *
from professor2.paramsio import *
import os.path, glob
def read_histos_root(path):
"Load histograms from a ROOT file, into a dict of path -> yoda.Histo[DataBin]"
histos = {}
# TODO: Could just use YODA for everything, including ROOT reading?
try:
import ROOT
ROOT.gROOT.SetBatch(True)
except ImportError:
print "PyROOT not available... skipping"
return histos
# TODO: use yoda.root.getall
def _getallrootobjs(d, basepath="/"):
"Recurse through a ROOT file/dir and generate (path, obj) pairs"
for key in d.GetListOfKeys():
kname = key.GetName()
if key.IsFolder():
# TODO: -> "yield from" in Py3
for i in _getallrootobjs(d.Get(kname), basepath+kname+"/"):
yield i
else:
yield basepath+kname, d.Get(kname)
try:
f = ROOT.TFile(path)
for rname, robj in _getallrootobjs(f):
bins = []
if robj.InheritsFrom("TH1"):
# TODO: allow 2D histos
if robj.InheritsFrom("TH2"):
continue
for ib in xrange(robj.GetNbinsX()):
xmin = robj.GetXaxis().GetBinLowEdge(ib+1)
xmax = robj.GetXaxis().GetBinUpEdge(ib+1)
y = robj.GetBinContent(ib+1)
ey = robj.GetBinError(ib+1)
bins.append(DataBin(xmin, xmax, y, ey))
histos[rname] = Histo(bins, rname)
elif robj.InheritsFrom("TGraph"):
for ip in xrange(robj.GetN()):
x, y = ROOT.Double(), ROOT.Double()
robj.GetPoint(ip, x, y)
xmin = x - robj.GetErrorXlow(ip)
xmax = x + robj.GetErrorXhigh(ip)
ey = (robj.GetErrorXlow(ip) + robj.GetErrorXhigh(ip)) / 2.0
bins.append(DataBin(xmin, xmax, y, ey))
histos[rname] = Histo(bins, rname)
f.Close()
except Exception, e:
print "Can't load histos from ROOT file '%s': %s" % (path, e)
return histos
def read_histos_yoda(path):
"Load histograms from a YODA-supported file type, into a dict of path -> yoda.Histo[DataBin]"
histos = {}
try:
import yoda
s2s = []
aos = yoda.read(path, asdict=False)
for ao in aos:
## Skip the Rivet cross-section and event counter objects
# TODO: Avoid Rivet-specific behaviour by try block handling & scatter.dim requirements
if os.path.basename(ao.path).startswith("_"):
continue
##
s2s.append(ao.mkScatter())
del aos #< pro-active YODA memory clean-up
#
#s2s = [ao.mkScatter() for ao in yoda.read(path, asdict=False)]
for s2 in s2s:
bins = [DataBin(p.xMin, p.xMax, p.y, p.yErrAvg) for p in s2.points]
#bins = [DataBin(p.xMin, p.xMax, p.y*(p.xMax-p.xMin), p.yErrAvg) for p in s2.points]
# bins = [DataBin(p.xMin, p.xMax, p.y, p.yErrs) for p in s2.points]
histos[s2.path] = Histo(bins, s2.path)
del s2s #< pro-active YODA memory clean-up
except Exception, e:
print "Can't load histos from file '%s': %s" % (path, e)
return histos
def read_histos(filepath, stripref=True):
"""
Load histograms from file, into a dict of path -> yoda.Histo[DataBin]
If stripref = True, remove any leading /REF prefix from the histo path
before putting it in the dictionary.
"""
histos = {}
if filepath.endswith(".root"):
histos.update(read_histos_root(filepath))
elif any(filepath.endswith(ext) for ext in [".yoda", ".aida", ".flat"]):
histos.update(read_histos_yoda(filepath))
if stripref:
newhistos = {}
for p, h in histos.iteritems():
if p.startswith("/REF"):
p = p.replace("/REF", "", 1)
h.path = p
newhistos[p] = h
histos = newhistos
return histos
def read_all_histos(dirpath, stripref=True):
"""
Load histograms from all files in the given dir, into a dict of path -> yoda.Histo[DataBin]
If stripref = True, remove any leading /REF prefix from the histo path
before putting it in the dictionary.
"""
histos = {}
filepaths = glob.glob(os.path.join(dirpath, "*"))
for fp in filepaths:
histos.update(read_histos(fp, stripref))
return histos
def read_rundata(dirs, pfname="params.dat", verbosity=1): #, formats="yoda,root,aida,flat"):
"""
Read interpolation anchor point data from a provided set of run directory paths.
Returns a pair of dicts, the first mapping run names (i.e. rundir basenames) to
the parameter value list for each run, and the second mapping observable names
(i.e. histogram paths) to a run -> histo dict.
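For example (hypothetical run and parameter names), the returned pair could
look roughly like:
params = {"run000": {"PAR1": 0.5}, "run001": {"PAR1": 1.5}}
histos = {"/ANA/hist1": {"run000": Histo, "run001": Histo}}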
"""
params, histos = {}, {}
import os, glob, re
re_pfname = re.compile(pfname) if pfname else None
numruns = len(dirs)
for num, d in enumerate(sorted(dirs)):
run = os.path.basename(d)
- if verbosity >= 2 or (verbosity >= 1 and (num % 100 == 99)): # or num == numruns-1)):
+ if verbosity >= 2 or (verbosity >= 1 and (num % 100 == 99 or num == 0)):
pct = 100*(num+1)/float(numruns)
print "Reading run '%s' data: %d/%d = %2.0f%%" % (run, num+1, numruns, pct)
files = glob.glob(os.path.join(d, "*"))
for f in files:
## Params file
#if os.path.basename(f) == pfname:
if re_pfname and re_pfname.search(os.path.basename(f)):
params[run] = read_paramsfile(f)
## Histo file
else:
try:
## Read as a path -> Histo dict
hs = read_histos(f)
## Restructure into the path -> run -> Histo return dict
for path, hist in hs.iteritems():
histos.setdefault(path, {})[run] = hist
except:
pass #< skip files that can't be read as histos
## Check that a params file was found and read in this dir... or that no attempt was made to find one
if pfname:
if run not in params.keys():
raise Exception("No params file '%s' found in run dir '%s'" % (pfname, d))
else:
params = None
return params, histos
def read_all_rundata(runsdir, pfname="params.dat", verbosity=1):
rundirs = glob.glob(os.path.join(runsdir, "*"))
return read_rundata(rundirs, pfname, verbosity)
def read_all_rundata_yaml(yamlfile):
from professor2.utils import mkdict
PARAMS = {}
HISTOS = {}
import yaml
print "Loading YAML from %s"%yamlfile
Y=yaml.load(open(yamlfile))
for num, y in enumerate(Y):
P=mkdict()
for p in sorted(y['Params'].keys()):
P[p] = float(y['Params'][p])
PARAMS[num]=P
for f in y['YodaFiles']:
hs = read_histos(f)
for path, hist in hs.iteritems():
HISTOS.setdefault(path, {})[num] = hist
return PARAMS, HISTOS
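## The YAML layout this reader expects (inferred from the loop above; names are
## illustrative) is a list of run entries, e.g.
##   - Params: {PAR1: 0.5, PAR2: 1.2}
##     YodaFiles: ["run000/out.yoda"]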
def find_maxerrs(histos):
"""
Helper function to find the maximum error values found for each bin in the histos double-dict.
histos is a nested dict of DataHisto objects, indexed first by histo path and then by
run name, i.e. it is the second of the two objects returned by read_rundata().
Returns a dict of lists of floats, indexed by histo path. Each list of floats contains the
max error size seen for each bin of the named observable, across the collection of runs
histos.keys().
This function is useful for regularising chi2 etc. computation by constraining interpolated
uncertainties to within the range seen in the run data used to create the ipols, to avoid
blow-up of uncertainties with corresponding distortion of fit optimisation.
TODO: asymm version?
"""
rtn = {}
for hn, hs in histos.iteritems():
numbins_h = hs.itervalues().next().nbins #< hs maps run name -> DataHisto
maxerrs_h = []
for ib in xrange(numbins_h):
emax = max(h.bins[ib].err for h in hs.values())
maxerrs_h.append(emax)
rtn[hn] = maxerrs_h
return rtn
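## Sketch of the intended use (cf. the --limit-errs path in prof2-tune; the
## "mc/" path is illustrative):
##   _, runhistos = read_all_rundata("mc/")
##   maxerrs = find_maxerrs(runhistos) #< dict: histo path -> list of per-bin max errors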
