diff --git a/ChangeLog b/ChangeLog
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,622 +1,626 @@
+2020-10-18  Andy Buckley
+
+  * Fix YODA call-parentheses for v1.8 onwards.
+
2019-09-06  Andy Buckley
  * Release 2.3.2

2019-07-20  Andy Buckley
  * Fix prof2-tune arguments and write out a params summary file that can
    be used with prof2-predict.

2019-05-08  Andy Buckley
  * Apply patches to prof2-I, from Christine Rasmussen.

2019-06-13  Holger Schulz
  * Release 2.3.1

2019-02-04  Holger Schulz
  * Added reading of data from HDF5.

2018-10-02  Holger Schulz
  * Release 2.3.0, probably the last C++ release.

2018-02-07  Andy Buckley
  * Add numCoeffs, numParams, and exprString methods to Ipol.

2017-10-25  Holger Schulz
  * Bugfix for a missing import statement.

2017-12-29  Holger Schulz
  * Exposed more C++ functions to Python for Pade approximation testing.

2017-11-10  Holger Schulz
  * Release 2.2.2 --- the result of many months of fiddling, especially
    with the multinest requirements.
  * Sampling
    --- sobol, latin hypercube and random uniform samplers
    --- a line like "PARAM 0 1 2 3" results in sub-space sampling
    --- biases now work, e.g. "PARAM 0 1 exp(x)"
  * Ipol
    --- --order auto and friends automatically determine the best order;
        this avoids oversampling
    --- --logy parameterises log(y), useful for large parameter spaces and
        especially likelihoods
    --- --medianfilt can drop anchor points based on their error; this is
        experimental and meant to deal with the occasional huge NLO weight
    --- the output now stores the valid region for each object at the end
        of the coefficients, and this is also where they are read from;
        this is what allows the usage of --medianfilt etc.
  * Tune
    --- replace the grid sampling with the already existing sampler
        machinery, so random, latin and sobol sampling can be used with
        --scan to find a good start point for minuit
    --- --profiles and --contours store the minuit and minos plots
    --- limits are now set to the ipol region by default; this can be
        overridden with the -x command-line arg
    --- the tune script can read more than one ipol file
    A lot of stuff has been moved into libraries to allow reusing
    individual steps.
  * Multinest --- now supported via prof2-tune-nest
  * GP --- gp_minimize via prof2-tune-gp
  * emcee --- MCMC sampling via prof2-emcee
  * Eigentunes --- prof2-eigentunes is back and working; deltaChi2 must,
    however, be provided via --target
  * Bootstrapping (prof2-bootstrap) works in principle, but smearing is
    probably more complicated than what we have now.

2017-08-22  Holger Schulz
  * Improvements to Ipol --- we now allow a different order for each bin,
    and there is also --order auto, which does some jackknifing to figure
    out the best-suited polynomial order and so avoid overfitting. The
    Ipol output format is now "binned 3", which dumps the min and max
    param values at the end of the coefficient string. This is necessary
    as we now also allow filtering the inputs by median error (needed to
    tame spiky NLO histograms). The ranges are now read back for each bin
    separately; this happens in the C++ part.
  * Overhaul of sampling, including full support for sobol sampling.
    Also, param range files with more than two numbers on a line are
    interpreted as sub-spaces for patch sampling.
  * Add script prof2-ipolenvelopes, which for each bin finds the minimum
    and maximum polynomial value (using migrad) within the range of the
    ipol's validity. This is especially nice for checking envelopes once
    filtering was done in the ipol step.

2017-05-18  Holger Schulz
  * Refitting of ipols using smeared ipol output. Required some
    modification of the data I/O structure.
2017-05-04  Holger Schulz
  * Fix in prof2-tune when checking analysis names; at the same time allow
    for closure-test tuning, i.e. tuning against an input folder.

2017-05-02  Holger Schulz
  * Use -march=native, as suggested in the eigen3 FAQ, for better
    performance, as well as -O3 in pyext
  * Replace the modulus-based status message with a simple counter in the
    ipol stage
  * Set version to 2.2.2beta for imminent release

2017-04-28  Holger Schulz
  * Use BDCSVD in case Eigen >= 3.3.3
  * Reworked prof2-ipol and ipol.py to use mk_ipolbin rather than
    mk_ipolhisto: some speedups, plus the additional benefit of being able
    to fiddle with individual bins in the ipol building stage
  * Add --minos switch to prof2-tune to run MINOS
  * Write GOF and NDOF into the result file

2017-02-21  Andy Buckley
  * Change prof2-ipol ierr mode default to 'symm'

2017-02-17  Andy Buckley
  * Add a --limit-errs argument for prof2-tune, to avoid annoying
    re-reading of large run dirs
  * Tweaks to the default error ipol order, and release figure handles in
    envelope plotting.

2017-02-02  Holger Schulz
  * Update copyright string
  * Instead of exiting when prof2-ipol encounters NaN coefficients in
    value or error ipols, spit out a warning and ignore that histogram
    entirely --- this makes prof2-ipol and friends much more robust
  * Tag as 2.2.1 and release.

2017-02-01  Holger Schulz
  * Use Agg as the default matplotlib backend.
  * Bump version to 2.2.1

2016-12-19  Holger Schulz
  * Even more robust prof2-tune; added a --filter CL option to
    automatically get rid of zero-error bins
  * Fixed a bug where IpolBin.n was never set, which is needed for the
    PointMatcher (i.e. weights syntax)
  * Not using the YODA PointMatcher but our own, due to the different
    regexes required.
  * Make release 2.2

2016-12-15  Holger Schulz
  * More robust scripts and more meaningful error messages, especially in
    prof2-tune

2016-11-30  Holger Schulz
  * Replace the Counter code used when running mk_Structure with much
    faster code
  * Bump version to 2.2
  * Bump DataFormat to "binned 2"
  * Allow passing file names or open files (or similar streams) to the
    ipolio functions
  * prof2-ipol can read YAML now
  * Add some of Elliot's code for choosing sub-spaces in prof2-ipol, and
    some plot routines, to contrib
  * Add SOBOL sampling to contrib

2016-08-26  Holger Schulz
  * Use a more readable Cython DL path in bootstrap

2016-07-18  Andy Buckley
  * Typo fixes -- thanks to Leif Gellersen.

2016-07-14  Andy Buckley
  * Release version 2.1.4
  * Improve Makefile tests for ROOT and Cython versions.

2016-07-11  Andy Buckley
  * Add Ipol min/max param vals getters and map them into Python.
  * Add a built-in /REF-stripping option to data histo loading.
  * Improvements to the Python ipol metadata interface.

2016-07-05  Andy Buckley
  * Try to optimise long-vector building by not executing pow() if the
    exponent would be zero.
  * Split the C++ calcValue function into two, to allow re-use of a
    precalculated long-vector.

2016-06-23  Andy Buckley
  * Split the 'dot product' part of the ipol value calculation into a
    standalone calcValue function.
  * Use const references when passing const vectors in the C++ API.
2016-04-14  Holger Schulz
  * Introduce 1D sensitivity based on the gradient
  * The user needs to specify --cmap or --grad when running prof2-sens now

2016-04-01  Holger Schulz
  * Add a first version of the parametrisation using machine learning

2016-02-16  Holger Schulz
  * Bump to version 2.1.3

2016-02-15  Holger Schulz
  * prof2-sens: bugfix, add weight file usage, clean up CL
  * prof2-I: add weight file usage
  * prof2-predict: add weight file usage, fix output writing
  * Extended documentation

2016-02-12  Holger Schulz
  * Documentation, including asciinema

2016-02-11  Holger Schulz
  * Remove the min_runs function, replace with numCoeffs from the C++ code
  * Add prof2-envelopes script
  * In the prof2-tune simplechi2 --- continue on weight==0 to allow some
    robustness when switching off bins

2016-02-10  Holger Schulz
  * First version of prof2-runcombs, plus sampling functions salvaged from
    Prof1, modified to have no scipy dependence
  * Add --rc switch to prof2-ipol to optionally read runcombs.
    Syntax is --rc runcombs.dat[colon]5 (minimally invasive add-on)
  * Code cull in prof2-ipol. Only support the multiprocessing version,
    default 1 thread. Changed the CL switch to '-j'.

2015-12-16  Holger Schulz
  * Only calculate the polynomial structure once per instance ->
    significant speed increase
  * Add the brute-force grid scan (--scan) to prof2-tune

2015-12-11  Holger Schulz
  * Minimiser tweaks --- no more need for param translation, make the
    strategy steerable (--strategy or -s), -q now suppresses iminuit
    output, calculation and writing of the correlation matrix into the
    results, dump weights into the results file

2015-12-10  Holger Schulz
  * Undo UI madness
  * Remove numpy dependence (except sampling.py)
  * Add prof-sens for sensitivity plotting exploiting derivatives

2015-12-09  Holger Schulz
  * Can specify PREFIX when calling make

2015-12-09  Andy Buckley
  * Improve prof2-predict UI.
  * Add metadata printing in prof2-lsipol debug mode.
  * Add -v and -q flag shortcuts to all scripts.

2015-12-07  Holger Schulz
  * Use __all__ in __init__.py
  * Started sphinx documentation
  * Support only iminuit, bump version to 2.2.alpha, restructure imports
  * Sanity checks when calling parametrisation

2015-12-03  Andy Buckley
  * Provide a version() function in the C++ library, and map it into
    Python. Both sources use the VERSION variable defined in the Makefile
    at build time.
  * Reduce the SVD fitting threshold to 1e-20.

2015-12-03  Holger Schulz
  * Make derivative calculation work also when scaling params
  * Let YODA write YODA

2015-12-02  Holger Schulz
  * Derivative calculation

2015-11-28  Andy Buckley
  * Version 2.1.2
  * Revert the range shift in Ipol.cc, since it produced (small) numerical
    errors rather than improved stability. To be understood...
  * Add test/mkpolydata script, for generating polynomial pseudodata used
    in closure tests.

2015-11-27  Andy Buckley
  * Fix bug in histo loading.
  * Adding log binning and other options to prof2-residuals.

2015-11-24  Andy Buckley
  * Version 2.1.1 release.
  * Fix prof2-predict YODA writing, and improve DataHisto.toYODA.
  * Change parameter rescaling from using the [0,1] range to instead use
    [1,2], to avoid arbitrarily scaled param numbers. BREAKS IPOL FILE
    COMPATIBILITY WITH 2.1.0!!
  * Set minimizer initial step size = prange/10.
  * Add automatic determination of maximum MC errors in prof-tune, and use
    them to regularize interpolated errs which could distort the fit.
  * Separate internal Minuit function names (with syntax restrictions)
    from the free-form external param names.
2015-11-23  Andy Buckley
  * Add pyext/professor2/chi2.py
  * Add find_maxerrs() to the Python interface, for use in error
    regularisation.
  * Use the param file name as a regex.
  * Use __slots__ in the internal histogram and bin types, for better
    memory efficiency.
  * Fix (re-fix?) use of inappropriate indices in the calculation of mean
    and median MC uncertainties.
  * More proactive data object deletion after conversion to the YODA data
    format.

2015-11-22  Andy Buckley
  * Version 2.1.0 (significant version change due to scaling, all-orders,
    and ROOT support)

2015-11-18  Andy Buckley
  * dataio.py: Add ROOT file reading ability.
  * Change rescaling I/O behaviour a bit, moving some hacked Python-only
    Ipol string constructors into explicit API improvements, and only
    activating rescaling if DoParamScaling is set true.

2015-11-16  Andy Buckley
  * Add CPPFLAGS steering in the Makefile, and API tidying.

2015-11-11  Holger Schulz
  * Add automatic raw param value -> unit range mapping to Ipol.
  * Add any-order long-vector calculation.

2015-11-10  Andy Buckley
  * Add prof2-lsipol as a handy way to make a tuning weights file, and for
    general convenience.

2015-10-06  Andy Buckley
  * Version 2.0.0
  * Add 'make dist' target to the Makefile, and tweak/reduce the Cython
    necessity
  * Allow calls to IpolBin and IpolHisto methods with params passed as
    unpacked *args as well as a packed iterable.
  * Add the possibility to pass params as a dict-like as well as a list or
    tuple of floats -- note that the dict-like must return *ordered*
    values consistent with the Ipol training. Also ensure the float type.
  * Add conversion of C++ exceptions to Python exceptions.
  * Add professor2.utils module, providing an opportunistic use of
    OrderedDict for params, so that the params order required by Ipol is
    the same as the order in which they are specified in the (first run's)
    params.dat file.
  * Change Python-mapped Ipol methods with no args to be properties.
  * Add nice __repr__ methods for Histo and Bin objects.
  * Specialise Histo as IpolHisto and DataHisto, and provide
    toData{Histo,Bin} on IpolHisto and IpolBin respectively.
  * Remove ProfMaster.

2015-10-04  Andy Buckley
  * Add protection of the svd.setThreshold call to ensure it is
    supported/needed, via Eigen version number macros.
  * Add vmin and vmax args to Ipol and IpolBin in Python.

2015-10-02  Andy Buckley
  * Put histo file reading in a try..except block so read failures on
    non-histo files in the run dirs are not fatal.
  * Add protection in the histo loader, so we don't trip up on Rivet's new
    cross-section and counter objects.

2015-09-30  Andy Buckley
  * src/Ipol.h: Use the Eigen/SVD include path rather than the
    non-standard eigen3/Eigen/SVD.

2015-09-14  Holger Schulz
  * Use Eigen's svd.setThreshold(1e-20) to prevent the errors we saw with
    Eigen 3.2.2 and later when looking at 5th-order polynomials in 3
    dimensions with 900+ anchors
  * Some clean-up

2015-09-11  Andy Buckley
  * Adding prof2-residuals script for ipol faithfulness testing.
  * Rename scripts from prof-* to prof2-* to allow parallel installations
    of Prof1 and Prof2.
  * Move some non-core scripts from bin to contrib.
  * Make the root-config dependency optional
  * Lots of reworking to make prof-tune work again, and to use weight file
    parsing.

2015-09-10  Andy Buckley
  * Use weight file path parsing in prof-ipol.
  * Add prof.weights submodule, with a copy of PointMatcher and a new
    read_pointmatchers function for handling weight files.
  * Provide a __version__ attribute in the Python module.
  * User script CLI consistency, simplification, general smoothing...
2015-08-30  Andy Buckley
  * Add recursive scangrid generator function, and other tweaks.

2015-08-18  Andy Buckley
  * Tweaks to the ipol file parsing functions.
  * Fix a bug in IpolBin's use of interpolated errors.
  * Convert mean and median error parameterisation to use the new
    0th-order ipols.

2015-08-15  Holger Schulz
  * A first version of catching singular matrices in the SVD
  * Ipol has one additional argument, "threshold", that determines which
    singular values are considered 0; fully propagated to pyext
  * Going 2 Pro 2 Fessional
  * Some startup checks in the prof- scripts
  * Remove bin/prof-sampling, rename prof-prediction -> prof-predict

2015-08-13  Holger Schulz
  * Reverted unintended commit of a hack.
  * Added a very basic prof-config script to bin
  * Update PATH when sourcing setup.sh

2015-08-12  Holger Schulz
  * Added prof-prediction, which reads in an ifile and a point in
    parameter space (either a text file, a comma-separated list or just
    the args) and writes out an ipolhisto. Tested to work.

2015-08-03  Holger Schulz
  * Added 0-order polynomials, i.e. constant values. Currently, the
    coefficient is simply the value of the first anchor point.

2015-07-15  Holger Schulz
  * Some pull plotting functionality

2015-07-08  Holger Schulz
  * Calculate and print G.o.f.
  * Write some meta info to the results
  * Read limits and fixed params from a single text file: --limits
  * Output files steered by --output
  * Remove the now-obsolete bin/prof-interpolate

2015-07-08  Holger Schulz
  * First working version of prof-tune (pyminuit, simpleGOF)
  * Multiprocessing to speed up prof-ipol (--multi)
  * prof-ipol-tabulated to write out an interpolation of tabulated data
  * Minor bugfixes

2015-06-16  Andy Buckley
  * Adding mean and median strategies for assigning constant ipol bin
    errors.

2015-06-03  Andy Buckley
  * Adding a Bin base class on the Python side, and coherent handling of
    multiple (interpolated) bin errors.
  * Various tweaks and minor fixes.

2015-05-10  Andy Buckley
  * Add 'Minuit' class importing to the __init__.py file.

2015-05-05  Andy Buckley
  * Rewrite C++ ipolstring parsing using STL string functions rather than
    Boost: shorter! Boost dependency GONE :-)
  * Replace boost::tuple with std::pair and clean up the ParamPoints
    interface a bit.
  * Add a --veto option to prof-sample, for a user-specified vetoing
    function.
  * Move Sampler from prof-sample into professor2/__init__.py

2015-05-04  Andy Buckley
  * Adding a first version of prof-tune with ipol reading from file.
  * Adding options for (average) error interpolation and ipol persistency
    to prof-ipol and module functions.

2015-05-03  Andy Buckley
  * Add bin/prof-ipol simple script using the functions below -- to be
    merged with prof-interpolate.
  * professor2/__init__.py: Add Histo, DataBin, IpolBin and basic data
    handling functions.
  * Remove unnecessary dlopen fiddling from professor2/__init__.py

2015-04-23  Andy Buckley
  * Remove the bound ParamPoints pointer from Ipol, and ditch lazy coeff
    evaluation in favour of simplicity and construct-time coeff
    calculation.
  * Move long-vector and coeff building into unbound functions rather than
    Ipol methods.
  * Start of ChangeLog. Library has already iterated quite a bit.
  * FILL IN EARLY HISTORY FROM HG LOG

diff --git a/pyext/professor2/dataio.py b/pyext/professor2/dataio.py
--- a/pyext/professor2/dataio.py
+++ b/pyext/professor2/dataio.py
@@ -1,402 +1,402 @@
# -*- python -*-

from professor2.histos import *
from professor2.paramsio import *

import os.path, glob


def read_histos_root(path):
    "Load histograms from a ROOT file, into a dict of path -> yoda.Histo[DataBin]"
    histos = {}

    # TODO: Could just use YODA for everything, including ROOT reading?
    try:
        import ROOT
        ROOT.gROOT.SetBatch(True)
    except ImportError:
        print("PyROOT not available... skipping")
        return histos

    # TODO: use yoda.root.getall
    def _getallrootobjs(d, basepath="/"):
        "Recurse through a ROOT file/dir and generate (path, obj) pairs"
        for key in d.GetListOfKeys():
            kname = key.GetName()
            if key.IsFolder():
                # TODO: -> "yield from" in Py3
                for i in _getallrootobjs(d.Get(kname), basepath+kname+"/"):
                    yield i
            else:
                yield basepath+kname, d.Get(kname)

    try:
        f = ROOT.TFile(path)
        for rname, robj in _getallrootobjs(f):
            bins = []
            if robj.InheritsFrom("TH1"):
                # TODO: allow 2D histos
                if robj.InheritsFrom("TH2"):
                    continue
                for ib in range(robj.GetNbinsX()):
                    xmin = robj.GetXaxis().GetBinLowEdge(ib+1)
                    xmax = robj.GetXaxis().GetBinUpEdge(ib+1)
                    y = robj.GetBinContent(ib+1)
                    ey = robj.GetBinError(ib+1)
                    bins.append(DataBin(xmin, xmax, y, ey))
                histos[rname] = Histo(bins, rname)
            elif robj.InheritsFrom("TGraph"):
                for ip in range(robj.GetN()):
                    x, y = ROOT.Double(), ROOT.Double()
                    robj.GetPoint(ip, x, y)
                    xmin = x - robj.GetErrorXlow(ip)
                    xmax = x + robj.GetErrorXhigh(ip)
                    ey = (robj.GetErrorYlow(ip) + robj.GetErrorYhigh(ip)) / 2.0
                    bins.append(DataBin(xmin, xmax, y, ey))
                histos[rname] = Histo(bins, rname)
        f.Close()
    except Exception as e:
        print("Can't load histos from ROOT file '%s': %s" % (path, e))

    return histos


def read_histos_yoda(path):
    "Load histograms from a YODA-supported file type, into a dict of path -> yoda.Histo[DataBin]"
    histos = {}
    try:
        import yoda
        s2s = []
        aos = yoda.read(path, asdict=False)
        for ao in aos:
            import os
            ## Skip the Rivet cross-section and event counter objects
            # TODO: Avoid Rivet-specific behaviour by try block handling & scatter.dim requirements
-            if os.path.basename(ao.path).startswith("_"):
+            if os.path.basename(ao.path()).startswith("_"):
                continue
            ##
            s2s.append(ao.mkScatter())
        del aos  #< pro-active YODA memory clean-up
        #
-        for s2 in filter(lambda x: x.dim == 2, s2s):  # Filter for Scatter1D
-            bins = [DataBin(p.xMin, p.xMax, p.y, p.yErrAvg) for p in s2.points]
-            histos[s2.path] = Histo(bins, s2.path)
+        for s2 in filter(lambda x: x.dim() == 2, s2s):  # Filter for Scatter1D
+            bins = [DataBin(p.xMin(), p.xMax(), p.y(), p.yErrAvg()) for p in s2.points()]
+            histos[s2.path()] = Histo(bins, s2.path())
        del s2s  #< pro-active YODA memory clean-up
    except Exception as e:
        print("Can't load histos from file '%s': %s" % (path, e))
    return histos

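For reference, a minimal sketch of the accessor convention the hunk above adapts to: from YODA 1.8 onwards, accessors such as path(), dim(), xMin() and points() are method calls rather than attributes ("data.yoda" below is only a placeholder file name).

    import yoda

    # Illustrative only: "data.yoda" is a placeholder input file
    for ao in yoda.read("data.yoda", asdict=False):
        s2 = ao.mkScatter()
        if s2.dim() == 2:              # YODA >= 1.8: dim is a method...
            for p in s2.points():      # ...as are points(), xMin(), y(), etc.
                print(s2.path(), p.xMin(), p.xMax(), p.y(), p.yErrAvg())
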
""" histos = {} if filepath.endswith(".root"): histos.update(read_histos_root(filepath)) elif any(filepath.endswith(ext) for ext in [".yoda", ".aida", ".flat"]): histos.update(read_histos_yoda(filepath)) if stripref: newhistos = {} for p, h in histos.items(): if p.startswith("/REF"): p = p.replace("/REF", "", 1) h.path = p newhistos[p] = h histos = newhistos return histos def read_all_histos(dirpath, stripref=True): """ Load histograms from all files in the given dir, into a dict of path -> yoda.Histo[DataBin] If stripref = True, remove any leading /REF prefix from the histo path before putting it in the dictionary. """ histos = {} filepaths = glob.glob(os.path.join(dirpath, "*")) for fp in filepaths: histos.update(read_histos(fp, stripref)) return histos def read_rundata(dirs, pfname="params.dat", verbosity=1): #, formats="yoda,root,aida,flat"): """ Read interpolation anchor point data from a provided set of run directory paths. Returns a pair of dicts, the first mapping run names (i.e. rundir basenames) to the parameter value list for each run, and the second mapping observable names (i.e. histogram paths) to a run -> histo dict. """ params, histos = {}, {} import os, glob, re re_pfname = re.compile(pfname) if pfname else None numruns = len(dirs) for num, d in enumerate(sorted(dirs)): run = os.path.basename(d) if verbosity >= 2 or (verbosity >= 1 and (num % 100 == 99 or num == 0)): pct = 100*(num+1)/float(numruns) print("Reading run '%s' data: %d/%d = %2.0f%%" % (run, num+1, numruns, pct)) files = glob.glob(os.path.join(d, "*")) for f in files: ## Params file #if os.path.basename(f) == pfname: if re_pfname and re_pfname.search(os.path.basename(f)): params[run] = read_paramsfile(f) ## Histo file else: try: ## Read as a path -> Histo dict hs = read_histos(f) ## Restructure into the path -> run -> Histo return dict for path, hist in hs.items(): histos.setdefault(path, {})[run] = hist except: pass #< skip files that can't be read as histos ## Check that a params file was found and read in this dir... or that no attempt was made to find one if pfname: if run not in list(params.keys()): raise Exception("No params file '%s' found in run dir '%s'" % (pfname, d)) else: params = None return params, histos def read_params(topdir, pfname="params.dat", verbosity=0): """ Read interpolation anchor point data from a provided set of run directory paths. Returns a dict mapping run names (i.e. rundir basenames) to the parameter value list for each run. """ params = {} import os, glob, re re_pfname = re.compile(pfname) if pfname else None dirs = [x for x in glob.glob(os.path.join(topdir, "*")) if os.path.isdir(x)] numruns = len(dirs) for num, d in enumerate(sorted(dirs)): run = os.path.basename(d) if verbosity >= 2 or (verbosity >= 1 and (num % 100 == 99 or num == 0)): pct = 100*(num+1)/float(numruns) print("Reading run '%s' data: %d/%d = %2.0f%%" % (run, num+1, numruns, pct)) files = glob.glob(os.path.join(d, "*")) for f in files: ## Params file #if os.path.basename(f) == pfname: if re_pfname and re_pfname.search(os.path.basename(f)): params[run] = read_paramsfile(f) ## Check that a params file was found and read in this dir... or that no attempt was made to find one if pfname: if run not in list(params.keys()): raise Exception("No params file '%s' found in run dir '%s'" % (pfname, d)) else: params = None return params # TODO this is much slower --- understand why! 
# TODO this is much slower --- understand why!
# # http://stackoverflow.com/questions/16415156/using-sets-with-the-multiprocessing-module
# def read_rundata(dirs, pfname="params.dat", verbosity=1, nthreads=1): #, formats="yoda,root,aida,flat"):
#     """
#     Read interpolation anchor point data from a provided set of run directory paths.
#     Returns a pair of dicts, the first mapping run names (i.e. rundir basenames) to
#     the parameter value list for each run, and the second mapping observable names
#     (i.e. histogram paths) to a run -> histo dict.
#     """
#     params, histos = {}, {}
#     import os, glob, re
#     re_pfname = re.compile(pfname) if pfname else None
#     import time, multiprocessing
#     time1 = time.time()
#     from multiprocessing.managers import Manager
#     params = {}
#     histos = {}
#     manager = SyncManager()
#     manager.start()
#     histflat = manager.list()
#     # Logic:
#     #
#     # Only directories containing the uniquely named params file are valid,
#     # so read the params first and ignore all directories not having one
#     # of those and then use the runs to prepare structure for multiproc dict
#     #
#     ## The job queue
#     q = multiprocessing.Queue()
#
#     for d in dirs:
#         run = os.path.basename(d)
#         files = glob.glob(os.path.join(d, "*"))
#         for f in files:
#             ## Params file
#             if re_pfname and re_pfname.search(os.path.basename(f)):
#                 params[run] = read_paramsfile(f)
#                 # histos[run] = {}
#                 q.put(d)
#     import sys
#
#     def worker(q, rflat):
#         while True:
#             if q.empty():
#                 break
#             d = q.get()
#             run = os.path.basename(d)
#             files = glob.glob(os.path.join(d, "*"))
#             for f in files:
#                 ## Params file
#                 if re_pfname and not re_pfname.search(os.path.basename(f)):
#                     try:
#                         ## Read as a path -> Histo dict
#                         hs = read_histos(f)
#                         temp = []
#                         for path, hist in hs.iteritems():
#                             rflat.append([path, run, hist])
#                     except Exception, e:
#                         print e
#                         pass #< skip files that can't be read as histos
#
#     workers = [multiprocessing.Process(target=worker, args=(q, histflat)) for i in range(nthreads)]
#     map(lambda x: x.start(), workers)
#     map(lambda x: x.join(), workers)
#
#     time2 = time.time()
#     sys.stderr.write('\rReading took %0.2fs\n\n' % ((time2-time1)))
#     for p, r, h in histflat:
#         histos.setdefault(p, {})[r] = h
#     time3 = time.time()
#     sys.stderr.write('\rData preparation took %0.2fs\n\n' % ((time3-time2)))
#     return params, histos


def read_all_rundata(runsdir, pfname="params.dat", verbosity=1):  #, nthreads=1):
    rundirs = glob.glob(os.path.join(runsdir, "*"))
    return read_rundata(rundirs, pfname, verbosity)  #, nthreads)


def read_all_rundata_yaml(yamlfile):
    from professor2.utils import mkdict
    PARAMS = {}
    HISTOS = {}
    import yaml
    print("Loading YAML from %s" % yamlfile)
-    Y=yaml.load(open(yamlfile))
+    Y = yaml.load(open(yamlfile))
    for num, y in enumerate(Y):
-        P=mkdict()
+        P = mkdict()
        for p in sorted(y['Params'].keys()):
            P[p] = float(y['Params'][p])
-        PARAMS[num]=P
+        PARAMS[num] = P
        for f in y['YodaFiles']:
            hs = read_histos(f)
            for path, hist in hs.items():
                HISTOS.setdefault(path, {})[num] = hist
    return PARAMS, HISTOS

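For reference, a hedged sketch of the YAML layout that read_all_rundata_yaml appears to expect: the 'Params' and 'YodaFiles' field names come from the code above, while the file and parameter names are made up.

    # anchors.yaml -- one list entry per anchor point (hypothetical example):
    #
    # - Params:
    #     PAR1: 0.5
    #     PAR2: 1.2
    #   YodaFiles:
    #     - runs/0000/out.yoda
    # - Params:
    #     PAR1: 0.7
    #     PAR2: 0.9
    #   YodaFiles:
    #     - runs/0001/out.yoda

    from professor2.dataio import read_all_rundata_yaml
    params, histos = read_all_rundata_yaml("anchors.yaml")
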
def find_maxerrs(histos):
    """
    Helper function to find the maximum error values found for each bin in the histos double-dict.

    histos is a nested dict of DataHisto objects, indexed first by histo path and then
    by run name, i.e. it is the second of the two objects returned by read_rundata().

    Returns a dict of lists of floats, indexed by histo path. Each list of floats
    contains the max error size seen for each bin of the named observable, across the
    collection of runs histos.keys().

    This function is useful for regularising chi2 etc. computation by constraining
    interpolated uncertainties to within the range seen in the run data used to create
    the ipols, to avoid blow-up of uncertainties with corresponding distortion of fit
    optimisation.

    TODO: asymm version?
    """
    rtn = {}
    for hn, hs in histos.items():
        numbins_h = next(iter(hs.values())).nbins
        maxerrs_h = []
        for ib in range(numbins_h):
            emax = max(h.bins[ib].err for h in list(hs.values()))
            maxerrs_h.append(emax)
        rtn[hn] = maxerrs_h
    return rtn


def read_all_rundata_h5(fname):
    """
    Helper/transition function --- read tuning data from an HDF5 file and build the
    Histo and params structures.
    """
    try:
        import h5py
    except ImportError:
        raise Exception("Module h5py not found --- try pip install h5py")

    with h5py.File(fname, "r") as f:
        binNames = [i.astype(str) for i in f.get("index")]
        pnames = [p.astype(str) for p in f.get("params").attrs["names"]]
        _P = [dict(zip(pnames, p)) for p in f.get("params")]  # Parameter points as dicts

    import numpy as np
    # Strip the bin numbers and use set to get the list of unique histo names
    binNamesStripped = np.array([i.split("#")[0] for i in binNames])
    histoNames = sorted(list(set(binNamesStripped)))

    # dict structure for convenient access of bins by index --- histoname : [indices ...]
    lookup = {hn: np.where(binNamesStripped == hn)[0] for hn in histoNames}

    histos = {}
    with h5py.File(fname, "r") as f:
        # Read y-values and errors
        # NOTE --- the first index is the bin index, the second is the run index,
        # e.g. Y[33][122] is the y-value of bin 34 in run 123
        Y = f.get("values")[:]
        E = f.get("errors")[:]
        xmin = f.get("xmin")[:]
        xmax = f.get("xmax")[:]

    # Usable run indices, i.e. those where not everything is NaN
    USE = np.where(~np.all(np.isnan(Y), axis=0))[0]
    # The usable parameter points
    P = [_P[u] for u in USE]

    for hname in histoNames:
        histos[hname] = {}
        # Connection between histograms and dataset indices
        bindices = lookup[hname]
        # This is an iteration over runs
        for u in USE:
            _Y = Y[bindices][:, u]
            _E = E[bindices][:, u]
            _xmin = xmin[bindices]
            _xmax = xmax[bindices]
            _bins = [DataBin(_a, _b, _c, _d) for _a, _b, _c, _d in zip(_xmin, _xmax, _Y, _E)]
            _histo = Histo(_bins, hname)
            histos.setdefault(hname, {})[u] = _histo

    params = {u: p for u, p in zip(USE, P)}

    return params, histos


if __name__ == "__main__":
    import sys
-    P,H = read_all_rundata_h5(sys.argv[1])
+    P, H = read_all_rundata_h5(sys.argv[1])
diff --git a/pyext/professor2/histos.py b/pyext/professor2/histos.py
--- a/pyext/professor2/histos.py
+++ b/pyext/professor2/histos.py
@@ -1,290 +1,288 @@
# -*- python -*-
-
-
class Histo(object):
    "A simple histogram -- just a Bin container with an optional path name"

    __slots__ = ["_bins", "path"]

    def __init__(self, bins=None, path=None):
        self._bins = bins if bins else []
        self.path = path

    @property
    def nbins(self):
        return len(self.bins)

    @property
    def bins(self):
        return self._bins

    @bins.setter
    def bins(self, bs):
        self._bins = bs
        self._bins.sort()
        for i, b in enumerate(self._bins):
            b.n = i

    def __len__(self):
        return self.nbins

    def __repr__(self):
        return "<%s with %d bins>" % (self.__class__.__name__, self.nbins)


class DataHisto(Histo):
    "Specialisation of Histo as a container of DataBins"

    def __init__(self, dbins=None, path=None):
        Histo.__init__(self, dbins, path)

    def toScatter2D(self, manpath=None):
        path = manpath if manpath is not None else self.path
        points = [(self.bins[i].xmid, self.bins[i].val,
                   [self.bins[i].xmid - self.bins[i].xmin, self.bins[i].xmax - self.bins[i].xmid],
                   self.bins[i].errs) for i in range(len(self.bins))]
        import yoda
        return yoda.Scatter2D(points, path, path)

    # def mkSmearedCopy(self):
    #     Yorig = [b.val for b in self.bins]
    #     from copy import deepcopy
    #     B = deepcopy(self.bins)
    #     Ysmeared = [b.smearBin() for b in B]
    #     dY = [0.5*abs(o - s) for o, s in zip(Yorig, Ysmeared)]
    #     import math
    #     errorfactor = 1.  # math.sqrt(2.)
    #     # Yerrnew = [b.err*errorfactor for b in B]
    #     Yerrnew = [math.sqrt(b.err*b.err + dy*dy) for b, dy in zip(B, dY)]
    #     # Yerrnew = [b.err + dy for b, dy in zip(B, dY)]
    #     for num, b in enumerate(B):
    #         b.val = Ysmeared[num]
    #         b.errs = Yerrnew[num]
    #     return Histo(B, self.path)

    def mkSmearedCopy(self, errorfactor=1):
        Yorig = [b.val for b in self.bins]
        NORM = sum(Yorig)
        from copy import deepcopy
        B = deepcopy(self.bins)
        Ysmeared = [b.smearBin() for b in B]
        NORMsmeared = sum(Ysmeared)
        # from IPython import embed
        # embed()
        # import sys
        # sys.exit(1)
        # import math
        Yerrnew = [b.err*errorfactor for b in B]
        # Yerrnew = [math.sqrt(b.err*b.err + dy*dy) for b, dy in zip(B, dY)]
        # Yerrnew = [b.err + dy for b, dy in zip(B, dY)]
        for num, b in enumerate(B):
            b.val = Ysmeared[num]*NORM/NORMsmeared
            # b.errs = Yerrnew[num]
        return Histo(B, self.path)


class IpolHisto(Histo):
    "Specialisation of Histo as a container of IpolBins"

    def __init__(self, ibins=None, path=None):
        Histo.__init__(self, ibins, path)

    def toDataHisto(self, *params):
        "Convert this IpolHisto to a DataHisto with values and errors computed at params"
        dbins = [ib.toDataBin(*params) for ib in self.bins]
        dhist = DataHisto(dbins, self.path)
        return dhist


class Bin(object):
    "A base class for binned data, handling the common x-edge stuff"

    __slots__ = ["xmin", "xmax", "n"]

    def __init__(self, xmin, xmax, n=None):
        self.xmin = xmin
        self.xmax = xmax
        self.n = n

    @property
    def xmid(self):
        return (self.xmin + self.xmax) / 2.0

    @property
    def xedges(self):
        return (self.xmin, self.xmax)

    def __cmp__(self, other):
        return cmp(self.xmin, other.xmin)

    def smear(self, SEED):
        pass


class DataBin(Bin):
    "A bin containing a data value and its error(s)"

    __slots__ = ["val", "_errs"]

    def __init__(self, xmin, xmax, val=None, errs=None):
        Bin.__init__(self, xmin, xmax)
        self.val = val
        self._errs = errs

    # TODO: return numerical 0 if _errs is None?

    @property
    def err(self):
        "Get a scalar error value, by averaging if necessary"
        if self._errs is None:
            return 0.0
        elif hasattr(self._errs, "__len__"):
            assert len(self._errs) == 2
            return sum(self._errs) / 2.0
        return self._errs

    @err.setter
    def err(self, e):
        "Set a scalar error value"
        assert not hasattr(self._errs, "__len__")
        self._errs = e

    @property
    def errs(self):
        "Get a pair of error values, by construction if necessary"
        if self._errs is None:
            return (0.0, 0.0)
        elif hasattr(self._errs, "__len__"):
            assert len(self._errs) == 2
            return self._errs
        return (self._errs, self._errs)

    @errs.setter
    def errs(self, e):
        "Set a pair of error values"
        if e is None:
            self._errs = None
        elif hasattr(e, "__len__"):
            assert len(e) == 2
            self._errs = e
        else:
            self._errs = [e, e]

    def smearBin(self):
        import random
        return random.gauss(self.val, self.err)

    def __repr__(self):
        return "<%s x=[%.3g..%.3g], y=%.3g, ey=[%.3g,%.3g]>" % \
            (self.__class__.__name__, self.xmin, self.xmax, self.val, self.errs[0], self.errs[1])


class IpolBin(Bin):
    """
    A bin containing a value interpolation and its error(s)

    TODO:
      * Provide ierr and ierrs getter/setter pairs cf. err/errs on DataBin?
        They can't be averaged (?), so not sure it makes sense...
      * Allow ipol'd error handling, with wrapped relative error parameterisation as an option?
""" __slots__ = ["ival", "ierrs", "__dict__"] def __init__(self, xmin, xmax, ival=None, ierrs=None, n=None): Bin.__init__(self, xmin, xmax, n) self.ival = ival self.ierrs = ierrs #def val(self, *params, vmin=None, vmax=None): #< needs Python3 # return self.ival.value(*params, vmin=vmin, vmax=vmax) def val(self, *params, **vminmax): "Get the interpolated value of this bin" vmin = vminmax.get("vmin", None) vmax = vminmax.get("vmax", None) return self.ival.value(*params, vmin=vmin, vmax=vmax) def der(self, *params, **vminmax): "Get the derivative according to the parametrisation" return self.ival.der(*params) def grad(self, *params, **vminmax): "Get the gradient according to the parametrisation" return self.ival.grad(*params) # TODO: need uniform access to two ipols regardless of what's stored (and maybe also a way to make a single avg Ipol...) #def err(self, *params, emin=0, emax=None): #< needs Python3 def err(self, *params, **eminmax): "Get a single, averaged interpolated error for this bin" # emin = eminmax.get("emin", 0) # emax = eminmax.get("emax", None) # if self.ierrs is None: # return 0.0 # elif hasattr(self.ierrs, "__len__"): # assert len(self.ierrs) == 2 # return (self.ierrs[0].value(*params, vmin=emin, vmax=emax) + self.ierrs[1].value(*params, vmin=emin, vmax=emax))/2.0 # else: # return self.ierrs.value(*params, vmin=emin, vmax=emax) es = self.errs(*params, **eminmax) return (es[0] + es[1])/2.0 #def errs(self, params, emin=0, emax=None): #< needs Python3 def errs(self, *params, **eminmax): "Get a pair of interpolated errors for this bin" emin = eminmax.get("emin", 0) emax = eminmax.get("emax", None) if self.ierrs is None: return (0.0, 0.0) elif hasattr(self.ierrs, "__len__"): assert len(self.ierrs) == 2 return (self.ierrs[0].value(*params, vmin=emin, vmax=emax), self.ierrs[1].value(*params, vmin=emin, vmax=emax)) else: e = self.ierrs.value(*params, vmin=emin, vmax=emax) return (e, e) @property def has_const_err(): "Determine whether this bin's errors are fixed or variable -- the latter requires regularisation" if self.ierrs is None: return True if hasattr(self.ierrs, "__len__"): assert len(self.ierrs) == 2 return (self.ierrs[0].order == 0 and self.ierrs[1].order == 0) else: return self.ierrs.order == 0 #def toDataBin(self, *params, vmin=None, vmax=None, emin=0, emax=None): #< needs Python3 def toDataBin(self, *params, **veminmax): #< needs Python3 "Convert this IpolBin to a DataBin with values and errors computed at params, with optional range limits" vmin = veminmax.get("vmin", None) vmax = veminmax.get("vmax", None) emin = veminmax.get("vmin", 0) emax = veminmax.get("vmax", None) db = DataBin(self.xmin, self.xmax, val=self.val(*params, vmin=vmin, vmax=vmax), errs=self.errs(*params, vmin=emin, vmax=emax)) return db def __repr__(self): s = "<%s x=[%.3g..%.3g]" % (self.__class__.__name__, self.xmin, self.xmax) try: s += ", %d params, ival order %d" % (self.ival.dim, self.ival.order) except: pass try: s += ", ierr order %d" % self.ierrs[0].order except: pass s += ">" return s def mkEnvelope(histos): """ Take DataHistos and return coordinates for plotting """ # Iterate over bins, get envelope representation # For each bin, return [xmin, xmax, mean(y), min(y), max(y)] E=[] for num, b in enumerate(histos[list(histos.keys())[0]].bins): t_b = [x.bins[num].val for x in list(histos.values())] E.append([b.xmin, b.xmax, sum(t_b)/len(t_b), min(t_b), max(t_b)]) return E