cms-analysis · edjtscott · Jan 19, 2021
diff --git a/Combine/Checks/README.md b/Combine/Checks/README.md
@@ -0,0 +1,57 @@
+# Impacts
+
+Impacts documentation to come here. 
+
+# Bias studies
+
+Here we provide a script to perform a simple bias study. 
+
+## Inputs
+
+The only pre-requisite is a workspace with a single category, 
+and therefore a single multipdf (the object that controls the envelope method) 
+and a single pdf index corresponding to the choice of functional form.
+
+To create this, it's simple to use the existing combineCards functionality, for example:
+```
+combineCards.py Datacard.txt --ic cat_name > Datacard_cat_name.txt
+```
+
+This creates a .txt datacard with only categories matching the reg exp `cat_name` included. 
+Be careful: you will probably have to manually delete some pdfindex lines at the bottom; 
+the script does not know that these correspond to the analysis categories, 
+and therefore will leave them all in (you only want the one corresponding to the category you are studying). 
+
+Once that is done, you can run your usual `text2workspace` command to generate the `-d, --datacard` input for this script. 
+
+## Usage
+
+The script is split into three different stages:
+ * `-t, --toys`:  throw and save a total of `-n,--nToys` toys for each of the candidate functions included in the envelope
+ * `-f, --fits`:  fit each of those toys and extract the uncertainty
+ * `-p, --plots`: plot the pull distribution of the resulting fits
+
+You can then inspect the output plots and hope to see an approximately gaussian shape with zero mean and unit width.
+Normally, provided that the absolute value of the mean is less than 0.14, this is considered satisfactory. 
+
+The three steps can be run in one go, but it's probably safer to run them one-by-one. 
+Here is an example:
+
+```
+./RunBiasStudy.py -d Datacard_mu_ggH_cat0.root -t 
+./RunBiasStudy.py -d Datacard_mu_ggH_cat0.root -f -c "--cminDefaultMinimizerStrategy 0 --X-rtd MINIMIZER_freezeDisassociatedParams --X-rtd MINIMIZER_multiMin_hideConstants --X-rtd MINIMIZER_multiMin_maskConstraints --X-rtd MINIMIZER_multiMin_maskChannels=2 --freezeParameters MH" 
+./RunBiasStudy.py -d Datacard_mu_ggH_cat0.root -p --gaussianFit
+```
+The options for the second step are passed to combine; these are recommended to get the fit to converge. 
+The additional option on the plotting is fairly self-explanatory; it adds a gaussian fit to the output plot.
+
+## More options
+
+There are various things one can tweak for these studies. 
+Here is a list of the common options: 
+ * `-n,--nToys`: the default number of toys is 1000 per function, but can be lowered or raised. 
+ * `-e,--expectSignal`: the injected signal strength is 1 by default, but zero can also be checked, or higher values for searches. 
+ * `-s,--seed`: the default value of -1 finds a random seed; you can fix this for reproducility if you prefer.
+ * `--poi`: if your parameter of interest is called something other than `r`, say so here.
+ * `--split`: default number of toys to be thrown or fits to be performed in one go. Set to 500 but may need to be lowered for memory reasons if you have a more complicated fit. 
+ * `--selectFunction`: you can specify a string here to only select certain functions for these studies (e.g. `bern` to match all Bernstein polynomials, `exp1` to match just the first-order exponential).
diff --git a/Combine/Checks/RunBiasStudy.py b/Combine/Checks/RunBiasStudy.py
@@ -0,0 +1,125 @@
+#!/usr/bin/env python
+
+from biasUtils import *
+
+from optparse import OptionParser
+parser = OptionParser()
+parser.add_option("-d","--datacard",default="Datacard.root")
+parser.add_option("-w","--workspace",default="w")
+parser.add_option("-t","--toys",action="store_true", default=False)
+parser.add_option("-n","--nToys",default=1000,type="int")
+parser.add_option("-f","--fits",action="store_true", default=False)
+parser.add_option("-p","--plots",action="store_true", default=False)
+parser.add_option("-e","--expectSignal",default=1.,type="float")
+parser.add_option("-m","--mH",default=125.,type="float")
+parser.add_option("-c","--combineOptions",default="")
+parser.add_option("-s","--seed",default=-1,type="int")
+parser.add_option("--dryRun",action="store_true", default=False)
+parser.add_option("--poi",default="r")
+parser.add_option("--split",default=500,type="int")
+parser.add_option("--selectFunction",default=None)
+parser.add_option("--gaussianFit",action="store_true", default=False)
+(opts,args) = parser.parse_args()
+print
+if opts.nToys>opts.split and not opts.nToys%opts.split==0: raise RuntimeError('The number of toys %g needs to be smaller than or divisible by the split number %g'%(opts.nToys, opts.split))
+
+import ROOT as r
+r.gROOT.SetBatch(True)
+r.gStyle.SetOptStat(2211)
+
+ws = r.TFile(opts.datacard).Get(opts.workspace)
+
+pdfs = rooArgSetToList(ws.allPdfs())
+multipdfName = None
+for pdf in pdfs:
+    if pdf.InheritsFrom("RooMultiPdf"):
+        if multipdfName is not None: raiseMultiError() 
+        multipdfName = pdf.GetName()
+        print 'Conduct bias study for multipdf called %s'%multipdfName
+multipdf = ws.pdf(multipdfName)
+print
+
+varlist = rooArgSetToList(ws.allCats())
+indexName = None
+for var in varlist:
+    if var.GetName().startswith('pdfindex'):
+        if indexName is not None: raiseMultiError()
+        indexName = var.GetName()
+        print 'Found index called %s'%indexName
+print
+
+from collections import OrderedDict as od
+indexNameMap = od()
+for ipdf in range(multipdf.getNumPdfs()):
+    if opts.selectFunction is not None:
+        if not multipdf.getPdf(ipdf).GetName().count(opts.selectFunction): continue
+    indexNameMap[ipdf] = multipdf.getPdf(ipdf).GetName()
+
+if opts.toys:
+    if not path.isdir('BiasToysn'): system('mkdir -p BiasToys')
+    toyCmdBase = 'combine -m %.4f -d %s -M GenerateOnly --expectSignal %.4f -s %g --saveToys %s '%(opts.mH, opts.datacard, opts.expectSignal, opts.seed, opts.combineOptions)
+    for ipdf,pdfName in indexNameMap.iteritems():
+        name = shortName(pdfName)
+        if opts.nToys > opts.split:
+            for isplit in range(opts.nToys//opts.split):
+                toyCmd = toyCmdBase + ' -t %g -n _%s_split%g --setParameters %s=%g --freezeParameters %s'%(opts.split, name, isplit, indexName, ipdf, indexName)
+                run(toyCmd, dry=opts.dryRun)
+                system('mv higgsCombine_%s* %s'%(name, toyName(name,split=isplit)))
+        else: 
+            toyCmd = toyCmdBase + ' -t %g -n _%s --setParameters %s=%g --freezeParameters %s'%(opts.nToys, name, indexName, ipdf, indexName)
+            run(toyCmd, dry=opts.dryRun)
+            system('mv higgsCombine_%s* %s'%(name, toyName(name)))
+print
+
+if opts.fits:
+    if not path.isdir('BiasFits'): system('mkdir -p BiasFits')
+    fitCmdBase = 'combine -m %.4f -d %s -M MultiDimFit -P %s --algo singles %s '%(opts.mH, opts.datacard, opts.poi, opts.combineOptions)
+    for ipdf,pdfName in indexNameMap.iteritems():
+        name = shortName(pdfName)
+        if opts.nToys > opts.split:
+            for isplit in range(opts.nToys//opts.split):
+                fitCmd = fitCmdBase + ' -t %g -n _%s_split%g --toysFile=%s'%(opts.split, name, isplit, toyName(name,split=isplit))
+                run(fitCmd, dry=opts.dryRun)
+                system('mv higgsCombine_%s* %s'%(name, fitName(name,split=isplit)))
+            run('hadd %s BiasFits/*%s*split*.root'%(fitName(name),name), dry=opts.dryRun)
+        else:
+            fitCmd = fitCmdBase + ' -t %g -n _%s --toysFile=%s'%(opts.nToys, name, toyName(name))
+            run(fitCmd, dry=opts.dryRun)
+            system('mv higgsCombine_%s* %s'%(name, fitName(name)))
+
+if opts.plots:
+    if not path.isdir('BiasPlots'): system('mkdir -p BiasPlots')
+    for ipdf,pdfName in indexNameMap.iteritems():
+        name = shortName(pdfName)
+        tfile = r.TFile(fitName(name))
+        tree = tfile.Get('limit')
+        pullHist = r.TH1F('pullsForTruth_%s'%name, 'Pull distribution using the envelope to fit %s'%name, 80, -4., 4.)
+        pullHist.GetXaxis().SetTitle('Pull')
+        pullHist.GetYaxis().SetTitle('Entries')
+        for itoy in range(opts.nToys):
+            tree.GetEntry(3*itoy)
+            if not getattr(tree,'quantileExpected')==-1: 
+                raiseFailError(itoy,True) 
+                continue
+            bf = getattr(tree, 'r')
+            tree.GetEntry(3*itoy+1)
+            if not abs(getattr(tree,'quantileExpected')--0.32)<0.001: 
+                raiseFailError(itoy,True) 
+                continue
+            lo = getattr(tree, 'r')
+            tree.GetEntry(3*itoy+2)
+            if not abs(getattr(tree,'quantileExpected')-0.32)<0.001: 
+                raiseFailError(itoy,True) 
+                continue
+            hi = getattr(tree, 'r')
+            diff = bf - opts.expectSignal
+            unc = 0.5 * (hi-lo)
+            if unc > 0.: 
+                pullHist.Fill(diff/unc)
+        canv = r.TCanvas()
+        pullHist.Draw()
+        if opts.gaussianFit:
+           r.gStyle.SetOptFit(111)
+           pullHist.Fit('gaus')
+        canv.SaveAs('%s.pdf'%plotName(name))
+        canv.SaveAs('%s.png'%plotName(name))
diff --git a/Combine/Checks/biasUtils.py b/Combine/Checks/biasUtils.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python
+
+def rooArgSetToList(argset): ## taken from Andrea Marini's great repo here: https://github.com/amarini/rfwsutils/blob/master/wsutils.py#L300-L313
+    """creates a python list with the contents of argset (which should be a RooArgSet)"""
+    it = argset.createIterator()
+
+    retval = []
+    while True:
+        obj = it.Next()
+
+        if obj == None:
+            break
+
+        retval.append(obj)
+
+    return retval
+
+def raiseMultiError(lax=False):
+    raise RuntimeError('Found more than one multipdf here - please create a workspace with just one for these bias studies. You can use "combineCards.py Datacard.txt --ic cat_name" for this)')
+
+def raiseFailError(itoy, lax=False):
+    text = 'some fits have failed, wrong quantile for toy number %g'%itoy
+    if not lax: raise RuntimeError('ERROR %s'%text)
+    else: print 'WARNING %s'%text
+
+def shortName(name):
+    return name.split('_')[-1]
+
+def toyName(name, split=None):
+    retval = 'BiasToys/biasStudy_%s_toys.root'%name
+    if split is not None: 
+        split = int(split)
+        retval = retval.replace(name,'%s_split%g'%(name,split))
+    return retval
+
+def fitName(name, split=None):
+    retval = 'BiasFits/biasStudy_%s_fits.root'%name
+    if split is not None: 
+        split = int(split)
+        retval = retval.replace(name,'%s_split%g'%(name,split))
+    return retval
+
+def plotName(name):
+    return 'BiasPlots/biasStudy_%s_pulls'%name
+
+def run(cmd, dry=False):
+   print cmd
+   if not dry: system(cmd)