diff --git a/coffea/btag_tools/__init__.py b/coffea/btag_tools/__init__.py new file mode 100644 index 000000000..85eac8535 --- /dev/null +++ b/coffea/btag_tools/__init__.py @@ -0,0 +1,10 @@ +"""BTag tools: CMS analysis-level b-tagging corrections and uncertainties + +These classes provide computation of CMS b-tagging and mistagging +corrections and uncertainties on columnar data. +""" +from .btagscalefactor import BTagScaleFactor + +__all__ = [ + 'BTagScaleFactor', +] diff --git a/coffea/btag_tools/btagscalefactor.py b/coffea/btag_tools/btagscalefactor.py new file mode 100644 index 000000000..c93f0325f --- /dev/null +++ b/coffea/btag_tools/btagscalefactor.py @@ -0,0 +1,222 @@ +from threading import Lock +import pandas +import numpy +import numba + + +class BTagScaleFactor: + '''A class holding one complete BTag scale factor for a given working point + + Parameters + ---------- + filename : str + The BTag-formatted CSV file to read (accepts .csv, .csv.gz, etc.) + See pandas read_csv for all supported compressions. + workingpoint : str or int + The working point, one of LOOSE, MEDIUM, TIGHT, or RESHAPE (0-3, respectively) + methods : str, optional + The scale factor derivation method to use for each flavor, 'b,c,light' respectively. 
+ Defaults to 'comb,comb,incl' + keep_df : bool, optional + If set true, keep the parsed dataframe as an attribute (.df) for later inspection + ''' + LOOSE, MEDIUM, TIGHT, RESHAPE = range(4) + FLAV_B, FLAV_C, FLAV_UDSG = range(3) + _formulaLock = Lock() + _formulaCache = {} + _btvflavor = numpy.array([0, 1, 2, 3]) + _flavor = numpy.array([0, 4, 5, 6]) + _wpString = {'loose': LOOSE, 'medium': MEDIUM, 'tight': TIGHT} + _expectedColumns = [ + 'OperatingPoint', 'measurementType', 'sysType', 'jetFlavor', 'etaMin', + 'etaMax', 'ptMin', 'ptMax', 'discrMin', 'discrMax', 'formula' + ] + + @classmethod + def readcsv(cls, filename): + '''Reads a BTag-formatted CSV file into a pandas dataframe + + This function also merges the bin min and max into a tuple representing the bin + + Parameters + ---------- + filename : str + The file to open + + Returns + ------- + df : pandas.DataFrame + A dataframe containing all info in the file + discriminator : str + The name of the discriminator the correction map is for + ''' + df = pandas.read_csv(filename, skipinitialspace=True) + discriminator = df.columns[0].split(';')[0] + + def cleanup(colname): + if ';' in colname: + _, colname = colname.split(';') + return colname.strip() + df.rename(columns=cleanup, inplace=True) + if not list(df.columns) == BTagScaleFactor._expectedColumns: + raise RuntimeError('Columns in BTag scale factor file %s not as expected' % filename) + for var in ['eta', 'pt', 'discr']: + df[var + 'Bin'] = list(zip(df[var + 'Min'], df[var + 'Max'])) + del df[var + 'Min'] + del df[var + 'Max'] + return df, discriminator + + def __init__(self, filename, workingpoint, methods='comb,comb,incl', keep_df=False): + if workingpoint not in [0, 1, 2, 3]: + try: + workingpoint = BTagScaleFactor._wpString[workingpoint.lower()] + except (KeyError, AttributeError): + raise ValueError('Unrecognized working point') + methods = methods.split(',') + self.workingpoint = workingpoint + df, self.discriminator = BTagScaleFactor.readcsv(filename) 
+ cut = (df['jetFlavor'] == self.FLAV_B) & (df['measurementType'] == methods[0]) + if len(methods) > 1: + cut |= (df['jetFlavor'] == self.FLAV_C) & (df['measurementType'] == methods[1]) + if len(methods) > 2: + cut |= (df['jetFlavor'] == self.FLAV_UDSG) & (df['measurementType'] == methods[2]) + cut &= df['OperatingPoint'] == workingpoint + df = df[cut] + mavailable = list(df['measurementType'].unique()) + if not all(m in mavailable for m in methods): + raise ValueError('Unrecognized jet correction method, available: %r' % mavailable) + df = df.set_index(['sysType', 'jetFlavor', 'etaBin', 'ptBin', 'discrBin']).sort_index() + if keep_df: + self.df = df + self._corrections = {} + for syst in list(df.index.levels[0]): + corr = df.loc[syst] + allbins = list(corr.index) + # NOTE: here we force the class to assume abs(eta) based on lack of examples where signed eta is used + edges_eta = numpy.array(sorted(set(abs(x) for tup in corr.index.levels[1] for x in tup))) + edges_pt = numpy.array(sorted(set(x for tup in corr.index.levels[2] for x in tup))) + edges_discr = numpy.array(sorted(set(x for tup in corr.index.levels[3] for x in tup))) + alledges = numpy.meshgrid(self._btvflavor[:-1], edges_eta[:-1], edges_pt[:-1], edges_discr[:-1], indexing='ij') + mapping = numpy.full(alledges[0].shape, -1) + + def findbin(btvflavor, eta, pt, discr): + for i, (fbin, ebin, pbin, dbin) in enumerate(allbins): + if btvflavor == fbin and ebin[0] <= eta < ebin[1] and pbin[0] <= pt < pbin[1] and dbin[0] <= discr < dbin[1]: + return i + return -1 + + for idx, _ in numpy.ndenumerate(mapping): + btvflavor, eta, pt, discr = (x[idx] for x in alledges) + mapping[idx] = findbin(btvflavor, eta, pt, discr) + + self._corrections[syst] = ( + edges_eta, + edges_pt, + edges_discr, + mapping, + numpy.array(corr['formula']), + ) + self._compiled = {} + + def __getstate__(self): + state = dict(self.__dict__) + state.pop('_compiled') + return state + + def __setstate__(self, state): + self.__dict__ = state + 
self._compiled = {} + + @classmethod + def _compile(cls, formula): + with BTagScaleFactor._formulaLock: + try: + return BTagScaleFactor._formulaCache[formula] + except KeyError: + if 'x' in formula: + feval = eval('lambda x: ' + formula, {'log': numpy.log, 'sqrt': numpy.sqrt}) + out = numba.vectorize([ + numba.float32(numba.float32), + numba.float64(numba.float64), + ])(feval) + else: + val = eval(formula, {'log': numpy.log, 'sqrt': numpy.sqrt}) + + def duck(_, out, where): + out[where] = val + out = duck + BTagScaleFactor._formulaCache[formula] = out + return out + + def _lookup(self, axis, values): + if len(axis) == 2: + return numpy.zeros(shape=values.shape, dtype=numpy.uint) + return numpy.clip(numpy.searchsorted(axis, values, side='right') - 1, 0, len(axis) - 2) + + def eval(self, systematic, flavor, abseta, pt, discr=None, ignore_missing=False): + '''Evaluate this scale factor as a function of jet properties + + Parameters + ---------- + systematic : str + Which systematic to evaluate. Nominal correction is 'central', the options depend + on the scale factor and method + flavor : numpy.ndarray or awkward.Array + The generated jet hadron flavor, following the enumeration: + 0: uds quark or gluon, 4: charm quark, 5: bottom quark + abseta : numpy.ndarray or awkward.Array + The absolute value of the jet pseudorapidity + pt : numpy.ndarray or awkward.Array + The jet transverse momentum + discr : numpy.ndarray or awkward.Array, optional + The jet tagging discriminant value (default None), optional for all scale factors except + the reshaping scale factor + ignore_missing : bool, optional + If set true, any values that have no correction will return 1. instead of throwing + an exception. Out-of-bounds values are always clipped to the nearest bin. 
+ + Returns + ------- + out : numpy.ndarray or awkward.Array + An array with shape matching ``pt``, containing the per-jet scale factor + ''' + if self.workingpoint == BTagScaleFactor.RESHAPE and discr is None: + raise ValueError('RESHAPE scale factor requires a discriminant array') + try: + functions = self._compiled[systematic] + except KeyError: + functions = [BTagScaleFactor._compile(f) for f in self._corrections[systematic][-1]] + self._compiled[systematic] = functions + try: + flavor.counts + jin, flavor = flavor, flavor.flatten() + abseta = abseta.flatten() + pt = pt.flatten() + discr = discr.flatten() if discr is not None else None + except AttributeError: + jin = None + corr = self._corrections[systematic] + idx = ( + 2 - self._lookup(self._flavor, flavor), # transform to btv definition + self._lookup(corr[0], abseta), + self._lookup(corr[1], pt), + self._lookup(corr[2], discr) if discr is not None else 0, + ) + mapidx = corr[3][idx] + out = numpy.ones(mapidx.shape, dtype=pt.dtype) + if not ignore_missing and (mapidx < 0).any(): + raise ValueError('No correction was available for some items') + for ifunc in numpy.unique(mapidx[mapidx >= 0]): + func = functions[ifunc] + if self.workingpoint == BTagScaleFactor.RESHAPE: + var = numpy.clip(discr, corr[2][0], corr[2][-1]) + else: + var = numpy.clip(pt, corr[1][0], corr[1][-1]) + func(var, out=out, where=(mapidx == ifunc)) + + if jin is not None: + out = jin.copy(content=out) + return out + + def __call__(self, systematic, flavor, abseta, pt, discr=None, ignore_missing=False): + return self.eval(systematic, flavor, abseta, pt, discr, ignore_missing) diff --git a/docs/source/reference.rst b/docs/source/reference.rst index 9b117f05c..bdab12729 100644 --- a/docs/source/reference.rst +++ b/docs/source/reference.rst @@ -20,6 +20,7 @@ and/or heavy dependencies. 
Below lists the packages available in the ``coffea`` coffea.nanoaod.methods coffea.lookup_tools coffea.jetmet_tools + coffea.btag_tools coffea.lumi_tools coffea.hist coffea.util diff --git a/setup.py b/setup.py index dc71b44c7..f55798abe 100644 --- a/setup.py +++ b/setup.py @@ -68,15 +68,15 @@ def get_description(): 'lz4', 'six', 'cloudpickle', - 'mplhep>=0.0.16', + 'mplhep>=0.0.16,!=0.0.37', 'packaging', - 'ipywidgets' + 'ipywidgets', + 'pandas', ] EXTRAS_REQUIRE = {} if six.PY3: - pandas = ['pandas'] templates = ['jinja2'] - EXTRAS_REQUIRE['spark'] = ['pyspark>=2.4.1', 'pyarrow>=0.10.0,!=0.14.0,<0.16'] + templates + pandas + EXTRAS_REQUIRE['spark'] = ['pyspark>=2.4.1', 'pyarrow>=0.10.0,!=0.14.0,<0.16'] + templates EXTRAS_REQUIRE['parsl'] = ['parsl>=0.7.2'] EXTRAS_REQUIRE['dask'] = ['dask>=2.6.0', 'distributed>=2.6.0', 'bokeh>=1.3.4', 'blosc'] if six.PY2: diff --git a/tests/samples/DeepCSV_2016LegacySF_V1_TuneCP5.btag.csv.gz b/tests/samples/DeepCSV_2016LegacySF_V1_TuneCP5.btag.csv.gz new file mode 100644 index 000000000..9a152d18a Binary files /dev/null and b/tests/samples/DeepCSV_2016LegacySF_V1_TuneCP5.btag.csv.gz differ diff --git a/tests/test_btag.py b/tests/test_btag.py new file mode 100644 index 000000000..bad95c48b --- /dev/null +++ b/tests/test_btag.py @@ -0,0 +1,27 @@ +import pytest +from coffea.btag_tools import BTagScaleFactor +from dummy_distributions import dummy_jagged_eta_pt +import numpy + +def test_BTagScalefactor(): + sf1 = BTagScaleFactor('tests/samples/testBTagSF.btag.csv', 'medium') + sf2 = BTagScaleFactor('tests/samples/DeepCSV_102XSF_V1.btag.csv.gz', BTagScaleFactor.RESHAPE, 'iterativefit') + sf3 = BTagScaleFactor('tests/samples/DeepCSV_102XSF_V1.btag.csv.gz', BTagScaleFactor.TIGHT) + sf4 = BTagScaleFactor('tests/samples/DeepCSV_2016LegacySF_V1_TuneCP5.btag.csv.gz', BTagScaleFactor.RESHAPE, 'iterativefit', keep_df=True) + # import pdb; pdb.set_trace() + + 
counts, test_eta, test_pt = dummy_jagged_eta_pt() + test_flavor = numpy.random.choice([0, 4, 5], size=len(test_eta)) + test_allb = numpy.ones_like(test_flavor) * 5 + test_discr = numpy.random.rand(len(test_eta)) + + sf1.eval('central', test_flavor, test_eta, test_pt, test_discr) + sf1.eval('up', test_flavor, test_eta, test_pt) + sf2.eval('central', test_allb, test_eta, test_pt, test_discr) + with pytest.raises(ValueError): + sf2.eval('up', test_allb, test_eta, test_pt) + sf3.eval('central', test_flavor, test_eta, test_pt, test_discr) + sf3.eval('up', test_flavor, test_eta, test_pt) + with pytest.raises(ValueError): + sf4.eval('central', test_flavor, test_eta, test_pt, test_discr) + sf4.eval('central', test_allb, test_eta, test_pt, test_discr) diff --git a/tests/test_lookup_tools.py b/tests/test_lookup_tools.py index 4056b77ce..176af2dd0 100644 --- a/tests/test_lookup_tools.py +++ b/tests/test_lookup_tools.py @@ -110,7 +110,7 @@ def test_btag_csv_scalefactors(): extractor = lookup_tools.extractor() extractor.add_weight_sets(["testBTag * tests/samples/testBTagSF.btag.csv", "* * tests/samples/DeepCSV_102XSF_V1.btag.csv.gz", - "* * tests/samples/DeepJet_102XSF_WP_V1.btag.csv.gz" + "* * tests/samples/DeepJet_102XSF_WP_V1.btag.csv.gz", ]) extractor.finalize() diff --git a/validation/btag/setup.sh b/validation/btag/setup.sh new file mode 100644 index 000000000..6e003cc14 --- /dev/null +++ b/validation/btag/setup.sh @@ -0,0 +1,5 @@ +wget https://github.com/cms-sw/cmssw/raw/CMSSW_10_0_X/CondTools/BTau/test/BTagCalibrationStandalone.cpp +wget https://github.com/cms-sw/cmssw/raw/CMSSW_10_0_X/CondTools/BTau/test/BTagCalibrationStandalone.h +cp ../../tests/samples/DeepCSV_102XSF_V1.btag.csv.gz . +gzip -d DeepCSV_102XSF_V1.btag.csv.gz +cp ../../tests/samples/testBTagSF.btag.csv . 
diff --git a/validation/btag/validate.py b/validation/btag/validate.py new file mode 100644 index 000000000..8e513f369 --- /dev/null +++ b/validation/btag/validate.py @@ -0,0 +1,78 @@ +import ROOT +ROOT.gROOT.SetBatch(True) +ROOT.PyConfig.IgnoreCommandLineOptions = True +ROOT.gInterpreter.LoadFile('BTagCalibrationStandalone.cpp') + +from functools import partial +import numpy +from coffea.btag_tools import BTagScaleFactor + + +def stdvec(l): + out = ROOT.std.vector('string')() + for item in l: + out.push_back(item) + return out + + +def makesf(btagReader): + def btv_sf(syst, flavor, abseta, pt, discr=None): + def wrap(flavor, abseta, pt, discr): + if flavor == 5: + btvflavor = ROOT.BTagEntry.FLAV_B + elif flavor == 4: + btvflavor = ROOT.BTagEntry.FLAV_C + elif flavor == 0: + btvflavor = ROOT.BTagEntry.FLAV_UDSG + else: + raise ValueError + if discr is None: + return btagReader.eval_auto_bounds(syst, btvflavor, abseta, pt) + return btagReader.eval_auto_bounds(syst, btvflavor, abseta, pt, discr) + return numpy.vectorize(wrap, otypes='d')(flavor, abseta, pt, discr) + return btv_sf + + +def validate_btag(filename, btagtype, etamax): + btagData = ROOT.BTagCalibration(btagtype, filename) + npts = 10000 + flavor = numpy.full(npts, 5) + abseta = numpy.random.uniform(0, etamax, size=npts) + pt = numpy.random.exponential(50, size=npts) + numpy.random.exponential(20, size=npts) + pt = numpy.maximum(20.1, pt) + discr = numpy.random.rand(npts) + + coffea_sf = BTagScaleFactor(filename, BTagScaleFactor.RESHAPE, 'iterativefit', keep_df=True) + btagReader = ROOT.BTagCalibrationReader(ROOT.BTagEntry.OP_RESHAPING, 'central', stdvec(['up_jes', 'down_jes'])) + btagReader.load(btagData, ROOT.BTagEntry.FLAV_B, 'iterativefit') + btv_sf = makesf(btagReader) + + for syst in ['central', 'up_jes', 'down_jes']: + csf = coffea_sf.eval(syst, flavor, abseta, pt, discr) + bsf = btv_sf(syst, flavor, abseta, pt, discr) + print(abs(csf - bsf).max()) + + flavor = numpy.random.choice([0, 4, 5], 
size=npts) + coffea_sf = BTagScaleFactor(filename, BTagScaleFactor.TIGHT, 'comb,mujets,incl', keep_df=True) + btagReader = ROOT.BTagCalibrationReader(ROOT.BTagEntry.OP_TIGHT, 'central', stdvec(['up', 'down'])) + btagReader.load(btagData, ROOT.BTagEntry.FLAV_B, 'comb') + btagReader.load(btagData, ROOT.BTagEntry.FLAV_C, 'mujets') + btagReader.load(btagData, ROOT.BTagEntry.FLAV_UDSG, 'incl') + btv_sf = makesf(btagReader) + + for syst in ['central', 'up', 'down']: + csf = coffea_sf.eval(syst, flavor, abseta, pt, discr) + bsf = btv_sf(syst, flavor, abseta, pt, discr) + print(abs(csf - bsf).max()) + + +validate_btag( + filename='DeepCSV_102XSF_V1.btag.csv', + btagtype='DeepCSV', + etamax=2.5, +) +validate_btag( + filename='testBTagSF.btag.csv', + btagtype='CSVv2', + etamax=2.4, +)