
Merge pull request #267 from CoffeaTeam/btagsf
Introduce dedicated b-tagging scale factor class
nsmith- authored Feb 16, 2020
2 parents 6303f29 + 4c71d55 commit 7e1ccec
Showing 9 changed files with 349 additions and 5 deletions.
10 changes: 10 additions & 0 deletions coffea/btag_tools/__init__.py
@@ -0,0 +1,10 @@
"""BTag tools: CMS analysis-level b-tagging corrections and uncertainties
These classes provide computation of CMS b-tagging and mistagging
corrections and uncertainties on columnar data.
"""
from .btagscalefactor import BTagScaleFactor

__all__ = [
'BTagScaleFactor',
]
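
For orientation, a minimal usage sketch of the new public API (illustrative only, not part of the diff; the CSV path refers to the test fixture exercised further down in tests/test_btag.py):

from coffea.btag_tools import BTagScaleFactor

# Parse a BTV-format CSV (plain or gzip-compressed) for the MEDIUM working point
sf = BTagScaleFactor('tests/samples/testBTagSF.btag.csv', 'medium')
print(sf.discriminator)  # discriminator name parsed from the CSV header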
222 changes: 222 additions & 0 deletions coffea/btag_tools/btagscalefactor.py
@@ -0,0 +1,222 @@
from threading import Lock
import pandas
import numpy
import numba


class BTagScaleFactor:
    '''A class holding one complete BTag scale factor for a given working point

    Parameters
    ----------
    filename : str
        The BTag-formatted CSV file to read (accepts .csv, .csv.gz, etc.)
        See pandas read_csv for all supported compressions.
    workingpoint : str or int
        The working point, one of LOOSE, MEDIUM, TIGHT, or RESHAPE (0-3, respectively)
    methods : str, optional
        The scale factor derivation method to use for each flavor, 'b,c,light' respectively.
        Defaults to 'comb,comb,incl'
    keep_df : bool, optional
        If set true, keep the parsed dataframe as an attribute (.df) for later inspection
    '''
    LOOSE, MEDIUM, TIGHT, RESHAPE = range(4)
    FLAV_B, FLAV_C, FLAV_UDSG = range(3)
    _formulaLock = Lock()
    _formulaCache = {}
    _btvflavor = numpy.array([0, 1, 2, 3])
    _flavor = numpy.array([0, 4, 5, 6])
    _wpString = {'loose': LOOSE, 'medium': MEDIUM, 'tight': TIGHT}
    _expectedColumns = [
        'OperatingPoint', 'measurementType', 'sysType', 'jetFlavor', 'etaMin',
        'etaMax', 'ptMin', 'ptMax', 'discrMin', 'discrMax', 'formula'
    ]

    @classmethod
    def readcsv(cls, filename):
        '''Reads a BTag-formatted CSV file into a pandas dataframe

        This function also merges the bin min and max into a tuple representing the bin

        Parameters
        ----------
        filename : str
            The file to open

        Returns
        -------
        df : pandas.DataFrame
            A dataframe containing all info in the file
        discriminator : str
            The name of the discriminator the correction map is for
        '''
        df = pandas.read_csv(filename, skipinitialspace=True)
        discriminator = df.columns[0].split(';')[0]

        def cleanup(colname):
            if ';' in colname:
                _, colname = colname.split(';')
            return colname.strip()

        df.rename(columns=cleanup, inplace=True)
        if not list(df.columns) == BTagScaleFactor._expectedColumns:
            raise RuntimeError('Columns in BTag scale factor file %s are not as expected' % filename)
        for var in ['eta', 'pt', 'discr']:
            df[var + 'Bin'] = list(zip(df[var + 'Min'], df[var + 'Max']))
            del df[var + 'Min']
            del df[var + 'Max']
        return df, discriminator

    def __init__(self, filename, workingpoint, methods='comb,comb,incl', keep_df=False):
        if workingpoint not in [0, 1, 2, 3]:
            try:
                workingpoint = BTagScaleFactor._wpString[workingpoint.lower()]
            except (KeyError, AttributeError):
                raise ValueError('Unrecognized working point')
        methods = methods.split(',')
        self.workingpoint = workingpoint
        df, self.discriminator = BTagScaleFactor.readcsv(filename)
        cut = (df['jetFlavor'] == self.FLAV_B) & (df['measurementType'] == methods[0])
        if len(methods) > 1:
            cut |= (df['jetFlavor'] == self.FLAV_C) & (df['measurementType'] == methods[1])
        if len(methods) > 2:
            cut |= (df['jetFlavor'] == self.FLAV_UDSG) & (df['measurementType'] == methods[2])
        cut &= df['OperatingPoint'] == workingpoint
        df = df[cut]
        mavailable = list(df['measurementType'].unique())
        if not all(m in mavailable for m in methods):
            raise ValueError('Unrecognized jet correction method, available: %r' % mavailable)
        df = df.set_index(['sysType', 'jetFlavor', 'etaBin', 'ptBin', 'discrBin']).sort_index()
        if keep_df:
            self.df = df
        self._corrections = {}
        for syst in list(df.index.levels[0]):
            corr = df.loc[syst]
            allbins = list(corr.index)
            # NOTE: here we force the class to assume abs(eta) based on lack of examples where signed eta is used
            edges_eta = numpy.array(sorted(set(abs(x) for tup in corr.index.levels[1] for x in tup)))
            edges_pt = numpy.array(sorted(set(x for tup in corr.index.levels[2] for x in tup)))
            edges_discr = numpy.array(sorted(set(x for tup in corr.index.levels[3] for x in tup)))
            alledges = numpy.meshgrid(self._btvflavor[:-1], edges_eta[:-1], edges_pt[:-1], edges_discr[:-1], indexing='ij')
            mapping = numpy.full(alledges[0].shape, -1)

            def findbin(btvflavor, eta, pt, discr):
                for i, (fbin, ebin, pbin, dbin) in enumerate(allbins):
                    if btvflavor == fbin and ebin[0] <= eta < ebin[1] and pbin[0] <= pt < pbin[1] and dbin[0] <= discr < dbin[1]:
                        return i
                return -1

            for idx, _ in numpy.ndenumerate(mapping):
                btvflavor, eta, pt, discr = (x[idx] for x in alledges)
                mapping[idx] = findbin(btvflavor, eta, pt, discr)

            self._corrections[syst] = (
                edges_eta,
                edges_pt,
                edges_discr,
                mapping,
                numpy.array(corr['formula']),
            )
        self._compiled = {}

    def __getstate__(self):
        state = dict(self.__dict__)
        state.pop('_compiled')
        return state

    def __setstate__(self, state):
        self.__dict__ = state
        self._compiled = {}

    @classmethod
    def _compile(cls, formula):
        with BTagScaleFactor._formulaLock:
            try:
                return BTagScaleFactor._formulaCache[formula]
            except KeyError:
                if 'x' in formula:
                    feval = eval('lambda x: ' + formula, {'log': numpy.log, 'sqrt': numpy.sqrt})
                    out = numba.vectorize([
                        numba.float32(numba.float32),
                        numba.float64(numba.float64),
                    ])(feval)
                else:
                    val = eval(formula, {'log': numpy.log, 'sqrt': numpy.sqrt})

                    def duck(_, out, where):
                        out[where] = val

                    out = duck
                BTagScaleFactor._formulaCache[formula] = out
                return out

    def _lookup(self, axis, values):
        if len(axis) == 2:
            return numpy.zeros(shape=values.shape, dtype=numpy.uint)
        return numpy.clip(numpy.searchsorted(axis, values, side='right') - 1, 0, len(axis) - 2)

    def eval(self, systematic, flavor, abseta, pt, discr=None, ignore_missing=False):
        '''Evaluate this scale factor as a function of jet properties

        Parameters
        ----------
        systematic : str
            Which systematic to evaluate. Nominal correction is 'central', the options depend
            on the scale factor and method
        flavor : numpy.ndarray or awkward.Array
            The generated jet hadron flavor, following the enumeration:
            0: uds quark or gluon, 4: charm quark, 5: bottom quark
        abseta : numpy.ndarray or awkward.Array
            The absolute value of the jet pseudorapidity
        pt : numpy.ndarray or awkward.Array
            The jet transverse momentum
        discr : numpy.ndarray or awkward.Array, optional
            The jet tagging discriminant value (default None), optional for all scale factors except
            the reshaping scale factor
        ignore_missing : bool, optional
            If set true, any values that have no correction will return 1. instead of throwing
            an exception. Out-of-bounds values are always clipped to the nearest bin.

        Returns
        -------
        out : numpy.ndarray or awkward.Array
            An array with shape matching ``pt``, containing the per-jet scale factor
        '''
        if self.workingpoint == BTagScaleFactor.RESHAPE and discr is None:
            raise ValueError('RESHAPE scale factor requires a discriminant array')
        try:
            functions = self._compiled[systematic]
        except KeyError:
            functions = [BTagScaleFactor._compile(f) for f in self._corrections[systematic][-1]]
            self._compiled[systematic] = functions
        try:
            flavor.counts
            jin, flavor = flavor, flavor.flatten()
            abseta = abseta.flatten()
            pt = pt.flatten()
            discr = discr.flatten() if discr is not None else None
        except AttributeError:
            jin = None
        corr = self._corrections[systematic]
        idx = (
            2 - self._lookup(self._flavor, flavor),  # transform to btv definition
            self._lookup(corr[0], abseta),
            self._lookup(corr[1], pt),
            self._lookup(corr[2], discr) if discr is not None else 0,
        )
        mapidx = corr[3][idx]
        out = numpy.ones(mapidx.shape, dtype=pt.dtype)
        for ifunc in numpy.unique(mapidx):
            if ifunc < 0 and not ignore_missing:
                raise ValueError('No correction was available for some items')
            func = functions[ifunc]
            if self.workingpoint == BTagScaleFactor.RESHAPE:
                var = numpy.clip(discr, corr[2][0], corr[2][-1])
            else:
                var = numpy.clip(pt, corr[1][0], corr[1][-1])
            func(var, out=out, where=(mapidx == ifunc))

        if jin is not None:
            out = jin.copy(content=out)
        return out

    def __call__(self, systematic, flavor, abseta, pt, discr=None, ignore_missing=False):
        return self.eval(systematic, flavor, abseta, pt, discr, ignore_missing)
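
A short evaluation sketch based on the eval docstring and the new test below; the input arrays are invented for illustration, and the systematic names available ('central', 'up', ...) depend on the CSV contents:

import numpy
from coffea.btag_tools import BTagScaleFactor

sf = BTagScaleFactor('tests/samples/testBTagSF.btag.csv', BTagScaleFactor.MEDIUM)
flavor = numpy.array([5, 4, 0, 5])          # hadron flavor: 5=b, 4=c, 0=udsg
abseta = numpy.array([0.3, 1.1, 2.0, 0.7])  # |eta| of each jet
pt = numpy.array([45.0, 80.0, 150.0, 32.0]) # jet pT in GeV

nominal = sf.eval('central', flavor, abseta, pt)  # per-jet scale factors
up = sf.eval('up', flavor, abseta, pt)            # systematic variation
# RESHAPE scale factors additionally require the per-jet discriminant array,
# otherwise eval raises ValueError.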
1 change: 1 addition & 0 deletions docs/source/reference.rst
@@ -20,6 +20,7 @@ and/or heavy dependencies. Below lists the packages available in the ``coffea``
coffea.nanoaod.methods
coffea.lookup_tools
coffea.jetmet_tools
coffea.btag_tools
coffea.lumi_tools
coffea.hist
coffea.util
9 changes: 5 additions & 4 deletions setup.py
@@ -68,15 +68,16 @@ def get_description():
'lz4',
'six',
'cloudpickle',
'mplhep>=0.0.16',
'mplhep>=0.0.16,!=0.0.37',
'packaging',
'ipywidgets'
'ipywidgets',
'pandas',
]
EXTRAS_REQUIRE = {}
if six.PY3:
pandas = ['pandas']
templates = ['jinja2']
EXTRAS_REQUIRE['spark'] = ['pyspark>=2.4.1', 'pyarrow>=0.10.0,!=0.14.0,<0.16'] + templates + pandas
EXTRAS_REQUIRE['spark'] = ['pyspark>=2.4.1', 'pyarrow>=0.10.0,!=0.14.0'] + templates
EXTRAS_REQUIRE['spark'] = ['pyspark>=2.4.1', 'pyarrow>=0.10.0,!=0.14.0,<0.16'] + templates
EXTRAS_REQUIRE['parsl'] = ['parsl>=0.7.2']
EXTRAS_REQUIRE['dask'] = ['dask>=2.6.0', 'distributed>=2.6.0', 'bokeh>=1.3.4', 'blosc']
if six.PY2:
Binary file not shown.
27 changes: 27 additions & 0 deletions tests/test_btag.py
@@ -0,0 +1,27 @@
import pytest
from coffea.btag_tools import BTagScaleFactor
from dummy_distributions import dummy_jagged_eta_pt
import numpy


def test_BTagScalefactor():
    sf1 = BTagScaleFactor('tests/samples/testBTagSF.btag.csv', 'medium')
    sf2 = BTagScaleFactor('tests/samples/DeepCSV_102XSF_V1.btag.csv.gz', BTagScaleFactor.RESHAPE, 'iterativefit')
    sf3 = BTagScaleFactor('tests/samples/DeepCSV_102XSF_V1.btag.csv.gz', BTagScaleFactor.TIGHT)
    sf4 = BTagScaleFactor('tests/samples/DeepCSV_2016LegacySF_V1_TuneCP5.btag.csv.gz', BTagScaleFactor.RESHAPE, 'iterativefit', keep_df=True)
    # import pdb; pdb.set_trace()

    counts, test_eta, test_pt = dummy_jagged_eta_pt()
    test_flavor = numpy.random.choice([0, 4, 5], size=len(test_eta))
    test_allb = numpy.ones_like(test_flavor) * 5
    test_discr = numpy.random.rand(len(test_eta))

    sf1.eval('central', test_flavor, test_eta, test_pt, test_discr)
    sf1.eval('up', test_flavor, test_eta, test_pt)
    sf2.eval('central', test_allb, test_eta, test_pt, test_discr)
    with pytest.raises(ValueError):
        sf2.eval('up', test_allb, test_eta, test_pt)
    sf3.eval('central', test_flavor, test_eta, test_pt, test_discr)
    sf3.eval('up', test_flavor, test_eta, test_pt)
    with pytest.raises(ValueError):
        sf4.eval('central', test_flavor, test_eta, test_pt, test_discr)
    sf4.eval('central', test_allb, test_eta, test_pt, test_discr)
2 changes: 1 addition & 1 deletion tests/test_lookup_tools.py
@@ -110,7 +110,7 @@ def test_btag_csv_scalefactors():
extractor = lookup_tools.extractor()
extractor.add_weight_sets(["testBTag * tests/samples/testBTagSF.btag.csv",
"* * tests/samples/DeepCSV_102XSF_V1.btag.csv.gz",
"* * tests/samples/DeepJet_102XSF_WP_V1.btag.csv.gz"
"* * tests/samples/DeepJet_102XSF_WP_V1.btag.csv.gz",
])
extractor.finalize()

5 changes: 5 additions & 0 deletions validation/btag/setup.sh
@@ -0,0 +1,5 @@
wget https://github.com/cms-sw/cmssw/raw/CMSSW_10_0_X/CondTools/BTau/test/BTagCalibrationStandalone.cpp
wget https://github.com/cms-sw/cmssw/raw/CMSSW_10_0_X/CondTools/BTau/test/BTagCalibrationStandalone.h
cp ../../tests/samples/DeepCSV_102XSF_V1.btag.csv.gz .
gzip -d DeepCSV_102XSF_V1.btag.csv.gz
cp ../../tests/samples/testBTagSF.btag.csv .
78 changes: 78 additions & 0 deletions validation/btag/validate.py
@@ -0,0 +1,78 @@
import ROOT
ROOT.gROOT.SetBatch(True)
ROOT.PyConfig.IgnoreCommandLineOptions = True
ROOT.gInterpreter.LoadFile('BTagCalibrationStandalone.cpp')

from functools import partial
import numpy
from coffea.btag_tools import BTagScaleFactor


def stdvec(l):
    out = ROOT.std.vector('string')()
    for item in l:
        out.push_back(item)
    return out


def makesf(btagReader):
    def btv_sf(syst, flavor, abseta, pt, discr=None):
        def wrap(flavor, abseta, pt, discr):
            if flavor == 5:
                btvflavor = ROOT.BTagEntry.FLAV_B
            elif flavor == 4:
                btvflavor = ROOT.BTagEntry.FLAV_C
            elif flavor == 0:
                btvflavor = ROOT.BTagEntry.FLAV_UDSG
            else:
                raise ValueError
            if discr is None:
                return btagReader.eval_auto_bounds(syst, btvflavor, abseta, pt)
            return btagReader.eval_auto_bounds(syst, btvflavor, abseta, pt, discr)
        return numpy.vectorize(wrap, otypes='d')(flavor, abseta, pt, discr)
    return btv_sf


def validate_btag(filename, btagtype, etamax):
    btagData = ROOT.BTagCalibration(btagtype, filename)
    npts = 10000
    flavor = numpy.full(npts, 5)
    abseta = numpy.random.uniform(0, etamax, size=npts)
    pt = numpy.random.exponential(50, size=npts) + numpy.random.exponential(20, size=npts)
    pt = numpy.maximum(20.1, pt)
    discr = numpy.random.rand(npts)

    coffea_sf = BTagScaleFactor(filename, BTagScaleFactor.RESHAPE, 'iterativefit', keep_df=True)
    btagReader = ROOT.BTagCalibrationReader(ROOT.BTagEntry.OP_RESHAPING, 'central', stdvec(['up_jes', 'down_jes']))
    btagReader.load(btagData, ROOT.BTagEntry.FLAV_B, 'iterativefit')
    btv_sf = makesf(btagReader)

    for syst in ['central', 'up_jes', 'down_jes']:
        csf = coffea_sf.eval(syst, flavor, abseta, pt, discr)
        bsf = btv_sf(syst, flavor, abseta, pt, discr)
        print(abs(csf - bsf).max())

    flavor = numpy.random.choice([0, 4, 5], size=npts)
    coffea_sf = BTagScaleFactor(filename, BTagScaleFactor.TIGHT, 'comb,mujets,incl', keep_df=True)
    btagReader = ROOT.BTagCalibrationReader(ROOT.BTagEntry.OP_TIGHT, 'central', stdvec(['up', 'down']))
    btagReader.load(btagData, ROOT.BTagEntry.FLAV_B, 'comb')
    btagReader.load(btagData, ROOT.BTagEntry.FLAV_C, 'mujets')
    btagReader.load(btagData, ROOT.BTagEntry.FLAV_UDSG, 'incl')
    btv_sf = makesf(btagReader)

    for syst in ['central', 'up', 'down']:
        csf = coffea_sf.eval(syst, flavor, abseta, pt, discr)
        bsf = btv_sf(syst, flavor, abseta, pt, discr)
        print(abs(csf - bsf).max())


validate_btag(
    filename='DeepCSV_102XSF_V1.btag.csv',
    btagtype='DeepCSV',
    etamax=2.5,
)
validate_btag(
    filename='testBTagSF.btag.csv',
    btagtype='CSVv2',
    etamax=2.4,
)
