Skip to content

Commit

Permalink
change to single process for bootstrap_stderr
Browse files Browse the repository at this point in the history
Signed-off-by: zhuyuhua-v <[email protected]>
  • Loading branch information
zhuyuhua-v committed Dec 23, 2024
1 parent b86aa21 commit 3c658cd
Showing 1 changed file with 9 additions and 26 deletions.
35 changes: 9 additions & 26 deletions lm_eval/api/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -457,32 +457,15 @@ def __call__(self, v):


def bootstrap_stderr(f, xs, iters):
    """Estimate the standard error of ``f`` over ``xs`` by bootstrapping.

    All work happens in the calling process (no ``multiprocessing`` pool).
    Resamples are produced in batches of at most 1000 by
    ``_bootstrap_internal`` and the pooled values are reduced with
    ``sample_stddev``.

    Args:
        f: Callable forwarded to ``_bootstrap_internal``; its ``__name__``
            (or ``repr`` when it has none) is printed for progress output.
        xs: Data forwarded to ``_bootstrap_internal`` together with the
            batch index.
        iters: Total number of bootstrap values requested.

    Returns:
        ``sample_stddev`` applied to every bootstrap value collected.

    Raises:
        ZeroDivisionError: If ``iters`` is 0, because the batch size is then
            0 as well.
    """
    # NOTE: this gives a biased estimate of the stderr (i.e. with the mean, it
    # gives something equivalent to stderr calculated without Bessel's
    # correction in the stddev). The right correction to make the bootstrap
    # unbiased is not known here; multiplying by sqrt(n / (n - 1)) was
    # considered but would be ad hoc and is not proven to be unbiased.
    # It should rarely matter because the samples are usually large.
    from tqdm import tqdm

    chunk_size = min(1000, iters)
    full_chunks, remainder = divmod(iters, chunk_size)

    # One entry per batch. The trailing partial batch keeps the total equal to
    # ``iters`` when it is not a multiple of the batch size (plain
    # ``iters // chunk_size`` would silently drop those resamples).
    batch_sizes = [chunk_size] * full_chunks
    if remainder:
        batch_sizes.append(remainder)

    print("bootstrapping for stddev:", getattr(f, "__name__", repr(f)))

    # Build the full-size worker once instead of once per loop iteration.
    full_worker = _bootstrap_internal(f, chunk_size)

    res = []
    for i, size in enumerate(tqdm(batch_sizes)):
        # assumes the second argument of _bootstrap_internal is the number of
        # resamples produced per call and ``i`` distinguishes batches
        # -- TODO confirm against its definition.
        worker = full_worker if size == chunk_size else _bootstrap_internal(f, size)
        # sample w replacement
        res.extend(worker((i, xs)))
    return sample_stddev(res)


def stderr_for_metric(metric, bootstrap_iters: int):
Expand Down

0 comments on commit 3c658cd

Please sign in to comment.