From 9b2e1b162268a2e86fbdfc0d557fc2c10a96a27b Mon Sep 17 00:00:00 2001
From: Chan Jun Shern
Date: Mon, 25 Mar 2024 15:52:44 +0000
Subject: [PATCH] Log model and usage stats in `record.sampling` (#1449)

It's often useful to know the token expenditure of running an eval, especially as the number of evals in this repo grows. See this example [feature request](https://github.com/openai/evals/issues/1350); we also rely on such estimates e.g. [here](https://github.com/openai/evals/tree/main/evals/elsuite/bluff#token-estimates). Computing this manually is cumbersome, so this PR simply logs the [usage](https://platform.openai.com/docs/api-reference/chat/object#chat/object-usage) stats (token counts) returned with each API call in `record.sampling`. This makes it easy to sum up the token cost of an eval from the logfile of a run.

Here is an example of a resulting `sampling` log line after this change (we add the `data.model` and `data.usage` fields):

```json
{
  "run_id": "240103035835K2NWEEJC",
  "event_id": 1,
  "sample_id": "superficial-patterns.dev.8",
  "type": "sampling",
  "data": {
    "prompt": [
      {
        "role": "system",
        "content": "If the red key goes to the pink door, and the blue key goes to the green door, but you paint the green door to be the color pink, and the pink door to be the color red, and the red key yellow, based on the new colors of everything, which keys go to what doors?"
      }
    ],
    "sampled": [
      "Based on the new colors, the yellow key goes to the pink door (previously red), and the blue key goes to the red door (previously pink)."
    ],
    "model": "gpt-3.5-turbo-0613",  # NEW
    "usage": {                      # NEW
      "completion_tokens": 33,
      "prompt_tokens": 70,
      "total_tokens": 103
    }
  },
  "created_by": "",
  "created_at": "2024-01-03 03:58:37.466772+00:00"
}
```
---
 evals/cli/oaieval.py           | 29 +++++++++++++++++++++++++++++
 evals/completion_fns/openai.py | 14 ++++++++++++--
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/evals/cli/oaieval.py b/evals/cli/oaieval.py
index e9cd432c75..06c3c5f6e5 100644
--- a/evals/cli/oaieval.py
+++ b/evals/cli/oaieval.py
@@ -220,6 +220,7 @@ def to_number(x: str) -> Union[int, float, str]:
         **extra_eval_params,
     )
     result = eval.run(recorder)
+    add_token_usage_to_result(result, recorder)
     recorder.record_final_report(result)
 
     if not (args.dry_run or args.local_run):
@@ -258,6 +259,34 @@ def build_recorder(
     )
 
 
+def add_token_usage_to_result(result: dict[str, Any], recorder: RecorderBase) -> None:
+    """
+    Add token usage from logged sampling events to the result dictionary from the recorder.
+ """ + usage_events = [] + sampling_events = recorder.get_events("sampling") + for event in sampling_events: + if "usage" in event.data: + usage_events.append(dict(event.data["usage"])) + logger.info(f"Found {len(usage_events)}/{len(sampling_events)} sampling events with usage data") + if usage_events: + # Sum up the usage of all samples (assumes the usage is the same for all samples) + total_usage = { + key: sum(u[key] if u[key] is not None else 0 for u in usage_events) + for key in usage_events[0] + } + total_usage_str = "\n".join(f"{key}: {value:,}" for key, value in total_usage.items()) + logger.info(f"Token usage from {len(usage_events)} sampling events:\n{total_usage_str}") + for key, value in total_usage.items(): + keyname = f"usage_{key}" + if keyname not in result: + result[keyname] = value + else: + logger.warning( + f"Usage key {keyname} already exists in result, not adding {keyname}" + ) + + def main() -> None: parser = get_parser() args = cast(OaiEvalArguments, parser.parse_args(sys.argv[1:])) diff --git a/evals/completion_fns/openai.py b/evals/completion_fns/openai.py index ed50818630..f3075f64bb 100644 --- a/evals/completion_fns/openai.py +++ b/evals/completion_fns/openai.py @@ -88,7 +88,12 @@ def __call__( **{**kwargs, **self.extra_options}, ) result = OpenAICompletionResult(raw_data=result, prompt=openai_create_prompt) - record_sampling(prompt=result.prompt, sampled=result.get_completions()) + record_sampling( + prompt=result.prompt, + sampled=result.get_completions(), + model=result.raw_data.model, + usage=result.raw_data.usage, + ) return result @@ -133,5 +138,10 @@ def __call__( **{**kwargs, **self.extra_options}, ) result = OpenAIChatCompletionResult(raw_data=result, prompt=openai_create_prompt) - record_sampling(prompt=result.prompt, sampled=result.get_completions()) + record_sampling( + prompt=result.prompt, + sampled=result.get_completions(), + model=result.raw_data.model, + usage=result.raw_data.usage, + ) return result