pytorch-labs · xuzhao9 · Jan 22, 2025 · Jan 22, 2025 · Jan 22, 2025 · Jan 22, 2025
diff --git a/.ci/scribe/upload.py b/.ci/scribe/upload.py
diff --git a/.ci/tritonbench/run-benchmark.sh b/.ci/tritonbench/run-benchmark.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Run one Tritonbench benchmark in CI mode.
+#
+# Usage: run-benchmark.sh BENCHMARK_NAME
+#
+# Requires SETUP_SCRIPT in the environment. That file is sourced before the
+# benchmark starts, so anything it configures applies to the python call below.
+# The script's exit status is the exit status of benchmarks/BENCHMARK_NAME/run.py.
+set -x
+
+if [ -z "${SETUP_SCRIPT}" ]; then
+  echo "ERROR: SETUP_SCRIPT is not set" >&2
+  exit 1
+fi
+
+if [ -z "$1" ]; then
+  echo "ERROR: BENCHMARK_NAME must be set as the first argument." >&2
+  exit 1
+fi
+
+# bash only prints a warning when the file passed to `.` cannot be read and
+# then keeps going, which would run the benchmark in an unconfigured
+# environment. Fail early with a clear message instead.
+if [ ! -r "${SETUP_SCRIPT}" ]; then
+  echo "ERROR: SETUP_SCRIPT '${SETUP_SCRIPT}' does not exist or is not readable" >&2
+  exit 1
+fi
+
+. "${SETUP_SCRIPT}"
+
+BENCHMARK_NAME=$1
+
+python "benchmarks/${BENCHMARK_NAME}/run.py" --ci
diff --git a/.ci/tritonbench/test-nightly.sh b/.ci/tritonbench/test-nightly.sh
diff --git a/.ci/upload/scribe.py b/.ci/upload/scribe.py
@@ -0,0 +1,128 @@
+"""
+Upload result json file to scribe.
+"""
+
+import argparse
+import json
+import os
+import requests
+import time
+
+from collections import defaultdict
+
+# Scribe category attached to every log entry uploaded by this script.
+CATEGORY_NAME = "perfpipe_pytorch_user_benchmarks"
+# Fields accepted in an uploaded message, grouped by the bucket they are sent
+# in. ScribeUploader._format_message coerces "int" fields with int(), "normal"
+# fields with str() and "float" fields with float(), and raises ValueError for
+# any field name that is not listed here.
+BENCHMARK_SCHEMA = {
+    # "time" is filled in by post_benchmark_results from time.time().
+    "int": ["time"],
+    "normal": [
+        "benchmark_date",
+        # "unix_user" and "submission_group_id" are set by
+        # post_benchmark_results, not read from the result file.
+        "unix_user",
+        "submission_group_id",
+        "cuda_version",
+        "device",
+        "conda_env",
+        "pytorch_commit",
+        "triton_commit",
+        "tritonbench_commit",
+        "triton_branch",
+        "pytorch_branch",
+        "tritonbench_branch",
+        "triton_commit_time",
+        "pytorch_commit_time",
+        "tritonbench_commit_time",
+        # NOTE(review): the github_*/job_name/runner_* names presumably mirror
+        # the keys of the result file's "github" section; confirm against the
+        # code that writes that section.
+        "github_action",
+        "github_actor",
+        "github_base_ref",
+        "github_ref",
+        "github_ref_protected",
+        "github_repository",
+        "github_run_attempt",
+        "github_run_id",
+        "github_run_number",
+        "github_workflow_ref",
+        "github_workflow_sha",
+        "job_name",
+        "runner_arch",
+        "runner_name",
+        "runner_os",
+        # One message is emitted per metric; "metric_id" carries its name.
+        "metric_id",
+    ],
+    # "metric_value" carries the value of the metric named by "metric_id".
+    "float": ["metric_value"],
+}
+
+class ScribeUploader:
+    """Format benchmark results against a field schema and post them to Scribe.
+
+    ``category`` is the Scribe category attached to every uploaded log entry.
+    ``schema`` maps the bucket names ``"normal"``, ``"int"`` and ``"float"`` to
+    the list of field names allowed in that bucket.
+    """
+
+    # Seconds to wait on the HTTP request before giving up. ``requests`` waits
+    # forever when no timeout is passed, which would stall the CI job until
+    # its own time limit is reached.
+    REQUEST_TIMEOUT_SECONDS = 60
+
+    def __init__(self, category, schema):
+        self.category = category
+        self.schema = schema
+
+    def _format_message(self, field_dict):
+        """Sort ``field_dict`` into the typed buckets declared by the schema.
+
+        Field names are lower-cased before they are looked up in the schema,
+        and entries whose value is ``None`` are dropped. Remaining values are
+        coerced with ``str``, ``int`` or ``float`` according to their bucket.
+
+        Returns:
+            A mapping of bucket name to ``{field: coerced value}``; only
+            buckets that received at least one field are present.
+
+        Raises:
+            ValueError: if ``"time"`` is missing from ``field_dict`` or a
+                field name is not part of the schema.
+        """
+        # Raised explicitly (not ``assert``) so the check survives ``python -O``.
+        if "time" not in field_dict:
+            raise ValueError("Missing required Scribe field 'time'")
+        message = defaultdict(dict)
+        for field, value in field_dict.items():
+            field = field.lower()
+            if value is None:
+                continue
+            if field in self.schema["normal"]:
+                message["normal"][field] = str(value)
+            elif field in self.schema["int"]:
+                message["int"][field] = int(value)
+            elif field in self.schema["float"]:
+                message["float"][field] = float(value)
+            else:
+                raise ValueError(
+                    "Field {} is not currently used, "
+                    "be intentional about adding new fields to schema".format(field)
+                )
+        return message
+
+    def _upload(self, messages: list):
+        """POST ``messages`` to the Scribe Graph API endpoint in one request.
+
+        The access token is read from the environment variable
+        ``TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN``. The response body is
+        printed before the status is checked so a failure reason is visible
+        in the job log.
+
+        Raises:
+            ValueError: if the access token environment variable is unset or
+                empty.
+            requests.HTTPError: if the endpoint answers with an error status.
+            requests.Timeout: if the endpoint does not answer within
+                ``REQUEST_TIMEOUT_SECONDS``.
+        """
+        access_token = os.environ.get(
+            "TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN"
+        )
+        if not access_token:
+            raise ValueError("Can't find access token from environment variable")
+        url = "https://graph.facebook.com/scribe_logs"
+        r = requests.post(
+            url,
+            data={
+                "access_token": access_token,
+                # Each message is JSON-encoded on its own and the list of log
+                # entries is JSON-encoded again as the "logs" form field.
+                "logs": json.dumps(
+                    [
+                        {
+                            "category": self.category,
+                            "message": json.dumps(message),
+                            "line_escape": False,
+                        }
+                        for message in messages
+                    ]
+                ),
+            },
+            timeout=self.REQUEST_TIMEOUT_SECONDS,
+        )
+        print(r.text)
+        r.raise_for_status()
+
+    def post_benchmark_results(self, bm_data):
+        """Upload one message per metric found in ``bm_data``.
+
+        ``bm_data`` must provide ``"name"``, ``"env"`` and ``"metrics"``. An
+        optional ``"github"`` mapping is merged in when present. Every message
+        shares the same ``time`` stamp, taken once when this method is called.
+        """
+        messages = []
+        base_message = {
+            "time": int(time.time()),
+        }
+        base_message.update(bm_data["env"])
+        # The "github" section is optional in a result file; treat a missing
+        # or null section as empty instead of failing with KeyError.
+        base_message.update(bm_data.get("github") or {})
+        base_message["submission_group_id"] = f"tritonbench.{bm_data['name']}"
+        base_message["unix_user"] = "tritonbench_ci"
+        for metric in bm_data["metrics"]:
+            msg = base_message.copy()
+            msg["metric_id"] = metric
+            msg["metric_value"] = bm_data["metrics"][metric]
+            formatted_msg = self._format_message(msg)
+            messages.append(formatted_msg)
+        self._upload(messages)
+
+if __name__ == "__main__":
+    # Command-line entry point: read the benchmark result JSON named by --json
+    # and upload every metric in it to Scribe.
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--json", required=True, type=argparse.FileType("r"), help="Userbenchmark json"
+    )
+    args = parser.parse_args()
+    # argparse.FileType hands back an already-open file. Close it as soon as
+    # the payload is parsed instead of holding the handle during the upload.
+    with args.json as json_file:
+        benchmark_data = json.load(json_file)
+    uploader = ScribeUploader(category=CATEGORY_NAME, schema=BENCHMARK_SCHEMA)
+    uploader.post_benchmark_results(benchmark_data)
diff --git a/.github/workflows/_linux-benchmark-h100.yml b/.github/workflows/_linux-benchmark-h100.yml
@@ -0,0 +1,62 @@
+# Reusable workflow that runs a single Tritonbench benchmark and uploads its
+# result. It is only reachable through `workflow_call`.
+name: linux-benchmark-h100
+on:
+  workflow_call:
+    secrets:
+      # Exported to the job as an environment variable of the same name, which
+      # is where the Scribe upload script reads it from.
+      TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN:
+        required: True
+        description: |
+          Tritonbench Scribe Graph Access Token
+    inputs:
+      # Selects benchmarks/<benchmark_name>/run.py and the
+      # .benchmarks/<benchmark_name> output directory.
+      benchmark_name:
+        required: True
+        type: string
+        description: |
+          Benchmark name
+      # Exported to the job as CONDA_ENV and used in the artifact name.
+      conda_env:
+        required: True
+        type: string
+        description: |
+          Conda environment to activate when testing Triton
+
+jobs:
+  # Runs one Tritonbench benchmark on the H100 runner, stores its output as a
+  # workflow artifact and uploads the newest result.json to Scribe.
+  linux-benchmark-h100:
+    if: github.repository_owner == 'pytorch-labs'
+    runs-on: [gcp-h100-runner]
+    timeout-minutes: 240
+    environment: docker-s3-upload
+    env:
+      SETUP_SCRIPT: "/workspace/setup_instance.sh"
+      CONDA_ENV: ${{ inputs.conda_env }}
+      JOB_NAME: tritonbench-h100-${{ inputs.conda_env }}-${{ inputs.benchmark_name }}
+      TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN }}
+    steps:
+      # Pull requests are tested from a fresh checkout; every other event uses
+      # the copy of the repository at /workspace/tritonbench.
+      - name: Checkout Tritonbench (if on pull_request)
+        if: github.event_name == 'pull_request'
+        uses: actions/checkout@v4
+        with:
+          submodules: recursive
+      - name: Tune Nvidia GPU
+        run: |
+          sudo nvidia-smi -pm 1
+          sudo ldconfig
+          nvidia-smi
+      - name: Benchmarking
+        env:
+          # Hand the input to the shell as a variable instead of splicing it
+          # into the script text, so its content is never parsed as shell code.
+          BENCHMARK_NAME: ${{ inputs.benchmark_name }}
+        run: |
+          if [ "${GITHUB_EVENT_NAME}" != "pull_request" ]; then
+            cd /workspace/tritonbench
+          fi
+          bash ./.ci/tritonbench/run-benchmark.sh "${BENCHMARK_NAME}"
+          # Stage the results under GITHUB_WORKSPACE for every event type:
+          # the artifact step below resolves its relative path against that
+          # directory, not against /workspace/tritonbench. Remove any stale
+          # copy first so `cp -r` does not nest into an existing directory.
+          rm -rf "${GITHUB_WORKSPACE}/benchmark-output"
+          cp -r "./.benchmarks/${BENCHMARK_NAME}" "${GITHUB_WORKSPACE}/benchmark-output"
+      - name: Upload result to GH Actions Artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: ${{ env.JOB_NAME }}
+          path: benchmark-output/
+      - name: Upload result to Scribe
+        run: |
+          if [ "${GITHUB_EVENT_NAME}" != "pull_request" ]; then
+            cd /workspace/tritonbench
+          fi
+          . "${SETUP_SCRIPT}"
+          # Reverse-sorted so the first path is the lexicographically last one.
+          latest_result_json=$(find "${GITHUB_WORKSPACE}/benchmark-output/" -name "result.json"  | sort -r | head -n 1)
+          if [ -z "${latest_result_json}" ]; then
+            echo "ERROR: no result.json found under ${GITHUB_WORKSPACE}/benchmark-output" >&2
+            exit 1
+          fi
+          python ./.ci/upload/scribe.py --json "${latest_result_json}"
diff --git a/.github/workflows/_linux-nightly-h100.yml b/.github/workflows/_linux-nightly-h100.yml
diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml
@@ -4,16 +4,20 @@ on:
     # Test nightly docker daily at 4 PM UTC
     - cron: '0 16 * * *'
   workflow_dispatch:
-  # TODO: remove this before commit!
   pull_request:
-
+    paths:
+      - benchmarks/nightly/**
+      - .github/workflows/_linux-benchmark-h100.yml
+      - .github/workflows/nightly.yml
 
 jobs:
   h100-triton-main-nightly-test:
-    uses: ./.github/workflows/_linux-nightly-h100.yml
+    uses: ./.github/workflows/_linux-benchmark-h100.yml
     with:
       conda_env: "triton-main"
-      job_name: "tritonbench-h100-triton-main-nightly"
+      benchmark_name: "nightly"
+    secrets:
+      TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN: ${{ secrets.TRITONBENCH_SCRIBE_GRAPHQL_ACCESS_TOKEN }}
 
 
 concurrency:

diff --git a/benchmarks/nightly/run.py b/benchmarks/nightly/run.py
@@ -73,10 +73,15 @@ def reduce(run_timestamp, output_dir, output_files, args):
     if args.ci and "TRITONBENCH_TRITON_REPO_PATH" in os.environ:
         repo_locs["triton"] = os.environ.get("TRITONBENCH_TRITON_REPO_PATH", None)
         repo_locs["pytorch"] = os.environ.get("TRITONBENCH_PYTORCH_REPO_PATH", None)
-    aggregated_obj = {"env": get_run_env(run_timestamp, repo_locs), "metrics": {}}
+    aggregated_obj = {
+        "name": "nightly",
+        "env": get_run_env(run_timestamp, repo_locs),
+        "metrics": {},
+    }
     # Collecting GitHub environment variables when running in CI environment
     if args.ci:
         aggregated_obj["github"] = get_github_env()
+
     for result_json_file in output_files:
         with open(
             result_json_file,