Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ci: add test for Huggingface Accelerate #1257

Merged
merged 3 commits into from
Jan 15, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
74 changes: 74 additions & 0 deletions .github/actions/print-environment/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Composite action that appends a Markdown "Environment" table to the job's
# $GITHUB_STEP_SUMMARY: OS/kernel/python versions, selected OS (dpkg) and
# pip package versions, GPU render-node vendor/device ids and key env vars.
name: print-environment

inputs:
  # NOTE: composite-action inputs support only description/required/default
  # (plus deprecationMessage) — the previous `type: string` keys were not
  # valid action metadata and have been removed.
  conda:
    required: false
    default: ''
    description: "Conda environment to use (must exist beforehand)"
  os_packages:
    required: false
    default: ''
    description: "Space separated list of OS packages to evaluate"
  pip_packages:
    required: false
    default: ''
    description: "Space separated list of PyPI packages to evaluate"

runs:
  using: composite
  steps:
    - name: Print environment
      shell: bash
      run: |
        if [ -n "${{ inputs.conda }}" ]; then
          source activate ${{ inputs.conda }}
        fi
        {
          echo "### Environment"
          echo "| | |"
          echo "| --- | --- |"
          echo "| jobs.$GITHUB_JOB.versions.os | $(source /etc/os-release && echo $VERSION_ID) |"
          echo "| jobs.$GITHUB_JOB.versions.linux-kernel | $(uname -r) |"
          echo "| jobs.$GITHUB_JOB.versions.python | $(python --version | cut -f2 -d' ' || true) |"
          # Caller-supplied OS packages plus the Intel GPU stack we always report.
          packages="${{ inputs.os_packages }}"
          packages+=" \
            level-zero \
            libigc1 \
            libigc2 \
            libze1 \
            libze-intel-gpu1 \
            intel-i915-dkms \
            intel-level-zero-gpu \
            intel-opencl-icd"
          for package in $packages; do
            # `|| true`: absent packages print an empty version instead of failing.
            package_version="$(dpkg -l | grep "$package" | grep ii | head -1 | sed 's/ */ /g' | cut -f3 -d" " || true)"
            echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |"
          done
          # Caller-supplied pip packages plus the torch stack we always report.
          packages="${{ inputs.pip_packages }}"
          packages+=" \
            numpy \
            torch \
            torchaudio \
            torchvision"
          for package in $packages; do
            package_version=$(python -c "import $package; print($package.__version__)" || true)
            echo "| jobs.$GITHUB_JOB.versions.$package | $package_version |"
          done
          # printing annotations for GPU cards
          var="[$(cat /sys/class/drm/render*/device/vendor || true)]"
          echo "| jobs.$GITHUB_JOB.drm.render_nodes_vendor_ids | $(echo $var | sed 's/ /,/g') |"
          var="[$(cat /sys/class/drm/render*/device/device || true)]"
          echo "| jobs.$GITHUB_JOB.drm.render_nodes_device_ids | $(echo $var | sed 's/ /,/g') |"
          var=$(python -c "import torch; print(torch.version.xpu)" || true)
          echo "| jobs.$GITHUB_JOB.torch.version.xpu | $var |"
          var=$(python -c "import torch; print(torch.xpu.device_count())" || true)
          echo "| jobs.$GITHUB_JOB.torch.xpu.device_count | $var |"
          # printing annotations with key environment variables
          echo "| jobs.$GITHUB_JOB.env.ZE_AFFINITY_MASK | $ZE_AFFINITY_MASK |"
          echo "| jobs.$GITHUB_JOB.env.NEOReadDebugKeys | $NEOReadDebugKeys |"
          echo "| jobs.$GITHUB_JOB.env.PYTORCH_ENABLE_XPU_FALLBACK | $PYTORCH_ENABLE_XPU_FALLBACK |"
          echo "| jobs.$GITHUB_JOB.env.PYTORCH_DEBUG_XPU_FALLBACK | $PYTORCH_DEBUG_XPU_FALLBACK |"
        } >> $GITHUB_STEP_SUMMARY
96 changes: 96 additions & 0 deletions .github/scripts/parse-junitxml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import argparse
import sys

from junitparser import JUnitXml, Error, Failure, Skipped
from pathlib import Path

# CLI: one or more junit XML report files, plus flags selecting which tables
# to print.
parser = argparse.ArgumentParser()
parser.add_argument('junitxml', nargs='+')
parser.add_argument('--stats', action='store_true', help='Print results summary table')
parser.add_argument('--errors', action='store_true', help='Print errored tests in detailed results table')
parser.add_argument('--failed', action='store_true', help='Print failed tests in detailed results table')
parser.add_argument('--passed', action='store_true', help='Print passed tests in detailed results table')
parser.add_argument('--skipped', action='store_true', help='Print skipped tests in detailed results table')
args = parser.parse_args()

# With no table flags given, default to everything except errors.
if not any([args.stats, args.errors, args.failed, args.passed, args.skipped]):
    args.stats = args.passed = args.failed = args.skipped = True

# Parse every report up front; JUnitXml.fromfile raises if a file is missing
# or malformed.
xmls = [ JUnitXml.fromfile(f) for f in args.junitxml ]

# Only prefix rows with group/suite columns when there is more than one.
need_group = len(args.junitxml) > 1
need_suite = any(len(x) > 1 for x in xmls)

def get_test_group(xml_name):
    """Return the group label for a report file: its name without extension."""
    report = Path(xml_name)
    return report.stem

def get_rn(xml_name, suite_name):
    """Build the leading row cells identifying the test group and/or suite.

    Bug fix: the original ignored both parameters and instead read the
    enclosing-scope loop variables (``idx``/``suite``) at call time; it now
    uses its arguments. Call sites already pass the equivalent values, so
    behavior is unchanged.
    """
    rn = {}
    if need_group:
        rn |= { "Test group": get_test_group(xml_name) }
    if need_suite:
        rn |= { "Test suite": suite_name }
    return rn

def print_md_row(row, print_header):
    """Print one Markdown table row; emit the header + separator first when asked.

    Separator dashes match each header's width so the raw text stays aligned.
    """
    if print_header:
        print("| " + " | ".join(str(key) for key in row) + " |")
        print("| " + " | ".join("-" * len(key) for key in row) + " |")
    print("| " + " | ".join(str(value) for value in row.values()) + " |")

if args.stats:
    # Summary table: one row per test suite (per input file), with the header
    # printed only once across all suites.
    print_header = True
    for idx, xml in enumerate(xmls):
        for suite in xml:
            # junitparser tracks errors/failures/skipped separately; the
            # remainder of suite.tests is reported as passed.
            passed = suite.tests - suite.errors - suite.failures - suite.skipped
            rn = get_rn(args.junitxml[idx], suite.name)
            rn |= {
                "Errors": suite.errors,
                "Failed": suite.failures,
                "Passed": passed,
                "Skipped": suite.skipped,
                "Tests": suite.tests,
                "Time(s)": suite.time
            }
            print_md_row(rn, print_header)
            print_header = False

# Detailed table: one row per test case, filtered by the requested statuses.
# Bug fix: args.errors is now included in the gate — previously `--errors`
# alone printed no detailed table at all.
if any([args.errors, args.failed, args.passed, args.skipped]):
    if args.stats:
        print("")  # blank line separating the two tables
    print_header = True
    for idx, xml in enumerate(xmls):
        for suite in xml:
            base_row = get_rn(args.junitxml[idx], suite.name)
            for case in suite:
                # A case with no result entries counts as passed.
                output, result, message = (args.passed, "passed", "")
                if case.result:
                    # Keep only the first message line for the table; guard
                    # against a missing/empty message (the original indexed
                    # splitlines()[0] unconditionally and could raise).
                    message_lines = (case.result[0].message or "").splitlines()
                    message = message_lines[0] if message_lines else ""
                    if isinstance(case.result[0], Error):
                        output, result = (args.errors, "error")
                    elif isinstance(case.result[0], Skipped):
                        output, result = (args.skipped, "skipped")
                    elif isinstance(case.result[0], Failure):
                        output, result = (args.failed, "failed")
                    else:
                        print("fatal: unknown result type", file=sys.stderr)
                        sys.exit(1)
                if output:
                    # Copy the prefix row: dict `|=` mutates in place, and the
                    # original aliased the shared prefix dict, polluting it
                    # across iterations.
                    rn = dict(base_row)
                    rn |= {
                        # Be warned on pytest-pspec: if installed, pytest will
                        # use descriptions of classes and methods as test node
                        # ids. These descriptions might contain spaces and line
                        # breaks not suitable for simple Markdown tables.
                        "Class name": ' '.join(case.classname.split()),
                        "Test name": ' '.join(case.name.split()),
                        "Status": result,
                        "Time(s)": case.time,
                        "Message": message
                    }
                    print_md_row(rn, print_header)
                    print_header = False
154 changes: 154 additions & 0 deletions .github/workflows/_linux_accelerate.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# Runs the Huggingface Accelerate test suite against stock XPU PyTorch
# (nightly wheels) inside a per-runner Conda environment, and publishes
# result tables and environment info to the step summary.
#
# NOTE(review): scrape/review-UI artifact lines ("... marked this conversation
# as resolved." / "Show resolved Hide resolved") were embedded in this YAML
# and have been removed — they made the file unparseable.
name: Linux Accelerate Test

on:
  pull_request:
    branches:
      - main
    paths:
      - '.github/scripts/parse-junitxml.py'
      - '.github/actions/print-environment/action.yml'
      - '.github/workflows/_linux_accelerate.yml'
  workflow_dispatch:
    inputs:
      # NOTE(review): this input is currently unused — the workflow always
      # installs nightly XPU wheels below; confirm intent or wire it in.
      pytorch:
        required: false
        type: string
        default: 'nightly'
        description: Pytorch branch/commit
      python:
        required: false
        type: string
        default: '3.10'
        description: Python version
      runner:
        required: true
        type: string
        default: 'linux.idc.xpu'
        description: Runner label
      accelerate:
        required: false
        type: string
        default: 'v1.2.1'
        description: Accelerate version
      transformers:
        required: false
        type: string
        default: 'v4.47.1'
        description: Transformers version

permissions: read-all

jobs:
  Torch-XPU-Accelerate-Tests:
    # inputs.* is empty on pull_request triggers; fall back to defaults.
    runs-on: ${{ inputs.runner != '' && inputs.runner || 'linux.idc.xpu' }}
    env:
      WORK_DIR: 'accelerate'
      NEOReadDebugKeys: 0
      DisableScratchPages: 0
      accelerate: ${{ inputs.accelerate != '' && inputs.accelerate || 'v1.2.1' }}
      transformers: ${{ inputs.transformers != '' && inputs.transformers || 'v4.47.1' }}
      python: ${{ inputs.python != '' && inputs.python || '3.10' }}
      PYTORCH_DEBUG_XPU_FALLBACK: 1
      ZE_AFFINITY_MASK: 0
      PARSE_JUNIT: ${{ github.workspace }}/torch-xpu-ops/.github/scripts/parse-junitxml.py
    steps:
      - name: Checkout torch-xpu-ops
        uses: actions/checkout@v4
        with:
          path: torch-xpu-ops
      - name: Checkout Accelerate
        uses: actions/checkout@v4
        with:
          repository: huggingface/accelerate
          ref: ${{ env.accelerate }}
          path: accelerate
      - name: Create unique Conda ENV name
        run: |
          echo "CONDA_ENV_NAME=hf_accelerate_test_${ZE_AFFINITY_MASK}" >> $GITHUB_ENV
      - name: Prepare Conda ENV
        run: |
          echo "Using Conda ENV name: $CONDA_ENV_NAME"
          which conda && conda clean -ay
          conda remove --all -y -n $CONDA_ENV_NAME || rm -rf $(dirname ${CONDA_EXE})/../envs/$CONDA_ENV_NAME
          conda create -y -n $CONDA_ENV_NAME python=${{ env.python }}
          source activate $CONDA_ENV_NAME
          pip install junitparser
          pip install transformers==${{ env.transformers }}
      - name: Prepare Stock XPU Pytorch
        run: |
          source activate $CONDA_ENV_NAME
          pip install torch torchvision torchaudio --pre --index-url https://download.pytorch.org/whl/nightly/xpu
      - name: Prepare Accelerate
        run: |
          source activate $CONDA_ENV_NAME
          cd $WORK_DIR
          pip install -e .
          pip install -e ".[testing]"
          rm -rf tests_log && mkdir -p tests_log
          rm -rf reports
          cp ${{ github.workspace }}/torch-xpu-ops/.github/scripts/spec.py ./
      - name: Report installed versions
        run: |
          source activate $CONDA_ENV_NAME
          echo "pip installed packages:"
          pip list | tee ${{ github.workspace }}/$WORK_DIR/tests_log/pip_list.txt
          echo "lspci gpu devices:"
          lspci -d ::0380 | tee ${{ github.workspace }}/$WORK_DIR/tests_log/lspci_0380.txt
          echo "GPU render nodes:"
          cat /sys/class/drm/render*/device/device | tee ${{ github.workspace }}/$WORK_DIR/tests_log/device_IDs.txt
          echo "xpu-smi output:"
          xpu-smi discovery -y --json --dump -1
      - name: Sanity check installed packages
        run: |
          source activate $CONDA_ENV_NAME
          # These checks are to exit earlier if for any reason torch
          # packages were reinstalled back to CUDA versions (not expected).
          pip show torch | grep Version | grep xpu
          pip show torchaudio | grep Version | grep xpu
          pip show torchvision | grep Version | grep xpu
          python -c 'import torch; exit(not torch.xpu.is_available())'
      - name: Run tests
        run: |
          source activate $CONDA_ENV_NAME
          cd $WORK_DIR && rm -rf reports && mkdir -p reports
          # Excluding tests due to:
          # * tests/test_examples.py::FeatureExamplesTests::test_profiler fails on
          #   Kineto profiler initialization for XPU device: PTI_ERROR_INTERNAL
          # * tests/test_cli.py::ModelEstimatorTester::test_gated for failures due
          #   to not root caused environment configuration issue
          pattern="not test_profiler and not test_gated"
          cmd=(python3 -m pytest -rsf --junitxml=reports/accelerate.xml -k "$pattern" tests/)
          {
            echo "### Running"
            echo "\`\`\`"
            echo "${cmd[@]@Q}"
            echo "\`\`\`"
          } >> $GITHUB_STEP_SUMMARY
          "${cmd[@]}"
      - name: Print result tables
        if: ${{ ! cancelled() }}
        run: |
          source activate $CONDA_ENV_NAME
          cd $WORK_DIR
          {
            echo "### Results"
            python3 $PARSE_JUNIT reports/accelerate.xml --stats
            echo "### Failed"
            python3 $PARSE_JUNIT reports/accelerate.xml --errors --failed
            echo "### Skipped"
            python3 $PARSE_JUNIT reports/accelerate.xml --skipped
          } >> $GITHUB_STEP_SUMMARY
      - name: Print environment
        if: ${{ ! cancelled() }}
        uses: ./torch-xpu-ops/.github/actions/print-environment
        with:
          # Passed literally; the action's bash step expands $CONDA_ENV_NAME
          # at run time. NOTE(review): ${{ env.CONDA_ENV_NAME }} would be the
          # more explicit form — confirm before changing.
          conda: $CONDA_ENV_NAME
          pip_packages: 'accelerate transformers'
      - name: Upload Test log
        if: ${{ ! cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: Torch-XPU-Accelerate-Log-${{ github.event.pull_request.number || github.sha }}
          path: |
            ${{ github.workspace }}/accelerate/reports
            ${{ github.workspace }}/accelerate/tests_log
Loading