Commit cdbf5f2: Merge branch 'huggingface:main' into nomicbert

bhavika authored Oct 21, 2024
2 parents 0b6aa14 + 8af46e5
Showing 87 changed files with 5,969 additions and 4,732 deletions.
10 changes: 8 additions & 2 deletions .github/workflows/build_main_documentation.yml
@@ -10,7 +10,7 @@ on:
 
 jobs:
   build_documentation:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v2
@@ -57,6 +57,7 @@ jobs:
       - name: Free disk space
        run: |
          df -h
+          sudo apt-get update
          sudo apt-get purge -y '^apache.*'
          sudo apt-get purge -y '^imagemagick.*'
          sudo apt-get purge -y '^dotnet.*'
@@ -66,7 +67,7 @@ jobs:
          sudo apt-get purge -y '^mysql.*'
          sudo apt-get purge -y '^java.*'
          sudo apt-get purge -y '^openjdk.*'
-          sudo apt-get purge -y microsoft-edge-stable google-cloud-cli azure-cli google-chrome-stable firefox powershell mono-devel
+          sudo apt-get purge -y microsoft-edge-stable azure-cli google-chrome-stable firefox mono-devel
          df -h
          sudo apt-get autoremove -y >/dev/null 2>&1
          sudo apt-get clean
@@ -110,6 +111,8 @@ jobs:
      - name: Setup environment
        run: |
+          python -m venv venv-doc
+          source venv-doc/bin/activate
          pip uninstall -y doc-builder
          cd doc-builder
          git pull origin main
@@ -135,6 +138,7 @@ jobs:
      - name: Make Furiosa documentation
        run: |
+          source venv-doc/bin/activate
          cd optimum-furiosa
          pip install .
          sudo apt install software-properties-common
@@ -159,6 +163,7 @@ jobs:
      - name: Make TPU documentation
        run: |
          sudo docker system prune -a -f
+          source venv-doc/bin/activate
          cd optimum-tpu
          pip install -U pip
          pip install . -f https://storage.googleapis.com/libtpu-releases/index.html
@@ -192,6 +197,7 @@ jobs:
      - name: Push to repositories
        run: |
+          source venv-doc/bin/activate
          cd optimum/optimum-doc-build
          sudo chmod -R ugo+rwx optimum
          doc-builder push optimum --doc_build_repo_id "hf-doc-build/doc-build" --token "${{ secrets.HF_DOC_BUILD_PUSH }}" --commit_msg "Updated with commit ${{ github.sha }} See: https://github.com/huggingface/optimum/commit/${{ github.sha }}" --n_retries 5 --upload_version_yml
6 changes: 5 additions & 1 deletion .github/workflows/build_pr_documentation.yml
@@ -8,14 +8,15 @@ on:
       - "optimum/**.py"
       - "docs/**.mdx"
       - "docs/**.yml"
+      - ".github/workflows/build_pr_documentation.yml"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 
 jobs:
   build_documentation:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     env:
       COMMIT_SHA: ${{ github.event.pull_request.head.sha }}
       PR_NUMBER: ${{ github.event.number }}
@@ -60,6 +61,8 @@ jobs:
      - name: Setup environment
        run: |
+          python -m venv venv-doc
+          source venv-doc/bin/activate
          pip uninstall -y doc-builder
          cd doc-builder
          git pull origin main
@@ -99,6 +102,7 @@ jobs:
      - name: Make TPU documentation
        run: |
          sudo docker system prune -a -f
+          source venv-doc/bin/activate
          cd optimum-tpu
          pip install -U pip
          pip install . -f https://storage.googleapis.com/libtpu-releases/index.html
3 changes: 2 additions & 1 deletion .github/workflows/doctests.yml
@@ -9,7 +9,8 @@ on:
 jobs:
   do-the-job:
     name: Start self-hosted EC2 runner
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g6-4xlarge-plus
     env:
       AWS_REGION: us-east-1
     steps:
14 changes: 14 additions & 0 deletions .github/workflows/stale.yml
@@ -0,0 +1,14 @@
+name: 'Close stale issues and PRs'
+on:
+  schedule:
+    - cron: '30 1 * * *'
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/stale@v8
+        with:
+          stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
+          days-before-stale: 30
+          days-before-close: 5
5 changes: 2 additions & 3 deletions .github/workflows/test_bettertransformer.yml
@@ -15,9 +15,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
-        os: [ubuntu-20.04, macos-13]
-        exclude: [{ python-version: 3.8, os: macos-13 }]
+        python-version: [3.9]
+        os: [ubuntu-20.04, macos-14]
 
     runs-on: ${{ matrix.os }}
     steps:
3 changes: 2 additions & 1 deletion .github/workflows/test_bettertransformer_gpu.yml
@@ -8,7 +8,8 @@ on:
 jobs:
   do-the-job:
     name: Start self-hosted EC2 runner
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g6-4xlarge-plus
     env:
       AWS_REGION: us-east-1
     steps:
3 changes: 2 additions & 1 deletion .github/workflows/test_exporters_gpu.yml
@@ -15,7 +15,8 @@ jobs:
   do-the-job:
     if: ${{ (github.event_name == 'workflow_dispatch') || (github.event_name == 'schedule') || contains( github.event.pull_request.labels.*.name, 'gpu-test') }}
     name: Start self-hosted EC2 runner
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g6-4xlarge-plus
     env:
       AWS_REGION: us-east-1
     steps:
64 changes: 64 additions & 0 deletions .github/workflows/test_fx_automatic_parallel.yml
@@ -0,0 +1,64 @@
+name: Automatic Model Parallelism Test on GPUs
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'optimum/fx/parallelization/**.py'
+  push:
+    branches:
+      - main
+    paths:
+      - 'optimum/fx/parallelization/**.py'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  run_gpu_tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - name: GPU-enabled Optimum Test Suite
+            image: nvidia/cuda:12.4.1-devel-ubuntu22.04
+            gpu_target: ["aws-g5-12xlarge-plus"]
+
+    name: ${{ matrix.config.name }}
+    runs-on:
+      group: "${{matrix.gpu_target}}"
+
+    container:
+      image: ${{ matrix.config.image }}
+      options: --mount type=tmpfs,destination=/tmp --shm-size 64gb --gpus all --ipc host -v /mnt/hf_cache:/mnt/cache/
+      env:
+        NCCL_DEBUG: INFO
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Checkout optimum
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - name: Run nvidia-smi
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          python3 -m pip install -U pip
+          python3 -m pip install torch transformers
+          python3 -m pip install .[tests]
+
+      - name: Run automatic model parallelism tests
+        run: |
+          pytest -s -v -o log_cli=true tests/fx/parallelization
3 changes: 2 additions & 1 deletion .github/workflows/test_gptq.yml
@@ -20,7 +20,8 @@ on:
 
 jobs:
   test_gptq:
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g6-4xlarge-plus
 
     steps:
       - name: Checkout code
2 changes: 1 addition & 1 deletion .github/workflows/test_onnx.yml
@@ -16,7 +16,7 @@ jobs:
       fail-fast: false
       matrix:
         python-version: [3.8, 3.9]
-        os: [ubuntu-20.04, macos-13]
+        os: [ubuntu-20.04, macos-14]
 
     runs-on: ${{ matrix.os }}
     steps:
2 changes: 1 addition & 1 deletion .github/workflows/test_onnxruntime.yml
@@ -51,7 +51,7 @@ jobs:
       - name: Test with pytest (in parallel)
         env:
-          FXMARTYCLONE_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+          HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
         working-directory: tests
         run: |
           pytest onnxruntime -m "not run_in_series" --durations=0 -vvvv -s -n auto
3 changes: 2 additions & 1 deletion .github/workflows/test_onnxruntime_gpu.yml
@@ -15,7 +15,8 @@ jobs:
   do-the-job:
     if: ${{ (github.event_name == 'workflow_dispatch') || (github.event_name == 'schedule') || contains( github.event.pull_request.labels.*.name, 'gpu-test') }}
     name: Start self-hosted EC2 runner
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g6-4xlarge-plus
     env:
       AWS_REGION: us-east-1
     steps:
5 changes: 3 additions & 2 deletions .github/workflows/test_onnxruntime_train.yml
@@ -11,7 +11,8 @@ jobs:
   do-the-job:
     if: ${{ (github.event_name == 'workflow_dispatch') || (github.event_name == 'schedule') || contains( github.event.pull_request.labels.*.name, 'training')}}
     name: Run ORTTrainer test
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g6-4xlarge-plus
     env:
       AWS_REGION: us-east-1
     steps:
@@ -22,4 +23,4 @@ jobs:
          docker build -f tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer -t onnxruntime/train .
      - name: Run test within docker container
        run: |
-          docker run --rm --gpus all -v /mnt/cache/.cache/huggingface:/root/.cache/huggingface --workdir=/workspace/optimum/tests onnxruntime/train:latest
\ No newline at end of file
+          docker run --rm --gpus all -v /mnt/cache/.cache/huggingface:/root/.cache/huggingface --workdir=/workspace/optimum/tests onnxruntime/train:latest
4 changes: 2 additions & 2 deletions Makefile
@@ -23,11 +23,11 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL))
 # Run code quality checks
 style_check:
 	black --check .
-	ruff .
+	ruff check .
 
 style:
 	black .
-	ruff . --fix
+	ruff check . --fix
 
 # Run tests for the library
 test:
31 changes: 31 additions & 0 deletions README.md
@@ -268,3 +268,34 @@ You can find more examples in the [documentation](https://huggingface.co/docs/op
 ```
 
 You can find more examples in the [documentation](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/trainer) and in the [examples](https://github.com/huggingface/optimum/tree/main/examples/onnxruntime/training).
+
+
+### Quanto
+
+[Quanto](https://github.com/huggingface/optimum-quanto) is a pytorch quantization backend.
+
+You can quantize a model either using the python API or the `optimum-cli`.
+
+```python
+from transformers import AutoModelForCausalLM
+from optimum.quanto import QuantizedModelForCausalLM, qint4
+
+model = AutoModelForCausalLM.from_pretrained('meta-llama/Meta-Llama-3.1-8B')
+qmodel = QuantizedModelForCausalLM.quantize(model, weights=qint4, exclude='lm_head')
+```
+
+The quantized model can be saved using `save_pretrained`:
+
+```python
+qmodel.save_pretrained('./Llama-3.1-8B-quantized')
+```
+
+It can later be reloaded using `from_pretrained`:
+
+```python
+from optimum.quanto import QuantizedModelForCausalLM
+
+qmodel = QuantizedModelForCausalLM.from_pretrained('Llama-3.1-8B-quantized')
+```
+
+You can see more details and [examples](https://github.com/huggingface/optimum-quanto/tree/main/examples) in the [Quanto](https://github.com/huggingface/optimum-quanto) repository.
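To see the reload path above end to end, here is an illustrative, non-authoritative sketch of generating with the reloaded checkpoint. It assumes the local directory from the snippets above exists, fetches the tokenizer from the original repo, and treats the `generate` call on the wrapper as an assumption (following the pattern of the optimum-quanto examples) rather than documented API:

```python
# Hedged sketch: reload the quantized checkpoint saved above and generate.
# Assumptions: './Llama-3.1-8B-quantized' was produced by save_pretrained,
# and the quantized wrapper forwards generate() to the underlying causal LM.
from transformers import AutoTokenizer
from optimum.quanto import QuantizedModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B")
qmodel = QuantizedModelForCausalLM.from_pretrained("./Llama-3.1-8B-quantized")

inputs = tokenizer("The capital of France is", return_tensors="pt")
outputs = qmodel.generate(**inputs, max_new_tokens=10)  # assumed to be forwarded
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```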
2 changes: 1 addition & 1 deletion docs/source/bettertransformer/overview.mdx
@@ -24,7 +24,7 @@ In the 2.0 version, PyTorch includes a native scaled dot-product attention opera
 We provide an integration with these optimizations out of the box in 🤗 Optimum, so that you can convert any supported 🤗 Transformers model so as to use the optimized paths & `scaled_dot_product_attention` function when relevant.
 
 <Tip warning={true}>
-PyTorch-native `scaled_dot_product_attention` is slowly being natively [made default and integrated in 🤗 Transformers](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-and-memory-efficient-attention-through-pytorchs-scaleddotproductattention). For models that do support SDPA in Transformers, we deprecate BetterTransformer and recommend you to use directly Transformers and PyTorc latest version for the attention optimizations (Flash Attention, memory-efficient attention) through SDPA.
+PyTorch-native `scaled_dot_product_attention` is slowly being natively [made default and integrated in 🤗 Transformers](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-and-memory-efficient-attention-through-pytorchs-scaleddotproductattention). For models that do support SDPA in Transformers, we deprecate BetterTransformer and recommend you to use directly Transformers and PyTorch latest version for the attention optimizations (Flash Attention, memory-efficient attention) through SDPA.
 </Tip>
 
 <Tip warning={true}>
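For readers following the deprecation note fixed above: with SDPA-capable models, the attention optimizations come straight from Transformers and PyTorch, with no BetterTransformer conversion step. A minimal sketch, assuming a recent transformers release and torch >= 2.0 (the checkpoint name is only an example):

```python
# Sketch: request the PyTorch-native scaled_dot_product_attention path
# directly in Transformers instead of converting with BetterTransformer.
# Assumes the chosen model has SDPA support in the installed transformers.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    attn_implementation="sdpa",  # dispatches to Flash/memory-efficient kernels when available
)

inputs = tokenizer("BetterTransformer is deprecated for", return_tensors="pt")
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```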
6 changes: 3 additions & 3 deletions docs/source/concept_guides/quantization.mdx
@@ -95,11 +95,11 @@ paragraph.
 
 ### Symmetric and affine quantization schemes
 
-The equation above is called the *affine quantization sheme* because the mapping from `[a, b]` to `int8` is an affine one.
+The equation above is called the *affine quantization scheme* because the mapping from `[a, b]` to `int8` is an affine one.
 
 A common special case of this scheme is the *symmetric quantization scheme*, where we consider a symmetric range of float values `[-a, a]`.
-In this case the integer space is usally `[-127, 127]`, meaning that the `-128` is opted out of the regular `[-128, 127]` signed `int8` range.
-The reason being that having both ranges symmetric allows to have `Z = 0`. While one value out of the 256 representable
+In this case the integer space is usually `[-127, 127]`, meaning that the `-128` is opted out of the regular `[-128, 127]` signed `int8` range.
+The reason being that having a symmetric range allows to have `Z = 0`. While one value out of the 256 representable
 values is lost, it can provide a speedup since a lot of addition operations can be skipped.
 
 **Note**: To learn how the quantization parameters `S` and `Z` are computed, you can read the
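As a reminder of what the corrected paragraph refers to, the two schemes can be written as follows (standard notation; the guide itself defines the scale `S` and zero-point `Z` in the equation referenced above):

```latex
% Affine scheme: map a float x in [a, b] to a signed int8 value q
x \approx S \, (q - Z), \qquad
q = \operatorname{clip}\!\left(\operatorname{round}\!\left(\tfrac{x}{S}\right) + Z,\; -128,\; 127\right)

% Symmetric special case over [-a, a]: Z = 0 and q is clipped to [-127, 127],
% so dequantization reduces to x \approx S\,q and the additions involving Z
% can be skipped, which is the speedup the paragraph mentions.
```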
2 changes: 1 addition & 1 deletion docs/source/index.mdx
@@ -36,7 +36,7 @@ The packages below enable you to get the best of the 🤗 Hugging Face ecosystem
   </a>
   <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./intel/index"
     ><div class="w-full text-center bg-gradient-to-br from-blue-400 to-blue-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Intel</div>
-    <p class="text-gray-700">Optimize your model to speedup inference with <span class="underline" onclick="event.preventDefault(); window.open('https://docs.openvino.ai/latest/index.html', '_blank');">OpenVINO</span> and <span class="underline" onclick="event.preventDefault(); window.open('https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html', '_blank');">Neural Compressor</span></p>
+    <p class="text-gray-700">Optimize your model to speedup inference with <span class="underline" onclick="event.preventDefault(); window.open('https://docs.openvino.ai/latest/index.html', '_blank');">OpenVINO</span>, <span class="underline" onclick="event.preventDefault(); window.open('https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html', '_blank');">Neural Compressor</span> and <span class="underline" onclick="event.preventDefault(); window.open('https://intel.github.io/intel-extension-for-pytorch/index.html', '_blank');">IPEX</span></p>
   </a>
   <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="https://huggingface.co/docs/optimum-neuron/index"
     ><div class="w-full text-center bg-gradient-to-br from-orange-400 to-orange-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">AWS Trainium/Inferentia</div>
1 change: 1 addition & 0 deletions docs/source/installation.mdx
@@ -25,6 +25,7 @@ If you'd like to use the accelerator-specific features of 🤗 Optimum, you can
 | [ONNX Runtime](https://huggingface.co/docs/optimum/onnxruntime/overview) | `pip install --upgrade --upgrade-strategy eager optimum[onnxruntime]` |
 | [Intel Neural Compressor](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[neural-compressor]` |
 | [OpenVINO](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[openvino]` |
+| [IPEX](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[ipex]` |
 | [NVIDIA TensorRT-LLM](https://huggingface.co/docs/optimum/main/en/nvidia_overview) | `docker run -it --gpus all --ipc host huggingface/optimum-nvidia` |
 | [AMD Instinct GPUs and Ryzen AI NPU](https://huggingface.co/docs/optimum/amd/index) | `pip install --upgrade --upgrade-strategy eager optimum[amd]` |
 | [AWS Trainum & Inferentia](https://huggingface.co/docs/optimum-neuron/index) | `pip install --upgrade --upgrade-strategy eager optimum[neuronx]` |
5 changes: 5 additions & 0 deletions docs/source/onnxruntime/package_reference/modeling_ort.mdx
@@ -119,6 +119,11 @@ The following ORT classes are available for the following custom tasks.
 
 ## Stable Diffusion
 
+#### ORTDiffusionPipeline
+
+[[autodoc]] onnxruntime.ORTDiffusionPipeline
+    - __call__
+
 #### ORTStableDiffusionPipeline
 
 [[autodoc]] onnxruntime.ORTStableDiffusionPipeline
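Since the hunk above only adds an autodoc entry, here is a hedged usage sketch of the new `ORTDiffusionPipeline` auto-class, assuming it follows the `from_pretrained`/`export=True` convention of the other `optimum.onnxruntime` classes (the checkpoint id is only an example):

```python
# Sketch, not documented usage: load a diffusion checkpoint through the
# ORTDiffusionPipeline auto-class, which is expected to resolve the concrete
# ORT pipeline class for the checkpoint. export=True is assumed to convert
# the PyTorch weights to ONNX on the fly, as with the other ORT model classes.
from optimum.onnxruntime import ORTDiffusionPipeline

pipe = ORTDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # example id; any supported diffusion repo
    export=True,
)
image = pipe("Sailing ship painting by Van Gogh").images[0]
image.save("ship.png")
```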