Commit cdbf5f2: Merge branch 'huggingface:main' into nomicbert

bhavika authored Oct 21, 2024
2 parents 0b6aa14 + 8af46e5
Showing 87 changed files with 5,969 additions and 4,732 deletions.
10 changes: 8 additions & 2 deletions .github/workflows/build_main_documentation.yml
@@ -10,7 +10,7 @@ on:
 
 jobs:
   build_documentation:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
 
     steps:
       - uses: actions/checkout@v2
@@ -57,6 +57,7 @@ jobs:
       - name: Free disk space
        run: |
          df -h
+          sudo apt-get update
          sudo apt-get purge -y '^apache.*'
          sudo apt-get purge -y '^imagemagick.*'
          sudo apt-get purge -y '^dotnet.*'
@@ -66,7 +67,7 @@ jobs:
          sudo apt-get purge -y '^mysql.*'
          sudo apt-get purge -y '^java.*'
          sudo apt-get purge -y '^openjdk.*'
-          sudo apt-get purge -y microsoft-edge-stable google-cloud-cli azure-cli google-chrome-stable firefox powershell mono-devel
+          sudo apt-get purge -y microsoft-edge-stable azure-cli google-chrome-stable firefox mono-devel
          df -h
          sudo apt-get autoremove -y >/dev/null 2>&1
          sudo apt-get clean
@@ -110,6 +111,8 @@ jobs:
      - name: Setup environment
        run: |
+          python -m venv venv-doc
+          source venv-doc/bin/activate
          pip uninstall -y doc-builder
          cd doc-builder
          git pull origin main
@@ -135,6 +138,7 @@ jobs:
      - name: Make Furiosa documentation
        run: |
+          source venv-doc/bin/activate
          cd optimum-furiosa
          pip install .
          sudo apt install software-properties-common
@@ -159,6 +163,7 @@ jobs:
      - name: Make TPU documentation
        run: |
          sudo docker system prune -a -f
+          source venv-doc/bin/activate
          cd optimum-tpu
          pip install -U pip
          pip install . -f https://storage.googleapis.com/libtpu-releases/index.html
@@ -192,6 +197,7 @@ jobs:
      - name: Push to repositories
        run: |
+          source venv-doc/bin/activate
          cd optimum/optimum-doc-build
          sudo chmod -R ugo+rwx optimum
          doc-builder push optimum --doc_build_repo_id "hf-doc-build/doc-build" --token "${{ secrets.HF_DOC_BUILD_PUSH }}" --commit_msg "Updated with commit ${{ github.sha }} See: https://github.com/huggingface/optimum/commit/${{ github.sha }}" --n_retries 5 --upload_version_yml
6 changes: 5 additions & 1 deletion .github/workflows/build_pr_documentation.yml
@@ -8,14 +8,15 @@ on:
       - "optimum/**.py"
       - "docs/**.mdx"
       - "docs/**.yml"
+      - ".github/workflows/build_pr_documentation.yml"
 
 concurrency:
   group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
   cancel-in-progress: true
 
 jobs:
   build_documentation:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-22.04
     env:
       COMMIT_SHA: ${{ github.event.pull_request.head.sha }}
       PR_NUMBER: ${{ github.event.number }}
@@ -60,6 +61,8 @@ jobs:
      - name: Setup environment
        run: |
+          python -m venv venv-doc
+          source venv-doc/bin/activate
          pip uninstall -y doc-builder
          cd doc-builder
          git pull origin main
@@ -99,6 +102,7 @@ jobs:
      - name: Make TPU documentation
        run: |
          sudo docker system prune -a -f
+          source venv-doc/bin/activate
          cd optimum-tpu
          pip install -U pip
          pip install . -f https://storage.googleapis.com/libtpu-releases/index.html
3 changes: 2 additions & 1 deletion .github/workflows/doctests.yml
@@ -9,7 +9,8 @@ on:
 jobs:
   do-the-job:
     name: Start self-hosted EC2 runner
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g6-4xlarge-plus
     env:
       AWS_REGION: us-east-1
     steps:
14 changes: 14 additions & 0 deletions .github/workflows/stale.yml
@@ -0,0 +1,14 @@
+name: 'Close stale issues and PRs'
+on:
+  schedule:
+    - cron: '30 1 * * *'
+
+jobs:
+  stale:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/stale@v8
+        with:
+          stale-issue-message: 'This issue is stale because it has been open 30 days with no activity. Remove stale label or comment or this will be closed in 5 days.'
+          days-before-stale: 30
+          days-before-close: 5
5 changes: 2 additions & 3 deletions .github/workflows/test_bettertransformer.yml
@@ -15,9 +15,8 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.8, 3.9]
-        os: [ubuntu-20.04, macos-13]
-        exclude: [{ python-version: 3.8, os: macos-13 }]
+        python-version: [3.9]
+        os: [ubuntu-20.04, macos-14]
 
     runs-on: ${{ matrix.os }}
     steps:
3 changes: 2 additions & 1 deletion .github/workflows/test_bettertransformer_gpu.yml
@@ -8,7 +8,8 @@ on:
 jobs:
   do-the-job:
     name: Start self-hosted EC2 runner
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g6-4xlarge-plus
     env:
       AWS_REGION: us-east-1
     steps:
3 changes: 2 additions & 1 deletion .github/workflows/test_exporters_gpu.yml
@@ -15,7 +15,8 @@ jobs:
   do-the-job:
     if: ${{ (github.event_name == 'workflow_dispatch') || (github.event_name == 'schedule') || contains( github.event.pull_request.labels.*.name, 'gpu-test') }}
     name: Start self-hosted EC2 runner
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g6-4xlarge-plus
     env:
       AWS_REGION: us-east-1
     steps:
64 changes: 64 additions & 0 deletions .github/workflows/test_fx_automatic_parallel.yml
@@ -0,0 +1,64 @@
+name: Automatic Model Parallelism Test on GPUs
+
+on:
+  pull_request:
+    branches:
+      - main
+    paths:
+      - 'optimum/fx/parallelization/**.py'
+  push:
+    branches:
+      - main
+    paths:
+      - 'optimum/fx/parallelization/**.py'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+jobs:
+  run_gpu_tests:
+    strategy:
+      fail-fast: false
+      matrix:
+        config:
+          - name: GPU-enabled Optimum Test Suite
+            image: nvidia/cuda:12.4.1-devel-ubuntu22.04
+            gpu_target: ["aws-g5-12xlarge-plus"]
+
+    name: ${{ matrix.config.name }}
+    runs-on:
+      group: "${{matrix.gpu_target}}"
+
+    container:
+      image: ${{ matrix.config.image }}
+      options: --mount type=tmpfs,destination=/tmp --shm-size 64gb --gpus all --ipc host -v /mnt/hf_cache:/mnt/cache/
+      env:
+        NCCL_DEBUG: INFO
+    defaults:
+      run:
+        shell: bash
+
+    steps:
+      - uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Checkout optimum
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 1
+
+      - name: Run nvidia-smi
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          python3 -m pip install -U pip
+          python3 -m pip install torch transformers
+          python3 -m pip install .[tests]
+
+      - name: Run automatic model parallelism tests
+        run: |
+          pytest -s -v -o log_cli=true tests/fx/parallelization
3 changes: 2 additions & 1 deletion .github/workflows/test_gptq.yml
@@ -20,7 +20,8 @@ on:
 
 jobs:
   test_gptq:
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g6-4xlarge-plus
 
     steps:
       - name: Checkout code
2 changes: 1 addition & 1 deletion .github/workflows/test_onnx.yml
@@ -16,7 +16,7 @@ jobs:
       fail-fast: false
       matrix:
         python-version: [3.8, 3.9]
-        os: [ubuntu-20.04, macos-13]
+        os: [ubuntu-20.04, macos-14]
 
     runs-on: ${{ matrix.os }}
     steps:
2 changes: 1 addition & 1 deletion .github/workflows/test_onnxruntime.yml
@@ -51,7 +51,7 @@ jobs:
       - name: Test with pytest (in parallel)
         env:
-          FXMARTYCLONE_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
+          HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
         working-directory: tests
         run: |
           pytest onnxruntime -m "not run_in_series" --durations=0 -vvvv -s -n auto
3 changes: 2 additions & 1 deletion .github/workflows/test_onnxruntime_gpu.yml
@@ -15,7 +15,8 @@ jobs:
   do-the-job:
     if: ${{ (github.event_name == 'workflow_dispatch') || (github.event_name == 'schedule') || contains( github.event.pull_request.labels.*.name, 'gpu-test') }}
     name: Start self-hosted EC2 runner
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g6-4xlarge-plus
     env:
       AWS_REGION: us-east-1
     steps:
5 changes: 3 additions & 2 deletions .github/workflows/test_onnxruntime_train.yml
@@ -11,7 +11,8 @@ jobs:
   do-the-job:
     if: ${{ (github.event_name == 'workflow_dispatch') || (github.event_name == 'schedule') || contains( github.event.pull_request.labels.*.name, 'training')}}
     name: Run ORTTrainer test
-    runs-on: [single-gpu, nvidia-gpu, t4, ci]
+    runs-on:
+      group: aws-g6-4xlarge-plus
     env:
       AWS_REGION: us-east-1
     steps:
@@ -22,4 +23,4 @@ jobs:
          docker build -f tests/onnxruntime/docker/Dockerfile_onnxruntime_trainer -t onnxruntime/train .
      - name: Run test within docker container
        run: |
-          docker run --rm --gpus all -v /mnt/cache/.cache/huggingface:/root/.cache/huggingface --workdir=/workspace/optimum/tests onnxruntime/train:latest
\ No newline at end of file
+          docker run --rm --gpus all -v /mnt/cache/.cache/huggingface:/root/.cache/huggingface --workdir=/workspace/optimum/tests onnxruntime/train:latest
4 changes: 2 additions & 2 deletions Makefile
@@ -23,11 +23,11 @@ REAL_CLONE_URL = $(if $(CLONE_URL),$(CLONE_URL),$(DEFAULT_CLONE_URL))
 # Run code quality checks
 style_check:
 	black --check .
-	ruff .
+	ruff check .
 
 style:
 	black .
-	ruff . --fix
+	ruff check . --fix
 
 # Run tests for the library
 test:
31 changes: 31 additions & 0 deletions README.md
@@ -268,3 +268,34 @@ You can find more examples in the [documentation](https://huggingface.co/docs/op
 ```
 
 You can find more examples in the [documentation](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/trainer) and in the [examples](https://github.com/huggingface/optimum/tree/main/examples/onnxruntime/training).
+
+
+### Quanto
+
+[Quanto](https://github.com/huggingface/optimum-quanto) is a pytorch quantization backend.
+
+You can quantize a model either using the python API or the `optimum-cli`.
+
+```python
+from transformers import AutoModelForCausalLM
+from optimum.quanto import QuantizedModelForCausalLM, qint4
+
+model = AutoModelForCausalLM.from_pretrained('meta-llama/Meta-Llama-3.1-8B')
+qmodel = QuantizedModelForCausalLM.quantize(model, weights=qint4, exclude='lm_head')
+```
+
+The quantized model can be saved using `save_pretrained`:
+
+```python
+qmodel.save_pretrained('./Llama-3.1-8B-quantized')
+```
+
+It can later be reloaded using `from_pretrained`:
+
+```python
+from optimum.quanto import QuantizedModelForCausalLM
+
+qmodel = QuantizedModelForCausalLM.from_pretrained('Llama-3.1-8B-quantized')
+```
+
+You can see more details and [examples](https://github.com/huggingface/optimum-quanto/tree/main/examples) in the [Quanto](https://github.com/huggingface/optimum-quanto) repository.
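To see the reload path above end to end, here is an illustrative, non-authoritative sketch of generating with the reloaded checkpoint. It assumes the local directory from the snippets above exists, fetches the tokenizer from the original repo, and treats the `generate` call on the wrapper as an assumption (following the pattern of the optimum-quanto examples) rather than documented API:

```python
# Hedged sketch: reload the quantized checkpoint saved above and generate.
# Assumptions: './Llama-3.1-8B-quantized' was produced by save_pretrained,
# and the quantized wrapper forwards generate() to the underlying causal LM.
from transformers import AutoTokenizer
from optimum.quanto import QuantizedModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B")
qmodel = QuantizedModelForCausalLM.from_pretrained("./Llama-3.1-8B-quantized")

inputs = tokenizer("The capital of France is", return_tensors="pt")
outputs = qmodel.generate(**inputs, max_new_tokens=10)  # assumed to be forwarded
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```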
2 changes: 1 addition & 1 deletion docs/source/bettertransformer/overview.mdx
@@ -24,7 +24,7 @@ In the 2.0 version, PyTorch includes a native scaled dot-product attention opera
 We provide an integration with these optimizations out of the box in 🤗 Optimum, so that you can convert any supported 🤗 Transformers model so as to use the optimized paths & `scaled_dot_product_attention` function when relevant.
 
 <Tip warning={true}>
-PyTorch-native `scaled_dot_product_attention` is slowly being natively [made default and integrated in 🤗 Transformers](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-and-memory-efficient-attention-through-pytorchs-scaleddotproductattention). For models that do support SDPA in Transformers, we deprecate BetterTransformer and recommend you to use directly Transformers and PyTorc latest version for the attention optimizations (Flash Attention, memory-efficient attention) through SDPA.
+PyTorch-native `scaled_dot_product_attention` is slowly being natively [made default and integrated in 🤗 Transformers](https://huggingface.co/docs/transformers/perf_infer_gpu_one#flashattention-and-memory-efficient-attention-through-pytorchs-scaleddotproductattention). For models that do support SDPA in Transformers, we deprecate BetterTransformer and recommend you to use directly Transformers and PyTorch latest version for the attention optimizations (Flash Attention, memory-efficient attention) through SDPA.
 </Tip>
 
 <Tip warning={true}>
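For readers following the deprecation note fixed above: with SDPA-capable models, the attention optimizations come straight from Transformers and PyTorch, with no BetterTransformer conversion step. A minimal sketch, assuming a recent transformers release and torch >= 2.0 (the checkpoint name is only an example):

```python
# Sketch: request the PyTorch-native scaled_dot_product_attention path
# directly in Transformers instead of converting with BetterTransformer.
# Assumes the chosen model has SDPA support in the installed transformers.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained(
    "gpt2",
    attn_implementation="sdpa",  # dispatches to Flash/memory-efficient kernels when available
)

inputs = tokenizer("BetterTransformer is deprecated for", return_tensors="pt")
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```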
6 changes: 3 additions & 3 deletions docs/source/concept_guides/quantization.mdx
@@ -95,11 +95,11 @@ paragraph.
 
 ### Symmetric and affine quantization schemes
 
-The equation above is called the *affine quantization sheme* because the mapping from `[a, b]` to `int8` is an affine one.
+The equation above is called the *affine quantization scheme* because the mapping from `[a, b]` to `int8` is an affine one.
 
 A common special case of this scheme is the *symmetric quantization scheme*, where we consider a symmetric range of float values `[-a, a]`.
-In this case the integer space is usally `[-127, 127]`, meaning that the `-128` is opted out of the regular `[-128, 127]` signed `int8` range.
-The reason being that having both ranges symmetric allows to have `Z = 0`. While one value out of the 256 representable
+In this case the integer space is usually `[-127, 127]`, meaning that the `-128` is opted out of the regular `[-128, 127]` signed `int8` range.
+The reason being that having a symmetric range allows to have `Z = 0`. While one value out of the 256 representable
 values is lost, it can provide a speedup since a lot of addition operations can be skipped.
 
 **Note**: To learn how the quantization parameters `S` and `Z` are computed, you can read the
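As a reminder of what the corrected paragraph refers to, the two schemes can be written as follows (standard notation; the guide itself defines the scale `S` and zero-point `Z` in the equation referenced above):

```latex
% Affine scheme: map a float x in [a, b] to a signed int8 value q
x \approx S \, (q - Z), \qquad
q = \operatorname{clip}\!\left(\operatorname{round}\!\left(\tfrac{x}{S}\right) + Z,\; -128,\; 127\right)

% Symmetric special case over [-a, a]: Z = 0 and q is clipped to [-127, 127],
% so dequantization reduces to x \approx S\,q and the additions involving Z
% can be skipped, which is the speedup the paragraph mentions.
```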
2 changes: 1 addition & 1 deletion docs/source/index.mdx
@@ -36,7 +36,7 @@ The packages below enable you to get the best of the 🤗 Hugging Face ecosystem
   </a>
   <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="./intel/index"
     ><div class="w-full text-center bg-gradient-to-br from-blue-400 to-blue-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">Intel</div>
-    <p class="text-gray-700">Optimize your model to speedup inference with <span class="underline" onclick="event.preventDefault(); window.open('https://docs.openvino.ai/latest/index.html', '_blank');">OpenVINO</span> and <span class="underline" onclick="event.preventDefault(); window.open('https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html', '_blank');">Neural Compressor</span></p>
+    <p class="text-gray-700">Optimize your model to speedup inference with <span class="underline" onclick="event.preventDefault(); window.open('https://docs.openvino.ai/latest/index.html', '_blank');">OpenVINO</span>, <span class="underline" onclick="event.preventDefault(); window.open('https://www.intel.com/content/www/us/en/developer/tools/oneapi/neural-compressor.html', '_blank');">Neural Compressor</span> and <span class="underline" onclick="event.preventDefault(); window.open('https://intel.github.io/intel-extension-for-pytorch/index.html', '_blank');">IPEX</span></p>
   </a>
   <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="https://huggingface.co/docs/optimum-neuron/index"
     ><div class="w-full text-center bg-gradient-to-br from-orange-400 to-orange-500 rounded-lg py-1.5 font-semibold mb-5 text-white text-lg leading-relaxed">AWS Trainium/Inferentia</div>
1 change: 1 addition & 0 deletions docs/source/installation.mdx
@@ -25,6 +25,7 @@ If you'd like to use the accelerator-specific features of 🤗 Optimum, you can
 | [ONNX Runtime](https://huggingface.co/docs/optimum/onnxruntime/overview) | `pip install --upgrade --upgrade-strategy eager optimum[onnxruntime]` |
 | [Intel Neural Compressor](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[neural-compressor]` |
 | [OpenVINO](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[openvino]` |
+| [IPEX](https://huggingface.co/docs/optimum/intel/index) | `pip install --upgrade --upgrade-strategy eager optimum[ipex]` |
 | [NVIDIA TensorRT-LLM](https://huggingface.co/docs/optimum/main/en/nvidia_overview) | `docker run -it --gpus all --ipc host huggingface/optimum-nvidia` |
 | [AMD Instinct GPUs and Ryzen AI NPU](https://huggingface.co/docs/optimum/amd/index) | `pip install --upgrade --upgrade-strategy eager optimum[amd]` |
 | [AWS Trainum & Inferentia](https://huggingface.co/docs/optimum-neuron/index) | `pip install --upgrade --upgrade-strategy eager optimum[neuronx]` |
5 changes: 5 additions & 0 deletions docs/source/onnxruntime/package_reference/modeling_ort.mdx
@@ -119,6 +119,11 @@ The following ORT classes are available for the following custom tasks.
 
 ## Stable Diffusion
 
+#### ORTDiffusionPipeline
+
+[[autodoc]] onnxruntime.ORTDiffusionPipeline
+    - __call__
+
 #### ORTStableDiffusionPipeline
 
 [[autodoc]] onnxruntime.ORTStableDiffusionPipeline
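Since the hunk above only adds an autodoc entry, here is a hedged usage sketch of the new `ORTDiffusionPipeline` auto-class, assuming it follows the `from_pretrained`/`export=True` convention of the other `optimum.onnxruntime` classes (the checkpoint id is only an example):

```python
# Sketch, not documented usage: load a diffusion checkpoint through the
# ORTDiffusionPipeline auto-class, which is expected to resolve the concrete
# ORT pipeline class for the checkpoint. export=True is assumed to convert
# the PyTorch weights to ONNX on the fly, as with the other ORT model classes.
from optimum.onnxruntime import ORTDiffusionPipeline

pipe = ORTDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5",  # example id; any supported diffusion repo
    export=True,
)
image = pipe("Sailing ship painting by Van Gogh").images[0]
image.save("ship.png")
```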