diff --git a/.github/actions/openvino_provider/action.yml b/.github/actions/openvino_provider/action.yml index dd1078bb0d4353..a17986f35d3723 100644 --- a/.github/actions/openvino_provider/action.yml +++ b/.github/actions/openvino_provider/action.yml @@ -177,7 +177,7 @@ runs: else ov_package_url=$(curl -s ${{ inputs.nightly_package_source }} | jq -r '.${{ inputs.platform }}_${{ inputs.arch }}') fi - cd ${{ inputs.install_dir || env.GITHUB_WORKSPACE }} + cd ${{ inputs.install_dir || github.workspace }} package_basename=$(basename $ov_package_url) wget $ov_package_url --progress=bar:force:noscroll -O $package_basename package_folder=${package_basename%.*} @@ -196,7 +196,7 @@ runs: uses: actions/upload-artifact@50769540e7f4bd5e21e526ee35c689e35e0d6874 # v4.4.0 with: name: ${{ steps.openvino_s3_download.outputs.ov_artifact_name }} - path: ${{ steps.openvino_s3_download.outputs.ov_package_path }} + path: ${{ github.workspace }}/${{ steps.openvino_s3_download.outputs.ov_package_path }} if-no-files-found: 'error' - name: Get wheel diff --git a/.github/workflows/debian_10_arm.yml b/.github/workflows/debian_10_arm.yml index 73426222253adb..cf628d12c29b89 100644 --- a/.github/workflows/debian_10_arm.yml +++ b/.github/workflows/debian_10_arm.yml @@ -49,7 +49,7 @@ jobs: Docker: needs: Smart_CI if: "!needs.smart_ci.outputs.skip_workflow" - runs-on: aks-linux-16-cores-arm-docker-build + runs-on: aks-linux-4-cores-8gb-arm-docker-build container: image: openvinogithubactions.azurecr.io/docker_build:0.2 volumes: @@ -75,7 +75,7 @@ jobs: if: "!needs.smart_ci.outputs.skip_workflow" uses: ./.github/workflows/job_build_linux.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.debian_10_arm }}", "volumes": ["/mount:/mount"], "options": "-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} event-name: ${{ github.event_name }} @@ -104,7 +104,7 @@ jobs: needs: [ Docker, Build, Smart_CI ] uses: ./.github/workflows/job_cxx_unit_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-8-cores-16gb-arm' image: ${{ fromJSON(needs.docker.outputs.images).ov_test.debian_10_arm }} affected-components: ${{ needs.smart_ci.outputs.affected_components }} os: 'debian_10' @@ -116,6 +116,8 @@ jobs: needs: [ Docker, Build, Smart_CI ] uses: ./.github/workflows/job_cpu_functional_tests.yml with: + # Additional investigation needed why CPU functional tests are failing on v6 VM size's version, + # so leave it as it is for now runner: 'aks-linux-16-cores-arm' image: ${{ fromJSON(needs.docker.outputs.images).ov_test.debian_10_arm }} python-version: '3.7' diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml index 10de6867c7d0e2..66ce9461f05fe8 100644 --- a/.github/workflows/linux_arm64.yml +++ b/.github/workflows/linux_arm64.yml @@ -53,7 +53,7 @@ jobs: Docker: needs: Smart_CI if: "!needs.smart_ci.outputs.skip_workflow" - runs-on: aks-linux-16-cores-arm-docker-build + runs-on: aks-linux-4-cores-8gb-arm-docker-build container: image: openvinogithubactions.azurecr.io/docker_build:0.2 volumes: @@ -78,7 +78,7 @@ jobs: needs: [ Docker, Smart_CI ] uses: ./.github/workflows/job_build_linux.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"], "options": 
"-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} event-name: ${{ github.event_name }} @@ -105,7 +105,7 @@ jobs: if: ${{ 'false' }} uses: ./.github/workflows/job_debian_packages.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-4-cores-8gb-arm' image: 'openvinogithubactions.azurecr.io/dockerhub/ubuntu:20.04' Samples: @@ -113,7 +113,7 @@ jobs: if: fromJSON(needs.smart_ci.outputs.affected_components).samples uses: ./.github/workflows/job_samples_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-8-cores-16gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} @@ -123,7 +123,7 @@ jobs: if: fromJSON(needs.smart_ci.outputs.affected_components).JS_API uses: ./.github/workflows/job_openvino_js.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-4-cores-8gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_arm64 }}"}' ONNX_Runtime: @@ -133,7 +133,7 @@ jobs: needs: [ Build, Smart_CI, Docker ] uses: ./.github/workflows/job_onnx_runtime.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"], "options": "-e SCCACHE_AZURE_BLOB_CONTAINER -e SCCACHE_AZURE_CONNECTION_STRING"}' sccache-azure-key-prefix: 'ubuntu20_aarch64_onnxruntime' @@ -142,7 +142,7 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_tokenizers.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-8-cores-16gb-arm' shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_build.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} @@ -154,7 +154,7 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_cxx_unit_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-8-cores-16gb-arm' image: ${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }} affected-components: ${{ needs.smart_ci.outputs.affected_components }} os: 'ubuntu_20_04' @@ -164,7 +164,7 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_python_unit_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-8-cores-16gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' @@ -174,7 +174,7 @@ jobs: needs: [ Build, Docker, Smart_CI, Openvino_tokenizers ] uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' @@ -184,7 +184,7 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": 
["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.11' @@ -195,6 +195,8 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_cpu_functional_tests.yml with: + # Additional investigation needed why CPU functional tests are failing on v6 VM size's version, + # so leave it as it is for now runner: 'aks-linux-16-cores-arm' image: ${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }} python-version: '3.11' @@ -207,7 +209,7 @@ jobs: needs: [ Build, Docker, Smart_CI, Openvino_tokenizers] uses: ./.github/workflows/job_tensorflow_models_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}"}' model_scope: 'precommit' @@ -218,7 +220,7 @@ jobs: needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_pytorch_models_tests.yml with: - runner: 'aks-linux-16-cores-arm' + runner: 'aks-linux-16-cores-32gb-arm' container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}"}' model_scope: 'precommit' diff --git a/.github/workflows/workflows_scans.yml b/.github/workflows/workflows_scans.yml new file mode 100644 index 00000000000000..0a293a4152b9a0 --- /dev/null +++ b/.github/workflows/workflows_scans.yml @@ -0,0 +1,38 @@ +name: GitHub Actions Workflows Scans +on: + workflow_dispatch: {} + push: + paths: + - '.github/workflows/**' + branches: + - 'master' + - 'releases/**' + pull_request: + paths: + - '.github/workflows/**' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +permissions: read-all + +jobs: + semgrep: + name: github_actions_workflows_scan/semgrep + runs-on: ubuntu-latest + if: ${{ github.repository_owner == 'openvinotoolkit' }} + + container: + image: semgrep/semgrep + + steps: + - name: Checkout + uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + submodules: 'false' + sparse-checkout: .github/workflows + + - name: Semgrep scan + run: | + semgrep scan --error -j 8 --config "p/github-actions" .github/workflows/* diff --git a/docs/articles_en/about-openvino/performance-benchmarks.rst b/docs/articles_en/about-openvino/performance-benchmarks.rst index 78a364c18ca4e6..5d9abfe891584f 100644 --- a/docs/articles_en/about-openvino/performance-benchmarks.rst +++ b/docs/articles_en/about-openvino/performance-benchmarks.rst @@ -163,7 +163,7 @@ For a listing of all platforms and configurations used for testing, refer to the 2024.5, as of November 20, 2024. * OpenVINO Model Server performance results are based on release - 2024.4, as of Sept. 30, 2024. + 2024.5, as of November 20, 2024. The results may not reflect all publicly available updates. 
Intel technologies' features and benefits depend on system configuration and may require enabled hardware, software, or service diff --git a/docs/articles_en/assets/images/genai_main_diagram.svg b/docs/articles_en/assets/images/genai_main_diagram.svg new file mode 100644 index 00000000000000..b01cbd827acb3c --- /dev/null +++ b/docs/articles_en/assets/images/genai_main_diagram.svg @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07ce964e115f1e3942cdf381f44b4dc6d466df62c70396306a4f241fb07ea3ed +size 392244 diff --git a/docs/articles_en/learn-openvino.rst b/docs/articles_en/learn-openvino.rst index 4fca64051003a7..98797c9c67c126 100644 --- a/docs/articles_en/learn-openvino.rst +++ b/docs/articles_en/learn-openvino.rst @@ -14,7 +14,7 @@ Learn OpenVINO Interactive Tutorials (Python) Sample Applications (Python & C++) - Large Language Model Inference Guide + Generative AI workflow @@ -29,5 +29,5 @@ as well as an experienced user. | :doc:`OpenVINO Samples ` | The OpenVINO samples (Python and C++) are simple console applications that show how to use specific OpenVINO API features. They can assist you in executing tasks such as loading a model, running inference, querying particular device capabilities, etc. -| :doc:`Large Language Models in OpenVINO ` +| :doc:`Generative AI workflow ` | Detailed information on how OpenVINO accelerates Generative AI use cases and what models it supports. This tutorial provides instructions for running Generative AI models using Hugging Face Optimum Intel and Native OpenVINO APIs. diff --git a/docs/articles_en/learn-openvino/llm_inference_guide.rst b/docs/articles_en/learn-openvino/llm_inference_guide.rst index 36c001c015f744..bfc4f9b4c49173 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide.rst @@ -1,140 +1,106 @@ -Large Language Model Inference Guide +Generative AI workflow ======================================== .. meta:: - :description: Explore learning materials, including interactive - Python tutorials and sample console applications that explain - how to use OpenVINO features. + :description: learn how to use OpenVINO to run generative AI models. .. toctree:: :maxdepth: 1 :hidden: - Run LLMs with Optimum Intel - Run LLMs on OpenVINO GenAI Flavor - Run LLMs on Base OpenVINO + Inference with OpenVINO GenAI + Inference with Optimum Intel + Generative AI with Base OpenVINO (not recommended) OpenVINO Tokenizers -Large Language Models (LLMs) like GPT are transformative deep learning networks capable of a -broad range of natural language tasks, from text generation to language translation. OpenVINO -optimizes the deployment of these models, enhancing their performance and integration into -various applications. This guide shows how to use LLMs with OpenVINO, from model loading and -conversion to advanced use cases. + + +Generative AI is a specific area of Deep Learning models used for producing new and “original” +data, based on input in the form of image, sound, or natural language text. Due to their +complexity and size, generative AI pipelines are more difficult to deploy and run efficiently. +OpenVINO simplifies the process and ensures high-performance integrations, with the following +options: + +.. tab-set:: + + .. tab-item:: OpenVINO GenAI + + | - Suggested for production deployment for the supported use cases. + | - Smaller footprint and fewer dependencies. + | - More optimization and customization options. + | - Available in both Python and C++. 
+ | - A limited set of supported use cases. + + :doc:`Install the OpenVINO GenAI package <../get-started/install-openvino/install-openvino-genai>` + and run generative models out of the box. With custom + API and tokenizers, among other components, it manages the essential tasks such as the + text generation loop, tokenization, and scheduling, offering ease of use and high + performance. + + .. tab-item:: Hugging Face integration + + | - Suggested for prototyping and, if the use case is not covered by OpenVINO GenAI, production. + | - Bigger footprint and more dependencies. + | - Limited customization due to Hugging Face dependency. + | - Not usable for C++ applications. + | - A very wide range of supported models. + + Using Optimum Intel is a great way to experiment with different models and scenarios, + thanks to a simple interface for the popular API and infrastructure offered by Hugging Face. + It also enables weight compression with + `Neural Network Compression Framework (NNCF) `__, + as well as conversion on the fly. For integration with the final product it may offer + lower performance, though. + +`Check out the GenAI Quick-start Guide [PDF] `__ The advantages of using OpenVINO for LLM deployment: -* **OpenVINO offers optimized LLM inference**: - provides a full C/C++ API, leading to faster operation than Python-based runtimes; includes a - Python API for rapid development, with the option for further optimization in C++. -* **Compatible with diverse hardware**: - supports CPUs, GPUs, and neural accelerators across ARM and x86/x64 architectures, integrated - Intel® Processor Graphics, discrete Intel® Arc™ A-Series Graphics, and discrete Intel® Data - Center GPU Flex Series; features automated optimization to maximize performance on target - hardware. -* **Requires fewer dependencies**: - than frameworks like Hugging Face and PyTorch, resulting in a smaller binary size and reduced - memory footprint, making deployments easier and updates more manageable. -* **Provides compression and precision management techniques**: - such as 8-bit and 4-bit weight compression, including embedding layers, and storage format - reduction. This includes fp16 precision for non-compressed models and int8/int4 for compressed - models, like GPTQ models from `Hugging Face `__. -* **Supports a wide range of deep learning models and architectures**: - including text, image, and audio generative models like Llama 2, MPT, OPT, Stable Diffusion, - Stable Diffusion XL. This enables the development of multimodal applications, allowing for - write-once, deploy-anywhere capabilities. -* **Enhances inference capabilities**: - fused inference primitives such as Scaled Dot Product Attention, Rotary Positional Embedding, - Group Query Attention, and Mixture of Experts. It also offers advanced features like in-place - KV-cache, dynamic quantization, KV-cache quantization and encapsulation, dynamic beam size - configuration, and speculative sampling. -* **Provides stateful model optimization**: - models from the Hugging Face Transformers are converted into a stateful form, optimizing - inference performance and memory usage in long-running text generation tasks by managing past - KV-cache tensors more efficiently internally. This feature is automatically activated for many - supported models, while unsupported ones remain stateless. Learn more about the - :doc:`Stateful models and State API <../openvino-workflow/running-inference/stateful-models>`. 
- -OpenVINO offers three main paths for Generative AI use cases: - -* **Hugging Face**: use OpenVINO as a backend for Hugging Face frameworks (transformers, - diffusers) through the `Optimum Intel `__ - extension. -* **OpenVINO GenAI Flavor**: use OpenVINO GenAI APIs (Python and C++). -* **Base OpenVINO**: use OpenVINO native APIs (Python and C++) with - `custom pipeline code `__. - -In both cases, the OpenVINO runtime is used for inference, and OpenVINO tools are used for -optimization. The main differences are in footprint size, ease of use, and customizability. - -The Hugging Face API is easy to learn, provides a simple interface and hides the complexity of -model initialization and text generation for a better developer experience. However, it has more -dependencies, less customization, and cannot be ported to C/C++. - -The OpenVINO GenAI Flavor reduces the complexity of LLMs implementation by -automatically managing essential tasks like the text generation loop, tokenization, -and scheduling. The Native OpenVINO API provides a more hands-on experience, -requiring manual setup of these functions. Both methods are designed to minimize dependencies -and the overall application footprint and enable the use of generative models in C++ applications. - -It is recommended to start with Hugging Face frameworks to experiment with different models and -scenarios. Then the model can be used with OpenVINO APIs if it needs to be optimized -further. Optimum Intel provides interfaces that enable model optimization (weight compression) -using `Neural Network Compression Framework (NNCF) `__, -and export models to the OpenVINO model format for use in native API applications. - -Proceed to run LLMs with: +.. dropdown:: Fewer dependencies and smaller footprint + :animate: fade-in-slide-down + :color: secondary + + Less bloated than frameworks such as Hugging Face and PyTorch, with a smaller binary size and reduced + memory footprint, makes deployments easier and updates more manageable. + +.. dropdown:: Compression and precision management + :animate: fade-in-slide-down + :color: secondary + + Techniques such as 8-bit and 4-bit weight compression, including embedding layers, and storage + format reduction. This includes fp16 precision for non-compressed models and int8/int4 for + compressed models, like GPTQ models from `Hugging Face `__. + +.. dropdown:: Enhanced inference capabilities + :animate: fade-in-slide-down + :color: secondary + + Advanced features like in-place KV-cache, dynamic quantization, KV-cache quantization and + encapsulation, dynamic beam size configuration, and speculative sampling, and more are + available. + +.. dropdown:: Stateful model optimization + :animate: fade-in-slide-down + :color: secondary + + Models from the Hugging Face Transformers are converted into a stateful form, optimizing + inference performance and memory usage in long-running text generation tasks by managing past + KV-cache tensors more efficiently internally. This feature is automatically activated for + many supported models, while unsupported ones remain stateless. Learn more about the + :doc:`Stateful models and State API <../openvino-workflow/running-inference/stateful-models>`. + +.. dropdown:: Optimized LLM inference + :animate: fade-in-slide-down + :color: secondary + + Includes a Python API for rapid development and C++ for further optimization, offering + better performance than Python-based runtimes. 
+ + +Proceed to guides on: -* :doc:`Hugging Face and Optimum Intel <./llm_inference_guide/llm-inference-hf>` * :doc:`OpenVINO GenAI Flavor <./llm_inference_guide/genai-guide>` -* :doc:`Native OpenVINO API <./llm_inference_guide/llm-inference-native-ov>` - -The table below summarizes the differences between Hugging Face and the native OpenVINO API -approaches. - -.. dropdown:: Differences between Hugging Face and the native OpenVINO API - - .. list-table:: - :widths: 20 25 55 - :header-rows: 1 - - * - - - Hugging Face through OpenVINO - - OpenVINO Native API - * - Model support - - Supports transformer-based models such as LLMs - - Supports all model architectures from most frameworks - * - APIs - - Python (Hugging Face API) - - Python, C++ (OpenVINO API) - * - Model Format - - Source Framework / OpenVINO - - Source Framework / OpenVINO - * - Inference code - - Hugging Face based - - Custom inference pipelines - * - Additional dependencies - - Many Hugging Face dependencies - - Lightweight (e.g. numpy, etc.) - * - Application footprint - - Large - - Small - * - Pre/post-processing and glue code - - Provided through high-level Hugging Face APIs - - Must be custom implemented (see OpenVINO samples and notebooks) - * - Performance - - Good, but less efficient compared to native APIs - - Inherent speed advantage with C++, but requires hands-on optimization - * - Flexibility - - Constrained to Hugging Face API - - High flexibility with Python and C++; allows custom coding - * - Learning Curve and Effort - - Lower learning curve; quick to integrate - - Higher learning curve; requires more effort in integration - * - Ideal Use Case - - Ideal for quick prototyping and Python-centric projects - - Best suited for high-performance, resource-optimized production environments - * - Model Serving - - Paid service, based on CPU/GPU usage with Hugging Face - - Free code solution, run script for own server; costs may incur for cloud services - like AWS but generally cheaper than Hugging Face rates +* :doc:`Hugging Face and Optimum Intel <./llm_inference_guide/llm-inference-hf>` + + diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst index 41e5cbb5733c58..d725b306d57908 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide-npu.rst @@ -1,4 +1,4 @@ -Run LLMs with OpenVINO GenAI Flavor on NPU +Inference with OpenVINO GenAI ========================================== .. meta:: diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst index f18b66915fc3ce..16290b17eca323 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide.rst @@ -1,4 +1,4 @@ -Run LLM Inference on OpenVINO with the GenAI Flavor +Inference with OpenVINO GenAI =============================================================================================== .. meta:: @@ -9,39 +9,333 @@ Run LLM Inference on OpenVINO with the GenAI Flavor :hidden: NPU inference of LLMs - genai-guide/genai-use-cases -This guide will show you how to integrate the OpenVINO GenAI flavor into your application, covering -loading a model and passing the input context to receive generated text. 
Note that the vanilla flavor of OpenVINO -will not work with these instructions, make sure to -:doc:`install OpenVINO GenAI <../../get-started/install-openvino/install-openvino-genai>`. +OpenVINO™ GenAI is a library of pipelines and methods, extending the OpenVINO runtime to work +with generative AI models more efficiently. This article provides reference code and guidance +on its usage. Note that the base OpenVINO version will not work with these instructions, +make sure to :doc:`install OpenVINO with GenAI <../../get-started/install-openvino/install-openvino-genai>`. -.. note:: +.. image:: ../../assets/images/genai_main_diagram.svg + :align: center + :width: 500 + :alt: OpenVINO workflow diagram for convenience - The examples use the CPU as the target device, however, the GPU is also supported. - Note that for the LLM pipeline, the GPU is used only for inference, while token selection, tokenization, and - detokenization remain on the CPU, for efficiency. Tokenizers are represented as a separate model and also run - on the CPU. -1. Export an LLM model via Hugging Face Optimum-Intel. A chat-tuned TinyLlama model is used in this example: +| Here is sample code for several Generative AI use case scenarios. Note that these are very basic + examples and may need adjustments for your specific needs, like changing the inference device. +| For a more extensive instruction and additional options, see the + `step-by-step chat-bot guide <#chat-bot-use-case-step-by-step>`__ below. - .. code-block:: python +.. dropdown:: Text-to-Image Generation - optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format fp16 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" + .. tab-set:: + + .. tab-item:: Python + :sync: python + + .. tab-set:: + + .. tab-item:: main.py + :name: mainpy + + .. code-block:: python + + import openvino_genai + from PIL import Image + import numpy as np + + class Generator(openvino_genai.Generator): + def __init__(self, seed, mu=0.0, sigma=1.0): + openvino_genai.Generator.__init__(self) + np.random.seed(seed) + self.mu = mu + self.sigma = sigma + + def next(self): + return np.random.normal(self.mu, self.sigma) + + + def infer(model_dir: str, prompt: str): + device = 'CPU' # GPU can be used as well + random_generator = Generator(42) + pipe = openvino_genai.Text2ImagePipeline(model_dir, device) + image_tensor = pipe.generate( + prompt, + width=512, + height=512, + num_inference_steps=20, + num_images_per_prompt=1, + random_generator=random_generator + ) + + image = Image.fromarray(image_tensor.data[0]) + image.save("image.bmp") + + .. tab-item:: LoRA.py + :name: lorapy + + .. 
code-block:: python + + import openvino as ov + import openvino_genai + import numpy as np + import sys + + + class Generator(openvino_genai.Generator): + def __init__(self, seed, mu=0.0, sigma=1.0): + openvino_genai.Generator.__init__(self) + np.random.seed(seed) + self.mu = mu + self.sigma = sigma + + def next(self): + return np.random.normal(self.mu, self.sigma) + + + def image_write(path: str, image_tensor: ov.Tensor): + from PIL import Image + image = Image.fromarray(image_tensor.data[0]) + image.save(path) + + + def infer(models_path: str, prompt: str): + prompt = "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" + + device = "CPU" # GPU, NPU can be used as well + adapter_config = openvino_genai.AdapterConfig() + + for i in range(int(len(adapters) / 2)): + adapter = openvino_genai.Adapter(adapters[2 * i]) + alpha = float(adapters[2 * i + 1]) + adapter_config.add(adapter, alpha) + + pipe = openvino_genai.Text2ImagePipeline(models_path, device, adapters=adapter_config) + print("Generating image with LoRA adapters applied, resulting image will be in lora.bmp") + image = pipe.generate(prompt, + random_generator=Generator(42), + width=512, + height=896, + num_inference_steps=20) + + image_write("lora.bmp", image) + print("Generating image without LoRA adapters applied, resulting image will be in baseline.bmp") + image = pipe.generate(prompt, + adapters=openvino_genai.AdapterConfig(), + random_generator=Generator(42), + width=512, + height=896, + num_inference_steps=20 + ) + image_write("baseline.bmp", image) + + For more information, refer to the + `Python sample `__ + + .. tab-item:: C++ + :sync: cpp + + .. tab-set:: + + .. tab-item:: main.cpp + :name: maincpp + + .. code-block:: cpp + + #include "openvino/genai/text2image/pipeline.hpp" + + #include "imwrite.hpp" + + int32_t main(int32_t argc, char* argv[]) try { + OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " ''"); + + const std::string models_path = argv[1], prompt = argv[2]; + const std::string device = "CPU"; // GPU, NPU can be used as well + + ov::genai::Text2ImagePipeline pipe(models_path, device); + ov::Tensor image = pipe.generate(prompt, + ov::genai::width(512), + ov::genai::height(512), + ov::genai::num_inference_steps(20), + ov::genai::num_images_per_prompt(1)); + + imwrite("image_%d.bmp", image, true); + + return EXIT_SUCCESS; + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } + + .. tab-item:: LoRA.cpp + :name: loracpp + + .. code-block:: cpp + + #include "openvino/genai/text2image/pipeline.hpp" + + #include "imwrite.hpp" + + int32_t main(int32_t argc, char* argv[]) try { + OPENVINO_ASSERT(argc >= 3 && (argc - 3) % 2 == 0, "Usage: ", argv[0], " '' [ ...]]"); + + const std::string models_path = argv[1], prompt = argv[2]; + const std::string device = "CPU"; // GPU, NPU can be used as well + + ov::genai::AdapterConfig adapter_config; + for(size_t i = 0; i < (argc - 3)/2; ++i) { + ov::genai::Adapter adapter(argv[3 + 2*i]); + float alpha = std::atof(argv[3 + 2*i + 1]); + adapter_config.add(adapter, alpha); + } - *Optional*. Optimize the model: + ov::genai::Text2ImagePipeline pipe(models_path, device, ov::genai::adapters(adapter_config)); - The model is an optimized OpenVINO IR with FP16 precision. 
For enhanced LLM performance, - it is recommended to use lower precision for model weights, such as INT4, and to compress weights - using NNCF during model export directly: + std::cout << "Generating image with LoRA adapters applied, resulting image will be in lora.bmp\n"; + ov::Tensor image = pipe.generate(prompt, + ov::genai::random_generator(std::make_shared(42)), + ov::genai::width(512), + ov::genai::height(896), + ov::genai::num_inference_steps(20)); + imwrite("lora.bmp", image, true); - .. code-block:: python + std::cout << "Generating image without LoRA adapters applied, resulting image will be in baseline.bmp\n"; + image = pipe.generate(prompt, + ov::genai::adapters(), + ov::genai::random_generator(std::make_shared(42)), + ov::genai::width(512), + ov::genai::height(896), + ov::genai::num_inference_steps(20)); + imwrite("baseline.bmp", image, true); - optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format int4 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" + return EXIT_SUCCESS; + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } + For more information, refer to the + `C++ sample `__ -2. Perform generation using the new GenAI API: + +.. dropdown:: Speech Recognition + + The application performs inference on speech recognition Whisper Models. The samples include + the ``WhisperPipeline`` class and use audio files in WAV format at a sampling rate of 16 kHz + as input. + + .. tab-set:: + + .. tab-item:: Python + :sync: cpp + + .. code-block:: python + + import openvino_genai + import librosa + + + def read_wav(filepath): + raw_speech, samplerate = librosa.load(filepath, sr=16000) + return raw_speech.tolist() + + + def infer(model_dir: str, wav_file_path: str): + device = "CPU" # GPU or NPU can be used as well. + pipe = openvino_genai.WhisperPipeline(model_dir, device) + + # The pipeline expects normalized audio with a sampling rate of 16kHz. + raw_speech = read_wav(wav_file_path) + result = pipe.generate( + raw_speech, + max_new_tokens=100, + language="<|en|>", + task="transcribe", + return_timestamps=True, + ) + + print(result) + + for chunk in result.chunks: + print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}") + + + For more information, refer to the + `Python sample `__. + + .. tab-item:: C++ + :sync: cpp + + .. code-block:: cpp + + #include "audio_utils.hpp" + #include "openvino/genai/whisper_pipeline.hpp" + + int main(int argc, char* argv[]) try { + if (3 > argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\""); + } + + std::filesystem::path models_path = argv[1]; + std::string wav_file_path = argv[2]; + std::string device = "CPU"; // GPU or NPU can be used as well. + + ov::genai::WhisperPipeline pipeline(models_path, device); + + ov::genai::WhisperGenerationConfig config(models_path / "generation_config.json"); + config.max_new_tokens = 100; + config.language = "<|en|>"; + config.task = "transcribe"; + config.return_timestamps = true; + + // The pipeline expects normalized audio with a sampling rate of 16kHz. 
+ ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path); + auto result = pipeline.generate(raw_speech, config); + + std::cout << result << "\n"; + + for (auto& chunk : *result.chunks) { + std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n"; + } + + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) { + } + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) { + } + return EXIT_FAILURE; + } + + For more information, refer to the + `C++ sample `__. + + +.. dropdown:: Using GenAI in Chat Scenario + + For chat scenarios where inputs and outputs represent a conversation, maintaining KVCache + across inputs may prove beneficial. The ``start_chat`` and ``finish_chat`` chat-specific + methods are used to mark a conversation session, as shown in the samples below: .. tab-set:: @@ -50,9 +344,35 @@ will not work with these instructions, make sure to .. code-block:: python - import openvino_genai as ov_genai - pipe = ov_genai.LLMPipeline(model_path, "CPU") - print(pipe.generate("The Sun is yellow because", max_new_tokens=100)) + import openvino_genai + + + def streamer(subword): + print(subword, end='', flush=True) + return False + + + def infer(model_dir: str): + device = 'CPU' # GPU can be used as well. + pipe = openvino_genai.LLMPipeline(model_dir, device) + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + + pipe.start_chat() + while True: + try: + prompt = input('question:\n') + except EOFError: + break + pipe.generate(prompt, config, streamer) + print('\n----------') + pipe.finish_chat() + + + + For more information, refer to the + `Python sample `__. .. tab-item:: C++ :sync: cpp @@ -60,27 +380,250 @@ will not work with these instructions, make sure to .. code-block:: cpp #include "openvino/genai/llm_pipeline.hpp" - #include - int main(int argc, char* argv[]) { - std::string model_path = argv[1]; - ov::genai::LLMPipeline pipe(model_path, "CPU"); - std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(100)); + int main(int argc, char* argv[]) try { + if (2 != argc) { + throw std::runtime_error(std::string{"Usage: "} + argv[0] + " "); + } + std::string prompt; + std::string models_path = argv[1]; + + std::string device = "CPU"; // GPU, NPU can be used as well + ov::genai::LLMPipeline pipe(models_path, device); + + ov::genai::GenerationConfig config; + config.max_new_tokens = 100; + std::function streamer = [](std::string word) { + std::cout << word << std::flush; + return false; + }; + + pipe.start_chat(); + std::cout << "question:\n"; + while (std::getline(std::cin, prompt)) { + pipe.generate(prompt, config, streamer); + std::cout << "\n----------\n" + "question:\n"; + } + pipe.finish_chat(); + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; } -The `LLMPipeline` is the main object used for decoding. You can construct it directly from the -folder with the converted model. It will automatically load the main model, tokenizer, detokenizer, -and the default generation configuration. 
-Once the model is exported from Hugging Face Optimum-Intel, it already contains all the information -necessary for execution, including the tokenizer/detokenizer and the generation config, ensuring that -its results match those generated by Hugging Face. + For more information, refer to the + `C++ sample `__ + + +.. dropdown:: Using GenAI with Vision Language Models + + OpenVINO GenAI introduces the ``openvino_genai.VLMPipeline`` pipeline for + inference of multimodal text-generation Vision Language Models (VLMs). + With a text prompt and an image as input, VLMPipeline can generate text using + models such as LLava or MiniCPM-V. See the chat scenario presented + in the samples below: + + .. tab-set:: + + .. tab-item:: Python + :sync: py + + .. code-block:: python + + import numpy as np + import openvino_genai + from PIL import Image + from openvino import Tensor + from pathlib import Path + + + def streamer(subword: str) -> bool: + print(subword, end='', flush=True) + + + def read_image(path: str) -> Tensor: + pic = Image.open(path).convert("RGB") + image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8) + return Tensor(image_data) + + + def read_images(path: str) -> list[Tensor]: + entry = Path(path) + if entry.is_dir(): + return [read_image(str(file)) for file in sorted(entry.iterdir())] + return [read_image(path)] + + + def infer(model_dir: str, image_dir: str): + rgbs = read_images(image_dir) + device = 'CPU' # GPU can be used as well. + enable_compile_cache = dict() + if "GPU" == device: + enable_compile_cache["CACHE_DIR"] = "vlm_cache" + pipe = openvino_genai.VLMPipeline(model_dir, device, **enable_compile_cache) + + config = openvino_genai.GenerationConfig() + config.max_new_tokens = 100 + + pipe.start_chat() + prompt = input('question:\n') + pipe.generate(prompt, images=rgbs, generation_config=config, streamer=streamer) + + while True: + try: + prompt = input("\n----------\n" + "question:\n") + except EOFError: + break + pipe.generate(prompt, generation_config=config, streamer=streamer) + pipe.finish_chat() + + + For more information, refer to the + `Python sample `__. + + .. tab-item:: C++ + :sync: cpp + + .. code-block:: cpp + + #include "load_image.hpp" + #include + #include + + bool print_subword(std::string&& subword) { + return !(std::cout << subword << std::flush); + } + + int main(int argc, char* argv[]) try { + if (3 != argc) { + throw std::runtime_error(std::string{"Usage "} + argv[0] + " "); + } + + std::vector rgbs = utils::load_images(argv[2]); + + std::string device = "CPU"; // GPU can be used as well. 
+ ov::AnyMap enable_compile_cache; + if ("GPU" == device) { + enable_compile_cache.insert({ov::cache_dir("vlm_cache")}); + } + ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache); + + ov::genai::GenerationConfig generation_config; + generation_config.max_new_tokens = 100; + + std::string prompt; + + pipe.start_chat(); + std::cout << "question:\n"; + + std::getline(std::cin, prompt); + pipe.generate(prompt, + ov::genai::images(rgbs), + ov::genai::generation_config(generation_config), + ov::genai::streamer(print_subword)); + std::cout << "\n----------\n" + "question:\n"; + while (std::getline(std::cin, prompt)) { + pipe.generate(prompt, + ov::genai::generation_config(generation_config), + ov::genai::streamer(print_subword)); + std::cout << "\n----------\n" + "question:\n"; + } + pipe.finish_chat(); + } catch (const std::exception& error) { + try { + std::cerr << error.what() << '\n'; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } catch (...) { + try { + std::cerr << "Non-exception object thrown\n"; + } catch (const std::ios_base::failure&) {} + return EXIT_FAILURE; + } + + + For more information, refer to the + `C++ sample `__ + + +| + + +Chat-bot use case - step by step +############################################################################################### + +This example will show you how to create a chat-bot functionality, using the ``ov_genai.LLMPipeline`` +and a chat-tuned TinyLlama model. Apart from the basic implementation, it provides additional +optimization methods. + +Although CPU is used as inference device in the samples below, you may choose GPU instead. +Note that tasks such as token selection, tokenization, and detokenization are always handled +by CPU only. Tokenizers, represented as a separate model, are also run on CPU. + +Running the model ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +You start with exporting an LLM model via Hugging Face Optimum-Intel. Note that the precision +of ``int4`` is used, instead of the original ``fp16``, for better performance. The weight +compression is done by NNCF at the model export stage. The exported model contains all the +information necessary for execution, including the tokenizer/detokenizer and the generation +config, ensuring that its results match those generated by Hugging Face. + +The `LLMPipeline` is the main object used for decoding and handles all the necessary steps. +You can construct it directly from the folder with the converted model. + + +.. tab-set:: + + .. tab-item:: Python + :sync: py + + .. code-block:: console + + optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format int4 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" + + .. code-block:: python + + import openvino_genai as ov_genai + pipe = ov_genai.LLMPipeline(model_path, "CPU") + print(pipe.generate("The Sun is yellow because", max_new_tokens=100)) + + .. tab-item:: C++ + :sync: cpp + + .. code-block:: console + + optimum-cli export openvino --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --weight-format int4 --trust-remote-code "TinyLlama-1.1B-Chat-v1.0" + + .. 
code-block:: cpp + + #include "openvino/genai/llm_pipeline.hpp" + #include + + int main(int argc, char* argv[]) { + std::string model_path = argv[1]; + ov::genai::LLMPipeline pipe(model_path, "CPU"); + std::cout << pipe.generate("The Sun is yellow because", ov::genai::max_new_tokens(100)); + } + + Streaming the Output -########################### ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -For more interactive UIs during generation, streaming of model output tokens is supported. See the example -below, where a lambda function outputs words to the console immediately upon generation: +For more interactive UIs during generation, you can stream output tokens. In this example, a +lambda function outputs words to the console immediately upon generation: .. tab-set:: @@ -177,12 +720,10 @@ You can also create your custom streamer for more sophisticated processing: Optimizing Generation with Grouped Beam Search -####################################################### ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Leverage grouped beam search decoding and configure generation_config for better text generation -quality and efficient batch processing in GenAI applications. - -Specify generation_config to use grouped beam search: +For better text generation quality and more efficient batch processing, specify +``generation_config`` to leverage grouped beam search decoding. .. tab-set:: @@ -218,22 +759,19 @@ Specify generation_config to use grouped beam search: cout << pipe.generate("The Sun is yellow because", config); } + Efficient Text Generation via Speculative Decoding -################################################## +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ Speculative decoding (or assisted-generation) enables faster token generation -when an additional smaller draft model is used alongside the main model. -The draft model predicts the next K tokens one by one in an autoregressive manner, -while the main model validates these predictions and corrects them if necessary. +when an additional smaller draft model is used alongside the main model. This reduces the +number of infer requests to the main model, increasing performance. -Each predicted token is compared, and when there is a difference between the draft and -main model, the last token predicted by the main model is kept. Then, the draft -model acquires this token and tries prediction of the next K tokens, -thus repeating the cycle. +The draft model predicts the next K tokens one by one in an autoregressive manner. The main +model validates these predictions and corrects them if necessary - in case of +a discrepancy, the main model prediction is used. Then, the draft model acquires this token and +runs prediction of the next K tokens, thus repeating the cycle. -This method eliminates the need for multiple infer requests to the main model, -which results in increased performance. Its implementation in the pipeline is -shown in the code samples below: .. 
tab-set:: @@ -265,7 +803,7 @@ shown in the code samples below: config.max_new_tokens = 100 config.num_assistant_tokens = 5 - pipe.generate(prompt, config, streamer) + pipe.generate("The Sun is yellow because", config, streamer) For more information, refer to the @@ -310,7 +848,7 @@ shown in the code samples below: return false; }; - pipe.generate(prompt, config, streamer); + pipe.generate("The Sun is yellow because", config, streamer); } catch (const std::exception& error) { try { std::cerr << error.what() << '\n'; @@ -327,10 +865,18 @@ shown in the code samples below: For more information, refer to the `C++ sample `__ + + + + + + + Comparing with Hugging Face Results ####################################### -Compare and analyze results with those generated by Hugging Face models. +You can compare the results of the above example with those generated by Hugging Face models by +running the following code: .. tab-set:: @@ -358,30 +904,35 @@ Compare and analyze results with those generated by Hugging Face models. assert hf_output == ov_output -GenAI API -####################################### -OpenVINO GenAI Flavor includes the following API: -* generation_config - defines a configuration class for text generation, enabling customization of the generation process such as the maximum length of the generated text, whether to ignore end-of-sentence tokens, and the specifics of the decoding strategy (greedy, beam search, or multinomial sampling). -* llm_pipeline - provides classes and utilities for text generation, including a pipeline for processing inputs, generating text, and managing outputs with configurable options. -* streamer_base - an abstract base class for creating streamers. -* tokenizer - the tokenizer class for text encoding and decoding. +GenAI API +####################################### +The use case described here uses the following OpenVINO GenAI API methods: + +* generation_config - defines a configuration class for text generation, + enabling customization of the generation process such as the maximum length of + the generated text, whether to ignore end-of-sentence tokens, and the specifics + of the decoding strategy (greedy, beam search, or multinomial sampling). +* llm_pipeline - provides classes and utilities for processing inputs, + text generation, and managing outputs with configurable options. +* streamer_base - an abstract base class for creating streamers. +* tokenizer - the tokenizer class for text encoding and decoding. * visibility - controls the visibility of the GenAI library. -Learn more in the `GenAI API reference `__. +Learn more from the `GenAI API reference `__. Additional Resources #################### * `OpenVINO GenAI Repo `__ * `OpenVINO GenAI Samples `__ +* A Jupyter notebook demonstrating + `Visual-language assistant with MiniCPM-V2 and OpenVINO `__ * `OpenVINO Tokenizers `__ * `Neural Network Compression Framework `__ - - diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst b/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst deleted file mode 100644 index 245a2648aab491..00000000000000 --- a/docs/articles_en/learn-openvino/llm_inference_guide/genai-guide/genai-use-cases.rst +++ /dev/null @@ -1,563 +0,0 @@ -GenAI Use Cases -===================== - -This article provides several use case scenarios for Generative AI model -inference. The applications presented in the code samples below -only require minimal configuration, like setting an inference device. 
Feel free -to explore and modify the source code as you need. - - -Using GenAI for Text-to-Image Generation -######################################## - -Examples below demonstrate inference on text-to-image models, like Stable Diffusion -1.5, 2.1, and LCM, with a text prompt as input. The :ref:`main.cpp ` -sample shows basic usage of the ``Text2ImagePipeline`` pipeline. -:ref:`lora.cpp ` shows how to apply LoRA adapters to the pipeline. - - -.. tab-set:: - - .. tab-item:: Python - :sync: python - - .. tab-set:: - - .. tab-item:: main.py - :name: mainpy - - .. code-block:: python - - import openvino_genai - from PIL import Image - import numpy as np - - class Generator(openvino_genai.Generator): - def __init__(self, seed, mu=0.0, sigma=1.0): - openvino_genai.Generator.__init__(self) - np.random.seed(seed) - self.mu = mu - self.sigma = sigma - - def next(self): - return np.random.normal(self.mu, self.sigma) - - - def infer(model_dir: str, prompt: str): - device = 'CPU' # GPU can be used as well - random_generator = Generator(42) - pipe = openvino_genai.Text2ImagePipeline(model_dir, device) - image_tensor = pipe.generate( - prompt, - width=512, - height=512, - num_inference_steps=20, - num_images_per_prompt=1, - random_generator=random_generator - ) - - image = Image.fromarray(image_tensor.data[0]) - image.save("image.bmp") - - .. tab-item:: LoRA.py - :name: lorapy - - .. code-block:: python - - import openvino as ov - import openvino_genai - import numpy as np - import sys - - - class Generator(openvino_genai.Generator): - def __init__(self, seed, mu=0.0, sigma=1.0): - openvino_genai.Generator.__init__(self) - np.random.seed(seed) - self.mu = mu - self.sigma = sigma - - def next(self): - return np.random.normal(self.mu, self.sigma) - - - def image_write(path: str, image_tensor: ov.Tensor): - from PIL import Image - image = Image.fromarray(image_tensor.data[0]) - image.save(path) - - - def infer(models_path: str, prompt: str): - prompt = "cyberpunk cityscape like Tokyo New York with tall buildings at dusk golden hour cinematic lighting" - - device = "CPU" # GPU, NPU can be used as well - adapter_config = openvino_genai.AdapterConfig() - - for i in range(int(len(adapters) / 2)): - adapter = openvino_genai.Adapter(adapters[2 * i]) - alpha = float(adapters[2 * i + 1]) - adapter_config.add(adapter, alpha) - - pipe = openvino_genai.Text2ImagePipeline(models_path, device, adapters=adapter_config) - print("Generating image with LoRA adapters applied, resulting image will be in lora.bmp") - image = pipe.generate(prompt, - random_generator=Generator(42), - width=512, - height=896, - num_inference_steps=20) - - image_write("lora.bmp", image) - print("Generating image without LoRA adapters applied, resulting image will be in baseline.bmp") - image = pipe.generate(prompt, - adapters=openvino_genai.AdapterConfig(), - random_generator=Generator(42), - width=512, - height=896, - num_inference_steps=20 - ) - image_write("baseline.bmp", image) - - For more information, refer to the - `Python sample `__ - - .. tab-item:: C++ - :sync: cpp - - .. tab-set:: - - .. tab-item:: main.cpp - :name: maincpp - - .. 
code-block:: cpp - - #include "openvino/genai/text2image/pipeline.hpp" - - #include "imwrite.hpp" - - int32_t main(int32_t argc, char* argv[]) try { - OPENVINO_ASSERT(argc == 3, "Usage: ", argv[0], " ''"); - - const std::string models_path = argv[1], prompt = argv[2]; - const std::string device = "CPU"; // GPU, NPU can be used as well - - ov::genai::Text2ImagePipeline pipe(models_path, device); - ov::Tensor image = pipe.generate(prompt, - ov::genai::width(512), - ov::genai::height(512), - ov::genai::num_inference_steps(20), - ov::genai::num_images_per_prompt(1)); - - imwrite("image_%d.bmp", image, true); - - return EXIT_SUCCESS; - } catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } - - .. tab-item:: LoRA.cpp - :name: loracpp - - .. code-block:: cpp - - #include "openvino/genai/text2image/pipeline.hpp" - - #include "imwrite.hpp" - - int32_t main(int32_t argc, char* argv[]) try { - OPENVINO_ASSERT(argc >= 3 && (argc - 3) % 2 == 0, "Usage: ", argv[0], " '' [ ...]]"); - - const std::string models_path = argv[1], prompt = argv[2]; - const std::string device = "CPU"; // GPU, NPU can be used as well - - ov::genai::AdapterConfig adapter_config; - for(size_t i = 0; i < (argc - 3)/2; ++i) { - ov::genai::Adapter adapter(argv[3 + 2*i]); - float alpha = std::atof(argv[3 + 2*i + 1]); - adapter_config.add(adapter, alpha); - } - - ov::genai::Text2ImagePipeline pipe(models_path, device, ov::genai::adapters(adapter_config)); - - std::cout << "Generating image with LoRA adapters applied, resulting image will be in lora.bmp\n"; - ov::Tensor image = pipe.generate(prompt, - ov::genai::random_generator(std::make_shared(42)), - ov::genai::width(512), - ov::genai::height(896), - ov::genai::num_inference_steps(20)); - imwrite("lora.bmp", image, true); - - std::cout << "Generating image without LoRA adapters applied, resulting image will be in baseline.bmp\n"; - image = pipe.generate(prompt, - ov::genai::adapters(), - ov::genai::random_generator(std::make_shared(42)), - ov::genai::width(512), - ov::genai::height(896), - ov::genai::num_inference_steps(20)); - imwrite("baseline.bmp", image, true); - - return EXIT_SUCCESS; - } catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } - - - For more information, refer to the - `C++ sample `__ - - - - - -Using GenAI in Speech Recognition -################################# - - -The application, shown in code samples below, performs inference on speech -recognition Whisper Models. The samples include the ``WhisperPipeline`` class -and use audio files in WAV format at a sampling rate of 16 kHz as input. - -.. tab-set:: - - .. tab-item:: Python - :sync: cpp - - .. code-block:: python - - import openvino_genai - import librosa - - - def read_wav(filepath): - raw_speech, samplerate = librosa.load(filepath, sr=16000) - return raw_speech.tolist() - - - def infer(model_dir: str, wav_file_path: str): - device = "CPU" # GPU or NPU can be used as well. - pipe = openvino_genai.WhisperPipeline(model_dir, device) - - # The pipeline expects normalized audio with a sampling rate of 16kHz. 
- raw_speech = read_wav(wav_file_path) - result = pipe.generate( - raw_speech, - max_new_tokens=100, - language="<|en|>", - task="transcribe", - return_timestamps=True, - ) - - print(result) - - for chunk in result.chunks: - print(f"timestamps: [{chunk.start_ts}, {chunk.end_ts}] text: {chunk.text}") - - - For more information, refer to the - `Python sample `__. - - .. tab-item:: C++ - :sync: cpp - - .. code-block:: cpp - - #include "audio_utils.hpp" - #include "openvino/genai/whisper_pipeline.hpp" - - int main(int argc, char* argv[]) try { - if (3 > argc) { - throw std::runtime_error(std::string{"Usage: "} + argv[0] + " \"\""); - } - - std::filesystem::path models_path = argv[1]; - std::string wav_file_path = argv[2]; - std::string device = "CPU"; // GPU or NPU can be used as well. - - ov::genai::WhisperPipeline pipeline(models_path, device); - - ov::genai::WhisperGenerationConfig config(models_path / "generation_config.json"); - config.max_new_tokens = 100; - config.language = "<|en|>"; - config.task = "transcribe"; - config.return_timestamps = true; - - // The pipeline expects normalized audio with a sampling rate of 16kHz. - ov::genai::RawSpeechInput raw_speech = utils::audio::read_wav(wav_file_path); - auto result = pipeline.generate(raw_speech, config); - - std::cout << result << "\n"; - - for (auto& chunk : *result.chunks) { - std::cout << "timestamps: [" << chunk.start_ts << ", " << chunk.end_ts << "] text: " << chunk.text << "\n"; - } - - } catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) { - } - return EXIT_FAILURE; - } catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) { - } - return EXIT_FAILURE; - } - - - For more information, refer to the - `C++ sample `__. - - -Using GenAI in Chat Scenario -############################ - -For chat scenarios where inputs and outputs represent a conversation, maintaining KVCache across inputs -may prove beneficial. The ``start_chat`` and ``finish_chat`` chat-specific methods are used to -mark a conversation session, as shown in the samples below: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: python - - import openvino_genai - - - def streamer(subword): - print(subword, end='', flush=True) - return False - - - def infer(model_dir: str): - device = 'CPU' # GPU can be used as well. - pipe = openvino_genai.LLMPipeline(model_dir, device) - - config = openvino_genai.GenerationConfig() - config.max_new_tokens = 100 - - pipe.start_chat() - while True: - try: - prompt = input('question:\n') - except EOFError: - break - pipe.generate(prompt, config, streamer) - print('\n----------') - pipe.finish_chat() - - - - For more information, refer to the - `Python sample `__. - - .. tab-item:: C++ - :sync: cpp - - .. 
code-block:: cpp - - #include "openvino/genai/llm_pipeline.hpp" - - int main(int argc, char* argv[]) try { - if (2 != argc) { - throw std::runtime_error(std::string{"Usage: "} + argv[0] + " "); - } - std::string prompt; - std::string models_path = argv[1]; - - std::string device = "CPU"; // GPU, NPU can be used as well - ov::genai::LLMPipeline pipe(models_path, device); - - ov::genai::GenerationConfig config; - config.max_new_tokens = 100; - std::function streamer = [](std::string word) { - std::cout << word << std::flush; - return false; - }; - - pipe.start_chat(); - std::cout << "question:\n"; - while (std::getline(std::cin, prompt)) { - pipe.generate(prompt, config, streamer); - std::cout << "\n----------\n" - "question:\n"; - } - pipe.finish_chat(); - } catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } - - - For more information, refer to the - `C++ sample `__ - - -Using GenAI with Vision Language Models -####################################### - -OpenVINO GenAI introduces the ``openvino_genai.VLMPipeline`` pipeline for -inference of multimodal text-generation Vision Language Models (VLMs). -With a text prompt and an image as input, VLMPipeline can generate text using -models such as LLava or MiniCPM-V. See the chat scenario presented -in the samples below: - -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. code-block:: python - - import numpy as np - import openvino_genai - from PIL import Image - from openvino import Tensor - from pathlib import Path - - - def streamer(subword: str) -> bool: - print(subword, end='', flush=True) - - - def read_image(path: str) -> Tensor: - pic = Image.open(path).convert("RGB") - image_data = np.array(pic.getdata()).reshape(1, pic.size[1], pic.size[0], 3).astype(np.uint8) - return Tensor(image_data) - - - def read_images(path: str) -> list[Tensor]: - entry = Path(path) - if entry.is_dir(): - return [read_image(str(file)) for file in sorted(entry.iterdir())] - return [read_image(path)] - - - def infer(model_dir: str, image_dir: str): - rgbs = read_images(image_dir) - device = 'CPU' # GPU can be used as well. - enable_compile_cache = dict() - if "GPU" == device: - enable_compile_cache["CACHE_DIR"] = "vlm_cache" - pipe = openvino_genai.VLMPipeline(model_dir, device, **enable_compile_cache) - - config = openvino_genai.GenerationConfig() - config.max_new_tokens = 100 - - pipe.start_chat() - prompt = input('question:\n') - pipe.generate(prompt, images=rgbs, generation_config=config, streamer=streamer) - - while True: - try: - prompt = input("\n----------\n" - "question:\n") - except EOFError: - break - pipe.generate(prompt, generation_config=config, streamer=streamer) - pipe.finish_chat() - - - For more information, refer to the - `Python sample `__. - - .. tab-item:: C++ - :sync: cpp - - .. code-block:: cpp - - #include "load_image.hpp" - #include - #include - - bool print_subword(std::string&& subword) { - return !(std::cout << subword << std::flush); - } - - int main(int argc, char* argv[]) try { - if (3 != argc) { - throw std::runtime_error(std::string{"Usage "} + argv[0] + " "); - } - - std::vector rgbs = utils::load_images(argv[2]); - - std::string device = "CPU"; // GPU can be used as well. 
- ov::AnyMap enable_compile_cache; - if ("GPU" == device) { - enable_compile_cache.insert({ov::cache_dir("vlm_cache")}); - } - ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache); - - ov::genai::GenerationConfig generation_config; - generation_config.max_new_tokens = 100; - - std::string prompt; - - pipe.start_chat(); - std::cout << "question:\n"; - - std::getline(std::cin, prompt); - pipe.generate(prompt, - ov::genai::images(rgbs), - ov::genai::generation_config(generation_config), - ov::genai::streamer(print_subword)); - std::cout << "\n----------\n" - "question:\n"; - while (std::getline(std::cin, prompt)) { - pipe.generate(prompt, - ov::genai::generation_config(generation_config), - ov::genai::streamer(print_subword)); - std::cout << "\n----------\n" - "question:\n"; - } - pipe.finish_chat(); - } catch (const std::exception& error) { - try { - std::cerr << error.what() << '\n'; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } catch (...) { - try { - std::cerr << "Non-exception object thrown\n"; - } catch (const std::ios_base::failure&) {} - return EXIT_FAILURE; - } - - - For more information, refer to the - `C++ sample `__ - -Additional Resources -##################### - -* :doc:`Install OpenVINO GenAI <../../../get-started/install-openvino/install-openvino-genai>` -* `OpenVINO GenAI Repo `__ -* `OpenVINO GenAI Samples `__ -* A Jupyter notebook demonstrating - `Visual-language assistant with MiniCPM-V2 and OpenVINO `__ -* `OpenVINO Tokenizers `__ diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst index a26b670b5314d0..4fec1acd23e6a7 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-hf.rst @@ -1,4 +1,4 @@ -Run LLMs with Hugging Face and Optimum Intel +Inference with Optimum Intel =============================================================================================== .. meta:: @@ -276,9 +276,10 @@ includes **Dynamic quantization** of activations of 4/8-bit quantized MatMuls an ov_config={"KV_CACHE_PRECISION": "u8", "DYNAMIC_QUANTIZATION_GROUP_SIZE": "32", "PERFORMANCE_HINT": "LATENCY"} ) -.. note:: + .. note:: + Currently, for KV-cache quantization, GPU ignores the DYNAMIC_QUANTIZATION_GROUP_SIZE property, using ``group_size = head_size``. Additionally, it does not support the ``get_state()`` and ``set_state()`` APIs when KV-cache quantization is enabled. - Currently, both Dynamic quantization and KV-cache quantization are available for CPU device. + For GPU, KV-cache quantization is enabled by default on platforms without XMX support, and can be disabled by setting KV_CACHE_PRECISION to ``undefined``. 
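For reference, a minimal sketch of how these properties are typically passed through Optimum Intel is shown below. The model directory is a hypothetical placeholder, and the exact ``ov_config`` entries should be adjusted to the target device as described in the note above (for example, GPU ignores ``DYNAMIC_QUANTIZATION_GROUP_SIZE`` for KV-cache quantization):

.. code-block:: python

    from optimum.intel import OVModelForCausalLM
    from transformers import AutoTokenizer

    model_dir = "llm_int4_ov"  # hypothetical path to a model exported for OpenVINO

    # The ov_config entries mirror the properties discussed above.
    model = OVModelForCausalLM.from_pretrained(
        model_dir,
        device="CPU",  # GPU can be used as well, with the caveats from the note above
        ov_config={
            "KV_CACHE_PRECISION": "u8",
            "DYNAMIC_QUANTIZATION_GROUP_SIZE": "32",
            "PERFORMANCE_HINT": "LATENCY",
        },
    )

    tokenizer = AutoTokenizer.from_pretrained(model_dir)
    inputs = tokenizer("What is OpenVINO?", return_tensors="pt")
    print(tokenizer.decode(model.generate(**inputs, max_new_tokens=32)[0], skip_special_tokens=True))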
Working with Models Tuned with LoRA diff --git a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-native-ov.rst b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-native-ov.rst index 2476a0423e30e1..d33ae05f68f462 100644 --- a/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-native-ov.rst +++ b/docs/articles_en/learn-openvino/llm_inference_guide/llm-inference-native-ov.rst @@ -1,4 +1,4 @@ -Run LLM Inference on Native OpenVINO (not recommended) +Generative AI with Base OpenVINO (not recommended) =============================================================================================== To run Generative AI models using native OpenVINO APIs you need to follow regular diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst index 7b135fa7ff0b14..436d383ebf787e 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst @@ -146,6 +146,8 @@ offer a limited set of supported OpenVINO features. ov::intel_npu::turbo ov::intel_npu::tiles ov::intel_npu::max_tiles + ov::intel_npu::bypass_umd_caching + ov::intel_npu::defer_weights_load .. tab-item:: Read-only properties @@ -168,7 +170,6 @@ offer a limited set of supported OpenVINO features. ov::intel_npu::device_alloc_mem_size ov::intel_npu::device_total_mem_size ov::intel_npu::driver_version - ov::intel_npu::bypass_umd_caching .. note:: diff --git a/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application/model-representation.rst b/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application/model-representation.rst index 6ab924a61ef150..259f605d46c2f7 100644 --- a/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application/model-representation.rst +++ b/docs/articles_en/openvino-workflow/running-inference/integrate-openvino-with-your-application/model-representation.rst @@ -247,57 +247,50 @@ OpenVINO™ provides several debug capabilities: * Model can be visualized to image from the xDot format: -.. tab-set:: - - .. tab-item:: Python - :sync: py - - .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.py - :language: python - :fragment: [ov:visualize] - - .. tab-item:: C++ - :sync: cpp - - .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.cpp - :language: cpp - :fragment: [ov:visualize] - - -.. code-block:: sh + .. tab-set:: - `ov::pass::VisualizeTree` can be parametrized via environment variables: + .. tab-item:: Python + :sync: py - OV_VISUALIZE_TREE_OUTPUT_SHAPES=1 - visualize shapes + .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.py + :language: python + :fragment: [ov:visualize] - OV_VISUALIZE_TREE_OUTPUT_TYPES=1 - visualize types + .. tab-item:: C++ + :sync: cpp - OV_VISUALIZE_TREE_MIN_MAX_DENORMAL=1 - pretty denormal values + .. 
doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.cpp + :language: cpp + :fragment: [ov:visualize] - OV_VISUALIZE_TREE_RUNTIME_INFO=1 - print runtime information - OV_VISUALIZE_TREE_IO=1 - print I/O ports + ``ov::pass::VisualizeTree`` can be parametrized via environment variables: - OV_VISUALIZE_TREE_MEMBERS_NAME=1 - print member names + * ``OV_VISUALIZE_TREE_OUTPUT_SHAPES=1`` - visualize shapes + * ``OV_VISUALIZE_TREE_OUTPUT_TYPES=1`` - visualize types + * ``OV_VISUALIZE_TREE_MIN_MAX_DENORMAL=1`` - pretty denormal values + * ``OV_VISUALIZE_TREE_RUNTIME_INFO=1`` - print runtime information + * ``OV_VISUALIZE_TREE_IO=1`` - print I/O ports + * ``OV_VISUALIZE_TREE_MEMBERS_NAME=1`` - print member names * Also model can be serialized to IR: -.. tab-set:: + .. tab-set:: - .. tab-item:: Python - :sync: py + .. tab-item:: Python + :sync: py - .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.py - :language: python - :fragment: [ov:serialize] + .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.py + :language: python + :fragment: [ov:serialize] - .. tab-item:: C++ - :sync: cpp + .. tab-item:: C++ + :sync: cpp - .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.cpp - :language: cpp - :fragment: [ov:serialize] + .. doxygensnippet:: docs/articles_en/assets/snippets/ov_model_snippets.cpp + :language: cpp + :fragment: [ov:serialize] Additional Resources @@ -306,5 +299,3 @@ Additional Resources * :doc:`Available Operation Sets <../../../documentation/openvino-ir-format/operation-sets/available-opsets>`. * :doc:`OpenVINO™ Runtime Extensibility Developer Guide <../../../documentation/openvino-extensibility>`. * :doc:`Transformations Developer Guide <../../../documentation/openvino-extensibility/transformation-api>`. 
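As a quick illustration of the debug capabilities listed above, the sketch below (assuming the Python ``openvino.runtime.passes`` bindings and placeholder file names) dumps a model to xDot with ``VisualizeTree`` and then serializes it back to IR; the ``OV_VISUALIZE_TREE_*`` variables must be set in the environment before the pass runs:

.. code-block:: python

    import os
    import openvino as ov
    import openvino.runtime.passes as passes

    # Optional VisualizeTree parametrization (see the variable list above).
    os.environ["OV_VISUALIZE_TREE_OUTPUT_SHAPES"] = "1"
    os.environ["OV_VISUALIZE_TREE_OUTPUT_TYPES"] = "1"

    core = ov.Core()
    model = core.read_model("model.xml")  # placeholder IR path

    pm = passes.Manager()
    pm.register_pass(passes.VisualizeTree(file_name="model.dot"))  # xDot output, e.g. for graphviz
    pm.run_passes(model)

    # Serialize the (possibly transformed) model back to IR.
    ov.serialize(model, "serialized.xml", "serialized.bin")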
- - diff --git a/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms-genai.json b/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms-genai.json index 0d53c3813542d2..0de8f188e7de34 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms-genai.json +++ b/docs/sphinx_setup/_static/benchmarks_files/data/graph-data-ovms-genai.json @@ -6,7 +6,7 @@ "whats_new_model": false, "PlatformType": "Server Platforms (Intel® Xeon®)", "Parameters": { - "OpenVINO Model Server": { + "Ovms": { "Precisions": [ { "Throughput": { @@ -30,7 +30,7 @@ } ] }, - "vLLM with OpenVINO backend": { + "Vllm": { "Precisions": [ { "Throughput": { @@ -63,7 +63,7 @@ "whats_new_model": false, "PlatformType": "Server Platforms (Intel® Xeon®)", "Parameters": { - "OpenVINO Model Server": { + "Ovms": { "Precisions": [ { "Throughput": { @@ -87,7 +87,7 @@ } ] }, - "vLLM with OpenVINO backend": { + "Vllm": { "Precisions": [ { "Throughput": { @@ -120,7 +120,7 @@ "whats_new_model": false, "PlatformType": "Server Platforms (Intel® Xeon®)", "Parameters": { - "OpenVINO Model Server": { + "Ovms": { "Precisions": [ { "Throughput": { @@ -144,7 +144,7 @@ } ] }, - "vLLM with OpenVINO backend": { + "Vllm": { "Precisions": [ { "Throughput": { @@ -177,7 +177,7 @@ "whats_new_model": true, "PlatformType": "Server Platforms (Intel® Xeon®)", "Parameters": { - "OpenVINO Model Server": { + "Ovms": { "Precisions": [ { "Throughput": { @@ -201,7 +201,7 @@ } ] }, - "vLLM with OpenVINO backend": { + "Vllm": { "Precisions": [ { "Throughput": { @@ -228,7 +228,7 @@ "whats_new_model": true, "PlatformType": "Server Platforms (Intel® Xeon®)", "Parameters": { - "OpenVINO Model Server": { + "Ovms": { "Precisions": [ { "Throughput": { @@ -252,7 +252,7 @@ } ] }, - "vLLM with OpenVINO backend": { + "Vllm": { "Precisions": [ { "Throughput": { @@ -283,7 +283,7 @@ "whats_new_model": true, "PlatformType": "Server Platforms (Intel® Xeon®)", "Parameters": { - "OpenVINO Model Server": { + "Ovms": { "Precisions": [ { "Throughput": { @@ -307,7 +307,7 @@ } ] }, - "vLLM with OpenVINO backend": { + "Vllm": { "Precisions": [ { "Throughput": { @@ -338,7 +338,7 @@ "whats_new_model": false, "PlatformType": "Server Platforms (Intel® Xeon®)", "Parameters": { - "OpenVINO Model Server": { + "Ovms": { "Precisions": [ { "Throughput": { @@ -362,7 +362,7 @@ } ] }, - "vLLM with OpenVINO backend": { + "Vllm": { "Precisions": [ { "Throughput": { @@ -393,7 +393,7 @@ "whats_new_model": false, "PlatformType": "Server Platforms (Intel® Xeon®)", "Parameters": { - "OpenVINO Model Server": { + "Ovms": { "Precisions": [ { "Throughput": { @@ -417,7 +417,7 @@ } ] }, - "vLLM with OpenVINO backend": { + "Vllm": { "Precisions": [ { "Throughput": { @@ -450,7 +450,7 @@ "whats_new_model": false, "PlatformType": "Server Platforms (Intel® Xeon®)", "Parameters": { - "OpenVINO Model Server": { + "Ovms": { "Precisions": [ { "Throughput": { @@ -474,7 +474,7 @@ } ] }, - "vLLM with OpenVINO backend": { + "Vllm": { "Precisions": [ { "Throughput": { diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv index 0d16c5c4998329..fa5ae359fa45c0 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv +++ b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-155H.csv @@ -1,156 +1,95 @@ -Topology,Precision,Input Size,max rss memory,1st latency (ms),2nd latency (ms),2nd tok/sec -opt-125m-gptq,INT4-MIXED,32,965.9,29,7.7,129.87 
-opt-125m-gptq,INT4-MIXED,1024,1507.9,113.1,7.8,128.21 -tiny-llama-1.1b-chat,INT4-MIXED,32,1831.8,46.5,16.7,59.88 -tiny-llama-1.1b-chat,INT4-MIXED,1024,1806.3,635,17.8,56.18 -qwen2-0.5b,INT4-MIXED,32,2551.7,61.4,18.3,54.64 -qwen2-0.5b,INT4-MIXED,1024,2976.6,356.1,19.2,52.08 -tiny-llama-1.1b-chat,INT8-CW,32,1987.4,56,21.6,46.30 -tiny-llama-1.1b-chat,INT8-CW,1024,2209.1,772.7,22.6,44.25 -qwen2-0.5b,INT8-CW,32,2484.9,57.3,22.8,43.86 -qwen2-0.5b,INT8-CW,1024,3102.5,407.1,23.9,41.84 -qwen2-1.5b,INT4-MIXED,32,4265.2,71.7,25.5,39.22 -qwen2-1.5b,INT4-MIXED,1024,4884.5,862.4,26.8,37.31 -dolly-v2-3b,INT4-MIXED,32,2401.3,89.6,27.5,36.36 -red-pajama-incite-chat-3b-v1,INT4-MIXED,32,2511.5,78.6,28.2,35.46 -phi-2,INT4-MIXED,32,2279.5,95.7,29.1,34.36 -minicpm-1b-sft,INT4-MIXED,31,2759.9,104.4,30.9,32.36 -phi-2,INT4-MIXED,32,2620.1,100.8,31,32.26 -stable-zephyr-3b-dpo,INT4-MIXED,30,2636.5,86.8,31.7,31.55 -dolly-v2-3b,INT4-MIXED,1024,3137.1,1782.9,32.2,31.06 -red-pajama-incite-chat-3b-v1,INT4-MIXED,1020,3118.5,1831.7,33.3,30.03 -red-pajama-incite-chat-3b-v1,INT4-MIXED,1024,2862.7,1821.1,33.5,29.85 -qwen2-1.5b,INT8-CW,32,4831.2,87,33.8,29.59 -opt-2.7b,INT4-MIXED,31,2898.3,73.2,33.9,29.50 -phi-2,INT4-MIXED,1024,2797.4,1887,34,29.41 -orca-mini-3b,INT4-MIXED,32,2877.8,100.3,35,28.57 -stablelm-3b-4e1t,INT4-MIXED,32,2669.4,94.7,35.3,28.33 -qwen2-1.5b,INT8-CW,1024,5455.8,1047.6,35.3,28.33 -minicpm-1b-sft,INT8-CW,31,3104.1,103.5,35.3,28.33 -phi-2,INT4-MIXED,1024,3039.8,1917.4,35.9,27.86 -stable-zephyr-3b-dpo,INT4-MIXED,946,3411.4,1695,37,27.03 -gemma-2b-it,INT4-MIXED,32,3991.7,116.1,37.9,26.39 -opt-2.7b,INT4-MIXED,937,3617.5,1764.9,38.2,26.18 -phi-3-mini-4k-instruct,INT4-MIXED,31,2935.3,111.6,38.2,26.18 -phi-3-mini-4k-instruct,INT4-MIXED,38,3102.4,134,38.4,26.04 -phi-3-mini-4k-instruct,INT4-MIXED,31,2986.1,114.1,38.9,25.71 -phi-3-mini-4k-instruct,INT4-MIXED,38,2977.4,131.1,39,25.64 -gemma-2b-it,INT4-MIXED,1024,4973.3,1249.2,39.7,25.19 -stablelm-3b-4e1t,INT4-MIXED,1024,3196.9,2045.4,39.9,25.06 -dolly-v2-3b,INT8-CW,32,3490.2,107.4,41.5,24.10 -red-pajama-incite-chat-3b-v1,INT8-CW,32,3457.9,105,42.5,23.53 -opt-2.7b,INT8-CW,31,3686.8,107.5,44.1,22.68 -phi-2,INT8-CW,32,3554.9,116.6,44.1,22.68 -phi-3-mini-4k-instruct,INT4-MIXED,1023,3390.7,2277.1,44.2,22.62 -phi-3-mini-4k-instruct,INT4-MIXED,1061,3643.6,2485,44.4,22.52 -phi-3-mini-4k-instruct,INT4-MIXED,1023,3516.4,2280.9,44.5,22.47 -phi-3-mini-4k-instruct,INT4-MIXED,1061,3537.2,2522.4,44.7,22.37 -orca-mini-3b,INT4-MIXED,1024,3557.3,1898.9,45,22.22 -minicpm-1b-sft,FP16,31,3814.4,97.9,45.4,22.03 -stablelm-3b-4e1t,INT8-CW,32,3486.9,100.5,46.1,21.69 -stable-zephyr-3b-dpo,INT8-CW,30,3516.7,101.9,46.1,21.69 -dolly-v2-3b,INT8-CW,1024,4265.9,2178.6,46.2,21.65 -red-pajama-incite-chat-3b-v1,INT8-CW,1020,3979.1,2219.7,47.2,21.19 -red-pajama-incite-chat-3b-v1,INT8-CW,1024,3975.5,2199.7,47.3,21.14 -opt-2.7b,INT8-CW,937,4358.6,1981.8,48.4,20.66 -phi-2,INT8-CW,1024,4058.1,2280.1,48.9,20.45 -gemma-2b-it,INT8-CW,32,4786.8,119.8,49.4,20.24 -chatglm3-6b,INT4-MIXED,32,4141.5,166.6,49.7,20.12 -stablelm-3b-4e1t,INT8-CW,1024,4054.8,2243.5,50.7,19.72 -stable-zephyr-3b-dpo,INT8-CW,946,4521.8,1816.4,51.3,19.49 -gemma-2b-it,INT8-CW,1024,5810.7,1580,51.3,19.49 -chatglm3-6b,INT4-MIXED,32,4651.4,164.7,51.6,19.38 -chatglm3-6b,INT4-MIXED,1024,4235.1,2818.7,52.3,19.12 -orca-mini-3b,INT8-CW,32,4162,109.2,53.3,18.76 -chatglm3-6b,INT4-MIXED,1024,4783.8,2869,54.4,18.38 -gpt-j-6b,INT4-MIXED,32,4667.3,176.7,56.3,17.76 -chatglm3-6b-gptq,INT4-MIXED,32,5369.4,173.9,58.9,16.98 
-llama-2-7b-chat-hf,INT4-MIXED,32,4280,173.2,60.1,16.64 -phi-3-mini-4k-instruct,INT8-CW,31,4585.1,123,60.5,16.53 -phi-3-mini-4k-instruct,INT8-CW,38,4597,152,60.5,16.53 -chatglm2-6b,INT4-MIXED,32,4847.8,158.7,60.6,16.50 -vicuna-7b-v1.5,INT4-MIXED,32,4476.9,178.2,61.2,16.34 -chatglm3-6b-gptq,INT4-MIXED,1024,5217.6,2863.7,61.3,16.31 -mistral-7b-v0.1,INT4-MIXED,31,4413.6,194,61.7,16.21 -qwen2-7b,INT4-MIXED,32,7044.7,184.4,61.7,16.21 -mistral-7b-v0.1,INT4-MIXED,32,4427.6,193.3,61.8,16.18 -orca-mini-3b,INT8-CW,1024,4821.6,2239.1,62,16.13 -codegen25-7b,INT4-MIXED,32,4687.2,176.2,62.7,15.95 -chatglm2-6b,INT4-MIXED,1024,5165.9,3148,63,15.87 -llama-2-7b-gptq,INT4-MIXED,32,4632.8,175.2,63.4,15.77 -stablelm-7b,INT4-MIXED,32,5219.5,206.3,63.4,15.77 -qwen-7b-chat,INT4-MIXED,32,7805.6,193.8,63.6,15.72 -gpt-j-6b,INT4-MIXED,1024,5314.9,3111.8,63.6,15.72 -qwen2-7b,INT4-MIXED,1024,7716.2,3548.3,64.1,15.60 -llama-3-8b,INT4-MIXED,32,4910.9,204.8,64.7,15.46 -mistral-7b-v0.1,INT4-MIXED,1024,4720.8,3667.1,64.8,15.43 -mistral-7b-v0.1,INT4-MIXED,1007,4704.7,3685.4,64.9,15.41 -llama-3.1-8b,INT4-MIXED,31,4850.3,211.5,64.9,15.41 -phi-3-mini-4k-instruct,INT8-CW,1023,5128.6,2815.2,65.7,15.22 -phi-3-mini-4k-instruct,INT8-CW,1061,5155,3407.9,65.9,15.17 -mistral-7b-v0.1,INT4-MIXED,32,4939.3,192,66.5,15.04 -llama-3-8b,INT4-MIXED,33,4919.4,261.9,67.2,14.88 -llama-2-7b-chat-hf,INT4-MIXED,1024,4948.2,3811,67.3,14.86 -qwen1.5-7b-chat,INT4-MIXED,32,5943.1,180.5,67.7,14.77 -qwen-7b-chat-gptq,INT4-MIXED,32,8057,187,68.1,14.68 -llama-3-8b,INT4-MIXED,32,5503.5,198.4,68.1,14.68 -qwen-7b-chat,INT4-MIXED,32,8091.6,185.9,68.1,14.68 -llama-3-8b,INT4-MIXED,1024,5569.1,3920.5,68.2,14.66 -llama-3.1-8b,INT4-MIXED,31,5358.6,201,68.2,14.66 -stablelm-7b,INT4-MIXED,1020,5804.4,3726.6,68.8,14.53 -llama-3.1-8b,INT4-MIXED,31,5452.6,202.9,68.8,14.53 -llama-2-7b-chat-hf,INT4-MIXED,32,5023,165.7,69,14.49 -llama-3-8b,INT4-MIXED,32,5413.6,202,69.1,14.47 -llama-3-8b,INT4-MIXED,33,5440.4,262.1,69.2,14.45 -codegen25-7b,INT4-MIXED,1024,5434.6,3513.2,69.9,14.31 -mistral-7b-v0.1,INT4-MIXED,1024,5614.9,3819.1,70,14.29 -mistral-7b-v0.1,INT4-MIXED,31,4927.8,205,70.5,14.18 -llama-3-8b,INT4-MIXED,33,5498.9,270.7,70.6,14.16 -llama-3-8b,INT4-MIXED,1025,5577.4,4271.2,70.6,14.16 -llama-2-7b-gptq,INT4-MIXED,1024,5302.2,3529.4,70.7,14.14 -zephyr-7b-beta,INT4-MIXED,32,5212.4,190.6,71.2,14.04 -llama-3-8b,INT4-MIXED,1024,6161.1,3918,71.5,13.99 -llama-3-8b,INT4-MIXED,1025,6098,4441.8,72.3,13.83 -llama-3-8b,INT4-MIXED,1024,6071.7,3972.2,72.4,13.81 -mistral-7b-v0.1,INT4-MIXED,1007,5224.1,4153.4,73.8,13.55 -llama-3-8b,INT4-MIXED,1025,6156.9,4357,73.9,13.53 -zephyr-7b-beta,INT4-MIXED,1024,5511.6,3978,74.4,13.44 -opt-2.7b,FP16,31,9220.3,107.8,74.7,13.39 -dolly-v2-3b,FP16,32,6058.9,109.9,74.7,13.39 -qwen1.5-7b-chat,INT4-MIXED,1024,7063.2,3791.7,75,13.33 -qwen-7b-chat,INT4-MIXED,1024,8919.5,3763.9,75,13.33 -red-pajama-incite-chat-3b-v1,FP16,32,6036.5,107.5,75.9,13.18 -llama-2-7b-chat-hf,INT4-MIXED,1024,5716.8,4231.7,76.2,13.12 -phi-2,FP16,32,6090.1,115.2,77.1,12.97 -stable-zephyr-3b-dpo,FP16,30,6113.1,112.1,78.6,12.72 -qwen-7b-chat,INT4-MIXED,1024,9212.9,3857.4,78.6,12.72 -stablelm-3b-4e1t,FP16,32,6065.4,110.2,78.7,12.71 -opt-2.7b,FP16,937,9733.8,3750.8,78.8,12.69 -dolly-v2-3b,FP16,1024,6615.2,2230.9,79.1,12.64 -red-pajama-incite-chat-3b-v1,FP16,1020,6588.3,2259.4,80.2,12.47 -glm-4-9b,INT4-MIXED,33,6386.2,328,80.4,12.44 -red-pajama-incite-chat-3b-v1,FP16,1024,6570.3,2268.7,80.4,12.44 -baichuan2-7b-chat,INT4-MIXED,32,5977.9,201.7,81,12.35 
-glm-4-9b,INT4-MIXED,32,6389.7,248.1,81,12.35 -phi-2,FP16,1024,6646.2,2406.7,81.4,12.29 -stable-zephyr-3b-dpo,FP16,946,6875.7,1868.2,82.9,12.06 -stablelm-3b-4e1t,FP16,1024,6636.1,2036.9,83,12.05 -chatglm2-6b,INT8-CW,32,6731.8,159.2,84.4,11.85 -glm-4-9b,INT4-MIXED,1025,7061.4,4939.2,85.2,11.74 -qwen-7b-chat-gptq,INT4-MIXED,1024,9175.3,3898,85.3,11.72 -gemma-7b-it,INT4-MIXED,32,7883.9,230.5,86,11.63 -gemma-7b-it,INT4-MIXED,32,8002.6,235,86.1,11.61 -glm-4-9b,INT4-MIXED,1024,7064.9,4411.2,86.2,11.60 -gpt-j-6b,INT8-CW,32,7009.2,176.8,86.4,11.57 -chatglm2-6b,INT8-CW,1024,7050.5,3871.6,86.8,11.52 -chatglm3-6b,INT8-CW,32,6755.9,159,86.8,11.52 -baichuan2-7b-chat,INT4-MIXED,1024,7033.3,4049,88.8,11.26 -chatglm3-6b,INT8-CW,1024,7076.5,3865.9,89.2,11.21 -qwen-7b-chat,INT4-MIXED,32,9245.7,176.3,90,11.11 -gemma-7b-it,INT4-MIXED,1024,9449.4,4305.8,93.2,10.73 -gpt-j-6b,INT8-CW,1024,7672.3,4181.1,93.5,10.70 -gemma-7b-it,INT4-MIXED,1024,9330.5,4222.5,93.7,10.67 -orca-mini-3b,FP16,32,7416.5,122.3,94.7,10.56 -codegen25-7b,INT8-CW,32,7557.6,170.7,98.4,10.16 -qwen-7b-chat,INT4-MIXED,1024,10371.1,4271.7,98.9,10.11 -llama-2-7b-chat-hf,INT8-CW,32,7390.6,171.6,99.9,10.01 +opt-125m-gptq,INT4-MIXED,32,1116,25.8,8.1,123.5 +opt-125m-gptq,INT4-MIXED,1024,1187.1,75.2,8.2,122.0 +qwen2-0.5b,INT4-MIXED,32,1587.4,45.1,15.4,64.9 +qwen2-0.5b,INT4-MIXED,1024,1587.8,228.2,15.6,64.1 +tiny-llama-1.1b-chat,INT4-MIXED,32,1704.2,42.4,17.6,56.8 +tiny-llama-1.1b-chat,INT4-MIXED,1024,1616.3,489.2,18.9,52.9 +qwen2-0.5b,INT8-CW,32,1477.3,51.5,20.2,49.5 +qwen2-0.5b,INT8-CW,1024,1592,263.7,20.6,48.5 +tiny-llama-1.1b-chat,INT8-CW,32,1855.6,60.2,20.7,48.3 +tiny-llama-1.1b-chat,INT8-CW,1024,1992.6,618.2,21.7,46.1 +qwen2-1.5b,INT4-MIXED,32,2024.2,59.6,23.1,43.3 +bloomz-560m,FP16,1024,2773.1,647.8,23.8,42.0 +qwen2-1.5b,INT4-MIXED,1024,2177.7,577.4,23.8,42.0 +bloomz-560m,FP16,32,2582.7,44.2,25.1,39.8 +dolly-v2-3b,INT4-MIXED,32,2507.9,79.8,29.4,34.0 +phi-2,INT4-MIXED,32,2568.9,74.6,29.7,33.7 +qwen2-1.5b,INT8-CW,32,2577.3,81.6,30.5,32.8 +red-pajama-incite-chat-3b-v1,INT4-MIXED,32,2489.4,69.9,30.5,32.8 +minicpm-1b-sft,INT4-MIXED,31,2442.1,84.7,31,32.3 +qwen2-1.5b,INT8-CW,1024,2739.8,773.3,31.2,32.1 +gemma-2b-it,INT4-MIXED,32,2998.2,103.5,31.4,31.8 +dolly-v2-3b,INT4-MIXED,1024,2508.1,1396.6,32,31.3 +gemma-2b-it,INT4-MIXED,1024,3171.5,822.3,32.2,31.1 +phi-2,INT4-MIXED,1024,2940.5,1395.3,32.2,31.1 +red-pajama-incite-chat-3b-v1,INT4-MIXED,1023,2489.6,1435.5,33.1,30.2 +minicpm-1b-sft,INT8-CW,31,2818.6,86.9,33.4,29.9 +stable-zephyr-3b-dpo,INT4-MIXED,32,2638.2,87.4,33.8,29.6 +stablelm-3b-4e1t,INT4-MIXED,32,2750.5,89.4,35.6,28.1 +stablelm-3b-4e1t,INT4-MIXED,1023,3115.5,1473.1,38.1,26.2 +phi-3-mini-4k-instruct,INT4-MIXED,32,3039.1,109.2,40.4,24.8 +phi-2,INT8-CW,32,3599.7,107.5,42.1,23.8 +gemma-2b-it,INT8-CW,32,3845.4,111.3,42.2,23.7 +dolly-v2-3b,INT8-CW,32,3596.4,110.1,42.5,23.5 +gemma-2b-it,INT8-CW,1024,3844.6,1183,43,23.3 +red-pajama-incite-chat-3b-v1,INT8-CW,32,3590,111,43.3,23.1 +phi-3-mini-4k-instruct,INT4-MIXED,1024,3467.6,1721.6,43.5,23.0 +stablelm-3b-4e1t,INT8-CW,32,3582.8,111,44.3,22.6 +stable-zephyr-3b-dpo,INT8-CW,32,3607.2,110.2,44.5,22.5 +phi-2,INT8-CW,1024,3982,1508,44.6,22.4 +dolly-v2-3b,INT8-CW,1024,3596.5,1529.1,44.9,22.3 +minicpm-1b-sft,FP16,31,3769.9,84,45.4,22.0 +red-pajama-incite-chat-3b-v1,INT8-CW,1023,3952,2064.5,45.7,21.9 +stablelm-3b-4e1t,INT8-CW,1023,3934.5,2286.3,46.8,21.4 +gpt-j-6b,INT4-MIXED,32,4443.5,159.3,56.7,17.6 +phi-3-mini-4k-instruct,INT8-CW,32,4545,117.1,57.6,17.4 
+phi-3-mini-4k-instruct,INT8-CW,1024,4810.4,2068.8,60.5,16.5 +gpt-j-6b,INT4-MIXED,1024,4746.4,2397,60.6,16.5 +falcon-7b-instruct,INT4-MIXED,32,5014,203.7,61.3,16.3 +qwen2-7b,INT4-MIXED,32,5269.4,203.8,62.3,16.1 +codegen25-7b,INT4-MIXED,32,4641.1,170.6,63.5,15.7 +llama-2-7b-gptq,INT4-MIXED,32,4597.3,172.1,63.5,15.7 +falcon-7b-instruct,INT4-MIXED,1024,5230.6,2695.3,63.6,15.7 +qwen2-7b,INT4-MIXED,1024,5370.8,2505.9,63.9,15.6 +decilm-7b-instruct,INT4-MIXED,36,4614.2,301.1,65.3,15.3 +codegen25-7b,INT4-MIXED,1024,4641.9,2629.6,67.4,14.8 +llama-2-7b-gptq,INT4-MIXED,1024,4928.1,2584.3,67.6,14.8 +mistral-7b-v0.1,INT4-MIXED,32,4928.5,180.9,69.2,14.5 +llama-2-7b-chat-hf,INT4-MIXED,32,4985.7,160.3,69.5,14.4 +qwen-7b-chat-gptq,INT4-MIXED,32,5426.7,188.3,69.5,14.4 +llama-3-8b,INT4-MIXED,33,5473.4,285.7,70,14.3 +flan-t5-xxl,INT4-MIXED,33,19293.8,211.7,70.1,14.3 +llama-3-8b,INT4-MIXED,33,5389.2,281,70.8,14.1 +mistral-7b-v0.1,INT4-MIXED,1024,5225.4,2713.3,71.8,13.9 +zephyr-7b-beta,INT4-MIXED,32,5306.1,177.9,72.1,13.9 +llama-3-8b,INT4-MIXED,1025,5615.2,2937.8,72.4,13.8 +llama-3-8b,INT4-MIXED,1025,5531.7,2815.4,73.2,13.7 +llama-2-7b-chat-hf,INT4-MIXED,1024,5319.5,2736.2,73.6,13.6 +phi-2,FP16,32,6197,104.6,74.7,13.4 +zephyr-7b-beta,INT4-MIXED,1024,5306.4,2802.3,74.7,13.4 +qwen-7b-chat-gptq,INT4-MIXED,1024,5934.9,2606.9,75,13.3 +dolly-v2-3b,FP16,32,6195.1,105.3,75.3,13.3 +baichuan2-7b-chat,INT4-MIXED,32,5837.9,188.5,76.8,13.0 +red-pajama-incite-chat-3b-v1,FP16,32,6178.6,118,76.8,13.0 +gemma-7b-it,INT4-MIXED,32,6495.9,230.6,77,13.0 +stablelm-3b-4e1t,FP16,32,6174.2,105.9,77.1,13.0 +stable-zephyr-3b-dpo,FP16,32,6217.8,107.9,77.2,13.0 +glm-4-9b-chat,INT4-MIXED,32,6333.4,225,77.3,12.9 +phi-2,FP16,1024,6411.5,2065.2,77.3,12.9 +dolly-v2-3b,FP16,1024,6410.1,2075,77.7,12.9 +llama-3.1-8b,INT4-MIXED,32,6324.6,182.2,78.8,12.7 +red-pajama-incite-chat-3b-v1,FP16,1023,6394.2,2752.4,79.2,12.6 +stablelm-3b-4e1t,FP16,1023,6386.9,2953.3,79.5,12.6 +glm-4-9b-chat,INT4-MIXED,1024,6439.5,3282.2,80,12.5 +baichuan2-7b-chat,INT4-MIXED,1024,6174.1,2752.6,80.6,12.4 +gemma-7b-it,INT4-MIXED,1024,6795.4,3118.3,80.6,12.4 +llama-3.1-8b,INT4-MIXED,1024,6324.8,2865.7,81.3,12.3 +gpt-j-6b,INT8-CW,32,6793.2,167.6,85,11.8 +qwen-7b-chat,INT4-MIXED,32,7274.8,168.8,85.2,11.7 +gpt-j-6b,INT8-CW,1024,6793.3,2668.4,88.8,11.3 +qwen-7b-chat,INT4-MIXED,1024,7610.3,2991.9,90.6,11.0 +flan-t5-xxl,INT4-MIXED,1139,23514,540.8,94.9,10.5 +falcon-7b-instruct,INT8-CW,32,7764.1,181.3,95.5,10.5 +llama-2-7b-chat-hf,INT8-CW,32,7330.9,172,96.1,10.4 +falcon-7b-instruct,INT8-CW,1024,7987.4,3072.8,98.1,10.2 +qwen2-7b,INT8-CW,32,8175.3,211.3,99.6,10.0 diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv index 09799a2de31fe6..9aa769e4dd61b9 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv +++ b/docs/sphinx_setup/_static/benchmarks_files/llm_models_7-258V.csv @@ -1,182 +1,116 @@ -Topology,Precision,Input Size,max rss memory,1st latency (ms),2nd latency (ms),2nd tok/sec -opt-125m-gptq,INT4-MIXED,1024,1513.6,81.9,7.8,128.21 -opt-125m-gptq,INT4-MIXED,32,979.9,50.4,7.9,126.58 -tiny-llama-1.1b-chat,INT4-MIXED,1024,1943.3,176.3,16.8,59.52 -tiny-llama-1.1b-chat,INT4-MIXED,32,1982.2,59.5,17.1,58.48 -qwen2-0.5b,INT4-MIXED,32,2678,117.3,18.7,53.48 -tiny-llama-1.1b-chat,INT8-CW,32,2080.9,59.4,19,52.63 -qwen2-0.5b,INT4-MIXED,1024,3036.1,165.5,19.2,52.08 -tiny-llama-1.1b-chat,INT8-CW,1024,2287,241.4,19.6,51.02 -qwen2-0.5b,INT8-CW,1024,3084.9,172.1,20,50.00 
-qwen2-0.5b,INT8-CW,32,2518,105.5,21.4,46.73 -red-pajama-incite-chat-3b-v1,INT4-MIXED,32,2793.6,141.8,23.9,41.84 -qwen2-1.5b,INT4-MIXED,32,4515.4,118.7,24,41.67 -qwen2-1.5b,INT4-MIXED,1024,4930.1,229.6,24.3,41.15 -dolly-v2-3b,INT4-MIXED,32,2486.1,174,25.4,39.37 -phi-2,INT4-MIXED,32,2552.9,210.6,26.9,37.17 -red-pajama-incite-chat-3b-v1,INT4-MIXED,1020,2934.1,464.5,27.5,36.36 -qwen2-1.5b,INT8-CW,32,4813.4,119.1,27.8,35.97 -opt-2.7b,INT4-MIXED,31,3172.5,131.9,28.5,35.09 -red-pajama-incite-chat-3b-v1,INT4-MIXED,1024,3038.2,447.1,28.6,34.97 -dolly-v2-3b,INT4-MIXED,1024,2947.4,409,28.8,34.72 -qwen2-1.5b,INT8-CW,1024,5394.8,327.9,29.3,34.13 -stable-zephyr-3b-dpo,INT4-MIXED,30,2728.1,131.2,29.8,33.56 -phi-2,INT4-MIXED,32,2805.1,208.3,30.2,33.11 -minicpm-1b-sft,INT8-CW,31,3104.2,147.8,30.9,32.36 -phi-2,INT4-MIXED,1024,3058.9,602.9,31.1,32.15 -minicpm-1b-sft,INT4-MIXED,31,2970.1,183.7,31.1,32.15 -stablelm-3b-4e1t,INT4-MIXED,32,3077.1,183.2,31.6,31.65 -opt-2.7b,INT4-MIXED,937,3416.7,429.4,31.6,31.65 -stable-zephyr-3b-dpo,INT4-MIXED,946,3211.8,428.8,32.3,30.96 -phi-3-mini-4k-instruct,INT4-MIXED,31,3014.5,116,32.5,30.77 -phi-3-mini-4k-instruct,INT4-MIXED,38,2957.4,153.9,32.5,30.77 -phi-2,INT4-MIXED,1024,3278.9,613.3,33.4,29.94 -phi-3-mini-4k-instruct,INT4-MIXED,38,3288.5,152.9,33.4,29.94 -phi-3-mini-4k-instruct,INT4-MIXED,31,3265.1,123.6,34.1,29.33 -gemma-2b-it,INT4-MIXED,32,4162.1,208.8,34.2,29.24 -stablelm-3b-4e1t,INT4-MIXED,1024,3525.8,524.5,35,28.57 -phi-3-mini-4k-instruct,INT4-MIXED,1061,3427.8,777.5,36.5,27.40 -phi-3-mini-4k-instruct,INT4-MIXED,1023,3405.4,554.1,36.7,27.25 -gemma-2b-it,INT4-MIXED,1024,5053.1,354.8,36.9,27.10 -minicpm-1b-sft,FP16,31,3595.5,124.9,36.9,27.10 -phi-3-mini-4k-instruct,INT4-MIXED,1061,3547.2,755.8,37.1,26.95 -phi-3-mini-4k-instruct,INT4-MIXED,1023,3528.4,536.4,37.4,26.74 -red-pajama-incite-chat-3b-v1,INT8-CW,32,3747.7,189.9,38.1,26.25 -opt-2.7b,INT8-CW,31,3810.7,145.7,38.5,25.97 -chatglm3-6b,INT4-MIXED,32,4120.7,67.3,38.7,25.84 -dolly-v2-3b,INT8-CW,32,3747,188.4,39.2,25.51 -chatglm3-6b,INT4-MIXED,32,4482.9,69.9,40.7,24.57 -chatglm3-6b,INT4-MIXED,1024,4146,606.8,41,24.39 -opt-2.7b,INT8-CW,937,4458.9,587.8,41.8,23.92 -red-pajama-incite-chat-3b-v1,INT8-CW,1024,4088.4,634.1,41.9,23.87 -red-pajama-incite-chat-3b-v1,INT8-CW,1020,4086.8,653.4,42,23.81 -phi-2,INT8-CW,32,3794.6,202.7,42.1,23.75 -chatglm3-6b,INT4-MIXED,1024,4446.7,598.6,42.3,23.64 -stablelm-3b-4e1t,INT8-CW,32,3652.5,146,42.6,23.47 -stable-zephyr-3b-dpo,INT8-CW,30,3768.6,151.9,42.6,23.47 -dolly-v2-3b,INT8-CW,1024,4092,603.1,42.9,23.31 -stablelm-3b-4e1t,INT8-CW,1024,4143.2,671.7,45.2,22.12 -gemma-2b-it,INT8-CW,32,4878.4,221.6,45.6,21.93 -phi-2,INT8-CW,1024,4153.6,810.3,46,21.74 -llama-2-7b-chat-hf,INT4-MIXED,32,4394.6,109.7,46.2,21.65 -chatglm3-6b-gptq,INT4-MIXED,32,5218.9,79.7,46.7,21.41 -stable-zephyr-3b-dpo,INT8-CW,946,4360.1,627.8,46.8,21.37 -vicuna-7b-v1.5,INT4-MIXED,32,4482.3,101.2,47.2,21.19 -gemma-2b-it,INT8-CW,1024,5837.1,507.1,48,20.83 -llama-2-7b-gptq,INT4-MIXED,32,4734.3,102.8,48.1,20.79 -orca-mini-3b,INT4-MIXED,32,2720.1,132,48.1,20.79 -qwen-7b-chat,INT4-MIXED,32,7803.7,178.5,48.3,20.70 -mistral-7b-v0.1,INT4-MIXED,31,4537.5,99,48.5,20.62 -codegen25-7b,INT4-MIXED,32,4723.3,108.5,48.5,20.62 -chatglm3-6b-gptq,INT4-MIXED,1024,5150.8,614.2,48.8,20.49 -mistral-7b-v0.1,INT4-MIXED,32,4572,102.9,48.8,20.49 -llama-3-8b,INT4-MIXED,33,4991.2,252.2,50.9,19.65 -qwen-7b-chat-gptq,INT4-MIXED,32,8088.4,212.6,51,19.61 -chatglm2-6b,INT4-MIXED,32,4960.6,105.5,51.2,19.53 -gpt-j-6b,INT4-MIXED,32,4699.5,259.2,51.4,19.46 
-llama-3.1-8b,INT4-MIXED,31,4897.8,106.9,51.5,19.42 -llama-3-8b,INT4-MIXED,32,4999.7,105.9,51.6,19.38 -qwen-7b-chat,INT4-MIXED,32,8085.9,193.5,51.7,19.34 -falcon-7b-instruct,INT4-MIXED,32,5416.2,175,52.5,19.05 -mistral-7b-v0.1,INT4-MIXED,1007,4772.6,803,52.6,19.01 -qwen1.5-7b-chat,INT4-MIXED,32,6027.3,174.9,53,18.87 -mistral-7b-v0.1,INT4-MIXED,1024,4775,717.6,53,18.87 -llama-2-7b-chat-hf,INT4-MIXED,1024,4976.5,992.1,53.1,18.83 -qwen2-7b,INT4-MIXED,32,7087.1,138.1,53.3,18.76 -llama-2-7b-gptq,INT4-MIXED,1024,5351.2,711.6,53.7,18.62 -llama-3-8b,INT4-MIXED,32,5472.8,109.4,53.7,18.62 -phi-3-mini-4k-instruct,INT8-CW,38,4575.3,115.9,53.7,18.62 -stablelm-7b,INT4-MIXED,32,5213.7,128.5,53.8,18.59 -phi-3-mini-4k-instruct,INT8-CW,31,4571.8,118.9,53.8,18.59 -llama-3-8b,INT4-MIXED,33,5480.4,246.8,53.9,18.55 -llama-3-8b,INT4-MIXED,32,5528.2,144.9,54.3,18.42 -llama-3.1-8b,INT4-MIXED,31,5377.3,112.8,54.3,18.42 -chatglm2-6b,INT4-MIXED,1024,5232.3,759.6,54.6,18.32 -llama-3.1-8b,INT4-MIXED,31,5440.4,126.4,54.8,18.25 -llama-3-8b,INT4-MIXED,33,5532.8,248.2,54.9,18.21 -codegen25-7b,INT4-MIXED,1024,5412.9,714.8,55,18.18 -mistral-7b-v0.1,INT4-MIXED,32,4998.5,117.3,55.2,18.12 -mistral-7b-v0.1,INT4-MIXED,31,5000.2,122.4,55.6,17.99 -llama-3-8b,INT4-MIXED,1024,5594,953.5,56.6,17.67 -gpt-j-6b,INT4-MIXED,1024,5323.8,1254,56.8,17.61 -llama-3-8b,INT4-MIXED,1025,5596.7,1192.3,56.8,17.61 -qwen2-7b,INT4-MIXED,1024,7722.1,714.2,57,17.54 -phi-3-mini-4k-instruct,INT8-CW,1023,5067.1,818.5,57.4,17.42 -phi-3-mini-4k-instruct,INT8-CW,1061,5086.1,975.1,57.4,17.42 -llama-2-7b-chat-hf,INT4-MIXED,32,5087.7,126.2,57.9,17.27 -stablelm-7b,INT4-MIXED,1020,5780.5,1248.4,59,16.95 -llama-3-8b,INT4-MIXED,1025,6088.9,1381.5,59,16.95 -llama-3-8b,INT4-MIXED,1024,6084.8,931.2,59.2,16.89 -llama-3-8b,INT4-MIXED,1025,6141.2,1494.3,59.4,16.84 -llama-3-8b,INT4-MIXED,1024,6133.8,1075.2,59.6,16.78 -mistral-7b-v0.1,INT4-MIXED,1024,5472.6,794.3,59.7,16.75 -zephyr-7b-beta,INT4-MIXED,32,5328.5,103.5,59.8,16.72 -falcon-7b-instruct,INT4-MIXED,1024,5677.5,686.2,59.8,16.72 -mistral-7b-v0.1,INT4-MIXED,1007,5243.5,1074,59.9,16.69 -qwen1.5-7b-chat,INT4-MIXED,1024,7096.7,1132.7,60,16.67 -qwen-7b-chat,INT4-MIXED,1024,8872.6,792.8,61,16.39 -qwen-7b-chat,INT4-MIXED,1024,9164.4,822.6,63.3,15.80 -orca-mini-3b,INT8-CW,32,4221.7,170.6,63.5,15.75 -llama-2-7b-chat-hf,INT4-MIXED,1024,5708.1,1397.9,63.6,15.72 -glm-4-9b,INT4-MIXED,33,6402.9,307.1,63.8,15.67 -zephyr-7b-beta,INT4-MIXED,1024,5572.4,1156.4,64.3,15.55 -glm-4-9b,INT4-MIXED,32,6383.1,256.2,64.5,15.50 -baichuan2-7b-chat,INT4-MIXED,32,5926.3,191.8,65.8,15.20 -opt-2.7b,FP16,31,5886,112.2,68,14.71 -dolly-v2-3b,FP16,32,6161.5,147.5,69.5,14.39 -red-pajama-incite-chat-3b-v1,FP16,32,6265.4,146.2,69.6,14.37 -glm-4-9b,INT4-MIXED,1024,6994.5,1013.7,69.8,14.33 -opt-2.7b,FP16,937,6345,379.5,71.6,13.97 -glm-4-9b,INT4-MIXED,1025,7014.9,1416.8,72.5,13.79 -phi-2,FP16,32,6204.7,189.2,72.9,13.72 -stable-zephyr-3b-dpo,FP16,30,6221.4,159.7,73,13.70 -dolly-v2-3b,FP16,1024,6669.9,424.3,73.3,13.64 -red-pajama-incite-chat-3b-v1,FP16,1020,6658.8,484.7,73.4,13.62 -stablelm-3b-4e1t,FP16,32,6216.3,145.4,73.5,13.61 -qwen-7b-chat,INT4-MIXED,32,9294.9,144.4,73.8,13.55 -red-pajama-incite-chat-3b-v1,FP16,1024,6755.1,469.1,73.9,13.53 -qwen-7b-chat-gptq,INT4-MIXED,1024,9152.1,827.2,75.1,13.32 -gemma-7b-it,INT4-MIXED,32,7991.4,128.6,75.8,13.19 -chatglm2-6b,INT8-CW,32,6854.4,110.2,76.3,13.11 -chatglm3-6b,INT8-CW,32,6754.8,112.3,76.4,13.09 -stable-zephyr-3b-dpo,FP16,946,6940,428.6,76.7,13.04 -baichuan2-7b-chat,INT4-MIXED,1024,6930.2,1229.5,76.7,13.04 
-gemma-7b-it,INT4-MIXED,32,8061.5,125.6,76.7,13.04 -stablelm-3b-4e1t,FP16,1024,6722.9,480.8,77,12.99 -phi-2,FP16,1024,6709.4,624.1,77.2,12.95 -chatglm2-6b,INT8-CW,1024,7132.9,1361.9,78.7,12.71 -chatglm3-6b,INT8-CW,1024,7037.5,1389.2,78.7,12.71 -qwen-7b-chat,INT4-MIXED,1024,10374.1,1357.5,81.1,12.33 -gemma-7b-it,INT4-MIXED,1024,9398,1268.5,82.7,12.09 -gemma-7b-it,INT4-MIXED,1024,9469.5,1268,83.2,12.02 -gpt-j-6b,INT8-CW,32,7126.5,255.2,87.2,11.47 -falcon-7b-instruct,INT8-CW,32,8287.6,131.1,88.4,11.31 -llama-2-7b-chat-hf,INT8-CW,32,7474.9,139.5,89.7,11.15 -codegen25-7b,INT8-CW,32,7559.4,138,90.8,11.01 -vicuna-7b-v1.5,INT8-CW,32,7390.8,136.6,90.8,11.01 -falcon-7b-instruct,INT8-CW,1024,8546.8,1205.9,92.2,10.85 -stablelm-7b,INT8-CW,32,8356.4,143,92.4,10.82 -qwen2-7b,INT8-CW,32,9940.7,132,92.5,10.81 -baichuan2-13b-chat,INT4-MIXED,32,9879.2,184.9,93.3,10.72 -phi-3-mini-4k-instruct,FP16,38,8290,125.2,93.4,10.71 -phi-3-mini-4k-instruct,FP16,31,8290.5,109.5,93.5,10.70 -gpt-j-6b,INT8-CW,1024,7759,1996.8,93.9,10.65 -llama-2-7b-chat-hf,INT8-CW,1024,8097.8,1701.6,94.7,10.56 -phi-3-medium-4k-instruct,INT4-MIXED,38,8210.4,527,95.1,10.52 -mistral-7b-v0.1,INT8-CW,31,7882.4,128.6,95.1,10.52 -vicuna-7b-v1.5,INT8-CW,1024,8013.2,1558.1,95.1,10.52 -mistral-7b-v0.1,INT8-CW,32,7886.9,140.6,95.2,10.50 -qwen2-7b,INT8-CW,1024,10573.1,1564.5,95.3,10.49 -codegen25-7b,INT8-CW,1024,8253.1,1526.3,95.7,10.45 -zephyr-7b-beta,INT8-CW,32,7785.3,144.4,95.8,10.44 -stablelm-7b,INT8-CW,1020,8921.9,1845,96.9,10.32 -mistral-7b-v0.1,INT8-CW,1007,8127.4,1648.4,97.4,10.27 -qwen-7b-chat,INT8-CW,32,11083.2,140.6,97.7,10.24 -qwen1.5-7b-chat,INT8-CW,32,8870,156.4,98.1,10.19 -llama-3.1-8b,INT8-CW,31,8600.3,189.2,98.4,10.16 -mistral-7b-v0.1,INT8-CW,1024,8134.7,1554.1,98.4,10.16 -qwen-14b-chat,INT4-MIXED,32,9876.2,192.3,98.6,10.14 -zephyr-7b-beta,INT8-CW,1024,8035.2,1580.4,98.8,10.12 -llama-3-8b,INT8-CW,32,8694.2,150.7,99.5,10.05 -llama-3-8b,INT8-CW,33,8700.4,175.4,99.8,10.02 -phi-3-mini-4k-instruct,FP16,1023,8795.2,601.3,99.9,10.01 +opt-125m-gptq,INT4-MIXED,32,1150.2,35.1,8.2,122.0 +opt-125m-gptq,INT4-MIXED,1024,1228,67,8.2,122.0 +qwen2-0.5b,INT4-MIXED,1024,1596.2,83.6,14.4,69.4 +qwen2-0.5b,INT4-MIXED,32,1675.6,63.6,14.9,67.1 +qwen2-0.5b,INT8-CW,32,1857.5,56.9,15,66.7 +qwen2-0.5b,INT8-CW,1024,1663.5,87,15,66.7 +bloomz-560m,INT8-CW,32,1761.1,62.4,15.1,66.2 +tiny-llama-1.1b-chat,INT4-MIXED,1024,1687.9,158.7,15.3,65.4 +bloomz-560m,INT4-MIXED,32,1894.2,40.1,15.4,64.9 +tiny-llama-1.1b-chat,INT4-MIXED,32,1833,74.5,15.7,63.7 +bloomz-560m,INT8-CW,1024,1689.2,146.2,15.8,63.3 +bloomz-560m,INT4-MIXED,1024,1791,150.1,16.4,61.0 +tiny-llama-1.1b-chat,INT8-CW,32,2132.3,35.6,18.1,55.2 +bloomz-560m,FP16,32,2395,36,18.4,54.3 +tiny-llama-1.1b-chat,INT8-CW,1024,1986.4,149.3,19.2,52.1 +bloomz-560m,FP16,1024,2344.4,157.4,19.3,51.8 +qwen2-1.5b,INT4-MIXED,1024,2175.1,184.9,20.4,49.0 +qwen2-1.5b,INT4-MIXED,32,2066.2,94.9,20.6,48.5 +red-pajama-incite-chat-3b-v1,INT4-MIXED,32,2599.8,118.1,25,40.0 +qwen2-1.5b,INT8-CW,32,2377.4,83.3,25.1,39.8 +qwen2-1.5b,INT8-CW,1024,2483.3,189.6,25.3,39.5 +gemma-2b-it,INT4-MIXED,32,2594.3,181.4,26.1,38.3 +phi-2,INT4-MIXED,32,2912.4,77.7,26.8,37.3 +gemma-2b-it,INT4-MIXED,1024,2594.4,248.2,26.9,37.2 +dolly-v2-3b,INT4-MIXED,32,2610.3,141.3,27,37.0 +stable-zephyr-3b-dpo,INT4-MIXED,32,2956.2,149.2,27.4,36.5 +minicpm-1b-sft,INT4-MIXED,31,2625.8,159.2,28.1,35.6 +red-pajama-incite-chat-3b-v1,INT4-MIXED,1023,3069.7,413.5,28.2,35.5 +minicpm-1b-sft,INT8-CW,31,2868.2,74.1,28.9,34.6 +dolly-v2-3b,INT4-MIXED,1024,3081.5,386,29.4,34.0 
+phi-2,INT4-MIXED,1024,3136.2,340,29.6,33.8 +stablelm-3b-4e1t,INT4-MIXED,32,3035.9,150.5,30.6,32.7 +phi-3-mini-4k-instruct,INT4-MIXED,32,3373.2,57.9,32.6,30.7 +stablelm-3b-4e1t,INT4-MIXED,1023,3296.5,456.2,34.4,29.1 +phi-3-mini-4k-instruct,INT4-MIXED,1024,3707.1,432,36.1,27.7 +gemma-2b-it,INT8-CW,32,3370.5,203.8,36.6,27.3 +minicpm-1b-sft,FP16,31,3679.6,80.6,36.9,27.1 +gemma-2b-it,INT8-CW,1024,3503.2,258.5,37.9,26.4 +dolly-v2-3b,INT8-CW,32,3893.3,142.9,39.4,25.4 +red-pajama-incite-chat-3b-v1,INT8-CW,32,3760.7,117.2,39.4,25.4 +phi-2,INT8-CW,32,3765.6,121,39.7,25.2 +stablelm-3b-4e1t,INT8-CW,32,3641.2,123,39.9,25.1 +stable-zephyr-3b-dpo,INT8-CW,32,3743.3,120.1,39.9,25.1 +red-pajama-incite-chat-3b-v1,INT8-CW,1023,4083.1,422.9,41.9,23.9 +dolly-v2-3b,INT8-CW,1024,4211.5,384.1,42.2,23.7 +phi-2,INT8-CW,1024,4096.8,367.2,42.5,23.5 +stablelm-3b-4e1t,INT8-CW,1023,4086.6,459.9,43.5,23.0 +llama-2-7b-gptq,INT4-MIXED,32,4754.8,75.1,46.2,21.6 +codegen25-7b,INT4-MIXED,32,4738.5,74.9,46.9,21.3 +gpt-j-6b,INT4-MIXED,32,4506.5,221.4,47.3,21.1 +decilm-7b-instruct,INT4-MIXED,36,4794.9,199.3,48.5,20.6 +qwen-7b-chat-gptq,INT4-MIXED,32,5615.8,100.5,49.8,20.1 +falcon-7b-instruct,INT4-MIXED,32,4738,79.9,50.7,19.7 +phi-3-mini-4k-instruct,INT8-CW,32,4589.9,83,50.8,19.7 +llama-2-7b-gptq,INT4-MIXED,1024,5246,640,52.1,19.2 +llama-3-8b,INT4-MIXED,33,5475.8,114.7,52.2,19.2 +codegen25-7b,INT4-MIXED,1024,5241.9,643.7,52.5,19.0 +mistral-7b-v0.1,INT4-MIXED,32,5015.3,94.6,52.6,19.0 +qwen2-7b,INT4-MIXED,32,5330.7,86.3,52.7,19.0 +gpt-j-6b,INT4-MIXED,1024,4926.5,867.2,53.2,18.8 +llama-2-7b-chat-hf,INT4-MIXED,32,5100.7,78.7,54.2,18.5 +llama-3-8b,INT4-MIXED,33,5527.1,114.9,54.3,18.4 +phi-3-mini-4k-instruct,INT8-CW,1024,4959.2,450.6,54.6,18.3 +falcon-7b-instruct,INT4-MIXED,1024,4863.4,660.5,54.9,18.2 +qwen2-7b,INT4-MIXED,1024,5375.4,659.8,55.4,18.1 +mistral-7b-v0.1,INT4-MIXED,1024,5286.8,662.8,55.6,18.0 +llama-3-8b,INT4-MIXED,1025,5601,992.5,56.1,17.8 +llama-3-8b,INT4-MIXED,1025,5646.8,1047.1,56.7,17.6 +baichuan2-7b-chat,INT4-MIXED,32,5913.7,86.5,57.2,17.5 +zephyr-7b-beta,INT4-MIXED,32,5339.7,88.5,58.2,17.2 +qwen-7b-chat-gptq,INT4-MIXED,1024,6315.8,664.2,60.1,16.6 +glm-4-9b-chat,INT4-MIXED,32,6349.7,86.5,60.5,16.5 +llama-2-7b-chat-hf,INT4-MIXED,1024,5592.7,856.8,60.9,16.4 +zephyr-7b-beta,INT4-MIXED,1024,5459.1,898.6,61.6,16.2 +baichuan2-7b-chat,INT4-MIXED,1024,6410.3,942.2,63.5,15.7 +gemma-7b-it,INT4-MIXED,32,5816.3,104.5,63.5,15.7 +glm-4-9b-chat,INT4-MIXED,1024,6368.8,1128.2,63.8,15.7 +llama-3.1-8b,INT4-MIXED,32,6315.3,97.4,65,15.4 +llama-3.1-8b,INT4-MIXED,1024,6421.8,902.9,68.2,14.7 +gemma-7b-it,INT4-MIXED,1024,6233.2,1052.7,68.7,14.6 +qwen-7b-chat,INT4-MIXED,32,7320.5,132.3,68.8,14.5 +red-pajama-incite-chat-3b-v1,FP16,32,6318.9,79.2,70.7,14.1 +phi-2,FP16,32,6330.2,83.2,70.8,14.1 +dolly-v2-3b,FP16,32,6327.2,92.7,71.9,13.9 +stable-zephyr-3b-dpo,FP16,32,6356.4,79.8,72.2,13.9 +stablelm-3b-4e1t,FP16,32,6261.9,74.6,72.6,13.8 +phi-2,FP16,1024,6654.4,379.3,73.9,13.5 +red-pajama-incite-chat-3b-v1,FP16,1023,6640.3,442.6,74.4,13.4 +dolly-v2-3b,FP16,1024,6653.9,441.9,74.9,13.4 +qwen-7b-chat,INT4-MIXED,1024,7814.1,909.4,75.5,13.2 +stablelm-3b-4e1t,FP16,1023,6575.3,449.5,75.8,13.2 +falcon-7b-instruct,INT8-CW,32,7487.6,109.4,84.3,11.9 +gpt-j-6b,INT8-CW,32,6918.7,185.3,85.3,11.7 +llama-2-7b-chat-hf,INT8-CW,32,7494.7,110.6,87.9,11.4 +qwen2-7b,INT8-CW,32,8177.7,117.8,88.2,11.3 +falcon-7b-instruct,INT8-CW,1024,7621.2,675.4,88.3,11.3 +codegen25-7b,INT8-CW,32,7582.1,114.6,89,11.2 +qwen2-7b,INT8-CW,1024,8226.2,842,90.4,11.1 
+gpt-j-6b,INT8-CW,1024,7353.1,1093.9,90.8,11.0 +phi-3-medium-4k-instruct,INT4-MIXED,38,8184.1,270.2,90.8,11.0 +qwen-7b-chat,INT8-CW,32,9223.8,138.4,91.3,11.0 +baichuan2-7b-chat,INT8-CW,32,8188.4,122.9,91.8,10.9 +phi-3-mini-4k-instruct,FP16,32,8311.5,98.2,92,10.9 +llama-2-7b-chat-hf,INT8-CW,1024,7984.3,874.9,92.8,10.8 +mistral-7b-v0.1,INT8-CW,32,7908.6,116.3,93.1,10.7 +baichuan2-13b-chat,INT4-MIXED,32,10016.5,165.7,93.2,10.7 +zephyr-7b-beta,INT8-CW,32,7812.6,117,93.4,10.7 +codegen25-7b,INT8-CW,1024,8074.3,870.2,94,10.6 +decilm-7b-instruct,INT8-CW,36,7885.2,181.4,94.9,10.5 +mistral-7b-v0.1,INT8-CW,1024,8023.7,906.4,95.7,10.4 +zephyr-7b-beta,INT8-CW,1024,7930.8,915.2,96.3,10.4 +phi-3-medium-4k-instruct,INT4-MIXED,1061,8384.5,2225.7,96.7,10.3 +baichuan2-7b-chat,INT8-CW,1024,8678.3,956.7,96.8,10.3 +llama-3.1-8b,INT8-CW,32,8615.4,121.6,97.7,10.2 +llama-3-8b,INT8-CW,33,8615.1,131.3,97.7,10.2 +phi-3-mini-4k-instruct,FP16,1024,8695.2,509,99.9,10.0 diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv b/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv index b16312fa09457c..dfc98271bcd21b 100644 --- a/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv +++ b/docs/sphinx_setup/_static/benchmarks_files/llm_models_9-288V.csv @@ -1,146 +1,82 @@ -Topology,Precision,Input Size,max rss memory,1st latency (ms),2nd latency (ms),2nd tok/sec -opt-125m-gptq,INT4-MIXED,1024,1610.2,146,9.4,106.38 -opt-125m-gptq,INT4-MIXED,32,1087.6,60.8,9.5,105.26 -tiny-llama-1.1b-chat,INT4-MIXED,32,1977,85.7,20.2,49.50 -tiny-llama-1.1b-chat,INT4-MIXED,1024,1940.8,367.7,20.3,49.26 -tiny-llama-1.1b-chat,INT8-CW,32,1855.2,70.2,21.8,45.87 -qwen2-0.5b,INT4-MIXED,1024,3029.3,226.4,22.3,44.84 -qwen2-0.5b,INT8-CW,1024,3093,222,22.3,44.84 -qwen2-0.5b,FP16,1024,2509.5,234.3,22.4,44.64 -qwen2-0.5b,FP16,32,1933.8,146.4,22.4,44.64 -tiny-llama-1.1b-chat,INT8-CW,1024,2288.3,368.6,22.9,43.67 -qwen2-0.5b,INT4-MIXED,32,2670.9,115.1,23,43.48 -qwen2-0.5b,INT8-CW,32,2530,157.9,24.3,41.15 -red-pajama-incite-chat-3b-v1,INT4-MIXED,32,2677.3,186.1,27.9,35.84 -qwen2-1.5b,INT4-MIXED,32,4515.1,179.8,28.7,34.84 -qwen2-1.5b,INT4-MIXED,1024,4927.5,254.3,29.1,34.36 -dolly-v2-3b,INT4-MIXED,32,2420.9,245.6,30.8,32.47 -qwen2-1.5b,INT8-CW,32,4824.9,165.1,31.2,32.05 -phi-2,INT4-MIXED,32,2523.5,233.9,31.5,31.75 -qwen2-1.5b,INT8-CW,1024,5401.8,331.1,32,31.25 -stable-zephyr-3b-dpo,INT4-MIXED,30,2816.2,151.3,32.9,30.40 -red-pajama-incite-chat-3b-v1,INT4-MIXED,1020,2646.7,860.6,33,30.30 -opt-2.7b,INT4-MIXED,31,2814.5,174.7,33.1,30.21 -phi-2,INT4-MIXED,32,2363.6,236.6,34,29.41 -stablelm-3b-4e1t,INT4-MIXED,32,3079.1,220,34,29.41 -minicpm-1b-sft,INT4-MIXED,31,2971,185.1,34.1,29.33 -minicpm-1b-sft,INT8-CW,31,3103.6,233.5,34.3,29.15 -dolly-v2-3b,INT4-MIXED,1024,2152.3,876.6,34.7,28.82 -phi-3-mini-4k-instruct,INT4-MIXED,38,2951,155.4,35.9,27.86 -phi-2,INT4-MIXED,1024,2689.9,971.7,36.5,27.40 -stablelm-3b-4e1t,INT4-MIXED,1024,3335.9,519.3,37.3,26.81 -opt-2.7b,INT4-MIXED,937,3227.5,639.5,37.7,26.53 -phi-3-mini-4k-instruct,INT4-MIXED,38,3289.7,161,37.9,26.39 -gemma-2b-it,INT4-MIXED,32,4099.6,258.6,38,26.32 -tiny-llama-1.1b-chat,FP16,32,3098.7,143.9,38.2,26.18 -stable-zephyr-3b-dpo,INT4-MIXED,946,3548.5,453.9,38.8,25.77 -tiny-llama-1.1b-chat,FP16,1024,3388.6,523,39,25.64 -phi-2,INT4-MIXED,1024,2594.7,964.2,39.1,25.58 -minicpm-1b-sft,FP16,31,3597.7,164.8,39.8,25.13 -gemma-2b-it,INT4-MIXED,1024,5059.1,669.1,40.5,24.69 -phi-3-mini-4k-instruct,INT4-MIXED,1061,3431.8,840.1,40.6,24.63 
-phi-3-mini-4k-instruct,INT4-MIXED,1061,3555.6,836.3,41.8,23.92 -qwen2-1.5b,FP16,32,3979.4,111.8,42.5,23.53 -red-pajama-incite-chat-3b-v1,INT8-CW,32,3639.9,199.1,43.6,22.94 -qwen2-1.5b,FP16,1024,4569.8,250.5,44.1,22.68 -dolly-v2-3b,INT8-CW,32,3727,248.2,44.5,22.47 -opt-2.7b,INT8-CW,31,3746.3,175.6,44.6,22.42 -stablelm-3b-4e1t,INT8-CW,32,3651.3,178,45.4,22.03 -chatglm3-6b,INT4-MIXED,32,4050.3,88.1,47.4,21.10 -phi-2,INT8-CW,32,3608.7,232,48.3,20.70 -red-pajama-incite-chat-3b-v1,INT8-CW,1020,2951,816.6,48.4,20.66 -stablelm-3b-4e1t,INT8-CW,1024,4142.8,658.7,48.5,20.62 -opt-2.7b,INT8-CW,937,4019,640.7,48.8,20.49 -stable-zephyr-3b-dpo,INT8-CW,30,3264.5,150.7,48.8,20.49 -gemma-2b-it,INT8-CW,32,4874.7,249.4,48.9,20.45 -chatglm3-6b,INT4-MIXED,32,3902.1,84.9,49.5,20.20 -dolly-v2-3b,INT8-CW,1024,2931.4,865.2,49.7,20.12 -gemma-2b-it,INT8-CW,1024,5834,545.4,50.7,19.72 -vicuna-7b-v1.5,INT4-MIXED,32,4560.3,119.4,50.7,19.72 -chatglm3-6b,INT4-MIXED,1024,4070.1,895.9,50.9,19.65 -chatglm3-6b,INT4-MIXED,1024,3832.1,854.4,52,19.23 -orca-mini-3b,INT4-MIXED,32,2345.5,132.8,52.2,19.16 -phi-2,INT8-CW,1024,3511.6,989.7,53.1,18.83 -chatglm2-6b,INT4-MIXED,32,4960.2,91.5,54.2,18.45 -qwen1.5-7b-chat,INT4-MIXED,32,5936.5,195.7,54.8,18.25 -stable-zephyr-3b-dpo,INT8-CW,946,3700.5,677.9,54.8,18.25 -llama-2-7b-chat-hf,INT4-MIXED,32,4010.5,113.7,55.6,17.99 -qwen-7b-chat,INT4-MIXED,32,7393,132.7,56.1,17.83 -chatglm2-6b,INT4-MIXED,1024,5234.5,747.3,56.2,17.79 -qwen2-7b,INT4-MIXED,32,7086.2,183,56.3,17.76 -phi-3-mini-4k-instruct,INT8-CW,38,4574.4,132.9,56.9,17.57 -llama-2-7b-gptq,INT4-MIXED,32,4134.1,120,58,17.24 -chatglm3-6b-gptq,INT4-MIXED,32,4288.1,99.4,58.1,17.21 -qwen2-7b,INT4-MIXED,1024,7716.4,734.9,58.3,17.15 -mistral-7b-v0.1,INT4-MIXED,31,4509.3,115,58.6,17.06 -codegen25-7b,INT4-MIXED,32,4211.8,136.5,59,16.95 -qwen1.5-7b-chat,INT4-MIXED,1024,7007.2,792.7,60.6,16.50 -chatglm3-6b-gptq,INT4-MIXED,1024,4545.4,860.3,60.9,16.42 -phi-3-mini-4k-instruct,INT8-CW,1061,5087.2,1029.5,60.9,16.42 -gpt-j-6b,INT4-MIXED,32,4013.5,316.1,61.1,16.37 -mistral-7b-v0.1,INT4-MIXED,1007,876.5,984.4,61.7,16.21 -llama-3-8b,INT4-MIXED,32,4357.1,132.8,62,16.13 -llama-2-7b-chat-hf,INT4-MIXED,1024,3564.8,1163.7,62.5,16.00 -qwen-7b-chat-gptq,INT4-MIXED,32,7384.1,217.8,62.9,15.90 -zephyr-7b-beta,INT4-MIXED,32,5331.6,125,62.9,15.90 -qwen-7b-chat,INT4-MIXED,32,6545.8,218.7,63,15.87 -llama-3.1-8b,INT4-MIXED,31,5076.3,110.4,63.4,15.77 -llama-3.1-8b,INT4-MIXED,31,4419,145.6,63.5,15.75 -llama-2-7b-gptq,INT4-MIXED,1024,3434.2,921.6,64.4,15.53 -llama-3-8b,INT4-MIXED,32,4886.7,132.3,65.4,15.29 -stablelm-7b,INT4-MIXED,32,4768.4,132.1,65.5,15.27 -codegen25-7b,INT4-MIXED,1024,1429.7,967.5,65.7,15.22 -zephyr-7b-beta,INT4-MIXED,1024,5575.6,837.2,65.7,15.22 -llama-3-8b,INT4-MIXED,32,4888.3,161.8,66.2,15.11 -mistral-7b-v0.1,INT4-MIXED,31,4401.4,142.7,66.2,15.11 -llama-3-8b,INT4-MIXED,1024,3782.4,1091.5,66.8,14.97 -llama-3.1-8b,INT4-MIXED,31,4781.4,159.4,67,14.93 -glm-4-9b,INT4-MIXED,33,6392.6,298.7,67.2,14.88 -qwen-7b-chat,INT4-MIXED,1024,8472.8,1331.2,67.4,14.84 -gpt-j-6b,INT4-MIXED,1024,1237.8,1638.8,68.1,14.68 -llama-2-7b-chat-hf,INT4-MIXED,32,4497.4,153.2,68.7,14.56 -llama-3-8b,INT4-MIXED,1024,4526.9,1060.3,69.8,14.33 -mistral-7b-v0.1,INT4-MIXED,1007,3968.7,1033.1,69.9,14.31 -llama-3-8b,INT4-MIXED,1024,4297.9,1041.7,70,14.29 -orca-mini-3b,INT8-CW,32,3744.3,174,70.5,14.18 -stablelm-7b,INT4-MIXED,1020,4402.1,1186.4,70.5,14.18 -gemma-2b-it,FP16,32,5806.3,117.6,71.8,13.93 -glm-4-9b,INT4-MIXED,1025,7003.5,1354.2,72.5,13.79 -gemma-2b-it,FP16,1024,6804.7,490.6,73.4,13.62 
-stablelm-3b-4e1t,FP16,32,6217,207.5,75.2,13.30 -llama-2-7b-chat-hf,INT4-MIXED,1024,4320.9,1247.7,75.8,13.19 -gemma-7b-it,INT4-MIXED,32,8050.6,134.6,76.1,13.14 -gemma-7b-it,INT4-MIXED,32,7992.6,146.4,76.1,13.14 -qwen-7b-chat,INT4-MIXED,1024,5712.7,1144.4,77.1,12.97 -stablelm-3b-4e1t,FP16,1024,6722.9,491.4,77.7,12.87 -chatglm2-6b,INT8-CW,32,6856.2,111.6,78.9,12.67 -opt-2.7b,FP16,31,5377.5,138,79.6,12.56 -chatglm2-6b,INT8-CW,1024,7133.8,1012.1,81,12.35 -red-pajama-incite-chat-3b-v1,FP16,32,5672.5,211,81.2,12.32 -gemma-7b-it,INT4-MIXED,1024,9399.5,1726.7,82.2,12.17 -dolly-v2-3b,FP16,32,5573,230.6,82.5,12.12 -gemma-7b-it,INT4-MIXED,1024,9460,1241.2,82.7,12.09 -opt-2.7b,FP16,937,4727.8,618.8,84.6,11.82 -baichuan2-7b-chat,INT4-MIXED,32,5782.4,274.1,84.8,11.79 -phi-2,FP16,32,5497.3,244.9,85,11.76 -stable-zephyr-3b-dpo,FP16,30,5714.8,173.1,86,11.63 -red-pajama-incite-chat-3b-v1,FP16,1020,5262.2,817.4,86.2,11.60 -dolly-v2-3b,FP16,1024,2376.1,935.5,87,11.49 -qwen-7b-chat,INT4-MIXED,32,8597.4,226.2,87.7,11.40 -phi-2,FP16,1024,4063.9,969.8,89.7,11.15 -chatglm3-6b,INT8-CW,32,6158.8,123.4,89.8,11.14 -stable-zephyr-3b-dpo,FP16,946,5337.1,781.4,90.5,11.05 -baichuan2-7b-chat,INT4-MIXED,1024,807.4,1725.7,91.8,10.89 -vicuna-7b-v1.5,INT8-CW,32,7391,171.3,92.5,10.81 -chatglm3-6b,INT8-CW,1024,550.7,1210.9,93.3,10.72 -phi-3-mini-4k-instruct,FP16,38,8299.3,142,94.1,10.63 -qwen2-7b,INT8-CW,32,9941.1,139.1,94.9,10.54 -qwen-7b-chat-gptq,INT4-MIXED,1024,6545,1103.9,95.8,10.44 -qwen2-7b,INT8-CW,1024,10575.1,1183,96.7,10.34 -qwen-7b-chat,INT4-MIXED,1024,6777.4,1309.6,96.9,10.32 -vicuna-7b-v1.5,INT8-CW,1024,8013.7,1154.6,96.9,10.32 -phi-3-medium-4k-instruct,INT4-MIXED,38,8212.8,448.3,97,10.31 -zephyr-7b-beta,INT8-CW,32,7888,144.8,97.4,10.27 -phi-3-mini-4k-instruct,FP16,1061,8814.8,1195.7,98.7,10.13 -zephyr-7b-beta,INT8-CW,1024,8136.7,1191.6,99.4,10.06 -llama-2-13b-chat-hf,INT4-MIXED,32,6927.5,165.3,99.9,10.01 +opt-125m-gptq,INT4-MIXED,32,833.1,15.6,3.9,256.4 +opt-125m-gptq,INT4-MIXED,1024,955.9,553.8,4.8,208.3 +bloomz-560m,INT4-MIXED,32,1457.5,48.5,11.1,90.1 +qwen2-0.5b,INT4-MIXED,32,1167.8,95.7,11.5,87.0 +qwen2-0.5b,INT4-MIXED,1024,1266,2330.3,12.7,78.7 +qwen2-0.5b,INT8-CW,32,1496.3,90.5,12.8,78.1 +bloomz-560m,INT8-CW,32,1724.2,84,13.9,71.9 +qwen2-0.5b,INT8-CW,1024,1593,2370.7,14,71.4 +bloomz-560m,INT4-MIXED,1024,1691,2005.3,15.2,65.8 +qwen2-0.5b,FP16,32,2989.8,94.6,15.9,62.9 +bloomz-560m,INT8-CW,1024,1941,2343.4,16.1,62.1 +qwen2-0.5b,FP16,1024,3088.1,2376.8,17.4,57.5 +bloomz-560m,FP16,32,3857,86.7,17.5,57.1 +bloomz-560m,FP16,1024,4085.6,2373.4,19.8,50.5 +tiny-llama-1.1b-chat,INT4-MIXED,32,1738.9,237.4,20,50.0 +tiny-llama-1.1b-chat,INT8-CW,32,2471.2,224.6,22.6,44.2 +tiny-llama-1.1b-chat,INT4-MIXED,1024,1929.3,5993,22.7,44.1 +tiny-llama-1.1b-chat,INT8-CW,1024,2661.8,6238.8,25.2,39.7 +qwen2-1.5b,INT4-MIXED,32,2429,312.8,28.4,35.2 +tiny-llama-1.1b-chat,FP16,32,4834.9,231.7,28.9,34.6 +tiny-llama-1.1b-chat,FP16,1024,5023.2,6191.5,31.7,31.5 +qwen2-1.5b,INT4-MIXED,1024,2600.3,7597.3,31.8,31.4 +stablelm-3b-4e1t,INT4-MIXED,32,3982.1,348.4,32.1,31.2 +qwen2-1.5b,INT8-CW,32,3619,301,32.7,30.6 +qwen2-1.5b,INT8-CW,1024,3790.3,7990.5,34.6,28.9 +stablelm-3b-4e1t,INT4-MIXED,1023,4455.4,11963.2,39.2,25.5 +minicpm-1b-sft,INT4-MIXED,31,5815.4,214.3,40.1,24.9 +qwen2-1.5b,FP16,32,7582.3,304.4,42.2,23.7 +minicpm-1b-sft,INT8-CW,31,6609.6,210.6,43.3,23.1 +qwen2-1.5b,FP16,1024,7753.4,7915.3,44.2,22.6 +gemma-2b-it,INT4-MIXED,32,3728.2,523,46.2,21.6 +stable-zephyr-3b-dpo,INT4-MIXED,32,3689.3,656.5,47.4,21.1 
+gemma-2b-it,INT4-MIXED,1024,4207.3,11867.9,47.5,21.1 +minicpm-1b-sft,FP16,31,8999.8,222.2,49.1,20.4 +red-pajama-incite-chat-3b-v1,INT4-MIXED,32,3448.1,1028.9,49.6,20.2 +dolly-v2-3b,INT4-MIXED,32,3448.4,714.8,49.9,20.0 +gemma-2b-it,INT8-CW,32,5423.2,488.8,51,19.6 +gemma-2b-it,INT8-CW,1024,5902.7,12434.4,52.3,19.1 +stable-zephyr-3b-dpo,INT8-CW,32,5630.3,694.5,54.4,18.4 +phi-2,INT4-MIXED,32,3732.9,723.2,54.5,18.3 +phi-2,INT8-CW,32,5600.4,747,55.7,18.0 +dolly-v2-3b,INT8-CW,32,5589.7,1009.8,55.9,17.9 +red-pajama-incite-chat-3b-v1,INT8-CW,32,5590.1,698.9,55.9,17.9 +stablelm-3b-4e1t,INT8-CW,32,5630.1,660.7,56.1,17.8 +dolly-v2-3b,INT4-MIXED,1024,3984.5,15502.8,56.5,17.7 +red-pajama-incite-chat-3b-v1,INT4-MIXED,1023,3915.6,15363.9,56.6,17.7 +llama-2-7b-gptq,INT4-MIXED,32,8618.5,782.9,56.9,17.6 +phi-2,INT4-MIXED,1024,4251.3,15317,61,16.4 +phi-2,INT8-CW,1024,6119.4,15886.6,62,16.1 +red-pajama-incite-chat-3b-v1,INT8-CW,1023,6056.9,15984.9,62.2,16.1 +dolly-v2-3b,INT8-CW,1024,6124.9,16099.7,62.5,16.0 +stablelm-3b-4e1t,INT8-CW,1023,6097.1,16206.9,62.5,16.0 +gemma-2b-it,FP16,32,12208.2,501.4,65.5,15.3 +llama-3-8b,INT4-MIXED,33,8741.2,869,65.7,15.2 +llama-2-7b-gptq,INT4-MIXED,1024,9468.1,26350.7,66.1,15.1 +qwen-7b-chat-gptq,INT4-MIXED,32,8561,773.7,67,14.9 +gemma-2b-it,FP16,1024,12687.8,12168.7,67.1,14.9 +mistral-7b-v0.1,INT4-MIXED,32,8588.7,1020.6,67.4,14.8 +llama-2-7b-chat-hf,INT4-MIXED,32,8626.8,1100,69.4,14.4 +phi-2,FP16,32,11385.9,693.8,70.2,14.2 +dolly-v2-3b,FP16,32,11359,688.5,70.5,14.2 +stable-zephyr-3b-dpo,FP16,32,11432.9,648.5,70.6,14.2 +red-pajama-incite-chat-3b-v1,FP16,32,11364,692.4,70.7,14.1 +stablelm-3b-4e1t,FP16,32,11432.6,649,71.1,14.1 +llama-3-8b,INT4-MIXED,1025,9254.8,29700.3,71.9,13.9 +mistral-7b-v0.1,INT4-MIXED,1024,9121.9,29492.9,73.3,13.6 +phi-3-mini-4k-instruct,INT8-CW,32,7646.1,952.6,75.7,13.2 +qwen-7b-chat-gptq,INT4-MIXED,1024,10458.7,29022.2,75.9,13.2 +zephyr-7b-beta,INT4-MIXED,32,9217.5,1196.6,76.2,13.1 +phi-2,FP16,1024,11902.2,15868,77,13.0 +dolly-v2-3b,FP16,1024,11892.5,15987.1,77.1,13.0 +baichuan2-7b-chat,INT4-MIXED,32,9440.3,1118.1,77.3,12.9 +red-pajama-incite-chat-3b-v1,FP16,1023,11829.1,16008.7,77.3,12.9 +stablelm-3b-4e1t,FP16,1023,11897.5,16030,77.7,12.9 +phi-3-mini-4k-instruct,INT4-MIXED,32,4961.9,968.8,78.2,12.8 +llama-2-7b-chat-hf,INT4-MIXED,1024,9478.1,28958.6,78.6,12.7 +zephyr-7b-beta,INT4-MIXED,1024,9764.2,30982,82.3,12.2 +phi-3-mini-4k-instruct,INT8-CW,1024,8255.7,23200.5,83.1,12.0 +phi-3-mini-4k-instruct,INT4-MIXED,1024,5570.2,22277.1,85.7,11.7 +baichuan2-7b-chat,INT4-MIXED,1024,10305.2,29010,86.4,11.6 +phi-3-mini-4k-instruct,FP16,32,15292.6,934.7,96.4,10.4 +qwen-7b-chat,INT4-MIXED,32,10964.7,1413,97.8,10.2 \ No newline at end of file diff --git a/docs/sphinx_setup/_static/benchmarks_files/llm_models_platform_list_.pdf b/docs/sphinx_setup/_static/benchmarks_files/llm_models_platform_list_.pdf index bedd9c28286476..53198c7ddb7089 100644 Binary files a/docs/sphinx_setup/_static/benchmarks_files/llm_models_platform_list_.pdf and b/docs/sphinx_setup/_static/benchmarks_files/llm_models_platform_list_.pdf differ diff --git a/docs/sphinx_setup/_static/css/custom.css b/docs/sphinx_setup/_static/css/custom.css index f922069c45e354..de8a05732a4d06 100644 --- a/docs/sphinx_setup/_static/css/custom.css +++ b/docs/sphinx_setup/_static/css/custom.css @@ -923,6 +923,8 @@ h5 { position: relative; bottom: -16px; left: 0; + margin-left: auto; + padding-right: 30px; } .modal-footer-content { diff --git a/docs/sphinx_setup/_static/download/GenAI_Quick_Start_Guide.pdf 
b/docs/sphinx_setup/_static/download/GenAI_Quick_Start_Guide.pdf new file mode 100644 index 00000000000000..786f68fdbb86c7 Binary files /dev/null and b/docs/sphinx_setup/_static/download/GenAI_Quick_Start_Guide.pdf differ diff --git a/docs/sphinx_setup/_static/html/modal.html b/docs/sphinx_setup/_static/html/modal.html index 38eb673824f97e..e7bcc1c1c16c58 100644 --- a/docs/sphinx_setup/_static/html/modal.html +++ b/docs/sphinx_setup/_static/html/modal.html @@ -87,6 +87,6 @@

Graph Results

- + + \ No newline at end of file diff --git a/docs/sphinx_setup/_static/html/modalLLM.html b/docs/sphinx_setup/_static/html/modalLLM.html index 37b569d0bd4078..e8535c87f16090 100644 --- a/docs/sphinx_setup/_static/html/modalLLM.html +++ b/docs/sphinx_setup/_static/html/modalLLM.html @@ -87,6 +87,6 @@

Graph Results

- + + \ No newline at end of file diff --git a/docs/sphinx_setup/_static/js/graphs.js b/docs/sphinx_setup/_static/js/graphs.js index 697911bad9402c..04e34d6c2fefe5 100644 --- a/docs/sphinx_setup/_static/js/graphs.js +++ b/docs/sphinx_setup/_static/js/graphs.js @@ -60,8 +60,8 @@ class Filter { // param: GraphData[], clientPlatforms[] static BySortPlatforms(graphDataArr, platformsArr) { return graphDataArr - .filter((data) => platformsArr.includes(data.Platform)) - .sort((a, b) => a.Platform.localeCompare(b.Platform)); + .filter((data) => platformsArr.includes(data.Platform)) + .sort((a, b) => a.Platform.localeCompare(b.Platform)); //sort is necessary } } @@ -145,8 +145,8 @@ class Graph { array.push([obj]) } }) - return array; + return array; } // this returns an object that is used to ender the chart @@ -283,13 +283,13 @@ $(document).ready(function () { const models = networkModels.map((networkModel) => createCheckMark(networkModel, 'networkmodel')); modal.find('.models-column').append(models); - const selectAllModelsButton = createCheckMark('', 'networkmodel', false , false); + const selectAllModelsButton = createCheckMark('', 'networkmodel', false, false); modal.find('.models-selectall').append(selectAllModelsButton); - const selectAllPlatformsButton = createCheckMark('', 'platform', false , false); + const selectAllPlatformsButton = createCheckMark('', 'platform', false, false); modal.find('.platforms-selectall').append(selectAllPlatformsButton); - const precisions = Modal.getPrecisionsLabels(graph).map((precision) => createCheckMark(precision, 'precision', false , false)); + const precisions = Modal.getPrecisionsLabels(graph).map((precision) => createCheckMark(precision, 'precision', false, false)); modal.find('.precisions-column').append(precisions); selectAllCheckboxes(precisions); @@ -304,7 +304,7 @@ $(document).ready(function () { modal.find('#modal-display-graphs').hide(); modal.find('.ietype-column input').first().prop('checked', true); - const kpiLabels = Filter.getParameters(graph).map((parameter) => createCheckMark(parameter, 'kpi', false , true)); + const kpiLabels = Filter.getParameters(graph).map((parameter) => createCheckMark(parameter, 'kpi', false, true)); modal.find('.kpi-column').append(kpiLabels); $('body').prepend(modal); @@ -511,6 +511,7 @@ $(document).ready(function () { listContainer.style.margin = 0; listContainer.style.padding = 0; listContainer.style.paddingLeft = '0px'; + listContainer.style.float = "right"; legendContainer.appendChild(listContainer); } @@ -521,57 +522,55 @@ $(document).ready(function () { const htmlLegendPlugin = { id: 'htmlLegend', afterUpdate(chart, args, options) { - + charts = [...new Set([...charts, ...[chart]])]; const ul = getOrCreateLegendList(chart, chart.options.plugins.htmlLegend.containerID); - // Remove old legend items while (ul.firstChild) { ul.firstChild.remove(); } - const items = chart.legend.legendItems; + const items = chart.options.plugins.legend.labels.generateLabels(chart); items.forEach(item => { const li = document.createElement('li'); li.style.alignItems = 'center'; li.style.display = 'block'; li.style.flexDirection = 'column'; - li.style.marginLeft = '4px'; - + li.style.marginLeft = '6px'; + li.style.cursor = "pointer"; + li.style.fontSize = '0.6rem'; + li.style.textDecoration = item.hidden ? 
'line-through' : ''; li.onclick = () => { - chart.toggleDataVisibility(item.index); - chart.update(); + charts.forEach((chartItem) => { + chartItem.setDatasetVisibility(item.datasetIndex, !chartItem.isDatasetVisible(item.datasetIndex)); + chartItem.update(); + }) }; - - // Color box + const boxSpan = document.createElement('span'); boxSpan.style.background = item.fillStyle; boxSpan.style.borderColor = item.strokeStyle; - boxSpan.style.borderWidth = item.lineWidth + 'px'; boxSpan.style.display = 'inline-block'; boxSpan.style.height = '10px'; boxSpan.style.marginRight = '4px'; boxSpan.style.width = '30px'; - // Text - const textContainer = document.createElement('p'); - textContainer.style.color = '#666'; - textContainer.style.margin = 0; - textContainer.style.padding = 0; - textContainer.style.fontSize = '0.6rem'; - textContainer.style.marginLeft = '3px'; - textContainer.style.textDecoration = item.hidden ? 'line-through' : ''; + const textSpan = document.createElement('span'); + textSpan.style.bottom = '1px' + textSpan.style.position = 'relative' + textSpan.style.fontSize = '0.6rem'; + textSpan.style.textDecoration = item.hidden ? 'line-through' : ''; const text = document.createTextNode(item.text); - textContainer.appendChild(text); + textSpan.appendChild(text); li.appendChild(boxSpan); - li.appendChild(textContainer); + li.appendChild(textSpan); ul.appendChild(li); }); } }; - function getChartOptionsByEngines(containerId, allowedAxisIDs) { + function getChartOptionsByEngines(allowedAxisIDs) { const axisConfigs = { x: { title: { display: true, text: 'Request Rate' } @@ -602,11 +601,11 @@ $(document).ready(function () { }, {}), plugins: { legend: { display: false }, - htmlLegend: { containerID: containerId } + htmlLegend: { containerID: 'modal-footer' } } }; } - function getChartOptions(title, containerId) { + function getChartOptions(title) { return { responsive: true, indexAxis: 'y', @@ -633,7 +632,7 @@ $(document).ready(function () { display: false }, htmlLegend: { - containerID: containerId, + containerID: 'modal-footer', } } } @@ -838,7 +837,7 @@ $(document).ready(function () { new Chart(context, { type: 'bar', data: getChartData(labels, datasets), - options: getChartOptions(chartTitle, containerId), + options: getChartOptions(chartTitle), plugins: [htmlLegendPlugin] }); }); @@ -858,9 +857,9 @@ $(document).ready(function () { }) } } - + var charts = []; function processMetricByEngines(labels, datasets, container, widthClass, id) { - var heightRatio = (80 + (labels.length * 55)); + var heightRatio = (30 + (labels.length * 55)); var chart = $('
'); const containerId = `legend-container-${id}`; const legend = $(`
`); @@ -894,8 +893,7 @@ $(document).ready(function () { backgroundColor: precision.color, yAxisID: precision.label === "Throughput" ? 'y' : 'y1', fill: false - } - ) + }) }) }) @@ -914,9 +912,10 @@ $(document).ready(function () { labels: labels, datasets: graphDatas }, - options: getChartOptionsByEngines(containerId, allowedAxisIDs), + options: getChartOptionsByEngines(allowedAxisIDs), plugins: [htmlLegendPlugin] }); + }); } diff --git a/docs/sphinx_setup/_templates/layout.html b/docs/sphinx_setup/_templates/layout.html index 7f873c679c2e83..0d2331b2c83fe3 100644 --- a/docs/sphinx_setup/_templates/layout.html +++ b/docs/sphinx_setup/_templates/layout.html @@ -11,7 +11,7 @@ - + diff --git a/docs/sphinx_setup/index.rst b/docs/sphinx_setup/index.rst index 2e6f960468015f..ad98be58cde1cd 100644 --- a/docs/sphinx_setup/index.rst +++ b/docs/sphinx_setup/index.rst @@ -11,8 +11,8 @@ generative AI, video, audio, and language with models from popular frameworks li TensorFlow, ONNX, and more. Convert and optimize models, and deploy across a mix of Intel® hardware and environments, on-premises and on-device, in the browser or in the cloud. -Check out the `OpenVINO Cheat Sheet. `__ - +| Check out the `OpenVINO Cheat Sheet [PDF] `__ +| Check out the `GenAI Quick-start Guide [PDF] `__ .. container:: diff --git a/src/bindings/python/src/openvino/frontend/pytorch/gptq.py b/src/bindings/python/src/openvino/frontend/pytorch/gptq.py index a1c6aecc45d421..60a48c275d6681 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/gptq.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/gptq.py @@ -177,15 +177,3 @@ def unpatch_model(model): log.warning("Exception raised during GPTQ model unpatching. " "Depending on the exact issue it may lead to broken " "original model.\n%s", error) - - -def detect_gptq_model_raw(model): - return (model and getattr(model, 'config', None) and - getattr(model.config, 'quantization_config', None) and - model.config.quantization_config.quant_method == 'gptq') - - -def detect_gptq_model(model): - return (detect_gptq_model_raw(model) or - getattr(model, 'model', None) and - detect_gptq_model_raw(model.model)) diff --git a/src/bindings/python/src/openvino/frontend/pytorch/quantized.py b/src/bindings/python/src/openvino/frontend/pytorch/quantized.py new file mode 100644 index 00000000000000..310e95cb9985d7 --- /dev/null +++ b/src/bindings/python/src/openvino/frontend/pytorch/quantized.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +from typing import Optional +import torch +from openvino.frontend.pytorch import ModuleExtension, gptq +from openvino.frontend.pytorch.patch_model import patch_model, unpatch_model + + +def detect_quantized_model(model: torch.nn.Module) -> Optional[str]: + """Detects the quantization method used in a given PyTorch model. + + Args: + model (torch.nn.Module): The PyTorch model to check for quantization. + + Returns: + str: The quantization method if available, otherwise None. + """ + if (model and getattr(model, "config", None) + and getattr(model.config, "quantization_config", None)): + return model.config.quantization_config.quant_method + if getattr(model, "model", None): + return detect_quantized_model(model.model) + return None + + +def patch_quantized(model: torch.nn.Module) -> None: + """Patches a model based on its quantization type ("awq" or "gptq"). + + Args: + model (torch.nn.Module): The model to patch. 
+ + Raises: + RuntimeError: If the quantization type is unknown. + """ + quant_type = detect_quantized_model(model) + if quant_type == "awq": + extensions = {} + try: + from awq.modules.linear import WQLinear_GEMM + extensions[WQLinear_GEMM] = ModuleExtension( + WQLinear_GEMM, "ov_ext::awq_gemm", + convert=lambda module, target_op, *args, **kwargs: target_op( + args[0], module.qweight, module.qzeros, module.scales, + torch.tensor(module.group_size), + torch.tensor(module.w_bit), module.bias), + evaluate=lambda module, *args, **kwargs: torch.full( + list(args[0].shape[:-1]) + [module.out_features], 0.5, + dtype=torch.float32)) # type: ignore + except ImportError: + pass + patch_model(model, extensions, + "_openvino_quantized_patch_orig_forward") # type: ignore + elif quant_type == "gptq": + model._openvino_gptq_patched = True + gptq.patch_model(model) # type: ignore + else: + raise RuntimeError(f"Unknown quantization type: {quant_type}.") + + +def unpatch_quantized(model: torch.nn.Module) -> None: + """Reverts the patching applied to a quantized PyTorch model. + + Args: + model (torch.nn.Module): The model to unpatch. + """ + if getattr(model, "_openvino_gptq_patched", False): + gptq.unpatch_model(model) # type: ignore + del model._openvino_gptq_patched + else: + unpatch_model(model, + "_openvino_quantized_patch_orig_forward") # type: ignore diff --git a/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py b/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py index eb32a0a93c669b..6d8fdb1658793e 100644 --- a/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py +++ b/src/bindings/python/src/openvino/frontend/pytorch/ts_decoder.py @@ -16,7 +16,7 @@ graph_has_ops, ) from openvino.runtime import opset11 as ops -from openvino.frontend.pytorch import gptq, patch_model +from openvino.frontend.pytorch import quantized, patch_model from openvino.frontend.pytorch.module_extension import ModuleExtension import inspect @@ -141,27 +141,25 @@ def _get_scripted_model(self, pt_module, example_inputs=None, skip_freeze=False) patch_model.patch_model( pt_module, self.module_extensions, orig_forward_name) - gptq_patched = False - if gptq.detect_gptq_model(pt_module): + patched = False + if quantized.detect_quantized_model(pt_module) is not None: try: - gptq.patch_model(pt_module) - gptq_patched = True + quantized.patch_quantized(pt_module) + patched = True except Exception as error: log.warning( - "Failed patching of AutoGPTQ model. Error message:\n%s" - "\nTracing of the model will likely be unsuccessful or incorrect", - error) - gptq.unpatch_model(pt_module) - gptq_patched = False + "Failed patching of AutoGPTQ model. 
Error message:\n" + "Tracing of the model will likely be unsuccessful or incorrect", + exc_info=error) + quantized.unpatch_quantized(pt_module) + patched = False try: scripted = torch.jit.trace( pt_module, **input_parameters, strict=False) finally: - if gptq_patched: - gptq.unpatch_model(pt_module) - if self.module_extensions: - patch_model.unpatch_model(pt_module, orig_forward_name) + if patched: + quantized.unpatch_quantized(pt_module) have_to_freeze_ops = ["prim::Uninitialized", "prim::unchecked_cast", "aten::append"] diff --git a/src/bindings/python/src/openvino/passes/__init__.py b/src/bindings/python/src/openvino/passes/__init__.py new file mode 100644 index 00000000000000..037d9774c5b9a0 --- /dev/null +++ b/src/bindings/python/src/openvino/passes/__init__.py @@ -0,0 +1,19 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# type: ignore +# flake8: noqa + +from openvino._pyopenvino.passes import ModelPass, Matcher, MatcherPass, PassBase, WrapType, Or, AnyInput, Optional +from openvino._pyopenvino.passes import ( + consumers_count, + has_static_dim, + has_static_dims, + has_static_shape, + has_static_rank, + rank_equals, + type_matches, + type_matches_any, +) +from openvino._pyopenvino.passes import Serialize, ConstantFolding, VisualizeTree, MakeStateful, LowLatency2, ConvertFP32ToFP16, Version +from openvino.passes.manager import Manager +from openvino.passes.graph_rewrite import GraphRewrite, BackwardGraphRewrite diff --git a/src/bindings/python/src/openvino/runtime/passes/graph_rewrite.py b/src/bindings/python/src/openvino/passes/graph_rewrite.py similarity index 100% rename from src/bindings/python/src/openvino/runtime/passes/graph_rewrite.py rename to src/bindings/python/src/openvino/passes/graph_rewrite.py diff --git a/src/bindings/python/src/openvino/runtime/passes/manager.py b/src/bindings/python/src/openvino/passes/manager.py similarity index 100% rename from src/bindings/python/src/openvino/runtime/passes/manager.py rename to src/bindings/python/src/openvino/passes/manager.py diff --git a/src/bindings/python/src/openvino/runtime/passes/__init__.py b/src/bindings/python/src/openvino/runtime/passes/__init__.py index 19a28c7576decd..a74f91fdcfab2e 100644 --- a/src/bindings/python/src/openvino/runtime/passes/__init__.py +++ b/src/bindings/python/src/openvino/runtime/passes/__init__.py @@ -3,8 +3,8 @@ # type: ignore # flake8: noqa -from openvino._pyopenvino.passes import ModelPass, Matcher, MatcherPass, PassBase, WrapType, Or, AnyInput, Optional -from openvino._pyopenvino.passes import ( +from openvino.passes import ModelPass, Matcher, MatcherPass, PassBase, WrapType, Or, AnyInput, Optional +from openvino.passes import ( consumers_count, has_static_dim, has_static_dims, @@ -14,6 +14,6 @@ type_matches, type_matches_any, ) -from openvino._pyopenvino.passes import Serialize, ConstantFolding, VisualizeTree, MakeStateful, LowLatency2, ConvertFP32ToFP16, Version -from openvino.runtime.passes.manager import Manager -from openvino.runtime.passes.graph_rewrite import GraphRewrite, BackwardGraphRewrite +from openvino.passes import Serialize, ConstantFolding, VisualizeTree, MakeStateful, LowLatency2, ConvertFP32ToFP16, Version +from openvino.passes.manager import Manager +from openvino.passes.graph_rewrite import GraphRewrite, BackwardGraphRewrite diff --git a/src/bindings/python/src/openvino/runtime/passes/graph_rewrite/__init__.py b/src/bindings/python/src/openvino/runtime/passes/graph_rewrite/__init__.py new file mode 100644 index 
00000000000000..a9690e891ff5e8 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/passes/graph_rewrite/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# type: ignore +from openvino.passes.graph_rewrite import GraphRewrite, BackwardGraphRewrite diff --git a/src/bindings/python/src/openvino/runtime/passes/manager/__init__.py b/src/bindings/python/src/openvino/runtime/passes/manager/__init__.py new file mode 100644 index 00000000000000..1a2674dd03c2b1 --- /dev/null +++ b/src/bindings/python/src/openvino/runtime/passes/manager/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# type: ignore +from openvino.passes.manager import Manager diff --git a/src/bindings/python/tests/test_graph/test_manager.py b/src/bindings/python/tests/test_graph/test_manager.py index ff72ef43158d6e..5101414228c06e 100644 --- a/src/bindings/python/tests/test_graph/test_manager.py +++ b/src/bindings/python/tests/test_graph/test_manager.py @@ -9,7 +9,7 @@ import openvino.runtime.opset10 as ops from openvino import Core, Model -from openvino.runtime.passes import Manager, Serialize, ConstantFolding, Version +from openvino.passes import Manager, Serialize, ConstantFolding, Version from tests.test_graph.util import count_ops_of_type from tests.utils.helpers import create_filenames_for_ir, compare_models @@ -48,6 +48,23 @@ def test_constant_folding(): assert np.allclose(values_out, values_expected) +def test_runtime_passes_manager(): + import openvino.runtime.passes as rt + node_constant = ops.constant(np.array([[0.0, 0.1, -0.1], [-2.5, 2.5, 3.0]], dtype=np.float32)) + node_ceil = ops.ceiling(node_constant) + model = Model(node_ceil, [], "TestModel") + + assert count_ops_of_type(model, node_ceil) == 1 + assert count_ops_of_type(model, node_constant) == 1 + + pass_manager = rt.Manager() + pass_manager.register_pass(rt.ConstantFolding()) + pass_manager.run_passes(model) + + assert count_ops_of_type(model, node_ceil) == 0 + assert count_ops_of_type(model, node_constant) == 1 + + # request - https://docs.pytest.org/en/7.1.x/reference/reference.html#request @pytest.fixture def prepare_ir_paths(request, tmp_path): diff --git a/src/bindings/python/tests/test_transformations/test_graph_rewrite.py b/src/bindings/python/tests/test_transformations/test_graph_rewrite.py index 4821dad33dff0a..5f5c100597adf2 100644 --- a/src/bindings/python/tests/test_transformations/test_graph_rewrite.py +++ b/src/bindings/python/tests/test_transformations/test_graph_rewrite.py @@ -2,7 +2,7 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from openvino.runtime import opset8 -from openvino.runtime.passes import Manager, GraphRewrite, MatcherPass, WrapType, Matcher +from openvino.passes import Manager, GraphRewrite, MatcherPass, WrapType, Matcher from tests.test_transformations.utils.utils import count_ops, get_relu_model, PatternReplacement @@ -19,6 +19,19 @@ def test_graph_rewrite(): assert count_ops(model, "Relu") == [2] +def test_runtime_graph_rewrite(): + import openvino.runtime.passes as rt + model = get_relu_model() + + manager = rt.Manager() + # check that register pass returns pass instance + anchor = manager.register_pass(rt.GraphRewrite()) + anchor.add_matcher(PatternReplacement()) + manager.run_passes(model) + + assert count_ops(model, "Relu") == [2] + + def test_register_new_node(): class InsertExp(MatcherPass): def 
__init__(self): diff --git a/src/bindings/python/tests/test_transformations/test_manager.py b/src/bindings/python/tests/test_transformations/test_manager.py index e78c62d8c1a5c4..e8f113f30b381c 100644 --- a/src/bindings/python/tests/test_transformations/test_manager.py +++ b/src/bindings/python/tests/test_transformations/test_manager.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.passes import Manager, GraphRewrite, BackwardGraphRewrite, Serialize +from openvino.passes import Manager, GraphRewrite, BackwardGraphRewrite, Serialize from tests.test_transformations.utils.utils import MyModelPass, PatternReplacement, expect_exception diff --git a/src/bindings/python/tests/test_transformations/test_matcher_pass.py b/src/bindings/python/tests/test_transformations/test_matcher_pass.py index c32483be316658..8127e4b7612d56 100644 --- a/src/bindings/python/tests/test_transformations/test_matcher_pass.py +++ b/src/bindings/python/tests/test_transformations/test_matcher_pass.py @@ -2,7 +2,7 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 from openvino.runtime import opset8 -from openvino.runtime.passes import Manager, Matcher, MatcherPass, WrapType +from openvino.passes import Manager, Matcher, MatcherPass, WrapType from openvino.runtime.utils import replace_node from tests.test_transformations.utils.utils import count_ops, get_relu_model, PatternReplacement diff --git a/src/bindings/python/tests/test_transformations/test_model_pass.py b/src/bindings/python/tests/test_transformations/test_model_pass.py index 5df3d0a9024dc2..efc797535d8bb8 100644 --- a/src/bindings/python/tests/test_transformations/test_model_pass.py +++ b/src/bindings/python/tests/test_transformations/test_model_pass.py @@ -1,7 +1,7 @@ # -*- coding: utf-8 -*- # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -from openvino.runtime.passes import Manager +from openvino.passes import Manager from tests.test_transformations.utils.utils import get_relu_model, MyModelPass diff --git a/src/bindings/python/tests/test_transformations/test_pattern_ops.py b/src/bindings/python/tests/test_transformations/test_pattern_ops.py index 24b28061582c68..c445c281e47171 100644 --- a/src/bindings/python/tests/test_transformations/test_pattern_ops.py +++ b/src/bindings/python/tests/test_transformations/test_pattern_ops.py @@ -6,8 +6,8 @@ from openvino import PartialShape from openvino.runtime import opset13 as ops -from openvino.runtime.passes import Matcher, WrapType, Or, AnyInput, Optional -from openvino.runtime.passes import ( +from openvino.passes import Matcher, WrapType, Or, AnyInput, Optional +from openvino.passes import ( consumers_count, has_static_dim, has_static_dims, diff --git a/src/bindings/python/tests/test_transformations/test_public_transformations.py b/src/bindings/python/tests/test_transformations/test_public_transformations.py index a10fea786b9770..eac356cd1341f5 100644 --- a/src/bindings/python/tests/test_transformations/test_public_transformations.py +++ b/src/bindings/python/tests/test_transformations/test_public_transformations.py @@ -7,7 +7,7 @@ from openvino import Model, PartialShape, Shape, Core from openvino.runtime import opset13 as ops -from openvino.runtime.passes import ( +from openvino.passes import ( Manager, ConstantFolding, MakeStateful, diff --git a/src/bindings/python/tests/test_transformations/utils/utils.py 
b/src/bindings/python/tests/test_transformations/utils/utils.py index e0239ce05fdc9d..b5f09a68ff1511 100644 --- a/src/bindings/python/tests/test_transformations/utils/utils.py +++ b/src/bindings/python/tests/test_transformations/utils/utils.py @@ -4,7 +4,7 @@ from openvino import Model, PartialShape from openvino.runtime import opset13 as ops -from openvino.runtime.passes import ModelPass, Matcher, MatcherPass, WrapType +from openvino.passes import ModelPass, Matcher, MatcherPass, WrapType def get_relu_model(): diff --git a/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp b/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp index 71b5f4ba6c6f96..4d3c9f95350f4b 100644 --- a/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp +++ b/src/common/snippets/include/snippets/lowered/pass/solve_buffer_memory.hpp @@ -34,6 +34,10 @@ class SolveBufferMemory : public Pass { */ bool run(lowered::LinearIR& linear_ir) override; + // For better performance, data should be aligned with the cache line size. + // The majority of CPUs have a cache line size of `64` bytes. + constexpr static size_t byte_alignment = 64; + private: using Buffers = std::vector; /** @@ -64,8 +68,6 @@ class SolveBufferMemory : public Pass { void set_dynamic_buffer_offset(const Buffers& dynamic_buffer_expressions); size_t& m_static_buffer_scratchpad_size; - - constexpr static size_t m_alignment = 32; // 32 bytes for data alignment in allocated memory }; } // namespace pass diff --git a/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp b/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp index ca85cefd369099..ec7ab6c95eb89a 100644 --- a/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp +++ b/src/common/snippets/src/lowered/pass/solve_buffer_memory.cpp @@ -102,9 +102,8 @@ std::vector SolveBufferMemory::init_boxes(const Buffers& boxes.reserve(map_boxes.size()); for (auto& p : map_boxes) { auto& box = p.second; - // We use data alignment to put data in the line cache - // TODO [143395] : Please check if alignment is really needed here - box.size = utils::div_up(box.size, m_alignment); + // Align with cache line size. The experiments show that it affects performance.
+ box.size = utils::div_up(box.size, byte_alignment); boxes.push_back(box); } @@ -116,12 +115,12 @@ void SolveBufferMemory::solve_static_buffer_memory(const Buffers& static_buffer_ const auto boxes = init_boxes(static_buffer_expressions, linear_ir); ov::MemorySolver memSolver(boxes); - m_static_buffer_scratchpad_size = static_cast(memSolver.solve()) * m_alignment; // alignment in byte + m_static_buffer_scratchpad_size = static_cast(memSolver.solve()) * byte_alignment; // alignment in byte // Set offsets for Buffers for (const auto& buffer_expr : static_buffer_expressions) { const auto offset = static_cast(memSolver.get_offset(static_cast(buffer_expr->get_cluster_id()))); - buffer_expr->set_offset(offset * m_alignment); // alignment in byte + buffer_expr->set_offset(offset * byte_alignment); // alignment in byte } } diff --git a/src/common/snippets/src/runtime_configurator.cpp b/src/common/snippets/src/runtime_configurator.cpp index 96d13074d042ba..06beb8db94ae3d 100644 --- a/src/common/snippets/src/runtime_configurator.cpp +++ b/src/common/snippets/src/runtime_configurator.cpp @@ -8,6 +8,8 @@ #include "snippets/lowered/pass/init_loops.hpp" #include "snippets/lowered/pass/insert_specific_iterations.hpp" #include "snippets/lowered/pass/mha_parallel_wa_optimizer.hpp" +#include "snippets/lowered/pass/solve_buffer_memory.hpp" +#include "snippets/pass/split_dimension_m.hpp" #include "snippets/snippets_isa.hpp" #include "snippets/utils/loop_utils.hpp" #include "snippets/utils/utils.hpp" @@ -228,7 +230,8 @@ void RuntimeConfigurator::update_loop_info(const lowered::LinearIRCPtr& linear_i void RuntimeConfigurator::update_buffer_scratchpad_size(const lowered::LinearIRCPtr& linear_ir) const { const auto& loop_manager = linear_ir->get_loop_manager(); - m_config->buffer_scratchpad_size = linear_ir->get_static_buffer_scratchpad_size(); + // Align initial buffer scratchpad size with cache line size + m_config->buffer_scratchpad_size = utils::rnd_up(linear_ir->get_static_buffer_scratchpad_size(), lowered::pass::SolveBufferMemory::byte_alignment); auto is_not_executed = [&loop_manager](const lowered::ExpressionPtr& buffer_expr) { const auto& loop_ids = buffer_expr->get_loop_ids(); @@ -254,6 +257,9 @@ additional_size = std::max(allocation_size * buffer_expr->get_node()->get_element_type().size(), additional_size); } + // Align with cache line size. The experiments show that it affects performance.
+ additional_size = utils::rnd_up(additional_size, lowered::pass::SolveBufferMemory::byte_alignment); + cluster_offset = m_config->buffer_scratchpad_size; OPENVINO_ASSERT(!utils::is_dynamic_value(cluster_offset), "Offset of the cluster must be defined!"); m_config->buffer_scratchpad_size += additional_size; diff --git a/src/common/transformations/include/ov_ops/glu.hpp b/src/common/transformations/include/ov_ops/glu.hpp index 760641978b574d..add8c3a0582525 100644 --- a/src/common/transformations/include/ov_ops/glu.hpp +++ b/src/common/transformations/include/ov_ops/glu.hpp @@ -75,10 +75,6 @@ class TRANSFORMATIONS_API GLU : public ov::op::Op { ov::element::Type m_output_type{}; }; -// TODO 157615: Move to shape_inference -TRANSFORMATIONS_API std::vector shape_infer(const GLU* op, - std::vector input_shapes); - } // namespace internal } // namespace op } // namespace ov diff --git a/src/common/transformations/src/ov_ops/glu.cpp b/src/common/transformations/src/ov_ops/glu.cpp index bc3dfb89ab8b9b..9b5fb780d36bb8 100644 --- a/src/common/transformations/src/ov_ops/glu.cpp +++ b/src/common/transformations/src/ov_ops/glu.cpp @@ -4,10 +4,9 @@ #include "ov_ops/glu.hpp" +#include "glu_shape_inference.hpp" #include "openvino/core/partial_shape.hpp" #include "openvino/core/validation_util.hpp" -#include "openvino/op/variadic_split.hpp" -#include "variadic_split_shape_inference.hpp" namespace ov { namespace op { @@ -38,11 +37,9 @@ bool GLU::visit_attributes(ov::AttributeVisitor& visitor) { void GLU::validate_and_infer_types() { auto output_type = m_output_type == ov::element::undefined ? get_input_element_type(0) : m_output_type; - std::vector input_shapes = {get_input_partial_shape(0), - ov::PartialShape(ov::Shape{}), - ov::PartialShape(ov::Shape{2})}; - - set_output_type(0, output_type, shape_infer(this, input_shapes)[0]); + const auto input_shapes = ov::util::get_node_input_partial_shapes(*this); + const auto output_shapes = shape_infer(this, input_shapes); + set_output_type(0, output_type, output_shapes[0]); } std::shared_ptr GLU::clone_with_new_inputs(const ov::OutputVector& new_args) const { @@ -54,21 +51,6 @@ std::shared_ptr GLU::clone_with_new_inputs(const ov::OutputVector& new_arg m_split_to_glu_idx, m_output_type); } - -std::vector shape_infer(const GLU* op, std::vector input_shapes) { - ov::op::v1::VariadicSplit variadic_split; - std::vector axis = {op->get_axis()}; - std::vector split_lengths = {op->get_split_lengths(), -1}; - - std::unordered_map const_data; - const_data.emplace(1, ov::Tensor(ov::element::i64, ov::Shape{}, static_cast(axis.data()))); - const_data.emplace( - 2, - ov::Tensor(ov::element::i64, ov::Shape{split_lengths.size()}, static_cast(split_lengths.data()))); - - return ov::op::v1::shape_infer(&variadic_split, input_shapes, ov::make_tensor_accessor(const_data)); -} - } // namespace internal } // namespace op } // namespace ov diff --git a/src/core/include/openvino/core/any.hpp b/src/core/include/openvino/core/any.hpp index 9badb007d526b9..e002756d361f1f 100644 --- a/src/core/include/openvino/core/any.hpp +++ b/src/core/include/openvino/core/any.hpp @@ -485,6 +485,7 @@ class OPENVINO_API Any { using Ptr = std::shared_ptr; virtual const std::type_info& type_info() const = 0; virtual std::vector base_type_info() const = 0; + bool is_base_type_info(const std::type_info& type_info) const; virtual const void* addressof() const = 0; void* addressof() { return const_cast(const_cast(this)->addressof()); @@ -506,6 +507,9 @@ class OPENVINO_API Any { std::string to_string() const; 
bool is(const std::type_info& other) const; + bool is_signed_integral() const; + bool is_unsigned_integral() const; + bool is_floating_point() const; template bool is() const { @@ -514,17 +518,24 @@ class OPENVINO_API Any { template T& as() & { - type_check(typeid(decay_t)); return *static_cast*>(addressof()); } template const T& as() const& { - type_check(typeid(decay_t)); return *static_cast*>(addressof()); } + template + T convert() const; + protected: + template + [[noreturn]] U convert_impl() const; + + template + U convert_impl() const; + virtual ~Base() = default; }; @@ -685,6 +696,92 @@ class OPENVINO_API Any { T value; }; + // Generic if there is no specialization for T. + template + T& as_impl(...) { + impl_check(); + if (is()) { + return _impl->as(); + } + + OPENVINO_THROW("Bad as from: ", _impl->type_info().name(), " to: ", typeid(T).name()); + } + + template ::value>::type* = nullptr> + T& as_impl(int) { + if (_impl != nullptr) { + if (_impl->is()) { + return _impl->as(); + } else { + _temp = std::make_shared>(); + _impl->read_to(*_temp); + return _temp->as(); + } + } else { + _temp = std::make_shared>(); + return _temp->as(); + } + } + + template < + class T, + typename std::enable_if>::value>::type* = nullptr> + T& as_impl(int) { + if (_impl == nullptr) { + _temp = std::make_shared>>(T{}); + return _temp->as(); + } else { + if (_impl->is()) { + return _impl->as(); + } else { + auto runtime_attribute = _impl->as_runtime_attribute(); + if (runtime_attribute == nullptr) { + OPENVINO_THROW("Any does not contains pointer to runtime_attribute. It contains ", + _impl->type_info().name()); + } + auto vptr = std::dynamic_pointer_cast(runtime_attribute); + if (vptr == nullptr && T::element_type::get_type_info_static() != runtime_attribute->get_type_info() && + T::element_type::get_type_info_static() != RuntimeAttribute::get_type_info_static()) { + OPENVINO_THROW("Could not as Any runtime_attribute to ", + typeid(T).name(), + " from ", + _impl->type_info().name(), + "; from ", + static_cast(runtime_attribute->get_type_info()), + " to ", + static_cast(T::element_type::get_type_info_static())); + } + _temp = std::make_shared>>( + std::static_pointer_cast(runtime_attribute)); + return _temp->as(); + } + } + } + + template ::value && + !std::is_same::type, bool>::value>::type* = nullptr> + T& as_impl(int); + + template ::value || util::Readable::value) && !std::is_same::value && + (!std::is_arithmetic::value || std::is_same::type, bool>::value)>::type* = + nullptr> + T& as_impl(int) { + impl_check(); + + if (is()) { + return _impl->as(); + } else if (_impl->is()) { + _temp = std::make_shared>>(); + _impl->read_to(*_temp); + return _temp->as(); + } + + OPENVINO_THROW("Bad as from: ", _impl->type_info().name(), " to: ", typeid(T).name()); + } + friend class ::ov::RuntimeAttribute; friend class ::ov::CompiledModel; friend class ::ov::proxy::CompiledModel; @@ -704,11 +801,11 @@ class OPENVINO_API Any { /// @brief Default constructor Any() = default; - /// @brief Сopy constructor + /// @brief Copy constructor /// @param other other Any object Any(const Any& other); - /// @brief Сopy assignment operator + /// @brief Copy assignment operator /// @param other other Any object /// @return reference to the current object Any& operator=(const Any& other); @@ -756,8 +853,8 @@ class OPENVINO_API Any { * @brief Inplace value construction function * * @tparam T Any type - * @tparam Args pack of paramter types passed to T constructor - * @param args pack of paramters passed to T constructor + * @tparam 
Args pack of parameter types passed to T constructor + * @param args pack of parameters passed to T constructor */ template static Any make(Args&&... args) { @@ -786,130 +883,21 @@ class OPENVINO_API Any { */ template bool is() const { - if (_impl != nullptr) { - if (_impl->is(typeid(decay_t))) { - return true; - } - for (const auto& type_index : _impl->base_type_info()) { - if (util::equal(type_index, typeid(decay_t))) { - return true; - } - } - } - return false; - } - - /** - * Dynamic cast to specified type - * @tparam T type - * @return casted object - */ - template - typename std::enable_if>::value, T>::type& as() { - if (_impl == nullptr) { - _temp = std::make_shared>>(T{}); - return *static_cast*>(_temp->addressof()); - } else { - if (_impl->is(typeid(decay_t))) { - return *static_cast*>(_impl->addressof()); - } else { - auto runtime_attribute = _impl->as_runtime_attribute(); - if (runtime_attribute == nullptr) { - OPENVINO_THROW("Any does not contains pointer to runtime_attribute. It contains ", - _impl->type_info().name()); - } - auto vptr = std::dynamic_pointer_cast(runtime_attribute); - if (vptr == nullptr && T::element_type::get_type_info_static() != runtime_attribute->get_type_info() && - T::element_type::get_type_info_static() != RuntimeAttribute::get_type_info_static()) { - OPENVINO_THROW("Could not cast Any runtime_attribute to ", - typeid(T).name(), - " from ", - _impl->type_info().name(), - "; from ", - static_cast(runtime_attribute->get_type_info()), - " to ", - static_cast(T::element_type::get_type_info_static())); - } - _temp = std::make_shared>>( - std::static_pointer_cast(runtime_attribute)); - return *static_cast*>(_temp->addressof()); - } - } - } - - /** - * Dynamic cast to specified type - * @tparam T type - * @return casted object - */ - template - typename std::enable_if>::value && - !std::is_same::value && std::is_default_constructible::value && - (util::Istreamable::value || util::Readable::value), - T>::type& - as() { - impl_check(); - if (_impl->is(typeid(decay_t))) { - return *static_cast*>(_impl->addressof()); - } else if (_impl->is(typeid(std::string))) { - _temp = std::make_shared>>(); - _impl->read_to(*_temp); - return *static_cast*>(_temp->addressof()); - } - for (const auto& type_index : _impl->base_type_info()) { - if (util::equal(type_index, typeid(decay_t))) { - return *static_cast*>(_impl->addressof()); - } - } - OPENVINO_THROW("Bad cast from: ", _impl->type_info().name(), " to: ", typeid(T).name()); - } - - /** - * Dynamic cast to specified type - * @tparam T type - * @return casted object - */ - template - typename std::enable_if< - !std::is_convertible>::value && !std::is_same::value && - (!std::is_default_constructible::value || (!util::Istreamable::value && !util::Readable::value)), - T>::type& - as() { - impl_check(); - if (_impl->is(typeid(decay_t))) { - return *static_cast*>(_impl->addressof()); - } - for (const auto& type_index : _impl->base_type_info()) { - if (util::equal(type_index, typeid(decay_t))) { - return *static_cast*>(_impl->addressof()); - } - } - OPENVINO_THROW("Bad cast from: ", _impl->type_info().name(), " to: ", typeid(T).name()); + return _impl && (_impl->is() || _impl->is_base_type_info(typeid(decay_t))); } /** - * Dynamic cast to specified type + * Dynamic as to specified type * @tparam T type - * @return casted object + * @return reference to caster object */ template - typename std::enable_if::value, T>::type& as() { - if (_impl != nullptr) { - if (_impl->is(typeid(decay_t))) { - return 
*static_cast*>(_impl->addressof()); - } else { - _temp = std::make_shared>(); - _impl->read_to(*_temp); - return *static_cast(_temp->addressof()); - } - } else { - _temp = std::make_shared>(); - return *static_cast(_temp->addressof()); - } + T& as() { + return as_impl(int{}); } /** - * Dynamic cast to specified type + * Dynamic as to specified type * @tparam T type * @return const reference to caster object */ @@ -983,4 +971,40 @@ inline static void PrintTo(const Any& any, std::ostream* os) { } /** @endcond */ +template <> +OPENVINO_API unsigned long long Any::Base::convert() const; + +template <> +OPENVINO_API long long Any::Base::convert() const; + +template <> +OPENVINO_API double Any::Base::convert() const; + +template ::value && + !std::is_same::type, bool>::value>::type*> +T& Any::as_impl(int) { + impl_check(); + if (is()) { + return _impl->as(); + } else if (util::Readable::value && _impl->is()) { + _temp = std::make_shared>>(); + _impl->read_to(*_temp); + return _temp->as(); + } else if (_impl->is_signed_integral()) { + auto value = _impl->convert(); + _temp = std::make_shared>>(static_cast(value)); + return _temp->as(); + } else if (_impl->is_unsigned_integral()) { + auto value = _impl->convert(); + _temp = std::make_shared>>(static_cast(value)); + return _temp->as(); + } else if (_impl->is_floating_point()) { + auto value = _impl->convert(); + _temp = std::make_shared>>(static_cast(value)); + return _temp->as(); + } + + OPENVINO_THROW("Bad as from: ", _impl->type_info().name(), " to: ", typeid(T).name()); +} } // namespace ov diff --git a/src/core/reference/include/openvino/reference/utils/registers_pool.hpp b/src/core/reference/include/openvino/reference/utils/registers_pool.hpp index 62dfe01ec4ef1d..4861ef4f7d999d 100644 --- a/src/core/reference/include/openvino/reference/utils/registers_pool.hpp +++ b/src/core/reference/include/openvino/reference/utils/registers_pool.hpp @@ -64,7 +64,13 @@ class RegistersPool { } void release() { if (auto pool = regPool.lock()) { - pool->return_to_pool(reg); + try { + pool->return_to_pool(reg); + } catch (...) { + // This function is called by destructor and should not throw. Well formed Reg object won't cause + // any exception throw from return_to_pool, while on badly formed object the destructor is most + // likely called during exception stack unwind. 
+ } regPool.reset(); } } @@ -90,8 +96,10 @@ class RegistersPool { RegistersPool::WeakPtr regPool; }; + static thread_local bool is_created; + virtual ~RegistersPool() { - check_unique_and_update(false); + is_created = false; } template @@ -178,7 +186,7 @@ class RegistersPool { } } - void check_unique_and_update(bool isCtor = true); + void check_unique_and_update(); PhysicalSet m_general_set; PhysicalSet m_simd_set; diff --git a/src/core/reference/src/utils/registers_pool.cpp b/src/core/reference/src/utils/registers_pool.cpp index 413fdcc3ed83cf..a1e6462aa51a36 100644 --- a/src/core/reference/src/utils/registers_pool.cpp +++ b/src/core/reference/src/utils/registers_pool.cpp @@ -34,16 +34,12 @@ RegistersPool::RegistersPool(std::initializer_list regsToExclude, in m_general_set.exclude(Xbyak::Reg64(Xbyak::Operand::RSP)); } -void RegistersPool::check_unique_and_update(bool is_ctor) { - static thread_local bool is_created = false; - if (is_ctor) { - if (is_created) { - OPENVINO_THROW("There should be only one instance of RegistersPool per thread"); - } - is_created = true; - } else { - is_created = false; - } +thread_local bool RegistersPool::is_created = false; + +void RegistersPool::check_unique_and_update() { + OPENVINO_ASSERT(!is_created, "There should be only one instance of RegistersPool per thread"); + + is_created = true; } void RegistersPool::PhysicalSet::set_as_used(size_t reg_idx) { diff --git a/src/core/shape_inference/include/glu_shape_inference.hpp b/src/core/shape_inference/include/glu_shape_inference.hpp new file mode 100644 index 00000000000000..365b57244036a2 --- /dev/null +++ b/src/core/shape_inference/include/glu_shape_inference.hpp @@ -0,0 +1,34 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "ov_ops/glu.hpp" +#include "utils.hpp" +#include "variadic_split_shape_inference.hpp" + +namespace ov { +namespace op { +namespace internal { +template > +std::vector shape_infer(const GLU* op, const std::vector& input_shapes) { + const auto inputs_count = input_shapes.size(); + NODE_SHAPE_INFER_CHECK(op, input_shapes, inputs_count == 1); + + int64_t axis = op->get_axis(); + std::vector split_lengths = {op->get_split_lengths(), -1}; + std::unordered_map const_data; + const_data.emplace(1, ov::Tensor(ov::element::i64, ov::Shape{}, &axis)); + const_data.emplace(2, ov::Tensor(ov::element::i64, ov::Shape{split_lengths.size()}, split_lengths.data())); + + const ov::Shape split_len_size{split_lengths.size()}; + const ov::Shape scalar{}; + std::vector variadic_split_input_shapes{input_shapes[0], scalar, split_len_size}; + + return {std::move( + ov::op::variadic_split::shape_infer(op, variadic_split_input_shapes, ov::make_tensor_accessor(const_data))[0])}; +} +} // namespace internal +} // namespace op +} // namespace ov diff --git a/src/core/shape_inference/include/variadic_split_shape_inference.hpp b/src/core/shape_inference/include/variadic_split_shape_inference.hpp index a0eff51f238e61..e0cd837003a331 100644 --- a/src/core/shape_inference/include/variadic_split_shape_inference.hpp +++ b/src/core/shape_inference/include/variadic_split_shape_inference.hpp @@ -10,10 +10,9 @@ namespace ov { namespace op { -namespace v1 { - +namespace variadic_split { template > -std::vector shape_infer(const VariadicSplit* op, +std::vector shape_infer(const Node* op, const std::vector& input_shapes, const ITensorAccessor& ta = make_tensor_accessor()) { constexpr bool is_dynamic_shape = std::is_base_of::value; @@ -120,6 +119,15 @@ 
std::vector shape_infer(const VariadicSplit* op, } return output_shapes; } +} // namespace variadic_split + +namespace v1 { +template > +std::vector shape_infer(const VariadicSplit* op, + const std::vector& input_shapes, + const ITensorAccessor& ta = make_tensor_accessor()) { + return op::variadic_split::shape_infer(op, input_shapes, ta); +} } // namespace v1 } // namespace op diff --git a/src/core/src/any.cpp b/src/core/src/any.cpp index 82dc01c99377fd..346819eced93e5 100644 --- a/src/core/src/any.cpp +++ b/src/core/src/any.cpp @@ -6,6 +6,17 @@ #include #include +namespace { +template +bool contains_type_index(Container&& types, const std::type_info& user_type) { + for (auto&& type : types) { + if (ov::util::equal(type, user_type)) { + return true; + } + } + return false; +} +} // namespace namespace ov { @@ -68,6 +79,48 @@ void Any::Base::read_to(Base& other) const { } } +bool Any::Base::is_base_type_info(const std::type_info& user_type) const { + return contains_type_index(base_type_info(), user_type); +} + +bool Any::Base::is_signed_integral() const { + return std::is_signed::value ? contains_type_index(std::initializer_list{typeid(char), + typeid(signed char), + typeid(short), + typeid(int), + typeid(long), + typeid(long long)}, + type_info()) + : contains_type_index(std::initializer_list{typeid(signed char), + typeid(short), + typeid(int), + typeid(long), + typeid(long long)}, + type_info()); +} + +bool Any::Base::is_unsigned_integral() const { + return std::is_signed::value + ? contains_type_index(std::initializer_list{typeid(unsigned char), + typeid(unsigned short), + typeid(unsigned int), + typeid(unsigned long), + typeid(unsigned long long)}, + type_info()) + : contains_type_index(std::initializer_list{typeid(char), + typeid(unsigned char), + typeid(unsigned short), + typeid(unsigned int), + typeid(unsigned long), + typeid(unsigned long long)}, + type_info()); +} +bool Any::Base::is_floating_point() const { + return contains_type_index( + std::initializer_list{typeid(float), typeid(double), typeid(long double)}, + type_info()); +} + Any::~Any() { _temp = {}; _impl = {}; @@ -293,4 +346,42 @@ void Write::operator()(std::ostream& os, const Any& any) const { } } // namespace util + +template +[[noreturn]] U Any::Base::convert_impl() const { + OPENVINO_THROW("Bad cast from: ", type_info().name(), " to: ", typeid(U).name()); +} + +template +U Any::Base::convert_impl() const { + return is() ? static_cast(as()) : convert_impl(); +} + +template <> +long long Any::Base::convert() const { + return std::is_signed::value ? convert_impl() + : convert_impl(); +} + +template <> +unsigned long long Any::Base::convert() const { + return std::is_signed::value ? 
convert_impl() + : convert_impl(); +} + +template <> +double Any::Base::convert() const { + return convert_impl(); +} } // namespace ov diff --git a/src/core/tests/any.cpp b/src/core/tests/any.cpp index 3914a617ff2982..33e928d60b872d 100644 --- a/src/core/tests/any.cpp +++ b/src/core/tests/any.cpp @@ -11,7 +11,8 @@ #include "common_test_utils/test_assertions.hpp" #include "openvino/core/runtime_attribute.hpp" -using namespace ov; +namespace ov { +namespace test { class DestructorTest { public: @@ -735,3 +736,70 @@ TEST_F(AnyTests, EmptyStringAsAny) { ASSERT_EQ(p.as>(), ref_f); ASSERT_EQ(p.as>(), ref_i); } + +template +class AnyConversionTest : public AnyTests {}; + +TYPED_TEST_SUITE_P(AnyConversionTest); + +using AnyArithmeticTypes = ::testing::Types; + +TYPED_TEST_P(AnyConversionTest, AnyToOtherValue) { + const TypeParam test_value{static_cast(23.15f)}; + const auto a = Any{test_value}; + + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); + + EXPECT_EQ(a.as(), static_cast(test_value)); + EXPECT_EQ(a.as(), static_cast(test_value)); +} + +REGISTER_TYPED_TEST_SUITE_P(AnyConversionTest, AnyToOtherValue); +INSTANTIATE_TYPED_TEST_SUITE_P(InstantiationName, AnyConversionTest, AnyArithmeticTypes); + +TEST_F(AnyTests, AnyAsOtherTypeIsIncosisoinet) { + // To show member `as` current behaviour. + // Maybe there should be two members `as` which return value + // and `cast` returns reference if casted type is same as Any underlying type + auto a = Any{10}; + + auto& a_int = a.as(); + auto& a_str = a.as(); + + EXPECT_EQ(a_int, 10); + EXPECT_EQ(a_str, "10"); + + a_int = 15; + EXPECT_EQ(a_int, 15); + // as string ref still has old value + EXPECT_EQ(a_str, "10"); + + a_str = "30"; + EXPECT_EQ(a_int, 15); + // as string ref has new value but is not in sync what any contains. + EXPECT_EQ(a_str, "30"); +} + +} // namespace test +} // namespace ov diff --git a/src/frontends/pytorch/src/op/linear.cpp b/src/frontends/pytorch/src/op/linear.cpp index 2d01dee84c151b..4a5ad4a6b0e73b 100644 --- a/src/frontends/pytorch/src/op/linear.cpp +++ b/src/frontends/pytorch/src/op/linear.cpp @@ -5,6 +5,10 @@ #include "openvino/frontend/pytorch/node_context.hpp" #include "openvino/op/add.hpp" #include "openvino/op/matmul.hpp" +#include "openvino/op/multiply.hpp" +#include "openvino/op/reshape.hpp" +#include "openvino/op/shape_of.hpp" +#include "openvino/op/subtract.hpp" #include "utils.hpp" namespace ov { @@ -12,6 +16,8 @@ namespace frontend { namespace pytorch { namespace op { +using namespace ov::op; + OutputVector translate_linear(const NodeContext& context) { // schema: aten::linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor num_inputs_check(context, 2, 3); @@ -20,17 +26,91 @@ OutputVector translate_linear(const NodeContext& context) { if (weight.get_element_type() == element::f16 || weight.get_element_type() == element::bf16) { // In case of patched linear it can have mixed fp16/bf16 and fp32 input type. // In other cases these conversion is not required. 
- weight = context.mark_node(std::make_shared(weight, x)); + weight = context.mark_node(std::make_shared(weight, x)); } - auto matmul = context.mark_node(std::make_shared(x, weight, false, true)); + auto matmul = context.mark_node(std::make_shared(x, weight, false, true)); if (!context.input_is_none(2)) { auto bias = context.get_input(2); if (bias.get_element_type() == element::f16 || bias.get_element_type() == element::bf16) { // Same reason as for weight. - bias = context.mark_node(std::make_shared(bias, x)); + bias = context.mark_node(std::make_shared(bias, x)); + } + matmul = context.mark_node(std::make_shared(matmul, bias)); + } + return {matmul}; +}; + +namespace { +uint32_t rearrange_awq_bits(uint32_t num) { + uint32_t result = 0; + uint32_t mask = 0xF; + + // Rearrange each 4-bit part in accordance with the AWQ i32->u4 unpacking schema + result |= (num & (mask << 0)) << 0; + result |= (num & (mask << 16)) >> 12; + result |= (num & (mask << 4)) << 4; + result |= (num & (mask << 20)) >> 8; + result |= (num & (mask << 8)) << 8; + result |= (num & (mask << 24)) >> 4; + result |= (num & (mask << 12)) << 12; + result |= (num & (mask << 28)) >> 0; + + return result; +} + +Output rearrange_constant(const Output& c, uint32_t groups) { + auto constant = std::dynamic_pointer_cast(c.get_node_shared_ptr()); + FRONT_END_OP_CONVERSION_CHECK(constant, "weight must be Constant."); + auto src = constant->get_data_ptr(); + auto initial_shape = constant->get_shape(); + FRONT_END_OP_CONVERSION_CHECK(initial_shape.size() == 2, "Only 2D constants are supported."); + auto new_shape = Shape{initial_shape[0] / groups, groups, initial_shape[1] * 8}; + auto new_qweight = std::make_shared(element::u4, new_shape); + auto dst = const_cast(reinterpret_cast(new_qweight->get_data_ptr())); + for (size_t i = 0; i < shape_size(constant->get_shape()); i++) { + dst[i] = rearrange_awq_bits(src[i]); + } + return new_qweight; +} +} // namespace + +OutputVector translate_linear_awq(const NodeContext& context) { + num_inputs_check(context, 4, 7); + auto x = context.get_input(0); + auto qweight = context.get_input(1); + auto qzeros = context.get_input(2); + auto scales = context.get_input(3); + auto groups = context.const_input(4); + auto bits = context.const_input(5); + + FRONT_END_OP_CONVERSION_CHECK(bits == 4, "Only 4 bit AWQ is supported."); + + auto new_qweight = rearrange_constant(qweight, static_cast(groups)); + auto new_qzeros = rearrange_constant(qzeros, 1); + new_qweight = context.mark_node(std::make_shared(new_qweight, scales.get_element_type())); + new_qzeros = context.mark_node(std::make_shared(new_qzeros, scales.get_element_type())); + + auto w_s = context.mark_node(std::make_shared(new_qweight, new_qzeros)); + FRONT_END_OP_CONVERSION_CHECK(scales.get_partial_shape().is_static(), "Scales must be constant."); + auto scales_shape = scales.get_shape(); + auto new_scales_shape = + v0::Constant::create(element::i32, {3}, std::vector{scales_shape[0], 1, scales_shape[1]}); + scales = context.mark_node(std::make_shared(scales, new_scales_shape, false)); + auto weight = context.mark_node(std::make_shared(w_s, scales)); + auto out_shape = + v0::Constant::create(element::i32, {2}, std::vector{static_cast(qweight.get_shape()[0]), -1}); + weight = context.mark_node(std::make_shared(weight, out_shape, false)); + weight = context.mark_node(std::make_shared(weight, x)); + + auto matmul = context.mark_node(std::make_shared(x, weight, false, false)); + if (!context.input_is_none(6)) { + auto bias = context.get_input(6); + + if 
(bias.get_element_type() == element::f16 || bias.get_element_type() == element::bf16) { + bias = context.mark_node(std::make_shared(bias, x)); } - matmul = context.mark_node(std::make_shared(matmul, bias)); + matmul = context.mark_node(std::make_shared(matmul, bias)); } return {matmul}; }; diff --git a/src/frontends/pytorch/src/op_table.cpp b/src/frontends/pytorch/src/op_table.cpp index 7307833430411f..ed375fd742d7ed 100644 --- a/src/frontends/pytorch/src/op_table.cpp +++ b/src/frontends/pytorch/src/op_table.cpp @@ -61,7 +61,6 @@ OP_CONVERTER(translate_clamp); OP_CONVERTER(translate_col2im); OP_CONVERTER(translate_constant); OP_CONVERTER(translate_conv_transposend); -OP_CONVERTER(translate_conv1d_ext); OP_CONVERTER(translate_convnd); OP_CONVERTER(translate_convolution); OP_CONVERTER(translate_convolution_mode); @@ -77,7 +76,6 @@ OP_CONVERTER(translate_dot); OP_CONVERTER(translate_elu); OP_CONVERTER(translate_embedding); OP_CONVERTER(translate_embedding_bag); -OP_CONVERTER(translate_embedding_ext); OP_CONVERTER(translate_empty); OP_CONVERTER(translate_empty_like); OP_CONVERTER(translate_erf); @@ -325,6 +323,10 @@ OP_CONVERTER(translate_unbind_int_fx); OP_CONVERTER(translate_unique2); OP_CONVERTER(translate_zeros_fx); OP_CONVERTER(translate_zeros_like_fx); +// Extensions +OP_CONVERTER(translate_conv1d_ext); +OP_CONVERTER(translate_embedding_ext); +OP_CONVERTER(translate_linear_awq); } // namespace op @@ -699,6 +701,7 @@ const std::unordered_map get_supported_ops_ts() { {"aten::zero", op::translate_zeros_like}, {"aten::zeros", op::translate_zeros}, {"aten::zeros_like", op::translate_zeros_like}, + {"ov_ext::awq_gemm", op::translate_linear_awq}, {"ov_ext::embedding", op::translate_embedding_ext}, {"ov_ext::conv1d", op::translate_conv1d_ext}, {"ov_ext::linear", op::translate_linear}, diff --git a/src/frontends/pytorch/src/utils.cpp b/src/frontends/pytorch/src/utils.cpp index 752b9accb71d01..5cc7ec21f30911 100644 --- a/src/frontends/pytorch/src/utils.cpp +++ b/src/frontends/pytorch/src/utils.cpp @@ -42,7 +42,11 @@ using namespace ov::op; void num_inputs_check(const NodeContext& context, size_t min_inputs, size_t max_inputs) { auto num_inputs = context.get_input_size(); - FRONT_END_OP_CONVERSION_CHECK(num_inputs >= min_inputs, "Got less inputs than expected"); + FRONT_END_OP_CONVERSION_CHECK(num_inputs >= min_inputs, + "Got less inputs ", + num_inputs, + " than expected ", + min_inputs); for (auto i = max_inputs; i < num_inputs; i++) { FRONT_END_OP_CONVERSION_CHECK(context.input_is_none(i), "Got more inputs than expected."); } diff --git a/src/frontends/tensorflow/src/checkpoint_v1_reader.cpp b/src/frontends/tensorflow/src/checkpoint_v1_reader.cpp index d506759fd33716..dab130cb381731 100644 --- a/src/frontends/tensorflow/src/checkpoint_v1_reader.cpp +++ b/src/frontends/tensorflow/src/checkpoint_v1_reader.cpp @@ -254,7 +254,7 @@ void CheckpointV1Reader::read_variable(const std::string& variable_name, ov::Any // This is only present at the first item of each checkpoint file and serves // as a table of contents, listing all the tensor slices saved in this file. 
- ::tensorflow::SavedTensorSlices sts; + ::tensorflow::SavedTensorSlices sts{}; FRONT_END_GENERAL_CHECK(sts.ParseFromArray(raw_data.data(), static_cast(raw_data.size())), "[TensorFlow Frontend] incorrect input checkpoint file or internal error: cannot parse " "SavedTensorSlices entry"); diff --git a/src/frontends/tensorflow/src/op/var_handle.cpp b/src/frontends/tensorflow/src/op/var_handle.cpp index f0077ae206bf6d..53fdf21d6086bf 100644 --- a/src/frontends/tensorflow/src/op/var_handle.cpp +++ b/src/frontends/tensorflow/src/op/var_handle.cpp @@ -98,7 +98,7 @@ OutputVector translate_varhandle_op(const NodeContext& node) { TENSORFLOW_OP_VALIDATION(node, result, "[TensorFlow Frontend] Internal error: Cannot find requested variable."); - ::tensorflow::BundleEntryProto entry; + ::tensorflow::BundleEntryProto entry{}; TENSORFLOW_OP_VALIDATION(node, entry.ParseFromArray(entry_data, static_cast(entry_size)), "[TensorFlow Frontend] Internal error: Cannot get read bundle entry."); diff --git a/src/frontends/tensorflow/src/op/xla_conv_v2.cpp b/src/frontends/tensorflow/src/op/xla_conv_v2.cpp index a01780d58cfeae..795f4deb3d93ef 100644 --- a/src/frontends/tensorflow/src/op/xla_conv_v2.cpp +++ b/src/frontends/tensorflow/src/op/xla_conv_v2.cpp @@ -111,7 +111,7 @@ OutputVector translate_xla_conv_v2_op(const NodeContext& node) { is_all_one, "[TensorFlow Frontend] internal error: convolutional kernel with holes is not supported"); - ConvolutionDimensionNumbers dimension_numbers; + ConvolutionDimensionNumbers dimension_numbers{}; TENSORFLOW_OP_VALIDATION( node, dimension_numbers.ParseFromArray(dimension_numbers_message.data(), diff --git a/src/frontends/tensorflow/src/variables_index.cpp b/src/frontends/tensorflow/src/variables_index.cpp index 778f8b2f94bb7c..f4e330518e20c2 100644 --- a/src/frontends/tensorflow/src/variables_index.cpp +++ b/src/frontends/tensorflow/src/variables_index.cpp @@ -128,7 +128,7 @@ void VariablesIndex::read_bundle_header() { auto item = m_variables_index.find(""); FRONT_END_GENERAL_CHECK(item != m_variables_index.end(), "Bundle Header isn't found in index"); - ::tensorflow::BundleHeaderProto bundleHeader; + ::tensorflow::BundleHeaderProto bundleHeader{}; FRONT_END_GENERAL_CHECK(bundleHeader.ParseFromArray(item->second.data(), static_cast(item->second.size())), "Bundle Header: Cannot parse Bundle Header"); FRONT_END_GENERAL_CHECK(bundleHeader.version().producer() == 1, "Bundle Header: Unsupported producer version"); @@ -147,7 +147,7 @@ void VariablesIndex::read_checkpointable_object_graph() { return; } - ::tensorflow::BundleEntryProto entry; + ::tensorflow::BundleEntryProto entry{}; FRONT_END_GENERAL_CHECK(entry.ParseFromArray(item->second.data(), static_cast(item->second.size())), "CMO: Cannot parse Bundle Entry"); diff --git a/src/inference/dev_api/openvino/runtime/icompiled_model.hpp b/src/inference/dev_api/openvino/runtime/icompiled_model.hpp index 01f7b556da909f..3a3d5d9910305f 100644 --- a/src/inference/dev_api/openvino/runtime/icompiled_model.hpp +++ b/src/inference/dev_api/openvino/runtime/icompiled_model.hpp @@ -136,11 +136,11 @@ class OPENVINO_RUNTIME_API ICompiledModel : public std::enable_shared_from_this< /** * @brief Release intermediate memory - * + * */ virtual void release_memory(); - virtual ~ICompiledModel() = default; + virtual ~ICompiledModel(); private: std::shared_ptr m_plugin; diff --git a/src/inference/include/openvino/runtime/intel_npu/properties.hpp b/src/inference/include/openvino/runtime/intel_npu/properties.hpp index 49416f61b8b43b..8734757da1d53d 100644 
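The `{}` added to the protobuf message objects above (SavedTensorSlices, BundleEntryProto, ConvolutionDimensionNumbers, BundleHeaderProto) requests value-initialization. Protobuf messages have their own default constructors, so for them the braces mainly make the zero-initialization intent explicit and keep uninitialized-use diagnostics quiet; for plain aggregates the difference is real, as this minimal, purely illustrative sketch shows:

    // Illustrative only: default-initialization vs value-initialization of an aggregate.
    #include <cstdio>

    struct Entry {
        int offset;
        int size;
    };

    int main() {
        Entry a;    // default-initialized: members hold indeterminate values
        Entry b{};  // value-initialized: members are zero-initialized
        std::printf("b = {%d, %d}\n", b.offset, b.size);  // guaranteed to print {0, 0}
        (void)a;    // reading a.offset or a.size here would be undefined behavior
        return 0;
    }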
--- a/src/inference/include/openvino/runtime/intel_npu/properties.hpp +++ b/src/inference/include/openvino/runtime/intel_npu/properties.hpp @@ -95,5 +95,12 @@ static constexpr ov::Property max_tiles{"NPU_MAX_TILES"}; */ static constexpr ov::Property bypass_umd_caching{"NPU_BYPASS_UMD_CACHING"}; +/** + * @brief [Only for NPU Plugin] + * Type: boolean, default is false + * This option allows delaying the weight load until inference is created + */ +static constexpr ov::Property defer_weights_load{"NPU_DEFER_WEIGHTS_LOAD"}; + } // namespace intel_npu } // namespace ov diff --git a/src/inference/src/cpp/compiled_model.cpp b/src/inference/src/cpp/compiled_model.cpp index d675cba4714887..c780bbee1e991d 100644 --- a/src/inference/src/cpp/compiled_model.cpp +++ b/src/inference/src/cpp/compiled_model.cpp @@ -8,10 +8,6 @@ #include "openvino/runtime/icompiled_model.hpp" #include "openvino/runtime/properties.hpp" -#if defined(OPENVINO_GNU_LIBC) && !defined(__ANDROID__) -# include -#endif - #define OV_COMPILED_MODEL_CALL_STATEMENT(...) \ if (_impl == nullptr) \ OPENVINO_THROW("CompiledModel was not initialized."); \ @@ -27,12 +23,6 @@ namespace ov { CompiledModel::~CompiledModel() { _impl = {}; -#if defined(OPENVINO_GNU_LIBC) && !defined(__ANDROID__) - // Linux memory margent doesn't return system memory immediate after release. - // It depends on memory chunk size and allocation history. - // Try return memory from a process to system now to reduce memory usage and not wait to the end of the process. - malloc_trim(0); -#endif } CompiledModel::CompiledModel(const std::shared_ptr& impl, const std::shared_ptr& so) diff --git a/src/inference/src/dev/icompiled_model.cpp b/src/inference/src/dev/icompiled_model.cpp index b1cbedac1632ab..f452dd3a330a17 100644 --- a/src/inference/src/dev/icompiled_model.cpp +++ b/src/inference/src/dev/icompiled_model.cpp @@ -10,6 +10,10 @@ #include "openvino/runtime/properties.hpp" #include "transformations/utils/utils.hpp" +#if defined(OPENVINO_GNU_LIBC) && !defined(__ANDROID__) +# include +#endif + ov::ICompiledModel::ICompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, const std::shared_ptr& task_executor, @@ -151,3 +155,12 @@ void ov::ICompiledModel::set_model_shared_object(ov::Model& model, const std::sh void ov::ICompiledModel::release_memory() { // nothing to do } + +ov::ICompiledModel::~ICompiledModel() { +#if defined(OPENVINO_GNU_LIBC) && !defined(__ANDROID__) + // The Linux memory manager doesn't return system memory immediately after release. + // It depends on memory chunk size and allocation history. + // Try to return memory from the process to the system now to reduce memory usage instead of waiting until the end of the process.
+ malloc_trim(0); +#endif +} diff --git a/src/plugins/intel_cpu/src/config.cpp b/src/plugins/intel_cpu/src/config.cpp index 7ce4c1069e695d..64cf926fed7fbb 100644 --- a/src/plugins/intel_cpu/src/config.cpp +++ b/src/plugins/intel_cpu/src/config.cpp @@ -460,5 +460,15 @@ void Config::updateProperties() { _config.insert({ov::hint::num_requests.name(), std::to_string(hintNumRequests)}); } +void Config::applyRtInfo(const std::shared_ptr& model) { + if (model->has_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()})) { + this->kvCachePrecision = model->get_rt_info({"runtime_options", ov::hint::kv_cache_precision.name()}); + } + if (model->has_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()})) { + this->fcDynamicQuantizationGroupSize = + model->get_rt_info({"runtime_options", ov::hint::dynamic_quantization_group_size.name()}); + } +} + } // namespace intel_cpu } // namespace ov diff --git a/src/plugins/intel_cpu/src/config.h b/src/plugins/intel_cpu/src/config.h index 5f4bb25ede350e..a8439d87803fd4 100644 --- a/src/plugins/intel_cpu/src/config.h +++ b/src/plugins/intel_cpu/src/config.h @@ -106,6 +106,8 @@ struct Config { void updateProperties(); + void applyRtInfo(const std::shared_ptr& model); + std::map _config; int modelPreferThreads = -1; diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp index 83cdd252f9bc6f..4aec56d98873fa 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.cpp @@ -516,6 +516,51 @@ std::set> jit_floor_emitter::get_supported_precisions return {{element::f32}}; } +/// FLOOR_MOD /// +jit_floor_mod_emitter::jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node) + : jit_emitter(host, host_isa, node, get_arithmetic_binary_exec_precision(node)) { +} + +jit_floor_mod_emitter::jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc): jit_emitter(host, host_isa, exec_prc) { +} + +size_t jit_floor_mod_emitter::get_inputs_count() const { return 2; } + +size_t jit_floor_mod_emitter::get_aux_vecs_count() const { return 1; } + +void jit_floor_mod_emitter::emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + if (host_isa_ == dnnl::impl::cpu::aarch64::asimd) { + emit_isa(in_vec_idxs, out_vec_idxs); + } else { + OV_CPU_JIT_EMITTER_THROW("Can't create jit eltwise kernel"); + } +} + +template +void jit_floor_mod_emitter::emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const { + OV_CPU_JIT_EMITTER_ASSERT(exec_prc_ == ov::element::f32, "unsupported precision: " + exec_prc_.to_string()); + + using TReg = typename dnnl::impl::cpu::aarch64::cpu_isa_traits::TReg; + + TReg dividend = TReg(in_vec_idxs[0]); + TReg divisor = TReg(in_vec_idxs[1]); + TReg r = TReg(out_vec_idxs[0]); + TReg aux = TReg(aux_vec_idxs[0]); + + h->fdiv(aux.s, dividend.s, divisor.s); + h->frintm(aux.s, aux.s); + h->fmul(aux.s, aux.s, divisor.s); + h->fsub(r.s, dividend.s, aux.s); +} + +std::set> jit_floor_mod_emitter::get_supported_precisions(const std::shared_ptr& node) { + return {{element::f32, element::f32}}; +} + /// CEILING /// //Initialization of the emitter, taking node as input 
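The scalar semantics of the new aarch64 FloorMod emitter above (fdiv -> frintm -> fmul -> fsub per SIMD lane) can be summarized by this standalone reference. It is an illustrative sketch, not part of the patch, and the sample inputs are made up:

    // Reference for what jit_floor_mod_emitter computes per f32 lane:
    //   r = dividend - floor(dividend / divisor) * divisor
    #include <cmath>
    #include <cstdio>

    static float floor_mod_ref(float dividend, float divisor) {
        return dividend - std::floor(dividend / divisor) * divisor;  // frintm rounds toward -inf, i.e. floor
    }

    int main() {
        std::printf("%f\n", floor_mod_ref(5.3f, 2.0f));   // ~1.3
        std::printf("%f\n", floor_mod_ref(-5.3f, 2.0f));  // ~0.7: the result takes the divisor's sign
        return 0;
    }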
jit_ceiling_emitter::jit_ceiling_emitter(dnnl::impl::cpu::aarch64::jit_generator* host, diff --git a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp index fa4f4141c388e4..2cb7e6928ade3e 100644 --- a/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp +++ b/src/plugins/intel_cpu/src/emitters/plugin/aarch64/jit_eltwise_emitters.hpp @@ -213,7 +213,28 @@ class jit_floor_emitter : public jit_emitter { template void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; }; +class jit_floor_mod_emitter : public jit_emitter { +public: + jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const ov::element::Type exec_prc = ov::element::f32); + + jit_floor_mod_emitter(dnnl::impl::cpu::aarch64::jit_generator *host, + dnnl::impl::cpu::aarch64::cpu_isa_t host_isa, + const std::shared_ptr& node); + + size_t get_inputs_count() const override; + + size_t get_aux_vecs_count() const override; + + static std::set> get_supported_precisions(const std::shared_ptr& node = nullptr); +private: + void emit_impl(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const override; + + template + void emit_isa(const std::vector &in_vec_idxs, const std::vector &out_vec_idxs) const; +}; class jit_ceiling_emitter : public jit_emitter { public: // Constructor with explicit precision diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp index e68ab224407c7b..53d8fea05a8adf 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_copy_b_emitter.cpp @@ -48,14 +48,11 @@ jit_brgemm_copy_b_emitter::jit_brgemm_copy_b_emitter(jit_generator* h, cpu_isa_t OV_CPU_JIT_EMITTER_ASSERT(!snippets::utils::is_dynamic_vdims(expr->get_input_port_descriptor(0)->get_shape()), "Jit emitter is called when the shapes are unknown"); - const auto& in_subtensor = get_projected_subtensor(expr->get_input_port(0)); - const auto K_blk = *++in_subtensor.rbegin(); - const auto& src_prc = brgemm_repack->get_src_element_type(); const auto& wei_prc = brgemm_repack->get_input_element_type(0); const auto wei_N_blk = brgemm_utils::repacking::compute_inner_n_block(wei_prc); const auto is_transposed = get_is_transposed(expr); - const auto brgemm_type = get_brgemm_type(src_prc, K_blk, is_transposed); + const auto brgemm_type = get_brgemm_type(src_prc, is_transposed); const auto primitive_isa = brgemm_utils::get_primitive_isa(src_prc, with_amx(brgemm_type)); m_with_comp = with_compensations(brgemm_type); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp index 057a3687ab8d16..6e70cbf2e8fe81 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.cpp @@ -5,10 +5,13 @@ #include "jit_brgemm_emitter.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" -#include "snippets/utils/utils.hpp" +#include "transformations/snippets/x64/op/brgemm_utils.hpp" +#include "emitters/snippets/x64/kernel_executors/brgemm.hpp" +#include "emitters/snippets/x64/kernel_executors/brgemm_amx.hpp" #include "emitters/plugin/x64/utils.hpp" + 
+#include "snippets/utils/utils.hpp" #include "utils.hpp" -#include "transformations/snippets/x64/op/brgemm_utils.hpp" using namespace Xbyak; using namespace dnnl::impl; @@ -27,11 +30,14 @@ jit_brgemm_emitter::jit_brgemm_emitter(jit_generator* h, cpu_isa_t isa, const auto& brg0Prc = brgemm_node->get_input_element_type(0); const auto& brg1Prc = brgemm_node->get_input_element_type(1); const auto brgemm_type = brgemm_node->get_type(); - BrgemmKernelConfig kernel_config(brg0Prc, brg1Prc, with_amx(brgemm_type), with_compensations(brgemm_type), - brgemm_utils::get_primitive_isa(brg0Prc, with_amx(brgemm_type))); - m_kernel_executor = kernel_table->register_kernel(expr, - compiled_kernel_cache, - kernel_config); + m_is_with_amx = brgemm_utils::with_amx(brgemm_type); + if (m_is_with_amx) { + BrgemmAMXKernelConfig kernel_config(brg0Prc, brg1Prc, brgemm_utils::get_primitive_isa(brg0Prc, true)); + m_kernel_executor = kernel_table->register_kernel(expr, compiled_kernel_cache, kernel_config); + } else { + BrgemmKernelConfig kernel_config(brg0Prc, brg1Prc, with_compensations(brgemm_type), brgemm_utils::get_primitive_isa(brg0Prc, false)); + m_kernel_executor = kernel_table->register_kernel(expr, compiled_kernel_cache, kernel_config); + } // Note: even if the Brgemm node is dynamic, the first shapeInfer and RuntimeConfigurator::update() // are performed before the BrgemmKernelExecutor registration. So we have to trigger update() manually // for both static and the 1st dynamic shapes. @@ -82,18 +88,32 @@ void jit_brgemm_emitter::emit_impl(const std::vector& in, const std::vec if (in.size() > 2) mem_ptrs_idxs.emplace_back(in[2]); + if (std::dynamic_pointer_cast(m_kernel_executor)) + emit_call(mem_ptrs_idxs); + else if (std::dynamic_pointer_cast(m_kernel_executor)) + emit_call(mem_ptrs_idxs); + else + OV_CPU_JIT_EMITTER_THROW("uknown execuor type"); +} + +template::value, bool>::type> +void jit_brgemm_emitter::emit_call(const std::vector& mem_ptrs_idxs) const { EmitABIRegSpills spill(h); spill.preamble(); - h->mov(h->rbp, reinterpret_cast(BrgemmKernelExecutor::execute)); - auto reserved_stack_size = sizeof(BrgemmKernelExecutor::call_args); + h->mov(h->rbp, reinterpret_cast(T::execute)); + auto reserved_stack_size = sizeof(typename T::call_args); // Reserve memory on the stack h->sub(h->rsp, reserved_stack_size); const bool is_dynamic_case = std::any_of(m_memory_offsets.cbegin(), m_memory_offsets.cend(), ov::snippets::utils::is_dynamic_value); Xbyak::Reg64 aux_reg = is_dynamic_case ? 
ov::intel_cpu::utils::get_aux_gpr(mem_ptrs_idxs) : Xbyak::Reg64(); - const std::vector brgemm_args_offsets {GET_OFF_BRGEMM_ARGS(A), GET_OFF_BRGEMM_ARGS(B), GET_OFF_BRGEMM_ARGS(C), GET_OFF_BRGEMM_ARGS(scratch)}; +#define GET_OFF_CALL_ARGS(field) offsetof(typename T::call_args, field) + const std::vector brgemm_args_offsets = { GET_OFF_CALL_ARGS(A), GET_OFF_CALL_ARGS(B), GET_OFF_CALL_ARGS(C), GET_OFF_CALL_ARGS(scratch) }; +#undef GET_OFF_CALL_ARGS + const auto& mem_ptrs = utils::transform_idxs_to_regs(mem_ptrs_idxs); for (size_t i = 0; i < mem_ptrs.size(); i++) { if (ov::snippets::utils::is_dynamic_value(m_memory_offsets[i])) @@ -108,8 +128,10 @@ void jit_brgemm_emitter::emit_impl(const std::vector& in, const std::vec h->mov(h->qword[h->rsp + brgemm_args_offsets.back()], reinterpret_cast(nullptr)); // abi_param1 always contains jit_snippets_call_args which has amx tile config for each thread - h->lea(h->r10, h->ptr[abi_param1 + GET_OFF(amx_tile_config)]); - h->mov(h->qword[h->rsp + GET_OFF_BRGEMM_ARGS(amx_tile_config)], h->r10); + if (std::is_same()) { + h->lea(h->r10, h->ptr[abi_param1 + GET_OFF(amx_tile_config)]); + h->mov(h->qword[h->rsp + GET_OFF_BRGEMM_AMX_ARGS(amx_tile_config)], h->r10); + } h->mov(abi_param1, reinterpret_cast(m_kernel_executor.get())); h->mov(abi_param2, h->rsp); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.hpp index baa6ed95473034..ccec1b68b18b20 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/jit_brgemm_emitter.hpp @@ -5,7 +5,7 @@ #pragma once #include "emitters/plugin/x64/jit_emitter.hpp" -#include "emitters/snippets/x64/kernel_executors/brgemm.hpp" +#include "emitters/snippets/x64/kernel_executors/brgemm_base.hpp" namespace ov { namespace intel_cpu { @@ -24,15 +24,21 @@ class jit_brgemm_emitter : public jit_emitter { void validate_arguments(const std::vector &in, const std::vector &out) const override; void emit_impl(const std::vector& in, const std::vector& out) const override; + template ::value, bool>::type = true> + void emit_call(const std::vector& mem_ptrs_idxs) const; + // Note: offsets order: A, B, C (+ scratchpad, if needed). Values can be dynamic_value if offset is calculated in runtime std::vector m_memory_offsets{}; // Note: cluster ids order: A, B, C (+ scratchpad, if needed). 
Values can be dynamic_value if there is no buffer std::vector m_buffer_ids{}; - std::shared_ptr m_kernel_executor = nullptr; + std::shared_ptr m_kernel_executor = nullptr; #ifdef SNIPPETS_DEBUG_CAPS friend std::string init_info_jit_brgemm_emitter(const jit_brgemm_emitter *emitter); #endif + + bool m_is_with_amx {false}; }; } // namespace intel_cpu diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp index fad1be5a5d1289..c57824526d6e20 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.cpp @@ -1,136 +1,55 @@ -// Copyright (C) 2020-2023 Intel Corporation +// Copyright (C) 2020-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #include "brgemm.hpp" -#include - #include "common/utils.hpp" #include "dnnl_extension_utils.h" -#include "snippets/lowered/loop_manager.hpp" + #include "snippets/lowered/pass/insert_specific_iterations.hpp" + #include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/snippets/x64/op/brgemm_utils.hpp" -#define DIM_CAST(X) static_cast(X) -#define DTYPE_CAST(X) static_cast(DnnlExtensionUtils::ElementTypeToDataType(X)) using namespace Xbyak; using namespace dnnl::impl; using namespace dnnl::impl::cpu::x64; -namespace { -size_t init_hash(dnnl_data_type_t dt_in0, dnnl_data_type_t dt_in1, bool is_with_amx, - bool is_with_comp, dnnl::impl::cpu::x64::cpu_isa_t isa) { - size_t seed = 0; -#define HASH(X) seed = hash_combine(seed, X) - HASH(dt_in0); HASH(dt_in1); - HASH(is_with_amx); HASH(is_with_comp); - HASH(isa); -#undef HASH - return seed; -} -} // namespace - namespace ov { namespace intel_cpu { -BrgemmKernelConfig::BrgemmKernelConfig(const element::Type& in0_dtype, const element::Type& in1_dtype, - bool is_with_amx, bool is_with_comp, - dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) : - m_static_params(std::make_shared(in0_dtype, in1_dtype, - is_with_amx, is_with_comp, - primitive_isa)) { - m_hash = compute_hash(); -} - -bool BrgemmKernelConfig::is_completed() const { - return !utils::one_of(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC) || is_empty(); -} - -bool BrgemmKernelConfig::operator==(const BrgemmKernelConfig& rhs) const { -#define EQ(X) X == rhs.X - return EQ(m_hash) && EQ(m_beta) && - EQ(m_M) && EQ(m_N) && EQ(m_K) && - EQ(m_LDA) && EQ(m_LDB) && EQ(m_LDC) && - (EQ(m_static_params.get()) || *m_static_params == *(rhs.m_static_params)); -#undef EQ -} -void BrgemmKernelConfig::update(dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t LDA, dnnl_dim_t LDB, dnnl_dim_t LDC, float beta) { - // If M is zero, it means that Brgemm won't be executed (in Loop with work_amount = 0, for example) - // To process this case, we have to make this Config as empty (nullify runtime parameters) - if (utils::one_of(0, M, N, K)) { - m_M = 0; m_N = 0; m_K = 0; - m_LDA = 0; m_LDB = 0; m_LDC = 0; - m_beta = 0; - } else { - m_M = M; m_N = N; m_K = K; - m_LDA = LDA; m_LDB = LDB; m_LDC = LDC; - m_beta = beta; - } +BrgemmKernelConfig::BrgemmKernelConfig(const element::Type& in0_dtype, const element::Type& in1_dtype, + bool is_with_comp, dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) + : BrgemmBaseKernelConfig(), m_static_params(std::make_shared(in0_dtype, in1_dtype, is_with_comp, primitive_isa)) { m_hash = compute_hash(); } -bool BrgemmKernelConfig::is_empty() const { - return everyone_is(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC, m_beta); 
-} - -BrgemmKernelConfig::operator amx_tile_config_t() const { - amx_tile_config_t res; - res.M = m_M; res.N = m_N; res.K = m_K; - return res; -} - BrgemmKernelConfig::StaticParams::StaticParams(const element::Type& in0_dtype, const element::Type& in1_dtype, - bool is_with_amx, bool is_with_comp, - dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) : - dt_in0(DTYPE_CAST(in0_dtype)), dt_in1(DTYPE_CAST(in1_dtype)), - is_with_amx(is_with_amx), is_with_comp(is_with_comp), - isa(primitive_isa), - hash(init_hash(dt_in0, dt_in1, is_with_amx, is_with_comp, isa)) { -} + bool is_with_comp, dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) + : StaticBaseParams(in0_dtype, in1_dtype, primitive_isa, compute_hash(is_with_comp)), is_with_comp(is_with_comp) {} bool BrgemmKernelConfig::StaticParams::operator==(const StaticParams& rhs) const { -#define EQ(X) X == rhs.X - return EQ(hash) && EQ(dt_in0) && EQ(dt_in1)&& EQ(is_with_amx) && EQ(is_with_comp) && EQ(isa); -#undef EQ + return StaticBaseParams::operator==(rhs) && is_with_comp == rhs.is_with_comp; } -size_t BrgemmKernelConfig::compute_hash() const { - size_t seed = m_static_params->hash; -#define HASH(X) seed = hash_combine(seed, X) - HASH(m_M); HASH(m_N); HASH(m_K); - HASH(m_LDA); HASH(m_LDB); HASH(m_LDC); - HASH(m_beta); -#undef HASH - return seed; + +size_t BrgemmKernelConfig::StaticParams::compute_hash(bool is_with_comp) { + return hash_combine(0, is_with_comp); } #ifdef SNIPPETS_DEBUG_CAPS -#define PRINT(X) ss << #X << " = " << X << "\n" std::string BrgemmKernelConfig::StaticParams::to_string() const { std::stringstream ss; - PRINT(dt_in0); PRINT(dt_in1); - PRINT(is_with_amx); PRINT(is_with_comp); - PRINT(isa); - return ss.str(); -} - -std::string BrgemmKernelConfig::to_string() const { - std::stringstream ss; - ss << m_static_params->to_string() << "\n"; - PRINT(m_M); PRINT(m_N); PRINT(m_K); - PRINT(m_LDA); PRINT(m_LDB); PRINT(m_LDC); - PRINT(m_beta); + ss << StaticBaseParams::to_string(); + ss << "is_with_comp = " << is_with_comp << "\n"; return ss.str(); } -#undef PRINT #endif BrgemmKernelExecutor::BrgemmKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmKernelConfig config) : CPUKernelExecutor(std::move(kernel_cache), std::move(config)) { } - std::shared_ptr BrgemmKernelExecutor::compile_kernel(const BrgemmKernelConfig& config) const { std::shared_ptr compiled_kernel = std::make_shared(); @@ -138,203 +57,42 @@ std::shared_ptr BrgemmKernelExecutor::compile_kernel(const if (config.is_empty()) return compiled_kernel; - cpu::x64::brgemm_desc_t desc; - auto status = brgemm_desc_init(&desc, config.get_isa(), cpu::x64::brgemm_strd, - config.get_dt_in0(), config.get_dt_in1(), - false, false, cpu::x64::brgemm_row_major, 1.f, - config.get_beta(), - config.get_LDA(), config.get_LDB(), config.get_LDC(), - config.get_M(), config.get_N(), config.get_K(), nullptr); - OV_CPU_JIT_EMITTER_ASSERT(status == dnnl_success, "Cannot initialize brgemm descriptor due to invalid params"); - - if (config.is_with_amx()) { - status = brgemm_init_tiles(desc, compiled_kernel->palette); - OV_CPU_JIT_EMITTER_ASSERT(status == dnnl_success, "Cannot initialize brgemm tiles due to invalid params"); - } - - cpu::x64::brgemm_kernel_t* kernel_ = nullptr; - status = brgemm_kernel_create(&kernel_, desc); - OV_CPU_JIT_EMITTER_ASSERT(status == dnnl_success, "Cannot create brgemm kernel due to invalid params"); - compiled_kernel->compiled_kernel = std::unique_ptr(kernel_); + create_brgemm_kernel(compiled_kernel->brgemm_kernel, config.get_dt_in0(), config.get_dt_in1(), 
config.get_isa(), + config.get_M(), config.get_N(), config.get_K(), config.get_LDA(), config.get_LDB(), config.get_LDC(), config.get_beta()); return compiled_kernel; } -float BrgemmKernelExecutor::get_beta(const ov::snippets::lowered::LoopManagerPtr& loop_manager, int loop_id, - const ov::snippets::lowered::ExpandedLoopInfoPtr& current_expanded_loop_info) { - // Find all Expanded loops with the same Unified loop information -> they were decomposed from this Unified Loop. - // Note that LoopInfo are normalized and sorted (due to NormalizedLoopIDs pass). - // It means that previous executed Loops have Loop ID less the current Loop ID. - // - If there is executed Loop (work_amount > 0) and evaluated before the current -> the current Brgemm should have `beta = 1`. - // - If there is not this Loop -> the current executed Brgemm should have `beta = 0`. - if (loop_id > 0) { - const auto& current_unified_loop_info = current_expanded_loop_info->get_unified_loop_info(); - // Check the previous Loops - --loop_id; - while (loop_id >= 0) { - const auto& expanded_loop_info = loop_manager->get_loop_info(loop_id); - if (expanded_loop_info->get_unified_loop_info() != current_unified_loop_info) - return 0; - if (expanded_loop_info->get_work_amount() > 0) { - // there is previous executed Brgemm with `beta = 0` -> the current Brgemm should have `beta = 1` - return 1; - } - --loop_id; - } - } - return 0; -} + void BrgemmKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr, const ov::snippets::lowered::LinearIRCPtr& linear_ir, BrgemmKernelConfig& config) const { - const auto& input_pds = expr->get_input_port_descriptors(); - const auto& output_pds = expr->get_output_port_descriptors(); - OV_CPU_JIT_EMITTER_ASSERT((input_pds.size() == 2 || input_pds.size() == 3) && output_pds.size() == 1, - "Invalid number of in/out port descriptors"); - - const auto in0_shape = snippets::utils::get_planar_vdims(input_pds[0]->get_shape(), input_pds[0]->get_layout()); - const auto in1_shape = snippets::utils::get_planar_vdims(input_pds[1]->get_shape(), input_pds[1]->get_layout()); - auto in0_subtensor = input_pds[0]->get_subtensor(); - auto in1_subtensor = input_pds[1]->get_subtensor(); - - // Need to update M, K, N - // 1. If the original value in subtensor is `FULL_DIM`, it means that - // Brgemm block should process full tensor by this dim -> take dimension from shape - // 2. Otherwise, Brgemm block processes part of the tensor by this dim - // (there is blocking by this dimension) -> take from Loop increment - - auto M = *++in0_subtensor.rbegin(); - auto K = *in0_subtensor.rbegin(); - auto N = *in1_subtensor.rbegin(); - - size_t loop_idx = 0; - const auto& loop_ids = expr->get_loop_ids(); - const auto& loop_manager = linear_ir->get_loop_manager(); - auto get_loop_info = [&](){ - OPENVINO_ASSERT(loop_idx < loop_ids.size(), "Loop is missed"); - return loop_manager->get_loop_info(loop_ids[loop_idx++]); - }; - - /* ------- Dimension M ----------*/ - if (ov::snippets::utils::is_full_dim_value(M)) { - M = *++in0_shape.rbegin(); - } else { - const auto& current_expanded_loop_info = get_loop_info(); - const auto& in_ports = current_expanded_loop_info->get_input_ports(); - const auto& out_ports = current_expanded_loop_info->get_output_ports(); - // Quick validation check: Should we check that port is really Brgemm port? 
- // If BrgemmCopyB in the Loop by M -> first input port will be BrgemmCopyB with `incremented=false` - // to avoid extra checks, we validate only first input port - // Note: We check `is_incremented` attribute only for not incremented ports because - // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization - auto check_port = [&](const ov::snippets::lowered::LoopPort& p) { return p.dim_idx == 1; }; - OPENVINO_ASSERT(in_ports.size() > 1 && std::all_of(in_ports.cbegin(), in_ports.cend(), check_port) && - out_ports.size() == 1 && check_port(out_ports.back()), - "Incorrect Loop by Brgemm dimension M"); - M = current_expanded_loop_info->get_increment(); - input_pds[0]->set_subtensor_dim(1, M); - output_pds[0]->set_subtensor_dim(1, M); - } - - /* ------- Dimension N ----------*/ - if (ov::snippets::utils::is_full_dim_value(N)) { - N = *in1_shape.rbegin(); - } else { - const auto& current_expanded_loop_info = get_loop_info(); - const auto& in_ports = current_expanded_loop_info->get_input_ports(); - const auto& out_ports = current_expanded_loop_info->get_output_ports(); - // Quick validation check: Should we check that port is really Brgemm port? - // Note: We check `is_incremented` attribute only for not incremented ports because - // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization - auto check_port = [&](const ov::snippets::lowered::LoopPort& p) { return p.dim_idx == 0; }; - OPENVINO_ASSERT(in_ports.size() >= 2 && !in_ports.front().is_incremented && std::all_of(in_ports.cbegin(), in_ports.cend(), check_port) && - out_ports.size() == 1 && check_port(out_ports.back()), - "Incorrect Loop by Brgemm dimension N"); - N = current_expanded_loop_info->get_increment(); - input_pds[1]->set_subtensor_dim(0, N); - output_pds[0]->set_subtensor_dim(0, N); - } - - /* ------- Dimension K ----------*/ - // 1. If Brgemm block processes full dimension K -> `beta = 0` - // 2. If Brgemm block processes part of the dimension K (there is blocking), need to find - // the most first executed Brgemm Block in Loops which iterate through dimension K (work_amount > 0). - // First of them will have `beta = 0`, other - `beta = 1` - float beta = 0; - if (ov::snippets::utils::is_full_dim_value(K)) { - K = *in0_shape.rbegin(); - } else { - const auto& current_expanded_loop_info = get_loop_info(); - const auto& in_ports = current_expanded_loop_info->get_input_ports(); - const auto& out_ports = current_expanded_loop_info->get_output_ports(); - // Quick validation check: Should we check that port is really Brgemm port? 
- // Note: We check `is_incremented` attribute only for not incremented ports because - // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization - OPENVINO_ASSERT(in_ports.size() >= 2 && in_ports.front().dim_idx == 0 && in_ports.back().dim_idx == 1 && - out_ports.size() == 1 && !out_ports.front().is_incremented, - "Incorrect Loop by Brgemm dimension K"); - K = current_expanded_loop_info->get_increment(); - input_pds[0]->set_subtensor_dim(0, K); - input_pds[1]->set_subtensor_dim(1, K); - if (K > 0) - beta = get_beta(loop_manager, static_cast(loop_ids.back()), current_expanded_loop_info); - } - - const auto LDA = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(0))); - const auto LDC = DIM_CAST(snippets::utils::get_dim_stride(expr->get_output_port(0))); - auto LDB = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(1))); - const auto& brgemm_node = as_type_ptr(expr->get_node()); - OV_CPU_JIT_EMITTER_ASSERT(brgemm_node, "Got invalid node type in update_config"); - // In case of data repacking LDB is chosen in accordance with repacking buffer size - if (with_repacking(brgemm_node->get_type())) - LDB = brgemm_utils::repacking::compute_LDB(LDB, brgemm_node->get_input_element_type(1)); - - config.update(DIM_CAST(M), DIM_CAST(N), DIM_CAST(K), LDA, LDB, LDC, beta); + return BrgemmBaseKernelExecutor::update_config(expr, linear_ir, config); } void BrgemmKernelExecutor::execute(const BrgemmKernelExecutor* executor, call_args* args) { + OV_CPU_JIT_EMITTER_ASSERT(executor, "has nullptr executor"); auto kernel = executor->get_kernel(); const auto& config = static_cast(executor->get_config()); OV_CPU_JIT_EMITTER_ASSERT(kernel, "has nullptr compiler kernel or invalid config"); - const auto tile_config = args->amx_tile_config; - if (config.is_with_amx() && tile_config && !config.compatible(tile_config)) { - *tile_config = static_cast(config); - cpu::x64::amx_tile_configure(kernel->palette); - } - - cpu::x64::brgemm_kernel_params_t brgemm_p; // Note: compensations should be applied only once, so we do it only on the first iteration, when beta == 0 - size_t is_with_comp = config.get_beta() == 0 && config.is_with_comp(); - - brgemm_p.batch = nullptr; // default value - brgemm_p.ptr_A = args->A; - brgemm_p.ptr_B = args->B; - brgemm_p.ptr_C = args->C; - brgemm_p.ptr_D = args->C; - brgemm_p.ptr_buf = args->scratch; - brgemm_p.ptr_bias = nullptr; - brgemm_p.do_post_ops = is_with_comp; - brgemm_p.do_apply_comp = is_with_comp; - brgemm_p.skip_accm = 0; - brgemm_p.BS = 1; // default value - OV_CPU_JIT_EMITTER_ASSERT(kernel->compiled_kernel, "has nullptr kernel"); - (*kernel->compiled_kernel)(&brgemm_p); + const auto is_with_comp = config.get_beta() == 0 && config.is_with_comp(); + execute_brgemm_kernel(kernel->brgemm_kernel, args->A, args->B, args->C, args->scratch, is_with_comp); } #ifdef SNIPPETS_DEBUG_CAPS BrgemmKernelReferenceExecutor::BrgemmKernelReferenceExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmKernelConfig config) : - BrgemmKernelExecutor(std::move(kernel_cache), std::move(config)) { -} + BrgemmKernelExecutor(std::move(kernel_cache), std::move(config)) {} std::shared_ptr BrgemmKernelReferenceExecutor::compile_kernel(const BrgemmKernelConfig& c) const { const auto& res = std::make_shared(); - res->compiled_kernel.reset(new brgemm_ref_kernel(c)); + res->brgemm_kernel.reset(new brgemm_ref_kernel(c)); return res; } brgemm_ref_kernel::brgemm_ref_kernel(BrgemmKernelConfig c) : m_config(std::move(c)) { - 
OV_CPU_JIT_EMITTER_ASSERT(!m_config.is_with_comp() && !m_config.is_with_amx(), - "brgemm_ref_kernel doesn't currently support compensations or amx"); + OV_CPU_JIT_EMITTER_ASSERT(!m_config.is_with_comp(), + "brgemm_ref_kernel doesn't currently support compensations"); OV_CPU_JIT_EMITTER_ASSERT(m_config.get_dt_in0() == m_config.get_dt_in1() && m_config.get_dt_in0() == dnnl_data_type_t::dnnl_f32, "brgemm_ref_kernel currently supports only fp32 inputs"); diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp index 2549580c1a176c..1c3d1e18872aea 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm.hpp @@ -1,96 +1,61 @@ -// Copyright (C) 2020-2023 Intel Corporation +// Copyright (C) 2020-2024 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // #pragma once -#include "emitters/plugin/x64/jit_emitter.hpp" -#include "emitters/snippets/jit_snippets_call_args.hpp" -#include "emitters/snippets/cpu_kernel_executor_table.hpp" -#include - -#include "snippets/lowered/loop_manager.hpp" -#include "snippets/lowered/loop_info.hpp" +#include "brgemm_base.hpp" namespace ov { namespace intel_cpu { -struct BrgemmKernelConfig : public snippets::KernelExecutorBase::GenericConfig { + +struct BrgemmKernelConfig : public BrgemmBaseKernelConfig { public: BrgemmKernelConfig(const element::Type& in0_dtype, const element::Type& in1_dtype, - bool is_with_amx, bool is_with_comp, dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); + bool is_with_comp, dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); BrgemmKernelConfig() = delete; - bool is_completed() const override; - size_t hash() const override { return m_hash; } - bool operator==(const BrgemmKernelConfig& rhs) const; - bool operator!=(const BrgemmKernelConfig& rhs) const {return !(*this == rhs);} - std::unique_ptr get_clone_ptr() const override { - return std::unique_ptr( new BrgemmKernelConfig(*this)); - } - void update(dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t LDA, dnnl_dim_t LDB, dnnl_dim_t LDC, float beta); - bool is_empty() const; - dnnl_data_type_t get_dt_in0() const { return m_static_params->dt_in0; } - dnnl_data_type_t get_dt_in1() const { return m_static_params->dt_in1; } - - dnnl::impl::cpu::x64::cpu_isa_t get_isa() const { return m_static_params->isa; } - bool is_with_amx() const {return m_static_params->is_with_amx; } - bool is_with_comp() const { return m_static_params->is_with_comp; } - float get_beta() const { return m_beta; } - - dnnl_dim_t get_M() const { return m_M; } - dnnl_dim_t get_N() const { return m_N; } - dnnl_dim_t get_K() const { return m_K; } - - dnnl_dim_t get_LDA() const { return m_LDA; } - dnnl_dim_t get_LDB() const { return m_LDB; } - dnnl_dim_t get_LDC() const { return m_LDC; } - - explicit operator amx_tile_config_t() const; - inline bool compatible(amx_tile_config_t* rhs) const { - return rhs && rhs->M == m_M && rhs->N == m_N && rhs->K == m_K; + std::unique_ptr get_clone_ptr() const override { + return std::unique_ptr(new BrgemmKernelConfig(*this)); } -#ifdef SNIPPETS_DEBUG_CAPS - std::string to_string() const override; -#endif + bool is_with_comp() const { return m_static_params->is_with_comp; } private: - struct StaticParams { - StaticParams(const element::Type& in0_dtype, const element::Type& in1_dtype, - bool is_with_amx, bool is_with_comp, dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); - const 
dnnl_data_type_t dt_in0 {dnnl_f32}, dt_in1 {dnnl_f32}; - const bool is_with_amx {false}; + struct StaticParams : StaticBaseParams { + StaticParams(const element::Type& in0_dtype, const element::Type& in1_dtype, bool is_with_comp, dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); + const bool is_with_comp {false}; - const dnnl::impl::cpu::x64::cpu_isa_t isa {dnnl::impl::cpu::x64::isa_undef}; - const size_t hash {0}; + bool operator==(const StaticParams& rhs) const; bool operator!=(const StaticParams& rhs) const { return !(*this == rhs); } #ifdef SNIPPETS_DEBUG_CAPS std::string to_string() const; #endif + private: + static size_t compute_hash(bool is_with_comp); }; - size_t compute_hash() const; - std::shared_ptr m_static_params; - dnnl_dim_t m_M {0}, m_N {0}, m_K {0}, m_LDA {0}, m_LDB {0}, m_LDC {0}; - float m_beta {0}; - size_t m_hash {SIZE_MAX}; + + std::shared_ptr get_static_params() const override { return m_static_params; } + + std::shared_ptr m_static_params {nullptr}; }; +// The `update_kernel` method verifies that a compiled kernel is not nullptr. +// However, the compiled kernel might be empty in cases if nothing is to be compiled (`Config.is_empty() == true`). +// To cover this case, we wrap the `brgemm_kernel_t` in the separate structure which may contain empty `brgemm_kernel_t` struct BrgemmCompiledKernel { - std::unique_ptr compiled_kernel = nullptr; - // Note: Palette is treated as a part of a kernel because it is initialized during the kernel compilation stage. - // Each kernel need to store the pallet it was compiled with. - char palette[64] = {}; + std::shared_ptr brgemm_kernel = nullptr; }; -class BrgemmKernelExecutor : public CPUKernelExecutor { +class BrgemmKernelExecutor : public BrgemmBaseKernelExecutor, + public CPUKernelExecutor { public: struct call_args { const void* A = nullptr; const void* B = nullptr; void* C = nullptr; void* scratch = nullptr; - amx_tile_config_t* amx_tile_config = nullptr; }; BrgemmKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmKernelConfig config); @@ -99,12 +64,10 @@ class BrgemmKernelExecutor : public CPUKernelExecutor compile_kernel(const BrgemmKernelConfig& c) const override; + void update_config(const ov::snippets::lowered::ExpressionPtr& expr, const ov::snippets::lowered::LinearIRCPtr& linear_ir, BrgemmKernelConfig& config) const override; - - static float get_beta(const ov::snippets::lowered::LoopManagerPtr& loop_manager, int loop_id, - const ov::snippets::lowered::ExpandedLoopInfoPtr& current_expanded_loop_info); }; #define GET_OFF_BRGEMM_ARGS(field) offsetof(BrgemmKernelExecutor::call_args, field) @@ -116,6 +79,7 @@ class BrgemmKernelReferenceExecutor : public BrgemmKernelExecutor { protected: std::shared_ptr compile_kernel(const BrgemmKernelConfig& c) const override; }; + struct brgemm_ref_kernel : public dnnl::impl::cpu::x64::brgemm_kernel_t { brgemm_ref_kernel(BrgemmKernelConfig c); void operator()(dnnl::impl::cpu::x64::brgemm_kernel_params_t *) const override; diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.cpp new file mode 100644 index 00000000000000..62c7236735f70e --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.cpp @@ -0,0 +1,249 @@ +// Copyright (C) 2020-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "brgemm_amx.hpp" + +#include "transformations/snippets/x64/op/brgemm_utils.hpp" +#include 
"transformations/snippets/x64/op/brgemm_cpu.hpp" + +#include + + +#define INNER_K_BLK(dtype) static_cast((brgemm_utils::repacking::compute_inner_k_block(in0_dtype))) +#define VNNI_FACTOR(dtype) static_cast((brgemm_utils::compute_vnni_factor(in0_dtype))) +#define EQ(X) X == rhs.X +#define HASH(X) seed = hash_combine(seed, X) + + +using namespace Xbyak; +using namespace dnnl::impl; +using namespace dnnl::impl::cpu::x64; + + +namespace ov { +namespace intel_cpu { + +BrgemmAMXKernelConfig::BrgemmAMXKernelConfig(const element::Type& in0_dtype, const element::Type& in1_dtype, dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) + : BrgemmBaseKernelConfig(), m_static_params(std::make_shared(in0_dtype, in1_dtype, primitive_isa)) { + m_hash = compute_hash(); +} + +BrgemmAMXKernelConfig::StaticParams::StaticParams(const element::Type& in0_dtype, const element::Type& in1_dtype, + dnnl::impl::cpu::x64::cpu_isa_t primitive_isa) + : StaticBaseParams(in0_dtype, in1_dtype, primitive_isa, compute_hash(INNER_K_BLK(in0_dtype), VNNI_FACTOR(in0_dtype))), + inner_k_blk(INNER_K_BLK(in0_dtype)), vnni_factor(VNNI_FACTOR(in0_dtype)) {} + +bool BrgemmAMXKernelConfig::StaticParams::operator==(const StaticParams& rhs) const { + return StaticBaseParams::operator==(rhs) && EQ(inner_k_blk) && EQ(vnni_factor); +} + +size_t BrgemmAMXKernelConfig::StaticParams::compute_hash(dnnl_dim_t inner_k_blk, dnnl_dim_t vnni_factor) { + size_t seed = 0; + HASH(inner_k_blk); HASH(vnni_factor); + return seed; +} + +bool BrgemmAMXKernelConfig::need_copy_a(dnnl_dim_t K) const { + return K % get_vnni_factor() > 0; +} + +#ifdef SNIPPETS_DEBUG_CAPS +std::string BrgemmAMXKernelConfig::StaticParams::to_string() const { + std::stringstream ss; + ss << StaticBaseParams::to_string(); + ss << "inner_k_blk = " << inner_k_blk << "\n"; + ss << "vnni_factor = " << vnni_factor << "\n"; + return ss.str(); +} +#endif + +BrgemmAMXKernelExecutor::BrgemmAMXKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmAMXKernelConfig config) : + CPUKernelExecutor(std::move(kernel_cache), std::move(config)) {} + +namespace { +struct BrgemmCopyAKey { + BrgemmCopyAKey(cpu_isa_t isa, dnnl_data_type_t dt, dnnl_dim_t K, dnnl_dim_t K_blk, dnnl_dim_t K_tail, dnnl_dim_t src_stride, dnnl_dim_t LDA) + : isa(isa), dt(dt), K{K}, K_blk{K_blk}, K_tail{K_tail}, src_stride{src_stride}, LDA{LDA} {} + + size_t hash() const { + size_t seed = 0; + HASH(isa); HASH(dt); HASH(K); HASH(K_blk); HASH(K_tail); HASH(src_stride); HASH(LDA); + return seed; + } + bool operator==(const BrgemmCopyAKey& rhs) const { + return EQ(isa) && EQ(dt) && EQ(K) && EQ(K_blk) && EQ(K_tail) && EQ(src_stride) && EQ(LDA); + } + + cpu_isa_t isa {cpu_isa_t::isa_undef}; + dnnl_data_type_t dt {dnnl_data_type_t::dnnl_data_type_undef}; + dnnl_dim_t K {0}, K_blk {0}, K_tail {0}, src_stride {0}, LDA {0}; +}; +} // namespace + +std::shared_ptr BrgemmAMXKernelExecutor::compile_kernel(const BrgemmAMXKernelConfig& config) const { + std::shared_ptr compiled_kernel = std::make_shared(); + + // Brgemm is not executable - nothing to compile + if (config.is_empty()) + return compiled_kernel; + + const auto& cache = m_kernel_cache.lock(); + OPENVINO_ASSERT(cache, "Invalid kernel cache pointer in BrgemmAMXKernelExecutor::compile_kernel()"); + + auto brgemm_key = [&config](dnnl_dim_t K, dnnl_dim_t LDA, float beta) { + auto key = config; + key.update(config.get_M(), config.get_N(), K, LDA, config.get_LDB(), config.get_LDC(), beta); + return key; + }; + + auto brgemm_builder = [](const BrgemmAMXKernelConfig& k) { + 
std::shared_ptr ker = std::make_shared(); + create_brgemm_kernel(ker->brgemm_kernel, k.get_dt_in0(), k.get_dt_in1(), k.get_isa(), k.get_M(), k.get_N(), k.get_K(), + k.get_LDA(), k.get_LDB(), k.get_LDC(), k.get_beta(), true, ker->palette); + return ker; + }; + + auto brgemm_copy_a_builder = [](const BrgemmCopyAKey& k) { + std::shared_ptr ker {nullptr}; + create_brgemm_copy_a_kernel(ker, k.isa, k.dt, k.K, k.K_blk, k.K_tail, k.src_stride, k.LDA); + return ker; + }; + + auto K_tail = config.get_K() % config.get_inner_K_blk(); + auto K_body = config.get_K() - K_tail; + + float beta = config.get_beta(); + + // Brgemm Kernel for K_body + if (K_body != 0) { + const auto result = cache->getOrCreate(brgemm_key(K_body, config.get_LDA(), beta), brgemm_builder); + compiled_kernel->K_body_kernel = result.first; + beta = 1; + } + + // Brgemm Kernel for K_tail with BrgemmCopyA if needed + if (K_tail != 0) { + auto LDA = config.get_LDA(); + if (config.need_copy_a(K_tail)) { + const auto copy_A_src_stride = LDA * dnnl_data_type_size(config.get_dt_in0()); + K_tail = ov::snippets::utils::rnd_up(K_tail, config.get_vnni_factor()); + LDA = K_tail; + + const auto key = BrgemmCopyAKey(config.get_isa(), config.get_dt_in0(), config.get_K(), config.get_inner_K_blk(), K_tail, copy_A_src_stride, LDA); + const auto result = cache->getOrCreate(key, brgemm_copy_a_builder); + compiled_kernel->brgemm_copy_a_kernel = result.first; + } + + const auto result = cache->getOrCreate(brgemm_key(K_tail, LDA, beta), brgemm_builder); + compiled_kernel->K_tail_kernel = result.first; + } + + return compiled_kernel; +} + +void BrgemmAMXKernelExecutor::create_brgemm_copy_a_kernel(std::shared_ptr& kernel, + dnnl::impl::cpu::x64::cpu_isa_t isa, dnnl_data_type_t dt, + dnnl_dim_t K, dnnl_dim_t K_blk, dnnl_dim_t K_tail, dnnl_dim_t src_stride, dnnl_dim_t LDA) { + matmul::brgemm_matmul_conf_t conf_; + conf_.src_tag = dnnl_abcd; // unused + conf_.K = K; + conf_.K_tail = K_tail; + conf_.K_blk = K_blk; + conf_.use_buffer_a_tail_only = false; + conf_.LDA = LDA; + conf_.has_zero_point_b = false; + conf_.s8s8_compensation_required = false; + conf_.wei_zp_type = dnnl::impl::cpu::x64::none; + conf_.src_zp_type = dnnl::impl::cpu::x64::none; + conf_.src_dt = dt; + conf_.copy_A_src_stride = src_stride; + conf_.a_dt_sz = dnnl_data_type_size(conf_.src_dt); + // copied A has the same precision of original + conf_.tr_a_dt_sz = dnnl_data_type_size(conf_.src_dt); + conf_.transposed_A = false; + conf_.isa = isa; + + std::unique_ptr brgemm_matmul_copy_a = nullptr; + OV_CPU_JIT_EMITTER_ASSERT(create_brgemm_matmul_copy_a(brgemm_matmul_copy_a, &conf_) == dnnl_success, + "Cannot create brgemm copy a kernel due to invalid params"); + kernel = std::move(brgemm_matmul_copy_a); +} + +void BrgemmAMXKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr, + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + BrgemmAMXKernelConfig& config) const { + return BrgemmBaseKernelExecutor::update_config(expr, linear_ir, config); +} + +void BrgemmAMXKernelExecutor::configure_tiles_if_needed(amx_tile_config_t* config, const char* palette, dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K) { + auto compatible = [&](amx_tile_config_t* rhs) { + return rhs && rhs->M == M && rhs->N == N && rhs->K == K; + }; + if (config && !compatible(config)) { + config->M = M; config->N = N; config->K = K; + cpu::x64::amx_tile_configure(palette); + } +} + +void BrgemmAMXKernelExecutor::execute_brgemm_copy_a_kernel(const std::shared_ptr& kernel, + const void* src, const void* tr_src, 
dnnl_dim_t M, dnnl_dim_t K) { + auto ctx = matmul::jit_brgemm_matmul_copy_a_t::ctx_t(); + + ctx.current_M_blk = M; + ctx.zp_b_compensation_buffer_ptr = nullptr; + ctx.zp_a_compensation_result_ptr = nullptr; + ctx.zp_b_neg_value_ptr = nullptr; + ctx.zp_ab_comp_ptr = nullptr; + ctx.src = src; + ctx.tr_src = tr_src; + ctx.current_K_start = 0; + ctx.current_K_blk = K; + + OV_CPU_JIT_EMITTER_ASSERT(kernel, "has nullptr brgemm_copy_a_kernel"); + (*kernel)(&ctx); +} + +void BrgemmAMXKernelExecutor::execute(const BrgemmAMXKernelExecutor* executor, call_args* args) { + OV_CPU_JIT_EMITTER_ASSERT(executor, "has nullptr executor"); + auto kernel = executor->get_kernel(); + const auto& config = static_cast(executor->get_config()); + OV_CPU_JIT_EMITTER_ASSERT(kernel, "has nullptr compiler kernel or invalid config"); + + const auto* src_ptr = args->A; + const auto* wei_ptr = args->B; + auto* scratch = args->scratch; + + const auto K_tail = config.get_K() % config.get_inner_K_blk(); + const auto K_body = config.get_K() - K_tail; + + if (K_body != 0) { + const auto& K_body_kernel = kernel->K_body_kernel; + configure_tiles_if_needed(args->amx_tile_config, K_body_kernel->palette, config.get_M(), config.get_N(), K_body); + execute_brgemm_kernel(K_body_kernel->brgemm_kernel, src_ptr, wei_ptr, args->C, scratch, false); + + src_ptr = src_ptr + K_body * dnnl_data_type_size(config.get_dt_in0()); + wei_ptr = wei_ptr + (K_body * config.get_LDB()) * dnnl_data_type_size(config.get_dt_in1()); + } + + if (K_tail != 0) { + if (config.need_copy_a(K_tail)) { + auto* tr_src = scratch + BrgemmCPU::SCRATCH_BYTE_SIZE; + + execute_brgemm_copy_a_kernel(kernel->brgemm_copy_a_kernel, src_ptr, tr_src, config.get_M(), K_tail); + src_ptr = tr_src; + } + + const auto& K_tail_kernel = kernel->K_tail_kernel; + configure_tiles_if_needed(args->amx_tile_config, K_tail_kernel->palette, config.get_M(), config.get_N(), K_tail); + execute_brgemm_kernel(K_tail_kernel->brgemm_kernel, src_ptr, wei_ptr, args->C, scratch, false); + } +} + +#undef INNER_K_BLK +#undef VNNI_FACTOR +#undef EQ +#undef HASH + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.hpp new file mode 100644 index 00000000000000..a8544e5343b0ce --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_amx.hpp @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "brgemm_base.hpp" + +#include "emitters/plugin/x64/jit_emitter.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "emitters/snippets/cpu_kernel_executor_table.hpp" + +#include +#include + + +namespace ov { +namespace intel_cpu { + +struct BrgemmAMXKernelConfig : public BrgemmBaseKernelConfig { +public: + BrgemmAMXKernelConfig(const element::Type& in0_dtype, const element::Type& in1_dtype, dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); + BrgemmAMXKernelConfig() = delete; + + std::unique_ptr get_clone_ptr() const override { + return std::unique_ptr(new BrgemmAMXKernelConfig(*this)); + } + + dnnl_dim_t get_inner_K_blk() const { return m_static_params->inner_k_blk; } + dnnl_dim_t get_vnni_factor() const { return m_static_params->vnni_factor; } + + bool need_copy_a(dnnl_dim_t K) const; + +private: + struct StaticParams : StaticBaseParams { + StaticParams(const element::Type& in0_dtype, const element::Type& in1_dtype, 
dnnl::impl::cpu::x64::cpu_isa_t primitive_isa); + + const dnnl_dim_t inner_k_blk {0}; + const dnnl_dim_t vnni_factor {0}; + + bool operator==(const StaticParams& rhs) const; + bool operator!=(const StaticParams& rhs) const { return !(*this == rhs); } +#ifdef SNIPPETS_DEBUG_CAPS + std::string to_string() const; +#endif + private: + static size_t compute_hash(dnnl_dim_t inner_k_blk, dnnl_dim_t vnni_factor); + }; + + std::shared_ptr get_static_params() const override { return m_static_params; } + + std::shared_ptr m_static_params {nullptr}; +}; + +struct BrgemmAMXCompiledKernel { + struct BrgemmKernel { + std::shared_ptr brgemm_kernel {nullptr}; + // Note: Palette is treated as a part of a kernel because it is initialized during the kernel compilation stage. + // Each kernel need to store the pallet it was compiled with. + char palette[64] = {}; + }; + + std::shared_ptr K_body_kernel {nullptr}; + std::shared_ptr K_tail_kernel {nullptr}; + std::shared_ptr brgemm_copy_a_kernel {nullptr}; +}; + +class BrgemmAMXKernelExecutor : public BrgemmBaseKernelExecutor, + public CPUKernelExecutor { +public: + struct call_args { + const uint8_t* A = nullptr; + const uint8_t* B = nullptr; + void* C = nullptr; + uint8_t* scratch = nullptr; + amx_tile_config_t* amx_tile_config = nullptr; + }; + BrgemmAMXKernelExecutor(ov::intel_cpu::MultiCacheWeakPtr kernel_cache, BrgemmAMXKernelConfig config); + + /** Function that will be called in runtime to execute the kernel */ + static void execute(const BrgemmAMXKernelExecutor* executor, call_args* args); + +protected: + std::shared_ptr compile_kernel(const BrgemmAMXKernelConfig& c) const override; + + void update_config(const ov::snippets::lowered::ExpressionPtr& expr, + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + BrgemmAMXKernelConfig& config) const override; + + static void configure_tiles_if_needed(amx_tile_config_t* config, const char* palette, dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K); + + static void create_brgemm_copy_a_kernel(std::shared_ptr& kernel, + dnnl::impl::cpu::x64::cpu_isa_t isa, dnnl_data_type_t dt, + dnnl_dim_t K, dnnl_dim_t K_blk, dnnl_dim_t K_tail, dnnl_dim_t src_stride, dnnl_dim_t LDA); + + static void execute_brgemm_copy_a_kernel(const std::shared_ptr& kernel, + const void* src, const void* tr_src, dnnl_dim_t M, dnnl_dim_t K); +}; +#define GET_OFF_BRGEMM_AMX_ARGS(field) offsetof(BrgemmAMXKernelExecutor::call_args, field) + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.cpp new file mode 100644 index 00000000000000..17b1f0e053b577 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.cpp @@ -0,0 +1,273 @@ +// Copyright (C) 2020-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "brgemm_base.hpp" + +#include "common/utils.hpp" +#include "dnnl_extension_utils.h" +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" +#include "transformations/snippets/x64/op/brgemm_utils.hpp" + +#define DIM_CAST(X) static_cast(X) +#define DTYPE_CAST(X) static_cast(DnnlExtensionUtils::ElementTypeToDataType(X)) +#define PRINT(X) ss << #X << " = " << X << "\n" +#define EQ(X) X == rhs.X +#define HASH(X) seed = hash_combine(seed, X) + +using namespace Xbyak; +using namespace dnnl::impl; +using namespace dnnl::impl::cpu::x64; + +namespace ov { +namespace intel_cpu { + +bool BrgemmBaseKernelConfig::is_completed() const 
{ + return !utils::one_of(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC) || is_empty(); +} + +bool BrgemmBaseKernelConfig::is_empty() const { + return everyone_is(0, m_M, m_N, m_K, m_LDA, m_LDB, m_LDC, m_beta); +} + +bool BrgemmBaseKernelConfig::operator==(const BrgemmBaseKernelConfig& rhs) const { + return EQ(m_hash) && EQ(m_beta) && + EQ(m_M) && EQ(m_N) && EQ(m_K) && + EQ(m_LDA) && EQ(m_LDB) && EQ(m_LDC) && + (EQ(get_static_params()) || *get_static_params() == *(rhs.get_static_params())); +} + +void BrgemmBaseKernelConfig::update(dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t LDA, dnnl_dim_t LDB, dnnl_dim_t LDC, float beta) { + // If M is zero, it means that Brgemm won't be executed (in Loop with work_amount = 0, for example) + // To process this case, we have to make this Config as empty (nullify runtime parameters) + if (utils::one_of(0, M, N, K)) { + m_M = 0; m_N = 0; m_K = 0; + m_LDA = 0; m_LDB = 0; m_LDC = 0; + m_beta = 0; + } else { + m_M = M; m_N = N; m_K = K; + m_LDA = LDA; m_LDB = LDB; m_LDC = LDC; + m_beta = beta; + } + m_hash = compute_hash(); +} + +size_t BrgemmBaseKernelConfig::compute_hash() const { + size_t seed = get_static_params()->hash(); + HASH(m_M); HASH(m_N); HASH(m_K); + HASH(m_LDA); HASH(m_LDB); HASH(m_LDC); + HASH(m_beta); + return seed; +} + +BrgemmBaseKernelConfig::StaticBaseParams::StaticBaseParams(const element::Type& in0_dtype, const element::Type& in1_dtype, + cpu_isa_t primitive_isa, size_t hash_seed) + : dt_in0(DTYPE_CAST(in0_dtype)), dt_in1(DTYPE_CAST(in1_dtype)), isa(primitive_isa), m_hash(compute_hash(hash_seed, dt_in0, dt_in1, isa)) {} + +bool BrgemmBaseKernelConfig::StaticBaseParams::operator==(const StaticBaseParams& rhs) const { + return EQ(hash()) && EQ(dt_in0) && EQ(dt_in1) && EQ(isa); +} + +size_t BrgemmBaseKernelConfig::StaticBaseParams::compute_hash(size_t hash_seed, dnnl_data_type_t dt_in0, dnnl_data_type_t dt_in1, cpu_isa_t isa) { + size_t seed = hash_seed; + HASH(dt_in0); HASH(dt_in1); HASH(isa); + return seed; +} + +#ifdef SNIPPETS_DEBUG_CAPS +std::string BrgemmBaseKernelConfig::StaticBaseParams::to_string() const { + std::stringstream ss; + PRINT(dt_in0); PRINT(dt_in1); + PRINT(isa); + return ss.str(); +} + +std::string BrgemmBaseKernelConfig::to_string() const { + std::stringstream ss; + ss << get_static_params()->to_string() << "\n"; + PRINT(m_M); PRINT(m_N); PRINT(m_K); + PRINT(m_LDA); PRINT(m_LDB); PRINT(m_LDC); + PRINT(m_beta); + return ss.str(); +} +#endif + +float BrgemmBaseKernelExecutor::get_beta(const ov::snippets::lowered::LoopManagerPtr& loop_manager, int loop_id, + const ov::snippets::lowered::ExpandedLoopInfoPtr& current_expanded_loop_info) { + // Find all Expanded loops with the same Unified loop information -> they were decomposed from this Unified Loop. + // Note that LoopInfo are normalized and sorted (due to NormalizedLoopIDs pass). + // It means that previous executed Loops have Loop ID less the current Loop ID. + // - If there is executed Loop (work_amount > 0) and evaluated before the current -> the current Brgemm should have `beta = 1`. + // - If there is not this Loop -> the current executed Brgemm should have `beta = 0`. 
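To make the beta convention described in the comment above concrete: the first K-block that actually executes overwrites the accumulator (beta = 0), and every later K-block decomposed from the same unified loop accumulates into it (beta = 1). A tiny standalone illustration of that accumulation semantics follows; it is not the executor itself, and the numbers are made up:

    // C = beta * C + A_k * B_k over K-blocks; the first executed block writes, later blocks accumulate.
    #include <cstdio>

    int main() {
        const int num_k_blocks = 3;
        float C = 0.f;
        for (int k = 0; k < num_k_blocks; ++k) {
            const float beta = (k == 0) ? 0.f : 1.f;           // mirrors get_beta(): 0 only for the first executed block
            const float partial = static_cast<float>(k + 1);   // stands in for the A_k * B_k partial product
            C = beta * C + partial;
        }
        std::printf("C = %f\n", C);  // 6.0 = 1 + 2 + 3
        return 0;
    }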
+ if (loop_id > 0) { + const auto& current_unified_loop_info = current_expanded_loop_info->get_unified_loop_info(); + // Check the previous Loops + --loop_id; + while (loop_id >= 0) { + const auto& expanded_loop_info = loop_manager->get_loop_info(loop_id); + if (expanded_loop_info->get_unified_loop_info() != current_unified_loop_info) + return 0; + if (expanded_loop_info->get_work_amount() > 0) { + // there is previous executed Brgemm with `beta = 0` -> the current Brgemm should have `beta = 1` + return 1; + } + --loop_id; + } + } + return 0; +} + +void BrgemmBaseKernelExecutor::update_config(const ov::snippets::lowered::ExpressionPtr& expr, + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + BrgemmBaseKernelConfig& config) { + const auto& input_pds = expr->get_input_port_descriptors(); + const auto& output_pds = expr->get_output_port_descriptors(); + OV_CPU_JIT_EMITTER_ASSERT((input_pds.size() == 2 || input_pds.size() == 3) && output_pds.size() == 1, + "Invalid number of in/out port descriptors"); + + const auto in0_shape = snippets::utils::get_planar_vdims(input_pds[0]->get_shape(), input_pds[0]->get_layout()); + const auto in1_shape = snippets::utils::get_planar_vdims(input_pds[1]->get_shape(), input_pds[1]->get_layout()); + auto in0_subtensor = input_pds[0]->get_subtensor(); + auto in1_subtensor = input_pds[1]->get_subtensor(); + + // Need to update M, K, N + // 1. If the original value in subtensor is `FULL_DIM`, it means that + // Brgemm block should process full tensor by this dim -> take dimension from shape + // 2. Otherwise, Brgemm block processes part of the tensor by this dim + // (there is blocking by this dimension) -> take from Loop increment + + auto M = *++in0_subtensor.rbegin(); + auto K = *in0_subtensor.rbegin(); + auto N = *in1_subtensor.rbegin(); + + size_t loop_idx = 0; + const auto& loop_ids = expr->get_loop_ids(); + const auto& loop_manager = linear_ir->get_loop_manager(); + auto get_loop_info = [&](){ + OPENVINO_ASSERT(loop_idx < loop_ids.size(), "Loop is missed"); + return loop_manager->get_loop_info(loop_ids[loop_idx++]); + }; + + /* ------- Dimension M ----------*/ + if (ov::snippets::utils::is_full_dim_value(M)) { + M = *++in0_shape.rbegin(); + } else { + const auto& current_expanded_loop_info = get_loop_info(); + const auto& in_ports = current_expanded_loop_info->get_input_ports(); + const auto& out_ports = current_expanded_loop_info->get_output_ports(); + // Quick validation check: Should we check that port is really Brgemm port? + // If BrgemmCopyB in the Loop by M -> first input port will be BrgemmCopyB with `incremented=false` + // to avoid extra checks, we validate only first input port + // Note: We check `is_incremented` attribute only for not incremented ports because + // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization + auto check_port = [&](const ov::snippets::lowered::LoopPort& p) { return p.dim_idx == 1; }; + OPENVINO_ASSERT(in_ports.size() > 1 && std::all_of(in_ports.cbegin(), in_ports.cend(), check_port) && + out_ports.size() == 1 && check_port(out_ports.back()), + "Incorrect Loop by Brgemm dimension M"); + M = current_expanded_loop_info->get_work_amount() > 0 ? 
current_expanded_loop_info->get_increment() : 0; + input_pds[0]->set_subtensor_dim(1, M); + output_pds[0]->set_subtensor_dim(1, M); + } + + /* ------- Dimension N ----------*/ + if (ov::snippets::utils::is_full_dim_value(N)) { + N = *in1_shape.rbegin(); + } else { + const auto& current_expanded_loop_info = get_loop_info(); + const auto& in_ports = current_expanded_loop_info->get_input_ports(); + const auto& out_ports = current_expanded_loop_info->get_output_ports(); + // Quick validation check: Should we check that port is really Brgemm port? + // Note: We check `is_incremented` attribute only for not incremented ports because + // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization + auto check_port = [&](const ov::snippets::lowered::LoopPort& p) { return p.dim_idx == 0; }; + OPENVINO_ASSERT(in_ports.size() >= 2 && !in_ports.front().is_incremented && std::all_of(in_ports.cbegin(), in_ports.cend(), check_port) && + out_ports.size() == 1 && check_port(out_ports.back()), + "Incorrect Loop by Brgemm dimension N"); + N = current_expanded_loop_info->get_work_amount() > 0 ? current_expanded_loop_info->get_increment() : 0; + input_pds[1]->set_subtensor_dim(0, N); + output_pds[0]->set_subtensor_dim(0, N); + } + + /* ------- Dimension K ----------*/ + // 1. If Brgemm block processes full dimension K -> `beta = 0` + // 2. If Brgemm block processes part of the dimension K (there is blocking), we need to find + // the first executed Brgemm block among the Loops which iterate through dimension K (work_amount > 0). + // The first of them will have `beta = 0`, the others - `beta = 1` + float beta = 0; + if (ov::snippets::utils::is_full_dim_value(K)) { + K = *in0_shape.rbegin(); + } else { + const auto& current_expanded_loop_info = get_loop_info(); + const auto& in_ports = current_expanded_loop_info->get_input_ports(); + const auto& out_ports = current_expanded_loop_info->get_output_ports(); + // Quick validation check: Should we check that port is really Brgemm port? + // Note: We check `is_incremented` attribute only for not incremented ports because + // this `is_incremented = true` can be changed by `CleanRepeatedDataPointerShifts` optimization + OPENVINO_ASSERT(in_ports.size() >= 2 && in_ports.front().dim_idx == 0 && in_ports.back().dim_idx == 1 && + out_ports.size() == 1 && !out_ports.front().is_incremented, + "Incorrect Loop by Brgemm dimension K"); + K = current_expanded_loop_info->get_work_amount() > 0 ?
current_expanded_loop_info->get_increment() : 0; + input_pds[0]->set_subtensor_dim(0, K); + input_pds[1]->set_subtensor_dim(1, K); + if (K > 0) + beta = get_beta(loop_manager, static_cast(loop_ids.back()), current_expanded_loop_info); + } + + const auto LDA = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(0))); + const auto LDC = DIM_CAST(snippets::utils::get_dim_stride(expr->get_output_port(0))); + auto LDB = DIM_CAST(snippets::utils::get_dim_stride(expr->get_input_port(1))); + + const auto& brgemm_node = as_type_ptr(expr->get_node()); + OV_CPU_JIT_EMITTER_ASSERT(brgemm_node, "Got invalid node type in update_config"); + // In case of data repacking LDB is chosen in accordance with repacking buffer size + if (with_repacking(brgemm_node->get_type())) + LDB = DIM_CAST(brgemm_utils::repacking::compute_LDB(LDB, brgemm_node->get_input_element_type(1))); + + config.update(DIM_CAST(M), DIM_CAST(N), DIM_CAST(K), LDA, LDB, LDC, beta); +} + +void BrgemmBaseKernelExecutor::create_brgemm_kernel(std::shared_ptr& kernel, dnnl_data_type_t dt0, dnnl_data_type_t dt1, + cpu_isa_t isa, dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, + dnnl_dim_t LDA, dnnl_dim_t LDB, dnnl_dim_t LDC, float beta, bool with_amx, char* palette) { + cpu::x64::brgemm_desc_t desc; + OV_CPU_JIT_EMITTER_ASSERT(brgemm_desc_init(&desc, isa, cpu::x64::brgemm_strd, dt0, dt1, + false, false, cpu::x64::brgemm_row_major, 1.f, + beta, LDA, LDB, LDC, M, N, K, nullptr) == dnnl_success, + "Cannot initialize brgemm descriptor due to invalid params"); + + if (with_amx) { + OV_CPU_JIT_EMITTER_ASSERT(palette && brgemm_init_tiles(desc, palette) == dnnl_success, + "Cannot initialize brgemm tiles due to invalid params"); + } + + cpu::x64::brgemm_kernel_t* kernel_ = nullptr; + OV_CPU_JIT_EMITTER_ASSERT(brgemm_kernel_create(&kernel_, desc) == dnnl_success, "Cannot create brgemm kernel due to invalid params"); + kernel = std::unique_ptr(kernel_); +} + +void BrgemmBaseKernelExecutor::execute_brgemm_kernel(const std::shared_ptr& kernel, + const void* src, const void* wei, void* dst, void* scratch, bool with_comp) { + cpu::x64::brgemm_kernel_params_t brgemm_p; + brgemm_p.batch = nullptr; // default value + brgemm_p.ptr_A = src; + brgemm_p.ptr_B = wei; + brgemm_p.ptr_C = dst; + brgemm_p.ptr_D = dst; + brgemm_p.ptr_buf = scratch; + brgemm_p.ptr_bias = nullptr; + brgemm_p.do_post_ops = with_comp; + brgemm_p.do_apply_comp = with_comp; + brgemm_p.skip_accm = 0; + brgemm_p.BS = 1; // default value + OV_CPU_JIT_EMITTER_ASSERT(kernel, "has nullptr Brgemm kernel"); + (*kernel)(&brgemm_p); +} + +#undef DIM_CAST +#undef DTYPE_CAST +#undef PRINT +#undef EQ +#undef HASH + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.hpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.hpp new file mode 100644 index 00000000000000..74a5c2b76daf65 --- /dev/null +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/kernel_executors/brgemm_base.hpp @@ -0,0 +1,102 @@ +// Copyright (C) 2020-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "openvino/core/type/element_type.hpp" + +#include "cpu/x64/cpu_isa_traits.hpp" + +#include "emitters/plugin/x64/jit_emitter.hpp" +#include "emitters/snippets/jit_snippets_call_args.hpp" +#include "emitters/snippets/cpu_kernel_executor_table.hpp" +#include + +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/lowered/loop_info.hpp" + +namespace ov { +namespace intel_cpu { + 
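// Usage sketch (illustrative only, names are hypothetical): a concrete kernel config is expected to derive
// from BrgemmBaseKernelConfig, extend StaticBaseParams with its own compile-time constants, and expose them
// via get_static_params(), for example:
//
//   struct MyBrgemmKernelConfig : public BrgemmBaseKernelConfig {
//       struct StaticParams : StaticBaseParams {
//           StaticParams(const element::Type& in0, const element::Type& in1, dnnl::impl::cpu::x64::cpu_isa_t isa)
//               : StaticBaseParams(in0, in1, isa, 0 /* hash seed */) {}
//       };
//       std::shared_ptr<StaticBaseParams> get_static_params() const override { return m_static_params; }
//       std::shared_ptr<StaticParams> m_static_params {nullptr};
//   };
//
// BrgemmAMXKernelConfig in this change follows the same pattern, keeping inner_k_blk and vnni_factor in its StaticParams.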
+struct BrgemmBaseKernelConfig : public snippets::KernelExecutorBase::GenericConfig { +public: + BrgemmBaseKernelConfig() = default; + + bool is_completed() const override; + size_t hash() const override { return m_hash; } + + bool is_empty() const; + void update(dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, dnnl_dim_t LDA, dnnl_dim_t LDB, dnnl_dim_t LDC, float beta); + + bool operator==(const BrgemmBaseKernelConfig& rhs) const; + bool operator!=(const BrgemmBaseKernelConfig& rhs) const {return !(*this == rhs);} + + dnnl_data_type_t get_dt_in0() const { return get_static_params()->dt_in0; } + dnnl_data_type_t get_dt_in1() const { return get_static_params()->dt_in1; } + + dnnl::impl::cpu::x64::cpu_isa_t get_isa() const { return get_static_params()->isa; } + float get_beta() const { return m_beta; } + + dnnl_dim_t get_M() const { return m_M; } + dnnl_dim_t get_N() const { return m_N; } + dnnl_dim_t get_K() const { return m_K; } + + dnnl_dim_t get_LDA() const { return m_LDA; } + dnnl_dim_t get_LDB() const { return m_LDB; } + dnnl_dim_t get_LDC() const { return m_LDC; } + +#ifdef SNIPPETS_DEBUG_CAPS + std::string to_string() const override; +#endif + +protected: + struct StaticBaseParams { + StaticBaseParams(const element::Type& in0_dtype, const element::Type& in1_dtype, dnnl::impl::cpu::x64::cpu_isa_t primitive_isa, size_t hash_seed); + virtual ~StaticBaseParams() = default; + + const dnnl_data_type_t dt_in0 {dnnl_f32}, dt_in1 {dnnl_f32}; + const dnnl::impl::cpu::x64::cpu_isa_t isa {dnnl::impl::cpu::x64::isa_undef}; + + size_t hash() const { return m_hash; } + + bool operator==(const StaticBaseParams& rhs) const; + bool operator!=(const StaticBaseParams& rhs) const { return !(*this == rhs); } +#ifdef SNIPPETS_DEBUG_CAPS + std::string to_string() const; +#endif + protected: + static size_t compute_hash(size_t hash_seed, dnnl_data_type_t dt_in0, dnnl_data_type_t dt_in1, dnnl::impl::cpu::x64::cpu_isa_t isa); + + const size_t m_hash {0}; + }; + + virtual std::shared_ptr get_static_params() const = 0; + size_t compute_hash() const; + + dnnl_dim_t m_M {0}, m_N {0}, m_K {0}, m_LDA {0}, m_LDB {0}, m_LDC {0}; + float m_beta {0}; + size_t m_hash {SIZE_MAX}; +}; + +class BrgemmBaseKernelExecutor { +public: + virtual ~BrgemmBaseKernelExecutor() = default; +protected: + static float get_beta(const ov::snippets::lowered::LoopManagerPtr& loop_manager, int loop_id, + const ov::snippets::lowered::ExpandedLoopInfoPtr& current_expanded_loop_info); + + static void update_config(const ov::snippets::lowered::ExpressionPtr& expr, + const ov::snippets::lowered::LinearIRCPtr& linear_ir, + BrgemmBaseKernelConfig& config); + + static void create_brgemm_kernel(std::shared_ptr& kernel, dnnl_data_type_t dt0, dnnl_data_type_t dt1, + dnnl::impl::cpu::x64::cpu_isa_t isa, dnnl_dim_t M, dnnl_dim_t N, dnnl_dim_t K, + dnnl_dim_t LDA, dnnl_dim_t LDB, dnnl_dim_t LDC, float beta, bool with_amx = false, char* palette = nullptr); + + static void execute_brgemm_kernel(const std::shared_ptr& kernel, const void* src, const void* wei, + void* dst, void* scratch, bool with_comp); +}; + +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp b/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp index 78563bc00aa228..269212edf1ab9b 100644 --- a/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp +++ b/src/plugins/intel_cpu/src/emitters/snippets/x64/verbose.cpp @@ -11,6 +11,9 @@ #include "jit_brgemm_copy_b_emitter.hpp" #include "jit_kernel_emitter.hpp" #include 
"jit_snippets_emitters.hpp" +#include "kernel_executors/brgemm.hpp" +#include "kernel_executors/brgemm_amx.hpp" + #ifndef _WIN32 #include @@ -86,9 +89,12 @@ static std::string init_info_jit_store_memory_emitter(const jit_store_memory_emi std::string init_info_jit_brgemm_emitter(const jit_brgemm_emitter *emitter) { std::stringstream ss; - ss << "Emitter_type_name:jit_brgemm_emitter" - << emitter->m_kernel_executor->to_string() - << " m_memory_offset:" << vector_to_string(emitter->m_memory_offsets) + ss << "Emitter_type_name:jit_brgemm_emitter"; + if (const auto& common = std::dynamic_pointer_cast(emitter->m_kernel_executor)) + ss << common->to_string(); + if (const auto& amx = std::dynamic_pointer_cast(emitter->m_kernel_executor)) + ss << amx->to_string(); + ss << " m_memory_offset:" << vector_to_string(emitter->m_memory_offsets) << " m_buffer_ids:" << vector_to_string(emitter->m_buffer_ids); return ss.str(); diff --git a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp index 660db85cd61529..5f63904fbb9342 100644 --- a/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp +++ b/src/plugins/intel_cpu/src/nodes/executors/aarch64/jit_eltwise.cpp @@ -26,6 +26,7 @@ bool JitEltwiseExecutor::isSupported( Algorithm::EltwiseEqual, Algorithm::EltwiseExp, Algorithm::EltwiseFloor, + Algorithm::EltwiseFloorMod, Algorithm::EltwiseCeiling, Algorithm::EltwiseGeluErf, Algorithm::EltwiseGeluTanh, diff --git a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp index 7ac3b603353541..9a1662e2c5dab5 100644 --- a/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp +++ b/src/plugins/intel_cpu/src/nodes/kernels/aarch64/jit_uni_eltwise_generic.cpp @@ -648,6 +648,7 @@ std::shared_ptr jit_uni_eltwise_generic::create_eltwise_emitte OV_CASE(Algorithm::EltwiseEqual, ov::intel_cpu::aarch64::jit_equal_emitter), OV_CASE(Algorithm::EltwiseExp, ov::intel_cpu::aarch64::jit_exp_emitter), OV_CASE(Algorithm::EltwiseFloor, ov::intel_cpu::aarch64::jit_floor_emitter), + OV_CASE(Algorithm::EltwiseFloorMod, ov::intel_cpu::aarch64::jit_floor_mod_emitter), OV_CASE(Algorithm::EltwiseCeiling, ov::intel_cpu::aarch64::jit_ceiling_emitter), OV_CASE(Algorithm::EltwiseHswish, ov::intel_cpu::aarch64::jit_hswish_emitter), OV_CASE(Algorithm::EltwiseIsFinite, ov::intel_cpu::aarch64::jit_is_finite_emitter), @@ -830,6 +831,7 @@ std::set> eltwise_precision_helper::get_supported_pre OV_CASE(Algorithm::EltwiseEqual, jit_equal_emitter), OV_CASE(Algorithm::EltwiseExp, jit_exp_emitter), OV_CASE(Algorithm::EltwiseFloor, jit_floor_emitter), + OV_CASE(Algorithm::EltwiseFloorMod, jit_floor_mod_emitter), OV_CASE(Algorithm::EltwiseCeiling, jit_ceiling_emitter), OV_CASE(Algorithm::EltwiseGeluErf, jit_gelu_erf_emitter), OV_CASE(Algorithm::EltwiseGeluTanh, jit_gelu_tanh_emitter), diff --git a/src/plugins/intel_cpu/src/nodes/subgraph.cpp b/src/plugins/intel_cpu/src/nodes/subgraph.cpp index a23835d398cbe7..d5579fea23b6b1 100644 --- a/src/plugins/intel_cpu/src/nodes/subgraph.cpp +++ b/src/plugins/intel_cpu/src/nodes/subgraph.cpp @@ -32,7 +32,7 @@ #include "emitters/snippets/x64/cpu_generator.hpp" #include "transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.hpp" #include "transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp" -#include "transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp" 
+#include "transformations/snippets/x64/pass/lowered/insert_brgemm_copy_buffers.hpp" #include "transformations/snippets/x64/pass/remove_converts.hpp" #include "transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.hpp" #include "transformations/snippets/x64/pass/eliminate_brgemm_copy_b.hpp" @@ -694,7 +694,7 @@ Subgraph::ControlFlowPasses Subgraph::getControlFlowPasses() const { SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::snippets::lowered::pass::InsertLoops, ov::intel_cpu::pass::FuseLoadStoreConvert); SNIPPETS_REGISTER_PASS_RELATIVE(Place::Before, ov::snippets::lowered::pass::InsertBuffers, - ov::intel_cpu::pass::InsertBrgemmCopyBBuffers); + ov::intel_cpu::pass::InsertBrgemmCopyBuffers); #ifdef SNIPPETS_LIBXSMM_TPP SNIPPETS_REGISTER_PASS_RELATIVE(Place::Before, ov::intel_cpu::pass::BrgemmCPUBlocking, diff --git a/src/plugins/intel_cpu/src/plugin.cpp b/src/plugins/intel_cpu/src/plugin.cpp index b74d4f7c8acbbb..6fdbf7a4ea4dee 100644 --- a/src/plugins/intel_cpu/src/plugin.cpp +++ b/src/plugins/intel_cpu/src/plugin.cpp @@ -247,6 +247,7 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< // update the props after the perf mode translated to configs // TODO: Clarify the behavior of SetConfig method. Skip eng_config or not? Config conf = engConfig; + conf.applyRtInfo(cloned_model); conf.readProperties(config, modelType); Transformations transformations(cloned_model, conf); @@ -520,6 +521,7 @@ ov::SupportedOpsMap Plugin::query_model(const std::shared_ptr& Config conf = engConfig; Config::ModelType modelType = getModelType(model); + conf.applyRtInfo(model); conf.readProperties(config, modelType); auto context = std::make_shared(conf, fake_w_cache, false); @@ -575,7 +577,7 @@ std::shared_ptr Plugin::import_model(std::istream& model_str Config conf = engConfig; Config::ModelType modelType = getModelType(model); - + conf.applyRtInfo(model); // check ov::loaded_from_cache property and erase it to avoid exception in readProperties. auto _config = config; const auto& it = _config.find(ov::loaded_from_cache.name()); diff --git a/src/plugins/intel_cpu/src/plugin.h b/src/plugins/intel_cpu/src/plugin.h index 2548ba2c1cc8af..8973478d30403f 100644 --- a/src/plugins/intel_cpu/src/plugin.h +++ b/src/plugins/intel_cpu/src/plugin.h @@ -50,7 +50,6 @@ class Plugin : public ov::IPlugin { void get_performance_streams(Config& config, const std::shared_ptr& model) const; void calculate_streams(Config& conf, const std::shared_ptr& model, bool imported = false) const; - Config engConfig; /* Explicily configured streams have higher priority than performance hints. 
So track if streams is set explicitly (not auto-configured) */ diff --git a/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp b/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp index b99e1bc62c4b11..2dccce257ae116 100644 --- a/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp +++ b/src/plugins/intel_cpu/src/shape_inference/shape_inference.cpp @@ -56,6 +56,7 @@ #include "gather_nd_shape_inference.hpp" #include "gather_shape_inference.hpp" #include "gather_tree_shape_inference.hpp" +#include "glu_shape_inference.hpp" #include "grid_sample_shape_inference.hpp" #include "group_convolution_backprop_shape_inference.hpp" #include "group_convolution_shape_inference.hpp" @@ -575,6 +576,7 @@ const IStaticShapeInferFactory::TRegistry IStaticShapeInferFactory::registry{ _OV_OP_SHAPE_INFER_MASK_REG(ov::op::internal::AUGRUCell, ShapeInferTA, util::bit::mask()), _OV_OP_SHAPE_INFER_MASK_REG(ov::op::internal::AUGRUSequence, ShapeInferTA, util::bit::mask()), _OV_OP_SHAPE_INFER_MASK_REG(ov::op::internal::RMSNorm, ShapeInferTA, util::bit::mask(1)), + _OV_OP_SHAPE_INFER_MASK_REG(ov::op::internal::GLU, ShapeInferTA, util::bit::mask()), }; // clang-format on diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp index 6a4fc83d409355..9088ced9c18649 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.cpp @@ -44,18 +44,17 @@ cpu_isa_t get_primitive_isa(const ov::element::Type& dt_in0, bool is_with_amx) { #undef SUPPORT } -BRGEMM_TYPE get_brgemm_type(const ov::element::Type& element_type_a, const Dimension& K_dim, bool transpose_b) { +BRGEMM_TYPE get_brgemm_type(const ov::element::Type& element_type_a, bool transpose_b) { if (element_type_a == element::f32) return transpose_b ? BRGEMM_TYPE::REPACKING_ONLY : BRGEMM_TYPE::STAND_ALONE; OPENVINO_ASSERT(element_type_a != element::bf16 || mayiuse(dnnl::impl::cpu::x64::avx512_core_bf16), "BF16 precision is not supported on this hardware"); - const auto brgemmVNNIFactor = 4 / element_type_a.size(); if (one_of(element_type_a, element::u8, element::i8, element::bf16) && - dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx) && - K_dim.is_static() && K_dim.get_length() % brgemmVNNIFactor == 0) + dnnl::impl::cpu::x64::mayiuse(dnnl::impl::cpu::x64::avx512_core_amx)) return BRGEMM_TYPE::WITH_AMX; + // Note: this condition reproduces logic from the OneDNN Brgemm implementation. This is needed to align with the // backend requirements. 
More details in onednn/src/cpu/x64/brgemm/brgemm_utils.cpp if (element_type_a == ov::element::i8) @@ -87,6 +86,10 @@ size_t compute_inner_n_block(const ov::element::Type& precision) { } } +size_t compute_inner_k_block(const ov::element::Type& precision) { + return brgemm_utils::get_elems_in_vec(precision); +} + ov::snippets::lowered::ExpressionPtr get_copy_b_expr(const ov::snippets::lowered::ExpressionPtr& brgemm_expr) { OPENVINO_ASSERT(ov::is_type(brgemm_expr->get_node()), "get_copy_b_expr must be called only for BrgemmCPU node"); const auto b_input_expr = brgemm_expr->get_input_port_connector(1)->get_source().get_expr(); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp index 0d8e3f5fb6fc9b..672b67888eef9b 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/op/brgemm_utils.hpp @@ -23,7 +23,7 @@ enum class BRGEMM_TYPE { dnnl::impl::cpu::x64::cpu_isa_t get_primitive_isa(const ov::element::Type& dt_in0, bool is_with_amx); -BRGEMM_TYPE get_brgemm_type(const element::Type& element_type_a, const Dimension& K_dim, bool transpose_b); +BRGEMM_TYPE get_brgemm_type(const element::Type& element_type_a, bool transpose_b); inline bool stand_alone(BRGEMM_TYPE type) { return type == BRGEMM_TYPE::STAND_ALONE; } @@ -45,6 +45,8 @@ size_t get_elems_in_vec(const ov::element::Type& precision); namespace repacking { /// \brief Computes inner N block size used by OneDNN implementation. Depends on tensor precision size_t compute_inner_n_block(const ov::element::Type& precision); +/// \brief Computes inner K block size used by OneDNN implementation. Depends on tensor precision +size_t compute_inner_k_block(const ov::element::Type& precision); /** * @brief Computes leading dimension (LDB) which must be used in brgemm and brgemm_copy_b emitters * @param n_block N block size shared between BrgemmCPU and BrgemmCopyB node diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp index abb6147bac3588..50182765856777 100644 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.cpp @@ -60,15 +60,13 @@ pass::BrgemmToBrgemmCPU::BrgemmToBrgemmCPU() { const auto dimsMatMulIn0 = snippets::utils::get_planar_pshape(brgemm->input(0)); const auto dimsMatMulIn1 = snippets::utils::get_planar_pshape(brgemm->input(1)); - const auto K = *dimsMatMulIn0.rbegin(); - const auto& layout_a = brgemm_in0_desc->get_layout(); const auto& layout_b = brgemm_in1_desc->get_layout(); const auto& layout_c = brgemm_out_desc->get_layout(); const auto element_type_a = brgemm->get_input_element_type(0); const bool transpose_b = !layout_b.empty() && layout_b.back() != layout_b.size() - 1; - const auto brgemm_type = brgemm_utils::get_brgemm_type(element_type_a, K, transpose_b); + const auto brgemm_type = brgemm_utils::get_brgemm_type(element_type_a, transpose_b); const auto offset_a = brgemm->get_offset_a(); const auto offset_b = brgemm->get_offset_b(); const auto offset_c = brgemm->get_offset_c(); diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.cpp 
b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.cpp deleted file mode 100644 index bd8dd12bd39256..00000000000000 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.cpp +++ /dev/null @@ -1,60 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#include "insert_brgemm_copy_b_buffers.hpp" - -#include "snippets/lowered/loop_manager.hpp" -#include "snippets/itt.hpp" - -#include "transformations/snippets/x64/op/brgemm_copy_b.hpp" -#include "expressions/brgemm_copy_b_buffer_expressions.hpp" - - -using namespace ov::intel_cpu::brgemm_utils::repacking; -using namespace ov::snippets::lowered; - -namespace ov { -namespace intel_cpu { -namespace pass { - -bool InsertBrgemmCopyBBuffers::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { - OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertBrgemmCopyBBuffers") - - const auto& factory = linear_ir.get_expr_factory(); - - auto insert_buffer = [&](const ExpressionPtr& copy_b_expr, size_t out_port, LinearIR::constExprIt insertion_pos) { - const auto& copy_b = ov::as_type_ptr(copy_b_expr->get_node()); - const auto& copy_b_out = copy_b_expr->get_output_port_connector(out_port); - const auto copy_b_consumers = copy_b_out->get_consumers(); - OPENVINO_ASSERT(copy_b_consumers.size() == 1, "BufferCopyB must have only one consumer on each out port - Brgemm"); - const auto& buffer_op = std::make_shared(copy_b->output(out_port)); - BufferExpressionPtr buffer_expr = nullptr; - if (out_port == 0) { - buffer_expr = factory->build(buffer_op, {copy_b_out}); - } else if (out_port == 1 && with_compensations(copy_b->get_type())) { - buffer_expr = factory->build(buffer_op, {copy_b_out}); - } else { - OPENVINO_THROW("BrgemmCopyB has incorrect output ports"); - } - return linear_ir.insert_expr(buffer_expr, LoopManager::get_common_outer_loops(copy_b_expr, copy_b_consumers.begin()->get_expr()), - true, insertion_pos, {copy_b_consumers}); - }; - - bool modified = false; - for (auto expr_it = begin; expr_it != end; ++expr_it) { - const auto expr = *expr_it; - if (auto copy_b = ov::as_type_ptr(expr->get_node())) { - for (size_t i = 0; i < expr->get_output_count(); ++i) { - expr_it = insert_buffer(expr, i, std::next(expr_it)); - } - modified = true; - } - } - return modified; -} - -} // namespace pass -} // namespace intel_cpu -} // namespace ov - diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp deleted file mode 100644 index a08bc507aa60da..00000000000000 --- a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (C) 2018-2024 Intel Corporation -// SPDX-License-Identifier: Apache-2.0 -// - -#pragma once - -#include "snippets/lowered/pass/pass.hpp" - -namespace ov { -namespace intel_cpu { -namespace pass { - -/** - * @interface InsertBrgemmCopyBBuffers - * @brief Insert Buffers after BrgemmCopyB with algorithm of allocation size calculation which - * distinguishes with common algorithm - * @ingroup snippets - */ -class InsertBrgemmCopyBBuffers: public snippets::lowered::pass::RangedPass { -public: - InsertBrgemmCopyBBuffers() = default; - OPENVINO_RTTI("InsertBrgemmCopyBBuffers", "Pass"); - bool 
run(snippets::lowered::LinearIR& linear_ir, snippets::lowered::LinearIR::constExprIt begin, snippets::lowered::LinearIR::constExprIt end) override; -}; - -} // namespace pass -} // namespace intel_cpu -} // namespace ov diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_buffers.cpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_buffers.cpp new file mode 100644 index 00000000000000..14134b1cd0980f --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_buffers.cpp @@ -0,0 +1,104 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "insert_brgemm_copy_buffers.hpp" + +#include "snippets/lowered/loop_manager.hpp" +#include "snippets/itt.hpp" + +#include "transformations/snippets/x64/op/brgemm_cpu.hpp" +#include "transformations/snippets/x64/op/brgemm_copy_b.hpp" +#include "expressions/brgemm_copy_b_buffer_expressions.hpp" + + +using namespace ov::intel_cpu::brgemm_utils::repacking; +using namespace ov::snippets::lowered; + +namespace ov { +namespace intel_cpu { +namespace pass { + +bool InsertBrgemmCopyBuffers::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) { + OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertBrgemmCopyBuffers") + + const auto& factory = linear_ir.get_expr_factory(); + + auto insert_copy_b_buffer = [&](const ExpressionPtr& copy_b_expr, size_t out_port, LinearIR::constExprIt insertion_pos) { + const auto& copy_b = ov::as_type_ptr(copy_b_expr->get_node()); + const auto& copy_b_out = copy_b_expr->get_output_port_connector(out_port); + const auto copy_b_consumers = copy_b_out->get_consumers(); + OPENVINO_ASSERT(copy_b_consumers.size() == 1, "BufferCopyB must have only one consumer on each out port - Brgemm"); + const auto& buffer_op = std::make_shared(copy_b->output(out_port)); + BufferExpressionPtr buffer_expr = nullptr; + if (out_port == 0) { + buffer_expr = factory->build(buffer_op, {copy_b_out}); + } else if (out_port == 1 && with_compensations(copy_b->get_type())) { + buffer_expr = factory->build(buffer_op, {copy_b_out}); + } else { + OPENVINO_THROW("BrgemmCopyB has incorrect output ports"); + } + return linear_ir.insert_expr(buffer_expr, LoopManager::get_common_outer_loops(copy_b_expr, copy_b_consumers.begin()->get_expr()), + true, insertion_pos, {copy_b_consumers}); + }; + + auto update_scratchpad = [](const ExpressionPtr& brgemm_expr, const BufferExpressionPtr& scratch_expr) { + OPENVINO_ASSERT(scratch_expr && scratch_expr->is_independent_memory(), "Incorrect Scratchpad buffer for Brgemm AMX"); + const auto src_dt = brgemm_expr->get_node()->get_input_element_type(0); + const auto in_subtensor = ov::snippets::utils::get_projected_subtensor(brgemm_expr->get_input_port(0)); + const auto shape0 = ov::snippets::utils::get_planar_vdims(brgemm_expr->get_input_port(0)); + const auto K_dim = shape0.back(); + const auto M_blk = *++in_subtensor.rbegin(); + OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(M_blk), "M blk cannot be dynamic!"); + + const auto vnni_factor = brgemm_utils::compute_vnni_factor(src_dt); + const auto inner_k_blk = brgemm_utils::repacking::compute_inner_k_block(src_dt); + const auto tile_scratch_size = BrgemmCPU::SCRATCH_BYTE_SIZE; + const auto current_scratch_size = scratch_expr->get_byte_size(); + OPENVINO_ASSERT(current_scratch_size == tile_scratch_size, + "Tile scratchpad for BrgemmAMX should have 
byte size ", tile_scratch_size); + size_t inner_k_size = 0; + if (ov::snippets::utils::is_dynamic_value(K_dim)) { + // In dynamic case we don't know exactly if we need repacking of MatMul first input. + // Because of that, we allocate maximum possible size for repacked data in compilation stage. + inner_k_size = inner_k_blk; + } else { + // In static case, we allocate buffer for repacked data only if we have to repack MatMul first input: + // only if `K_dim % inner_k_blk > 0` + const auto inner_k_tail = K_dim % inner_k_blk; + inner_k_size = inner_k_tail % vnni_factor > 0 ? ov::snippets::utils::rnd_up(inner_k_tail, vnni_factor) : 0; + } + const auto repacked_in0_size = M_blk * inner_k_size * src_dt.size(); + scratch_expr->set_allocation_size(tile_scratch_size + repacked_in0_size); + }; + + bool modified = false; + for (auto expr_it = begin; expr_it != end; ++expr_it) { + const auto brgemm_expr = *expr_it; + if (const auto brgemm_cpu = ov::as_type_ptr(brgemm_expr->get_node())) { + if (brgemm_utils::with_repacking(brgemm_cpu->get_type())) { + // BrgemmCopyB might be extracted from the body + if (const auto copy_b_expr = brgemm_utils::repacking::get_copy_b_expr(brgemm_expr)) { + auto insertion_it = std::next(linear_ir.find_before(expr_it, copy_b_expr)); + for (size_t i = 0; i < copy_b_expr->get_output_count(); ++i) { + insertion_it = std::next(insert_copy_b_buffer(copy_b_expr, i, insertion_it)); + } + modified = true; + } + } + + if (brgemm_utils::with_amx(brgemm_cpu->get_type())) { + const auto& scratch_expr = + ov::as_type_ptr(brgemm_expr->get_input_port_connector(2)->get_source().get_expr()); + update_scratchpad(brgemm_expr, scratch_expr); + modified = true; + } + } + } + return modified; +} + +} // namespace pass +} // namespace intel_cpu +} // namespace ov + diff --git a/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_buffers.hpp b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_buffers.hpp new file mode 100644 index 00000000000000..feca42ca3b8496 --- /dev/null +++ b/src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_buffers.hpp @@ -0,0 +1,29 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "snippets/lowered/pass/pass.hpp" + +namespace ov { +namespace intel_cpu { +namespace pass { + +/** + * @interface InsertBrgemmCopyBuffers + * @brief Insert Brgemm-specific buffers: + * - after BrgemmCopyB with algorithm of allocation size calculation which distinguishes with common algorithm + * - update size of `NewMemory` Buffer - add allocation byte size for repacked data from first input of Brgemm in AMX scenario + * @ingroup snippets + */ +class InsertBrgemmCopyBuffers: public snippets::lowered::pass::RangedPass { +public: + InsertBrgemmCopyBuffers() = default; + OPENVINO_RTTI("InsertBrgemmCopyBuffers", "0", snippets::lowered::pass::RangedPass); + bool run(snippets::lowered::LinearIR& linear_ir, snippets::lowered::LinearIR::constExprIt begin, snippets::lowered::LinearIR::constExprIt end) override; +}; + +} // namespace pass +} // namespace intel_cpu +} // namespace ov diff --git a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp index 8ec0900bc7d176..a014eeb2cecdac 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp +++ 
b/src/plugins/intel_cpu/tests/functional/custom/behavior/ov_executable_network/properties.cpp @@ -327,4 +327,35 @@ TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPUExecutionDevice) { ASSERT_EQ(value.as(), "CPU"); } +TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptions) { + ov::Core ie; + ov::Any type; + ov::Any size; + ov::CompiledModel compiledModel; + model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name()); + model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name()); + OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName)); + OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision)); + OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size)); + ASSERT_EQ(type.as(), ov::element::f16); + ASSERT_EQ(size.as(), 0); +} + +TEST_F(OVClassConfigTestCPU, smoke_CpuExecNetworkCheckCPURuntimOptionsWithCompileConfig) { + ov::Core ie; + ov::Any type; + ov::Any size; + ov::CompiledModel compiledModel; + model->set_rt_info("f16", "runtime_options", ov::hint::kv_cache_precision.name()); + model->set_rt_info("0", "runtime_options", ov::hint::dynamic_quantization_group_size.name()); + ov::AnyMap config; + config[ov::hint::kv_cache_precision.name()] = "u8"; + config[ov::hint::dynamic_quantization_group_size.name()] = "16"; + OV_ASSERT_NO_THROW(compiledModel = ie.compile_model(model, deviceName, config)); + OV_ASSERT_NO_THROW(type = compiledModel.get_property(ov::hint::kv_cache_precision)); + OV_ASSERT_NO_THROW(size = compiledModel.get_property(ov::hint::dynamic_quantization_group_size)); + ASSERT_EQ(type.as(), ov::element::u8); + ASSERT_EQ(size.as(), 16); +} + } // namespace diff --git a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise.cpp b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise.cpp index d7cfe80d22f617..1696f35fc1bc4a 100644 --- a/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise.cpp +++ b/src/plugins/intel_cpu/tests/functional/custom/single_layer_tests/classes/eltwise.cpp @@ -259,6 +259,7 @@ std::string EltwiseLayerCPUTest::getPrimitiveType(const utils::EltwiseTypes& elt (eltwise_type == utils::EltwiseTypes::MULTIPLY) || (eltwise_type == utils::EltwiseTypes::SUBTRACT) || (eltwise_type == utils::EltwiseTypes::DIVIDE) || + (eltwise_type == utils::EltwiseTypes::FLOOR_MOD) || (eltwise_type == utils::EltwiseTypes::MOD)) { return "jit"; } @@ -317,6 +318,8 @@ const std::vector& eltwiseOpTypesBinInp() { utils::EltwiseTypes::SUBTRACT, // TODO: Fix CVS-105430 utils::EltwiseTypes::DIVIDE, // TODO: Fix CVS-105430 utils::EltwiseTypes::FLOOR_MOD, // TODO: Fix CVS-111875 +#elif defined(OPENVINO_ARCH_ARM64) + utils::EltwiseTypes::FLOOR_MOD, #endif utils::EltwiseTypes::SQUARED_DIFF, utils::EltwiseTypes::MOD, diff --git a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp index 764133d52a7fdd..b675a7c2da7d42 100644 --- a/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp +++ b/src/plugins/intel_cpu/tests/functional/shared_tests_instances/skip_tests_config.cpp @@ -563,18 +563,6 @@ std::vector disabledTestPatterns() { retVector.emplace_back(R"(.*smoke_Snippets_MHA.*EnforceBF16.*)"); retVector.emplace_back(R"(.*ConcatSDPTest.*bf16.*)"); } - // [150842] Need to support dynamic K dimension of BF16|INT8 MatMul 
on AMX systems - if (ov::with_cpu_x86_avx512_core_amx()) { - retVector.emplace_back(R"(.*smoke_Snippets_MatMul/MatMul.CompareWithRefImpl/.*IS\[0\]=\[2.2.70.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); - retVector.emplace_back(R"(.*smoke_Snippets_MatMul/MatMul.CompareWithRefImpl/.*IS\[0\]=\[\?.\?.\?.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); - retVector.emplace_back(R"(.*smoke_Snippets_MatMulTransposeB.*IS\[0\]=\[\?.\?.\?.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); - retVector.emplace_back(R"(.*smoke_Snippets_MatMulBias.*IS\[0\]=\[\?.\?.\?.\?\].*T\[0\]=(u8|i8|bf16)_T\[1\]=(i8|bf16).*)"); - - retVector.emplace_back(R"(.*smoke_Snippets_MHAWOTransposeEnforceBF16_3D.*IS\[1\]=\[2.64.\?\].*)"); - retVector.emplace_back(R"(.*smoke_Snippets_MHA.*BF16.*/MHA.*IS\[0\]=\[(\?|1).(\?|4).(\?|12).(\?|64)\].*)"); - retVector.emplace_back(R"(.*smoke_Snippets_MHA.*BF16.*/MHA.*IS\[0\]=\[\?.\?.\?\].*)"); - retVector.emplace_back(R"(.*smoke_Snippets_(MHAINT8MatMul|MHAQuantMatMul0|MHAFQAfterMatMul_4D|smoke_Snippets_MHAFQ).*IS\[0\]=\[\?.\?.\?\.\?].*)"); - } #ifdef SNIPPETS_LIBXSMM_TPP // GN in TPP requires exposing tmp Buffer results outside the loop (ticket: 151234) retVector.emplace_back(R"(.*smoke_Snippets_GroupNormalization.*)"); diff --git a/src/plugins/intel_cpu/tests/unit/shape_inference_test/glu_shape_inference_test.cpp b/src/plugins/intel_cpu/tests/unit/shape_inference_test/glu_shape_inference_test.cpp new file mode 100644 index 00000000000000..f7647d52dc5bae --- /dev/null +++ b/src/plugins/intel_cpu/tests/unit/shape_inference_test/glu_shape_inference_test.cpp @@ -0,0 +1,46 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include + +#include "common_test_utils/test_assertions.hpp" +#include "ov_ops/glu.hpp" +#include "utils.hpp" + +using namespace ov; +using namespace ov::intel_cpu; +using ov::op::v0::Constant; +using ov::op::v0::Parameter; +using testing::HasSubstr; + +TEST(StaticShapeInferenceTest, GLUStaticShapeInferenceTestDefaultCtor) { + constexpr int64_t axis = -1; + constexpr int64_t split_lengths = 48; + + const auto op = std::make_shared(); + const auto data = std::make_shared(element::f16, PartialShape::dynamic()); + + op->set_arguments(ov::OutputVector{data}); + op->set_axis(axis); + op->set_split_lengths(split_lengths); + + std::vector static_input_shapes = {StaticShape{20, 1, 96}}; + const auto static_output_shapes = shape_inference(op.get(), static_input_shapes); + ASSERT_EQ(static_output_shapes.size(), 1); + EXPECT_EQ(static_output_shapes[0], StaticShape({20, 1, 48})); +} + +TEST(StaticShapeInferenceTest, GLUStaticShapeInferenceTestBasic) { + constexpr int64_t axis = -1; + constexpr int64_t split_lengths = 48; + const auto glu_type = ov::op::internal::GLU::GluType::Swish; + + const auto data = std::make_shared(element::f16, PartialShape::dynamic()); + const auto op = std::make_shared(data, axis, split_lengths, glu_type, 1); + + std::vector static_input_shapes = {StaticShape{20, 1, 96}}; + const auto static_output_shapes = shape_inference(op.get(), static_input_shapes); + ASSERT_EQ(static_output_shapes.size(), 1); + EXPECT_EQ(static_output_shapes[0], StaticShape({20, 1, 48})); +} diff --git a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp index e31a8bebb95758..9ace85b3038afa 100644 --- a/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp +++ 
b/src/plugins/intel_cpu/tests/unit/snippets_transformations/x64/lowered/buffer_allocation.cpp @@ -3,6 +3,7 @@ // #include "openvino/opsets/opset.hpp" +#include "openvino/runtime/system_conf.hpp" #include "snippets/snippets_isa.hpp" #include "snippets/lowered/linear_ir.hpp" #include "snippets/lowered/pass/mark_loops.hpp" @@ -17,7 +18,7 @@ #include "transformations/snippets/x64/shape_inference.hpp" #include "transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.hpp" -#include "transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp" +#include "transformations/snippets/x64/pass/lowered/insert_brgemm_copy_buffers.hpp" #include "transformations/snippets/x64/op/brgemm_cpu.hpp" #include "transformations/snippets/x64/op/brgemm_copy_b.hpp" @@ -90,7 +91,7 @@ class BufferAllocationCPUTest : public testing::TestWithParam(m_vector_size); pipeline.register_pass(); pipeline.register_pass(); - pipeline.register_pass(); + pipeline.register_pass(); pipeline.register_pass(); pipeline.register_pass(m_vector_size); pipeline.register_pass(); @@ -255,6 +256,11 @@ TEST_P(MHAFP32BufferAllocationTest, BufferAllocationCPU) { } TEST_P(MHABF16AMXBufferAllocationTest, BufferAllocationCPU) { + // Scratchpad memory for AMX with CopyA (dynamic case) has an allocation size which depends on the element count in a vector register. + // So the current `expected_allocation_size` in the test targets real AVX512 platforms with 512-bit vector registers. + // If the test infrastructure has only AVX2, the allocation size will not match. + if (!with_cpu_x86_avx512_core()) + GTEST_SKIP(); Validate(); } @@ -363,7 +369,7 @@ INSTANTIATE_TEST_SUITE_P(smoke_Snippets_BufferAllocation_MHABF16AMXOptimizedWSpl ::testing::Values(dynamic_shapes), ::testing::Values(true), ::testing::Values(true), - ::testing::Values(32768), // only WSP buffers + ::testing::Values(34816), // only WSP buffers ::testing::Values(3), ::testing::Values(7)), BufferAllocationCPUTest::getTestCaseName); diff --git a/src/plugins/intel_cpu/tools/commit_slider/tests/commit_slider_test.py b/src/plugins/intel_cpu/tools/commit_slider/tests/commit_slider_test.py index 2d342da339e810..c163a314b6d9e7 100644 --- a/src/plugins/intel_cpu/tools/commit_slider/tests/commit_slider_test.py +++ b/src/plugins/intel_cpu/tools/commit_slider/tests/commit_slider_test.py @@ -14,7 +14,8 @@ BenchmarkAppDataUnstable, BenchmarkAppDataStable, BenchmarkAppNoDegradationData,\ BenchmarkAppUnstableDevData, BenchmarkAppWrongPathData, BenchmarkAppPathFoundData,\ BenchmarkFirstFixedAppData, AcModeData, BenchmarkMetricData, CustomizedLogData, \ - MultiConfigData, ConfigMultiplicatorData, ConfigMultiplicatorWithKeyData + MultiConfigData, ConfigMultiplicatorData, ConfigMultiplicatorWithKeyData, \ + AcModeDataBitwise class CommitSliderTest(TestCase): @skip_commit_slider_devtest @@ -81,9 +82,9 @@ def testBmStable(self): self.assertEqual(breakCommit, actualCommit) @skip_commit_slider_devtest - def testACMode(self): + def testACModeBitwise(self): breakCommit, updatedData = getExpectedCommit( - AcModeData()) + AcModeDataBitwise()) actualCommit, _ = getActualCommit(updatedData) self.assertEqual(breakCommit, actualCommit) diff --git a/src/plugins/intel_cpu/tools/commit_slider/tests/test_data.py b/src/plugins/intel_cpu/tools/commit_slider/tests/test_data.py index 50249b1f24ab0f..53bd9d2c8a3dde 100644 --- a/src/plugins/intel_cpu/tools/commit_slider/tests/test_data.py +++ b/src/plugins/intel_cpu/tools/commit_slider/tests/test_data.py @@ -71,7 +71,8 @@ class TestCase(Enum): CustomizedLog =
15, MultiConfig = 16, ConfigMultiplicator = 17, - MultiConfigWithKey = 18 + MultiConfigWithKey = 18, + AcModeDataBitwise = 19 def requireTestData(self, reqLambda): # mapping json to test data holder @@ -190,6 +191,19 @@ def __init__(self): requireBinarySearchData ) +class AcModeDataBitwise(TestData): + def getTestCase(): + return TestData.TestCase.AcModeDataBitwise + + def getTestName(self): + return "ACMode" + + def __init__(self): + from test_util import requireBinarySearchData + self.requireTestData( + requireBinarySearchData + ) + class BenchmarkFirstFixedAppData(TestData): def getTestCase(): return TestData.TestCase.BmFirstFixed diff --git a/src/plugins/intel_cpu/tools/commit_slider/tests/tests_res/fbv_ac_mode_bitwise.cpp b/src/plugins/intel_cpu/tools/commit_slider/tests/tests_res/fbv_ac_mode_bitwise.cpp new file mode 100644 index 00000000000000..856438809f6acb --- /dev/null +++ b/src/plugins/intel_cpu/tools/commit_slider/tests/tests_res/fbv_ac_mode_bitwise.cpp @@ -0,0 +1,42 @@ +#include + +int main () { + const char *patchGenerator = R"V0G0N( +[ + { + "str": "std::cout << \"prefix\\Perplexity: 100.0% [FAILED: abs error = 9.118 | relative error = 0.3144]\\n\";", + "comment": "success_1" + }, + { + "str": "std::cout << \"prefix\\Perplexity: 100.0% [FAILED: abs error = 9.118 | relative error = 0.3144]\\n\";", + "comment": "success_2" + }, + { + "str": "std::cout << \"prefix\\Perplexity: 50.0% [FAILED: abs error = 9.118 | relative error = 0.3144]\\n\";", + "comment": "error_1", + "state": "BREAK" + }, + { + "str": "std::cout << \"prefix\\Perplexity: 50.0% [FAILED: abs error = 9.118 | relative error = 0.3144]\\n\";", + "comment": "error_2" + }, + { + "str": "std::cout << \"prefix\\Perplexity: 50.0% [FAILED: abs error = 9.118 | relative error = 0.3144]\\n\";", + "comment": "error_1" + }, + { + "str": "std::cout << \"prefix\\Perplexity: 50.0% [FAILED: abs error = 9.118 | relative error = 0.3144]\\n\";", + "comment": "error_2" + }, + { + "str": "std::cout << \"prefix\\Perplexity: 50.0% [FAILED: abs error = 9.118 | relative error = 0.3144]\\n\";", + "comment": "error_1" + }, + { + "str": "std::cout << \"prefix\\Perplexity: 50.0% [FAILED: abs error = 9.118 | relative error = 0.3144]\\n\";", + "comment": "error_2" + } +] +)V0G0N"; + return 0; +} \ No newline at end of file diff --git a/src/plugins/intel_cpu/tools/commit_slider/tests/tests_res/tests_res.json b/src/plugins/intel_cpu/tools/commit_slider/tests/tests_res/tests_res.json index 7f357753de17e5..8bdd8bec6a29c9 100644 --- a/src/plugins/intel_cpu/tools/commit_slider/tests/tests_res/tests_res.json +++ b/src/plugins/intel_cpu/tools/commit_slider/tests/tests_res/tests_res.json @@ -345,8 +345,27 @@ } } }, + "ACModeBitwise": { + "repoName": "ACModeBitwise", + "patchedFile": "tests_res/fbv_ac_mode_bitwise.cpp", + "testCfg": { + "appCmd" : "{appCmd}", + "appPath": "{appPath}", + "gitPath" : "{gitPath}", + "buildPath" : "{buildPath}", + "verboseOutput": false, + "runConfig" : { + "commitList" : { + "getCommitListCmd" : "git log {start}..{end} --boundary --pretty=\"%h\"" + }, + "mode" : "ac", + "traversal" : "firstFailedVersion", + "threshold": "80%" + } + } + }, "ACMode": { - "repoName": "BmBinarySearchUnstable", + "repoName": "ACMode", "patchedFile": "tests_res/fbv_ac_mode.cpp", "testCfg": { "appCmd" : "{appCmd}", @@ -359,7 +378,7 @@ "getCommitListCmd" : "git log {start}..{end} --boundary --pretty=\"%h\"" }, "mode" : "ac", - "traversal" : "firstFailedVersion", + "traversal" : "allBreaks", "threshold": "80%" } } diff --git 
a/src/plugins/intel_cpu/tools/commit_slider/utils/modes.py b/src/plugins/intel_cpu/tools/commit_slider/utils/modes.py index 4800d0854ae093..3340f1f597efd8 100644 --- a/src/plugins/intel_cpu/tools/commit_slider/utils/modes.py +++ b/src/plugins/intel_cpu/tools/commit_slider/utils/modes.py @@ -288,7 +288,7 @@ class AccuracyCheckerMode(Mode): def __init__(self, cfg): super().__init__(cfg) self.thresholdPattern = ":\s([0-9]*[.][0-9]*)%.*abs error" - self.breakThroughput = 0 + self.curMetric = None self.createCash() def prepareRun(self, list, cfg): @@ -317,29 +317,21 @@ def prepareRun(self, list, cfg): self.sampleThroughput = float(foundThroughput) return list - def checkCfg(self, cfg): - super().checkCfg(cfg) - if not ("threshold" in cfg["runConfig"]): - raise CfgError("Threshold is not configured") - else: - self.threshold = cfg["runConfig"]["threshold"] - self.threshold = float(self.threshold.strip('%')) - - def compareCommits(self, lCommit: str, rCommit: str, cfg: map): - leftThroughput = self.getPseudoMetric(lCommit, cfg) - rightThroughput = self.getPseudoMetric(rCommit, cfg) - isLeftGood = leftThroughput >= float(self.threshold) - isRightGood = rightThroughput >= float(self.threshold) - if not isRightGood: - self.breakThroughput = rightThroughput + leftMetric = self.getPseudoMetric(lCommit, cfg) + rightMetric = self.getPseudoMetric(rCommit, cfg) + isDiff = leftMetric != rightMetric + if isDiff: + self.curMetric = rightMetric curCommit = rCommit.replace('"', "") commitLogger = getCommitLogger(cfg, curCommit) - commitLogger.info("Current accuracy is {}%".format(rightThroughput)) + commitLogger.info("Current accuracy is {}%".format(rightMetric)) commitLogger.info( - "Commit is {status}".format(status=("bad" if isRightGood else "good")) + "Commit {status} from {c}".format( + status=("differs" if isDiff else "doesn't differ"), + c=lCommit) ) - return isLeftGood != isRightGood + return isDiff def getPseudoMetric(self, commit, cfg): commit = commit.replace('"', "") @@ -372,12 +364,12 @@ def getPseudoMetric(self, commit, cfg): return curThroughput def setOutputInfo(self, pathCommit): - pathCommit.breakThroughput = self.breakThroughput + pathCommit.metric = self.curMetric def getCommitInfo(self, commit): - return "{ci}, throughput = {d}".format( + return "{ci}, metric = {d}".format( ci=super().getCommitInfo(commit), - d=commit.breakThroughput) + d=commit.metric) class CompareBlobsMode(Mode): diff --git a/src/plugins/intel_gpu/src/graph/debug_helper.cpp b/src/plugins/intel_gpu/src/graph/debug_helper.cpp index c2c41fdfab2373..b69d10e137010e 100644 --- a/src/plugins/intel_gpu/src/graph/debug_helper.cpp +++ b/src/plugins/intel_gpu/src/graph/debug_helper.cpp @@ -51,11 +51,13 @@ void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump if (tmp_size == size) { file_stream << "shape: " << size.to_string() << " "; file_stream << "(count: " << size.count() + << ", addr: " << mem->buffer_ptr() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" << (dump_raw ? " raw data" : "") << std::endl; } else { file_stream << "shape: " << tmp_size.to_string() << " "; file_stream << "(count: " << tmp_size.count() + << ", addr: " << mem->buffer_ptr() << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ", original shape: " << size.to_string() << ")" << (dump_raw ? 
" raw data" : "") << std::endl; diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index de7f51b071ae53..65acb0beb66ba0 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -846,6 +846,27 @@ void prepare_buffer_fusing::run(program& p) { if (user_info.first) { node.get_users().front()->set_output_layout(user_info.second); } + + // In case that the rank of weight node of gemm is less than 4 and, + // it transforms to extend to 4 dims by adding 1 to begin(). + // Therefore, the padding of crop_layout should be shifted properly. + const size_t TDIM = 4; + auto user = node.get_users().front(); + bool allow_new_shape_infer = node.get_program().is_new_shape_infer(); + if (!allow_new_shape_infer && user->is_type() && user->get_dependency(1).id().compare(node.id()) == 0) { + auto input_rank = user->get_kernel_impl_params()->typed_desc()->weight_rank; + if (input_rank < TDIM) { + std::vector l_pad = {0, 0, 0, 0}; + std::vector u_pad = {0, 0, 0, 0}; + + //shift right + size_t shift_right = TDIM - input_rank; + std::copy_n(crop_layout.data_padding._lower_size.begin(), l_pad.size() - shift_right, l_pad.begin() + shift_right); + std::copy_n(crop_layout.data_padding._upper_size.begin(), u_pad.size() - shift_right, u_pad.begin() + shift_right); + + crop_layout.data_padding = padding(l_pad, u_pad); + } + } } node.set_output_layout(crop_layout); node.can_be_optimized(true); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp index 9b66eee4f90a01..e13978a9983c77 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/primitive_base.hpp @@ -92,7 +92,6 @@ struct typed_primitive_impl_ocl : public typed_primitive_impl { // concat buffer fusing for dynamic shape is adaptively applied at runtime. So we need to build dynamic impl at build time. 
if (impl_param.can_be_optimized() && !((impl_param.is_type() || - impl_param.is_type() || impl_param.is_type() || impl_param.runtime_skippable()) && impl_param.is_dynamic())) { return make_unique(kernel_selector::kernel_data{}); diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp index a2d3f878e1cc69..dad93d94946490 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/scaled_dot_product_attention.cpp @@ -44,10 +44,14 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveGetUpdateDispatchDataFunc(_kernels_data[default_sdpa]); - if (_kernels_data.size() == 2) { + if (_kernels_data.size() >= 2) { auto bt_kernel_impl = kernel_selector.GetImplementation(_kernels_data[indirect_sdpa].kernelName); bt_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[indirect_sdpa]); } + if (_kernels_data.size() == 3) { + auto bt_kernel_impl = kernel_selector.GetImplementation(_kernels_data[2].kernelName); + bt_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[2]); + } } } @@ -192,11 +196,37 @@ struct scaled_dot_product_attention_impl : multi_stage_primitiveget_input_layout(0); + + auto get_reordered_dimension = [](const ov::PartialShape& pshape, const std::vector& order, size_t idx) -> const ov::Dimension& { + if (order.empty()) + return pshape[idx]; + + return pshape[order[idx]]; + }; + + const auto& desc = instance.get_impl_params()->typed_desc(); + const auto dim_L = get_reordered_dimension(query_layout.get_partial_shape(), desc->input_q_transpose_order, 2 /* y */); + + bool is_generate = dim_L.get_length() == 1; // L + return is_generate; + } + event::ptr execute_impl(const std::vector& events, scaled_dot_product_attention_inst& instance) override { - if (need_indirect_load(instance)) + if (need_indirect_load(instance)) { return execute_stage(events, instance, indirect_sdpa); - else + } else if (need_sdpa_opt_load(instance)) { + return execute_stage(events, instance, _kernels_data.size() -1 /* the last */); + } else { return execute_stage(events, instance, default_sdpa); + } } static kernel_selector::sdpa_configuration get_sdpa_configuration(const kernel_impl_params& impl_param) { @@ -333,6 +363,12 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive(kernels_data); } @@ -344,13 +380,16 @@ struct scaled_dot_product_attention_impl : multi_stage_primitive= 2) { if (_kernels_data[indirect_sdpa].params == nullptr) { _kernels_data[indirect_sdpa].params = std::make_shared(get_kernel_params(impl_param, true)); } update_shapes(*_kernels_data[indirect_sdpa].params, impl_param); (_kernels_data[indirect_sdpa].update_dispatch_data_func)(*_kernels_data[indirect_sdpa].params, _kernels_data[indirect_sdpa]); } + if (_kernels_data.size() == 3) { + (_kernels_data[2].update_dispatch_data_func)(*_kernels_data[default_sdpa].params, _kernels_data[2]); + } } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp b/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp index 767128a5be2950..c4c27161b89fe4 100644 --- a/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/onednn/gemm_onednn.cpp @@ -186,8 +186,15 @@ struct gemm_onednn : typed_primitive_onednn_impl { if (ret) { tag = convert_data_format(transposed_format); dnnl::memory::dims original_dims = dims; - for (size_t i = 0; i < original_dims.size(); ++i) { - dims[i] = 
original_dims[order[i]]; + if (is_input) { + for (size_t i = 0; i < original_dims.size(); ++i) { + dims[i] = original_dims[order[i]]; + } + } else { + // Get non-transposed dims for output dims + for (size_t i = 0; i < original_dims.size(); ++i) { + dims[order[i]] = original_dims[i]; + } } } else { std::ostringstream ostream; diff --git a/src/plugins/intel_gpu/src/graph/include/pass_manager.h b/src/plugins/intel_gpu/src/graph/include/pass_manager.h index 9850c25a64ec5d..61c34c0eff548f 100644 --- a/src/plugins/intel_gpu/src/graph/include/pass_manager.h +++ b/src/plugins/intel_gpu/src/graph/include/pass_manager.h @@ -307,13 +307,8 @@ class memory_dependency_pass : public base_pass { if ((node->can_be_optimized() && !node->is_runtime_skippable()) || !dep->can_be_optimized()) { node->add_memory_dependency(static_cast(dep->get_unique_id())); } else { - if (node->is_runtime_skippable() || dep->is_runtime_skippable()) { + if (node->is_runtime_skippable() || dep->is_runtime_skippable() || dep->can_be_optimized()) { node->add_memory_dependency(static_cast(dep->get_unique_id())); - for (const auto& subdep : dep->get_dependencies()) { - add_memory_dependency(node, subdep.first); - add_memory_dependency(subdep.first, node); - } - return; } for (const auto& subdep : dep->get_dependencies()) { diff --git a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h index fffcd100d9691c..8b8c4e6b0b6e97 100644 --- a/src/plugins/intel_gpu/src/graph/include/primitive_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/primitive_inst.h @@ -427,7 +427,7 @@ class primitive_inst { bool use_async_compilation(); // if primitive_inst doesn't replace impl to new impl(static impl with opt kerenl or dynamic impl), return false void update_impl(bool use_async_compilation); - void realloc_if_needed(); + void realloc_if_needed(bool prev_execution_skipped = false); cldnn::network::ptr get_unfused_subgraph(); @@ -481,6 +481,8 @@ class primitive_inst { return false; } + void clear_output_memory(); + // This could be implemented via single map std::unordered_map> // but the overhead on using perf_counter_key as map key is too big, thus we use hash as map key // and store mapping onto original perf_clounter_key for further data analysis and dumps diff --git a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp index b51c7825b5a8fa..5680eedcb8f87c 100644 --- a/src/plugins/intel_gpu/src/graph/primitive_inst.cpp +++ b/src/plugins/intel_gpu/src/graph/primitive_inst.cpp @@ -549,7 +549,12 @@ bool primitive_inst::all_dependencies_cpu_impl() const { return check_all_deps_cpu(this); } -void primitive_inst::realloc_if_needed() { +void primitive_inst::clear_output_memory() { + _outputs[0] = nullptr; + _max_output_layout_count[0] = 0; +} + +void primitive_inst::realloc_if_needed(bool prev_execution_skipped) { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("realloc_if_needed: " + id())); GPU_DEBUG_GET_INSTANCE(debug_config); GPU_DEBUG_PROFILED_STAGE(instrumentation::pipeline_stage::memory_allocation); @@ -618,6 +623,15 @@ void primitive_inst::realloc_if_needed() { _max_output_layout_count[j] = 0; } } else { + _outputs[0] = variable.get_memory(); + + if (auto compressed_cache_variable = dynamic_cast(&variable)) { + _outputs[2] = compressed_cache_variable->get_compression_scale_state()->get_memory(); + + if (compressed_cache_variable->has_zp_state()) { + _outputs[3] = 
compressed_cache_variable->get_compression_zp_state()->get_memory(); + } + } GPU_DEBUG_TRACE_DETAIL << id() << " : realloc_if_needed: can_be_optimized = false and memories are not being shared" << std::endl; } } else { @@ -738,21 +752,15 @@ void primitive_inst::realloc_if_needed() { // Clear out memory if was previously reused, but now primitive can't be optimized if (!_node->is_type() && (_node->is_runtime_skippable() || _node->is_type())) { - std::function reset_user_output_memory; - reset_user_output_memory = [&](cldnn::primitive_inst* curr_inst, cldnn::memory::ptr input_mem_ptr) { - auto curr_output_memory_ptr = curr_inst->output_memory_ptr(0); - if (curr_inst->can_be_optimized() - && (curr_output_memory_ptr - && get_network().get_engine().is_the_same_buffer(*curr_output_memory_ptr, *input_mem_ptr))) { - if (curr_inst->mem_allocated()) { - get_network().get_memory_pool().release_memory(curr_inst->_outputs[0].get(), - curr_inst->get_node().get_unique_id(), curr_inst->id(), get_network_id()); - _mem_allocated = false; - } - curr_inst->_outputs[0] = nullptr; - curr_inst->_max_output_layout_count[0] = 0; - for (auto& user_inst : curr_inst->get_user_insts()) { - reset_user_output_memory(user_inst, input_mem_ptr); + std::function reset_user_output_memory + = [&](cldnn::primitive_inst* curr_inst, cldnn::memory::ptr target_mem_ptr) { + for (auto& user_inst : curr_inst->get_user_insts()) { + auto curr_output_memory_ptr = user_inst->output_memory_ptr(0); + if (user_inst->can_be_optimized() + && (curr_output_memory_ptr + && get_network().get_engine().is_the_same_buffer(*curr_output_memory_ptr, *target_mem_ptr))) { + user_inst->clear_output_memory(); + reset_user_output_memory(user_inst, target_mem_ptr); } } }; @@ -766,9 +774,7 @@ void primitive_inst::realloc_if_needed() { // * iter1: node1(skipped) -> node2(skipped) -> node3(executed) if (_outputs[0] && dep_memory_ptr(0) && !_network.get_engine().is_the_same_buffer(dep_memory(0), output_memory(0))) { - for (auto& user_inst : get_user_insts()) { - reset_user_output_memory(user_inst, dep_memory_ptr(0)); - } + reset_user_output_memory(this, dep_memory_ptr(0)); } return; } else if (_outputs[0] && dep_memory_ptr(0) && @@ -778,16 +784,22 @@ void primitive_inst::realloc_if_needed() { get_node().get_unique_id(), id(), get_network_id()); _mem_allocated = false; } - _outputs[0] = nullptr; - _max_output_layout_count[0] = 0; + clear_output_memory(); // Check users recursively and if the users is can_be_optimized && runtime_skippable // && output_memory of user is same as current input memory, // then reset the users output memory too. // Ex. // * iter0: node1(skipped) -> node2(skipped) -> node3(skipped) // * iter1: node1(executed) -> node2(skipped) -> node3(executed) - for (auto& user_inst : get_user_insts()) { - reset_user_output_memory(user_inst, dep_memory_ptr(0)); + reset_user_output_memory(this, dep_memory_ptr(0)); + } else { + // When this inst was not executed in the previous iteration, + // reset the output memory because the current output memory is invalid.
+ if (prev_execution_skipped) { + if (_outputs[0]) { + reset_user_output_memory(this, _outputs[0]); + } + clear_output_memory(); } } } @@ -1389,7 +1401,7 @@ void primitive_inst::do_runtime_in_place_kv_cache() { void primitive_inst::do_runtime_skip_gather() { // Check pattern if (!get_node().is_type() - || !get_node().can_be_optimized() + || !get_node().is_runtime_skippable() || _impl_params->has_fused_primitives() || _impl_params->get_input_layout(0).data_type != _impl_params->get_output_layout().data_type || get_node().get_dependency(1).is_constant() || get_node().get_dependency(1).is_type()) @@ -1461,7 +1473,6 @@ void primitive_inst::do_runtime_skip_permute() { // Check pattern if (!get_node().is_type() || is_output() - || !get_node().can_be_optimized() || !get_node().is_runtime_skippable() || _impl_params->has_fused_primitives() || _impl_params->get_input_layout(0).data_type != _impl_params->get_output_layout().data_type) @@ -1501,7 +1512,7 @@ void primitive_inst::do_runtime_skip_permute() { void primitive_inst::do_runtime_skip_strided_slice() { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_skip_strided_slice: " + id())); // Check pattern - if (!get_node().is_type() || !get_node().can_be_optimized()) + if (!get_node().is_type() || !get_node().is_runtime_skippable()) return; GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_strided_slice] " << id() << " : check optimizability" << std::endl; @@ -1525,7 +1536,7 @@ void primitive_inst::do_runtime_skip_strided_slice() { void primitive_inst::do_runtime_skip_broadcast() { OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, openvino::itt::handle("do_runtime_skip_broadcast: " + id())); // Check pattern - if (!get_node().is_type() || !get_node().can_be_optimized()) + if (!get_node().is_type() || !get_node().is_runtime_skippable()) return; GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_broadcast] " << id() << " : check optimizability" << std::endl; @@ -1634,7 +1645,7 @@ void primitive_inst::do_runtime_skip_scatter_update() { if (!(get_node().is_type() || get_node().is_type() || get_node().is_type()) - || !get_node().can_be_optimized()) + || !get_node().is_runtime_skippable()) return; GPU_DEBUG_TRACE_DETAIL << "[do_runtime_skip_scatter_update] " << id() << " : check optimizability" << std::endl; @@ -1780,6 +1791,10 @@ void primitive_inst::prepare_primitive() { } GPU_DEBUG_TRACE_DETAIL << "-----------------------------------------------------------------" << std::endl; + // If this inst was optimized out or skipped due to a zero dimension in the previous iteration, + // set this flag to true so that realloc_if_needed() resets the output memory.
+ const bool prev_execution_skipped = can_be_optimized() + || (_impl_params->output_layouts[0].is_static() && _impl_params->output_layouts[0].count() == 0); const auto orig_outputs = _outputs; if ((is_dynamic() || _node->is_in_shape_of_subgraph()) && !has_inner_networks()) { do_runtime_in_place_concat(); @@ -1839,7 +1854,7 @@ void primitive_inst::prepare_primitive() { update_impl(can_use_async_compilation); if (get_flag(ExecutionFlags::IMPL_CHANGED)) { update_weights(); - realloc_if_needed(); + realloc_if_needed(prev_execution_skipped); } } @@ -1848,7 +1863,7 @@ void primitive_inst::prepare_primitive() { if (_node->is_type() && !get_flag(ExecutionFlags::IMPL_CHANGED) && _impl->requires_update(*this, *_impl_params)) { _impl->update(*this, *_impl_params); - realloc_if_needed(); + realloc_if_needed(prev_execution_skipped); } OPENVINO_ASSERT(_impl_params->get_output_layout().is_static(), diff --git a/src/plugins/intel_gpu/src/graph/swiglu.cpp b/src/plugins/intel_gpu/src/graph/swiglu.cpp index e82e4e974b1868..ffd5333318cee4 100644 --- a/src/plugins/intel_gpu/src/graph/swiglu.cpp +++ b/src/plugins/intel_gpu/src/graph/swiglu.cpp @@ -3,6 +3,7 @@ // #include "ov_ops/glu.hpp" +#include "glu_shape_inference.hpp" #include "swiglu_inst.h" #include "primitive_type_base.h" @@ -32,11 +33,7 @@ std::vector swiglu_inst::calc_output_layouts(swiglu_node const& /*node*/ op.set_axis(desc->axis); op.set_split_lengths(desc->split_lengths); - std::vector input_shapes = { - impl_param.get_input_layout(0).get(), - ShapeType(ov::Shape({})), - ShapeType(ov::Shape{2}) - }; + std::vector input_shapes = {impl_param.get_input_layout(0).get()}; std::vector output_shapes = shape_infer(&op, input_shapes); diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/resample_onnx.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/resample_onnx.cl index 3a7fb8be91954b..9f8f2ad5964bda 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/resample_onnx.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/resample_onnx.cl @@ -64,9 +64,6 @@ KERNEL (resample_onnx)(__global INPUT0_TYPE* input, const int in_size[5] = { INPUT0_BATCH_NUM, INPUT0_FEATURE_NUM, INPUT0_SIZE_Z, INPUT0_SIZE_Y, INPUT0_SIZE_X }; - if (feature_num >= OUTPUT_FEATURE_NUM) - return; - const int PADDED_Y = INPUT0_SIZE_Y + PADS_BEGIN[3] + PADS_END[3]; const int PADDED_X = INPUT0_SIZE_X + PADS_BEGIN[4] + PADS_END[4]; const ACCUMULATOR_TYPE iy = FUNC_CALL(get_original_coordinate)(y, SCALES[3], OUTPUT_SIZE_Y, PADDED_Y); diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_blocked_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_blocked_opt.cpp index 411e6878ebd72e..d95b008171db24 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_blocked_opt.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/eltwise/eltwise_kernel_blocked_opt.cpp @@ -112,7 +112,7 @@ bool EltwiseKernel_blocked_opt::Validate(const Params& params) const { } const auto vec_size = SelectVecSizeFromFormat(ewParams.outputs[0]); - const auto input0 = ewParams.inputs[0]; + const auto& input0 = ewParams.inputs[0]; const auto& output = ewParams.outputs[0]; // Check that padding before features doesn't mis-align the blocks if (input0.Feature().pad.before % vec_size != 0 || output.Feature().pad.before % vec_size != 0) @@ -137,11 +137,22 @@ bool EltwiseKernel_blocked_opt::Validate(const Params& params) const { }; for (size_t i = 1; i < ewParams.inputs.size(); i++) { - if 
(ewParams.inputs[i].LogicalSize() == input0.LogicalSize() && !(compareTensors(ewParams.inputs[i], input0))) + const auto& input = ewParams.inputs[i]; + if (input.LogicalSize() == input0.LogicalSize() && !(compareTensors(input, input0))) return false; - if (ewParams.inputs[i].Feature().pad.before % vec_size != 0) { + if (input.Feature().pad.before % vec_size != 0) { return false; } + if (input.GetLayout() == DataLayout::bfyx) { + bool is_valid = input.LogicalSize() == 1; // Scalar value broadcast + is_valid |= input.LogicalSize() % vec_size == 0 && // Feature value broadcast + input.LogicalSize() == input.Feature().v && + input.LogicalSize() == output.Feature().v && + GetInnerBatchBlockSize(input) == 1; + if (!is_valid) { + return false; + } + } } return true; @@ -422,6 +433,7 @@ static inline int SelectVecSizeFromFormat(const DataTensor& tensor) { static inline int GetInnerBatchBlockSize(const DataTensor& tensor) { auto layout = tensor.GetLayout(); switch (layout) { + case DataLayout::bfyx: case DataLayout::b_fs_yx_fsv4: case DataLayout::b_fs_yx_fsv16: case DataLayout::b_fs_zyx_fsv16: diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp index e2a538750d1615..ed0ba87f8f22af 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.cpp @@ -107,7 +107,7 @@ JitConstants SDPAKernelBase::GetJitConstants(const sdpa_params& params) const { }; auto use_index_calc_func = [&](const std::vector order, bool is_query = false) { - if (!params.input0_order.empty() && !is_default_order(params.input0_order)) + if (!order.empty() && !is_default_order(order)) return true; if (params.conf.broadcast_axis != -1) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h index 493bd0acedea32..5cd9c384ff2709 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_base.h @@ -120,6 +120,7 @@ struct sdpa_params : public base_params { DataTensor value_cache_comp_zp; sdpa_configuration conf; + bool should_use_sdpa_opt = false; }; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp index 838d34bbf85404..467dd71da37944 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/sdpa/sdpa_kernel_micro.cpp @@ -316,6 +316,9 @@ bool SDPAKernelMicro::Validate(const Params& p) const { const sdpa_params& params = static_cast(p); + if (params.should_use_sdpa_opt) + return false; + if (params.conf.is_paged_attention) return false; diff --git a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp index 561822f9661109..6903b52963a879 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations/kv_cache_compression.cpp @@ -133,7 +133,7 @@ class KVCacheCompressionMatcher : public ov::pass::MatcherPass { 
KVCacheCompressionMatcher::KVCacheCompressionMatcher(ov::element::Type compression_dt) { using namespace ov::pass::pattern; - if (compression_dt != element::i8) + if (compression_dt != element::i8 && compression_dt != element::u8) return; const auto quantization_type = ov::op::internal::DynamicQuantize::QuantizationType::Asymmetric; diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/transpose_matmul_fusion.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/transpose_matmul_fusion.cpp index b55c9e00bdab64..cc28dbab3660b9 100644 --- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/transpose_matmul_fusion.cpp +++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/transpose_matmul_fusion.cpp @@ -96,3 +96,95 @@ TEST_P(TransposeMatMulFusionOnGPU, CompareWithRefs){ }; } // namespace + + +//================================================================================= +// Transpose + MatMul + Transpose pattern fusion (TransposeMatMulTransposeMatcher) +//================================================================================= +namespace ov { +namespace test { + +using MatMulTransposeFusionParams = std::tuple; // input C shapes +class MatMulTransposeFusionOnGPU: public testing::WithParamInterface, + virtual public ov::test::SubgraphBaseTest { +public: + static std::string getTestCaseName(testing::TestParamInfo obj) { + ov::PartialShape input0; + ov::PartialShape input1; + ov::PartialShape input2; + + std::tie(input0, input1, input2) = obj.param; + + std::ostringstream result; + result << "device=(" << std::string(utils::DEVICE_GPU) << ")_"; + result << ov::test::utils::partialShape2str({input0}) << "_"; + result << ov::test::utils::partialShape2str({input1}) << "_"; + result << ov::test::utils::partialShape2str({input2}) << "_"; + return result.str(); + } +protected: + void SetUp() override { + targetDevice = ov::test::utils::DEVICE_GPU; + + ov::PartialShape shape1; + ov::PartialShape shape2; + ov::PartialShape shape3; + + std::tie(shape1, shape2, shape3) = GetParam(); + + InputShape input_shape1 = {shape1, {shape1.get_shape()}}; + InputShape input_shape2 = {shape2, {shape2.get_shape()}}; + InputShape input_shape3 = {shape3, {shape3.get_shape()}}; + init_input_shapes({input_shape1, input_shape2, input_shape3}); + + const auto param1 = std::make_shared(ov::element::f16, shape1); + const auto param2 = std::make_shared(ov::element::f16, shape2); + const auto param3 = std::make_shared(ov::element::f16, shape3); + + auto input2_shape = shape2.get_shape(); + + //input0 + const auto input0_order = ov::op::v0::Constant::create(ov::element::i32, Shape{4}, {1, 0, 2, 3}); + const auto input0_transpose = std::make_shared(param1, input0_order); + const auto input0_shape_pattern = ov::op::v0::Constant::create(ov::element::i32, Shape{4}, input2_shape); + const auto input0_reshape = std::make_shared(input0_transpose, input0_shape_pattern, false); + + //input1 + const auto input1_order = ov::op::v0::Constant::create(ov::element::i32, Shape{4}, {0, 1, 3, 2}); + const auto input1_transpose = std::make_shared(param2, input1_order); + + // matmul & softmax + const auto matmul1 = std::make_shared(input0_reshape, input1_transpose, false, false); + const auto softmax = std::make_shared(matmul1, -1); + + // input3 + const auto input3_transpose = std::make_shared(param3, input0_order); + const auto input3_shape_pattern = ov::op::v0::Constant::create(ov::element::i32, Shape{4}, 
input2_shape); + const auto input3_reshape = std::make_shared(input3_transpose, input3_shape_pattern, false); + + // target matmul + const auto matmul2 = std::make_shared(softmax, input3_reshape, false, false); + const auto order = ov::op::v0::Constant::create(ov::element::i32, Shape{4}, {2, 0, 1, 3}); + const auto transpose = std::make_shared(matmul2, order); + + function = std::make_shared(transpose, ov::ParameterVector{param1, param2, param3}); + } +}; + + +} // namespace test +} // namespace ov + + +namespace { +INSTANTIATE_TEST_SUITE_P(smoke_MatMulTransposeFusion, MatMulTransposeFusionOnGPU, + ::testing::Values( + MatMulTransposeFusionParams({3, 8, 16, 1}, {2, 4, 3, 16}, {3, 8, 16, 1})), + MatMulTransposeFusionOnGPU::getTestCaseName); + +TEST_P(MatMulTransposeFusionOnGPU, CompareWithRefs){ + run(); +}; +} // namespace diff --git a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp index 965313126d4362..89b3d38f5051d3 100644 --- a/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp +++ b/src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/scaled_dot_product_attention.cpp @@ -25,7 +25,8 @@ typedef std::tuple, // shape bool, // is_causal bool, // has_attn - bool // has_scale + bool, // has_scale + std::vector> // input_transpose > ScaledAttnGPUTestParams; class ScaledAttnLayerGPUTest : public testing::WithParamInterface, @@ -36,6 +37,7 @@ class ScaledAttnLayerGPUTest : public testing::WithParamInterface& targetInputStaticShapes) override; + void transpose_prepare(std::vector& shapes, const std::vector>& input_transpose); bool is_causal; bool has_attn; bool has_scale; @@ -44,11 +46,14 @@ class ScaledAttnLayerGPUTest : public testing::WithParamInterface& obj) { ov::element::Type inType; std::vector inputShapes; + std::vector> input_transpose; bool is_causal; bool has_attn; bool has_scale; - std::tie(inType, inputShapes, is_causal, has_attn, has_scale) = obj.param; + bool transpose_enable; + std::tie(inType, inputShapes, is_causal, has_attn, has_scale, input_transpose) = obj.param; + transpose_enable = (input_transpose.size() != 0); std::ostringstream result; result << "netPRC=" << inType << "_"; result << "IS="; @@ -65,6 +70,7 @@ std::string ScaledAttnLayerGPUTest::getTestCaseName(const testing::TestParamInfo result << "is_causal=" << is_causal << "_"; result << "has_attn=" << has_attn << "_"; result << "has_scale=" << has_scale << "_"; + result << "with_transpose" << transpose_enable << "_"; return result.str(); } @@ -72,17 +78,19 @@ std::string ScaledAttnLayerGPUTest::getTestCaseName(const testing::TestParamInfo void ScaledAttnLayerGPUTest::SetUp() { ov::element::Type inType; std::vector inputShapes; + std::vector> input_transpose; targetDevice = ov::test::utils::DEVICE_GPU; - std::tie(inType, inputShapes, is_causal, has_attn, has_scale) = this->GetParam(); + std::tie(inType, inputShapes, is_causal, has_attn, has_scale, input_transpose) = this->GetParam(); + transpose_prepare(inputShapes, input_transpose); init_input_shapes(inputShapes); ov::ParameterVector inputParams; // q, k, v inputParams.push_back(std::make_shared(inType, inputDynamicShapes[0])); inputParams.push_back(std::make_shared(inType, inputDynamicShapes[1])); - inputParams.push_back(std::make_shared(inType, inputDynamicShapes[1])); + inputParams.push_back(std::make_shared(inType, inputDynamicShapes[2])); 
inputParams[0]->set_friendly_name("q"); inputParams[1]->set_friendly_name("k"); inputParams[2]->set_friendly_name("v"); @@ -96,7 +104,7 @@ void ScaledAttnLayerGPUTest::SetUp() { inputParams.back()->set_friendly_name("scale"); } else { if (has_attn) { - inputParams.push_back(std::make_shared(inType, inputDynamicShapes[2])); + inputParams.push_back(std::make_shared(inType, inputDynamicShapes[3])); inputParams.back()->set_friendly_name("attention_mask"); } if (has_scale) { @@ -106,9 +114,31 @@ void ScaledAttnLayerGPUTest::SetUp() { } } - ov::OutputVector inputs; + ov::OutputVector inputParams_transpose; for (size_t i = 0; i < inputParams.size(); i++) { - inputs.push_back(inputParams[i]); + inputParams_transpose.push_back(inputParams[i]); + } + if (input_transpose.size() != 0) { + // deal with transpose. + auto tranpose_a_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, input_transpose[0]); + auto tranpose_a = std::make_shared(inputParams[0], tranpose_a_const); + tranpose_a->set_friendly_name("tranpose_a"); + inputParams_transpose[0] = tranpose_a; + + auto tranpose_b_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, input_transpose[1]); + auto tranpose_b = std::make_shared(inputParams[1], tranpose_b_const); + tranpose_b->set_friendly_name("tranpose_b"); + inputParams_transpose[1] = tranpose_b; + + auto tranpose_c_const = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, input_transpose[2]); + auto tranpose_c = std::make_shared(inputParams[2], tranpose_c_const); + tranpose_c->set_friendly_name("tranpose_c"); + inputParams_transpose[2] = tranpose_c; + } + + ov::OutputVector inputs; + for (size_t i = 0; i < inputParams_transpose.size(); i++) { + inputs.push_back(inputParams_transpose[i]); } auto sdp = std::make_shared(inputs, is_causal); @@ -141,17 +171,53 @@ void ScaledAttnLayerGPUTest::SetUp() { } } +void ScaledAttnLayerGPUTest::transpose_prepare(std::vector& shapes, + const std::vector>& input_transpose) { + auto transpose_pshape = [](InputShape& pshapes, const std::vector& order) { + auto transposed_pshape = ov::PartialShape::dynamic(pshapes.first.rank()); + std::vector transposed_cshapes(pshapes.second); + auto& pshape = pshapes.first; + auto& cshape = pshapes.second; + for (size_t i = 0; i < order.size(); i++) { + transposed_pshape[i] = pshape[order[i]]; + for (size_t j = 0; j < cshape.size(); j++) { + transposed_cshapes[j][i] = cshape[j][order[i]]; + } + } + + for (size_t i = 0; i < order.size(); i++) { + pshape[i] = transposed_pshape[i]; + for (size_t j = 0; j < cshape.size(); j++) { + cshape[j][i] = transposed_cshapes[j][i]; + } + } + }; + + if (shapes.empty()) { + return; + } + + shapes.insert(shapes.begin()+1, shapes[1]); + if (input_transpose.empty()) { + return; + } + + for (size_t i = 0; i < input_transpose.size(); i++) { + transpose_pshape(shapes[i], input_transpose[i]); + } +} + void ScaledAttnLayerGPUTest::generate_inputs(const std::vector& targetInputStaticShapes) { std::vector shapes(3); shapes[0] = targetInputStaticShapes[0]; shapes[1] = targetInputStaticShapes[1]; - shapes[2] = targetInputStaticShapes[1]; + shapes[2] = targetInputStaticShapes[2]; if (!has_attn && has_scale) { shapes.push_back(ov::Shape{}); shapes.push_back(ov::Shape{1}); } else { if (has_attn) { - shapes.push_back(targetInputStaticShapes[2]); + shapes.push_back(targetInputStaticShapes[3]); } if (has_scale) { shapes.push_back(ov::Shape{1}); @@ -163,10 +229,11 @@ void ScaledAttnLayerGPUTest::generate_inputs(const std::vector& targe TEST_P(ScaledAttnLayerGPUTest, 
CompareWithRefs) { ov::element::Type inType; std::vector inputShapes; + std::vector> input_transpose; bool is_causal; bool has_attn; bool has_scale; - std::tie(inType, inputShapes, is_causal, has_attn, has_scale) = this->GetParam(); + std::tie(inType, inputShapes, is_causal, has_attn, has_scale, input_transpose) = this->GetParam(); run(); } @@ -261,11 +328,15 @@ const std::vector> shapes{ }, }; +const std::vector> disable_transpose{}; +const std::vector> enable_transpose{{0, 1, 2, 3}, {0, 1, 2, 3}, {0, 2, 1, 3}}; + const auto params = testing::Combine(testing::Values(ov::element::f16 /*, ov::element::f32 */), testing::ValuesIn(shapes), testing::Values(true, false), testing::Values(true, false), - testing::Values(true, false)); + testing::Values(true, false), + testing::ValuesIn({disable_transpose, enable_transpose})); INSTANTIATE_TEST_SUITE_P(smoke_ScaledAttn_GPU, ScaledAttnLayerGPUTest, diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp index 77477648fd4860..4945cc8d717be3 100644 --- a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/kv_cache.cpp @@ -530,4 +530,81 @@ TEST_F(KVCacheTests, smoke_multipleIterations_stateful_with_set_state) { this->test_smoke_multipleIterations_stateful(false, true, true, 1, 2, ov::element::f16, 5, 1, true); } +class KVCacheIssueTests: public ::testing::Test { +public: + void test_smoke_conflicted_memory_for_two_inf_req() { + #if defined(ANDROID) + GTEST_SKIP(); + #endif + auto core = ov::test::utils::PluginCache::get().core(); + + ov::AnyMap properties = { + ov::hint::kv_cache_precision(ov::element::undefined) + }; + + const size_t n_batch = 1; + const size_t n_heads = 32; + const size_t n_features = 10; + const size_t context_size = 20; + ov::element::Type element_type = ov::element::f16; + + const bool stateful = true; + + auto model = tests::make_llm_kv_cache_pattern(n_batch, + n_heads, + n_features, + element_type, + 2, + stateful, + false, + stateful); + auto compiled_model = core->compile_model(model, ov::test::utils::DEVICE_GPU, properties); + + auto input0 = model->get_parameters().at(0); + auto input1 = model->get_parameters().at(1); + + auto ireq1 = compiled_model.create_infer_request(); + auto ireq2 = compiled_model.create_infer_request(); + + auto ireq1_input0 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, + {n_batch, context_size, n_heads, n_features}, -0.5f, 0.5f, 1); + auto ireq1_input1 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, + {n_batch, n_heads, context_size, context_size}, -0.5f, 0.5f, 1); + ireq1.set_tensor(input0, ireq1_input0); + ireq1.set_tensor(input1, ireq1_input1); + + auto ireq2_input0 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, + {n_batch, context_size + 1, n_heads, n_features}, -0.5f, 0.5f, 555); + auto ireq2_input1 = ov::test::utils::create_and_fill_tensor_real_distribution(element_type, + {n_batch, n_heads, context_size + 1, context_size + 1}, -0.5f, 0.5f, 555); + ireq2.set_tensor(input0, ireq2_input0); + ireq2.set_tensor(input1, ireq2_input1); + + std::stringstream oss1; + std::stringstream oss2; + for (auto&& state : ireq1.query_state()) { + state.reset(); + } + ireq1.infer(); + for (auto&& state : ireq1.query_state()) { + oss1.write(reinterpret_cast(state.get_state().data()), state.get_state().get_byte_size()); + } + + for (auto&& 
state : ireq2.query_state()) { + state.reset(); + } + ireq2.infer(); + for (auto&& state : ireq1.query_state()) { + oss2.write(reinterpret_cast(state.get_state().data()), state.get_state().get_byte_size()); + } + + ASSERT_TRUE(oss1.str() == oss2.str()); + } +}; + +TEST_F(KVCacheIssueTests, conflicted_memory_for_two_inf_req) { + this->test_smoke_conflicted_memory_for_two_inf_req(); +} + + } // namespace diff --git a/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/split_reshape_eltwise.cpp b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/split_reshape_eltwise.cpp new file mode 100644 index 00000000000000..9e98541283dcc2 --- /dev/null +++ b/src/plugins/intel_gpu/tests/functional/subgraph_tests/dynamic/split_reshape_eltwise.cpp @@ -0,0 +1,134 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "shared_test_classes/base/ov_subgraph.hpp" +#include "common_test_utils/ov_tensor_utils.hpp" +#include "common_test_utils/node_builders/eltwise.hpp" + +#include "openvino/op/parameter.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/result.hpp" +#include "openvino/op/variadic_split.hpp" + +namespace { +using ov::test::InputShape; + +typedef std::tuple< + std::vector, // input shapes + size_t, // split axis + ov::element::Type, // Model type + std::string // Device name +> SplitReshapeEltwiseTestParams; + +const std::vector model_precisions = { + ov::element::f16 +}; + +class SplitReshapeEltwiseTest : public testing::WithParamInterface, + virtual public ov::test::SubgraphBaseTest { +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + SplitReshapeEltwiseTestParams test_params = obj.param; + std::ostringstream result; + std::vector input_shapes; + size_t axis; + ov::element::Type precision; + std::string target_device; + + std::tie(input_shapes, axis, precision, target_device) = test_params; + result << "IS="; + for (const auto& shape : input_shapes) { + result << ov::test::utils::partialShape2str({shape.first}) << "_"; + for (const auto& actual_shape : shape.second) { + result << ov::test::utils::partialShape2str({actual_shape}) << "_"; + } + } + result << "axis=" << axis << "_"; + result << "Precision=" << precision << "_"; + result << "target_device=" << target_device; + return result.str(); + } + +protected: + void generate_inputs(const std::vector& targetInputStaticShapes) override { + inputs.clear(); + const auto& funcInputs = function->inputs(); + for (size_t i = 0; i < funcInputs.size(); ++i) { + const auto& funcInput = funcInputs[i]; + + ov::Tensor tensor; + ov::test::utils::InputGenerateData in_data; + in_data.start_from = 0; + in_data.range = 80; + in_data.resolution = 8; + tensor = ov::test::utils::create_and_fill_tensor(funcInput.get_element_type(), targetInputStaticShapes[i], in_data); + inputs.insert({funcInput.get_node_shared_ptr(), tensor}); + } + } + + void SetUp() override { + SplitReshapeEltwiseTestParams test_params = this->GetParam(); + std::vector input_shapes; + size_t axis; + ov::element::Type model_type; + std::tie(input_shapes, axis, model_type, targetDevice) = test_params; + + init_input_shapes(input_shapes); + + ov::ParameterVector params = { + std::make_shared(model_type, inputDynamicShapes[0]), + std::make_shared(model_type, inputDynamicShapes[1]), + std::make_shared(model_type, inputDynamicShapes[2]), + }; + + auto axis_op = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{}, {1}); + axis_op->set_friendly_name("axis"); + + auto split_sizes = 
ov::op::v0::Constant::create(ov::element::i64, ov::Shape{2}, {5, 5}); + split_sizes->set_friendly_name("split_sizes"); + + auto split = std::make_shared(params[0], axis_op, split_sizes); + split->set_friendly_name("split"); + + auto add_not_reshaped = std::make_shared(split->output(1), params[1]); + add_not_reshaped->set_friendly_name("add_not_reshaped"); + + std::vector target_shape; + for (auto& d : inputDynamicShapes[2]) { + target_shape.push_back(d.is_dynamic() ? -1 : d.get_length()); + } + auto target_shape_node = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{target_shape.size()}, target_shape); + auto reshape = std::make_shared(split->output(0), target_shape_node, false); + + auto add_reshaped = std::make_shared(params[2], reshape); + add_reshaped->set_friendly_name("add_reshaped"); + + auto convert1 = std::make_shared(add_not_reshaped, ov::element::f32); + auto convert2 = std::make_shared(add_reshaped, ov::element::f32); + + ov::ResultVector results = {std::make_shared(convert1), std::make_shared(convert2)}; + function = std::make_shared(results, params, "eltwise_add_out"); + } +}; + +TEST_P(SplitReshapeEltwiseTest, Inference) { + run(); +} + +const std::vector> input_shapes = { + { + {{-1, 10}, {{2, 10}, {1, 10}}}, // split in shape + {{-1, 5}, {{2, 5}, {1, 5}}}, // not reshaped add input shape + {{-1, 1, 5}, {{2, 1, 5}, {1, 1, 5}}} // reshaped add input shape + }, +}; + + +const auto testParams_smoke = ::testing::Combine(::testing::ValuesIn(input_shapes), + ::testing::Values(1), // axis + ::testing::ValuesIn(model_precisions), + ::testing::Values(ov::test::utils::DEVICE_GPU)); + +INSTANTIATE_TEST_SUITE_P(smoke_dynamic_model, SplitReshapeEltwiseTest, + testParams_smoke, SplitReshapeEltwiseTest::getTestCaseName); +} // namespace diff --git a/src/plugins/intel_gpu/tests/unit/dynamic_execution/skip_gather_at_runtime.cpp b/src/plugins/intel_gpu/tests/unit/dynamic_execution/skip_gather_at_runtime.cpp new file mode 100644 index 00000000000000..6ca8073c7658c3 --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/dynamic_execution/skip_gather_at_runtime.cpp @@ -0,0 +1,101 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "test_utils.h" + +#include +#include +#include +#include +#include + +#include "gather_inst.h" +#include "program_wrapper.h" + +#include +#include + +using namespace cldnn; +using namespace ::tests; + +namespace skip_gather_tests { +enum execution_status { + optimized = 0, + skipped = 1, + executed = 2 +}; + +struct gather_iter_params { + ov::PartialShape input1_shape; + ov::PartialShape input2_shape; + execution_status expected_status; +}; + + +struct skip_gather_params { + std::vector input_data; + int axis; +}; + +class skip_gather_at_runtime_test : public testing::TestWithParam {}; + +TEST_P(skip_gather_at_runtime_test, runtime_skip) { + auto p = GetParam(); + auto& engine = get_test_engine(); + auto axis = p.axis; + auto input1_rank = p.input_data[0].input1_shape.size(); + auto input1_layout_dynamic = layout {ov::PartialShape::dynamic(input1_rank), data_types::f16, format::get_default_format(input1_rank)}; + auto input2_rank = p.input_data[0].input2_shape.size(); + auto input2_layout_dynamic = layout {ov::PartialShape::dynamic(input2_rank), data_types::f16, format::get_default_format(input2_rank)}; + topology topology(input_layout("input1", input1_layout_dynamic), + input_layout("input2", input1_layout_dynamic), + reshape("squeeze", input_info("input2"), false, {-1}, {-1}, reshape::reshape_mode::base), + 
gather("gather", + input_info("input1"), + input_info("squeeze"), + axis, + p.input_data[0].input1_shape.size(), + ov::Shape{}, + 0, + true), + reorder("reorder", input_info("gather"), format::get_default_format(input1_rank), data_types::f32)); + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::allow_new_shape_infer(true)); + config.set_property(ov::intel_gpu::optimize_data(true)); + + network network(engine, topology, config); + auto gather_inst = network.get_primitive("gather"); + for (auto in_shape_data : p.input_data) { + auto input1_static_layout = layout {in_shape_data.input1_shape, data_types::f16, format::get_default_format(input1_rank)}; + auto input1_mem = engine.allocate_memory(input1_static_layout); + network.set_input_data("input1", input1_mem); + + auto input2_static_layout = layout {in_shape_data.input2_shape, data_types::f16, format::get_default_format(input2_rank)}; + auto intpu2_unit_static_layout = layout {ov::PartialShape{1}, data_types::f16, format::get_default_format(input2_rank)}; + auto input2_mem = (input2_static_layout.count() == 0)? engine.allocate_memory(intpu2_unit_static_layout) : engine.allocate_memory(input2_static_layout); + if (input2_static_layout.count() == 0) + input2_mem = engine.reinterpret_buffer(*input2_mem, input2_static_layout); + network.set_input_data("input2", input2_mem); + + auto outputs = network.execute(); + if (in_shape_data.expected_status == execution_status::executed) { + ASSERT_FALSE(engine.is_the_same_buffer(gather_inst->dep_memory(0), gather_inst->output_memory(0))); + ASSERT_FALSE(gather_inst->can_be_optimized()); + } else if (in_shape_data.expected_status == execution_status::optimized) { + ASSERT_TRUE(engine.is_the_same_buffer(gather_inst->dep_memory(0), gather_inst->output_memory(0))); + ASSERT_TRUE(gather_inst->can_be_optimized()); + } else { + ASSERT_TRUE(gather_inst->get_output_layout(0).count() == 0); + } + } +} + +INSTANTIATE_TEST_SUITE_P(smoke, skip_gather_at_runtime_test, + testing::ValuesIn(std::vector { + {{{ov::PartialShape{1,1,8}, ov::PartialShape{1,1}, execution_status::optimized},{ov::PartialShape{1,1,8}, ov::PartialShape{1,0}, execution_status::skipped}, {ov::PartialShape{1,11,8}, ov::PartialShape{1,6}, execution_status::executed}}, 1}, + {{{ov::PartialShape{1,2,8}, ov::PartialShape{1,1}, execution_status::executed},{ov::PartialShape{1,1,8}, ov::PartialShape{1,0}, execution_status::skipped}, {ov::PartialShape{1,11,8}, ov::PartialShape{1,6}, execution_status::executed}}, 1}, + {{{ov::PartialShape{1,1,8}, ov::PartialShape{1,1}, execution_status::optimized},{ov::PartialShape{1,1,8}, ov::PartialShape{1,1}, execution_status::optimized},{ov::PartialShape{1,11,8}, ov::PartialShape{1,6}, execution_status::executed}}, 1} + })); +} // skip gather tests diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp index 5adc1e691b82a7..456fab4ae0286a 100644 --- a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp @@ -11,6 +11,8 @@ #include "intel_gpu/graph/program.hpp" #include "data_inst.h" +#include "concatenation_inst.h" +#include "gemm_inst.h" #include "crop_inst.h" #include "convolution_inst.h" #include "gather_inst.h" @@ -707,6 +709,54 @@ TEST(prepare_buffer_fusing, in_place_crop_static) { ASSERT_EQ(output_ptr_2[i], out2[i]); } +TEST(prepare_buffer_fusing, 
in_place_crop_static_padding_and_gemm) { + auto& engine = get_test_engine(); + + auto gemm_input_mem = engine.allocate_memory({ {1, 4, 4, 2}, data_types::f32, format::bfyx }); + auto concat_input_mem = engine.allocate_memory({ {1, 4, 2}, data_types::f32, format::bfyx }); + + set_values(gemm_input_mem, { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, + 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, + 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, + 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f }); + set_values(concat_input_mem, { -0.5f, 2.0f, 0.5f, 1.0f, 0.5f, -2.0f, -0.5f, -1.0f }); + + std::vector expected = { 0.5, 4, 0.5, 10, 0.5, 16, 0.5, 22, + 0.5, 4, 0.5, 10, 0.5, 16, 0.5, 22, + 0.5, 4, 0.5, 10, 0.5, 16, 0.5, 22, + 0.5, 4, 0.5, 10, 0.5, 16, 0.5, 22}; + cldnn::tensor refSize = {1, 2, 1, 2}; + + topology topology( + input_layout("gemm_input", gemm_input_mem->get_layout()), + input_layout("concat_input", concat_input_mem->get_layout()), + concatenation("concat", { input_info("concat_input"), input_info("concat_input") }, 2), + crop("crop", input_info("concat"), refSize, tensor(0, 0, 0, 0)), + gemm("gemm", { input_info("gemm_input"), input_info("crop") }, data_types::f32, false, false, 1.0, 0.0, 4, 3), + reorder("output", input_info("gemm"), format::bfyx, data_types::f32) + ); + + { + auto config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::optimize_data(true)); + network network(engine, topology, config); + + network.set_input_data("gemm_input", gemm_input_mem); + network.set_input_data("concat_input", concat_input_mem); + + auto outputs = network.execute(); + + auto crop_prim = network.get_primitive("crop"); + ASSERT_EQ(crop_prim->can_be_optimized(), true); + + auto output = outputs.at("output").get_memory(); + cldnn::mem_lock output_ptr(output, get_test_stream()); + for (size_t i = 0; i < expected.size(); i++) { + ASSERT_EQ(output_ptr[i], expected[i]); + } + } +} + TEST(prepare_buffer_fusing, in_place_crop_dynamic) { auto& engine = get_test_engine(); diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/eltwise_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/eltwise_gpu_test.cpp index 8d67bc3f7db2ad..c6f39b15ea532a 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/eltwise_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/eltwise_gpu_test.cpp @@ -4711,6 +4711,11 @@ struct eltwise_layout_test_params { #define CASE_ELTWISE_TEST7 eltwise_mode::sum, {4, 5, 4, 1}, {4, 1, 4, 1}, format::bfyx, format::b_fs_yx_fsv16, "generic_eltwise_ref" #define CASE_ELTWISE_TEST8 eltwise_mode::sum, {4, 2, 4, 4}, {1, 1, 1, 1}, format::bfyx, format::b_fs_yx_fsv16, "generic_eltwise_ref" #define CASE_ELTWISE_TEST9 eltwise_mode::eq, {4, 2, 4, 4}, {1, 1, 1, 1}, format::b_fs_yx_fsv16, format::bfyx, "generic_eltwise_ref" +#define CASE_ELTWISE_TEST10 eltwise_mode::sum, {4, 8, 1, 1}, {1, 8, 1, 1}, format::b_fs_yx_fsv32, format::bfyx, "eltwise_blocked_opt" +#define CASE_ELTWISE_TEST11 eltwise_mode::sum, {4, 8, 1, 1}, {1, 8, 1, 1}, format::b_fs_yx_fsv16, format::bfyx, "eltwise_blocked_opt" +#define CASE_ELTWISE_TEST12 eltwise_mode::sum, {4, 16, 4, 4}, {1, 16, 1, 1}, format::b_fs_yx_fsv16, format::bfyx, "eltwise_blocked_opt" +#define CASE_ELTWISE_TEST13 eltwise_mode::sum, {4, 7, 4, 4}, {1, 7, 1, 1}, format::b_fs_yx_fsv16, format::bfyx, "generic_eltwise_ref" +#define CASE_ELTWISE_TEST14 eltwise_mode::sum, {1, 8, 1, 1}, {4, 8, 1, 1}, format::bfyx, format::b_fs_yx_fsv32, "generic_eltwise_ref" class eltwise_layout_test : public BaseEltwiseTest { public: @@ -4800,6 +4805,11 @@ 
INSTANTIATE_TEST_SUITE_P(eltwise, eltwise_test_mixed_layout, eltwise_layout_test_params{CASE_ELTWISE_TEST7}, eltwise_layout_test_params{CASE_ELTWISE_TEST8}, eltwise_layout_test_params{CASE_ELTWISE_TEST9}, + eltwise_layout_test_params{CASE_ELTWISE_TEST10}, + eltwise_layout_test_params{CASE_ELTWISE_TEST11}, + eltwise_layout_test_params{CASE_ELTWISE_TEST12}, + eltwise_layout_test_params{CASE_ELTWISE_TEST13}, + eltwise_layout_test_params{CASE_ELTWISE_TEST14}, })); // diff --git a/src/plugins/intel_npu/README.md b/src/plugins/intel_npu/README.md index 980faa71a15937..89aa29de9f1d15 100644 --- a/src/plugins/intel_npu/README.md +++ b/src/plugins/intel_npu/README.md @@ -176,6 +176,7 @@ The following properties are supported: | `ov::intel_npu::tiles`/
`NPU_TILES` | RW | Sets the number of npu tiles to compile the model for | `[0-]` | `-1` | | `ov::intel_npu::max_tiles`/
`NPU_MAX_TILES` | RW | Maximum number of tiles supported by the device we compile for. Can be set for offline compilation. If not set, it will be populated by driver.| `[0-]` | `[1-6] depends on npu platform` | | `ov::intel_npu::bypass_umd_caching`/
`NPU_BYPASS_UMD_CACHING` | RW | Bypass the caching of compiled models in UMD. | `YES`/ `NO`| `NO` | +| `ov::intel_npu::defer_weights_load`/
`NPU_DEFER_WEIGHTS_LOAD` | RW | Delay loading the weights until inference is created. | `YES`/ `NO`| `NO` |   ### Performance Hint: Default Number of DPU Groups / DMA Engines diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp index 0fa1207bd9935a..a274c8d1c1cae6 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp @@ -74,6 +74,11 @@ struct OptionParser final { static int32_t parse(std::string_view val); }; +template <> +struct OptionParser final { + static uint32_t parse(std::string_view val); +}; + template <> struct OptionParser final { static int64_t parse(std::string_view val); @@ -167,6 +172,25 @@ struct OptionPrinter final { } }; +template +struct OptionPrinter> final { + static std::string toString(const std::map& val) { + std::stringstream ss; + std::size_t counter = 0; + std::size_t size = val.size(); + for (auto& [key, value] : val) { + std::string key_str = OptionPrinter::toString(key); + std::string value_str = OptionPrinter::toString(value); + ss << key_str << ":" << value_str; + if (counter < size - 1) { + ss << ","; + } + ++counter; + } + return ss.str(); + } +}; + // NB: boolean config option has values YES for true, NO for false template <> struct OptionPrinter final { diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp index 6d865ad5e4edf3..927b234df8ba15 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp @@ -17,6 +17,7 @@ namespace intel_npu { // void registerNPUWOptions(OptionsDesc& desc); +void registerNPUWLLMOptions(OptionsDesc& desc); #define DEFINE_OPT(Name, Type, DefaultValue, PropertyKey, Mode) \ struct Name final : OptionBase { \ @@ -66,4 +67,110 @@ DEFINE_OPT(NPUW_DUMP_SUBS, std::string, "", npuw::dump::subgraphs, CompileTime); DEFINE_OPT(NPUW_DUMP_SUBS_ON_FAIL, std::string, "", npuw::dump::subgraphs_on_fail, CompileTime); DEFINE_OPT(NPUW_DUMP_IO, std::string, "", npuw::dump::inputs_outputs, RunTime); DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime); +DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime); +DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime); +DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime); + +namespace npuw { +namespace llm { +struct ModelDesc { + std::string type; + std::string name_or_path; + int num_key_value_heads; +}; +enum class GenerateHint { FAST_COMPILE, BEST_PERF }; +} // namespace llm +} // namespace npuw + +struct NPUW_LLM_MODEL_DESC final : OptionBase { + static std::string_view key() { + return ov::intel_npu::npuw::llm::model_desc.name(); + } + + static constexpr std::string_view getTypeName() { + return "::intel_npu::npuw::llm::ModelDesc"; + } + + static ::intel_npu::npuw::llm::ModelDesc defaultValue() { + return {}; + } + + static ::intel_npu::npuw::llm::ModelDesc parse(std::string_view val) { + ::intel_npu::npuw::llm::ModelDesc res; + std::map res_map = OptionParser>::parse(val); + res.type = res_map["type"]; + res.name_or_path = res_map["name_or_path"]; + res.num_key_value_heads = std::stoi(res_map["num_key_value_heads"]); + return res; + } + + static std::string toString(const ::intel_npu::npuw::llm::ModelDesc& val) { + std::string res; 
+ std::map res_map; + res_map["type"] = val.type; + res_map["name_or_path"] = val.name_or_path; + res_map["num_key_value_heads"] = std::to_string(val.num_key_value_heads); + return OptionPrinter>::toString(res_map); + } + + static OptionMode mode() { + return OptionMode::CompileTime; + } + + static bool isPublic() { + return true; + } +}; + +struct NPUW_LLM_GENERATE_HINT final : OptionBase { + static std::string_view key() { + return ov::intel_npu::npuw::llm::generate_hint.name(); + } + + static constexpr std::string_view getTypeName() { + return "::intel_npu::npuw::llm::GenerateHint"; + } + + static ::intel_npu::npuw::llm::GenerateHint defaultValue() { + return ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE; + } + + static ::intel_npu::npuw::llm::GenerateHint parse(std::string_view val) { + ::intel_npu::npuw::llm::GenerateHint res; + + if (val == "FAST_COMPILE") { + res = ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE; + } else if (val == "BEST_PERF") { + res = ::intel_npu::npuw::llm::GenerateHint::BEST_PERF; + } else { + OPENVINO_THROW("Unsupported \"GENERATE_HINT\" provided: ", + val, + ". Please select either \"FAST_COMPILE\" or \"BEST_PERF\"."); + } + return res; + } + + static std::string toString(const ::intel_npu::npuw::llm::GenerateHint& val) { + std::string res; + switch (val) { + case ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE: + res = "FAST_COMPILE"; + break; + case ::intel_npu::npuw::llm::GenerateHint::BEST_PERF: + res = "BEST_PERF"; + break; + default: + OPENVINO_THROW("Can't convert provided \"GENERATE_HINT\" : ", int(val), " to string."); + } + return res; + } + + static OptionMode mode() { + return OptionMode::CompileTime; + } + + static bool isPublic() { + return true; + } +}; } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp index 8aabd132e9431a..15bbd69483bb57 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npu_private_properties.hpp @@ -305,13 +305,6 @@ static constexpr ov::Property batch_mode{"NPU_BATCH_MODE"}; */ static constexpr ov::Property create_executor{"NPU_CREATE_EXECUTOR"}; -/** - * @brief [Only for NPU Plugin] - * Type: boolean, default is false - * This option allows to omit loading the weights until inference is created - */ -static constexpr ov::Property defer_weights_load{"NPU_DEFER_WEIGHTS_LOAD"}; - /** * @brief Read-only property to get the name of used backend */ diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp index af4a17988f451e..a416ca51233893 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp @@ -378,6 +378,51 @@ static constexpr ov::Property inputs_outputs{"NPUW_DUMP_IO"}; static constexpr ov::Property io_iters{"NPUW_DUMP_IO_ITERS"}; } // namespace dump +namespace llm { +/** + * @brief + * Type: bool. + * Tell NPUW that you want to pass dynamic stateful LLM model. + * Default value: false. + */ +static constexpr ov::Property enabled{"NPUW_LLM"}; + +/** + * @brief + * Type: std::map. + * Tell NPUW about your LLM model. Use following structure for that: + * "type:,name_or_path:,num_key_value_heads:". + * Default value: empty structure defined above. 
+ */ +static constexpr ov::Property model_desc{"NPUW_LLM_MODEL_DESC"}; + +/** + * @brief + * Type: uint32_t. + * Tell NPUW your desired maximum prompt length. + * Default value: 1024. + */ +static constexpr ov::Property max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"}; + +/** + * @brief + * Type: uint32_t. + * Tell NPUW your desired minimum response length. + * Default value: 128. + */ +static constexpr ov::Property min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"}; + +/** + * @brief + * Type: std::string. + * Tell NPUW the preferred hint for the generation stage, which selects the optimal configuration for it. + * Possible values: "FAST_COMPILE", "BEST_PERF". + * Default value: "FAST_COMPILE". + */ +static constexpr ov::Property generate_hint{"NPUW_LLM_GENERATE_HINT"}; + +} // namespace llm + } // namespace npuw } // namespace intel_npu } // namespace ov diff --git a/src/plugins/intel_npu/src/al/src/config/config.cpp b/src/plugins/intel_npu/src/al/src/config/config.cpp index 9d4c600351afa6..a4e2b515b8e3f6 100644 --- a/src/plugins/intel_npu/src/al/src/config/config.cpp +++ b/src/plugins/intel_npu/src/al/src/config/config.cpp @@ -50,6 +50,14 @@ int32_t OptionParser::parse(std::string_view val) { } } +uint32_t OptionParser::parse(std::string_view val) { + try { + return std::stoul(val.data()); + } catch (...) { + OPENVINO_THROW("Value '%s' is not a valid UINT32 option", val.data()); + } +} + int64_t OptionParser::parse(std::string_view val) { try { return std::stoll(val.data()); diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index 0c7978845c690c..4ee9e392406452 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -54,3 +54,11 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) { desc.add(); #endif } + +void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) { + desc.add(); + desc.add(); + desc.add(); + desc.add(); + desc.add(); +} diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index b9cdad9f4879db..aa02ca8681e80f 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -28,6 +28,7 @@ #include "intel_npu/config/config.hpp" #include "intel_npu/config/npuw.hpp" #include "intel_npu/npuw_private_properties.hpp" +#include "llm_compiled_model.hpp" #include "openvino/runtime/device_id_parser.hpp" #include "openvino/runtime/internal_properties.hpp" #include "openvino/runtime/properties.hpp" @@ -85,10 +86,33 @@ ov::npuw::DeviceProperties get_properties_per_device(const std::shared_ptr ov::npuw::ICompiledModel::create( + const std::shared_ptr& model, + const std::shared_ptr& plugin, + const ov::AnyMap& properties) { + LOG_INFO("Choosing which NPUW CompiledModel to create"); + LOG_BLOCK(); + std::shared_ptr compiled_model; + auto use_llm_key = ov::intel_npu::npuw::llm::enabled.name(); + if (properties.count(use_llm_key) && properties.at(use_llm_key).as() == true) { + LOG_INFO("ov::npuw::LLMCompiledModel will be created."); + compiled_model = std::make_shared(model, plugin, properties); + } else { + LOG_INFO("ov::npuw::CompiledModel will be created."); + compiled_model = std::make_shared(model, plugin, properties); + } + LOG_INFO("Done"); + return compiled_model; +} + +ov::npuw::ICompiledModel::ICompiledModel(const std::shared_ptr& model, + const std::shared_ptr& plugin) + : ov::ICompiledModel(model, plugin) {} +
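For reference, a minimal sketch (not part of this patch) of how an application might opt into the new NPUW LLM path through the properties introduced above, assuming the standard ov::Core API; the model path, the model-descriptor values, and the explicit NPU_USE_NPUW setting are illustrative assumptions only:

    #include <openvino/openvino.hpp>

    int main() {
        ov::Core core;
        // Hypothetical model path, used only for illustration.
        auto model = core.read_model("llm_model.xml");

        ov::AnyMap props = {
            {"NPU_USE_NPUW", "YES"},  // assumed to be needed to route compilation through NPUW
            {"NPUW_LLM", true},       // selects ov::npuw::LLMCompiledModel in ICompiledModel::create()
            // Value follows the documented "type:...,name_or_path:...,num_key_value_heads:..." format;
            // the concrete fields below are placeholders.
            {"NPUW_LLM_MODEL_DESC", "type:llama,name_or_path:my-llm,num_key_value_heads:8"},
            {"NPUW_LLM_MAX_PROMPT_LEN", "1024"},
            {"NPUW_LLM_MIN_RESPONSE_LEN", "128"},
            {"NPUW_LLM_GENERATE_HINT", "BEST_PERF"}};

        auto compiled = core.compile_model(model, "NPU", props);
        auto request = compiled.create_infer_request();
        (void)request;
        return 0;
    }

The NPUW_LLM* keys are stripped out by split_llm_properties() and consumed by LLMCompiledModel, while the remaining properties are passed on as usual.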
ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, const ov::AnyMap& properties) - : ov::ICompiledModel(model, plugin), + : ov::npuw::ICompiledModel(model, plugin), m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()), m_cfg(m_options_desc), m_name(model->get_friendly_name()), @@ -727,8 +751,9 @@ std::shared_ptr ov::npuw::CompiledModel::create_sync_infe const auto num_submodels = m_compiled_submodels.size(); for (std::size_t idx = 0u; idx < num_submodels; idx++) { const auto& comp_model_desc = m_compiled_submodels[idx]; - if (!comp_model_desc.replaced_by.has_value()) { - // not a funcall, do nothing + if (!comp_model_desc.replaced_by.has_value() || comp_model_desc.forced_to_fcall) { + // not a funcall, do nothing, or a subgraph that was forced to funcall + // (a 1-call function) - skip continue; } const auto real_idx = comp_model_desc.replaced_by.value(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index 8ccb1f83349e47..0e728570eda8d5 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -22,10 +22,16 @@ class Plugin; namespace ov { namespace npuw { +class ICompiledModel : public ov::ICompiledModel { +public: + static std::shared_ptr create(const std::shared_ptr& model, + const std::shared_ptr& plugin, + const ov::AnyMap& properties); + ICompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin); +}; class InferRequest; - -class CompiledModel : public ov::ICompiledModel { +class CompiledModel : public ov::npuw::ICompiledModel { using DevList = std::vector; using GetPropertiesMap = std::map>>; diff --git a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp index 8d1c7c4a30acde..16e2a57e991e3a 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/just_sync_infer_request.cpp @@ -783,9 +783,6 @@ void ov::npuw::JustInferRequest::unsafe_infer(std::size_t real_idx) { // Now copy the views from the output full-nway tensor to the output tensors for (std::size_t out_idx = 0u; out_idx < num_outputs; out_idx++) { - const auto& oport = comp_model_desc.compiled_model->outputs()[out_idx]; - auto spatial_tensor_shape = oport.get_shape(); - auto in_view = ov::npuw::util::view(m_spatial_io[real_idx].output_tails.at(out_idx), spatial.out_dim, 0, diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp new file mode 100644 index 00000000000000..e18b098969eb79 --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -0,0 +1,346 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "llm_compiled_model.hpp" + +#include "llm_infer_request.hpp" +#include "logging.hpp" +#include "openvino/pass/stateful_to_stateless.hpp" +#include "openvino/runtime/iasync_infer_request.hpp" + +namespace { +uint32_t align_to(uint32_t value, uint32_t alignment) { + return (value + alignment - 1) & ~(alignment - 1); +} + +std::shared_ptr redirect_new_kv_to_output(const std::shared_ptr& model) { + const auto kStartOutputKVCacheLayers = 1u; + for (std::size_t i = kStartOutputKVCacheLayers; i < model->outputs().size(); ++i) { + auto kvout = model->output(i); + auto kvrslt = kvout.get_node(); + auto 
kvcat = kvrslt->inputs()[0].get_source_output().get_node(); + auto kvval = kvcat->inputs()[1].get_source_output(); + kvval.set_names({kvout.get_any_name()}); + kvrslt->inputs()[0].replace_source_output(kvval); + } + model->validate_nodes_and_infer_types(); + return model; +} + +std::shared_ptr cvt_kvcache_to_fp16(const std::shared_ptr& model) { + ov::preprocess::PrePostProcessor ppp(model); + + for (auto tensor : model->inputs()) { + if (tensor.get_any_name().find("past_key") != std::string::npos) { + ppp.input(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16); + } + } + + for (auto tensor : model->outputs()) { + if (tensor.get_any_name().find("present") != std::string::npos) { + ppp.output(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16); + } + } + + return ppp.build(); +} + +struct KVAxesPosition { + uint32_t batch; + uint32_t seq_len; +}; + +void reshape_to_static(std::shared_ptr model, + const uint32_t input_size, + const uint32_t kvcache_size, + const KVAxesPosition& kv_axes_position) { + std::map new_shapes; + for (auto input : model->inputs()) { + const auto& input_name = input.get_any_name(); + ov::PartialShape new_shape; + if (input_name.find("input_ids") != std::string::npos) { + new_shape = ov::PartialShape({1, input_size}); + } else if (input_name.find("attention_mask") != std::string::npos) { + new_shape = ov::PartialShape({1, kvcache_size}); + } else if (input_name.find("position_ids") != std::string::npos) { + new_shape = ov::PartialShape({1, input_size}); + } else { + const auto& partial_shape = input.get_partial_shape(); + new_shape = partial_shape; + new_shape[kv_axes_position.batch] = 1; + new_shape[kv_axes_position.seq_len] = kvcache_size - input_size; + } + new_shapes.emplace(input_name, new_shape); + } + model->reshape(new_shapes); +} + +KVAxesPosition get_kv_axes(const std::string& model_type) { + KVAxesPosition axes; + if (model_type == "chatglm") { + axes.batch = 1u; + axes.seq_len = 0u; + } else if (model_type == "qwen") { + // Note, qwen2 does not fall into this category and conforms to default layout + axes.batch = 0u; + axes.seq_len = 1u; + } else { + axes.batch = 0u; + axes.seq_len = 2u; + } + return axes; +} + +bool is_cw_compressed(const std::shared_ptr& model) { + std::vector rt_info_path = {"nncf", "weight_compression", "group_size"}; + if (!model->has_rt_info(rt_info_path)) { + // NB: Model isn't compressed by NNCF - skip + return false; + } + auto group_size = model->get_rt_info(rt_info_path); + if (group_size == -1) { + // NB: Enable DQ for CW quantized models + return true; + } + return false; +} + +struct NPUDesc { + std::string arch; + int64_t max_tiles; +}; + +std::optional extract_npu_descriptor(const std::shared_ptr& plugin) { + const ov::Any arch = plugin->get_property(ov::device::architecture.name(), ov::AnyMap{}); + const ov::Any max_tiles = plugin->get_property(ov::intel_npu::max_tiles.name(), ov::AnyMap{}); + return std::make_optional(NPUDesc{arch.as(), max_tiles.as()}); +} + +std::optional pop_option(ov::AnyMap& config, const std::string& option_name) { + if (auto it = config.find(option_name); it != config.end()) { + std::optional found = std::make_optional(it->second); + config.erase(it); + return found; + } + return std::nullopt; +} + +template +std::optional get_option(ov::AnyMap& config, const std::string& option_name) { + if (auto it = config.find(option_name); it != config.end()) { + return std::make_optional(it->second.as()); + } + return std::nullopt; +} + +template +T 
pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) { + auto anyopt = pop_option(config, key); + if (anyopt.has_value()) { + return anyopt.value().as(); + } + return default_value; +} + +ov::AnyMap get_baseline_common_config() { + ov::AnyMap config = { + {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"}, + {"NPUW_DEVICES", "NPU"}, + {"NPU_USE_NPUW", "YES"}, + {"NPUW_FOLD", "YES"}, + {"NPUW_DCOFF_TYPE", "f16"}, + {"NPUW_DCOFF_SCALE", "YES"}, + {"NPUW_WEIGHTS_BANK", "shared"}, + {"NPUW_SLICE_OUT", "YES"}, + {"NPUW_FUNCALL_ASYNC", "YES"}}; + return config; +} + +ov::AnyMap get_default_common_config(const std::shared_ptr& model) { + auto config = get_baseline_common_config(); + const char* npu_l0 = std::getenv("DISABLE_OPENVINO_GENAI_NPU_L0"); + if (npu_l0 && std::atoi(npu_l0) == 1) { + config.emplace("NPUW_WEIGHTS_BANK_ALLOC", "CPU"); + } else { + config.emplace("NPUW_FUNCALL_FOR_ALL", "YES"); + } + return config; +} + +ov::AnyMap get_default_prefill_config(const std::shared_ptr& model, const std::optional& npudesc) { + auto config = get_default_common_config(model); + if (is_cw_compressed(model)) { + config.emplace("NPUW_DQ", "YES"); + } else { + config.emplace("NPUW_PMM", "NO"); + } + if (npudesc.has_value() && npudesc->arch == "4000" && npudesc->max_tiles != -1) { + config.emplace("NPU_DPU_GROUPS", npudesc->max_tiles); + } + return config; +} + +ov::AnyMap get_default_generate_config(const std::shared_ptr& model, + const std::optional& npudesc, + const ::intel_npu::npuw::llm::GenerateHint hint) { + auto config = get_default_common_config(model); + if (hint == ::intel_npu::npuw::llm::GenerateHint::BEST_PERF) { + config.emplace("NPUW_ONLINE_PIPELINE", "NONE"); + } + // NB: Unconditionally set for generation model + config.emplace("NPUW_DQ", "YES"); + if (npudesc.has_value() && npudesc->arch == "4000") { + config.emplace("NPU_DPU_GROUPS", 4); + } + return config; +} + +void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) { + for (const auto& [key, value] : rhs) { + // NB: Overwrite the value if key already exists + if (auto it = lhs.find(key); it != lhs.end()) { + it->second = value; + } else { + lhs.emplace(key, value); + } + } +} + +void drop_cache_dir(ov::AnyMap& config) { + if (config.count("NPU_USE_NPUW") != 0u) { + pop_option(config, "CACHE_DIR"); + } +} + +void split_llm_properties(const ov::AnyMap& properties, ov::AnyMap& llm_properties, ov::AnyMap& other_properties) { + for (auto it = properties.begin(); it != properties.end(); ++it) { + if (it->first.find("NPUW_LLM") != it->first.npos) { + llm_properties.insert(*it); + } else { + other_properties.insert(*it); + } + } +} + +std::map any_copy(const ov::AnyMap& params) { + std::map result; + for (auto&& value : params) { + result.emplace(value.first, value.second.as()); + } + return result; +} +} // namespace + +ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& model, + const std::shared_ptr& plugin, + const ov::AnyMap& properties) + : ov::npuw::ICompiledModel(model, plugin), + m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()), + m_cfg(m_options_desc) { + LOG_DEBUG("Creating LLMCompiledModel"); + LOG_BLOCK(); + + ::intel_npu::registerNPUWLLMOptions(*m_options_desc); + + std::map npuw_llm_props; + std::map other_props; + split_llm_properties(properties, npuw_llm_props, other_props); + m_cfg.update(any_copy(npuw_llm_props)); + + LOG_DEBUG("1. 
Creating kvcache model as clone of passed one."); + auto kvcache_model = model->clone(); + LOG_DEBUG("2. Transform kvcache model from stateful to stateless."); + ov::pass::StatefulToStateless().run_on_model(kvcache_model); + + LOG_DEBUG("3. Creating prefill model as clone of transformed kvcache one."); + auto prefill_model = kvcache_model->clone(); + prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill"); + LOG_DEBUG("4. Converting KV-cache in prefill model to FP16."); + prefill_model = cvt_kvcache_to_fp16(prefill_model); + + LOG_DEBUG("5. Optimize kvcache model to output key/values for new token."); + kvcache_model = redirect_new_kv_to_output(kvcache_model); + LOG_DEBUG("6. Converting KV-cache in kvcache model to FP16."); + kvcache_model = cvt_kvcache_to_fp16(kvcache_model); + + const uint32_t kMaxPromptLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u); + const uint32_t kMinResponseLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u); + const ::intel_npu::npuw::llm::ModelDesc model_desc = m_cfg.get<::intel_npu::NPUW_LLM_MODEL_DESC>(); + KVAxesPosition axes = get_kv_axes(model_desc.type); + m_kvcache_desc = KVCacheDesc{kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len}; + LOG_DEBUG("7. Make prefill model with static shapes"); + reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); + LOG_DEBUG("8. Make kvcache model with static shapes"); + reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes); + + auto npudesc = extract_npu_descriptor(plugin); + + ov::AnyMap properties_copy = other_props; + auto prefill_config = get_default_prefill_config(model, npudesc); + // NB: GENERATE_HINT is only applicable for default generate config! + const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>(); + LOG_DEBUG("9. 
Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint))); + auto generate_config = get_default_generate_config(model, npudesc, generate_hint); + merge_config_with(prefill_config, properties_copy); + merge_config_with(generate_config, properties_copy); + // FIXME: Drop CACHE_DIR option if NPUW is enabled + drop_cache_dir(prefill_config); + drop_cache_dir(generate_config); + + m_kvcache_compiled = std::make_shared(kvcache_model, plugin, generate_config); + m_prefill_compiled = std::make_shared(prefill_model, plugin, prefill_config); + + implement_properties(); + LOG_DEBUG("Done"); +} + +void ov::npuw::LLMCompiledModel::export_model(std::ostream& model) const { + OPENVINO_NOT_IMPLEMENTED; +} + +std::shared_ptr ov::npuw::LLMCompiledModel::get_runtime_model() const { + OPENVINO_NOT_IMPLEMENTED; +} + +void ov::npuw::LLMCompiledModel::set_property(const ov::AnyMap& properties) { + OPENVINO_NOT_IMPLEMENTED; +} + +ov::Any ov::npuw::LLMCompiledModel::get_property(const std::string& name) const { + OPENVINO_SUPPRESS_DEPRECATED_START + auto&& configIterator = m_prop_to_opt.find(name); + if (configIterator != m_prop_to_opt.cend()) { + return std::get<1>(configIterator->second)(m_cfg); + } else { + return m_prefill_compiled->get_property(name); + } + OPENVINO_SUPPRESS_DEPRECATED_END +} + +std::shared_ptr ov::npuw::LLMCompiledModel::create_sync_infer_request() const { + auto* non_const_this = const_cast(this); // because of const in API + return non_const_this->create_llm_infer_request(); +} + +std::shared_ptr ov::npuw::LLMCompiledModel::create_llm_infer_request() { + auto this_sptr = std::static_pointer_cast(shared_from_this()); + return std::make_shared(this_sptr, m_kvcache_desc); +} + +void ov::npuw::LLMCompiledModel::implement_properties() { +#define BIND(N, T, GETTER) \ + { \ + ov::intel_npu::N.name(), { \ + ov::PropertyMutability::RW, [](const ::intel_npu::Config& config) -> ov::Any { \ + return config.GETTER<::intel_npu::T>(); \ + } \ + } \ + } + + m_prop_to_opt.insert({BIND(npuw::llm::enabled, NPUW_LLM, get), + BIND(npuw::llm::model_desc, NPUW_LLM_MODEL_DESC, getString), + BIND(npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN, get), + BIND(npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN, get), + BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString)}); +#undef BIND +} diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp new file mode 100644 index 00000000000000..1a748997fd48fa --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp @@ -0,0 +1,54 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "compiled_model.hpp" + +namespace ov { +namespace npuw { + +class LLMInferRequest; +class LLMCompiledModel : public ov::npuw::ICompiledModel { + using GetPropertiesMap = + std::map>>; + +public: + struct KVCacheDesc { + uint32_t max_prompt_size = 0u; + uint32_t total_size = 0u; + uint32_t num_stored_tokens = 0u; + uint32_t dim = 0u; + }; + + LLMCompiledModel(const std::shared_ptr& model, + const std::shared_ptr& plugin, + const ov::AnyMap& properties); + LLMCompiledModel() = delete; + void export_model(std::ostream& model) const override; + std::shared_ptr get_runtime_model() const override; + + void set_property(const ov::AnyMap& properties) override; + ov::Any get_property(const std::string& name) const override; + +private: + friend class LLMInferRequest; 
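(Editorial aside, illustrative only, not part of this header.) The KVCacheDesc above is filled by the LLMCompiledModel constructor from the two length options; a quick sanity check of that arithmetic with the documented defaults, using a constexpr copy of the align_to() helper from llm_compiled_model.cpp:

constexpr uint32_t align_to_64(uint32_t value) { return (value + 63u) & ~63u; }
static_assert(align_to_64(1024u) == 1024u, "max_prompt_size for the default NPUW_LLM_MAX_PROMPT_LEN");
static_assert(align_to_64(1024u) + align_to_64(128u) == 1152u, "total_size = aligned prompt + aligned response");
// With these values the prefill model is reshaped to [1, 1024] input_ids/attention_mask
// (its KV inputs get a zero-length sequence axis), while the generate model takes [1, 1]
// input_ids, a [1, 1152] attention_mask and KV inputs with a 1151-long sequence axis.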
+ + std::shared_ptr create_llm_infer_request(); + std::shared_ptr create_sync_infer_request() const override; + void implement_properties(); + + std::shared_ptr<::intel_npu::OptionsDesc> m_options_desc; + ::intel_npu::Config m_cfg; + GetPropertiesMap m_prop_to_opt; + + KVCacheDesc m_kvcache_desc; + std::shared_ptr m_kvcache_compiled; + std::shared_ptr m_prefill_compiled; +}; + +} // namespace npuw +} // namespace ov diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp new file mode 100644 index 00000000000000..a8c90884d3d926 --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -0,0 +1,193 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "llm_infer_request.hpp" + +#include + +#include "llm_compiled_model.hpp" +#include "logging.hpp" +#include "openvino/runtime/iasync_infer_request.hpp" + +namespace { +template +void fill_tensor(ov::SoPtr tensor, T fill_val, size_t offset = 0u) { + T* tensor_data = tensor->data(); + std::fill(tensor_data + offset, tensor_data + tensor->get_size(), fill_val); +} + +ov::SoPtr make_tensor_slice(ov::SoPtr tensor, + uint32_t dim, + uint32_t start_pos, + uint32_t end_pos) { + ov::Shape start_shape(std::vector(tensor->get_shape().size(), 0u)); + start_shape[dim] = start_pos; + ov::Shape end_shape = tensor->get_shape(); + end_shape[dim] = end_pos; + return ov::get_tensor_impl(ov::Tensor(ov::make_tensor(tensor), start_shape, end_shape)); +} +} // anonymous namespace + +ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr& compiled_model, + const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc) + : ov::ISyncInferRequest(compiled_model), + m_kvcache_desc(kvcache_desc) { + m_kvcache_request = compiled_model->m_kvcache_compiled->create_infer_request(); + m_prefill_request = compiled_model->m_prefill_compiled->create_infer_request(); + + for (auto input_port : m_prefill_request->get_compiled_model()->inputs()) { + m_prefill_in_ports.emplace(input_port.get_any_name(), input_port); + } + for (auto output_port : m_prefill_request->get_compiled_model()->outputs()) { + m_prefill_out_ports.emplace(output_port.get_any_name(), output_port); + } + + for (auto input_port : m_kvcache_request->get_compiled_model()->inputs()) { + m_kvcache_in_ports.emplace(input_port.get_any_name(), input_port); + } + for (auto output_port : m_kvcache_request->get_compiled_model()->outputs()) { + m_kvcache_out_ports.emplace(output_port.get_any_name(), output_port); + } +} + +void ov::npuw::LLMInferRequest::prepare_for_new_conversation() { + // FIXME: for input_ids it must be padding from tokenizer that not available from here + // Get it from NPUW options + fill_tensor(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0u); + fill_tensor(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0u); + fill_tensor(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0u); + fill_tensor(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0u); + m_kvcache_desc.num_stored_tokens = 0u; +} + +void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr input_ids, + ov::SoPtr attention_mask, + ov::SoPtr position_ids) { + LOG_DEBUG("Calling inference for prefill model..."); + LOG_BLOCK(); + + prepare_for_new_conversation(); + + auto padded_input_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")); + const size_t offset = padded_input_ids->get_size() - 
input_ids->get_size(); + std::copy_n(input_ids->data(), input_ids->get_size(), padded_input_ids->data() + offset); + + auto padded_attention_mask = m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")); + std::copy_n(attention_mask->data(), + attention_mask->get_size(), + padded_attention_mask->data() + offset); + + auto padded_position_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")); + std::copy_n(position_ids->data(), position_ids->get_size(), padded_position_ids->data() + offset); + + m_prefill_request->infer(); + m_kvcache_desc.num_stored_tokens += static_cast(input_ids->get_size()); + m_need_copy_kvcache = true; + + m_logits = m_prefill_request->get_tensor(m_prefill_out_ports.at("logits")); + + LOG_DEBUG("Done"); +} + +void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr input_ids, + ov::SoPtr attention_mask, + ov::SoPtr position_ids) { + LOG_DEBUG("Calling inference for generate model..."); + LOG_BLOCK(); + + // NB: KV-cache is full, further generation is impossible + if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) { + OPENVINO_THROW("KV-Cache is full."); + } + + if (m_need_copy_kvcache) { + LOG_DEBUG("Copying kv-cache from prefill to generate model."); + const std::size_t kStartOutputKVCacheLayers = 1u; + const auto& kvcache_compiled = m_kvcache_request->get_compiled_model(); + for (std::size_t i = 0; i < kvcache_compiled->outputs().size() - 1; ++i) { + const auto& output_name = kvcache_compiled->outputs()[kStartOutputKVCacheLayers + i].get_any_name(); + auto prefill_out_tensor = m_prefill_request->get_tensor(m_prefill_out_ports.at(output_name)); + + const auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values"); + auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name)); + + // FIXME: We don't need to fill whole tensor with 0s, but only tensor.size() - num_stored_tokens + // taking into account kvcache dimension. 
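    // Illustrative example (added annotation, not part of the original patch): with the
    // default geometry max_prompt_size = 1024 and, say, num_stored_tokens = 30 after a
    // 30-token prompt, the code below zeroes the generate-model KV input and then copies
    // the prefill output window [994:1024) along the KV sequence axis (dim) into the
    // generate-model input window [0:30).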
+ fill_tensor(kvcache_in_tensor, 0); + + auto prefill_out_slice = + make_tensor_slice(prefill_out_tensor, + m_kvcache_desc.dim, + m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens, + m_kvcache_desc.max_prompt_size); + + auto kvcache_in_slice = + make_tensor_slice(kvcache_in_tensor, m_kvcache_desc.dim, 0u, m_kvcache_desc.num_stored_tokens); + + prefill_out_slice->copy_to(kvcache_in_slice._ptr); + } + LOG_DEBUG("Prepare attention mask pattern."); + auto* attention_mask_data = + m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask"))->data(); + attention_mask_data[m_kvcache_desc.total_size - 1] = 1; + + m_need_copy_kvcache = false; + } + + // FIXME: these tensors should be shared between the parent & child models + auto kv_input_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at("input_ids")); + std::copy_n(input_ids->data(), input_ids->get_size(), kv_input_ids->data()); + + auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")); + std::copy_n(attention_mask->data(), attention_mask->get_size(), kv_attn_mask->data()); + + auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at("position_ids")); + std::copy_n(position_ids->data(), position_ids->get_size(), kv_pos_ids->data()); + + m_kvcache_request->infer(); + m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at("logits")); + m_kvcache_desc.num_stored_tokens += 1; + + LOG_DEBUG("Write KV-cache for the new token to the correct input position for next iteration."); + const std::size_t kStartOutputKVCacheLayers = 1u; + const auto& kvcache_compiled = m_kvcache_request->get_compiled_model(); + for (std::size_t i = 0; i < kvcache_compiled->outputs().size() - 1; ++i) { + const auto& output_name = kvcache_compiled->outputs()[kStartOutputKVCacheLayers + i].get_any_name(); + const auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values"); + auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name)); + auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor, + m_kvcache_desc.dim, + m_kvcache_desc.num_stored_tokens - 1, + m_kvcache_desc.num_stored_tokens); + auto kvcache_out_tensor = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(output_name)); + kvcache_out_tensor->copy_to(kvcache_in_slice._ptr); + } + LOG_DEBUG("Done"); +} + +void ov::npuw::LLMInferRequest::infer() { + const auto& inputs = get_inputs(); + + auto input_ids = get_tensor(inputs[0]); + auto attention_mask = get_tensor(inputs[1]); + auto position_ids = get_tensor(inputs[2]); + + OPENVINO_ASSERT(ov::element::i64 == input_ids->get_element_type()); + OPENVINO_ASSERT(ov::element::i64 == attention_mask->get_element_type()); + OPENVINO_ASSERT(ov::element::i64 == position_ids->get_element_type()); + + if (input_ids->get_size() != 1) { + infer_prefill(input_ids, attention_mask, position_ids); + } else { + infer_generate(input_ids, attention_mask, position_ids); + } +} + +ov::SoPtr ov::npuw::LLMInferRequest::get_tensor(const ov::Output& port) const { + // NB: If asked for logits... 
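    // (Added annotation, not part of the original patch.) LLMInferRequest exposes a single
    // "logits" output on top of the two underlying requests: m_logits points at the prefill
    // model's output right after a prompt and at the generate model's output after each
    // produced token, so the caller always reads whichever model ran last.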
+ if (port == get_outputs()[0]) { + return m_logits; + } + return ov::ISyncInferRequest::get_tensor(port); +} diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp new file mode 100644 index 00000000000000..fbc6c702c4b62a --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp @@ -0,0 +1,58 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include + +#include "llm_compiled_model.hpp" +#include "openvino/core/descriptor/output.hpp" +#include "openvino/runtime/isync_infer_request.hpp" + +namespace ov { +namespace npuw { + +class LLMInferRequest final : public ov::ISyncInferRequest { +public: + explicit LLMInferRequest(const std::shared_ptr& compiled_model, + const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc); + + void infer() override; + + ov::SoPtr get_tensor(const ov::Output& port) const override; + + void check_tensors() const override{}; + + std::vector get_profiling_info() const override { + return {}; + } + std::vector> query_state() const override { + return {}; + } + +private: + void prepare_for_new_conversation(); + + void infer_prefill(ov::SoPtr input_ids, + ov::SoPtr attention_mask, + ov::SoPtr position_ids); + + void infer_generate(ov::SoPtr input_ids, + ov::SoPtr attention_mask, + ov::SoPtr position_ids); + + std::shared_ptr m_kvcache_request; + std::shared_ptr m_prefill_request; + LLMCompiledModel::KVCacheDesc m_kvcache_desc; + ov::SoPtr m_logits; + bool m_need_copy_kvcache = false; + + std::unordered_map> m_prefill_in_ports; + std::unordered_map> m_prefill_out_ports; + std::unordered_map> m_kvcache_in_ports; + std::unordered_map> m_kvcache_out_ports; +}; + +} // namespace npuw +} // namespace ov diff --git a/src/plugins/intel_npu/src/plugin/npuw/logging.hpp b/src/plugins/intel_npu/src/plugin/npuw/logging.hpp index b258e3e6e6bfe9..95c9a742db7842 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/logging.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/logging.hpp @@ -62,3 +62,7 @@ void dump_failure(const std::shared_ptr& model, const std::string& de OPENVINO_THROW("NPUW: Assertion " #expr " failed"); \ } \ } while (0) + +#ifdef _MSC_VER +# define __PRETTY_FUNCTION__ __FUNCSIG__ +#endif diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.cpp index d39c2363b1cd64..1cc47b568bcde9 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/compute.cpp @@ -247,8 +247,8 @@ DQMatMulConv::DQMatMulConv(const std::shared_ptr& sn auto callback = [=](ov::pass::pattern::Matcher& m) { auto& node_to_output = m.get_pattern_value_map(); - auto matched_node_param = node_to_output.at(param); - auto matched_node_param2 = node_to_output.at(param2); + const auto& matched_node_param = node_to_output.at(param); + const auto& matched_node_param2 = node_to_output.at(param2); auto matched_node_transpose_in = node_to_output.at(transpose_in).get_node_shared_ptr(); auto matched_node_transpose_out = node_to_output.at(transpose_out).get_node_shared_ptr(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp index a428d956bbad87..93a43c9b82570a 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp +++ 
b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/dcoff.cpp @@ -97,12 +97,10 @@ ClosureRemap build_remap(const Function& fbody, const DCOFFParams& params_to) { LOG_DEBUG("This is an OK parameter, will be kept"); m.closure_remap.push_back(i - fbody._param_offset); - // Check if unpack is indeed required - const auto& type = param->get_element_type(); - if (type == ov::element::i4 || type == ov::element::u4 || type == ov::element::i8 || - type == ov::element::u8) { - m.weights_to_unpack.insert(i - fbody._param_offset); - } + // FIXME: type should be queried from a lazy tensor + // and compared against param->get_element_type() + // to decide 100% + m.weights_to_unpack.insert(i - fbody._param_offset); } // Process zero points for parameters @@ -709,7 +707,7 @@ DCOFFPassReshape4::DCOFFPassReshape4(DCOffMode dcoff_mode, ov::element::Type dco auto matched_paramA = std::static_pointer_cast(matched_nodeA); auto matched_paramC = std::static_pointer_cast(matched_nodeC); - auto matched_out_mulply = node_to_output.at(mulply); + const auto& matched_out_mulply = node_to_output.at(mulply); if (ov::element::i4 == matched_paramA->get_element_type() && (ov::element::f16 == matched_paramC->get_element_type() || diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp index 968039e88758a1..db9666b9485546 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/patterns/opt.cpp @@ -166,8 +166,8 @@ DQMatMulCWi::DQMatMulCWi(Context::Ref ctx) { auto matched_node_cvtw = node_to_output.at(qcvtw).get_node_shared_ptr(); auto matched_node_muls = node_to_output.at(qmuls).get_node_shared_ptr(); auto matched_node_mmi = node_to_output.at(qmmi).get_node_shared_ptr(); - auto matched_node_qcoeff_out = uat::_(node_to_output).at_or_at_or_at(qcvtc, reshapec, qcoeff); - auto matched_node_muls_out = uat::_(node_to_output).at_or_at(qcvtm, qmuls); + auto& matched_node_qcoeff_out = uat::_(node_to_output).at_or_at_or_at(qcvtc, reshapec, qcoeff); + auto& matched_node_muls_out = uat::_(node_to_output).at_or_at(qcvtm, qmuls); if (!ctx.get().mm_dq_full) { const auto& matm_mul_out_shape = matched_matmul->get_output_shape(0); @@ -1432,7 +1432,7 @@ SliceLastMatmul::SliceLastMatmul() { auto callback = [=](ov::pass::pattern::Matcher& m) { auto& node_to_output = m.get_pattern_value_map(); - auto matched_out_matmul = node_to_output.at(matmul); + auto& matched_out_matmul = node_to_output.at(matmul); auto shape = matched_out_matmul.get_node()->input(0).get_shape(); @@ -1468,7 +1468,7 @@ SliceLastMatmulAdd::SliceLastMatmulAdd() { auto callback = [=](ov::pass::pattern::Matcher& m) { auto& node_to_output = m.get_pattern_value_map(); - auto matched_out_matmul = node_to_output.at(matmul); + auto& matched_out_matmul = node_to_output.at(matmul); auto shape = matched_out_matmul.get_node()->input(0).get_shape(); @@ -1504,7 +1504,7 @@ SliceLastMatmulTranspose::SliceLastMatmulTranspose() { auto callback = [=](ov::pass::pattern::Matcher& m) { auto& node_to_output = m.get_pattern_value_map(); - auto matched_out_matmul = node_to_output.at(matmul); + auto& matched_out_matmul = node_to_output.at(matmul); auto shape = matched_out_matmul.get_node()->input(0).get_shape(); @@ -1542,7 +1542,7 @@ SliceLastMatmulMultiply::SliceLastMatmulMultiply() { auto callback = [=](ov::pass::pattern::Matcher& m) { auto& node_to_output = m.get_pattern_value_map(); - auto matched_out_matmul = 
node_to_output.at(matmul); + auto& matched_out_matmul = node_to_output.at(matmul); auto shape = matched_out_matmul.get_node()->input(0).get_shape(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/spatial.cpp b/src/plugins/intel_npu/src/plugin/npuw/spatial.cpp index a7ea56dd3ff910..ca72023dfdc7de 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/spatial.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/spatial.cpp @@ -13,7 +13,7 @@ ov::npuw::runtime::spatial::AttentionMask::AttentionMask(std::size_t param_idx, ov::npuw::runtime::spatial::Selector::Ptr ov::npuw::runtime::spatial::AttentionMask::find( const ov::ISyncInferRequest& rq) { auto is_attn_mask = [](const ov::Output& p) { - const auto shape = p.get_shape(); + const auto& shape = p.get_shape(); return p.get_node()->get_friendly_name() == "attention_mask" && (shape.size() == 1 || (shape.size() == 2 && shape[0] == 1)); }; diff --git a/src/plugins/intel_npu/src/plugin/npuw/util.cpp b/src/plugins/intel_npu/src/plugin/npuw/util.cpp index e9cab91e60bdb0..ffefb747ffb18f 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/util.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/util.cpp @@ -336,7 +336,7 @@ ov::SoPtr ov::npuw::util::view(const ov::SoPtr& src, view_shape.push_back(to[d] - from[d]); } - const auto strides = src->get_strides(); + const auto& strides = src->get_strides(); uint8_t* ptr = static_cast(src->data()); // Shift PTR according to the strides @@ -352,7 +352,7 @@ ov::SoPtr ov::npuw::util::view(const ov::SoPtr& src, std::size_t dim, std::size_t offset, std::size_t len) { - const auto shape = src->get_shape(); + const auto& shape = src->get_shape(); View view_start = View(shape.size(), 0u); View view_end = shape; view_start[dim] = offset; diff --git a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp index 2b4be1a759c17c..5ff064e7629759 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/weights_bank.cpp @@ -110,9 +110,6 @@ ov::Tensor Bank::eval_and_alloc(const LazyTensor& tensor, return transformed_tensor; } - // Non-CPU case: detach the evaluated LazyTensor from its memory - const_cast(tensor).detach(); - ov::SoPtr remote_tensor; ov::Tensor allocated_tensor; @@ -124,6 +121,12 @@ ov::Tensor Bank::eval_and_alloc(const LazyTensor& tensor, guard.unlock(); // Unlock the guard, map update is done - copy can continue in parallel transformed_tensor.copy_to(allocated_tensor); + + // Detach the evaluated LazyTensor from its memory here - when it is 100% + // not needed anymore (transformations, if any, and copies are done) + // Note: this is the non-CPU path! 
+ const_cast(tensor).detach(); + return allocated_tensor; } diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp index 18a96bff02fb80..da425d5d01a5c3 100644 --- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp +++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp @@ -7,6 +7,7 @@ #include #include "compiled_model.hpp" +#include "npuw/compiled_model.hpp" #include "driver_compiler_adapter.hpp" #include "intel_npu/common/device_helpers.hpp" #include "intel_npu/common/igraph.hpp" @@ -16,7 +17,6 @@ #include "intel_npu/config/npuw.hpp" #include "intel_npu/config/runtime.hpp" #include "intel_npu/utils/zero/zero_init.hpp" -#include "npuw/compiled_model.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/parameter.hpp" #include "openvino/runtime/intel_npu/properties.hpp" @@ -489,6 +489,12 @@ Plugin::Plugin() [](const Config& config) { return config.get(); }}}, + {ov::intel_npu::defer_weights_load.name(), + {true, + ov::PropertyMutability::RW, + [](const Config& config) { + return config.get(); + }}}, // NPU Private // ========= {ov::intel_npu::dma_engines.name(), @@ -544,12 +550,6 @@ Plugin::Plugin() [](const Config& config) { return config.get(); }}}, - {ov::intel_npu::defer_weights_load.name(), - {false, - ov::PropertyMutability::RW, - [](const Config& config) { - return config.get(); - }}}, {ov::intel_npu::dynamic_shape_to_static.name(), {false, ov::PropertyMutability::RW, @@ -637,7 +637,7 @@ std::shared_ptr Plugin::compile_model(const std::shared_ptr< if (localProperties.count(ov::cache_dir.name()) || !_globalConfig.get().empty()) { OPENVINO_THROW("Option 'CACHE_DIR' is not supported with NPU_USE_NPUW!"); } - return std::make_shared(model->clone(), shared_from_this(), localProperties); + return ov::npuw::ICompiledModel::create(model->clone(), shared_from_this(), localProperties); } else { // NPUW is disabled, remove the key from the properties localProperties.erase(useNpuwKey); diff --git a/tests/constraints.txt b/tests/constraints.txt index 2272151565ca8a..004a2c65b5e474 100644 --- a/tests/constraints.txt +++ b/tests/constraints.txt @@ -1,5 +1,5 @@ numpy>=1.16.6,<2.1.0 -attrs==23.2.0 +attrs==24.2.0 distro==1.9.0 h5py>=3.1.0,<3.12.0 Jinja2>=2.11.2 diff --git a/tests/layer_tests/tensorflow_tests/test_tf_BinaryOps.py b/tests/layer_tests/tensorflow_tests/test_tf_BinaryOps.py index 4de252e40442c4..5814c1a5427b9b 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_BinaryOps.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_BinaryOps.py @@ -54,17 +54,13 @@ def create_add_placeholder_const_net(self, x_shape, y_shape, op_type): 'Maximum': tf.raw_ops.Maximum, 'Minimum': tf.raw_ops.Minimum, 'Mod': tf.raw_ops.Mod, - 'LogicalAnd': tf.raw_ops.LogicalAnd, - 'LogicalOr': tf.raw_ops.LogicalOr, 'FloorMod': tf.raw_ops.FloorMod, 'FloorDiv': tf.raw_ops.FloorDiv, 'Xdivy': tf.raw_ops.Xdivy, } input_type = np.float32 - if op_type in ["LogicalAnd", "LogicalOr", "LogicalXor"]: - input_type = bool - elif op_type in ['Pow']: + if op_type in ['Pow']: input_type = np.int32 self.input_type = input_type @@ -89,8 +85,7 @@ def create_add_placeholder_const_net(self, x_shape, y_shape, op_type): @pytest.mark.parametrize('y_shape', [[4], [2, 3, 4]]) @pytest.mark.parametrize("op_type", ['Add', 'AddV2', 'Sub', 'Mul', 'Div', 'RealDiv', 'SquaredDifference', 'Pow', - 'Maximum', 'Minimum', 'Mod', 'LogicalAnd', 'LogicalOr', 'FloorMod', - 'FloorDiv', 'Xdivy']) + 'Maximum', 'Minimum', 'Mod', 'FloorMod', 'FloorDiv', 'Xdivy']) @pytest.mark.nightly 
@pytest.mark.precommit @pytest.mark.xfail(condition=platform.system() == 'Darwin' and platform.machine() == 'arm64', diff --git a/tests/layer_tests/tensorflow_tests/test_tf_LogicalBinaryOps.py b/tests/layer_tests/tensorflow_tests/test_tf_LogicalBinaryOps.py new file mode 100644 index 00000000000000..e89dc96fedc7c6 --- /dev/null +++ b/tests/layer_tests/tensorflow_tests/test_tf_LogicalBinaryOps.py @@ -0,0 +1,54 @@ +# Copyright (C) 2018-2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import pytest +import tensorflow as tf +from common.tf_layer_test_class import CommonTFLayerTest + +rng = np.random.default_rng(23345) + + +class TestLogicalBinaryOps(CommonTFLayerTest): + def _prepare_input(self, inputs_info): + assert 'x:0' in inputs_info, "Test error: inputs_info must contain `x`" + assert 'y:0' in inputs_info, "Test error: inputs_info must contain `y`" + x_shape = inputs_info['x:0'] + y_shape = inputs_info['y:0'] + + inputs_data = {} + inputs_data['x:0'] = rng.choice([True, False], x_shape).astype(bool) + inputs_data['y:0'] = rng.choice([True, False], y_shape).astype(bool) + return inputs_data + + def create_logical_binary_ops_net(self, x_shape, y_shape, op_type): + op_type_map = { + 'LogicalAnd': tf.raw_ops.LogicalAnd, + 'LogicalOr': tf.raw_ops.LogicalOr, + } + + tf.compat.v1.reset_default_graph() + # Create the graph and model + with tf.compat.v1.Session() as sess: + x = tf.compat.v1.placeholder(bool, x_shape, 'x') + y = tf.compat.v1.placeholder(bool, y_shape, 'y') + op_type_map[op_type](x=x, y=y, name=op_type) + + tf.compat.v1.global_variables_initializer() + tf_net = sess.graph_def + + ref_net = None + + return tf_net, ref_net + + @pytest.mark.parametrize('x_shape', [[], [4], [3, 4], [2, 3, 4]]) + @pytest.mark.parametrize('y_shape', [[2, 3, 4]]) + @pytest.mark.parametrize("op_type", ['LogicalAnd', 'LogicalOr']) + @pytest.mark.nightly + @pytest.mark.precommit + def test_logical_binary_op(self, x_shape, y_shape, op_type, + ie_device, precision, ir_version, + temp_dir, use_legacy_frontend): + self._test(*self.create_logical_binary_ops_net(x_shape=x_shape, y_shape=y_shape, op_type=op_type), + ie_device, precision, ir_version, + temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) diff --git a/tests/model_hub_tests/pytorch/detectron2_precommit b/tests/model_hub_tests/pytorch/detectron2_precommit index 155e4d2a359779..f98e44ad21871f 100644 --- a/tests/model_hub_tests/pytorch/detectron2_precommit +++ b/tests/model_hub_tests/pytorch/detectron2_precommit @@ -1,13 +1,8 @@ -COCO-Detection/faster_rcnn_R_50_C4_1x,none -COCO-Detection/faster_rcnn_R_50_DC5_3x,none COCO-Detection/faster_rcnn_R_50_FPN_1x,none COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x,none COCO-Detection/retinanet_R_50_FPN_1x,none COCO-Detection/rpn_R_50_C4_1x,none -COCO-Detection/rpn_R_50_FPN_1x,none COCO-InstanceSegmentation/mask_rcnn_R_50_FPN_1x,none -COCO-InstanceSegmentation/mask_rcnn_R_50_C4_3x,none -COCO-InstanceSegmentation/mask_rcnn_R_50_DC5_3x,none COCO-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_3x,none COCO-Keypoints/keypoint_rcnn_R_50_FPN_3x,none COCO-Keypoints/keypoint_rcnn_X_101_32x8d_FPN_3x,none @@ -19,8 +14,6 @@ LVISv0.5-InstanceSegmentation/mask_rcnn_R_50_FPN_1x,none LVISv0.5-InstanceSegmentation/mask_rcnn_X_101_32x8d_FPN_1x,none Misc/cascade_mask_rcnn_R_50_FPN_3x,none Misc/cascade_mask_rcnn_X_152_32x8d_FPN_IN5k_gn_dconv,none -Misc/mask_rcnn_R_50_FPN_3x_dconv_c3-c5,none -Misc/mask_rcnn_R_50_FPN_3x_gn,none Misc/mask_rcnn_R_50_FPN_3x_syncbn,none 
Misc/scratch_mask_rcnn_R_50_FPN_9x_syncbn,none PascalVOC-Detection/faster_rcnn_R_50_C4,none diff --git a/tests/model_hub_tests/pytorch/test_llm.py b/tests/model_hub_tests/pytorch/test_llm.py index e444f93db9d7ec..ba48634a070e39 100644 --- a/tests/model_hub_tests/pytorch/test_llm.py +++ b/tests/model_hub_tests/pytorch/test_llm.py @@ -15,10 +15,10 @@ from torch_utils import TestTorchConvertModel -def is_gptq_model(config): +def is_quantized_model(config): config_dict = config.to_dict() if not isinstance(config, dict) else config quantization_config = config_dict.get("quantization_config", None) - return quantization_config and quantization_config["quant_method"] == "gptq" + return quantization_config and quantization_config["quant_method"] in ["gptq", "awq"] def patch_gptq(): @@ -26,35 +26,83 @@ def patch_gptq(): orig_cuda_is_bf16_supported = torch.cuda.is_bf16_supported orig_cuda_get_device_capability = torch.cuda.get_device_capability orig_post_init_model = None + orig_gemm_forward = None torch.set_default_dtype(torch.float32) torch.cuda.is_available = lambda: True torch.cuda.is_bf16_supported = lambda: False torch.cuda.get_device_capability = lambda n: (9, 1) - from optimum.gptq import GPTQQuantizer + try: + from optimum.gptq import GPTQQuantizer - orig_post_init_model = GPTQQuantizer.post_init_model + orig_post_init_model = GPTQQuantizer.post_init_model - def post_init_model(self, model): - from auto_gptq import exllama_set_max_input_length + def post_init_model(self, model): + from auto_gptq import exllama_set_max_input_length - class StoreAttr(object): - pass + class StoreAttr(object): + pass - model.quantize_config = StoreAttr() - model.quantize_config.desc_act = self.desc_act - if self.desc_act and not self.disable_exllama and self.max_input_length is not None: - model = exllama_set_max_input_length(model, self.max_input_length) - return model + model.quantize_config = StoreAttr() + model.quantize_config.desc_act = self.desc_act + if self.desc_act and not self.disable_exllama and self.max_input_length is not None: + model = exllama_set_max_input_length(model, self.max_input_length) + return model + + GPTQQuantizer.post_init_model = post_init_model + except ImportError: + pass + + try: + # patch GEMM module to work without CUDA GPU + from awq.modules.linear.gemm import WQLinearMMFunction + from awq.utils.packing_utils import dequantize_gemm + + def new_forward( + ctx, + x, + qweight, + qzeros, + scales, + w_bit=4, + group_size=128, + bias=None, + out_features=0, + ): + ctx.out_features = out_features + + out_shape = x.shape[:-1] + (out_features,) + x = x.to(torch.float16) - GPTQQuantizer.post_init_model = post_init_model - return (orig_cuda_is_available, orig_cuda_is_bf16_supported, orig_cuda_get_device_capability), orig_post_init_model + out = dequantize_gemm(qweight, qzeros, scales, w_bit, group_size) + out = torch.matmul(x, out) + out = out + bias if bias is not None else out + out = out.reshape(out_shape) -def unpatch_gptq(orig_cuda_check, orig_post_init_model): - from optimum.gptq import GPTQQuantizer + if len(out.shape) == 2: + out = out.unsqueeze(0) + return out + + orig_gemm_forward = WQLinearMMFunction.forward + WQLinearMMFunction.forward = new_forward + except ImportError: + pass + return (orig_cuda_is_available, orig_cuda_is_bf16_supported, orig_cuda_get_device_capability), orig_post_init_model, orig_gemm_forward + + +def unpatch_gptq(orig_cuda_check, orig_post_init_model, orig_gemm_forward): torch.cuda.is_available, torch.cuda.is_bf16_supported, 
torch.cuda.get_device_capability = orig_cuda_check - GPTQQuantizer.post_init_model = orig_post_init_model + try: + from optimum.gptq import GPTQQuantizer + GPTQQuantizer.post_init_model = orig_post_init_model + except ImportError: + pass + try: + from awq.modules.linear.gemm import WQLinearMMFunction + WQLinearMMFunction.forward = orig_gemm_forward + except ImportError: + pass def to_numpy(t): @@ -88,7 +136,7 @@ def flattenize_outputs(outputs): class TestLLMModel(TestTorchConvertModel): def setup_class(self): self.infer_timeout = 1800 - self.cuda_available, self.gptq_postinit = None, None + self.cuda_available, self.gptq_postinit, self.orig_gemm_forward = None, None, None @retry(3, exceptions=(OSError,), delay=1) def load_model(self, name, type): @@ -99,11 +147,12 @@ def load_model(self, name, type): except Exception: config = {} model_kwargs = {"torchscript": True, "trust_remote_code": True} - is_gptq = is_gptq_model(config) + is_quant = is_quantized_model(config) is_gpt2 = name == "openai-community/gpt2" - if is_gptq: - self.cuda_available, self.gptq_postinit = patch_gptq() + if is_quant: + self.cuda_available, self.gptq_postinit, self.orig_gemm_forward = patch_gptq() + model_kwargs["torch_dtype"] = "auto" model_kwargs["torch_dtype"] = torch.float32 self.ov_config = {"DYNAMIC_QUANTIZATION_GROUP_SIZE": "0"} elif is_gpt2: @@ -113,7 +162,7 @@ def load_model(self, name, type): t = AutoTokenizer.from_pretrained(name, trust_remote_code=True) self.model = AutoModelForCausalLM.from_pretrained(name, **model_kwargs) - if is_gptq: + if is_quant: model = self.model else: assert self.model.config.torch_dtype in [ @@ -175,8 +224,8 @@ def convert_model_impl(self, model_obj): def teardown_method(self): # restore after gptq patching if self.cuda_available is not None: - unpatch_gptq(self.cuda_available, self.gptq_postinit) - self.cuda_available, self.gptq_postinit = None, None + unpatch_gptq(self.cuda_available, self.gptq_postinit, self.orig_gemm_forward) + self.cuda_available, self.gptq_postinit, self.orig_gemm_forward = None, None, None super().teardown_method() @staticmethod @@ -191,7 +240,8 @@ def get_pkv(model, tokenizer): @pytest.mark.parametrize("type,name", [ ("opt_gptq", "katuni4ka/opt-125m-gptq"), ("llama", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"), - ("gpt2", "openai-community/gpt2") + ("gpt2", "openai-community/gpt2"), + ("llama_awq", "casperhansen/tinyllama-1b-awq") ]) @pytest.mark.precommit @pytest.mark.nightly @@ -210,6 +260,7 @@ def test_convert_model_precommit(self, name, type, ie_device): ("bloom_gptq", "sbolouki/bloom-1b7-gptq"), ("cohere_gptq", "shuyuej/aya-23-8B-GPTQ"), ("mbart_gptq", "Shivam098/opt-translation"), + ("llama_awq", "TheBloke/open-llama-3b-v2-wizard-evol-instuct-v2-196k-AWQ") ]) @pytest.mark.nightly def test_convert_model_nightly(self, name, type, ie_device): @@ -236,6 +287,8 @@ def test_convert_model_nightly(self, name, type, ie_device): marks=pytest.mark.xfail(reason="GPTQ QUANT_TYPE=cuda is not supported")), pytest.param("llama3_gptq", "TechxGenus/Meta-Llama-3-8B-GPTQ", marks=pytest.mark.xfail(reason="GPTQ QUANT_TYPE=cuda is not supported")), + ("qwen2_awq", "Qwen/Qwen2.5-Coder-32B-Instruct-AWQ"), + ("mixstral_awq", "TheBloke/SauerkrautLM-Mixtral-8x7B-AWQ"), ]) def test_convert_model_very_large(self, name, type, ie_device): self.run(model_name=name, model_link=type, ie_device=ie_device) diff --git a/tests/requirements_pytorch b/tests/requirements_pytorch index 56446beba12600..be304155e2afc0 100644 --- a/tests/requirements_pytorch +++ b/tests/requirements_pytorch @@ -19,6 
+19,7 @@ pytest-html==4.1.1 pytest-xdist[psutil]==3.6.1 defusedxml==0.7.1 +autoawq==0.2.7; platform_system == "Linux" and platform_machine == "x86_64" auto-gptq==0.7.1; platform_system == "Linux" and platform_machine == "x86_64" and python_version < "3.12" av==13.0.0 basicsr==1.4.2; python_version < "3.12" diff --git a/tools/ovc/openvino/tools/ovc/__init__.py b/tools/ovc/openvino/tools/ovc/__init__.py index 5b750b58969d24..3785d45324713f 100644 --- a/tools/ovc/openvino/tools/ovc/__init__.py +++ b/tools/ovc/openvino/tools/ovc/__init__.py @@ -1,8 +1,9 @@ # Copyright (C) 2018-2024 Intel Corporation # SPDX-License-Identifier: Apache-2.0 +import sys from openvino.tools.ovc.convert import convert_model -from openvino.tools.ovc.telemetry_utils import is_optimum, init_ovc_telemetry +from openvino.tools.ovc.telemetry_utils import is_optimum, init_ovc_telemetry, is_torch_compile import importlib.metadata as importlib_metadata @@ -11,10 +12,15 @@ except importlib_metadata.PackageNotFoundError: optimum_version = None -from openvino.runtime import get_version as get_rt_version # pylint: disable=no-name-in-module,import-error +from openvino import get_version as get_rt_version # pylint: disable=no-name-in-module,import-error telemetry = init_ovc_telemetry('OpenVINO') telemetry.send_event("ov", "import", "general_import") if is_optimum() and optimum_version is not None: telemetry = init_ovc_telemetry("Optimum Intel", optimum_version) telemetry.send_event("optimum", "import", "import_from_optimum,ov_version:{}".format(get_rt_version())) + +if is_torch_compile() and 'torch' in sys.modules: + torch_version = importlib_metadata.version("torch") + telemetry = init_ovc_telemetry("torch.compile", torch_version) + telemetry.send_event("torch.compile", "import", "Import from torch.compile(), ov_version: {}".format(get_rt_version())) diff --git a/tools/ovc/openvino/tools/ovc/convert.py b/tools/ovc/openvino/tools/ovc/convert.py index 77693ad4be2ca1..1bd61ff567e5d0 100644 --- a/tools/ovc/openvino/tools/ovc/convert.py +++ b/tools/ovc/openvino/tools/ovc/convert.py @@ -85,7 +85,11 @@ def convert_model( list of paths, objects derived from BaseExtension class or lists of objects. :param verbose: - Print detailed information about conversion. + Print detailed information about conversion. The detailed information is logged via the standard logging library. + The log level can be changed using the logging library. + Example: + import logging + logging.getLogger().setLevel(logging.DEBUG) :param share_weights: Reuse weights allocated in the original model. If input model is in file, then mmap is used to allocate weights directly from file. If input model is diff --git a/tools/ovc/openvino/tools/ovc/convert_impl.py b/tools/ovc/openvino/tools/ovc/convert_impl.py index aef054f8aafc24..152ff03c28e71a 100644 --- a/tools/ovc/openvino/tools/ovc/convert_impl.py +++ b/tools/ovc/openvino/tools/ovc/convert_impl.py @@ -243,8 +243,6 @@ def check_model_object(argv): def driver(argv: argparse.Namespace, non_default_params: dict): - init_logger('ERROR', argv.verbose) - # Log dictionary with non-default cli parameters where complex classes are excluded. 
log.debug(str(non_default_params)) @@ -433,7 +431,11 @@ def _convert(cli_parser: argparse.ArgumentParser, args, python_api_used): telemetry.send_event('ovc', 'version', simplified_ie_version) # Initialize logger with 'ERROR' as default level to be able to form nice messages # before arg parser deliver log_level requested by user - init_logger('ERROR', False) + verbose = False + if "verbose" in args and args["verbose"] or "--verbose" in sys.argv: + verbose = True + + init_logger('ERROR', verbose, python_api_used) argv = None # Minimize modifications among other places in case if multiple pieces are passed as input_model if python_api_used: diff --git a/tools/ovc/openvino/tools/ovc/help.py b/tools/ovc/openvino/tools/ovc/help.py index e09102be39419e..4f312ef20be99c 100644 --- a/tools/ovc/openvino/tools/ovc/help.py +++ b/tools/ovc/openvino/tools/ovc/help.py @@ -43,4 +43,6 @@ def get_convert_model_help_specifics(): {'action': 'version', # FIXME: Why the following is not accessible from arg parser? 'version': 'OpenVINO Model Converter (ovc) {}'.format(VersionChecker().get_ie_version())}, + 'verbose': + {'description': 'Print detailed information about conversion.'} } diff --git a/tools/ovc/openvino/tools/ovc/logger.py b/tools/ovc/openvino/tools/ovc/logger.py index f3c24a8582d0a2..46bd043cf207b9 100644 --- a/tools/ovc/openvino/tools/ovc/logger.py +++ b/tools/ovc/openvino/tools/ovc/logger.py @@ -62,7 +62,10 @@ def filter(self, record: log.LogRecord): return True # if regex wasn't set print all logs -def init_logger(lvl: str, verbose: bool): +def init_logger(lvl: str, verbose: bool, python_api_used: bool): + if verbose and python_api_used: + # Do not override the logger when verbose=True so the user can set the log level themselves + return global handler_num log_exp = os.environ.get('MO_LOG_PATTERN') if not verbose: diff --git a/tools/ovc/openvino/tools/ovc/telemetry_utils.py b/tools/ovc/openvino/tools/ovc/telemetry_utils.py index 412d9b9607541e..f68a92be5d2de5 100644 --- a/tools/ovc/openvino/tools/ovc/telemetry_utils.py +++ b/tools/ovc/openvino/tools/ovc/telemetry_utils.py @@ -24,6 +24,12 @@ def is_optimum(): return True return False +def is_torch_compile(): + import traceback + for line in traceback.format_stack(): + if os.path.join("torch", "_dynamo", "backends", "registry.py") in line: + return True + return False def init_ovc_telemetry(app_name='OVC', app_version=None): app_version = app_version if app_version is not None else get_rt_version()