diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b940476e..deb46775e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,14 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_LIST_DIR}/cmake" ) + +if(WIN32) + if(NOT MSVC) + set(CMAKE_MSVC_RUNTIME_LIBRARY "") + set(MSVC_RUNTIME_LIBRARY "") + endif() +endif() + # build options option(MNN_USE_SYSTEM_LIB "For opencl and vulkan, use system lib or use dlopen" OFF) option(MNN_BUILD_HARD "Build -mfloat-abi=hard or not" OFF) @@ -198,7 +206,7 @@ option(MNN_METAL "Enable Metal" OFF) option(MNN_OPENCL "Enable OpenCL" OFF) option(MNN_OPENGL "Enable OpenGL" OFF) option(MNN_VULKAN "Enable Vulkan" OFF) -option(MNN_ARM82 "Enable ARM82" OFF) +option(MNN_ARM82 "Enable ARMv8.2's FP16 Compute" ON) option(MNN_ONEDNN "Enable oneDNN" OFF) option(MNN_AVX512 "Enable AVX512" OFF) option(MNN_CUDA "Enable CUDA" OFF) @@ -452,6 +460,12 @@ set(MNN_EXTRA_DEPENDS "") # Add Thread dependency find_package(Threads) list(APPEND MNN_EXTRA_DEPENDS ${CMAKE_THREAD_LIBS_INIT}) +if(WIN32) + if(NOT MSVC) + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=lld-link -lmsvcrt") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=lld-link -lmsvcrt") + endif() +endif() if (NOT APPLE) if(MNN_OPENMP) diff --git a/MNN.sln b/MNN.sln index 7610b9e73..e69de29bb 100644 --- a/MNN.sln +++ b/MNN.sln @@ -1,36 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 17 -VisualStudioVersion = 17.5.002.0 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "3rd_party", "3rd_party", "{5CD18987-C4CA-49D5-942F-14B15F46B1ED}" -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "flatbuffers", "flatbuffers", "{89B04BB7-86CB-4D4F-B65C-C3D0995DBD36}" -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{797AC14A-1653-469D-A240-76EF0F36E60A}" -EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FlatBuffers.Test", "3rd_party\flatbuffers\tests\FlatBuffers.Test\FlatBuffers.Test.csproj", "{E5A80CC7-62B1-4887-B637-455F34CCC9B3}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Any CPU = Debug|Any CPU - Release|Any CPU = Release|Any CPU - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Debug|Any CPU.Build.0 = Debug|Any CPU - {E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Release|Any CPU.ActiveCfg = Release|Any CPU - {E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Release|Any CPU.Build.0 = Release|Any CPU - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(NestedProjects) = preSolution - {89B04BB7-86CB-4D4F-B65C-C3D0995DBD36} = {5CD18987-C4CA-49D5-942F-14B15F46B1ED} - {797AC14A-1653-469D-A240-76EF0F36E60A} = {89B04BB7-86CB-4D4F-B65C-C3D0995DBD36} - {E5A80CC7-62B1-4887-B637-455F34CCC9B3} = {797AC14A-1653-469D-A240-76EF0F36E60A} - EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution - SolutionGuid = {11D826E1-518B-4BC2-8E45-03F5F48170D6} - EndGlobalSection -EndGlobal diff --git a/backupcode/cpubackend/arm/arm32/bf16/MNNConvRunForUnitDepthWise_BF16.S b/backupcode/cpubackend/arm/arm32/bf16/MNNConvRunForUnitDepthWise_BF16.S deleted file mode 100644 index 2312a004a..000000000 --- a/backupcode/cpubackend/arm/arm32/bf16/MNNConvRunForUnitDepthWise_BF16.S +++ 
/dev/null @@ -1,77 +0,0 @@ -// -// NEON_MNNConvRunForUnitDepthWise_BF16.S -// MNN -// -// Created by MNN on 2021/03/09. -// Copyright © 2018-2021 Alibaba Group Holding Limited -// - -#ifdef __arm__ -#ifndef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function NEON_MNNConvRunForUnitDepthWise_BF16 -//void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step) - -//Auto: r0:dst, r1:src, r2:weight, r3:fw - -push {r4-r8, lr} - -//Load from sp: -//r5:fh, r6:weight_y_step, r7:dilate_x_step, r8:dilate_y_step -mov r4, r3 -ldr r5, [sp, #24] -ldr r6, [sp, #28] -ldr r7, [sp, #32] -ldr r8, [sp, #36] - -cmp r4, #0 -vmov.i32 q0, #0 -beq UnitEnd -cmp r5, #0 -beq UnitEnd - -mov lr, #2 -mul r6, lr, r6 // x6(weight_y_step in byte) = sizeof(int16_t) * weight_y_step -mul r7, lr, r7 // x7(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step -mul r8, lr, r8 // x8(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step - -//dilate_y_step -> dilate_y_step - dilate_x_step*fw -mul lr, r4, r7 -sub r8, r8, lr - -//weight_y_step -> weight_y_step - 4*sizeof(float)*fw -mov lr, #8 -mul lr, r4, lr -sub r6, r6, lr - - -UnitLoopH: -mov lr, r4 -UnitLoopW: -vld1.16 {d2}, [r1], r7 -vld1.16 {d4}, [r2]! -vshll.s16 q1, d2, #16 -vshll.s16 q2, d4, #16 - -vmla.f32 q0, q1, q2 -subs lr, lr, #1 -bne UnitLoopW -subs r5, r5, #1 -add r1, r1, r8 -add r2, r2, r6 -bne UnitLoopH - - -UnitEnd: -vshrn.i32 d0, q0, #16 -vst1.16 {d0}, [r0] - -pop {r4-r8, pc} - -#endif -#endif diff --git a/backupcode/cpubackend/arm/arm64/bf16/MNNConvRunForUnitDepthWise_BF16.S b/backupcode/cpubackend/arm/arm64/bf16/MNNConvRunForUnitDepthWise_BF16.S deleted file mode 100644 index 75254f555..000000000 --- a/backupcode/cpubackend/arm/arm64/bf16/MNNConvRunForUnitDepthWise_BF16.S +++ /dev/null @@ -1,66 +0,0 @@ -// -// NEON_MNNConvRunForUnitDepthWise_BF16.S -// MNN -// -// Created by MNN on 2021/03/09. 
-// Copyright © 2018-2021 Alibaba Group Holding Limited -// - -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function NEON_MNNConvRunForUnitDepthWise_BF16 -//void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step) - -//Auto: x0:dst, x1:src, x2:weight, x3:fw -//x4:fh, x5:weight_y_step, x6:dilate_x_step, x7:dilate_y_step - -cmp x3, #0 -movi v0.4s, #0 -beq UnitEnd -cmp x4, #0 -beq UnitEnd - -mov x9, #2 -mul x5, x9, x5 // x5(weight_y_step in byte) = sizeof(int16_t) * weight_y_step -mul x6, x9, x6 // x6(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step -mul x7, x9, x7 // x7(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step - -//dilate_y_step -> dilate_y_step - dilate_x_step*fw -mul x9, x3, x6 -sub x7, x7, x9 // because x1 has already been auto-increased at 'ld1 {v1.4h}, [x1], x6', here we should rewind by x6*fw - -//weight_y_step -> weight_y_step - 4*sizeof(int16_t)*fw -mov x9, #8 -mul x9, x3, x9 -sub x5, x5, x9 - - -UnitLoopH: -mov x9, x3 -UnitLoopW: -ld1 {v1.4h}, [x1], x6 -ld1 {v2.4h}, [x2], #8 // 4 * sizeof(int16_t) -shll v1.4s, v1.4h, #16 -shll v2.4s, v2.4h, #16 - -fmla v0.4s, v1.4s, v2.4s -subs x9, x9, #1 -bne UnitLoopW -subs x4, x4, #1 -add x1, x1, x7 -add x2, x2, x5 -bne UnitLoopH - - -UnitEnd: -shrn v0.4h, v0.4s, #16 -st1 {v0.4h}, [x0] - -ret - -#endif diff --git a/backupcode/cpubackend/bf16/BF16Functions.cpp b/backupcode/cpubackend/bf16/BF16Functions.cpp index 3f792a3ce..f9986c438 100644 --- a/backupcode/cpubackend/bf16/BF16Functions.cpp +++ b/backupcode/cpubackend/bf16/BF16Functions.cpp @@ -76,23 +76,6 @@ static void _MNNLowpToFp32(const int16_t* src, float* dst, size_t size) { ::memcpy(dst, dstTemp, sizeRemain * sizeof(float)); } } -static void MNNConvRunForUnitDepthWiseBF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { - int fx, fy; - BFVec4 dstValue(0.0f); - const int16_t* src_z = (const int16_t*)src; - const int16_t* weight_z = (const int16_t*)weight; - for (fy = 0; fy < fh; ++fy) { - const auto src_y = src_z + fy * dilateY_step; - const auto weight_y = weight_z + fy * weight_y_step; - for (fx = 0; fx < fw; ++fx) { - const auto weight_x = weight_y + 4 * fx; - const auto src_x = src_y + fx * dilateX_step; - dstValue = dstValue + BFVec4::load(src_x) * BFVec4::load(weight_x); - } - } - BFVec4::save((int16_t*)dst, dstValue); -} static void MNNConvRunForLineDepthwiseBF16(float* dstO, const float* srcO, const float* weightO, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, @@ -823,7 +806,6 @@ static CoreFunctions* gInstance = nullptr; bool BF16Functions::init() { gInstance = new CoreFunctions; gInstance->MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwiseBF16; - gInstance->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWiseBF16; gInstance->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnitBF16; gInstance->MNNFp32ToLowp = _MNNFp32ToLowp; gInstance->MNNLowpToFp32 = _MNNLowpToFp32; @@ -890,7 +872,6 @@ bool BF16Functions::init() { gInstance->MNNPackedMatMul = NEON_MNNPackedMatMul_BF16; gInstance->MNNPackedMatMulRemain = NEON_MNNPackedMatMulRemain_BF16; gInstance->MNNConvRunForLineDepthwise = NEON_MNNConvRunForLineDepthwise_BF16; - gInstance->MNNConvRunForUnitDepthWise = NEON_MNNConvRunForUnitDepthWise_BF16; gInstance->MNNAxByClampBroadcastUnit = 
NEON_MNNAxByClampBroadcastC4_BF16; #ifdef __aarch64__ cpuinfo_arm_isa gCPUInfo; diff --git a/docs/compile/cmake.md b/docs/compile/cmake.md index 95f9d5760..f9927b4f9 100644 --- a/docs/compile/cmake.md +++ b/docs/compile/cmake.md @@ -38,7 +38,7 @@ MNN使用CMake构建项目,CMake中的宏定义列表如下: | MNN_OPENCL | 是否构建`OpenCL`后端,默认为`OFF` | | MNN_OPENGL | 是否构建`OpenGL`后端,默认为`OFF` | | MNN_VULKAN | 是否构建`Vulkan`后端,默认为`OFF` | -| MNN_ARM82 | 是否构建`Armv8.2`后端,默认为`OFF` | +| MNN_ARM82 | 编译ARM架构时,是否构建`Armv8.2`后端,以支持FP16计算,默认为`ON` | | MNN_ONEDNN | 是否使用`oneDNN`,默认为`OFF` | | MNN_AVX512 | 是否构建`avx512`后端,默认为`OFF` | | MNN_CUDA | 是否构建`Cuda`后端,默认为`OFF` | diff --git a/docs/compile/engine.md b/docs/compile/engine.md index eb8eb6503..200124725 100644 --- a/docs/compile/engine.md +++ b/docs/compile/engine.md @@ -22,37 +22,45 @@ ```bash mkdir build && cd build && cmake .. && make -j8 ``` -## Windows +## Windows(非ARM架构) - 环境要求 - Microsoft Visual Studio >= 2017 - cmake >= 3.13 - - powershell - Ninja - 相关编译选项 - 同`Linux/MacOS` - 具体步骤 - 1. opencl/vulkan - - *(可选)*下载GPU Caps Viewer,你可以通过这个工具来查看本机设备的详细信息(opencl、opengl、vulkan等) - - sdk和驱动准备 - - [opencl sdk](https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases),将opencl sdk目录的路径加到AMDAPPSDKROOT环境变量 - - [vulkan sdk](https://vulkan.lunarg.com/),将vulkan skd路径加入VULKAN_SDK环境变量,以备cmake查找 - - [AMD opencl驱动](https://www.amd.com/zh-hans/support) - - [NVIDIA opencl驱动](https://developer.nvidia.com/opencl) - - [AMD vulkan驱动](https://community.amd.com/community/gaming/blog/2016/02/16/radeon-gpus-are-ready-for-the-vulkan-graphics-api) - 2. 编译 - - 64位编译:在设置中找到vcvars64.bat(适用于 VS 2017 的 x64 本机工具命令提示)并单击,打开VS编译x64架构程序的虚拟环境 - - 32位编译:在设置中找到vcvarsamd64_x86.bat(VS 2017的 x64_x86 交叉工具命令提示符)并单击,打开VS交叉编译x86架构程序的虚拟环境 - - 在虚拟环境中执行如下编译命令: - ```bash - cd /path/to/MNN - ./schema/generate.ps1 # 非必须 - mkdir build && cd build - cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=OFF - ninja - ``` - - 若需要编译模型转换工具,cmake 命令加上 -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON - - 若需要编译 MNN CUDA,MNN_WIN_RUNTIME_MT 和 MNN_BUILD_SHARED_LIBS 需要设成 ON ,另外加上 -DMNN_CUDA=ON: cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON -DMNN_WIN_RUNTIME_MT=ON -DMNN_CUDA=ON - - Windows 上建议使用 Interpreter::destroy , Tensor::destroy , Module::destroy 等方法进行 MNN 相关内存对象的析构,不要直接使用 delete (直接使用 delete 在 -DMNN_WIN_RUNTIME_MT=ON 时会出问题) + - 64位编译:在设置中找到vcvars64.bat(适用于 VS 2017 的 x64 本机工具命令提示)并单击,打开VS编译x64架构程序的虚拟环境 + - 32位编译:在设置中找到vcvarsamd64_x86.bat(VS 2017的 x64_x86 交叉工具命令提示符)并单击,打开VS交叉编译x86架构程序的虚拟环境 + - 在虚拟环境中执行如下编译命令: + ```bash + cd /path/to/MNN + ./schema/generate.ps1 # 非必须 + mkdir build && cd build + cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=OFF + ninja + ``` + - 若需要编译模型转换工具,cmake 命令加上 -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON + - 若需要编译 MNN CUDA,MNN_WIN_RUNTIME_MT 和 MNN_BUILD_SHARED_LIBS 需要设成 ON ,另外加上 -DMNN_CUDA=ON: cmake .. 
-G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON -DMNN_WIN_RUNTIME_MT=ON -DMNN_CUDA=ON + - Windows 上建议使用 Interpreter::destroy , Tensor::destroy , Module::destroy 等方法进行 MNN 相关内存对象的析构,不要直接使用 delete (直接使用 delete 在 -DMNN_WIN_RUNTIME_MT=ON 时会出问题) + +## Windows(ARM架构) +- 环境要求 + - Microsoft Visual Studio >= 2017 + - cmake >= 3.13 + - Ninja + - Clang + - Clang 安装参考: https://learn.microsoft.com/en-us/cpp/build/clang-support-msbuild?view=msvc-170#install-1 +- 相关编译选项 + - 同`Linux/MacOS` +- 具体步骤 + - 打开vs的ARM64命令行工具 + - 进入 MNN 根目录 + - mkdir build && cd build + - cmake .. -G Ninja -DCMAKE_C_COMPILER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\clang.exe" -DCMAKE_CXX_COMPILER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\clang++.exe"  -DCMAKE_LINKER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\lld.exe" -DCMAKE_BUILD_TYPE=Release + - Visual Studio 安装路径不一致的,可自行修改脚本 + - ninja -j16 + ## Android - 环境要求 - cmake >= 3.10 diff --git a/docs/tools/quant.md b/docs/tools/quant.md index 1a66b6e1b..0e4e733c9 100644 --- a/docs/tools/quant.md +++ b/docs/tools/quant.md @@ -39,8 +39,43 @@ MNN现已推出基于TensorFlow/Pytorch的模型压缩工具mnncompress,请查 | ADMM | 使用ADMM方法进行权值量化 | ## 多输入模型的参数设置的特别说明(MNN现阶段仅支持输入数据类型是非图片的多输入模型) -| input_type | `str` | 输入数据的类型,"sequence" | -| path | `str` | 存放校正特征量化系数的输入数据目录 |,例如该目录下包含2个输入数据集input_0和input_1,子目录input_0和input_1中包含模型的输入数据和一个input.json文件。input_0和input_1分别是两个输入输出信息文件夹,可使用 testMNNFromOnnx.py 等脚本生成,参考模型转换的正确性校验部分。 +| 需要特别指定的参数 | 设置值 | +|--------------------|------| +| input_type | `str`:输入数据的类型,"sequence" | +| path | `str`:存放校正特征量化系数的输入数据目录 |, +例如在quant.json文件中 "path": "/home/data/inputs_dir/",你所构造的矫正数据集有两个,分别存放在input_0和input_1子目录下,即"/home/data/inputs_dir/input_0"和"/home/data/inputs_dir/input_1".由GetMNNInfo工具可以得到模型的输入输出名称,例如该模型的输入有三个:data0, data1, data2,输出有两个:out1, out2. 那么在input_0和input_1子目录下分别有六个文件:data0.txt, data1.txt, data2.txt, out1.txt, out2.txt, input.json. 
其中的五个文件名要和模型的输入输出名对应,最后一个input.json文件则描述的是输入名和对应的shape内容: +```json +{ + "inputs": [ + { + "name": "data0", + "shape": [ + 2, + 4, + 64, + 64 + ] + }, + { + "name": "data1", + "shape": [ + 1 + ] + }, + { + "name": "data2", + "shape": [ + 2, + 512, + 768 + ] + } + ], + "outputs": [ + "out1", "out2" + ] +} +``` ## 量化模型的使用 和浮点模型同样使用方法,输入输出仍然为浮点类型 diff --git a/docs/transformers/llm.md b/docs/transformers/llm.md index 0d00de862..4677821cf 100644 --- a/docs/transformers/llm.md +++ b/docs/transformers/llm.md @@ -40,13 +40,16 @@ python llmexport.py \ ├── llm.mnn ├── llm.mnn.json ├── llm.mnn.weight - ├── llm.onnx + ├── onnx/ + ├──llm.onnx + ├──llm.onnx.data ├── llm_config.json └── tokenizer.txt ``` ### 功能 -- 支持将模型为onnx或mnn模型,使用`--export onnx`或`--export mnn` +- 将模型先转为onnx模型,使用`--export onnx`,然后使用./MNNConvert工具将onnx模型转为mnn模型: ./MNNConvert --modelFile ../transformers/llm/export/model/onnx/llm.onnx --MNNModel llm.mnn --keepInputFormat --weightQuantBits=4 -f ONNX --transformerFuse=1 --allowCustomOp +- 更快的方式:直接转为mnn模型,使用`--export mnn`,注意,你需要先安装pymnn或者通过--mnnconvert选项指定MNNConvert工具的地址,两种条件必须满足其中一个。如果没有安装pymnn并且没有通过--mnnconvert指定MNNConvert工具的地址,那么llmexport.py脚本会在目录"../../../build/"下寻找MNNConvert工具,需保证该目录下存在MNNConvert文件。 - 支持对模型进行对话测试,使用`--test $query`会返回llm的回复内容 - 默认会使用onnx-slim对onnx模型进行优化,跳过该步骤使用`--skip_slim` - 支持合并lora权重后导出,指定lora权重的目录使用`--lora_path` diff --git a/express/Executor.cpp b/express/Executor.cpp index 5f6a6dd48..bb54a393e 100644 --- a/express/Executor.cpp +++ b/express/Executor.cpp @@ -32,80 +32,64 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& ScheduleConfig sConfig; sConfig.type = type; type = Schedule::getApprociateType(sConfig); - auto creator = MNNGetExtraRuntimeCreator(type); - MNN_ASSERT(nullptr != creator); - Backend::Info info; - info.type = type; - info.mode = Backend::Info::DIRECT; - info.numThread = numberThread; - if(type == MNN_FORWARD_OPENCL || type == MNN_FORWARD_METAL) { - info.numThread = 4; - } - mAttr->firstType = type; - auto firstIter = mRuntimes.find(mAttr->firstType); - if (firstIter == mRuntimes.end()) { - info.user = (BackendConfig*)&config; - std::shared_ptr bn(creator->onCreate(info)); - mRuntimes[mAttr->firstType] = bn; - } else { - firstIter->second->onReset(numberThread, &config, true); - } - } else { - auto creator = MNNGetExtraRuntimeCreator(type); - if (nullptr == creator) { - MNN_ERROR("Error to find creator of %d, set CPU default\n", type); - type = MNN_FORWARD_CPU; - creator = MNNGetExtraRuntimeCreator(type); - } - MNN_ASSERT(nullptr != creator); - Backend::Info info; - info.type = type; - mAttr->firstType = type; - auto firstIter = mRuntimes.find(mAttr->firstType); - if (firstIter == mRuntimes.end()) { - info.mode = Backend::Info::DIRECT; - info.numThread = numberThread; - info.user = (BackendConfig*)&config; - std::shared_ptr bn(creator->onCreate(info)); - mRuntimes[mAttr->firstType] = bn; - } else { - firstIter->second->onReset(numberThread, &config, true); - } } - _refreshRuntime(); + auto rt = _getOrCreateRuntime(type, &config, numberThread); + if (rt == nullptr) { + type = MNN_FORWARD_CPU; + numberThread = 1; + rt = _getOrCreateRuntime(type, &config, numberThread); + } + MNN_ASSERT(nullptr != rt); + mAttr->firstType = type; } int Executor::getCurrentRuntimeStatus(RuntimeStatus statusEnum) { - return mRuntimes[mAttr->firstType]->onGetRuntimeStatus(statusEnum); + return mRuntimeInfo.first[mAttr->firstType]->onGetRuntimeStatus(statusEnum); +} +std::shared_ptr Executor::_getOrCreateRuntime(MNNForwardType type, const 
BackendConfig* config, int numberThread, bool reset) { + auto iter = mRuntimeInfo.first.find(type); + if (iter != mRuntimeInfo.first.end()) { + iter->second->onReset(numberThread, config, reset); + return iter->second; + } + // Create Backend + auto cre = MNNGetExtraRuntimeCreator(type); + if (nullptr == cre) { + return nullptr; + } + Backend::Info info; + info.type = type; + info.mode = Backend::Info::DIRECT; + info.numThread = numberThread; + info.user = (BackendConfig*)config; + std::shared_ptr rt(cre->onCreate(info)); + if (nullptr != rt) { + mRuntimeInfo.first.insert(std::make_pair(type, rt)); + } + return rt; } void Executor::gc(GCFlag flag) { int level = flag == FULL ? 100 : 0; - for (auto& iter : mRuntimes) { + for (auto& iter : mRuntimeInfo.first) { iter.second->onGabageCollect(level); } } -Executor::Executor(std::shared_ptr backend, MNNForwardType type, int numberThread) { - mRuntimes.insert(std::make_pair(type, backend)); +Executor::Executor(std::shared_ptr runtime, MNNForwardType type, int numberThread) { + mRuntimeInfo.first.insert(std::make_pair(type, runtime)); mAttr.reset(new ExecutorAttr); mAttr->firstType = type; - if (MNN_FORWARD_CPU != type) { - // Create Backup Backend - Backend::Info info; - info.type = MNN_FORWARD_CPU; - auto cre = MNNGetExtraRuntimeCreator(MNN_FORWARD_CPU); - info.mode = Backend::Info::DIRECT; - info.numThread = 1; - std::shared_ptr backupRt(cre->onCreate(info)); - mRuntimes.insert(std::make_pair(DEFAULT_BACKUP_RUNTIME_KEY, backupRt)); + if (type == MNN_FORWARD_CPU) { + mRuntimeInfo.second = runtime; + } else { + mRuntimeInfo.second = _getOrCreateRuntime(MNN_FORWARD_CPU, nullptr, 1); } mDebug.reset(new DebugTools); BackendConfig defaultConfig; defaultConfig.flags = 4; - std::shared_ptr defaultBackend(mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY]->onCreate(&defaultConfig)); + std::shared_ptr defaultBackend(mRuntimeInfo.second->onCreate(&defaultConfig)); mAttr->constantBackend = defaultBackend; - _refreshRuntime(); } Executor::~Executor(){ // Do nothing @@ -176,21 +160,6 @@ std::shared_ptr Executor::newExecutor(MNNForwardType type, auto executor = new Executor(runtime, type, numberThread); return std::shared_ptr(executor); } -void Executor::_refreshRuntime() { - mRuntimeInfo.first.clear(); - mRuntimeInfo.second = mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY]; - auto firstIter = mRuntimes.find(getAttr()->firstType); - if (firstIter != mRuntimes.end()) { - mRuntimeInfo.first.insert(std::make_pair(firstIter->first, firstIter->second)); - } else { - MNN_ASSERT(false); - } - for (auto& iter : mRuntimes) { - if (iter.first != getAttr()->firstType) { - mRuntimeInfo.first.insert(std::make_pair(iter.first, iter.second)); - } - } -} RuntimeInfo Executor::getRuntime() { auto glo = ExecutorScope::Current(); @@ -297,43 +266,26 @@ Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const S auto res = new RuntimeManager; auto glo = ExecutorScope::Current(); std::lock_guard _l(glo->mMutex); - auto& originRt = glo->mRuntimes; - Backend::Info compute; - compute.type = Schedule::getApprociateType(config); - compute.numThread = config.numThread; + auto& originRt = glo->mRuntimeInfo; + auto type = Schedule::getApprociateType(config); + int numThread = config.numThread; if(config.type == MNN_FORWARD_AUTO) { - if(compute.type == MNN_FORWARD_OPENCL || compute.type == MNN_FORWARD_METAL) { + if(type == MNN_FORWARD_OPENCL || type == MNN_FORWARD_METAL) { // AUTO set default gpu-mode MNN_GPU_TUNING_FAST - compute.numThread = 16; + numThread = 16; } } - compute.user = 
config.backendConfig; - auto iter = originRt.find(compute.type); - if (iter == originRt.end()) { - auto creator = MNNGetExtraRuntimeCreator(compute.type); - if (nullptr == creator) { - return nullptr; - } - auto newBn = creator->onCreate(compute); - if (nullptr == newBn) { - MNN_ERROR("Can't create Runtime: %s\n", EnumNameForwardType((ForwardType)compute.type)); - return nullptr; - } - originRt.insert(std::make_pair(compute.type, std::shared_ptr(newBn))); - } else { - iter->second->onReset(compute.numThread, compute.user, false); - } - res->mInside->mRuntime.second = originRt[DEFAULT_BACKUP_RUNTIME_KEY]; - res->mInside->mRuntime.first.insert(std::make_pair(compute.type, originRt[compute.type])); - res->mInside->mInfo = originRt[compute.type]; - res->mInside->mNumberThread = compute.numThread; + auto rt = glo->_getOrCreateRuntime(type, config.backendConfig, numThread, false); + res->mInside->mRuntime.second = originRt.second; + res->mInside->mRuntime.first.insert(std::make_pair(type, rt)); + res->mInside->mInfo = rt; + res->mInside->mNumberThread = numThread; if (nullptr != config.backendConfig) { res->mInside->mConfig = *config.backendConfig; res->mInside->mUserConfig = true; } else { res->mInside->mUserConfig = false; } - glo->_refreshRuntime(); return res; } ExecutorAttr* Executor::getAttr() const { diff --git a/express/module/Module.cpp b/express/module/Module.cpp index a0976bd67..d1dea03dc 100644 --- a/express/module/Module.cpp +++ b/express/module/Module.cpp @@ -379,6 +379,9 @@ static Module* loadInternal(const std::vector& inputs, const std::v if (net->extraInfo() && net->extraInfo()->version()) { info->version = net->extraInfo()->version()->str(); } + if (net->bizCode()) { + info->bizCode = net->bizCode()->str(); + } auto rtMgr = _rtMgr; Module::Config defaultConfig; if (nullptr == config) { diff --git a/express/module/StaticModule.cpp b/express/module/StaticModule.cpp index 31a07c632..33fa14afe 100644 --- a/express/module/StaticModule.cpp +++ b/express/module/StaticModule.cpp @@ -598,6 +598,7 @@ std::vector StaticModule::onForward(const std::vectorgetInfo(Interpreter::FLOPS, &flops); glo->getDebugTools()->flops += flops; #endif + return outputs; } diff --git a/include/MNN/Interpreter.hpp b/include/MNN/Interpreter.hpp index edeceb296..19bb95407 100644 --- a/include/MNN/Interpreter.hpp +++ b/include/MNN/Interpreter.hpp @@ -234,6 +234,8 @@ class MNN_PUBLIC Interpreter { // size limit of kvcache in memory (for a single layer) // if the size of kvcache exceeds the limit, it will be moved to disk KVCACHE_SIZE_LIMIT = 8, + // Op encoder number for commit + OP_ENCODER_NUMBER_FOR_COMMIT = 9, }; enum ExternalPathType { diff --git a/include/MNN/MNNDefine.h b/include/MNN/MNNDefine.h index 8a0af32de..33bc515fd 100644 --- a/include/MNN/MNNDefine.h +++ b/include/MNN/MNNDefine.h @@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \ #define STR(x) STR_IMP(x) #define MNN_VERSION_MAJOR 2 #define MNN_VERSION_MINOR 9 -#define MNN_VERSION_PATCH 5 +#define MNN_VERSION_PATCH 6 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." 
STR(MNN_VERSION_PATCH) #endif /* MNNDefine_h */ diff --git a/include/MNN/expr/Executor.hpp b/include/MNN/expr/Executor.hpp index 367c15d03..3022f7b9d 100644 --- a/include/MNN/expr/Executor.hpp +++ b/include/MNN/expr/Executor.hpp @@ -138,12 +138,10 @@ class MNN_PUBLIC Executor { }; static bool getComputeInfo(EXPRP expr, Interpreter::SessionInfoCode code, void* ptr); private: - void _refreshRuntime(); + std::shared_ptr _getOrCreateRuntime(MNNForwardType type, const BackendConfig* config, int numberThread, bool reset = true); Executor(std::shared_ptr backend, MNNForwardType type, int numberThread); void _makeCache(const std::vector& outputs, bool forceCPU); - // TODO: Remove mRuntimes, only use mRuntimeInfo - std::map> mRuntimes; RuntimeInfo mRuntimeInfo; std::shared_ptr mDebug; std::map> mSubGraph; diff --git a/include/MNN/expr/Module.hpp b/include/MNN/expr/Module.hpp index 1e5562de8..a2e5dc41b 100644 --- a/include/MNN/expr/Module.hpp +++ b/include/MNN/expr/Module.hpp @@ -53,7 +53,7 @@ class MNN_PUBLIC Module { MNNForwardType type = MNN_FORWARD_CPU; BackendConfig* config = nullptr; }; - + struct Config { // Load module as dynamic, default static bool dynamic = false; @@ -75,7 +75,7 @@ class MNN_PUBLIC Module { // Shared RuntimeManager static Module* load(const std::vector& inputs, const std::vector& outputs, const char* fileName, const std::shared_ptr rtMgr, const Config* config = nullptr); static Module* load(const std::vector& inputs, const std::vector& outputs, const uint8_t* buffer, size_t length, const std::shared_ptr rtMgr, const Config* config = nullptr); - + static Module* extract(std::vector inputs, std::vector outputs, bool fortrain, const std::map& subGraph = {}); static Module* clone(const Module* module, const bool shareParams = false); @@ -93,6 +93,8 @@ class MNN_PUBLIC Module { std::vector outputNames; // The MNNConvert's Version build the module std::string version; + // The bizCode of MNN model + std::string bizCode; }; const Info* getInfo() const; class CloneContext { diff --git a/project/ios/MNN.xcodeproj/project.pbxproj b/project/ios/MNN.xcodeproj/project.pbxproj index 535f50d27..c8afc9f93 100644 --- a/project/ios/MNN.xcodeproj/project.pbxproj +++ b/project/ios/MNN.xcodeproj/project.pbxproj @@ -158,8 +158,6 @@ 4896D37825FE2A6B00717702 /* MNNExpFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37025FE2A6A00717702 /* MNNExpFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; 4896D37925FE2A6B00717702 /* MNNPackedMatMulFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; 4896D37A25FE2A6B00717702 /* MNNPackedMatMulRemainFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; - 4896D37B25FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; - 4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; 4896D37E25FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */; settings = {COMPILER_FLAGS = 
"-march=armv8.2-a+fp16"; }; }; 4896D37F25FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; 489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A172550FDC800AD896A /* MetalReduction.hpp */; }; @@ -497,7 +495,6 @@ 92FF02F223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; }; 92FF02F423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; }; 92FF02F523AA0B5A00AC97F6 /* MNNInt8ScaleToFloat.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */; }; - 92FF02F623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */; }; 92FF02F723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */; }; 92FF02F823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */; }; 92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */; }; @@ -542,7 +539,6 @@ 92FF033223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; }; 92FF033423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; }; 92FF033523AA0B5A00AC97F6 /* MNNInt8ScaleToFloat.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */; }; - 92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */; }; 92FF033723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */; }; 92FF033823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */; }; 92FF033923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */; }; @@ -603,12 +599,10 @@ 92FF03A923AA0B5A00AC97F6 /* ConvolutionGroup.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */; }; 92FF03AA23AA0B5A00AC97F6 /* ConvolutionFloatFactory.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */; }; 92FF03AC23AA0B5A00AC97F6 /* ResizeFunction.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */; }; - 92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */; }; 92FF03AE23AA0B5A00AC97F6 /* ConvolutionIntFactory.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023023AA0B5600AC97F6 /* 
ConvolutionIntFactory.hpp */; }; 92FF03AF23AA0B5A00AC97F6 /* WinogradOptFunction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */; }; 92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */; }; 92FF03B123AA0B5A00AC97F6 /* ConvolutionFloatFactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */; }; - 92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */; }; 92FF03B423AA0B5A00AC97F6 /* Convolution1x1Strassen.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */; }; 92FF03B523AA0B5A00AC97F6 /* ResizeFunction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */; }; 92FF03B623AA0B5A00AC97F6 /* StrassenMatmulComputor.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */; }; @@ -790,6 +784,8 @@ CE072A262C91AF0700F190FD /* MNNC3ToYUVFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */; }; CE072A272C91AF0700F190FD /* MNNC3ToC4Fast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */; }; CE072A282C91AF0700F190FD /* MNNC3ToXYZFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */; }; + CE072A2A2CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */; }; + CE072A2C2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; CE125CC82A52BF6B003698C9 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */; }; CE125CC92A52BF6B003698C9 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */; }; CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */; }; @@ -1005,8 +1001,6 @@ 4896D37025FE2A6A00717702 /* MNNExpFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNExpFP16.S; path = ../../../arm82/asm/arm64/MNNExpFP16.S; sourceTree = ""; }; 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackedMatMulFP16.S; path = ../../../arm82/asm/arm64/MNNPackedMatMulFP16.S; sourceTree = ""; }; 4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackedMatMulRemainFP16.S; path = ../../../arm82/asm/arm64/MNNPackedMatMulRemainFP16.S; sourceTree = ""; }; - 4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvDwF23MulTransUnitFP16.S; path = ../../../arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S; sourceTree = ""; }; - 4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvDwF23SourceTransUnitFP16.S; path = ../../../arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S; sourceTree = ""; }; 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = Arm82MNNPackForMatMul_A.S; path = ../../../arm82/asm/arm64/Arm82MNNPackForMatMul_A.S; sourceTree = ""; }; 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvRunForLineDepthwiseFP16.S; path = ../../../arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S; sourceTree = ""; }; 489D7A172550FDC800AD896A /* MetalReduction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalReduction.hpp; sourceTree = ""; }; @@ -1353,7 +1347,6 @@ 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = ""; }; 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = ""; }; 92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNInt8ScaleToFloat.S; sourceTree = ""; }; - 92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForUnitDepthWise.S; sourceTree = ""; }; 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23MulTransUnit.S; sourceTree = ""; }; 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForLineDepthwise.S; sourceTree = ""; }; 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Unit.S; sourceTree = ""; }; @@ -1398,7 +1391,6 @@ 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = ""; }; 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = ""; }; 92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNInt8ScaleToFloat.S; sourceTree = ""; }; - 92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForUnitDepthWise.S; sourceTree = ""; }; 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23MulTransUnit.S; sourceTree = ""; }; 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForLineDepthwise.S; sourceTree = ""; }; 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */ = {isa 
= PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Unit.S; sourceTree = ""; }; @@ -1459,12 +1451,10 @@ 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionGroup.hpp; sourceTree = ""; }; 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ConvolutionFloatFactory.h; sourceTree = ""; }; 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ResizeFunction.h; sourceTree = ""; }; - 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionDepthwise3x3.cpp; sourceTree = ""; }; 92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionIntFactory.hpp; sourceTree = ""; }; 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WinogradOptFunction.hpp; sourceTree = ""; }; 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionGroup.cpp; sourceTree = ""; }; 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionFloatFactory.cpp; sourceTree = ""; }; - 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionDepthwise3x3.hpp; sourceTree = ""; }; 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Convolution1x1Strassen.cpp; sourceTree = ""; }; 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ResizeFunction.cpp; sourceTree = ""; }; 92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = StrassenMatmulComputor.hpp; sourceTree = ""; }; @@ -1647,6 +1637,8 @@ CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToYUVFast.S; path = arm/arm64/MNNC3ToYUVFast.S; sourceTree = ""; }; CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToC4Fast.S; path = arm/arm64/MNNC3ToC4Fast.S; sourceTree = ""; }; CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToXYZFast.S; path = arm/arm64/MNNC3ToXYZFast.S; sourceTree = ""; }; + CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNDepthwiseConvFastKernel.S; sourceTree = ""; }; + CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNDepthwiseConvFastKernelFP16.S; path = ../../../arm82/asm/arm64/MNNDepthwiseConvFastKernelFP16.S; 
sourceTree = ""; }; CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = ""; }; CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = ""; }; CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = ""; }; @@ -2648,7 +2640,6 @@ 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */, 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */, 92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */, - 92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */, 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */, 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */, 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */, @@ -2659,6 +2650,8 @@ 92FF017C23AA0B4E00AC97F6 /* arm64 */ = { isa = PBXGroup; children = ( + CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */, + CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */, 95772DCD2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S */, 95772DCE2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S */, 4DDD8E0F2B1D70C1005065D1 /* MNNTranspose16Bit8x8.S */, @@ -2688,8 +2681,6 @@ 4D6D7FD02656891400F80814 /* MNNPackedSparseMatMulEpx4.S */, 4D6D7FCE2656890C00F80814 /* MNNPackedSparseMatMulEpx1.S */, 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */, - 4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */, - 4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */, 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */, 4896D37025FE2A6A00717702 /* MNNExpFP16.S */, 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */, @@ -2743,7 +2734,6 @@ 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */, 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */, 92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */, - 92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */, 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */, 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */, 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */, @@ -2795,12 +2785,10 @@ 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */, 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */, 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */, - 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */, 92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */, 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */, 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */, 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */, - 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */, 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */, 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */, 92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */, @@ -3036,7 +3024,6 @@ 4D9A937526255BDA00F9B43C /* CoreMLCommonExecution.hpp in Headers */, 4DF87C522887D3F20003E2D4 /* CPUSvd.hpp in Headers */, 48747D4B245D9D24000B9709 /* RuntimeFactory.hpp in Headers */, - 92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */, CECF8C77299CAD9400D3875B /* log_builder.h in Headers */, 4D9A937226255BDA00F9B43C /* CoreMLConvolution.hpp in Headers */, 92FF038B23AA0B5A00AC97F6 /* CPUUnravelIndex.hpp in Headers */, @@ -3394,14 +3381,12 @@ 
4A224A1627D0C56E000A9260 /* ConvolutionWinogradBridge.cpp in Sources */, 48747D6A245D9E33000B9709 /* GeometryStridedSlice.cpp in Sources */, 92FF04BE23AA0BFB00AC97F6 /* FileLoader.cpp in Sources */, - 92FF02F623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */, 92FF042323AA0B7100AC97F6 /* ShapeScatterNd.cpp in Sources */, 92FF045A23AA0B7100AC97F6 /* ShapeBinaryOp.cpp in Sources */, 92FF02E523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */, CE072A192C91AEE700F190FD /* MNNBGRToGRAY.S in Sources */, EBECA37B24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S in Sources */, 481C2DF525FE2CD6001ED6DF /* Arm82OptFunc.cpp in Sources */, - 92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */, 4DF87C502887D3E40003E2D4 /* CPUSvd.cpp in Sources */, 92FF043523AA0B7100AC97F6 /* ShapeConvolution3D.cpp in Sources */, 92FF043923AA0B7100AC97F6 /* ShapeDequantize.cpp in Sources */, @@ -3483,6 +3468,7 @@ 92FF043A23AA0B7100AC97F6 /* ShapePermute.cpp in Sources */, 489D7A8E2550FDC900AD896A /* MetalPooling.mm in Sources */, 92FF030823AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */, + CE072A2A2CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S in Sources */, 4DDE2019263809920085AC8F /* CoreMLExecutorWrapper.mm in Sources */, EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */, 4A224A1327D0C56E000A9260 /* ConvolutionWinogradImpl.cpp in Sources */, @@ -3592,7 +3578,6 @@ 4819FB3A24C69E680050BD09 /* GeometryInnerProduct.cpp in Sources */, 92FF037723AA0B5A00AC97F6 /* CPUConvolutionDepthwise.cpp in Sources */, EB45C774244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */, - 4896D37B25FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S in Sources */, 92FF02DE23AA0B5A00AC97F6 /* MNNSamplerC4BilinearOpt.S in Sources */, 48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */, 92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */, @@ -3711,6 +3696,7 @@ 4D6D7FCB265688F600F80814 /* MNNPackedSparseMatMulEpx4.S in Sources */, 92FF042123AA0B7100AC97F6 /* ShapeDeconvolution.cpp in Sources */, 92FF027F23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.cpp in Sources */, + CE072A2C2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S in Sources */, EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */, 92FF04A423AA0BFB00AC97F6 /* Interpreter.cpp in Sources */, CECF8C5C299CACFD00D3875B /* Log.cpp in Sources */, @@ -3771,7 +3757,6 @@ 48887728215B639F0079B12E /* WingoradGenerater.cpp in Sources */, 950B28F429F629A90002F454 /* CPUBinaryInt8.cpp in Sources */, 92FF045423AA0B7100AC97F6 /* ShapeRNNSequenceGRU.cpp in Sources */, - 4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */, EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */, CE072A1E2C91AEE700F190FD /* MNNRGBToBGR555.S in Sources */, 48C84B87250F711700EE7666 /* WhileModule.cpp in Sources */, @@ -3800,7 +3785,6 @@ 92FF041C23AA0B7100AC97F6 /* ShapeNonMaxSuppressionV2.cpp in Sources */, 92FF02CE23AA0B5A00AC97F6 /* MNNPackC4.S in Sources */, 92FF037023AA0B5A00AC97F6 /* CPUPool.cpp in Sources */, - 92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */, 92FF031723AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseInt8.S in Sources */, 4DD1793A2694076700B0098F /* MNNSoftmax.S in Sources */, CE072A1D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S in Sources */, diff --git a/pymnn/pip_package/MNN/nn/__init__.py b/pymnn/pip_package/MNN/nn/__init__.py index 3fc225092..ca9580300 100644 --- a/pymnn/pip_package/MNN/nn/__init__.py +++ 
b/pymnn/pip_package/MNN/nn/__init__.py @@ -13,7 +13,7 @@ def load_module_from_file(file_name, input_names, output_names, **kwargs): memory_mode = kwargs.get('memory_mode', _F.MemoryMode.Normal) power_mode = kwargs.get('power_mode', _F.PowerMode.Normal) precision_mode = kwargs.get('precision_mode', _F.PrecisionMode.Normal) - thread_num = kwargs.get('thread_num', 4) + thread_num = kwargs.get('thread_num', 1) module = _nn.load_module_from_file(runtime_manager, input_names, output_names, file_name, dynamic, shape_mutable, rearrange, backend, memory_mode, power_mode, precision_mode, thread_num) @@ -59,4 +59,4 @@ def __init(self): super(EmptyModule, self).__init__() def forward(self): return None -dummy = EmptyModule() \ No newline at end of file +dummy = EmptyModule() diff --git a/pymnn/pip_package/MNN/tools/mnnconvert.py b/pymnn/pip_package/MNN/tools/mnnconvert.py index a3f773d01..7e347c254 100644 --- a/pymnn/pip_package/MNN/tools/mnnconvert.py +++ b/pymnn/pip_package/MNN/tools/mnnconvert.py @@ -13,6 +13,8 @@ except: mnn_logger = None +def convert(args): + Tools.mnnconvert(args) def parse_args(): arg_dict = {} @@ -28,13 +30,13 @@ def parse_args(): if arg_value.startswith("--") or arg_value.startswith("-"): arg_value = True arg_dict[arg_name] = arg_value - + return arg_dict def main(): """ main funcion """ - Tools.mnnconvert(sys.argv) + convert(sys.argv) arg_dict = parse_args() @@ -52,7 +54,7 @@ def main(): arg_dict.pop("MNNModel") log_dict["detail"] = {"args": arg_dict, "src_model_size": src_model_size, "dst_model_size": dst_model_size, "compress_rate": compress_rate} mnn_logger.put_log(log_dict, "convert") - + return 0 diff --git a/pymnn/pip_package/build_deps.py b/pymnn/pip_package/build_deps.py index 320975bf5..6ee2398a5 100644 --- a/pymnn/pip_package/build_deps.py +++ b/pymnn/pip_package/build_deps.py @@ -17,6 +17,7 @@ IS_WINDOWS = (platform.system() == 'Windows') IS_DARWIN = (platform.system() == 'Darwin') IS_LINUX = (platform.system() == 'Linux') +IS_ARM = ('arm' in platform.processor()) BUILD_DIR = 'pymnn_build' # avoid overwrite temporary product when build pymnn USE_TRT = False @@ -55,8 +56,8 @@ USE_OPENMP = True if "llm" in sys.argv[1]: USE_LLM = True - if "arm82" in sys.argv[1]: - USE_ARM82 = True + +if IS_ARM: USE_ARM82 = True print ("USE_INTERNAL:", USE_INTERNAL) print ("USE_TRT:", USE_TRT) @@ -69,7 +70,6 @@ print ("USE_SSE:", USE_SSE) print ("USE_OPENMP:", USE_OPENMP) print ("USE_LLM:", USE_LLM) -print ("USE_ARM82:", USE_ARM82) def build_deps(): """ build depency """ @@ -92,6 +92,9 @@ def build_deps(): if USE_ARM82: extra_opts += ' -DMNN_ARM82=ON' extra_opts += ' -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON' if USE_OPENMP else ' -DMNN_USE_THREAD_POOL=ON -DMNN_OPENMP=OFF' + if IS_DARWIN: + # Mac / iOS System use GCD instead of MNN's thread pool + extra_opts += ' -DMNN_USE_THREAD_POOL=OFF -DMNN_METAL=ON ' if IS_WINDOWS: os.system('cmake -G "Ninja" ' + extra_opts +' -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TORCH=OFF\ diff --git a/pymnn/src/llm.h b/pymnn/src/llm.h index 3ade7a17f..fc4e885e2 100644 --- a/pymnn/src/llm.h +++ b/pymnn/src/llm.h @@ -1,3 +1,4 @@ +#include #include "llm/llm.hpp" typedef struct { @@ -38,8 +39,7 @@ static PyObject* PyMNNLLM_response(LLM *self, PyObject *args) { if (!PyArg_ParseTuple(args, "s|p", &query, &stream)) { Py_RETURN_NONE; } - MNN::Transformer::LlmStreamBuffer buffer(nullptr); - std::ostream null_os(&buffer); + std::ostringstream null_os; auto res = self->llm->response(query, stream ? 
&std::cout : &null_os); return string2Object(res); } diff --git a/pymnn/src/nn.h b/pymnn/src/nn.h index c256754f6..a775cb0d6 100644 --- a/pymnn/src/nn.h +++ b/pymnn/src/nn.h @@ -154,6 +154,7 @@ static PyObject* PyMNN_Module_get_info(PyMNN_Module *self, PyObject *args) { } auto res = PyDict_New(); PyDict_SetItemString(res, "version", char2Object(info->version.c_str())); + PyDict_SetItemString(res, "bizCode", char2Object(info->bizCode.c_str())); { auto names = PyList_New(info->inputNames.size()); for (int i=0; iinputNames.size(); ++i) { @@ -379,6 +380,7 @@ static PyObject* PyMNNNN_create_runtime_manager(PyObject *self, PyObject *args) } for (auto i = 0; i < PySequence_Size(dicts); ++i) { backendConfig[i].sharedContext = nullptr; + config[i].numThread = 1; config[i].backendConfig = &backendConfig[i]; bool ret = getScheduleConfig(PySequence_GetItem(dicts, i), config[i]); if (!ret) { @@ -392,7 +394,7 @@ static PyObject* PyMNNNN_create_runtime_manager(PyObject *self, PyObject *args) } else { m_ptr = Executor::RuntimeManager::createRuntimeManager(configs); } - + if (m_ptr == nullptr) { printf("config size:%d\n", configs.size()); std::string mnn_errno = "create_runtime_manager failed "; diff --git a/source/backend/arm82/Arm82Functions.cpp b/source/backend/arm82/Arm82Functions.cpp index 92749c426..ea57b4d9e 100644 --- a/source/backend/arm82/Arm82Functions.cpp +++ b/source/backend/arm82/Arm82Functions.cpp @@ -50,10 +50,10 @@ void MNNQuantSumFP16(float* sum, const float* dequant_scale, size_t thread, size #endif #if defined(__aarch64__) void CountMinMaxValue_FP16(float* source, float* minVal, float* maxVal, size_t sizeQuad); +void MNNDepthwiseConvFastKernelFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, + size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters); #endif -void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, size_t ow); - -void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit); void MNNConvRunForLineDepthwiseFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep); @@ -336,94 +336,6 @@ static void MNNAxByClampBroadcastC8FP16(float* CF, const float* AF, const float* } } -void ARM82MultiAndDestTransformCommon(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, int cacheLineSize, int ow, const float* bias, const float* parameters) { - constexpr int pack = 8; - int unit = ow / 2; - auto biasF = Vec::load((const float16_t*)bias); - auto minF = Vec(parameters[2]); - auto maxF = Vec(parameters[3]); - MNN_ASSERT(cacheLineSize >= 1); - for (int x = 0; x < unit; ++x) { - int offset = 4 * pack * x, i = 0; - Vec m0 = Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset); - Vec m1 = Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack * 1); - Vec m2 = Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2); - Vec m3 = Vec::load(weight + (i * 4 + 3) * pack) * Vec::load(cacheLine[i] + offset + pack * 3); - for (i = 1; i < cacheLineSize; ++i) { - m0 = m0 + Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset); - m1 = m1 + Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack * 1); - m2 = m2 + Vec::load(weight + (i * 4 + 2) * pack) 
* Vec::load(cacheLine[i] + offset + pack * 2); - m3 = m3 + Vec::load(weight + (i * 4 + 3) * pack) * Vec::load(cacheLine[i] + offset + pack * 3); - } - auto o0 = m0 + m1 + m2 + biasF; - auto o1 = m1 - m2 + m3 + biasF; - o0 = Vec::min(maxF, o0); - o1 = Vec::min(maxF, o1); - o0 = Vec::max(minF, o0); - o1 = Vec::max(minF, o1); - Vec::save(dest + (2 * x + 0) * pack, o0); - Vec::save(dest + (2 * x + 1) * pack, o1); - } - if (unit * 2 < ow) { - int offset = 4 * pack * unit, i = 0; - Vec m0 = Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset); - Vec m1 = Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack); - Vec m2 = Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2); - for (i = 1; i < cacheLineSize; ++i) { - m0 = m0 + Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset); - m1 = m1 + Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack); - m2 = m2 + Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2); - } - auto o0 = m0 + m1 + m2 + biasF; - o0 = Vec::min(maxF, o0); - o0 = Vec::max(minF, o0); - Vec::save(dest + 2 * unit * pack, o0); - } -} -// unit: winograd unit (output is w/2) -void ARM82SourceTransformCommon(const FLOAT16 *source, FLOAT16 *dest, int unit, int iw, int pad, int su, int eu) { - constexpr int pack = 8; // float16x8 - for (int x = 0; x < su; ++x) { - auto dstX = dest + 4 * pack * x; - auto sx = x * 2 - (int)pad; - auto ex = sx + 4; - auto clampSx = std::max(sx, 0); - auto clampEx = std::min(ex, (int)iw); - Vec v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = clampSx; i < clampEx; ++i) { - v[i - sx] = Vec::load(source + pack * i); - } - auto m0 = v[0] - v[2]; - auto m1 = v[1] + v[2]; - auto m2 = v[2] - v[1]; - auto m3 = v[3] - v[1]; - Vec::save(dstX + pack * 0, m0); - Vec::save(dstX + pack * 1, m1); - Vec::save(dstX + pack * 2, m2); - Vec::save(dstX + pack * 3, m3); - } - MNNConvDwF23SourceTransUnitFP16(source + pack * (su * 2 - pad), dest + 4 * pack * su, eu - su); - for (int x = eu; x < unit; ++x) { - auto dstX = dest + 4 * pack * x; - auto sx = x * 2 - (int)pad; - auto ex = sx + 4; - auto clampSx = std::max(sx, 0); - auto clampEx = std::min(ex, (int)iw); - Vec v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = clampSx; i < clampEx; ++i) { - v[i - sx] = Vec::load(source + pack * i); - } - auto m0 = v[0] - v[2]; - auto m1 = v[1] + v[2]; - auto m2 = v[2] - v[1]; - auto m3 = v[3] - v[1]; - Vec::save(dstX + pack * 0, m0); - Vec::save(dstX + pack * 1, m1); - Vec::save(dstX + pack * 2, m2); - Vec::save(dstX + pack * 3, m3); - } -} - void ARM82StrassenMerge(FLOAT16* c11, FLOAT16* c12, FLOAT16* c21, FLOAT16* c22, FLOAT16* xAddr, size_t cStride, size_t eSub, size_t hSub) { const int pack = 8; @@ -516,24 +428,6 @@ void MNNPackTransposeInt16C8(int16_t* dst, const int16_t* src, size_t area, size } } -static void MNNConvRunForUnitDepthWiseFP16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { - int fx, fy; - Vec dstValue(0.0f); - auto src_z = (const FLOAT16*)src; - auto weight_z = (const FLOAT16*)weight; - for (fy = 0; fy < fh; ++fy) { - auto src_y = src_z + fy * dilateY_step; - auto weight_y = weight_z + fy * weight_y_step; - for (fx = 0; fx < fw; ++fx) { - auto weight_x = weight_y + 8 * fx; - auto src_x = src_y + fx * dilateX_step; - dstValue = dstValue + Vec::load(src_x) * Vec::load(weight_x); - } - } - Vec::save((FLOAT16*)dst, dstValue); -} - static void 
_MNNDeconvRunForUnitDepthWise(const FLOAT16* dst, FLOAT16* src, const FLOAT16* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { int fx, fy; @@ -706,12 +600,8 @@ bool Arm82Functions::init() { FUNC_PTR_ASSIGN(gInstance->MNNUnpackCUnit, MNNUnPackC8FP16); FUNC_PTR_ASSIGN(gInstance->MNNPackCUnitTranspose, MNNPackTransposeInt16C8); FUNC_PTR_ASSIGN(gInstance->MNNUnpackCUnitTranspose, MNNUnpackTransposeInt16C8); - FUNC_PTR_ASSIGN(gInstance->MNNConvRunForUnitDepthWise, MNNConvRunForUnitDepthWiseFP16); FUNC_PTR_ASSIGN(gInstance->MNNConvRunForLineDepthwise, MNNConvRunForLineDepthwiseFP16); FUNC_PTR_ASSIGN(gInstance->MNNAxByClampBroadcastUnit, MNNAxByClampBroadcastC8FP16); - FUNC_PTR_ASSIGN(gInstance->MNNConvDwF23MulTransUnit, MNNConvDwF23MulTransUnitFP16); - FUNC_PTR_ASSIGN(gInstance->MNNSourceTransformCommonF23, ARM82SourceTransformCommon); - FUNC_PTR_ASSIGN(gInstance->MNNMultiAndDestTransformCommon23, ARM82MultiAndDestTransformCommon); FUNC_PTR_ASSIGN(gInstance->MNNMatrixSub, MNNMatrixSubFP16); FUNC_PTR_ASSIGN(gInstance->MNNMatrixAdd, MNNMatrixAddFP16); FUNC_PTR_ASSIGN(gInstance->MNNStrassenMergeCFunction, ARM82StrassenMerge); @@ -754,6 +644,7 @@ bool Arm82Functions::init() { FUNC_PTR_ASSIGN(gInstance->MNNCountMaxMinValue, ARM82CountMinMaxValue); #endif FUNC_PTR_ASSIGN(gInstance->MNNSumByAxisLForMatmul_A, origin->MNNSumByAxisLForMatmul_A); + FUNC_PTR_ASSIGN(gInstance->MNNDepthwiseConvFastKernel, MNNDepthwiseConvFastKernelFP16); #endif FUNC_PTR_ASSIGN(gInstance->MNNPackC4ForMatMul_A, Arm82MNNPackForMatMul_A); FUNC_PTR_ASSIGN(gInstance->MNNGetMatMulPackMode, Arm82MNNGetMatMulPackMode); diff --git a/source/backend/arm82/CMakeLists.txt b/source/backend/arm82/CMakeLists.txt index afbe55dbb..4f6e5ebd1 100644 --- a/source/backend/arm82/CMakeLists.txt +++ b/source/backend/arm82/CMakeLists.txt @@ -5,7 +5,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?") file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm32/*") add_library(MNN_Arm82 OBJECT ${MNN_ARM82_SRCS} ${MNN_ARM82_SRCS_ASM}) target_compile_options(MNN_Arm82 PRIVATE -march=armv8.2-a+fp16 -mfpu=neon-fp-armv8 -mfloat-abi=softfp -DENABLE_ARMV82) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64") file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm64/*") if (MNN_LOW_MEMORY) file(GLOB MNN_ARM82_SRCS_ASM ${MNN_ARM82_SRCS_ASM} ${CMAKE_CURRENT_LIST_DIR}/asm/arm64/low_memory/*) diff --git a/source/backend/arm82/asm/arm32/MNNConvDwF23MulTransUnitFP16.S b/source/backend/arm82/asm/arm32/MNNConvDwF23MulTransUnitFP16.S deleted file mode 100644 index 7b2528991..000000000 --- a/source/backend/arm82/asm/arm32/MNNConvDwF23MulTransUnitFP16.S +++ /dev/null @@ -1,147 +0,0 @@ -// -// MNNConvDwF23MulTransUnitFP16.S -// MNN -// -// Created by MNN on 2019/4/4. -// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __arm__ -#ifndef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNConvDwF23MulTransUnitFP16 -//void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weigth, FLOAT16 *dest, size_t ow); -//Auto: r0:cacheLine, r1:weight, r2:dest, r3:ow -push {r4-r11, lr} -ldr r8, [sp, #36] // biasPtr -ldr r9, [sp, #40] // postParameters -ldr r10, [r9, #8] // minF -ldr r11, [r9, #12] // maxF - -vpush {q4-q7} -ldr r4, [r0, #0] -ldr r5, [r0, #4] -ldr r6, [r0, #8] - -vld1.16 {q4, q5}, [r1]! 
-vld1.16 {q6, q7}, [r1]! -vld1.16 {q8, q9}, [r1]! - -L2: -cmp r3, #2 -blt L1 - -LoopL2: -mov r7, r1 - -vld1.16 {q12, q13}, [r4]! -vmul.f16 q0, q4, q12 -vld1.16 {q14, q15}, [r4]! -vmul.f16 q1, q5, q13 -vld1.16 {q10, q11}, [r7]! -vmul.f16 q2, q6, q14 -vld1.16 {q12, q13}, [r5]! -vmul.f16 q3, q7, q15 - -vmla.f16 q0, q8, q12 -vld1.16 {q14, q15}, [r5]! -vmla.f16 q1, q9, q13 -vmla.f16 q2, q10, q14 -vmla.f16 q3, q11, q15 - -vld1.16 {q10, q11}, [r7]! -vld1.16 {q12, q13}, [r6]! -vmla.f16 q0, q10, q12 -vmla.f16 q1, q11, q13 -vld1.16 {q10, q11}, [r7]! -vadd.f16 q0, q1, q0 -vld1.16 {q14, q15}, [r6]! - -vmla.f16 q2, q10, q14 -vmla.f16 q3, q11, q15 -vadd.f16 q0, q0, q2 - -vadd.f16 q3, q3, q1 -vsub.f16 q1, q3, q2 - -vld1.32 {q10}, [r8] -vdup.32 q11, r10 -vdup.32 q12, r11 -vcvt.f16.f32 d22, q11 -vcvt.f16.f32 d24, q12 -vmov.32 d23, d22 -vmov.32 d25, d24 - -vadd.f16 q0, q10, q0 -vadd.f16 q1, q10, q1 - -vmin.f16 q0, q12, q0 -vmin.f16 q1, q12, q1 - -vmax.f16 q0, q11, q0 -vmax.f16 q1, q11, q1 - - -vst1.16 {q0, q1}, [r2]! - -sub r3, r3, #2 -cmp r3, #2 -bge LoopL2 - - -L1: -cmp r3, #0 -beq End -mov r7, r1 -mov r12, #32 -vld1.16 {q12, q13}, [r4]! -vmul.f16 q0, q4, q12 -vld1.16 {q14}, [r4]! -vmul.f16 q1, q5, q13 -vld1.16 {q10}, [r7], r12 -vmul.f16 q2, q6, q14 -vld1.16 {q12, q13}, [r5]! - -vmla.f16 q0, q8, q12 -vld1.16 {q14}, [r5]! -vmla.f16 q1, q9, q13 -vmla.f16 q2, q10, q14 - -vld1.16 {q10, q11}, [r7]! -vld1.16 {q12, q13}, [r6]! -vmla.f16 q0, q10, q12 -vmla.f16 q1, q11, q13 -vld1.16 {q10}, [r7] -vld1.16 {q14}, [r6]! - -vmla.f16 q2, q10, q14 - -vadd.f16 q0, q1, q0 -vadd.f16 q0, q0, q2 - -vld1.32 {q10}, [r8] -vdup.32 q11, r10 -vdup.32 q12, r11 -vcvt.f16.f32 d22, q11 -vcvt.f16.f32 d24, q12 -vmov.32 d23, d22 -vmov.32 d25, d24 - -vadd.f16 q0, q10, q0 - -vmin.f16 q0, q12, q0 - -vmax.f16 q0, q11, q0 - -vst1.16 {q0}, [r2]! -End: - -vpop {q4-q7} -pop {r4-r11, pc} - -#endif -#endif diff --git a/source/backend/arm82/asm/arm32/MNNConvDwF23SourceTransUnitFP16.S b/source/backend/arm82/asm/arm32/MNNConvDwF23SourceTransUnitFP16.S deleted file mode 100644 index f2fb67713..000000000 --- a/source/backend/arm82/asm/arm32/MNNConvDwF23SourceTransUnitFP16.S +++ /dev/null @@ -1,60 +0,0 @@ -// -// MNNConvDwF23SourceTransUnitFP16.S -// MNN -// -// Created by MNN on 2019/4/4. -// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __arm__ -#ifndef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNConvDwF23SourceTransUnitFP16 -// void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit); - -//Auto: -//r0: source, r1:dest, r2:unit - -push {lr} - -L1: -cmp r2, #0 -beq End - -vld1.16 {q8, q9}, [r0]! -vld1.16 {q10, q11}, [r0]! -subs r2, r2, #1 -vsub.f16 q0, q8, q10 -vadd.f16 q1, q9, q10 -beq L1LoopEnd - -L1Loop: - vsub.f16 q2, q10, q9 - vst1.16 {q0, q1}, [r1]! - vsub.f16 q3, q11, q9 - vmov.i32 q8, q10 - vst1.16 {q2, q3}, [r1]! - vmov.i32 q9, q11 - vld1.16 {q10, q11}, [r0]! - vsub.f16 q0, q8, q10 - vadd.f16 q1, q9, q10 - - subs r2, r2, #1 - bne L1Loop -L1LoopEnd: -vsub.f16 q2, q10, q9 -vsub.f16 q3, q11, q9 - -vst1.16 {q0, q1}, [r1]! -vst1.16 {q2, q3}, [r1]! 
- - -End: - -pop {pc} -#endif -#endif diff --git a/source/backend/arm82/asm/arm32/MNNConvRunForLineDepthwiseFP16.S b/source/backend/arm82/asm/arm32/MNNConvRunForLineDepthwiseFP16.S index 240c9b17a..c39406078 100644 --- a/source/backend/arm82/asm/arm32/MNNConvRunForLineDepthwiseFP16.S +++ b/source/backend/arm82/asm/arm32/MNNConvRunForLineDepthwiseFP16.S @@ -16,26 +16,35 @@ asm_function MNNConvRunForLineDepthwiseFP16 //void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup, -// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep) +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep, +// const float* bias, const float* parameters) //Auto Load: //r0:dst, r1:src, r2:weight, r3:width -push {r4-r11, lr} +push {r4-r8, r10, r11, lr} //Load From Sp -//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, r9: height, r10:srcHStep, r11:dstHStep -ldr r4, [sp, #36] -ldr r5, [sp, #40] -ldr r6, [sp, #44] -ldr r7, [sp, #48] -ldr r8, [sp, #52] -ldr r9, [sp, #56] -ldr r10, [sp, #60] -ldr r11, [sp, #64] +//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, lr: height, r10:srcHStep, r11:dstHStep +ldr r4, [sp, #32] +ldr r5, [sp, #36] +ldr r6, [sp, #40] +ldr r7, [sp, #44] +ldr r8, [sp, #48] +ldr lr, [sp, #52] +ldr r10, [sp, #56] +ldr r11, [sp, #60] +ldr r12, [sp, #64] // bias +vld1.32 {q0}, [r12] // bias +ldr r12, [sp, #68] // min,max +vld1.32 {d2[0]}, [r12]! +vld1.32 {d2[1]}, [r12] vpush {q4-q7} +vmov.f32 q5, q0 // bias +vdup.f32 q4, d2[0] // min +vdup.f32 q6, d2[1] // max mov r12, #2 // sizeof(FLOAT16) mul r4, r12, r4 @@ -49,7 +58,7 @@ mul r12, r5, r7 sub r8, r8, r12 LoopDY: -push {r0, r1, r3, r9, r10, r11} +push {r0, r1, r3, r10, r11, lr} L8: cmp r3, #7 @@ -59,18 +68,18 @@ mov r12, #8 mul r12, r4, r12 L8Loop: - vmov.i32 q8, #0 - vmov.i32 q9, #0 - vmov.i32 q10, #0 - vmov.i32 q11, #0 - vmov.i32 q12, #0 - vmov.i32 q13, #0 - vmov.i32 q14, #0 - vmov.i32 q15, #0 + vmov.f32 q8, q5 // use bias to init + vmov.f32 q9, q5 + vmov.f32 q10, q5 + vmov.f32 q11, q5 + vmov.f32 q12, q5 + vmov.f32 q13, q5 + vmov.f32 q14, q5 + vmov.f32 q15, q5 vmov.i32 d14[0], r1 vmov.i32 d14[1], r2 - mov r9, r6 + mov lr, r6 L8LoopH: mov r10, r5 L8LoopW: @@ -98,11 +107,27 @@ L8Loop: bne L8LoopW L8LoopWEnd: - subs r9, r9, #1 + subs lr, lr, #1 add r1, r1, r8 bne L8LoopH sub r3, r3, #8 + vmax.f32 q8, q8, q4 + vmax.f32 q9, q9, q4 + vmax.f32 q10, q10, q4 + vmax.f32 q11, q11, q4 + vmax.f32 q12, q12, q4 + vmax.f32 q13, q13, q4 + vmax.f32 q14, q14, q4 + vmax.f32 q15, q15, q4 + vmin.f32 q8, q8, q6 + vmin.f32 q9, q9, q6 + vmin.f32 q10, q10, q6 + vmin.f32 q11, q11, q6 + vmin.f32 q12, q12, q6 + vmin.f32 q13, q13, q6 + vmin.f32 q14, q14, q6 + vmin.f32 q15, q15, q6 vst1.16 {q8, q9}, [r0]! 
vmov.i32 r1, d14[0] vmov.i32 r2, d14[1] @@ -121,14 +146,14 @@ mov r12, #4 mul r12, r4, r12 L4Loop: - vmov.i32 q8, #0 - vmov.i32 q9, #0 - vmov.i32 q10, #0 - vmov.i32 q11, #0 - - vmov.i32 d8[0], r1 - vmov.i32 d9[0], r2 - mov r9, r6 + vmov.f32 q8, q5 + vmov.f32 q9, q5 + vmov.f32 q10, q5 + vmov.f32 q11, q5 + + vmov.i32 d14[0], r1 + vmov.i32 d14[1], r2 + mov lr, r6 L4LoopH: mov r10, r5 L4LoopW: @@ -147,14 +172,22 @@ L4Loop: add r1, r1, r7 bne L4LoopW - subs r9, r9, #1 + subs lr, lr, #1 add r1, r1, r8 bne L4LoopH + vmax.f32 q8, q8, q4 + vmax.f32 q9, q9, q4 + vmax.f32 q10, q10, q4 + vmax.f32 q11, q11, q4 + vmin.f32 q8, q8, q6 + vmin.f32 q9, q9, q6 + vmin.f32 q10, q10, q6 + vmin.f32 q11, q11, q6 sub r3, r3, #4 vst1.16 {q8, q9}, [r0]! - vmov.i32 r1, d8[0] - vmov.i32 r2, d9[0] + vmov.i32 r1, d14[0] + vmov.i32 r2, d14[1] vst1.16 {q10, q11}, [r0]! add r1, r1, r12 cmp r3, #4 @@ -168,8 +201,8 @@ cmp r3, #0 beq End L1Loop: - vmov.i32 q0, #0 - mov r9, r6 + vmov.f32 q0, q5 + mov lr, r6 mov r11, r1 mov r12, r2 L1LoopH: @@ -180,10 +213,12 @@ L1Loop: vmla.f16 q0, q1, q2 subs r10, r10, #1 bne L1LoopW - subs r9, r9, #1 + subs lr, lr, #1 add r1, r1, r8 bne L1LoopH + vmax.f32 q0, q0, q4 + vmin.f32 q0, q0, q6 subs r3, r3, #1 vst1.16 {q0}, [r0]! mov r2, r12 @@ -193,16 +228,15 @@ L1Loop: End: -pop {r0, r1, r3, r9, r10, r11} +pop {r0, r1, r3, r10, r11, lr} add r0, r0, r11 -subs r9, r9, #1 +subs lr, lr, #1 add r1, r1, r10 bne LoopDY vpop {q4-q7} -pop {r4-r11, pc} - +pop {r4-r8, r10, r11, pc} #endif #endif diff --git a/source/backend/arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S b/source/backend/arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S deleted file mode 100644 index 5585b2cb0..000000000 --- a/source/backend/arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S +++ /dev/null @@ -1,122 +0,0 @@ -// -// MNNConvDwF23MulTransUnitFP16.S -// MNN -// -// Created by MNN on 2019/4/4. -// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNConvDwF23MulTransUnitFP16 -//void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weigth, FLOAT16 *dest, size_t ow); -//Auto: x0:cacheLine, x1:weight, x2:dest, x3:ow, x4: bias, x5: parameters - -stp d10, d11, [sp, #-32]! 
-stp d8, d9, [sp, #16] - -ld1 {v8.8h}, [x4] // bias -ldr w9, [x5, #8] -ldr w10, [x5, #12] -dup v9.4s, w9 // min -dup v10.4s, w10 // max -fcvtn v9.4h, v9.4s -fcvtn v10.4h, v10.4s -dup v9.8h, v9.h[0] -dup v10.8h, v10.h[0] - -ldr x4, [x0, #0] -ldr x5, [x0, #8] -ldr x6, [x0, #16] - -ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64 -ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], #64 -ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x1] - -L2: -cmp x3, #2 -blt L1 - -LoopL2: - -ld1 {v20.8h, v21.8h}, [x4], #32 -fmul v0.8h, v4.8h, v20.8h -ld1 {v22.8h, v23.8h}, [x4], #32 -fmul v1.8h, v5.8h, v21.8h -fmul v2.8h, v6.8h, v22.8h -ld1 {v20.8h, v21.8h}, [x5], #32 -fmul v3.8h, v7.8h, v23.8h - -fmla v0.8h, v16.8h, v20.8h -ld1 {v22.8h, v23.8h}, [x5], #32 -fmla v1.8h, v17.8h, v21.8h -fmla v2.8h, v18.8h, v22.8h -fmla v3.8h, v19.8h, v23.8h - -ld1 {v20.8h, v21.8h}, [x6], #32 -fmla v0.8h, v28.8h, v20.8h -fmla v1.8h, v29.8h, v21.8h -fadd v0.8h, v1.8h, v0.8h -ld1 {v22.8h, v23.8h}, [x6], #32 - -fmla v2.8h, v30.8h, v22.8h -fmla v3.8h, v31.8h, v23.8h -fadd v0.8h, v0.8h, v2.8h - -fadd v3.8h, v3.8h, v1.8h -fsub v1.8h, v3.8h, v2.8h - -fadd v0.8h, v0.8h, v8.8h -fadd v1.8h, v1.8h, v8.8h - -fmin v0.8h, v0.8h, v10.8h -fmin v1.8h, v1.8h, v10.8h - -fmax v0.8h, v0.8h, v9.8h -fmax v1.8h, v1.8h, v9.8h - -st1 {v0.8h, v1.8h}, [x2], #32 - -sub x3, x3, #2 -cmp x3, #2 -bge LoopL2 - - -L1: -cmp x3, #0 -beq End -ld1 {v20.8h, v21.8h, v22.8h}, [x4] -fmul v0.8h, v4.8h, v20.8h -fmul v1.8h, v5.8h, v21.8h -fmul v2.8h, v6.8h, v22.8h -ld1 {v20.8h, v21.8h, v22.8h}, [x5] - -fmla v0.8h, v16.8h, v20.8h -fmla v1.8h, v17.8h, v21.8h -fmla v2.8h, v18.8h, v22.8h - -ld1 {v20.8h, v21.8h, v22.8h}, [x6] -fmla v0.8h, v28.8h, v20.8h -fmla v1.8h, v29.8h, v21.8h -fadd v0.8h, v1.8h, v0.8h - -fmla v2.8h, v30.8h, v22.8h -fadd v0.8h, v0.8h, v2.8h - -fadd v0.8h, v0.8h, v8.8h - -fmin v0.8h, v0.8h, v10.8h - -fmax v0.8h, v0.8h, v9.8h -st1 {v0.8h}, [x2] -End: - -ldp d8, d9, [sp, #16] -ldp d10, d11, [sp], #32 - -ret -#endif diff --git a/source/backend/arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S b/source/backend/arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S deleted file mode 100644 index cac31e53d..000000000 --- a/source/backend/arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S +++ /dev/null @@ -1,56 +0,0 @@ -// -// MNNConvDwF23SourceTransUnitFP16.S -// MNN -// -// Created by MNN on 2019/4/4. 
-// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNConvDwF23SourceTransUnitFP16 -// void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit); - -//Auto: -//x0: source, x1:dest, x2:unit - -L1: -cmp x2, #0 -beq End - -ld1 {v16.8h, v17.8h}, [x0], #32 -ld1 {v18.8h, v19.8h}, [x0], #32 -subs x2, x2, #1 -fsub v0.8h, v16.8h, v18.8h -fadd v1.8h, v17.8h, v18.8h -beq L1LoopEnd - -L1Loop: - fsub v2.8h, v18.8h, v17.8h - st1 {v0.8h, v1.8h}, [x1], #32 - fsub v3.8h, v19.8h, v17.8h - mov v16.16b, v18.16b - st1 {v2.8h, v3.8h}, [x1], #32 - mov v17.16b, v19.16b - ld1 {v18.8h, v19.8h}, [x0], #32 - fsub v0.8h, v16.8h, v18.8h - fadd v1.8h, v17.8h, v18.8h - - subs x2, x2, #1 - bne L1Loop -L1LoopEnd: -fsub v2.8h, v18.8h, v17.8h -fsub v3.8h, v19.8h, v17.8h - -st1 {v0.8h, v1.8h}, [x1], #32 -st1 {v2.8h, v3.8h}, [x1], #32 - - -End: -ret - -#endif diff --git a/source/backend/arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S b/source/backend/arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S index 1cb449d27..ada98a9b1 100644 --- a/source/backend/arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S +++ b/source/backend/arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S @@ -15,17 +15,24 @@ asm_function MNNConvRunForLineDepthwiseFP16 //void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup, -// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep) +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep, +// const float* bias, float* parameters) //Auto Load: //x0:dst, x1:src, x2:weight, x3:width, x4:src_w_setup, x5:fw, x6:fh, x7:dilate_x_step //Load From sp: -//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep +//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13:parameters ldr x8, [sp, #0] ldr x15, [sp, #8] ldr x10, [sp, #16] ldr x11, [sp, #24] +ldr x12, [sp, #32] +ldr x13, [sp, #40] + +stp d8, d9, [sp, #(-16 * 3)]! 
+stp d10, d11, [sp, #(16 * 2)] +stp x19, x20, [sp, #(16 * 1)] mov x9, #2 // sizeof(FLOAT16) mul x4, x9, x4 @@ -34,15 +41,30 @@ mul x8, x9, x8 mul x10, x9, x10 mul x11, x9, x11 +ld1 {v8.8h}, [x12] // bias +ld1r {v10.8h}, [x13], #2 // min +ld1r {v11.8h}, [x13] + //dilate_y_step -> dilate_y_step - fw*dilate_x_step mul x9, x5, x7 sub x8, x8, x9 -.macro zero_vec x0, x1, x2, x3 - movi \x0\().8h, #0 - movi \x1\().8h, #0 - movi \x2\().8h, #0 - movi \x3\().8h, #0 +.macro assign_bias x0, x1, x2, x3 + mov \x0\().16b, v8.16b + mov \x1\().16b, v8.16b + mov \x2\().16b, v8.16b + mov \x3\().16b, v8.16b +.endm + +.macro compare_min_max x0, x1, x2, x3, xmin, xmax + fmax \x0\().8h, \x0\().8h, \xmin\().8h + fmax \x1\().8h, \x1\().8h, \xmin\().8h + fmax \x2\().8h, \x2\().8h, \xmin\().8h + fmax \x3\().8h, \x3\().8h, \xmin\().8h + fmin \x0\().8h, \x0\().8h, \xmax\().8h + fmin \x1\().8h, \x1\().8h, \xmax\().8h + fmin \x2\().8h, \x2\().8h, \xmax\().8h + fmin \x3\().8h, \x3\().8h, \xmax\().8h .endm LoopDY: @@ -56,16 +78,16 @@ L16: cmp x3, #16 blt L8 -mov x12, #16 -mul x12, x4, x12 +mov x19, #16 +mul x19, x4, x19 L16Loop: - zero_vec v16, v17, v18, v19 - zero_vec v20, v21, v22, v23 - zero_vec v24, v25, v26, v27 - zero_vec v28, v29, v30, v31 + assign_bias v16, v17, v18, v19 + assign_bias v20, v21, v22, v23 + assign_bias v24, v25, v26, v27 + assign_bias v28, v29, v30, v31 - mov x13, x1 + mov x20, x1 mov x14, x2 mov x9, x6 L16LoopH: @@ -106,7 +128,7 @@ L16Loop: ld1 {v3.8h}, [x1], x4 fmla v30.8h, v7.8h, v2.8h fmla v31.8h, v7.8h, v3.8h - sub x1, x1, x12 + sub x1, x1, x19 add x1, x1, x7 bne L16LoopW @@ -115,8 +137,12 @@ L16Loop: bne L16LoopH sub x3, x3, #16 + compare_min_max v16, v17, v18, v19, v10, v11 + compare_min_max v20, v21, v22, v23, v10, v11 + compare_min_max v24, v25, v26, v27, v10, v11 + compare_min_max v28, v29, v30, v31, v10, v11 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 - add x1, x13, x12 + add x1, x20, x19 cmp x3, #16 mov x2, x14 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 @@ -129,14 +155,14 @@ L8: cmp x3, #7 ble L4 -mov x12, #8 -mul x12, x4, x12 +mov x19, #8 +mul x19, x4, x19 L8Loop: - zero_vec v16, v17, v18, v19 - zero_vec v20, v21, v22, v23 + assign_bias v16, v17, v18, v19 + assign_bias v20, v21, v22, v23 - mov x13, x1 + mov x20, x1 mov x14, x2 mov x9, x6 L8LoopH: @@ -161,7 +187,7 @@ L8Loop: ld1 {v1.8h}, [x1], x4 fmla v23.8h, v1.8h, v3.8h - sub x1, x1, x12 + sub x1, x1, x19 add x1, x1, x7 bne L8LoopW @@ -169,9 +195,12 @@ L8Loop: add x1, x1, x8 bne L8LoopH + compare_min_max v16, v17, v18, v19, v10, v11 + compare_min_max v20, v21, v22, v23, v10, v11 + sub x3, x3, #8 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 - add x1, x13, x12 + add x1, x20, x19 mov x2, x14 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 @@ -180,13 +209,13 @@ L4: cmp x3, #4 ble L1 -mov x12, #4 -mul x12, x4, x12 +mov x19, #4 +mul x19, x4, x19 L4Loop: - zero_vec v16, v17, v18, v19 + assign_bias v16, v17, v18, v19 - mov x13, x1 + mov x20, x1 mov x14, x2 mov x9, x6 L4LoopH: @@ -203,7 +232,7 @@ L4Loop: ld1 {v1.8h}, [x1], x4 fmla v19.8h, v1.8h, v3.8h - sub x1, x1, x12 + sub x1, x1, x19 add x1, x1, x7 bne L4LoopW @@ -211,9 +240,10 @@ L4Loop: add x1, x1, x8 bne L4LoopH + compare_min_max v16, v17, v18, v19, v10, v11 sub x3, x3, #4 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 - add x1, x13, x12 + add x1, x20, x19 mov x2, x14 L1: @@ -221,10 +251,10 @@ cmp x3, #0 beq End L1Loop: - movi v0.8h, #0 + mov v0.16b, v8.16b mov x9, x6 mov x11, x1 - mov x12, x2 + mov x19, x2 L1LoopH: mov x10, x5 L1LoopW: @@ -238,8 +268,10 @@ L1Loop: bne L1LoopH 
subs x3, x3, #1 + fmax v0.8h, v0.8h, v10.8h + fmin v0.8h, v0.8h, v11.8h st1 {v0.8h}, [x0], #16 - mov x2, x12 + mov x2, x19 add x1, x11, x4 bne L1Loop @@ -257,7 +289,9 @@ add x0, x0, x11 add x1, x1, x10 bne LoopDY - +ldp x19, x20, [sp, #(16 * 1)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d8, d9, [sp], #(16 * 3) ret #endif diff --git a/source/backend/arm82/asm/arm64/MNNDepthwiseConvFastKernelFP16.S b/source/backend/arm82/asm/arm64/MNNDepthwiseConvFastKernelFP16.S new file mode 100644 index 000000000..80accbebc --- /dev/null +++ b/source/backend/arm82/asm/arm64/MNNDepthwiseConvFastKernelFP16.S @@ -0,0 +1,290 @@ +// +// MNNDepthwiseConvFastKernelFP16.S +// MNN +// +// Created by MNN on 2024/09/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNDepthwiseConvFastKernelFP16 + +// void MNNDepthwiseConvFastKernelFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, +// size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters); +//Auto Load: +//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_step=pack*1, x5:fw, x6:fh, x7:dilate_x_step + +//Load From sp: +//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13: minmax +ldr x8, [sp, #0] +ldr x15, [sp, #8] +ldr x10, [sp, #16] +ldr x11, [sp, #24] +ldr x12, [sp, #32] +ldr x13, [sp, #40] + +stp d14, d15, [sp, #(-16 * 9)]! +stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] +stp x21, x22, [sp, #(16 * 4)] +stp x19, x20, [sp, #(16 * 5)] +stp x27, x28, [sp, #(16 * 6)] +stp x25, x26, [sp, #(16 * 7)] +stp x23, x24, [sp, #(16 * 8)] + +lsl x4, x4, #1 // src_w_step*sizeof(float) +lsl x7, x7, #1 // dilate_x_step*sizeof(float) +lsl x8, x8, #1 // dilate_y_step*sizeof(float) +lsl x23, x10, #1 // srcHStep*sizeof(float) +lsl x24, x11, #1 // dstHStep*sizeof(float) +mov x20, x12 // bias +mov x26, x13 // min +add x27, x13, #2 // max + +//dilate_y_step -> dilate_y_step - fw*dilate_x_step +mul x9, x5, x7 +sub x8, x8, x9 +mov x25, x3 // width +.macro assign_bias x0, x1, x2, x3, bv + mov \x0\().16b, \bv\().16b + mov \x1\().16b, \bv\().16b + mov \x2\().16b, \bv\().16b + mov \x3\().16b, \bv\().16b +.endm + +.macro compare_min_max x0, x1, x2, x3, xmin, xmax + fmax \x0\().8h, \x0\().8h, \xmin\().8h + fmax \x1\().8h, \x1\().8h, \xmin\().8h + fmax \x2\().8h, \x2\().8h, \xmin\().8h + fmax \x3\().8h, \x3\().8h, \xmin\().8h + fmin \x0\().8h, \x0\().8h, \xmax\().8h + fmin \x1\().8h, \x1\().8h, \xmax\().8h + fmin \x2\().8h, \x2\().8h, \xmax\().8h + fmin \x3\().8h, \x3\().8h, \xmax\().8h +.endm + +LoopDY: +//mov x23, x10 +//mov x24, x11 +mov x21, x0 +mov x22, x1 + +L16: +cmp x3, #16 +blt L8 + +mov x12, #-176 +mov x19, #256 + +L16Loop: + ld1 {v8.8h}, [x20] // load bias + assign_bias v16, v17, v18, v19, v8 + assign_bias v20, v21, v22, v23, v8 + assign_bias v24, v25, v26, v27, v8 + assign_bias v28, v29, v30, v31, v8 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L16LoopH: + mov x10, x5 + L16LoopW: + ld1 {v8.8h}, [x2], #16 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64 + ld1 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #64 + subs x10, x10, #1 + fmla v16.8h, v8.8h, v0.8h + fmla v17.8h, v8.8h, v1.8h + fmla v18.8h, v8.8h, v2.8h + fmla v19.8h, v8.8h, v3.8h + + fmla v20.8h, v8.8h, v4.8h + fmla v21.8h, v8.8h, v5.8h + fmla v22.8h, v8.8h, v6.8h + fmla v23.8h, v8.8h, v7.8h + + ld1 
{v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x12 + + fmla v24.8h, v8.8h, v9.8h + fmla v25.8h, v8.8h, v10.8h + fmla v26.8h, v8.8h, v11.8h + fmla v27.8h, v8.8h, v12.8h + + fmla v28.8h, v8.8h, v0.8h + fmla v29.8h, v8.8h, v1.8h + fmla v30.8h, v8.8h, v2.8h + fmla v31.8h, v8.8h, v3.8h + + bne L16LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L16LoopH + ld1r {v10.8h}, [x26] // min + ld1r {v11.8h}, [x27] // max + sub x3, x3, #16 + compare_min_max v16, v17, v18, v19, v10, v11 + compare_min_max v20, v21, v22, v23, v10, v11 + compare_min_max v24, v25, v26, v27, v10, v11 + compare_min_max v28, v29, v30, v31, v10, v11 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + add x1, x13, x19 // 16 * pack * sizeof(float) + cmp x3, #16 + mov x2, x14 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64 + bge L16Loop + + +L8: +ld1r {v10.8h}, [x26] // min +ld1r {v11.8h}, [x27] // max +ld1 {v24.8h}, [x20] // load bias +cmp x3, #7 +ble L4 + +mov x12, #-48 +mov x19, #128 + +L8Loop: + assign_bias v16, v17, v18, v19, v24 + assign_bias v20, v21, v22, v23, v24 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L8LoopH: + mov x10, x5 + L8LoopW: + ld1 {v8.8h}, [x2], #16 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], x12 + subs x10, x10, #1 + fmla v16.8h, v8.8h, v0.8h + fmla v17.8h, v8.8h, v1.8h + fmla v18.8h, v8.8h, v2.8h + fmla v19.8h, v8.8h, v3.8h + + fmla v20.8h, v8.8h, v4.8h + fmla v21.8h, v8.8h, v5.8h + fmla v22.8h, v8.8h, v6.8h + fmla v23.8h, v8.8h, v7.8h + + bne L8LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L8LoopH + + compare_min_max v16, v17, v18, v19, v10, v11 + compare_min_max v20, v21, v22, v23, v10, v11 + sub x3, x3, #8 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + add x1, x13, x19 // 8 * pack * sizeof(float) + mov x2, x14 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + + +L4: +cmp x3, #4 +ble L1 + +mov x12, #16 +mov x19, #64 + +L4Loop: + assign_bias v16, v17, v18, v19, v24 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L4LoopH: + mov x10, x5 + L4LoopW: + ld1 {v8.8h}, [x2], #16 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x12 + subs x10, x10, #1 + fmla v16.8h, v8.8h, v0.8h + fmla v17.8h, v8.8h, v1.8h + fmla v18.8h, v8.8h, v2.8h + fmla v19.8h, v8.8h, v3.8h + + bne L4LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L4LoopH + + compare_min_max v16, v17, v18, v19, v10, v11 + sub x3, x3, #4 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + add x1, x13, x19 + mov x2, x14 + +L1: +cmp x3, #0 +beq End + +mov x19, #16 + +L1Loop: + ld1 {v16.8h}, [x20] // assign bias + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L1LoopH: + mov x10, x5 + L1LoopW: + ld1 {v8.8h}, [x2], #16 + ld1 {v0.8h}, [x1], #16 + subs x10, x10, #1 + fmla v16.8h, v8.8h, v0.8h + + bne L1LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L1LoopH + + subs x3, x3, #1 + fmax v16.8h, v16.8h, v10.8h + fmin v16.8h, v16.8h, v11.8h + st1 {v16.8h}, [x0], #16 + add x1, x13, x4 + mov x2, x14 + bne L1Loop + + +End: + +//mov x10, x23 +//mov x11, x24 +//mov x0, x21 +//mov x1, x22 +mov x3, x25 + +subs x15, x15, #1 +add x0, x21, x24 +add x1, x22, x23 +bne LoopDY + +ldp x23, x24, [sp, #(16 * 8)] +ldp x25, x26, [sp, #(16 * 7)] +ldp x27, x28, [sp, #(16 * 6)] +ldp x19, x20, [sp, #(16 * 5)] +ldp x21, x22, [sp, #(16 * 4)] +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 9) +ret + +#endif diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S 
b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S index ad9313244..2a7cf474f 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S @@ -108,14 +108,12 @@ stp x23, x24, [sp, #(16 * 8)] ldr x25, [x6, #40] // xKernelSum ldr x26, [x6, #48] // weightQuantBias ldr x23, [x6, #56] // fp32minmax -ldr x27, [x6, #64] // blockNum //add x24, x23, #4 mov x21, #16 // sizeof(float16_t) * PACK -mul x27, x27, x3 Start: -lsl x15, x27, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT +lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT mov x22, #48 // src_steps ldr x27, [x6, #80] // extra scale TILE_12: diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S index dd893b292..decf68d84 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S @@ -109,12 +109,10 @@ stp x23, x24, [sp, #(16 * 8)] ldr x25, [x6, #40] // xKernelSum ldr x26, [x6, #48] // weightQuantBias ldr x23, [x6, #56] // fp32minmax -ldr x27, [x6, #64] // blockNum mov x21, #16 // sizeof(float16_t) * PACK -mul x27, x27, x3 Start: -lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t) +lsl x15, x3, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t) mov x22, #48 // src_steps ldr x27, [x6, #80] // extra scale TILE_12: diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_Unit_FP16.S b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_Unit_FP16.S index 76c79b42e..6602d18b9 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_Unit_FP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_Unit_FP16.S @@ -150,15 +150,13 @@ stp x27, x28, [sp, #(16 * 8)] // ldr w23, [x6, #24] ldr x27, [x6, #40] // srcKernelSum ldr x28, [x6, #48] // weightQuanBias -ldr x23, [x6, #64] // blockNum ldr x14, [x6, #56] // fp32minmax -mul x23, x23, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block mov x22, #80 // GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT = 10 * 8 = 80 mov x21, #16 // sizeof(float16_t) * UNIT Start: -lsl x15, x23, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int8_t) = src_depth_quad * 64 = src_depth_quad << 6 +lsl x15, x3, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int8_t) = src_depth_quad * 64 = src_depth_quad << 6 ldr x23, [x6, #80] // extra scale TILE_10: cmp x7, #10 diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S index f6f6625d7..ea01fef1a 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S @@ -130,15 +130,13 @@ stp x27, x28, [sp, #(16 * 8)] // ldr w23, [x6, #24] ldr x27, [x6, #40] // srcKernelSum ldr x28, [x6, #48] // weightQuanBias -ldr x23, [x6, #64] // blockNum ldr x14, [x6, #56] // fp32minmax -mul x23, x23, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block mov x22, #80 // GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT = 10 * 8 = 80 mov 
x21, #16 // sizeof(float16_t) * UNIT Start: -lsl x15, x23, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int4_t) = src_depth_quad * 8 * 8 * 0.5 = src_depth_quad << 5 +lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int4_t) = src_depth_quad * 8 * 8 * 0.5 = src_depth_quad << 5 ldr x23, [x6, #80] // extra scale TILE_10: cmp x7, #10 diff --git a/source/backend/cpu/CMakeLists.txt b/source/backend/cpu/CMakeLists.txt index e37ae3e55..82287d69f 100644 --- a/source/backend/cpu/CMakeLists.txt +++ b/source/backend/cpu/CMakeLists.txt @@ -42,9 +42,11 @@ ENDIF() # ARM82 Assemblies IF(MNN_ARM82) - target_compile_options(MNNCPU PRIVATE -DENABLE_ARMV82) - include(${CMAKE_CURRENT_LIST_DIR}/../arm82/CMakeLists.txt) - list(APPEND MNN_TARGETS MNN_Arm82) - list(APPEND MNN_OBJECTS_TO_LINK $) + IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64") + target_compile_options(MNNCPU PRIVATE -DENABLE_ARMV82) + include(${CMAKE_CURRENT_LIST_DIR}/../arm82/CMakeLists.txt) + list(APPEND MNN_TARGETS MNN_Arm82) + list(APPEND MNN_OBJECTS_TO_LINK $) + ENDIF() ENDIF() diff --git a/source/backend/cpu/CPUBackend.cpp b/source/backend/cpu/CPUBackend.cpp index dd3401dcf..f28ba3e4a 100644 --- a/source/backend/cpu/CPUBackend.cpp +++ b/source/backend/cpu/CPUBackend.cpp @@ -48,7 +48,7 @@ ErrorCode CastWrapExecution::onExecute(const std::vector& inputs, const CPUCastCreator::cast(inputs[0], outputs[0], cpuBackend, convertType); return NO_ERROR; } -void CPURuntime::computeDivideSizes(int size, int* dst) const { +void CPUBackend::computeDivideSizes(int size, int* dst) const { if (mGroupWithComputeRate.size() <= 1) { // Avg divide int length = UP_DIV(size, mThreadNumber); @@ -132,40 +132,6 @@ void CPURuntime::_bindCPUCore() const { #endif } -void CPURuntime::_resetGroupCompute() const { - if (mPastDecreaseHint == hint().cpuDecreaseRate) { - return; - } - mGroupWithComputeRate.clear(); - if (mThreadNumber <= 1 || mPower == BackendConfig::Power_Low) { - return; - } - mPastDecreaseHint = hint().cpuDecreaseRate; - auto cpuInfo = MNNGetCPUInfo(); - if (cpuInfo->groups.size() < 2) { - return; - } - float decreaseRate = (float)(hint().cpuDecreaseRate) / 100.0f; - int validCpuSize = (int)(cpuInfo->groups[cpuInfo->groups.size()-1].ids.size()); - int groupIndex = (int)cpuInfo->groups.size()-2; - float maxFreq = (float)cpuInfo->groups[cpuInfo->groups.size()-1].maxFreq; - validCpuSize = ALIMIN(validCpuSize, mThreadNumber); - float totalComputeRate = 1.0f * validCpuSize; - mGroupWithComputeRate.emplace_back(std::make_pair(totalComputeRate, validCpuSize)); - float currentRate = 1.0f; - while (validCpuSize < mThreadNumber && groupIndex >= 0) { - auto& group = cpuInfo->groups[groupIndex]; - int selectSize = ALIMIN(mThreadNumber - validCpuSize, (int)group.ids.size()); - validCpuSize += group.ids.size(); - currentRate *= decreaseRate; - totalComputeRate += currentRate * selectSize; - mGroupWithComputeRate.emplace_back(std::make_pair(currentRate * selectSize, selectSize)); - } - for (auto& g : mGroupWithComputeRate) { - g.first = g.first / totalComputeRate; - } -} - void CPURuntime::_resetThreadPool() { mThreadNumber = std::max(1, mThreadNumber); mThreadNumber = std::min(mThreadNumber, MAX_THREAD_NUMBER); @@ -179,7 +145,6 @@ void CPURuntime::_resetThreadPool() { } mThreadNumber = ALIMIN(ThreadPool::init(systemThreadNumber), mThreadNumber); } - mGroupWithComputeRate.clear(); if (mThreadNumber > 1) { mTaskIndex = 
ThreadPool::acquireWorkIndex(); if (-1 == mTaskIndex) { @@ -204,8 +169,6 @@ void CPURuntime::onReset(int numberThread, const BackendConfig* config, bool ful } mThreadNumber = numberThread; _resetThreadPool(); - // Mask Group Compute reset - mPastDecreaseHint = -1; } CPURuntime::CPURuntime(const Backend::Info& info) { @@ -280,7 +243,6 @@ Backend* CPURuntime::onCreate(const BackendConfig* config, Backend* origin) cons auto cpuBn = static_cast(origin); mSharedDmaInfo = cpuBn->mDmaInfo; } - _resetGroupCompute(); if (nullptr != config) { precision = config->precision; flags = config->flags; @@ -403,6 +365,41 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p #endif mMemory = memory; mRuntime = const_cast(runtime); + mThreadNumber = mRuntime->mThreadNumber; + // Compute Group Rate + do { + if (mThreadNumber <= 1 || mRuntime->mPower == BackendConfig::Power_Low) { + break; + } + auto rate = mRuntime->hint().cpuDecreaseRate; + if (rate >= 100 || rate <= 0) { + break; + } + auto cpuInfo = MNNGetCPUInfo(); + if (cpuInfo->groups.size() < 2) { + break; + } + mGroupWithComputeRate.clear(); + float decreaseRate = (float)(rate) / 100.0f; + int validCpuSize = (int)(cpuInfo->groups[cpuInfo->groups.size()-1].ids.size()); + int groupIndex = (int)cpuInfo->groups.size()-2; + float maxFreq = (float)cpuInfo->groups[cpuInfo->groups.size()-1].maxFreq; + validCpuSize = ALIMIN(validCpuSize, mThreadNumber); + float totalComputeRate = 1.0f * validCpuSize; + mGroupWithComputeRate.emplace_back(std::make_pair(totalComputeRate, validCpuSize)); + float currentRate = 1.0f; + while (validCpuSize < mThreadNumber && groupIndex >= 0) { + auto& group = cpuInfo->groups[groupIndex]; + int selectSize = ALIMIN(mThreadNumber - validCpuSize, (int)group.ids.size()); + validCpuSize += group.ids.size(); + currentRate *= decreaseRate; + totalComputeRate += currentRate * selectSize; + mGroupWithComputeRate.emplace_back(std::make_pair(currentRate * selectSize, selectSize)); + } + for (auto& g : mGroupWithComputeRate) { + g.first = g.first / totalComputeRate; + } + } while (false); auto dynamicAlloc = mRuntime->mSharedDmaInfo; if (nullptr == dynamicAlloc.get()) { mDmaInfo.reset(new CPURuntime::DynamicAllocator); diff --git a/source/backend/cpu/CPUBackend.hpp b/source/backend/cpu/CPUBackend.hpp index b4c9843d0..00e39fc30 100644 --- a/source/backend/cpu/CPUBackend.hpp +++ b/source/backend/cpu/CPUBackend.hpp @@ -40,9 +40,6 @@ class CPURuntime : public Runtime { void onConcurrencyEnd() const; virtual bool onCheckInfo(Backend::Info& info) const override; - // dividedSize's length should be larger than threadNumber - void computeDivideSizes(int size, int* dst) const; - #ifdef MNN_USE_THREAD_POOL inline bool multiThreadValid() const { return mThreadOpen; @@ -60,9 +57,6 @@ class CPURuntime : public Runtime { mutable int mTaskIndex = -1; mutable bool mThreadOpen = false; #endif - void _resetGroupCompute() const; - mutable std::vector> mGroupWithComputeRate; - mutable int mPastDecreaseHint = -1; BackendConfig::MemoryMode mMemory; BackendConfig::PowerMode mPower; BackendConfig::PrecisionMode mPrecision; @@ -108,6 +102,8 @@ class CPUBackend : public Backend { // Return sizeDivide, scheduleNumber aligned memory std::pair multiThreadDivide(int size) const; virtual bool onSelectDynamicAllocator(int index, int maxIndex) override; + // dividedSize's length should be larger than threadNumber + void computeDivideSizes(int size, int* dst) const; public: virtual MemObj* onAcquire(const Tensor* nativeTensor, StorageType 
storageType) override; @@ -145,7 +141,7 @@ class CPUBackend : public Backend { static bool addCreator(OpType t, Creator* c); inline int threadNumber() const { - return mRuntime->mThreadNumber; + return mThreadNumber; } #ifdef MNN_USE_THREAD_POOL inline bool threadOpen() const { @@ -182,6 +178,9 @@ class CPUBackend : public Backend { CoreFunctions* mCoreFunctions; CoreInt8Functions* mInt8CoreFunctions; private: + int mThreadNumber; + std::vector> mGroupWithComputeRate; + std::shared_ptr mDmaInfo; std::shared_ptr mStaticAllocator; CPURuntime* mRuntime; diff --git a/source/backend/cpu/CPUConvolutionDepthwise.cpp b/source/backend/cpu/CPUConvolutionDepthwise.cpp index f3fdf2cb3..6d6e2df96 100644 --- a/source/backend/cpu/CPUConvolutionDepthwise.cpp +++ b/source/backend/cpu/CPUConvolutionDepthwise.cpp @@ -14,7 +14,6 @@ #include "core/TensorUtils.hpp" #include "backend/cpu/compute/CommonOptFunction.h" #include "backend/cpu/compute/ConvOpt.h" -#include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp" namespace MNN { CPUConvolutionDepthwise::FloatExecution::FloatExecution(const Convolution2DCommon* common, Backend* b, @@ -129,8 +128,7 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect auto core = static_cast(backend())->functions(); int bytes = core->bytes; int unit = core->pack; - auto unitFunc = core->MNNConvRunForUnitDepthWise; - auto lineFunc = core->MNNConvRunForLineDepthwise; + auto kernelFunc = core->MNNConvRunForLineDepthwise; auto postFunc = core->MNNAxByClampBroadcastUnit; auto inputTensor = inputs[0]; auto outputTensor = outputs[0]; @@ -169,72 +167,60 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect int weight_z_step = kernel_height * kernel_width * unit; int dilateY_step = dilateY * src_width * unit; int dilateX_step = dilateX * unit; - // Compute Mid Rect - int l = 0, t = 0, r = dst_width, b = dst_height; - for (; l * strideX - padX < 0 && l < dst_width; l++) { - // do nothing - } - for (; t * strideY - padY < 0 && t < dst_height; t++) { - // do nothing - } - for (; (r - 1) * strideX - padX + (kernel_width - 1) * dilateX >= src_width && r > l; r--) { - // do nothing - } - for (; (b - 1) * strideY - padY + (kernel_height - 1) * dilateY >= src_height && b > t; b--) { - // do nothing - } - auto postData = getPostParameters(); auto batch = inputs[0]->batch(); int total = batch * dst_depth_quad; int numberThread = ((CPUBackend*)backend())->threadNumber(); - auto rt = static_cast(backend()->getRuntime()); - auto runBasic = [=](uint8_t* dst_z, const uint8_t* src_z, const uint8_t* weight_dz, int L, int T, int R, int B) { - for (int dy = T; dy < B; ++dy) { - auto dst_y = dst_z + dy * dst_y_step * bytes; - int srcStartY = dy * strideY - padY; - const auto src_dy = src_z + srcStartY * src_y_step * bytes; - int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY))); - int efy = ALIMIN(kernel_height, UP_DIV(src_height - srcStartY, dilateY)); - for (int dx = L; dx < R; ++dx) { - auto dst_x = dst_y + unit * dx * bytes; - int srcStartX = dx * strideX - padX; - const auto src_dx = src_dy + srcStartX * unit * bytes; - int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX))); - int efx = ALIMIN(kernel_width, UP_DIV(src_width - srcStartX, dilateX)); - unitFunc((float*)dst_x, (const float*)(src_dx + (sfx * dilateX + sfy * dilateY * src_width) * unit * bytes), - (const float*)(weight_dz + unit * (kernel_width * sfy + sfx) * bytes), efx - sfx, efy - sfy, - unit * kernel_width, dilateX_step, dilateY_step); - } - } - }; std::vector divides(numberThread+1); 
divides[0] = 0; - rt->computeDivideSizes(total, divides.data()+1); - mExecutor = [=](const uint8_t* srcOrigin, uint8_t* dstOrigin, int tId) { + static_cast(backend())->computeDivideSizes(total, divides.data()+1); + mNumber = numberThread; + auto postData = getPostParameters(); + if (static_cast(backend())->functions()->bytes < 4) { + static_cast(backend())->functions()->MNNFp32ToLowp(postData.data() + 2, (int16_t*)(postData.data() + 2), 2); + } + mFastKernelApply = (dilateX == 1 && dilateY == 1 && strideX == 1 && strideY == 1 && core->MNNDepthwiseConvFastKernel); + if (mFastKernelApply ) { // Only support ARM kernel + kernelFunc = core->MNNDepthwiseConvFastKernel; + } + auto pads = ConvolutionCommon::convolutionPadFull(inputs[0], outputs[0], mCommon); + int paddedWidth = std::get<0>(pads) + std::get<2>(pads) + src_width; + int paddedHeight = std::get<1>(pads) + std::get<3>(pads) + src_height; + mInputPad.reset(Tensor::createDevice({mNumber, paddedWidth * paddedHeight * unit})); + bool succ = backend()->onAcquireBuffer(mInputPad.get(), Backend::DYNAMIC); + if (!succ) { + return OUT_OF_MEMORY; + } + if (paddedWidth != src_width) { + dilateY_step = dilateY * paddedWidth * unit; + src_y_step = paddedWidth * unit; + } + mExecutor = [=](const uint8_t* inputPtr, uint8_t* outputPtr, int tId) { + const auto inputPadPtr = mInputPad->host() + mInputPad->stride(0) * tId * bytes; + ::memset(inputPadPtr, 0, mInputPad->stride(0) * bytes); auto biasP = inputs[2]->host(); auto weightP = inputs[1]->host(); for (int index = divides[tId]; index < divides[tId+1]; ++index) { + int dz = index / batch; - auto dst_z = dstOrigin + dst_z_step * index * bytes; - const auto src_z = srcOrigin + src_z_step * index * bytes; + auto dstOrigin = outputPtr + dst_z_step * index * bytes; + const auto srcOrigin = inputPtr + src_z_step * index * bytes; auto bias_z = biasP + unit * dz * bytes; const auto weight_dz = weightP + dz * weight_z_step * bytes; - runBasic(dst_z, src_z, weight_dz, 0, 0, dst_width, t); - runBasic(dst_z, src_z, weight_dz, 0, b, dst_width, dst_height); - runBasic(dst_z, src_z, weight_dz, 0, t, l, b); - runBasic(dst_z, src_z, weight_dz, r, t, dst_width, b); - if (r > l && b > t) { - lineFunc((float*)(dst_z + (t * dst_y_step + l * unit) * bytes), - (const float*)(src_z + ((t * strideY - padY) * src_y_step + (l * strideX - padX) * unit) * bytes), - (const float*)weight_dz, r - l, strideX * unit, kernel_width, kernel_height, dilateX_step, - dilateY_step, b - t, src_y_step * strideY, dst_y_step); + + auto srcPtr = srcOrigin; + // Pad inputs + for (int y = 0; y < src_height; ++y) { + auto src = srcOrigin + y * src_width * unit * bytes; + auto dst = inputPadPtr + ((y + padY) * paddedWidth + padX) * unit * bytes; + ::memcpy(dst, src, src_width * unit * bytes); } - postFunc((float*)dst_z, (float*)dst_z, (const float*)bias_z, dst_width * dst_height, 0, 0, 1, postData.data()); + + // Compute + kernelFunc((float*)dstOrigin, (const float*)(inputPadPtr), (const float*)weight_dz, dst_width, strideX * unit, kernel_width, kernel_height, dilateX_step, dilateY_step, dst_height, src_y_step * strideY, dst_y_step, (const float*)bias_z, postData.data() + 2); } }; - mNumber = numberThread; - + backend()->onReleaseBuffer(mInputPad.get(), Backend::DYNAMIC); return NO_ERROR; } @@ -281,11 +267,6 @@ class CPUConvolutionDepthwiseCreator : public CPUBackend::Creator { if (inputs.empty()) { return new CPUConvolutionDepthwise::FloatExecution(conv2d->common(), backend, originWeight, originWeightSize, originBias, originBiasSize); } - auto 
core = static_cast(backend)->functions(); - if (conv->dilateX() == 1 && conv->dilateY() == 1 && conv->strideX() == 1 && conv->strideY() == 1 && - conv->kernelX() == 3 && conv->kernelY() == 3 && outputs[0]->width() >= 2 && outputs[0]->height() >= 2 && core->MNNMultiAndDestTransformCommon23 != nullptr) { - return new ConvolutionDepthwise3x3(conv, backend, originWeight, originWeightSize, originBias, originBiasSize); - } return new CPUConvolutionDepthwise::FloatExecution(conv2d->common(), backend, originWeight, originWeightSize, originBias, originBiasSize); } }; diff --git a/source/backend/cpu/CPUConvolutionDepthwise.hpp b/source/backend/cpu/CPUConvolutionDepthwise.hpp index 9b7cbecbc..91efb7b01 100644 --- a/source/backend/cpu/CPUConvolutionDepthwise.hpp +++ b/source/backend/cpu/CPUConvolutionDepthwise.hpp @@ -26,7 +26,12 @@ class CPUConvolutionDepthwise { private: std::function mExecutor; + std::function mFastKernel; int mNumber = 1; + std::shared_ptr mInputPad; + bool mFastKernelApply = false; }; class MultiInputFloatExecution : public BasicFloatExecution { public: diff --git a/source/backend/cpu/CPUDepthwiseConvInt8.cpp b/source/backend/cpu/CPUDepthwiseConvInt8.cpp index 0df722bb4..8f94e84fc 100644 --- a/source/backend/cpu/CPUDepthwiseConvInt8.cpp +++ b/source/backend/cpu/CPUDepthwiseConvInt8.cpp @@ -142,7 +142,7 @@ ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector& inputs, con int size_ = mMutableResource.mBiasInt32->length(0); if (core->ConvDepthwise3x3LineInt8_ARM82) { - if (kernel_width == 3 && kernel_height == 3 && strideX == 1 && strideY == 1 && dilateX == 1 && dilateY == 1 && gcore->MNNMultiAndDestTransformCommon23 != nullptr && dst_width >= 2 && dst_height >= 2) { + if (kernel_width == 3 && kernel_height == 3 && strideX == 1 && strideY == 1 && dilateX == 1 && dilateY == 1 && dst_width >= 2 && dst_height >= 2) { mUse3x3Kernel = true; mThreadFunction = core->ConvDepthwise3x3LineInt8_ARM82; UNIT = 4; @@ -247,7 +247,7 @@ class CPUDepthwiseConvInt8Creator : public CPUBackend::Creator { if (core->ConvDepthwise3x3LineInt8_ARM82) { if (common->kernelX() == 3 && common->kernelY() == 3 && common->strideX() == 1 && common->strideY() == 1 && common->dilateX() == 1 - && common->dilateY() == 1 && gcore->MNNMultiAndDestTransformCommon23 != nullptr && outputs[0]->width() >= 2 && outputs[0]->height() >= 2) { + && common->dilateY() == 1 && outputs[0]->width() >= 2 && outputs[0]->height() >= 2) { use3x3kernel = true; UNIT = 4; } diff --git a/source/backend/cpu/CPUGridSample.cpp b/source/backend/cpu/CPUGridSample.cpp index 3cc633d3d..e39c22171 100644 --- a/source/backend/cpu/CPUGridSample.cpp +++ b/source/backend/cpu/CPUGridSample.cpp @@ -98,8 +98,8 @@ ErrorCode CPUGridSample::onExecute(const std::vector &inputs, const st auto outW = outputTensor->buffer().dim[4].extent; auto threadCount = static_cast(backend())->threadNumber(); auto tileCount = outD; - auto inOffset = batches * inH * inW * core->pack; - auto outOffset = batches * outH * outW * core->pack; + auto inOffset = batches * inD * inH * inW * core->pack; + auto outOffset = batches * outD * outH * outW * core->pack; auto cordPtr = mTempCordBuffer->host(); for (auto b = 0; b < batches; ++b) { auto _inputPtr = inputPtr + b * inD * inH * inW * core->pack * core->bytes; @@ -109,10 +109,9 @@ ErrorCode CPUGridSample::onExecute(const std::vector &inputs, const st // Compute cord MNN_CONCURRENCY_BEGIN(tId, threadCount) { for (int index=tId; index < tileCount; index += threadCount) { - auto c = index / outD; - auto d = index % outD; - auto 
inputC = _inputPtr + c * inD * inW * inH * batches * core->pack * core->bytes; - auto outputC = _outputPtr + c * outD * outW * outH * batches * core->pack * core->bytes; + auto d = index; + auto inputC = _inputPtr; + auto outputC = _outputPtr; auto cordD = cordPtr + d * outH * outW * 3 * core->bytes; auto outputD = outputC + d * outH * outW * core->pack * core->bytes; for (int h = 0; h < outH; h++) { diff --git a/source/backend/cpu/CPURuntime.cpp b/source/backend/cpu/CPURuntime.cpp index 41f04fd8b..17a653f52 100644 --- a/source/backend/cpu/CPURuntime.cpp +++ b/source/backend/cpu/CPURuntime.cpp @@ -1373,6 +1373,9 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) { } group.ids = _readNumber((const char*)buffer.get(), buffer.size()); } + if (group.ids.empty()) { + continue; + } std::string minfreq = policyName + "/cpuinfo_min_freq"; { MNN::AutoStorage buffer; @@ -1439,6 +1442,11 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) { _getInfoApple(cpuinfo_isa); #endif +#if defined(__aarch64__) && defined(_WIN32) + cpuinfo_isa->fp16arith = true; + cpuinfo_isa->dot = true; +#endif + MNN_PRINT("The device supports: i8sdot:%d, fp16:%d, i8mm: %d, sve2: %d\n", cpuinfo_isa->dot, cpuinfo_isa->fp16arith, cpuinfo_isa->i8mm, cpuinfo_isa->sve2); return; } diff --git a/source/backend/cpu/GridSampler.hpp b/source/backend/cpu/GridSampler.hpp index e2e738d26..895521349 100644 --- a/source/backend/cpu/GridSampler.hpp +++ b/source/backend/cpu/GridSampler.hpp @@ -138,7 +138,7 @@ static int MNNGridSampleComputeOffset3D(int d, int h, int w, int depth, int heig h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h); w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w); } - return ((d * height + h) * width + w) * 4; + return ((d * height + h) * width + w) * PACK; } static void MNNGridSampleInterp3D(FLOAT* outputPtr, const FLOAT* inputPtr, const FLOAT* cordPtr, size_t inD, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) { diff --git a/source/backend/cpu/arm/CMakeLists.txt b/source/backend/cpu/arm/CMakeLists.txt index d8d06136c..37ae4c6d4 100644 --- a/source/backend/cpu/arm/CMakeLists.txt +++ b/source/backend/cpu/arm/CMakeLists.txt @@ -30,7 +30,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?") if (MNN_SUPPORT_BF16) target_compile_options(MNNARM32 PRIVATE -DMNN_SUPPORT_BF16) endif() -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64") message(STATUS "Enabling AArch64 Assemblies") add_library(MNNARM64 OBJECT ${MNN_AArch64_SRC} ${MNN_NEON_SRC}) target_include_directories(MNNARM64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/) @@ -42,11 +42,6 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64") target_compile_options(MNNARM64 PRIVATE -DMNN_SUPPORT_BF16) endif() - if(MNN_ARM82) - message(STATUS "Enable INT8 SDOT") - target_compile_options(MNNARM64 PRIVATE -DENABLE_ARMV82) - endif() - else() # Building fat binary requires multiple separate builds and lipo-by-hand under CMake's design endif() diff --git a/source/backend/cpu/arm/FunctionSummary.hpp b/source/backend/cpu/arm/FunctionSummary.hpp index 4c9a3ad19..be435004d 100644 --- a/source/backend/cpu/arm/FunctionSummary.hpp +++ b/source/backend/cpu/arm/FunctionSummary.hpp @@ -34,9 +34,6 @@ void NEON_MNNPackedMatMul_BF16(float* C, const float* A, const float* B, const s const float* postParameters, const float* bias, const float* 
k, const float* b); void NEON_MNNPackedMatMulRemain_BF16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b); - -void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); void NEON_MNNConvRunForLineDepthwise_BF16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep); diff --git a/source/backend/cpu/arm/arm32/MNNConvRunForLineDepthwise.S b/source/backend/cpu/arm/arm32/MNNConvRunForLineDepthwise.S index 6fde7c37b..2cccf62ea 100644 --- a/source/backend/cpu/arm/arm32/MNNConvRunForLineDepthwise.S +++ b/source/backend/cpu/arm/arm32/MNNConvRunForLineDepthwise.S @@ -34,8 +34,16 @@ ldr r8, [sp, #48] ldr lr, [sp, #52] ldr r10, [sp, #56] ldr r11, [sp, #60] +ldr r12, [sp, #64] // bias +vld1.32 {q0}, [r12] // bias +ldr r12, [sp, #68] // min,max +vld1.32 {d2[0]}, [r12]! +vld1.32 {d2[1]}, [r12] vpush {q4-q7} +vmov.f32 q5, q0 // bias +vdup.f32 q4, d2[0] // min +vdup.f32 q6, d2[1] // max mov r12, #4 mul r4, r12, r4 @@ -59,14 +67,14 @@ mov r12, #8 mul r12, r4, r12 L8Loop: - vmov.i32 q8, #0 - vmov.i32 q9, #0 - vmov.i32 q10, #0 - vmov.i32 q11, #0 - vmov.i32 q12, #0 - vmov.i32 q13, #0 - vmov.i32 q14, #0 - vmov.i32 q15, #0 + vmov.f32 q8, q5 // use bias to init + vmov.f32 q9, q5 + vmov.f32 q10, q5 + vmov.f32 q11, q5 + vmov.f32 q12, q5 + vmov.f32 q13, q5 + vmov.f32 q14, q5 + vmov.f32 q15, q5 vmov.i32 d14[0], r1 vmov.i32 d14[1], r2 @@ -103,6 +111,22 @@ L8Loop: bne L8LoopH sub r3, r3, #8 + vmax.f32 q8, q8, q4 + vmax.f32 q9, q9, q4 + vmax.f32 q10, q10, q4 + vmax.f32 q11, q11, q4 + vmax.f32 q12, q12, q4 + vmax.f32 q13, q13, q4 + vmax.f32 q14, q14, q4 + vmax.f32 q15, q15, q4 + vmin.f32 q8, q8, q6 + vmin.f32 q9, q9, q6 + vmin.f32 q10, q10, q6 + vmin.f32 q11, q11, q6 + vmin.f32 q12, q12, q6 + vmin.f32 q13, q13, q6 + vmin.f32 q14, q14, q6 + vmin.f32 q15, q15, q6 vst1.32 {q8, q9}, [r0]! vmov.i32 r1, d14[0] vmov.i32 r2, d14[1] @@ -121,13 +145,13 @@ mov r12, #4 mul r12, r4, r12 L4Loop: - vmov.i32 q8, #0 - vmov.i32 q9, #0 - vmov.i32 q10, #0 - vmov.i32 q11, #0 + vmov.f32 q8, q5 + vmov.f32 q9, q5 + vmov.f32 q10, q5 + vmov.f32 q11, q5 - vmov.i32 d8[0], r1 - vmov.i32 d9[0], r2 + vmov.i32 d14[0], r1 + vmov.i32 d14[1], r2 mov lr, r6 L4LoopH: mov r10, r5 @@ -151,10 +175,18 @@ L4Loop: add r1, r1, r8 bne L4LoopH + vmax.f32 q8, q8, q4 + vmax.f32 q9, q9, q4 + vmax.f32 q10, q10, q4 + vmax.f32 q11, q11, q4 + vmin.f32 q8, q8, q6 + vmin.f32 q9, q9, q6 + vmin.f32 q10, q10, q6 + vmin.f32 q11, q11, q6 sub r3, r3, #4 vst1.32 {q8, q9}, [r0]! - vmov.i32 r1, d8[0] - vmov.i32 r2, d9[0] + vmov.i32 r1, d14[0] + vmov.i32 r2, d14[1] vst1.32 {q10, q11}, [r0]! add r1, r1, r12 cmp r3, #4 @@ -168,7 +200,7 @@ cmp r3, #0 beq End L1Loop: - vmov.i32 q0, #0 + vmov.f32 q0, q5 mov lr, r6 mov r11, r1 mov r12, r2 @@ -184,6 +216,8 @@ L1Loop: add r1, r1, r8 bne L1LoopH + vmax.f32 q0, q0, q4 + vmin.f32 q0, q0, q6 subs r3, r3, #1 vst1.32 {q0}, [r0]! 
mov r2, r12 @@ -203,6 +237,5 @@ bne LoopDY vpop {q4-q7} pop {r4-r8, r10, r11, pc} - #endif #endif diff --git a/source/backend/cpu/arm/arm32/MNNConvRunForUnitDepthWise.S b/source/backend/cpu/arm/arm32/MNNConvRunForUnitDepthWise.S deleted file mode 100644 index 06c98c03b..000000000 --- a/source/backend/cpu/arm/arm32/MNNConvRunForUnitDepthWise.S +++ /dev/null @@ -1,74 +0,0 @@ -// -// MNNConvRunForUnitDepthWise.S -// MNN -// -// Created by MNN on 2019/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __arm__ -#ifndef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNConvRunForUnitDepthWise -//void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step) - -//Auto: r0:dst, r1:src, r2:weight, r3:fw - -push {r4-r8, lr} - -//Load from sp: -//r5:fh, r6:weight_y_step, r7:dilate_x_step, r8:dilate_y_step -mov r4, r3 -ldr r5, [sp, #24] -ldr r6, [sp, #28] -ldr r7, [sp, #32] -ldr r8, [sp, #36] - -cmp r4, #0 -vmov.i32 q0, #0 -beq UnitEnd -cmp r5, #0 -beq UnitEnd - -mov lr, #4 -mul r6, lr, r6 -mul r7, lr, r7 -mul r8, lr, r8 - -//dilate_y_step -> dilate_y_step - dilate_x_step*fw -mul lr, r4, r7 -sub r8, r8, lr - -//weight_y_step -> weight_y_step - 4*sizeof(float)*fw -mov lr, #16 -mul lr, r4, lr -sub r6, r6, lr - - -UnitLoopH: -mov lr, r4 -UnitLoopW: -vld1.32 {q1}, [r1], r7 -vld1.32 {q2}, [r2]! -vmla.f32 q0, q1, q2 -subs lr, lr, #1 -bne UnitLoopW -subs r5, r5, #1 -add r1, r1, r8 -add r2, r2, r6 -bne UnitLoopH - - -UnitEnd: - -vst1.32 {q0}, [r0] - -pop {r4-r8, pc} - -#endif -#endif diff --git a/source/backend/cpu/arm/arm32/MNNDepthwiseConvFastKernel.S b/source/backend/cpu/arm/arm32/MNNDepthwiseConvFastKernel.S new file mode 100644 index 000000000..3c71c406d --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNDepthwiseConvFastKernel.S @@ -0,0 +1,221 @@ +// +// MNNDepthwiseConvFastKernel.S +// MNN +// +// Created by MNN on 2019/02/04. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNDepthwiseConvFastKernel +//void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep) + + +//Auto Load: +//r0:dst, r1:src, r2:weight, r3:width + +push {r4-r8, r10, r11, lr} + +//Load From Sp +//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, lr: height, r10:srcHStep, r11:dstHStep +ldr r4, [sp, #32] +ldr r5, [sp, #36] +ldr r6, [sp, #40] +ldr r7, [sp, #44] +ldr r8, [sp, #48] +ldr lr, [sp, #52] +ldr r10, [sp, #56] +ldr r11, [sp, #60] +ldr r12, [sp, #64] // bias +vld1.32 {q0}, [r12] // bias +ldr r12, [sp, #68] // min,max +vld1.32 {d2[0]}, [r12]! 
+vld1.32 {d2[1]}, [r12] + +vpush {q4-q7} +vmov.f32 q5, q0 // bias +vdup.f32 q4, d2[0] // min +vdup.f32 q6, d2[1] // max + +mov r12, #4 +mul r4, r12, r4 +mul r7, r12, r7 +mul r8, r12, r8 +mul r10, r12, r10 +mul r11, r12, r11 + +//dilate_y_step -> dilate_y_step - fw*dilate_x_step +mul r12, r5, r7 +sub r8, r8, r12 + +LoopDY: +push {r0, r1, r3, r10, r11, lr} + +L8: +cmp r3, #7 +ble L4 + +L8Loop: + vmov.f32 q8, q5 // use bias to init + vmov.f32 q9, q5 + vmov.f32 q10, q5 + vmov.f32 q11, q5 + vmov.f32 q12, q5 + vmov.f32 q13, q5 + vmov.f32 q14, q5 + vmov.f32 q15, q5 + + mov r12, r1 + mov r4, r2 + mov lr, r6 + L8LoopH: + mov r10, r5 + L8LoopW: + vld1.32 {q7}, [r2]! + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + subs r10, r10, #1 + vmla.f32 q8, q0, q7 + vmla.f32 q9, q1, q7 + vmla.f32 q10, q2, q7 + vmla.f32 q11, q3, q7 + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1] + vmla.f32 q12, q0, q7 + vmla.f32 q13, q1, q7 + vmla.f32 q14, q2, q7 + vmla.f32 q15, q3, q7 + sub r1, r1, #80 + + bne L8LoopW + L8LoopWEnd: + subs lr, lr, #1 + add r1, r1, r8 + bne L8LoopH + + sub r3, r3, #8 + vmax.f32 q8, q8, q4 + vmax.f32 q9, q9, q4 + vmax.f32 q10, q10, q4 + vmax.f32 q11, q11, q4 + vmax.f32 q12, q12, q4 + vmax.f32 q13, q13, q4 + vmax.f32 q14, q14, q4 + vmax.f32 q15, q15, q4 + vmin.f32 q8, q8, q6 + vmin.f32 q9, q9, q6 + vmin.f32 q10, q10, q6 + vmin.f32 q11, q11, q6 + vmin.f32 q12, q12, q6 + vmin.f32 q13, q13, q6 + vmin.f32 q14, q14, q6 + vmin.f32 q15, q15, q6 + vst1.32 {q8, q9}, [r0]! + mov r1, r12 + mov r2, r4 + vst1.32 {q10, q11}, [r0]! + vst1.32 {q12, q13}, [r0]! + vst1.32 {q14, q15}, [r0]! + add r1, r1, #128 + cmp r3, #8 + bge L8Loop + +L4: +cmp r3, #3 +ble L1 + +L4Loop: + vmov.f32 q8, q5 + vmov.f32 q9, q5 + vmov.f32 q10, q5 + vmov.f32 q11, q5 + + mov r12, r1 + mov r4, r2 + mov lr, r6 + L4LoopH: + mov r10, r5 + L4LoopW: + vld1.32 {q12}, [r2]! + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1] + sub r1, r1, #16 + subs r10, r10, #1 + vmla.f32 q8, q12, q0 + vmla.f32 q9, q12, q1 + vmla.f32 q10, q12, q2 + vmla.f32 q11, q12, q3 + + bne L4LoopW + subs lr, lr, #1 + add r1, r1, r8 + bne L4LoopH + + vmax.f32 q8, q8, q4 + vmax.f32 q9, q9, q4 + vmax.f32 q10, q10, q4 + vmax.f32 q11, q11, q4 + vmin.f32 q8, q8, q6 + vmin.f32 q9, q9, q6 + vmin.f32 q10, q10, q6 + vmin.f32 q11, q11, q6 + sub r3, r3, #4 + vst1.32 {q8, q9}, [r0]! + mov r1, r12 + mov r2, r4 + vst1.32 {q10, q11}, [r0]! + add r1, r1, #64 + cmp r3, #4 + bge L4Loop + +L1: +cmp r3, #0 +beq End +L1Loop: + vmov.f32 q0, q5 + mov lr, r6 + mov r11, r1 + mov r12, r2 + L1LoopH: + mov r10, r5 + L1LoopW: + vld1.32 {q1}, [r1]! + vld1.32 {q2}, [r2]! + vmla.f32 q0, q1, q2 + subs r10, r10, #1 + bne L1LoopW + subs lr, lr, #1 + add r1, r1, r8 + bne L1LoopH + + vmax.f32 q0, q0, q4 + vmin.f32 q0, q0, q6 + subs r3, r3, #1 + vst1.32 {q0}, [r0]! 
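
The L8 path above accumulates eight horizontally adjacent C4 outputs against a single weight vector per inner step; the back-to-back vld1 loads plus the fixed "sub r1, r1, #80" / "add r1, r1, #128" pointer adjustments only make sense when consecutive taps and consecutive outputs are exactly one pack (16 bytes) apart, so the fast kernel appears to assume stride 1 and X-dilation 1. A scalar model of one 8-output tile under that assumed precondition:

    #include <cstddef>

    // Scalar model of the L8 tile: eight adjacent output pixels in C4 layout share
    // one 4-float weight tap per inner step. Assumes stride 1 and dilation 1 in X
    // (consecutive taps and consecutive outputs are one pack apart), which is the
    // precondition under which this fast kernel is expected to be selected.
    static void FastKernelTile8Ref(float* dst, const float* src, const float* weight,
                                   size_t fw, size_t fh, size_t srcYStep /* floats */,
                                   const float* bias, float minV, float maxV) {
        float acc[8][4];
        for (int o = 0; o < 8; ++o)
            for (int i = 0; i < 4; ++i) acc[o][i] = bias[i];        // seeded with bias
        for (size_t fy = 0; fy < fh; ++fy) {
            for (size_t fx = 0; fx < fw; ++fx) {
                const float* w = weight + 4 * (fy * fw + fx);        // one C4 weight tap
                const float* s = src + fy * srcYStep + 4 * fx;       // contiguous in x
                for (int o = 0; o < 8; ++o)
                    for (int i = 0; i < 4; ++i)
                        acc[o][i] += s[4 * o + i] * w[i];
            }
        }
        for (int o = 0; o < 8; ++o)
            for (int i = 0; i < 4; ++i) {
                float v = acc[o][i] < minV ? minV : acc[o][i];       // fused clamp
                dst[4 * o + i] = v > maxV ? maxV : v;
            }
    }
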
+ mov r2, r12 + add r1, r11, #16 + bne L1Loop + + +End: + +pop {r0, r1, r3, r10, r11, lr} +add r0, r0, r11 +subs lr, lr, #1 +add r1, r1, r10 +bne LoopDY + +vpop {q4-q7} +pop {r4-r8, r10, r11, pc} + + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S index 8b62af530..9c37ae75d 100644 --- a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S +++ b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S @@ -65,9 +65,7 @@ ldr r12, [r6, #8] // int8 max str r12, [sp, #16] ldr r12, [r6, #12] // int8 min str r12, [sp, #20] -ldr r12, [r6, #40] // blockNum -mul r12, r12, r3 // src_depth_quad=src_depth_quad*blockNum -lsl r12, r12, #6 // weight_stride = src_depth_quad*LP*HP +lsl r12, r3, #6 // weight_stride = src_depth_quad*LP*HP str r12, [sp, #24] ldr r12, [r6, #48] // extraScale str r12, [sp, #28] diff --git a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S index 0e3966b9e..f3cdc98f9 100644 --- a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S +++ b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S @@ -65,9 +65,7 @@ ldr r12, [r6, #32] // weightBias str r12, [sp, #8] ldr r12, [r6, #36] // f32minmax str r12, [sp, #12] -ldr r12, [r6, #40] // blockNum -mul r12, r12, r3 // src_depth_quad=src_depth_quad*blockNum -lsl r12, r12, #5 // weight_stride = src_depth_quad*LP*HP +lsl r12, r3, #5 // weight_stride = src_depth_quad*LP*HP str r12, [sp, #16] ldr r12, [r6, #48] // extraScale str r12, [sp, #20] @@ -82,12 +80,14 @@ L2LoopDz: subs r12, r3, #1 // first four output vld1.8 {q2}, [r1]! - vld1.8 {q4}, [r2]! // weight, d8,d9,d10,d11 + vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11 // int4->int8 - vmov.i8 q5, #15 - vand.i8 q5, q5, q4 + vmov.i8 q6, #15 + vmov.i8 q7, #15 + vand.i8 q6, q6, q4 + vand.i8 q7, q7, q5 vshr.u8 q4, q4, #4 - vzip.8 q4, q5 + vshr.u8 q5, q5, #4 vmull.s8 q0, d4, d8 vmull.s8 q1, d4, d10 @@ -95,12 +95,6 @@ L2LoopDz: vmlal.s8 q1, d5, d11 vpaddl.s16 q8, q0 vpaddl.s16 q9, q1 - vld1.8 {q6}, [r2]! // weight,d12,d13,d14,d15 - // int4->int8 - vmov.i8 q7, #15 - vand.i8 q7, q7, q6 - vshr.u8 q6, q6, #4 - vzip.8 q6, q7 vmull.s8 q0, d4, d12 vmull.s8 q1, d4, d14 @@ -129,22 +123,18 @@ L2LoopDz: L2LoopSz: // first four output vld1.8 {q2}, [r1]! - vld1.8 {q4}, [r2]! + vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11 // int4->int8 - vmov.i8 q5, #15 - vand.i8 q5, q5, q4 + vmov.i8 q6, #15 + vmov.i8 q7, #15 + vand.i8 q6, q6, q4 + vand.i8 q7, q7, q5 vshr.u8 q4, q4, #4 - vzip.8 q4, q5 + vshr.u8 q5, q5, #4 vmull.s8 q0, d4, d8 vmull.s8 q1, d4, d10 vmlal.s8 q0, d5, d9 vmlal.s8 q1, d5, d11 - vld1.8 {q6}, [r2]! - // int4->int8 - vmov.i8 q7, #15 - vand.i8 q7, q7, q6 - vshr.u8 q6, q6, #4 - vzip.8 q6, q7 vpadal.s16 q8, q0 vpadal.s16 q9, q1 @@ -269,12 +259,14 @@ L1LoopDz: subs r12, r3, #1 // first four output vld1.8 {q2}, [r1]! - vld1.8 {q4}, [r2]! + vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11 // int4->int8 - vmov.i8 q5, #15 - vand.i8 q5, q5, q4 + vmov.i8 q6, #15 + vmov.i8 q7, #15 + vand.i8 q6, q6, q4 + vand.i8 q7, q7, q5 vshr.u8 q4, q4, #4 - vzip.8 q4, q5 + vshr.u8 q5, q5, #4 vmull.s8 q0, d4, d8 vmull.s8 q1, d4, d10 @@ -282,12 +274,6 @@ L1LoopDz: vmlal.s8 q1, d5, d11 vpaddl.s16 q8, q0 vpaddl.s16 q9, q1 - vld1.8 {q6}, [r2]! 
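
In the int4 GEMM above, the kernel now loads 32 bytes of packed weights at once and keeps the shift/mask results as two separate groups (all high nibbles, then all low nibbles), so the per-iteration vzip.8 disappears. That only works if the host-side weight repacking (the ConvInt8TiledExecutor changes later in this diff) already stores the nibbles in that de-interleaved order; the sketch below contrasts the two expansion orders and is an illustration, not the actual packing code.

    #include <cstdint>

    // Expanding packed int4 weights to one byte per lane. The old kernel produced
    // interleaved order and therefore needed vzip.8 after the shift/mask; the new
    // kernel consumes "grouped" order: per 32-byte load, first the 32 high nibbles,
    // then the 32 low nibbles. Values stay in 0..15 here, as in the assembly
    // (any zero-point/weight-bias correction is applied elsewhere).
    static void UnpackInterleaved16(const uint8_t* in /*16 bytes*/, uint8_t* out /*32*/) {
        for (int i = 0; i < 16; ++i) {
            out[2 * i + 0] = in[i] >> 4;     // high nibble
            out[2 * i + 1] = in[i] & 0xF;    // low nibble
        }
    }
    static void UnpackGrouped32(const uint8_t* in /*32 bytes*/, uint8_t* out /*64*/) {
        for (int i = 0; i < 32; ++i) {
            out[i]      = in[i] >> 4;        // lanes  0..31: high nibbles
            out[32 + i] = in[i] & 0xF;       // lanes 32..63: low nibbles
        }
    }
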
- // int4->int8 - vmov.i8 q7, #15 - vand.i8 q7, q7, q6 - vshr.u8 q6, q6, #4 - vzip.8 q6, q7 vmull.s8 q0, d4, d12 vmull.s8 q1, d4, d14 @@ -302,22 +288,18 @@ L1LoopDz: L1LoopSz: // first four output vld1.8 {q2}, [r1]! - vld1.8 {q4}, [r2]! + vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11 // int4->int8 - vmov.i8 q5, #15 - vand.i8 q5, q5, q4 + vmov.i8 q6, #15 + vmov.i8 q7, #15 + vand.i8 q6, q6, q4 + vand.i8 q7, q7, q5 vshr.u8 q4, q4, #4 - vzip.8 q4, q5 + vshr.u8 q5, q5, #4 vmull.s8 q0, d4, d8 vmull.s8 q1, d4, d10 vmlal.s8 q0, d5, d9 vmlal.s8 q1, d5, d11 - vld1.8 {q6}, [r2]! - // int4->int8 - vmov.i8 q7, #15 - vand.i8 q7, q7, q6 - vshr.u8 q6, q6, #4 - vzip.8 q6, q7 vpadal.s16 q8, q0 vpadal.s16 q9, q1 diff --git a/source/backend/cpu/arm/arm64/MNNConvRunForLineDepthwise.S b/source/backend/cpu/arm/arm64/MNNConvRunForLineDepthwise.S index 9a1bcbf46..08c174af3 100644 --- a/source/backend/cpu/arm/arm64/MNNConvRunForLineDepthwise.S +++ b/source/backend/cpu/arm/arm64/MNNConvRunForLineDepthwise.S @@ -26,6 +26,12 @@ ldr x8, [sp, #0] ldr x15, [sp, #8] ldr x10, [sp, #16] ldr x11, [sp, #24] +ldr x12, [sp, #32] +ldr x13, [sp, #40] + +stp d8, d9, [sp, #(-16 * 3)]! +stp d10, d11, [sp, #(16 * 2)] +stp x19, x20, [sp, #(16 * 1)] mov x9, #4 mul x4, x9, x4 @@ -34,10 +40,32 @@ mul x8, x9, x8 mul x10, x9, x10 mul x11, x9, x11 +ld1 {v8.4s}, [x12] // bias +ld1r {v10.4s}, [x13], #4 // min +ld1r {v11.4s}, [x13] + //dilate_y_step -> dilate_y_step - fw*dilate_x_step mul x9, x5, x7 sub x8, x8, x9 +.macro assign_bias x0, x1, x2, x3 + mov \x0\().16b, v8.16b + mov \x1\().16b, v8.16b + mov \x2\().16b, v8.16b + mov \x3\().16b, v8.16b +.endm + +.macro compare_min_max x0, x1, x2, x3, xmin, xmax + fmax \x0\().4s, \x0\().4s, \xmin\().4s + fmax \x1\().4s, \x1\().4s, \xmin\().4s + fmax \x2\().4s, \x2\().4s, \xmin\().4s + fmax \x3\().4s, \x3\().4s, \xmin\().4s + fmin \x0\().4s, \x0\().4s, \xmax\().4s + fmin \x1\().4s, \x1\().4s, \xmax\().4s + fmin \x2\().4s, \x2\().4s, \xmax\().4s + fmin \x3\().4s, \x3\().4s, \xmax\().4s +.endm + LoopDY: mov v4.d[0], x10 mov v4.d[1], x11 @@ -53,22 +81,10 @@ mov x12, #16 mul x12, x4, x12 L16Loop: - movi v16.4s, #0 - movi v17.4s, #0 - movi v18.4s, #0 - movi v19.4s, #0 - movi v20.4s, #0 - movi v21.4s, #0 - movi v22.4s, #0 - movi v23.4s, #0 - movi v24.4s, #0 - movi v25.4s, #0 - movi v26.4s, #0 - movi v27.4s, #0 - movi v28.4s, #0 - movi v29.4s, #0 - movi v30.4s, #0 - movi v31.4s, #0 + assign_bias v16, v17, v18, v19 + assign_bias v20, v21, v22, v23 + assign_bias v24, v25, v26, v27 + assign_bias v28, v29, v30, v31 mov x13, x1 mov x14, x2 @@ -120,6 +136,10 @@ L16Loop: bne L16LoopH sub x3, x3, #16 + compare_min_max v16, v17, v18, v19, v10, v11 + compare_min_max v20, v21, v22, v23, v10, v11 + compare_min_max v24, v25, v26, v27, v10, v11 + compare_min_max v28, v29, v30, v31, v10, v11 st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 add x1, x13, x12 cmp x3, #16 @@ -138,14 +158,8 @@ mov x12, #8 mul x12, x4, x12 L8Loop: - movi v16.4s, #0 - movi v17.4s, #0 - movi v18.4s, #0 - movi v19.4s, #0 - movi v20.4s, #0 - movi v21.4s, #0 - movi v22.4s, #0 - movi v23.4s, #0 + assign_bias v16, v17, v18, v19 + assign_bias v20, v21, v22, v23 mov x13, x1 mov x14, x2 @@ -180,6 +194,8 @@ L8Loop: add x1, x1, x8 bne L8LoopH + compare_min_max v16, v17, v18, v19, v10, v11 + compare_min_max v20, v21, v22, v23, v10, v11 sub x3, x3, #8 st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 add x1, x13, x12 @@ -195,10 +211,7 @@ mov x12, #4 mul x12, x4, x12 L4Loop: - movi v16.4s, #0 - movi v17.4s, #0 - movi v18.4s, #0 - movi v19.4s, #0 + assign_bias 
v16, v17, v18, v19 mov x13, x1 mov x14, x2 @@ -225,6 +238,7 @@ L4Loop: add x1, x1, x8 bne L4LoopH + compare_min_max v16, v17, v18, v19, v10, v11 sub x3, x3, #4 st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 add x1, x13, x12 @@ -235,7 +249,7 @@ cmp x3, #0 beq End L1Loop: - movi v0.4s, #0 + mov v0.16b, v8.16b mov x9, x6 mov x11, x1 mov x12, x2 @@ -252,6 +266,8 @@ L1Loop: bne L1LoopH subs x3, x3, #1 + fmax v0.4s, v0.4s, v10.4s + fmin v0.4s, v0.4s, v11.4s st1 {v0.4s}, [x0], #16 mov x2, x12 add x1, x11, x4 @@ -271,7 +287,9 @@ add x0, x0, x11 add x1, x1, x10 bne LoopDY - +ldp x19, x20, [sp, #(16 * 1)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d8, d9, [sp], #(16 * 3) ret //MNNConvRunForLineDepthwise End diff --git a/source/backend/cpu/arm/arm64/MNNConvRunForUnitDepthWise.S b/source/backend/cpu/arm/arm64/MNNConvRunForUnitDepthWise.S deleted file mode 100644 index 1036c90eb..000000000 --- a/source/backend/cpu/arm/arm64/MNNConvRunForUnitDepthWise.S +++ /dev/null @@ -1,63 +0,0 @@ -// -// MNNConvRunForUnitDepthWise.S -// MNN -// -// Created by MNN on 2019/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNConvRunForUnitDepthWise -//void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step) - -//Auto: x0:dst, x1:src, x2:weight, x3:fw -//x4:fh, x5:weight_y_step, x6:dilate_x_step, x7:dilate_y_step - -cmp x3, #0 -movi v0.4s, #0 -beq UnitEnd -cmp x4, #0 -beq UnitEnd - -mov x9, #4 -mul x5, x9, x5 -mul x6, x9, x6 -mul x7, x9, x7 - -//dilate_y_step -> dilate_y_step - dilate_x_step*fw -mul x9, x3, x6 -sub x7, x7, x9 - -//weight_y_step -> weight_y_step - 4*sizeof(float)*fw -mov x9, #16 -mul x9, x3, x9 -sub x5, x5, x9 - - -UnitLoopH: -mov x9, x3 -UnitLoopW: -ld1 {v1.4s}, [x1], x6 -ld1 {v2.4s}, [x2], #16 -fmla v0.4s, v1.4s, v2.4s -subs x9, x9, #1 -bne UnitLoopW -subs x4, x4, #1 -add x1, x1, x7 -add x2, x2, x5 -bne UnitLoopH - - -UnitEnd: - -st1 {v0.4s}, [x0] - -ret - -#endif diff --git a/source/backend/cpu/arm/arm64/MNNDepthwiseConvFastKernel.S b/source/backend/cpu/arm/arm64/MNNDepthwiseConvFastKernel.S new file mode 100644 index 000000000..79770ba09 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNDepthwiseConvFastKernel.S @@ -0,0 +1,292 @@ +// +// MNNDepthwiseConvFastKernel.S +// MNN +// +// Created by MNN on 2024/09/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNDepthwiseConvFastKernel + +// void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, +// size_t srcHStep, size_t dstHStep); +//Auto Load: +//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_step=pack*1, x5:fw, x6:fh, x7:dilate_x_step + +//Load From sp: +//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13: minmax +ldr x8, [sp, #0] +ldr x15, [sp, #8] +ldr x10, [sp, #16] +ldr x11, [sp, #24] +ldr x12, [sp, #32] +ldr x13, [sp, #40] + +stp d14, d15, [sp, #(-16 * 9)]! 
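
The arm64 version expresses the same fusion through the assign_bias and compare_min_max macros shown above; because v8-v11 now carry bias and the min/max bounds across the whole loop, the prologue has to preserve d8-d11 (the low halves of v8-v15 are callee-saved under AAPCS64). Below is a C++ analogue of the two macros, a sketch that assumes the Vec4 alias and header already used by CommonOptFunction.cpp elsewhere in this diff.

    #include "math/Vec.hpp"                       // assumed: same header as CommonOptFunction.cpp
    using Vec4 = MNN::Math::Vec<float, 4>;        // assumed alias, as in CommonOptFunction.cpp

    static inline void AssignBias(Vec4& a0, Vec4& a1, Vec4& a2, Vec4& a3, const Vec4& bias) {
        a0 = bias; a1 = bias; a2 = bias; a3 = bias;            // accumulators start at bias
    }
    static inline void CompareMinMax(Vec4& a0, Vec4& a1, Vec4& a2, Vec4& a3,
                                     const Vec4& minF, const Vec4& maxF) {
        a0 = Vec4::min(Vec4::max(a0, minF), maxF);             // clamp to [min, max]
        a1 = Vec4::min(Vec4::max(a1, minF), maxF);
        a2 = Vec4::min(Vec4::max(a2, minF), maxF);
        a3 = Vec4::min(Vec4::max(a3, minF), maxF);
    }
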
+stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] +stp x21, x22, [sp, #(16 * 4)] +stp x19, x20, [sp, #(16 * 5)] +stp x27, x28, [sp, #(16 * 6)] +stp x25, x26, [sp, #(16 * 7)] +stp x23, x24, [sp, #(16 * 8)] + +lsl x4, x4, #2 // src_w_step*sizeof(float) +lsl x7, x7, #2 // dilate_x_step*sizeof(float) +lsl x8, x8, #2 // dilate_y_step*sizeof(float) +lsl x23, x10, #2 // srcHStep*sizeof(float) +lsl x24, x11, #2 // dstHStep*sizeof(float) +mov x20, x12 // bias +mov x26, x13 // min +add x27, x13, #4 // max + +//dilate_y_step -> dilate_y_step - fw*dilate_x_step +mul x9, x5, x7 +sub x8, x8, x9 +mov x25, x3 // width +.macro assign_bias x0, x1, x2, x3, bv + mov \x0\().16b, \bv\().16b + mov \x1\().16b, \bv\().16b + mov \x2\().16b, \bv\().16b + mov \x3\().16b, \bv\().16b +.endm + +.macro compare_min_max x0, x1, x2, x3, xmin, xmax + fmax \x0\().4s, \x0\().4s, \xmin\().4s + fmax \x1\().4s, \x1\().4s, \xmin\().4s + fmax \x2\().4s, \x2\().4s, \xmin\().4s + fmax \x3\().4s, \x3\().4s, \xmin\().4s + fmin \x0\().4s, \x0\().4s, \xmax\().4s + fmin \x1\().4s, \x1\().4s, \xmax\().4s + fmin \x2\().4s, \x2\().4s, \xmax\().4s + fmin \x3\().4s, \x3\().4s, \xmax\().4s +.endm + +LoopDY: +//mov x23, x10 +//mov x24, x11 +mov x21, x0 +mov x22, x1 + +L16: +cmp x3, #16 +blt L8 + +mov x12, #-176 +mov x19, #256 + +L16Loop: + ld1 {v8.4s}, [x20] // load bias + assign_bias v16, v17, v18, v19, v8 + assign_bias v20, v21, v22, v23, v8 + assign_bias v24, v25, v26, v27, v8 + assign_bias v28, v29, v30, v31, v8 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L16LoopH: + mov x10, x5 + L16LoopW: + ld1 {v8.4s}, [x2], #16 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 + ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [x1], #64 + subs x10, x10, #1 + fmla v16.4s, v8.4s, v0.4s + fmla v17.4s, v8.4s, v1.4s + fmla v18.4s, v8.4s, v2.4s + fmla v19.4s, v8.4s, v3.4s + + fmla v20.4s, v8.4s, v4.4s + fmla v21.4s, v8.4s, v5.4s + fmla v22.4s, v8.4s, v6.4s + fmla v23.4s, v8.4s, v7.4s + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], x12 + + fmla v24.4s, v8.4s, v9.4s + fmla v25.4s, v8.4s, v10.4s + fmla v26.4s, v8.4s, v11.4s + fmla v27.4s, v8.4s, v12.4s + + fmla v28.4s, v8.4s, v0.4s + fmla v29.4s, v8.4s, v1.4s + fmla v30.4s, v8.4s, v2.4s + fmla v31.4s, v8.4s, v3.4s + + bne L16LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L16LoopH + ld1r {v10.4s}, [x26] // min + ld1r {v11.4s}, [x27] // max + sub x3, x3, #16 + compare_min_max v16, v17, v18, v19, v10, v11 + compare_min_max v20, v21, v22, v23, v10, v11 + compare_min_max v24, v25, v26, v27, v10, v11 + compare_min_max v28, v29, v30, v31, v10, v11 + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 + add x1, x13, x19 // 16 * pack * sizeof(float) + cmp x3, #16 + mov x2, x14 + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 + st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64 + st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], #64 + bge L16Loop + + +L8: +ld1r {v10.4s}, [x26] // min +ld1r {v11.4s}, [x27] // max +ld1 {v24.4s}, [x20] // load bias +cmp x3, #7 +ble L4 + +mov x12, #-48 +mov x19, #128 + + +L8Loop: + assign_bias v16, v17, v18, v19, v24 + assign_bias v20, v21, v22, v23, v24 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L8LoopH: + mov x10, x5 + L8LoopW: + ld1 {v8.4s}, [x2], #16 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], x12 + subs x10, x10, #1 + fmla v16.4s, v8.4s, v0.4s + fmla v17.4s, v8.4s, v1.4s + fmla v18.4s, v8.4s, v2.4s + fmla v19.4s, v8.4s, v3.4s + + fmla v20.4s, v8.4s, v4.4s + fmla v21.4s, v8.4s, v5.4s + 
fmla v22.4s, v8.4s, v6.4s + fmla v23.4s, v8.4s, v7.4s + + bne L8LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L8LoopH + + compare_min_max v16, v17, v18, v19, v10, v11 + compare_min_max v20, v21, v22, v23, v10, v11 + sub x3, x3, #8 + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 + add x1, x13, x19 // 8 * pack * sizeof(float) + mov x2, x14 + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 + + +L4: +cmp x3, #4 +ble L1 + +mov x12, #16 +mov x19, #64 + +L4Loop: + assign_bias v16, v17, v18, v19, v24 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L4LoopH: + mov x10, x5 + L4LoopW: + ld1 {v8.4s}, [x2], #16 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], x12 + subs x10, x10, #1 + fmla v16.4s, v8.4s, v0.4s + fmla v17.4s, v8.4s, v1.4s + fmla v18.4s, v8.4s, v2.4s + fmla v19.4s, v8.4s, v3.4s + + bne L4LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L4LoopH + + compare_min_max v16, v17, v18, v19, v10, v11 + sub x3, x3, #4 + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 + add x1, x13, x19 + mov x2, x14 + +L1: +cmp x3, #0 +beq End + +mov x19, #16 + +L1Loop: + ld1 {v16.4s}, [x20] // assign bias + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L1LoopH: + mov x10, x5 + L1LoopW: + ld1 {v8.4s}, [x2], #16 + ld1 {v0.4s}, [x1], #16 + subs x10, x10, #1 + fmla v16.4s, v8.4s, v0.4s + + bne L1LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L1LoopH + + subs x3, x3, #1 + fmax v16.4s, v16.4s, v10.4s + fmin v16.4s, v16.4s, v11.4s + st1 {v16.4s}, [x0], #16 + add x1, x13, x4 + mov x2, x14 + bne L1Loop + + +End: + +//mov x10, x23 +//mov x11, x24 +//mov x0, x21 +//mov x1, x22 +mov x3, x25 + +subs x15, x15, #1 +add x0, x21, x24 +add x1, x22, x23 +bne LoopDY + +ldp x23, x24, [sp, #(16 * 8)] +ldp x25, x26, [sp, #(16 * 7)] +ldp x27, x28, [sp, #(16 * 6)] +ldp x19, x20, [sp, #(16 * 5)] +ldp x21, x22, [sp, #(16 * 4)] +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 9) +ret +//MNNConvRunForLineDepthwise End + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_16x4_Unit.S b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_16x4_Unit.S index d31d57ad7..b2cf3b215 100644 --- a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_16x4_Unit.S +++ b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_16x4_Unit.S @@ -118,8 +118,7 @@ stp x23, x24, [sp, #(16 * 6)] ldr x19, [x15, #56] // fp32 min max ldr x21, [x15, #64] // blockNum ldr x23, [x15, #80] // extraScale -mul x21, x21, x3 // blockNum * src_depth_quad_perblock -lsl x21, x21, #6 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int8_t) +lsl x21, x3, #6 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int8_t) add x20, x19, #4 Start: diff --git a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S index 339bbd37e..c5203dde4 100644 --- a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S +++ b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S @@ -125,9 +125,7 @@ stp x27, x28, [sp, #(16 * 6)] stp x25, x26, [sp, #(16 * 7)] stp x23, x24, [sp, #(16 * 8)] -ldr x27, [x6, #64] // blockNum -mul x27, x27, x3 // blockNum * src_depth_quad_perblock -lsl x15, x27, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT +lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT ldr w28, [x6, #24] // useInt8 ldr x25, [x6, #40] // xKernelSum diff --git a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S index 0225e0b4e..621f7a84b 100644 --- 
a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S +++ b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S @@ -138,9 +138,7 @@ ldr w23, [x6, #24] ldr x27, [x6, #40] // srcKernelSum ldr x28, [x6, #48] // weightQuanBias -ldr x22, [x6, #64] // blockNum -mul x22, x22, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block -lsl x15, x22, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 = src_depth_quad << 6 +lsl x15, x3, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 = src_depth_quad << 6 ldr x10, [x6, #80] // extra scale mov x21, #4 // sizeof(int8_t) * pack diff --git a/source/backend/cpu/arm/arm64/MNNPackC4Int8ForMatMulA_ARM86.S b/source/backend/cpu/arm/arm64/MNNPackC4Int8ForMatMulA_ARM86.S index 803166f17..dde601bfc 100644 --- a/source/backend/cpu/arm/arm64/MNNPackC4Int8ForMatMulA_ARM86.S +++ b/source/backend/cpu/arm/arm64/MNNPackC4Int8ForMatMulA_ARM86.S @@ -55,8 +55,7 @@ mov x9, x6 // blockNum cbnz x12, TILE10_BLOCK_NUM ld1 {v5.4s, v6.4s}, [x2], #32 -ld1 {v7.d}[0], [x2] -sub x2, x2, #32 +ld1 {v7.d}[0], [x2], #8 TILE10_BLOCK_NUM: cbz x9, TILE10_END @@ -315,4 +314,4 @@ ldp d10, d11, [sp, #(16 * 2)] ldp d12, d13, [sp, #(16 * 1)] ldp d14, d15, [sp], #(16 * 4) ret -#endif \ No newline at end of file +#endif diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S index 90ad5673b..01d574fa8 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S @@ -113,10 +113,8 @@ stp x21, x22, [sp, #(16 * 5)] stp x23, x24, [sp, #(16 * 6)] ldr x19, [x15, #56] // fp32 min max -ldr x21, [x15, #64] // blockNum ldr x23, [x15, #80] // extraScale -mul x21, x21, x3 // blockNum * src_depth_quad_perblock -lsl x21, x21, #5 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int4_t) +lsl x21, x3, #5 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int4_t) add x20, x19, #4 Start: diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S index 49b9567cc..4e94c454d 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S @@ -124,9 +124,7 @@ stp x27, x28, [sp, #(16 * 6)] stp x25, x26, [sp, #(16 * 7)] stp x23, x24, [sp, #(16 * 8)] -ldr x27, [x6, #64] // blockNum -mul x27, x27, x3 // blockNum * src_depth_quad_perblock -lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t) +lsl x15, x3, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t) ldr x25, [x6, #40] // xKernelSum ldr x26, [x6, #48] // weightQuantBias diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S index 891196103..d6b2c53e2 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S @@ -116,9 +116,7 @@ stp x27, x28, [sp, #(16 * 8)] ldr x27, [x6, #40] // srcKernelSum ldr x28, [x6, #48] // weightQuanBias -ldr x22, [x6, #64] // blockNum -mul x22, x22, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block -lsl x15, x22, #5 // x15 = 
src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 * (sizeof(int4)) = src_depth_quad << 4 +lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 * (sizeof(int4)) = src_depth_quad << 4 mov x21, #16 // sizeof(float) * pack ldr x14, [x6, #56] // float32 maxmin ptr diff --git a/source/backend/cpu/compute/CommonOptFunction.cpp b/source/backend/cpu/compute/CommonOptFunction.cpp index df1b70970..f5f1af06a 100644 --- a/source/backend/cpu/compute/CommonOptFunction.cpp +++ b/source/backend/cpu/compute/CommonOptFunction.cpp @@ -3028,203 +3028,6 @@ void MNNSigmoidLowp(float* dst, const float* src, size_t dataSize) { #endif } -void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* parameters) { - int unit = ow / 2; - MNN_ASSERT(cacheLineSize >= 1); - auto biasF = Vec4::load(bias); - auto minF = Vec4(parameters[2]); - auto maxF = Vec4(parameters[3]); - for (int x = 0; x < unit; ++x) { - auto offset = 4 * 4 * x; - int i = 0; - Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); - Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); - Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); - Vec4 m3 = Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3); - - for (i = 1; i < cacheLineSize; ++i) { - m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); - m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); - m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); - m3 = m3 + Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3); - } - - auto o0 = m0 + m1 + m2 + biasF; - auto o1 = m1 - m2 + m3 + biasF; - o0 = Vec4::min(maxF, o0); - o1 = Vec4::min(maxF, o1); - o0 = Vec4::max(minF, o0); - o1 = Vec4::max(minF, o1); - Vec4::save(dest + 8 * x + 0 * 4, o0); - Vec4::save(dest + 8 * x + 1 * 4, o1); - } - if (unit * 2 < ow) { - auto offset = 4 * 4 * unit; - int i = 0; - Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); - Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); - Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); - - for (i = 1; i < cacheLineSize; ++i) { - m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); - m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); - m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); - } - auto o0 = m0 + m1 + m2 + biasF; - o0 = Vec4::min(maxF, o0); - o0 = Vec4::max(minF, o0); - Vec4::save(dest + 8 * unit + 0 * 4, o0); - } -} -extern "C" { -void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit); -} - -void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu) { - for (int x = 0; x < su; ++x) { - auto dstX = dest + 4 * 4 * x; - auto sx = x * 2 - (int)pad; - auto ex = sx + 4; - - auto clampSx = std::max(sx, 0); - auto clampEx = std::min(ex, (int)iw); - - Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = clampSx; i < clampEx; ++i) { - v[i - sx] = Vec4::load(source + 4 * i); - } - auto m0 = v[0] - v[2]; - auto m1 = v[1] + v[2]; - auto m2 = v[2] - v[1]; - auto m3 = v[3] - v[1]; - 
- Vec4::save(dstX + 4 * 0, m0); - Vec4::save(dstX + 4 * 1, m1); - Vec4::save(dstX + 4 * 2, m2); - Vec4::save(dstX + 4 * 3, m3); - } - MNNConvDwF23SourceTransUnit(source + 4 * (su * 2 - pad), dest + 4 * 4 * su, eu - su); - - for (int x = eu; x < unit; ++x) { - auto dstX = dest + 4 * 4 * x; - auto sx = x * 2 - (int)pad; - auto ex = sx + 4; - - auto clampSx = std::max(sx, 0); - auto clampEx = std::min(ex, (int)iw); - - Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = clampSx; i < clampEx; ++i) { - v[i - sx] = Vec4::load(source + 4 * i); - } - auto m0 = v[0] - v[2]; - auto m1 = v[1] + v[2]; - auto m2 = v[2] - v[1]; - auto m3 = v[3] - v[1]; - - Vec4::save(dstX + 4 * 0, m0); - Vec4::save(dstX + 4 * 1, m1); - Vec4::save(dstX + 4 * 2, m2); - Vec4::save(dstX + 4 * 3, m3); - } -} - -#ifndef MNN_USE_NEON -void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* parameters) { - int unit = ow / 2; - auto w00 = Vec4::load(weigth + 0 * 16 + 4 * 0); - auto w01 = Vec4::load(weigth + 0 * 16 + 4 * 1); - auto w02 = Vec4::load(weigth + 0 * 16 + 4 * 2); - auto w03 = Vec4::load(weigth + 0 * 16 + 4 * 3); - auto w10 = Vec4::load(weigth + 1 * 16 + 4 * 0); - auto w11 = Vec4::load(weigth + 1 * 16 + 4 * 1); - auto w12 = Vec4::load(weigth + 1 * 16 + 4 * 2); - auto w13 = Vec4::load(weigth + 1 * 16 + 4 * 3); - auto w20 = Vec4::load(weigth + 2 * 16 + 4 * 0); - auto w21 = Vec4::load(weigth + 2 * 16 + 4 * 1); - auto w22 = Vec4::load(weigth + 2 * 16 + 4 * 2); - auto w23 = Vec4::load(weigth + 2 * 16 + 4 * 3); - auto biasF = Vec4::load(bias); - auto minF = Vec4(parameters[2]); - auto maxF = Vec4(parameters[3]); - for (int x = 0; x < unit; ++x) { - auto offset = 4 * 4 * x; - int i = 0; - Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0); - Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1); - Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2); - Vec4 m3 = w03 * Vec4::load(cacheLine[0] + offset + 4 * 3); - - m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0); - m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1); - m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2); - m3 = m3 + w13 * Vec4::load(cacheLine[1] + offset + 4 * 3); - - m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0); - m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1); - m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2); - m3 = m3 + w23 * Vec4::load(cacheLine[2] + offset + 4 * 3); - - auto o0 = m0 + m1 + m2 + biasF; - auto o1 = m1 - m2 + m3 + biasF; - o0 = Vec4::min(maxF, o0); - o1 = Vec4::min(maxF, o1); - o0 = Vec4::max(minF, o0); - o1 = Vec4::max(minF, o1); - Vec4::save(dest + 8 * x + 0 * 4, o0); - Vec4::save(dest + 8 * x + 1 * 4, o1); - } - if (unit * 2 < ow) { - auto offset = 4 * 4 * unit; - Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0); - Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1); - Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2); - - m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0); - m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1); - m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2); - - m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0); - m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1); - m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2); - auto o0 = m0 + m1 + m2 + biasF; - o0 = Vec4::min(maxF, o0); - o0 = Vec4::max(minF, o0); - Vec4::save(dest + 8 * unit + 0 * 4, o0); - } -} -void MNNConvDwF23SourceTransUnit(const float *source, 
float *dest, size_t unit) { - if (unit <= 0) { - return; - } - Vec4 v0 = Vec4::load(source + 4 * 0); - Vec4 v1 = Vec4::load(source + 4 * 1); - Vec4 v2; - Vec4 v3; - source += 8; - - for (int x = 0; x < unit; ++x) { - v2 = Vec4::load(source + 0 * 4); - v3 = Vec4::load(source + 1 * 4); - auto m0 = v0 - v2; - auto m1 = v1 + v2; - auto m2 = v2 - v1; - auto m3 = v3 - v1; - - Vec4::save(dest + 4 * 0, m0); - Vec4::save(dest + 4 * 1, m1); - Vec4::save(dest + 4 * 2, m2); - Vec4::save(dest + 4 * 3, m3); - - source += 8; - dest += 16; - - v0 = v2; - v1 = v3; - } -} -#endif - static void _MNNAdjustOptimalSparseKernel(int& sparseBlockOC, MNN::CoreFunctions::MNNPackedSparseMatMul& packedSparseMatMul) { if(sparseBlockOC == 4) { packedSparseMatMul = MNNPackedSparseMatMulEpx4; @@ -3365,10 +3168,6 @@ void MNNCoreFunctionInit() { gCoreFunction->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnit; gCoreFunction->MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwise; - gCoreFunction->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWise; - gCoreFunction->MNNSourceTransformCommonF23 = MNNSourceTransformCommonF23; - gCoreFunction->MNNConvDwF23MulTransUnit = MNNConvDwF23MulTransUnit; - gCoreFunction->MNNMultiAndDestTransformCommon23 = MNNMultiAndDestTransformCommon23; gCoreFunction->MNNMatrixAdd = MNNMatrixAdd; gCoreFunction->MNNMatrixSub = MNNMatrixSub; gCoreFunction->MNNStrassenMergeCFunction = MNNStrassenMergeCFunction; @@ -3390,6 +3189,9 @@ void MNNCoreFunctionInit() { gCoreFunction->chooseWinoDestUnrollTransform = WinogradFunction::chooseWinoDestUnrollTransform; gCoreFunction->MNNDeconvRunForLineDepthwise = MNNDeconvRunForLineDepthwise; gCoreFunction->MNNDeconvRunForUnitDepthWise = MNNDeconvRunForUnitDepthWise; +#ifdef MNN_USE_NEON + gCoreFunction->MNNDepthwiseConvFastKernel = MNNDepthwiseConvFastKernel; +#endif gCoreFunction->MNNSelectBinaryFunctionForFloat = CPUBinary::selectForFloat; gCoreFunction->MNNSelectUnaryFunctionForFloat = CPUUnary::selectForFloat; gCoreFunction->MNNSelectUnaryFunctionForInt8 = CPUUnary::selectForInt8; @@ -3514,4 +3316,4 @@ void MNNPackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth areaOffset, }; MNNPackInt8C2(dst, src, area, depth, offset); -} \ No newline at end of file +} diff --git a/source/backend/cpu/compute/CommonOptFunction.h b/source/backend/cpu/compute/CommonOptFunction.h index 4af1a81a8..32ebd0c54 100644 --- a/source/backend/cpu/compute/CommonOptFunction.h +++ b/source/backend/cpu/compute/CommonOptFunction.h @@ -170,9 +170,6 @@ struct MatMulParam { void MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId); void MNNCopyC4Int16WithStride(const float* sourceF, float* destF, size_t srcStride, size_t dstStride, size_t count); -void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu); -void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* postParameter); -void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow); void MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count); struct SumByAxisParams { @@ -267,15 +264,10 @@ struct CoreFunctions { void(*MNNUnpackCUnitTranspose)(float* dst, const float* src, size_t area, size_t depth, int* areaOffset); // NC4HW4's compute function - void(*MNNConvRunForUnitDepthWise)(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - 
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); void(*MNNConvRunForLineDepthwise)(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep); + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters); void(*MNNAxByClampBroadcastUnit)(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters); - void(*MNNMultiAndDestTransformCommon23)(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* post); - void(*MNNSourceTransformCommonF23)(const float *source, float *dest, int unit, int iw, int pad, int su, int eu); - void(*MNNConvDwF23MulTransUnit)(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* post); void(*MNNMatrixAdd)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, size_t bStride, size_t height); void(*MNNMatrixSub)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, @@ -309,6 +301,9 @@ struct CoreFunctions { size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); void(*MNNDeconvRunForLineDepthwise)(const float* dst, float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step); + void(*MNNDepthwiseConvFastKernel)(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, + size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) = nullptr; void(*MNNReluWithSlopeChannel)(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad); void(*MNNPoolingAvg)(const void* channelInput, int inputWidth, int inputHeight, void *channelOutput, int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth, diff --git a/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp b/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp index ae2c1a8ff..bcba4eedb 100644 --- a/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp +++ b/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp @@ -44,10 +44,14 @@ ErrorCode ConvInt8TiledExecutor::onResize(const std::vector& inputs, co return NO_ERROR; } -void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack) { +void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack, int blockNum) { auto weightDst = weight->host(); memset(weightDst, 0, weight->size()); - if (SRC_UNIT > pack) { + int kernelCountUnit = weight->shape()[1]; + int blockL = kernelCountUnit / blockNum; + int strideOutside = ROUND_UP(oc, UNIT) * SRC_UNIT * blockL; + int strideInside = weight->stride(0) / blockNum; + if (SRC_UNIT > pack) { // shape = {blockNum, UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, pack) * kernelCount, SRC_UNIT / pack) / blockNum, UNIT, SRC_UNIT}; auto icDivU = UP_DIV(ic, pack); for (int k = 0; k < kernelCount; ++k) { const auto srcK = weightSrc + k; @@ -58,31 +62,37 @@ void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightS const int ySubOutSide = yIndex / (SRC_UNIT / pack); const int ySubInSide = yIndex % (SRC_UNIT / 
pack); - auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * pack + yInSide; + int blockId = ySubOutSide / blockL; + int blockInsideId = ySubOutSide % blockL; + + auto dstY = weightDst + blockId * strideOutside + blockInsideId * weight->stride(1) + ySubInSide * pack + yInSide; const auto srcY = srcK + y * kernelCount; for (int x = 0; x < oc; ++x) { const int xOutSide = x / UNIT; const int xInSide = x % UNIT; - const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT; + const int dstIndex = xOutSide * strideInside + xInSide * SRC_UNIT; const int srcIndex = x * kernelCount * ic; dstY[dstIndex] = srcY[srcIndex]; } } } - } else { + } else { // shape = {blockNum, UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount / blockNum, UNIT, SRC_UNIT}; for (int k = 0; k < kernelCount; ++k) { auto icDivU = UP_DIV(ic, SRC_UNIT); const auto srcK = weightSrc + k; for (int y = 0; y < ic; ++y) { const int yOutSide = y / SRC_UNIT; const int yInSide = y % SRC_UNIT; + + int blockId = (yOutSide + k * icDivU) / blockL; + int blockInsideId = (yOutSide + k * icDivU) % blockL; - auto dstY = weightDst + (yOutSide + k * icDivU) * weight->stride(1) + yInSide; + auto dstY = weightDst + blockId * strideOutside + blockInsideId * weight->stride(1) + yInSide; const auto srcY = srcK + y * kernelCount; for (int x = 0; x < oc; ++x) { const int xOutSide = x / UNIT; const int xInSide = x % UNIT; - const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT; + const int dstIndex = xOutSide * strideInside + xInSide * SRC_UNIT; const int srcIndex = x * kernelCount * ic; dstY[dstIndex] = srcY[srcIndex]; } @@ -93,7 +103,8 @@ void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightS static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common, const std::shared_ptr& weightOrigin, - std::shared_ptr& weight) { + std::shared_ptr& weight, int blockNum) { + MNN_ASSERT(blockNum > 0); auto core = static_cast(bn)->int8Functions(); auto gcore = static_cast(bn)->functions(); int UNIT, SRC_UNIT, DST_XUNIT; @@ -119,11 +130,11 @@ static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common, MNN_ERROR("Memory not enough"); return false; } - ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host(), SRC_UNIT, UNIT, ic, oc, kernelCount, pack); + ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host(), SRC_UNIT, UNIT, ic, oc, kernelCount, pack, blockNum); return true; } -static void GetResourceInt8(std::shared_ptr resource, std::shared_ptr quantCommon, const Convolution2D* conv2d, Backend* backend) { +static void GetResourceInt8(std::shared_ptr resource, std::shared_ptr quantCommon, const Convolution2D* conv2d, Backend* backend, int32_t* blocknumPtr) { // common parameters int outputCount = conv2d->common()->outputCount(); auto core = static_cast(backend)->functions(); @@ -135,6 +146,7 @@ static void GetResourceInt8(std::shared_ptr resour dequantCnt /= 2; } int blockNum = dequantCnt / outputCount; + blocknumPtr[0] = blockNum; int scaleSize = blockNum * ocUp4; // pack size. 
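
The new blockNum parameter changes the quantized weight layout from one contiguous run per output tile to a block-major arrangement, {blockNum, UP_DIV(oc, UNIT), kernelCountUnit / blockNum, UNIT, SRC_UNIT}, with strideOutside covering one whole block and strideInside one output tile inside a block. This is also why the GEMM kernels earlier in this diff dropped the multiply-by-blockNum from their weight stride: the caller now advances the weight pointer between blocks itself. The helper below spells out the index arithmetic of that assumed layout; it is illustrative, not MNN code.

    #include <cstddef>

    // Offset of element (blockId, hIdx, lIdxInBlock, u, s) in the block-major
    // weight layout {blockNum, hU, blockL, UNIT, SRC_UNIT}, where
    // hU = UP_DIV(oc, UNIT) and blockL = kernelCountUnit / blockNum.
    static size_t WeightOffset(size_t blockId, size_t hIdx, size_t lIdxInBlock,
                               size_t u, size_t s,
                               size_t hU, size_t blockL, size_t UNIT, size_t SRC_UNIT) {
        const size_t strideOutside = hU * blockL * UNIT * SRC_UNIT; // one whole block
        const size_t strideInside  = blockL * UNIT * SRC_UNIT;      // one oc tile in a block
        return blockId * strideOutside + hIdx * strideInside +
               lIdxInBlock * UNIT * SRC_UNIT + u * SRC_UNIT + s;
    }
    // A GEMM kernel therefore only needs the per-block depth (blockL) for its own
    // stride; stepping from block k to block k+1 is a pointer bump of strideOutside,
    // which matches the "k * blockL * weight_step_Y * UP_DIV(output->channel(), UNIT__)"
    // offset used at the call site later in this diff.
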
int blockSize = LSize / blockNum; int originOffset = 0; @@ -244,7 +256,9 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O auto gcore = static_cast(backend)->functions(); mResourceInt8.reset(new CPUConvolution::ResourceInt8); mResourceInt8->mDynamicQuant = true; - GetResourceInt8(mResourceInt8, quanCommon, convOp, backend); + int blockNum = 1; + GetResourceInt8(mResourceInt8, quanCommon, convOp, backend, &blockNum); + mBlockNum = blockNum; // dynamic quant int UNIT, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); @@ -285,10 +299,15 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O // Pack two int4-weight to one int8-weight. int cnt = lP * hP / 4; int L = lU * lP; + int blockL = lU / blockNum; + int stride0 = (lP * hP) * hU * blockL; + int stride1 = (lP * hP) * blockL; for (int i = 0; i < hU; ++i) { for (int j = 0; j < lU; ++j) { + int blockId = j / blockL; + int blockkInsideId = j % blockL; for (int k = 0; k < cnt; ++k) { - int dstIndx0 = (i * lU * lP * hP + j * lP * hP) / 2 + (2 * k); + int dstIndx0 = (blockId * stride0 + i * stride1 + blockkInsideId * lP * hP) / 2 + (2 * k); int hpId0 = (2 * k + 1) / lP; int lpId0 = (2 * k) % lP; @@ -322,7 +341,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O tmpWeight[2 * i + 1] = s1; } std::shared_ptr srcWeight(Tensor::create({weightLength * 2}, (void*)tmpWeight.data())); - mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8); + mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8, blockNum); if(!mValid) { return; } @@ -349,7 +368,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O mResourceInt8->mWeightInt8 = weightLow; } else { std::shared_ptr srcWeight(Tensor::create({weightLength}, (void*)quanCommon->weight.get())); - mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8); + mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8, blockNum); if(!mValid) { return; } @@ -429,7 +448,7 @@ static void _computeAlphaScale(Backend* backend, const Convolution2D* conv2d, st DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr res) : ConvInt8TiledExecutor(backend, op, res) { std::shared_ptr weightOrigin = mResourceInt8->mWeightInt8; auto convOp = op->main_as_Convolution2D(); - mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8); + mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8, mBlockNum); if(!mValid) { return; } @@ -559,7 +578,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector& input mDivides.resize(threads+1); mDivides[0] = 0; - static_cast(backend()->getRuntime())->computeDivideSizes(totalWork, mDivides.data() + 1); + static_cast(backend())->computeDivideSizes(totalWork, mDivides.data() + 1); for (int i = 0; i < mDivides.size(); ++i) { mDivides[i] *= part; } @@ -572,7 +591,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector& input mThreadNums = ALIMIN(threads, mTileCount); mDivides.resize(threads+1); mDivides[0] = 0; - static_cast(backend()->getRuntime())->computeDivideSizes(mTileCount, mDivides.data() + 1); + static_cast(backend())->computeDivideSizes(mTileCount, mDivides.data() + 1); } int ocUp4 = ROUND_UP(outC, gcore->pack); // int alphaSize = 
mResource->mDequantize.mScaleBias->size() / (sizeof(float) * 2); @@ -663,6 +682,9 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu auto inputDataPtr = input->host(); auto im2colPtr = mTempIm2ColBuffer->host(); + if (SRC_UNIT > PackUnit) { + memset(im2colPtr, 0, mTempIm2ColBuffer->size()); + } const auto weightDataPtr = mResourceInt8->mWeightInt8->host(); auto srcKernelSumPtr = mTempSrcSum.data(); auto weightDequantBias = mResourceInt8->mOriginScale->host() + alphaSize * 4; @@ -736,7 +758,6 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu dequantscale = range / 255.0f; zeropoint = roundf(-minVal * 255.f / range) - 128.0f; } - std::vectorqsVec(PackUnit, quantscale); auto sizeDiv = UP_DIV(inputsize, PackUnit); int inputPlane = input->batch() * mIm2ColParamter.iw * mIm2ColParamter.ih; if (gcore->bytes == 2 && gcore->pack == 8 && inputPlane > 1) { // C8->C4 @@ -867,7 +888,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu const auto biasFloatTid = reinterpret_cast(biasPtr + ocIndex * 4); const auto scaleFloatTid = reinterpret_cast(scalePtr + ocIndex * 4); const auto weightDequanBiasTid = reinterpret_cast(weightDequantBias + ocIndex * 4); - const auto weightPtrTid = weightDataPtr + static_cast(ocIndex * kernelCountUnitDouble * SRC_UNIT * weightBytes); + const auto weightPtrTid = weightDataPtr + static_cast(ocIndex * blockL * SRC_UNIT * weightBytes); if (mBlockNum == 1) { quanParam.biasFloat = biasFloatTid; quanParam.scale = scaleFloatTid; @@ -941,7 +962,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu quanParam.weightQuanBias = weightDequanBiasTid + k * ocUp4; quanParam.scale = (float*)(scaleFloatTid + k * ocUp4); - mGemmKernel(outputInTilePtr, colAddrTemp + k * blockL * src_step_Y, weightPtrTid + k * blockL * weight_step_Y, blockL, dstZStep * dstBytes, ocDivThread, &quanParam, step); + mGemmKernel(outputInTilePtr, colAddrTemp + k * blockL * src_step_Y, weightPtrTid + k * blockL * weight_step_Y * UP_DIV(output->channel(), UNIT__), blockL, dstZStep * dstBytes, ocDivThread, &quanParam, step); } ptrX += (step * mBlockNum); realDstCount-=step; diff --git a/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp b/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp index bebeaa5c4..6c46b9161 100644 --- a/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp +++ b/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp @@ -24,7 +24,7 @@ class ConvInt8TiledExecutor : public CPUConvolution { virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; virtual void getPackParameter(int* Unit, int* SrcUnit, int* DestUnit, const CoreInt8Functions* core) = 0; - static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack); + static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack, int blockNum = 1); protected: ConvolutionCommon::Im2ColParameter mIm2ColParamter; @@ -74,7 +74,7 @@ class DenseConvInt8TiledExecutor : public ConvInt8TiledExecutor { std::vector mDivides; int mThreadNums; - int mBlockNum; + int mBlockNum = 1; int mOcPerThread; bool mSplitByOc; bool mUseBatchQuan; diff --git a/source/backend/cpu/compute/ConvOpt.cpp b/source/backend/cpu/compute/ConvOpt.cpp index 5f7545c2c..3f209e059 100644 --- a/source/backend/cpu/compute/ConvOpt.cpp +++ 
b/source/backend/cpu/compute/ConvOpt.cpp @@ -39,14 +39,17 @@ void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep) { + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) { int dx, fx, fy; + auto biasValue = Vec4::load(bias); + auto minF = Vec4(parameters[0]); + auto maxF = Vec4(parameters[1]); for (int y = 0; y < height; ++y) { auto srcY = src + y * srcHStep; auto dstY = dst + y * dstHStep; for (dx = 0; dx < width; ++dx) { float* dst_x = dstY + dx * 4; - Vec4 dstValue(0.0f); + auto dstValue = biasValue; const float* src_z = srcY + src_w_setup * dx; const float* weight_z = weight; for (fy = 0; fy < fh; ++fy) { @@ -58,29 +61,13 @@ void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weigh dstValue = dstValue + Vec4::load(src_x) * Vec4::load(weight_x); } } + dstValue = Vec4::min(dstValue, maxF); + dstValue = Vec4::max(dstValue, minF); Vec4::save(dst_x, dstValue); } } } -void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { - int fx, fy; - Vec4 dstValue(0.0f); - const float* src_z = src; - const float* weight_z = weight; - for (fy = 0; fy < fh; ++fy) { - const float* src_y = src_z + fy * dilateY_step; - const float* weight_y = weight_z + fy * weight_y_step; - for (fx = 0; fx < fw; ++fx) { - const float* weight_x = weight_y + 4 * fx; - const float* src_x = src_y + fx * dilateX_step; - dstValue = dstValue + Vec4::load(src_x) * Vec4::load(weight_x); - } - } - Vec4::save(dst, dstValue); -} - void MNNConvRunForUnitint8_t(float* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t src_depth_step, size_t fw, size_t fh, size_t weight_y_step, size_t weight_z_step, size_t dilateX_step, size_t dilateY_step, float* alpha) { diff --git a/source/backend/cpu/compute/ConvOpt.h b/source/backend/cpu/compute/ConvOpt.h index 3d727c98e..bdb96666f 100644 --- a/source/backend/cpu/compute/ConvOpt.h +++ b/source/backend/cpu/compute/ConvOpt.h @@ -16,17 +16,19 @@ extern "C" { #endif -void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep); + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters); void MNNDeconvRunForUnitDepthWise(const float* dst, float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); void MNNDeconvRunForLineDepthwise(const float* dst, float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step); +void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, + size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters); + void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, 
size_t aStride, size_t bStride, size_t height); void MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, diff --git a/source/backend/cpu/compute/Convolution1x1Strassen.cpp b/source/backend/cpu/compute/Convolution1x1Strassen.cpp index 3ed5c0c6e..455e9cb6c 100644 --- a/source/backend/cpu/compute/Convolution1x1Strassen.cpp +++ b/source/backend/cpu/compute/Convolution1x1Strassen.cpp @@ -133,11 +133,10 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, } #endif mWeightBytes = static_cast(dequantBits) / 8.0f; - auto rt = static_cast(backend()->getRuntime()); if (matrixSizeE > CONVOLUTION_TILED_NUMBER * 8 * numberThread && matrixSizeE > ocC4) { std::vector divides(numberThread+1); divides[0] = 0; - rt->computeDivideSizes(matrixSizeE, divides.data()+1); + static_cast(backend())->computeDivideSizes(matrixSizeE, divides.data()+1); mUnits.resize(numberThread); for (int i = 0; i < numberThread; ++i) { int planeStart = divides[i]; @@ -177,7 +176,7 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, auto ocDiv = UP_DIV(ocC4, hDiv); std::vector divides(numberThread+1); divides[0] = 0; - rt->computeDivideSizes(ocDiv, divides.data()+1); + static_cast(backend())->computeDivideSizes(ocDiv, divides.data()+1); mUnits.resize(numberThread); for (int i = 0; i < numberThread; ++i) { int ocStart = divides[i] * hDiv; diff --git a/source/backend/cpu/compute/ConvolutionDepthwise3x3.cpp b/source/backend/cpu/compute/ConvolutionDepthwise3x3.cpp deleted file mode 100644 index 46fc68048..000000000 --- a/source/backend/cpu/compute/ConvolutionDepthwise3x3.cpp +++ /dev/null @@ -1,221 +0,0 @@ -// -// ConvolutionDepthwise3x3.cpp -// MNN -// -// Created by MNN on 2019/4/3. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp" -#include "backend/cpu/CPUBackend.hpp" -#include "CommonOptFunction.h" -#include "core/Concurrency.h" -#include "core/Macro.h" - -namespace MNN { -ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(std::shared_ptr resource, const Convolution2DCommon* common, Backend* b) : CPUConvolution(common, b) { - mResource = resource; -} - -ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b, - const float *originWeight, size_t originWeightSize, const float *bias, - size_t biasSize) - : CPUConvolution(common, b) { - MNN_ASSERT(3 == common->kernelX() && 3 == common->kernelY()); - MNN_ASSERT(1 == common->strideX() && 1 == common->strideY()); - MNN_ASSERT(1 == common->dilateX() && 1 == common->dilateY()); - mResource.reset(new Resource); - mResource->backend = b; - auto core = static_cast(b)->functions(); - auto pack = core->pack; - auto bytes = core->bytes; - auto success = mResource->copyBiasAlign(bias, biasSize); - if (!success) { - mValid = false; - return; - } - auto channel = common->outputCount(); - auto channelC4 = UP_DIV(channel, pack); - auto unitSize = channelC4 * pack * 3 * 4; - mResource->mWeight.reset(Tensor::createDevice({unitSize * bytes})); - mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC); - if (!mValid) { - return; - } - AutoStorage tempWeightStorge; - auto weightHost = mResource->mWeight->host(); - if (bytes < 4) { - // Lowp need extra float storage for transform - tempWeightStorge.reset(unitSize); - if (nullptr == tempWeightStorge.get()) { - mValid = false; - return; - } - weightHost = tempWeightStorge.get(); - } - ::memset(weightHost, 0, unitSize * sizeof(float)); - /* 
1D-Winograd F(2,3) and tiling */ - for (int c = 0; c < channel; ++c) { - auto cIndex = c / pack; - auto cRemain = c % pack; - auto weightDstZ = weightHost + cIndex * pack * 4 * 3 + cRemain; - auto weightSrcZ = originWeight + c * 9; - for (int y = 0; y < 3; ++y) { - auto k0 = weightSrcZ[3 * y + 0]; - auto k1 = weightSrcZ[3 * y + 1]; - auto k2 = weightSrcZ[3 * y + 2]; - - auto m0 = k0; - auto m1 = 0.5f * (k0 + k1 + k2); - auto m2 = 0.5f * (k0 - k1 + k2); - auto m3 = k2; - - weightDstZ[(y * 4 + 0) * pack] = m0; - weightDstZ[(y * 4 + 1) * pack] = m1; - weightDstZ[(y * 4 + 2) * pack] = m2; - weightDstZ[(y * 4 + 3) * pack] = m3; - } - } - if (bytes < 4) { - core->MNNFp32ToLowp(weightHost, mResource->mWeight->host(), unitSize); - } -} - -ConvolutionDepthwise3x3::~ConvolutionDepthwise3x3() { - // Do nothing -} - -bool ConvolutionDepthwise3x3::onClone(Backend* bn, const Op* op, Execution** dst) { - if (nullptr == dst) { - return true; - } - auto dstExe = new ConvolutionDepthwise3x3(mResource, op->main_as_Convolution2D()->common(), bn); - *dst = dstExe; - return true; -} - -ErrorCode ConvolutionDepthwise3x3::onResize(const std::vector &inputs, const std::vector &outputs) { - CPUConvolution::onResize(inputs, outputs); - const int numberThread = ((CPUBackend *)backend())->threadNumber(); - auto output = outputs[0]; - auto owUnit = UP_DIV(output->width(), 2); - auto core = static_cast(backend())->functions(); - // 3 cacheline - mCacheLine.reset(Tensor::createDevice({numberThread, 3 * 4 * owUnit * core->pack * core->bytes})); - auto valid = backend()->onAcquireBuffer(mCacheLine.get(), Backend::DYNAMIC); - if (!valid) { - return OUT_OF_MEMORY; - } - backend()->onReleaseBuffer(mCacheLine.get(), Backend::DYNAMIC); - auto iw = inputs[0]->width(); - mSourceStartX = UP_DIV(mPadX, 2); - mSourceEndX = std::max((iw + mPadX - 4) / 2, mSourceStartX); - mPostParameters = getPostParameters(); - // auto rate = (float)(mSourceEndX-mSourceStartX) / (float)owUnit; - // FUNC_PRINT_ALL(rate, f); - - int channelC4 = UP_DIV(inputs[0]->channel(), core->pack); - int batch = inputs[0]->batch(); - auto total = channelC4 * batch; - - mDivides.resize(numberThread+1); - mDivides[0] = 0; - static_cast(backend()->getRuntime())->computeDivideSizes(total, mDivides.data() + 1); - - return NO_ERROR; -} - -ErrorCode ConvolutionDepthwise3x3::onExecute(const std::vector &inputs, - const std::vector &outputs) { - auto input = inputs[0]; - auto output = outputs[0]; - auto core = static_cast(backend())->functions(); - - int channelC4 = UP_DIV(input->channel(), core->pack); - int initSize = std::min(input->height(), 2); - int batch = input->batch(); - int ow = output->width(); - int oh = output->height(); - int owUnit = UP_DIV(ow, 2); - - auto iw = input->width(); - auto ih = input->height(); - auto kernelOrigin = mResource->mWeight->host(); - - /*oy-mPadY>=0*/ - int middelYStart = mPadY; - - /*oy-mPadY+3-1 < ih*/ - int middelYEnd = std::max(ih - 2 + mPadY, middelYStart); - - int threadNumber = ((CPUBackend *)backend())->threadNumber(); - auto maxKernelH = std::min(mPadY + ih, 3); - auto inputOrigin = input->host(); - auto outputOrigin = output->host(); - MNN_CONCURRENCY_BEGIN(tId, threadNumber) { - auto cacheLineStart = mCacheLine->host() + tId * mCacheLine->stride(0); - for (int index = mDivides[tId]; index < mDivides[tId+1]; ++index) { - int z = index / batch; - auto biasPtr = (const float*)(mResource->mBias->host() + core->bytes * core->pack * z); - auto inputZ = inputOrigin + core->pack * index * iw * ih * core->bytes; - auto outputZ = 
outputOrigin + core->pack * index * ow * oh * core->bytes; - auto kernelZ = kernelOrigin + z * core->pack * core->bytes * 4 * 3; - auto cacheLine0 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 0; - auto cacheLine1 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 1; - auto cacheLine2 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 2; - - float *cacheLine[3] = {(float*)cacheLine0, (float*)cacheLine1, (float*)cacheLine2}; - - // Init - for (int i = 0; i < initSize; ++i) { - core->MNNSourceTransformCommonF23((const float*)(inputZ + i * iw * core->bytes * core->pack), cacheLine[i], owUnit, iw, mPadX, mSourceStartX, - mSourceEndX); - } - - // Compute Top - for (int y = 0; y < middelYStart; ++y) { - auto outputY = outputZ + y * core->bytes * core->pack * ow; - int cacheLineSize = y - mPadY + maxKernelH; - if (cacheLineSize <= 0) { - ::memset(outputY, 0, core->bytes * ow * core->pack); - core->MNNAxByClampBroadcastUnit((float*)outputY, (float*)outputY, biasPtr, ow, 0, 0, 1, mPostParameters.data()); - continue; - } - auto kernelPtr = kernelZ + (maxKernelH - cacheLineSize) * 4 * core->pack * core->bytes; - cacheLineSize = std::min(cacheLineSize, ih); - core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelPtr, (float*)outputY, cacheLineSize, ow, biasPtr, mPostParameters.data()); - } - - // Compute Mid - for (int y = middelYStart; y < middelYEnd; ++y) { - auto outputY = outputZ + y * core->bytes * core->pack * ow; - auto iy = y - mPadY + 2; - core->MNNSourceTransformCommonF23((float*)(inputZ + core->bytes * core->pack * iy * iw), cacheLine[2], owUnit, iw, mPadX, mSourceStartX, - mSourceEndX); - // FUNC_PRINT(ow); - core->MNNConvDwF23MulTransUnit(cacheLine, (float*)kernelZ, (float*)outputY, ow, biasPtr, mPostParameters.data()); - - auto temp = cacheLine[0]; - cacheLine[0] = cacheLine[1]; - cacheLine[1] = cacheLine[2]; - cacheLine[2] = temp; - } - - // Compute Bottom - for (int y = middelYEnd; y < oh; ++y) { - auto outputY = outputZ + y * core->bytes * core->pack * ow; - int cacheLineSize = (ih - y + mPadY); - if (cacheLineSize <= 0) { - ::memset(outputY, 0, ow * core->bytes * core->pack); - core->MNNAxByClampBroadcastUnit((float*)outputY, (float*)outputY, biasPtr, ow, 0, 0, 1, mPostParameters.data()); - continue; - } - core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelZ, (float*)outputY, cacheLineSize, ow, biasPtr, mPostParameters.data()); - cacheLine[0] = cacheLine[1]; - cacheLine[1] = cacheLine[2]; - } - } - } MNN_CONCURRENCY_END(); - return NO_ERROR; -} -} // namespace MNN diff --git a/source/backend/cpu/compute/ConvolutionDepthwise3x3.hpp b/source/backend/cpu/compute/ConvolutionDepthwise3x3.hpp deleted file mode 100644 index 4ff4d4ef0..000000000 --- a/source/backend/cpu/compute/ConvolutionDepthwise3x3.hpp +++ /dev/null @@ -1,37 +0,0 @@ -// -// ConvolutionDepthwise3x3.hpp -// MNN -// -// Created by MNN on 2019/4/3. 
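The ConvolutionDepthwise3x3 implementation removed above evaluated the 3x3 depthwise kernel with a 1D Winograd F(2,3) scheme: each 3-tap kernel row is expanded into 4 transformed taps using G = {{1,0,0},{1/2,1/2,1/2},{1/2,-1/2,1/2},{0,0,1}}, which is exactly the m0..m3 computation in the deleted constructor. A minimal standalone sketch of that row transform (hypothetical helper name, plain floats instead of the packed channel layout):

#include <array>

// F(2,3) weight transform for one 3-tap kernel row: m = G * k, with
// G = { {1, 0, 0}, {0.5, 0.5, 0.5}, {0.5, -0.5, 0.5}, {0, 0, 1} }.
// Reproduces the m0..m3 values the removed constructor wrote into the
// packed weight tensor.
static std::array<float, 4> WinogradF23WeightRow(float k0, float k1, float k2) {
    return { k0,
             0.5f * (k0 + k1 + k2),
             0.5f * (k0 - k1 + k2),
             k2 };
}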
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef ConvolutionDepthwise3x3_hpp -#define ConvolutionDepthwise3x3_hpp - -#include "backend/cpu/CPUConvolution.hpp" - -namespace MNN { -class ConvolutionDepthwise3x3 : public CPUConvolution { -public: - ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b, const float *originWeight, - size_t originWeightSize, const float *bias, size_t biasSize); - virtual ~ConvolutionDepthwise3x3(); - - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; -private: - ConvolutionDepthwise3x3(std::shared_ptr resource, const Convolution2DCommon* common, Backend* b); - - std::shared_ptr mResource; - - std::unique_ptr mCacheLine; - int mSourceStartX = 0; - int mSourceEndX = 0; - std::vector mPostParameters; - std::vector mDivides; -}; -} // namespace MNN - -#endif /* ConvolutionDepthwise3x3_hpp */ diff --git a/source/backend/cpu/compute/ConvolutionPackWinograd.cpp b/source/backend/cpu/compute/ConvolutionPackWinograd.cpp index 74b23af3e..79ffa1451 100644 --- a/source/backend/cpu/compute/ConvolutionPackWinograd.cpp +++ b/source/backend/cpu/compute/ConvolutionPackWinograd.cpp @@ -262,7 +262,7 @@ ErrorCode ConvolutionPackWinograd::onResize(const std::vector &inputs, // MNN_PRINT("ow=%d, oh=%d\n", ow, oh); std::vector divides(threadNumber+1); - static_cast( static_cast(backend())->getRuntime())->computeDivideSizes(totalCount, divides.data()+1); + static_cast(backend())->computeDivideSizes(totalCount, divides.data()+1); divides[0] = 0; auto midBuffer0Bytes = srcUnit2 * pack * bytes; bool allow_x86_bf16_winograd = true; @@ -542,7 +542,7 @@ ErrorCode ConvolutionPackWinograd::onResize(const std::vector &inputs, } }; std::vector postDivides(threadNumber+1); - static_cast( static_cast(backend())->getRuntime())->computeDivideSizes(dc_4, postDivides.data()+1); + static_cast(backend())->computeDivideSizes(dc_4, postDivides.data()+1); postDivides[0] = 0; mPostFunction.first = threadNumber; diff --git a/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp b/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp index 918f47fa1..fea897d71 100644 --- a/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp +++ b/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp @@ -541,7 +541,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs auto rt = static_cast(backend()->getRuntime()); std::vector ocC4ParralSize(threadNumber + 1); ocC4ParralSize[0] = 0; - rt->computeDivideSizes(oC4, ocC4ParralSize.data()+1); + static_cast(backend())->computeDivideSizes(oC4, ocC4ParralSize.data()+1); mFunction.second = [=](int placeholder) { const float* biasPtr = bias ? 
bias->host() : nullptr; auto gemmBuffer = mTempBufferTranspose.host() + mTempBufferTranspose.stride(0) * 0; @@ -583,7 +583,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs } info[0] = 1; int hw4Stride = info[1] * unit * bytes; - rt->computeDivideSizes(number * icC4, im2colParallelSize.data() + 1); + static_cast(backend())->computeDivideSizes(number * icC4, im2colParallelSize.data() + 1); im2colParallelSize[0] = 0; MNN_CONCURRENCY_BEGIN(tId, threadNumber) { int threadEL[4]; @@ -672,7 +672,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs std::vector divides(threadNumber + 1); divides[0] = 0; - static_cast(static_cast(backend())->getRuntime())->computeDivideSizes(tileCount, divides.data() + 1); + static_cast(backend())->computeDivideSizes(tileCount, divides.data() + 1); mFunction.second = [=](int tId) { const float* biasPtr = bias ? bias->host() : nullptr; diff --git a/source/backend/cpu/compute/Int8FunctionsOpt.cpp b/source/backend/cpu/compute/Int8FunctionsOpt.cpp index 497ef3bf9..5bed95103 100644 --- a/source/backend/cpu/compute/Int8FunctionsOpt.cpp +++ b/source/backend/cpu/compute/Int8FunctionsOpt.cpp @@ -1416,12 +1416,7 @@ static void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, co size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount) { const int bytes = ((post->useInt8 == 1) ? 1 : 4); float fp32min = 0, fp32max = 0; -// if (0 == post->useInt8) { -// fp32min = (post->fp32minmax)[0]; -// fp32max = (post->fp32minmax)[1]; -// } - auto blockNum = post->blockNum; - int weight_step_Z = (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); + int weight_step_Z = src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); int weight_step_Y = (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); const auto srcSumPtr = post->srcKernelSum; if (0 == post->useInt8 && post->fp32minmax) { @@ -1486,7 +1481,7 @@ static void MNNGemmInt8AddBiasScale_16x4_w4_Unit(int8_t* dst, const int8_t* src, uint32_t c = 0xf; const int bytes = 4; float fp32min = 0, fp32max = 0; - int weight_step_Z = 0.5 * (post->blockNum * src_depth_quad) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); + int weight_step_Z = 0.5 * (src_depth_quad) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); int weight_step_Y = 0.5 * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); MNN_ASSERT(post->useInt8==0); if (post->fp32minmax) { @@ -1495,7 +1490,6 @@ static void MNNGemmInt8AddBiasScale_16x4_w4_Unit(int8_t* dst, const int8_t* src, } float* biasPtr = (float*)post->biasFloat; - int blockNum = post->blockNum; const auto srcSumPtr = post->srcKernelSum; for (int dz = 0; dz < dst_depth_quad; ++dz) { diff --git a/source/backend/cpu/x86_x64/avx/GemmInt8.cpp b/source/backend/cpu/x86_x64/avx/GemmInt8.cpp index 450714416..ed4226b89 100644 --- a/source/backend/cpu/x86_x64/avx/GemmInt8.cpp +++ b/source/backend/cpu/x86_x64/avx/GemmInt8.cpp @@ -68,13 +68,12 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const fp32min = _mm256_set1_ps((post->fp32minmax)[0]); fp32max = _mm256_set1_ps((post->fp32minmax)[1]); } - int blockNum = post->blockNum; const float* biasPtr = nullptr; if (post->biasFloat) { biasPtr = post->biasFloat; } - int weight_step_Z = 0.5 * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + int weight_step_Z = 0.5 * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); int weight_step_Y = 0.5 * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); const __m128i mask = _mm_set1_epi8(0xf); @@ -506,7 +505,6 @@ void 
_AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons fp32min = _mm256_set1_ps((post->fp32minmax)[0]); fp32max = _mm256_set1_ps((post->fp32minmax)[1]); } - int blockNum = post->blockNum; const float* biasPtr = nullptr; if (post->biasFloat) { biasPtr = post->biasFloat; @@ -554,7 +552,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons //printf("e=%d, sz=%d, dz=%d\n", realDst, src_depth_quad, dst_depth_quad); if (GEMMINT8_AVX2_E == realDst) { for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8; const float* scale_dz = post->scale + dz * AVX2_PACKINT8; auto dst_z = dst + dz * dst_step_tmp; @@ -683,7 +681,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons } if (3 == realDst) { for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8; const float* scale_dz = post->scale + dz * AVX2_PACKINT8; auto dst_z = dst + dz * dst_step_tmp; @@ -791,7 +789,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons } if (2 == realDst) { for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8; const float* scale_dz = post->scale + dz * AVX2_PACKINT8; auto dst_z = dst + dz * dst_step_tmp; @@ -879,7 +877,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons } if (1 == realDst) { for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8; const float* scale_dz = post->scale + dz * AVX2_PACKINT8; auto dst_z = dst + dz * dst_step_tmp; diff --git a/source/backend/cpu/x86_x64/avx/PackedFunction.cpp b/source/backend/cpu/x86_x64/avx/PackedFunction.cpp index eb006312c..3f2ae1291 100644 --- a/source/backend/cpu/x86_x64/avx/PackedFunction.cpp +++ b/source/backend/cpu/x86_x64/avx/PackedFunction.cpp @@ -35,8 +35,6 @@ void _AVX_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int void _AVX_MNNRoiAlignMax(float* dst, const float* src, const std::vector> &vecPos, const std::vector> &vecArea, int samplingRatioArea, int pooledHeight, int pooledWidth); void _AVX_MNNRoiAlignAvg(float* dst, const float* src, const std::vector> &vecPos, const std::vector> &vecArea, int samplingRatioArea, int pooledHeight, int pooledWidth); void _AVX_MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride, size_t eSub, size_t hSub); -void _AVX_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); void 
_AVX_MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* parameter); void _AVX_MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu); void _AVX_MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* parameter); @@ -48,7 +46,7 @@ void _AVX_MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c size_t length, size_t hSub); void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep); + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters); void _AVX_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters); } @@ -108,40 +106,25 @@ void _AVX_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, si } } -void _AVX_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { - int fx, fy; - __m256 dstValue = _mm256_setzero_ps(); - const float* src_z = src; - const float* weight_z = weight; - for (fy = 0; fy < fh; ++fy) { - const float* src_y = src_z + fy * dilateY_step; - const float* weight_y = weight_z + fy * weight_y_step; - for (fx = 0; fx < fw; ++fx) { - const float* weight_x = weight_y + PACK_UNIT * fx; - const float* src_x = src_y + fx * dilateX_step; - dstValue = _mm256_add_ps(dstValue, _mm256_mul_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x))); - } - } - _mm256_storeu_ps(dst, dstValue); -} - void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep) { + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) { int dx, fx, fy; const int unit = 4; int widthUnit = width / unit; int widthRemain = width - widthUnit * unit; const float* weight_z = weight; + auto minF = _mm256_broadcast_ss(parameters + 0); + auto maxF = _mm256_broadcast_ss(parameters + 1); + auto bv = _mm256_loadu_ps(bias); for (int y = 0; y < height; ++y) { auto srcY = src + y * srcHStep; auto dstY = dst + y * dstHStep; for (dx = 0; dx < widthUnit; ++dx) { - auto dstValue0 = _mm256_setzero_ps(); - auto dstValue1 = _mm256_setzero_ps(); - auto dstValue2 = _mm256_setzero_ps(); - auto dstValue3 = _mm256_setzero_ps(); + auto dstValue0 = bv; + auto dstValue1 = bv; + auto dstValue2 = bv; + auto dstValue3 = bv; for (fy = 0; fy < fh; ++fy) { const float* src_y = srcY + fy * dilateY_step; const float* weight_y = weight_z + fy * fw * PACK_UNIT; @@ -155,6 +138,14 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* dstValue3 = _mm256_add_ps(dstValue3, _mm256_mul_ps(_mm256_loadu_ps(src_x + 3 * src_w_setup), weightValue)); } } + dstValue0 = _mm256_min_ps(dstValue0, maxF); + dstValue1 = _mm256_min_ps(dstValue1, maxF); + dstValue2 = _mm256_min_ps(dstValue2, maxF); + dstValue3 = _mm256_min_ps(dstValue3, maxF); + dstValue0 = _mm256_max_ps(dstValue0, minF); + dstValue1 = _mm256_max_ps(dstValue1, minF); + dstValue2 = _mm256_max_ps(dstValue2, minF); + dstValue3 = 
_mm256_max_ps(dstValue3, minF); _mm256_storeu_ps(dstY + PACK_UNIT * 0, dstValue0); _mm256_storeu_ps(dstY + PACK_UNIT * 1, dstValue1); _mm256_storeu_ps(dstY + PACK_UNIT * 2, dstValue2); @@ -164,7 +155,7 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* } for (dx = 0; dx < widthRemain; ++dx) { float* dst_x = dstY + dx * PACK_UNIT; - auto dstValue = _mm256_setzero_ps(); + auto dstValue = bv; const float* src_z = srcY + src_w_setup * dx; const float* weight_z = weight; for (fy = 0; fy < fh; ++fy) { @@ -176,6 +167,8 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* dstValue = _mm256_add_ps(dstValue, _mm256_mul_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x))); } } + dstValue = _mm256_min_ps(dstValue, maxF); + dstValue = _mm256_max_ps(dstValue, minF); _mm256_storeu_ps(dst_x, dstValue); } } @@ -316,68 +309,6 @@ void _AVX_MNNGridSampleComputeCord(float* dst, const float* src, size_t inH, siz } } -static size_t _AVX_MNNGridSampleComputeOffset(int h, int w, int height, int width, bool padMode) { - if (padMode == true) { //padMode == BorderMode_ZEROS - if (h < 0 || h >= height || w < 0 || w >= width) { - return -1; - } - } else { - // Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER - // For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1), - // the leftover reflections degrade to GridSamplePaddingMode_BORDER - h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h); - w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w); - } - return h * width * PACK_UNIT + w * PACK_UNIT; -} - -void _AVX_MNNGridSampleInterp(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) { - for (auto ow = 0; ow < outW; ++ow) { - auto w = cordPtr[2 * ow + 0]; - auto h = cordPtr[2 * ow + 1]; - __m256 interp; - - if (sampleMode == true) { //sampleMode == SampleMode_NEAREST - int nh = ::floor(h + 0.5f); - int nw = ::floor(w + 0.5f); - size_t ns = _AVX_MNNGridSampleComputeOffset(nh, nw, inH, inW, padMode); - for (int k = 0; k < channelCUnit; ++k) { - interp = ns == -1 ? _mm256_set1_ps(0.f) : _mm256_loadu_ps(inputPtr + k * inOffset + ns); - _mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp); - } - } else { //sampleMode == GridSampleMode_BILINEAR - int w0_h = ::floor(h); - int w0_w = ::floor(w); - int w1_h = ::ceil(h); - int w1_w = ::ceil(w); - auto oneV = _mm256_set1_ps(1.0f); - - auto f0 = _mm256_set1_ps((float)w1_w - w); - auto f1 = _mm256_sub_ps(oneV, f0); - auto h0 = _mm256_set1_ps((float)w1_h - h); - auto h1 = _mm256_sub_ps(oneV, h0); - - size_t s00 = _AVX_MNNGridSampleComputeOffset(w0_h, w0_w, inH, inW, padMode); - size_t s01 = _AVX_MNNGridSampleComputeOffset(w0_h, w1_w, inH, inW, padMode); - size_t s10 = _AVX_MNNGridSampleComputeOffset(w1_h, w0_w, inH, inW, padMode); - size_t s11 = _AVX_MNNGridSampleComputeOffset(w1_h, w1_w, inH, inW, padMode); - - for (int k = 0; k < channelCUnit; ++k) { - __m256 i00 = s00 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s00); - __m256 i01 = s01 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s01); - __m256 i10 = s10 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s10); - __m256 i11 = s11 == -1 ? 
_mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s11); - - __m256 i0 = _mm256_add_ps(_mm256_mul_ps(i00, f0), _mm256_mul_ps(i01, f1)); - __m256 i1 = _mm256_add_ps(_mm256_mul_ps(i10, f0), _mm256_mul_ps(i11, f1)); - - interp = _mm256_add_ps(_mm256_mul_ps(i0, h0), _mm256_mul_ps(i1, h1)); - _mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp); - } - } - } -} - void _AVX_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int iw) { Vec8 max = Vec8(-FLT_MAX); for (int h = 0; h < hLen; h++, src += iw * PACK_UNIT) { @@ -524,70 +455,6 @@ static size_t _AVX_MNNGridSampleComputeOffset3D(int d, int h, int w, int depth, return ((d * height + h) * width + w) * PACK_UNIT; } -void _AVX_MNNGridSampleInterp3D(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inD, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) { - for (auto ow = 0; ow < outW; ++ow) { - auto w = cordPtr[3 * ow + 0]; - auto h = cordPtr[3 * ow + 1]; - auto d = cordPtr[3 * ow + 2]; - __m256 interp; - - if (sampleMode == true) { //sampleMode == SampleMode_NEAREST - int nd = ::floor(d + 0.5f); - int nh = ::floor(h + 0.5f); - int nw = ::floor(w + 0.5f); - size_t ns = _AVX_MNNGridSampleComputeOffset3D(nd, nh, nw, inD, inH, inW, padMode); - for (int k = 0; k < channelCUnit; ++k) { - interp = ns == -1 ? _mm256_set1_ps(0.f) : _mm256_loadu_ps(inputPtr + k * inOffset + ns); - _mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp); - } - } else { //sampleMode == GridSampleMode_BILINEAR - int w0_d = ::floor(d); - int w0_h = ::floor(h); - int w0_w = ::floor(w); - int w1_d = ::ceil(d); - int w1_h = ::ceil(h); - int w1_w = ::ceil(w); - auto oneV = _mm256_set1_ps(1.0f); - - auto f0 = _mm256_set1_ps((float)w1_w - w); - auto f1 = _mm256_sub_ps(oneV, f0); - auto h0 = _mm256_set1_ps((float)w1_h - h); - auto h1 = _mm256_sub_ps(oneV, h0); - auto d0 = _mm256_set1_ps((float)w1_d - d); - auto d1 = _mm256_sub_ps(oneV, d0); - - size_t s000 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w0_h, w0_w, inD, inH, inW, padMode); - size_t s001 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w0_h, w1_w, inD, inH, inW, padMode); - size_t s010 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w1_h, w0_w, inD, inH, inW, padMode); - size_t s011 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w1_h, w1_w, inD, inH, inW, padMode); - size_t s100 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w0_h, w0_w, inD, inH, inW, padMode); - size_t s101 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w0_h, w1_w, inD, inH, inW, padMode); - size_t s110 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w1_h, w0_w, inD, inH, inW, padMode); - size_t s111 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w1_h, w1_w, inD, inH, inW, padMode); - - for (int k = 0; k < channelCUnit; ++k) { - __m256 i000 = s000 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s000); - __m256 i001 = s001 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s001); - __m256 i010 = s010 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s010); - __m256 i011 = s011 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s011); - __m256 i100 = s100 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s100); - __m256 i101 = s101 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s101); - __m256 i110 = s110 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s110); - __m256 i111 = s111 == -1 ? 
_mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s111); - - __m256 i00 = _mm256_add_ps(_mm256_mul_ps(i000, f0), _mm256_mul_ps(i001, f1)); - __m256 i01 = _mm256_add_ps(_mm256_mul_ps(i010, f0), _mm256_mul_ps(i011, f1)); - __m256 i0 = _mm256_add_ps(_mm256_mul_ps(i00, h0), _mm256_mul_ps(i01, h1)); - __m256 i10 = _mm256_add_ps(_mm256_mul_ps(i100, f0), _mm256_mul_ps(i101, f1)); - __m256 i11 = _mm256_add_ps(_mm256_mul_ps(i110, f0), _mm256_mul_ps(i111, f1)); - __m256 i1 = _mm256_add_ps(_mm256_mul_ps(i10, h0), _mm256_mul_ps(i11, h1)); - - interp = _mm256_add_ps(_mm256_mul_ps(i0, d0), _mm256_mul_ps(i1, d1)); - _mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp); - } - } - } -} void _AVX_MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, size_t bStride, size_t height) { @@ -867,13 +734,9 @@ void _AVX_ExtraInit(void* functions) { coreFunction->MNNMatrixAdd = _AVX_MNNMatrixAdd; coreFunction->MNNMatrixSub = _AVX_MNNMatrixSub; - coreFunction->MNNConvRunForUnitDepthWise = _AVX_MNNConvRunForUnitDepthWise; coreFunction->MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwise; coreFunction->MNNAxByClampBroadcastUnit = _AVX_MNNAxByClampBroadcastUnit; coreFunction->MNNStrassenMergeCFunction = _AVX_MNNStrassenMergeCFunction; - coreFunction->MNNMultiAndDestTransformCommon23 = _AVX_MNNMultiAndDestTransformCommon23; - coreFunction->MNNSourceTransformCommonF23 = _AVX_MNNSourceTransformCommonF23; - coreFunction->MNNConvDwF23MulTransUnit = _AVX_MNNConvDwF23MulTransUnit; coreFunction->MNNReluWithSlopeChannel = _AVX_MNNReluWithSlopeChannel; coreFunction->MNNDeconvRunForLineDepthwise = _AVX_MNNDeconvRunForLineDepthwise; coreFunction->MNNDeconvRunForUnitDepthWise = _AVX_MNNDeconvRunForUnitDepthWise; @@ -881,7 +744,7 @@ void _AVX_ExtraInit(void* functions) { coreFunction->MNNGridSampleInterp = MNNGridSampleInterp; coreFunction->MNNGridSampleInterpGrad = MNNGridSampleInterpGrad; coreFunction->MNNGridSampleComputeCord3D = _AVX_MNNGridSampleComputeCord3D; - coreFunction->MNNGridSampleInterp3D = _AVX_MNNGridSampleInterp3D; + coreFunction->MNNGridSampleInterp3D = MNNGridSampleInterp3D; coreFunction->MNNRoiPoolingMax = _AVX_MNNRoiPoolingMax; coreFunction->MNNRoiAlignMax = _AVX_MNNRoiAlignMax; coreFunction->MNNRoiAlignAvg = _AVX_MNNRoiAlignAvg; diff --git a/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp b/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp index 31335e2cf..5d73ffc50 100644 --- a/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp +++ b/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp @@ -115,7 +115,6 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s fp32min = _mm512_set1_ps((post->fp32minmax)[0]); fp32max = _mm512_set1_ps((post->fp32minmax)[1]); } - auto blockNum = post->blockNum; const float* biasPtr = nullptr; const float* bias_dz = nullptr; const float* extraB_dz = nullptr; @@ -162,7 +161,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s } } } - int weightZStride = blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + int weightZStride = src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); if (realDst == GEMMINT8_AVX512_E) { for (int dz = 0; dz < dzU; ++dz) { auto weight_dz = weight + dz * weightZStride; @@ -1452,7 +1451,6 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t fp32min = _mm512_set1_ps((post->fp32minmax)[0]); fp32max = _mm512_set1_ps((post->fp32minmax)[1]); } - auto blockNum = 
post->blockNum; const float* biasPtr = nullptr; const float* bias_dz = nullptr; const float* extraB_dz = nullptr; @@ -1500,7 +1498,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t } } } - int weight_step_Z = static_cast(blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2); // sizeof(int4_t) + int weight_step_Z = static_cast(src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2); // sizeof(int4_t) int weight_step_Y = static_cast(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H / 2); // sizeof(int4_t) if (realDst == GEMMINT8_AVX512_E) { diff --git a/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl b/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl index 5addec946..44e9bc36f 100644 --- a/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl +++ b/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl @@ -105,7 +105,6 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s fp32min = _mm512_set1_ps((post->fp32minmax)[0]); fp32max = _mm512_set1_ps((post->fp32minmax)[1]); } - auto blockNum = post->blockNum; const float* biasPtr = nullptr; const float* bias_dz = nullptr; const float* extraB_dz = nullptr; @@ -113,7 +112,7 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s biasPtr = post->biasFloat; } - int weightZStride = blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + int weightZStride = src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); auto srcKernelSumPtr = post->srcKernelSum; __m512 kernelSum0 = _mm512_setzero_ps(); @@ -1444,7 +1443,6 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight fp32min = _mm512_set1_ps((post->fp32minmax)[0]); fp32max = _mm512_set1_ps((post->fp32minmax)[1]); } - auto blockNum = post->blockNum; const float* biasPtr = nullptr; const float* bias_dz = nullptr; const float* extraB_dz = nullptr; @@ -1458,7 +1456,7 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight __m512 kernelSum2 = _mm512_setzero_ps(); __m512 kernelSum3 = _mm512_setzero_ps(); - int weight_step_Z = static_cast(src_depth_quad * blockNum * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2); + int weight_step_Z = static_cast(src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2); int weight_step_Y = static_cast(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H / 2); const __m512i mask = _mm512_set1_epi8(0xf); if (GEMMINT8_AVX512_E == realDst) { diff --git a/source/backend/cpu/x86_x64/avx512/PackedFunction.cpp b/source/backend/cpu/x86_x64/avx512/PackedFunction.cpp index 047c3dc7a..3542e717c 100644 --- a/source/backend/cpu/x86_x64/avx512/PackedFunction.cpp +++ b/source/backend/cpu/x86_x64/avx512/PackedFunction.cpp @@ -124,40 +124,25 @@ void _AVX512_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, } } -void _AVX512_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { - int fx, fy; - __m512 dstValue = _mm512_setzero_ps(); - const float* src_z = src; - const float* weight_z = weight; - for (fy = 0; fy < fh; ++fy) { - const float* src_y = src_z + fy * dilateY_step; - const float* weight_y = weight_z + fy * weight_y_step; - for (fx = 0; fx < fw; ++fx) { - const float* weight_x = weight_y + PACK_UNIT * fx; - const float* src_x = src_y + fx * dilateX_step; - dstValue = _mm512_fmadd_ps(_mm512_loadu_ps(src_x), _mm512_loadu_ps(weight_x), dstValue); - } - } - _mm512_storeu_ps(dst, dstValue); -} - 
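Across the SSE, AVX, AVX-FMA and AVX-512 hunks in this patch, MNNConvRunForLineDepthwise gains two extra arguments, bias and parameters, so the bias add and the min/max clamp from the post parameters are applied inside the depthwise kernel itself: every accumulator starts from the broadcast bias vector and is clamped with min/max just before the store, as the _AVX512 variant that follows shows. A scalar reference of that pattern, with a simplified hypothetical signature (the real kernels process PACK_UNIT channels per vector register):

#include <algorithm>
#include <cstddef>

// Illustrative scalar version of the fused depthwise line kernel.
// 'parameters' holds {minValue, maxValue}, matching how the SIMD kernels
// broadcast parameters[0]/parameters[1] before clamping.
static void DepthwiseLineRef(float* dst, const float* src, const float* weight,
                             size_t width, size_t srcWStep, size_t fw, size_t fh,
                             size_t dilateXStep, size_t dilateYStep,
                             float bias, const float* parameters) {
    const float minF = parameters[0];
    const float maxF = parameters[1];
    for (size_t dx = 0; dx < width; ++dx) {
        float acc = bias;                        // accumulator starts from the bias
        const float* srcX = src + dx * srcWStep;
        for (size_t fy = 0; fy < fh; ++fy) {
            const float* srcY = srcX + fy * dilateYStep;
            const float* wY   = weight + fy * fw;
            for (size_t fx = 0; fx < fw; ++fx) {
                acc += srcY[fx * dilateXStep] * wY[fx];
            }
        }
        dst[dx] = std::max(std::min(acc, maxF), minF);  // clamp in-kernel, mirroring min_ps/max_ps before the stores
    }
}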
void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep) { + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) { int dx, fx, fy; const int unit = 4; int widthUnit = width / unit; int widthRemain = width - widthUnit * unit; const float* weight_z = weight; + auto minF = _mm512_broadcastss_ps(_mm_load_ss(parameters + 0)); + auto maxF = _mm512_broadcastss_ps(_mm_load_ss(parameters + 1)); + auto bv = _mm512_loadu_ps(bias); for (int y = 0; y < height; ++y) { auto srcY = src + y * srcHStep; auto dstY = dst + y * dstHStep; for (dx = 0; dx < widthUnit; ++dx) { - auto dstValue0 = _mm512_setzero_ps(); - auto dstValue1 = _mm512_setzero_ps(); - auto dstValue2 = _mm512_setzero_ps(); - auto dstValue3 = _mm512_setzero_ps(); + auto dstValue0 = bv; + auto dstValue1 = bv; + auto dstValue2 = bv; + auto dstValue3 = bv; for (fy = 0; fy < fh; ++fy) { const float* src_y = srcY + fy * dilateY_step; const float* weight_y = weight_z + fy * fw * PACK_UNIT; @@ -171,6 +156,14 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa dstValue3 = _mm512_fmadd_ps(_mm512_loadu_ps(src_x + 3 * src_w_setup), weightValue, dstValue3); } } + dstValue0 = _mm512_min_ps(dstValue0, maxF); + dstValue1 = _mm512_min_ps(dstValue1, maxF); + dstValue2 = _mm512_min_ps(dstValue2, maxF); + dstValue3 = _mm512_min_ps(dstValue3, maxF); + dstValue0 = _mm512_max_ps(dstValue0, minF); + dstValue1 = _mm512_max_ps(dstValue1, minF); + dstValue2 = _mm512_max_ps(dstValue2, minF); + dstValue3 = _mm512_max_ps(dstValue3, minF); _mm512_storeu_ps(dstY + PACK_UNIT * 0, dstValue0); _mm512_storeu_ps(dstY + PACK_UNIT * 1, dstValue1); _mm512_storeu_ps(dstY + PACK_UNIT * 2, dstValue2); @@ -180,7 +173,7 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa } for (dx = 0; dx < widthRemain; ++dx) { float* dst_x = dstY + dx * PACK_UNIT; - auto dstValue = _mm512_setzero_ps(); + auto dstValue = bv; const float* src_z = srcY + src_w_setup * dx; const float* weight_z = weight; for (fy = 0; fy < fh; ++fy) { @@ -192,6 +185,8 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa dstValue = _mm512_fmadd_ps(_mm512_loadu_ps(src_x), _mm512_loadu_ps(weight_x), dstValue); } } + dstValue = _mm512_min_ps(dstValue, maxF); + dstValue = _mm512_max_ps(dstValue, minF); _mm512_storeu_ps(dst_x, dstValue); } } @@ -307,68 +302,6 @@ void _AVX512_MNNGridSampleComputeCord(float* dst, const float* src, size_t inH, } } -static size_t _AVX512_MNNGridSampleComputeOffset(int h, int w, int height, int width, bool padMode) { - if (padMode == true) { //padMode == BorderMode_ZEROS - if (h < 0 || h >= height || w < 0 || w >= width) { - return -1; - } - } else { - // Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER - // For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1), - // the leftover reflections degrade to GridSamplePaddingMode_BORDER - h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h); - w = w < 0 ? 0 : ( w > (width - 1) ? 
(width - 1) : w); - } - return h * width * PACK_UNIT + w * PACK_UNIT; -} - -void _AVX512_MNNGridSampleInterp(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) { - for (auto ow = 0; ow < outW; ++ow) { - auto w = cordPtr[2 * ow + 0]; - auto h = cordPtr[2 * ow + 1]; - __m512 interp; - - if (sampleMode == true) { //sampleMode == SampleMode_NEAREST - int nh = ::floor(h + 0.5f); - int nw = ::floor(w + 0.5f); - size_t ns = _AVX512_MNNGridSampleComputeOffset(nh, nw, inH, inW, padMode); - for (int k = 0; k < channelCUnit; ++k) { - interp = ns == -1 ? _mm512_set1_ps(0.f) : _mm512_loadu_ps(inputPtr + k * inOffset + ns); - _mm512_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp); - } - } else { //sampleMode == GridSampleMode_BILINEAR - int w0_h = ::floor(h); - int w0_w = ::floor(w); - int w1_h = ::ceil(h); - int w1_w = ::ceil(w); - auto oneV = _mm512_set1_ps(1.0f); - - auto f0 = _mm512_set1_ps((float)w1_w - w); - auto f1 = _mm512_sub_ps(oneV, f0); - auto h0 = _mm512_set1_ps((float)w1_h - h); - auto h1 = _mm512_sub_ps(oneV, h0); - - size_t s00 = _AVX512_MNNGridSampleComputeOffset(w0_h, w0_w, inH, inW, padMode); - size_t s01 = _AVX512_MNNGridSampleComputeOffset(w0_h, w1_w, inH, inW, padMode); - size_t s10 = _AVX512_MNNGridSampleComputeOffset(w1_h, w0_w, inH, inW, padMode); - size_t s11 = _AVX512_MNNGridSampleComputeOffset(w1_h, w1_w, inH, inW, padMode); - - for (int k = 0; k < channelCUnit; ++k) { - __m512 i00 = s00 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s00); - __m512 i01 = s01 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s01); - __m512 i10 = s10 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s10); - __m512 i11 = s11 == -1 ? 
_mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s11); - - __m512 i0 = _mm512_add_ps(_mm512_mul_ps(i00, f0), _mm512_mul_ps(i01, f1)); - __m512 i1 = _mm512_add_ps(_mm512_mul_ps(i10, f0), _mm512_mul_ps(i11, f1)); - - interp = _mm512_add_ps(_mm512_mul_ps(i0, h0), _mm512_mul_ps(i1, h1)); - _mm512_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp); - } - } - } -} - void _AVX512_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int iw) { Vec16 max = Vec16(-FLT_MAX); for (int h = 0; h < hLen; h++, src += iw * PACK_UNIT) { @@ -752,13 +685,9 @@ void _AVX512_ExtraInit(void* functions) { coreFunction->MNNCountMaxMinValue = _AVX512_MNNComputeScaleZeroScalar; coreFunction->MNNAbsMax = _AVX512_MNNAbsMaxFP32; - coreFunction->MNNConvRunForUnitDepthWise = _AVX512_MNNConvRunForUnitDepthWise; coreFunction->MNNConvRunForLineDepthwise = _AVX512_MNNConvRunForLineDepthwise; coreFunction->MNNAxByClampBroadcastUnit = _AVX512_MNNAxByClampBroadcastUnit; coreFunction->MNNStrassenMergeCFunction = _AVX512_MNNStrassenMergeCFunction; - coreFunction->MNNMultiAndDestTransformCommon23 = _AVX512_MNNMultiAndDestTransformCommon23; - coreFunction->MNNSourceTransformCommonF23 = _AVX512_MNNSourceTransformCommonF23; - coreFunction->MNNConvDwF23MulTransUnit = _AVX512_MNNConvDwF23MulTransUnit; coreFunction->MNNReluWithSlopeChannel = _AVX512_MNNReluWithSlopeChannel; coreFunction->MNNDeconvRunForLineDepthwise = _AVX512_MNNDeconvRunForLineDepthwise; coreFunction->MNNDeconvRunForUnitDepthWise = _AVX512_MNNDeconvRunForUnitDepthWise; @@ -767,6 +696,7 @@ void _AVX512_ExtraInit(void* functions) { coreFunction->MNNRoiAlignMax = _AVX512_MNNRoiAlignMax; coreFunction->MNNRoiAlignAvg = _AVX512_MNNRoiAlignAvg; coreFunction->MNNGridSampleInterp = MNNGridSampleInterp; + coreFunction->MNNGridSampleInterp3D = MNNGridSampleInterp3D; coreFunction->MNNGridSampleInterpGrad = MNNGridSampleInterpGrad; coreFunction->MNNGetSparseMatMulPackMode = _AVX512_MNNGetSparseMatMulPackMode; diff --git a/source/backend/cpu/x86_x64/avxfma/PackedFunction.cpp b/source/backend/cpu/x86_x64/avxfma/PackedFunction.cpp index 8b3dc590a..6102508a6 100644 --- a/source/backend/cpu/x86_x64/avxfma/PackedFunction.cpp +++ b/source/backend/cpu/x86_x64/avxfma/PackedFunction.cpp @@ -11,40 +11,25 @@ #define PACK_UNIT 8 -void _AVX_MNNConvRunForUnitDepthWiseFMA(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { - int fx, fy; - __m256 dstValue = _mm256_setzero_ps(); - const float* src_z = src; - const float* weight_z = weight; - for (fy = 0; fy < fh; ++fy) { - const float* src_y = src_z + fy * dilateY_step; - const float* weight_y = weight_z + fy * weight_y_step; - for (fx = 0; fx < fw; ++fx) { - const float* weight_x = weight_y + PACK_UNIT * fx; - const float* src_x = src_y + fx * dilateX_step; - dstValue = _mm256_fmadd_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x), dstValue); - } - } - _mm256_storeu_ps(dst, dstValue); -} - void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep) { + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) { int dx, fx, fy; const int unit = 4; int widthUnit = width / unit; int widthRemain = width - widthUnit * unit; const float* weight_z = weight; + auto minF = _mm256_broadcast_ss(parameters + 0); + auto maxF = 
_mm256_broadcast_ss(parameters + 1); + auto bv = _mm256_loadu_ps(bias); for (int y = 0; y < height; ++y) { auto srcY = src + y * srcHStep; auto dstY = dst + y * dstHStep; for (dx = 0; dx < widthUnit; ++dx) { - auto dstValue0 = _mm256_setzero_ps(); - auto dstValue1 = _mm256_setzero_ps(); - auto dstValue2 = _mm256_setzero_ps(); - auto dstValue3 = _mm256_setzero_ps(); + auto dstValue0 = bv; + auto dstValue1 = bv; + auto dstValue2 = bv; + auto dstValue3 = bv; for (fy = 0; fy < fh; ++fy) { const float* src_y = srcY + fy * dilateY_step; const float* weight_y = weight_z + fy * fw * PACK_UNIT; @@ -58,6 +43,14 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa dstValue3 = _mm256_fmadd_ps(_mm256_loadu_ps(src_x + 3 * src_w_setup), weightValue, dstValue3); } } + dstValue0 = _mm256_min_ps(dstValue0, maxF); + dstValue1 = _mm256_min_ps(dstValue1, maxF); + dstValue2 = _mm256_min_ps(dstValue2, maxF); + dstValue3 = _mm256_min_ps(dstValue3, maxF); + dstValue0 = _mm256_max_ps(dstValue0, minF); + dstValue1 = _mm256_max_ps(dstValue1, minF); + dstValue2 = _mm256_max_ps(dstValue2, minF); + dstValue3 = _mm256_max_ps(dstValue3, minF); _mm256_storeu_ps(dstY + PACK_UNIT * 0, dstValue0); _mm256_storeu_ps(dstY + PACK_UNIT * 1, dstValue1); _mm256_storeu_ps(dstY + PACK_UNIT * 2, dstValue2); @@ -67,7 +60,7 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa } for (dx = 0; dx < widthRemain; ++dx) { float* dst_x = dstY + dx * PACK_UNIT; - auto dstValue = _mm256_setzero_ps(); + auto dstValue = bv; const float* src_z = srcY + src_w_setup * dx; const float* weight_z = weight; for (fy = 0; fy < fh; ++fy) { @@ -79,6 +72,8 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa dstValue = _mm256_fmadd_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x), dstValue); } } + dstValue = _mm256_min_ps(dstValue, maxF); + dstValue = _mm256_max_ps(dstValue, minF); _mm256_storeu_ps(dst_x, dstValue); } } @@ -173,8 +168,6 @@ static void _AVXFMA_MNNAdjustOptimalSparseKernel(int& sparseBlockOC, MNN::CoreFu void _AVX_ExtraInitFMA(void* functions) { auto coreFunction = static_cast(functions); coreFunction->MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwiseFMA; - coreFunction->MNNConvRunForUnitDepthWise = _AVX_MNNConvRunForUnitDepthWiseFMA; - coreFunction->MNNConvDwF23MulTransUnit = _AVX_MNNConvDwF23MulTransUnitFMA; // sparse conv init coreFunction->MNNAdjustOptimalSparseKernel = _AVXFMA_MNNAdjustOptimalSparseKernel; diff --git a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp index 7e8fff748..a132b48b9 100644 --- a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp +++ b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp @@ -68,7 +68,7 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const void _SSE_MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el); void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep); + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters); void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst); void _SSE_MNNExpC8(float* dest, 
const float* source, float* offset, const float* parameters, size_t countC8); diff --git a/source/backend/cpu/x86_x64/sse/GemmInt8.cpp b/source/backend/cpu/x86_x64/sse/GemmInt8.cpp index f1fb9b338..d20f3dc23 100644 --- a/source/backend/cpu/x86_x64/sse/GemmInt8.cpp +++ b/source/backend/cpu/x86_x64/sse/GemmInt8.cpp @@ -73,9 +73,8 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons if (post->biasFloat) { biasPtr = post->biasFloat; } - auto blockNum = post->blockNum; for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); + const auto weight_dz = weight + dz * src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); const auto weightBias_dz = post->weightQuanBias + dz * GEMM_INT8_UNIT; const float* scale_dz = nullptr; scale_dz = post->scale + dz * GEMM_INT8_UNIT; @@ -324,8 +323,7 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const if (post->biasFloat) { biasPtr = post->biasFloat; } - int blockNum = post->blockNum; - int weight_step_Z = 0.5 * (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); + int weight_step_Z = 0.5 * src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); int weight_step_Y = 0.5 * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); auto oneValue = _mm_set1_epi16(1); diff --git a/source/backend/cpu/x86_x64/sse/PackedFunction.cpp b/source/backend/cpu/x86_x64/sse/PackedFunction.cpp index 0006aeb21..ab6d5a705 100644 --- a/source/backend/cpu/x86_x64/sse/PackedFunction.cpp +++ b/source/backend/cpu/x86_x64/sse/PackedFunction.cpp @@ -65,7 +65,7 @@ void _SSE_MNNReluWithSlopeChannel(float* dst, const float* src, const float* slo void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep) { + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) { int dx, fx, fy; const int unit = 8; int widthUnit = width / unit; @@ -75,18 +75,21 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* if (need4) { widthRemain-=4; } + auto minF = _mm_set1_ps(parameters[0]); + auto maxF = _mm_set1_ps(parameters[1]); + auto bv = _mm_loadu_ps(bias); for (int y = 0; y < height; ++y) { auto srcY = src + y * srcHStep; auto dstY = dst + y * dstHStep; for (dx = 0; dx < widthUnit; ++dx) { - auto dstValue0 = _mm_set1_ps(0.0f); - auto dstValue1 = _mm_set1_ps(0.0f); - auto dstValue2 = _mm_set1_ps(0.0f); - auto dstValue3 = _mm_set1_ps(0.0f); - auto dstValue4 = _mm_set1_ps(0.0f); - auto dstValue5 = _mm_set1_ps(0.0f); - auto dstValue6 = _mm_set1_ps(0.0f); - auto dstValue7 = _mm_set1_ps(0.0f); + auto dstValue0 = bv; + auto dstValue1 = bv; + auto dstValue2 = bv; + auto dstValue3 = bv; + auto dstValue4 = bv; + auto dstValue5 = bv; + auto dstValue6 = bv; + auto dstValue7 = bv; for (fy = 0; fy < fh; ++fy) { const float* src_y = srcY + fy * dilateY_step; const float* weight_y = weight_z + fy * fw * 4; @@ -104,6 +107,24 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* dstValue7 = _mm_add_ps(dstValue7, _mm_mul_ps(_mm_loadu_ps(src_x + 7 * src_w_setup), weightValue)); } } + dstValue0 = _mm_min_ps(dstValue0, maxF); + dstValue1 = _mm_min_ps(dstValue1, maxF); + dstValue2 = _mm_min_ps(dstValue2, maxF); + dstValue3 = _mm_min_ps(dstValue3, maxF); + dstValue4 = _mm_min_ps(dstValue4, maxF); + dstValue5 = 
_mm_min_ps(dstValue5, maxF); + dstValue6 = _mm_min_ps(dstValue6, maxF); + dstValue7 = _mm_min_ps(dstValue7, maxF); + + dstValue0 = _mm_max_ps(dstValue0, minF); + dstValue1 = _mm_max_ps(dstValue1, minF); + dstValue2 = _mm_max_ps(dstValue2, minF); + dstValue3 = _mm_max_ps(dstValue3, minF); + dstValue4 = _mm_max_ps(dstValue4, minF); + dstValue5 = _mm_max_ps(dstValue5, minF); + dstValue6 = _mm_max_ps(dstValue6, minF); + dstValue7 = _mm_max_ps(dstValue7, minF); + _mm_storeu_ps(dstY + 4 * 0, dstValue0); _mm_storeu_ps(dstY + 4 * 1, dstValue1); _mm_storeu_ps(dstY + 4 * 2, dstValue2); @@ -116,10 +137,10 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* srcY += unit * src_w_setup; } if (need4) { - auto dstValue0 = _mm_set1_ps(0.0f); - auto dstValue1 = _mm_set1_ps(0.0f); - auto dstValue2 = _mm_set1_ps(0.0f); - auto dstValue3 = _mm_set1_ps(0.0f); + auto dstValue0 = bv; + auto dstValue1 = bv; + auto dstValue2 = bv; + auto dstValue3 = bv; for (fy = 0; fy < fh; ++fy) { const float* src_y = srcY + fy * dilateY_step; const float* weight_y = weight_z + fy * fw * 4; @@ -133,6 +154,15 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* dstValue3 = _mm_add_ps(dstValue3, _mm_mul_ps(_mm_loadu_ps(src_x + 3 * src_w_setup), weightValue)); } } + dstValue0 = _mm_min_ps(dstValue0, maxF); + dstValue1 = _mm_min_ps(dstValue1, maxF); + dstValue2 = _mm_min_ps(dstValue2, maxF); + dstValue3 = _mm_min_ps(dstValue3, maxF); + + dstValue0 = _mm_max_ps(dstValue0, minF); + dstValue1 = _mm_max_ps(dstValue1, minF); + dstValue2 = _mm_max_ps(dstValue2, minF); + dstValue3 = _mm_max_ps(dstValue3, minF); _mm_storeu_ps(dstY + 4 * 0, dstValue0); _mm_storeu_ps(dstY + 4 * 1, dstValue1); _mm_storeu_ps(dstY + 4 * 2, dstValue2); @@ -142,7 +172,7 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* } for (dx = 0; dx < widthRemain; ++dx) { float* dst_x = dstY + dx * 4; - auto dstValue = _mm_set1_ps(0.0f); + auto dstValue = bv; const float* src_z = srcY + src_w_setup * dx; const float* weight_z = weight; for (fy = 0; fy < fh; ++fy) { @@ -154,6 +184,8 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* dstValue = _mm_add_ps(dstValue, _mm_mul_ps(_mm_loadu_ps(src_x), _mm_loadu_ps(weight_x))); } } + dstValue = _mm_min_ps(dstValue, maxF); + dstValue = _mm_max_ps(dstValue, minF); _mm_storeu_ps(dst_x, dstValue); } } diff --git a/source/backend/metal/AllShader.cpp b/source/backend/metal/AllShader.cpp index 3a695797e..16c50153f 100644 --- a/source/backend/metal/AllShader.cpp +++ b/source/backend/metal/AllShader.cpp @@ -792,6 +792,44 @@ const char* shader_MetalLayerNorm_metal = " out_data[gid.x]=(M4)(norm);\n" " }\n" "}\n" +"kernel void layernorm_m1x4_rms(const device M4 *in [[buffer(0)]],\n" +" device M4 *out [[buffer(1)]],\n" +" constant layernorm_constants& cst [[buffer(2)]],\n" +" const device float4 *gamma [[buffer(3)]],\n" +" const device float4 *beta [[buffer(4)]],\n" +" uint gid [[threadgroup_position_in_grid]],\n" +" uint tiisg[[thread_index_in_simdgroup]],\n" +" uint sgitg[[simdgroup_index_in_threadgroup]]) {\n" +" int total_idx=(gid*4+sgitg);\n" +" int in_idx=total_idx % (cst.inside/4);\n" +" int out_idx=total_idx/(cst.inside/4);\n" +" auto in_data=in+out_idx*cst.inside/4;\n" +" auto out_data=out+out_idx*cst.inside/4;\n" +" float square_sum=0.0f;\n" +" for(int i=tiisg; i3) {xy_out[3]=activate(M4(result3),cst.activation); }\n" "}\n" +"kernel void conv1x1_g1z4_m1w4(const device M4 *in [[buffer(0)]],\n" +" device M4 *out 
[[buffer(1)]],\n" +" constant conv1x1_constants& cst [[buffer(2)]],\n" +" const device MNN::uchar4x2 *wt [[buffer(3)]],\n" +" const device M4 *biasTerms [[buffer(4)]],\n" +" const device float4 *dequantScale [[buffer(5)]],\n" +" uint3 gid[[threadgroup_position_in_grid]],\n" +" uint tiisg[[thread_index_in_simdgroup]],\n" +" uint sgitg[[simdgroup_index_in_threadgroup]]) {\n" +" int uz=gid.x*2+sgitg;\n" +" int rx=gid.y;\n" +" auto xy_wt=wt+uz*cst.input_slice;\n" +" auto xy_in0=in+(int)gid.z*cst.input_size+rx+0;\n" +" auto xy_out=out+(int)gid.z*cst.output_size+uz*cst.output_size*cst.batch+rx;\n" +" auto biasValue=FLOAT4(biasTerms[uz]);\n" +" FLOAT4 result0=FLOAT4(0);\n" +" int block=(cst.input_slice+cst.block_size-1)/cst.block_size;\n" +" for (int bi=0; bi> 4)-8,(float)(w_int4[i][0] & 15)-8,(float)(w_int4[i][1] >> 4)-8,(float)(w_int4[i][1] & 15)-8);\n" +" FLOAT4 res=w4*scale[i]+dequant_bias[i];\n" +" w_dequant[i]=res;\n" +" }\n" +" result0 += FLOAT4(in40*w_dequant);\n" +" \n" +"// FLOAT4x4 w_dequant;\n" +"// for (int i=0; i<4; ++i) {\n" +"// FLOAT4 w4=FLOAT4((float)(w_int4[i][0] >> 4)-8,(float)(w_int4[i][0] & 15)-8,(float)(w_int4[i][1] >> 4)-8,(float)(w_int4[i][1] & 15)-8);\n" +"// FLOAT4 res=w4*scale[i]+dequant_bias[i];\n" +"// w_dequant[i]=w4;\n" +"// }\n" +"//\n" +"// FLOAT4 temp=FLOAT4(in40*w_dequant);\n" +"// result0 += temp*scale+(in40.x+in40.y+in40.z+in40.w)*dequant_bias;\n" +" }\n" +" }\n" +" FLOAT4 res;\n" +" res.x=simd_sum(result0.x);\n" +" res.y=simd_sum(result0.y);\n" +" res.z=simd_sum(result0.z);\n" +" res.w=simd_sum(result0.w);\n" +" /* true */\n" +" if (tiisg == 0) {\n" +" xy_out[0]=activate(M4(res+biasValue),cst.activation);\n" +" }\n" +"}\n" "kernel void conv1x1_g1z8(const device M4 *in [[buffer(0)]],\n" " device M4 *out [[buffer(1)]],\n" " constant conv1x1_constants& cst [[buffer(2)]],\n" @@ -1960,6 +2052,7 @@ const char* shader_MetalDefine_metal = "// –––––––––––––––––––––––––––––––––––––––––––––––––––\n" "// Macro\n" "// –––––––––––––––––––––––––––––––––––––––––––––––––––\n" +"#define SIMD_GROUP_WIDTH 32 // setting SIMD group size is 32\n" "#define UP_DIV(x,y) ( ((x)+(y)-1)/(y) )\n" "#define ROUND_UP(x,y) ( ((x)+(y)-1)/(y)*(y) )\n" "// whether computer with float32 when store with float16\n" diff --git a/source/backend/metal/MNNMetalContext.h b/source/backend/metal/MNNMetalContext.h index 2159ccf1f..ca5a589d1 100644 --- a/source/backend/metal/MNNMetalContext.h +++ b/source/backend/metal/MNNMetalContext.h @@ -33,8 +33,8 @@ typedef enum { /** metal device */ @property (strong, nonatomic, readonly) id device; /** max memory length cound be used in threadgroup */ -@property (assign, nonatomic, readonly) BOOL isCommitEachShader; @property (assign, nonatomic, readonly) BOOL isIphone; +@property (assign, nonatomic, readonly) BOOL isSimdGroupAvailable; /** * @brief alloc temp buffer on device diff --git a/source/backend/metal/MNNMetalContext.mm b/source/backend/metal/MNNMetalContext.mm index e23fda331..b271c1243 100644 --- a/source/backend/metal/MNNMetalContext.mm +++ b/source/backend/metal/MNNMetalContext.mm @@ -79,30 +79,17 @@ static void createLibrary(id device, NSMutableDictionarydevice; _cachesFp16 = [NSMutableDictionary dictionary]; _cachesFp32 = [NSMutableDictionary dictionary]; - _isCommitEachShader = self.class.commit_frequent; _isIphone = self.class.isIphone; + _isSimdGroupAvailable = self.class.isSimdGroupAvailable; createLibrary(_device, _cachesFp16, true); createLibrary(_device, _cachesFp32, false); return nil != _device; diff --git 
a/source/backend/metal/MetalAttention.mm b/source/backend/metal/MetalAttention.mm index e1d1ef28f..9679fe1ab 100644 --- a/source/backend/metal/MetalAttention.mm +++ b/source/backend/metal/MetalAttention.mm @@ -39,7 +39,9 @@ kernel void main0(const device T* input0 [[buffer(0)]], const device int* mask [[buffer(4)]], #endif constant Param& param [[buffer(5)]], - uint3 gid[[thread_position_in_grid]]) { + uint3 gid[[thread_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int x = gid.x; // query_seq_len const int y = gid.y; // head_num const int z = gid.z; // key_seq_len @@ -102,7 +104,7 @@ kernel void main0(const device T* input0 [[buffer(0)]], } } out *= Vscale; - output[y + z * head_num] = (T)out; + output[y * key_seq_len + z] = (T)out; #endif } @@ -158,18 +160,18 @@ kernel void main0(const device T* input0 [[buffer(0)]], } output[ x * stride * group + (y * head_dim + z)] = out; #else - device const T *A_offset = input0 + y; + device const T *A_offset = input0 + y * value_seq_len; device const T *B_offset = input1 + offset_head; device T *Pastvalue_offset = past_value + offset_head; float out = 0; for(int i = 0; i < value_seq_len - 1; ++i){ - float A = (float)A_offset[i * head_num]; + float A = (float)A_offset[i]; float B = (float)Pastvalue_offset[i * stride]; out += A * B; } - out += (float)A_offset[(value_seq_len - 1)*head_num] * (float)B_offset[0]; + out += (float)A_offset[(value_seq_len - 1)] * (float)B_offset[0]; if (yr == 0) { Pastvalue_offset[(value_seq_len - 1)*stride] = B_offset[0]; } @@ -282,6 +284,7 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { void AttentionBufExecution::onEncode(const std::vector &inputs, const std::vector &outputs, id encoder) { + auto query = inputs[0]; auto key = inputs[1]; auto value = inputs[2]; @@ -407,8 +410,8 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { // For softmax parameter int inside, outside; if (mIsDecode) { - inside = mNumHead; - outside = 1; + inside = 1; + outside = mNumHead; } else { inside = 1; outside = mCache->mKv_seq_len * mNumHead; diff --git a/source/backend/metal/MetalBackend.hpp b/source/backend/metal/MetalBackend.hpp index 22eee335f..dfcc571dc 100644 --- a/source/backend/metal/MetalBackend.hpp +++ b/source/backend/metal/MetalBackend.hpp @@ -189,10 +189,7 @@ class MetalBackend : public Backend { id encoder, id shape) const; void flushEncoder() const; - id encoder_for_net() const; - void addOpEncoder(std::function opEncoder); - - bool isCommandEncoderSet(); + id encoder_for_net() const; BufferAllocator* getBufferPool() const; EagerBufferAllocator *getStaticBufferPool() const { @@ -233,11 +230,8 @@ class MetalBackend : public Backend { const MetalRuntime* mRuntime; mutable NSUInteger mEncoderCount = 0; - mutable bool mOpEncoderSet = false;//whether has set encoder mutable bool mSupportDeferEncode = true; - mutable bool mFrameEncodeCache = false; - std::vector> mOpEncoders; mutable id mComputeEncoder = nil; std::shared_ptr mBufferPool; std::shared_ptr mBufferPoolShapeImmutable; diff --git a/source/backend/metal/MetalBackend.mm b/source/backend/metal/MetalBackend.mm index 268db6fde..3d680b65f 100644 --- a/source/backend/metal/MetalBackend.mm +++ b/source/backend/metal/MetalBackend.mm @@ -229,6 +229,7 @@ MemChunk chunk() override { } return NULL; } + //MNN_PRINT("support type [%s]\n", EnumNameOpType(op->type())); auto exe = iter->second->onCreate(inputs, op, this, outputs); if (NULL == exe) { @@ -258,15 
+259,8 @@ MemChunk chunk() override { void MetalBackend::onExecuteEnd() const { flushEncoder(); commit_net(); - - if(mFrameEncodeCache) { - // Prepare for next execute - for(auto opEncoder : mOpEncoders) { - opEncoder(); - } - mOpEncoderSet = true; - } } + BufferAllocator* MetalBackend::getBufferPool() const { return mCurrentAllocator; } @@ -302,18 +296,11 @@ MemChunk chunk() override { return true; } -bool MetalBackend::isCommandEncoderSet() { - return mOpEncoderSet;// !isCommitEachShader & mOpFullSupport -} - bool MetalBackend::isCmdBufferCommit() { auto ctx = (__bridge MNNMetalContext *)context(); - if(!ctx.isCommitEachShader) { - return false; - } //TODO: set magic number - const int magicNum = 2; + const int magicNum = mRuntime->hint().encorderNumForCommit; mEncoderCount++; if(mEncoderCount != 0 && mEncoderCount % magicNum == 0) { return true; @@ -321,12 +308,6 @@ MemChunk chunk() override { return false; } -void MetalBackend::addOpEncoder(std::function opEncoder) { - if(mFrameEncodeCache) { - mOpEncoders.push_back(opEncoder); - } -} - id MetalBackend::getHostBuffer(size_t size) const { size = UP_DIV(size, METAL_CONST_BUFFER_LIMIT) * METAL_CONST_BUFFER_LIMIT; // reuse @@ -534,11 +515,7 @@ kernel void main0(const device IType *in [[buffer(0)]], device OType *out [[buff } })metal"; -void MetalBackend::onResizeBegin() { - mFrameEncodeCache = false; - mOpEncoderSet = false; - mOpEncoders.clear(); - +void MetalBackend::onResizeBegin() { // Abort last inference task if needed flushEncoder(); _commandBuffer_net = nil; @@ -549,7 +526,6 @@ kernel void main0(const device IType *in [[buffer(0)]], device OType *out [[buff ErrorCode MetalBackend::onResizeEnd() { auto ctx = (__bridge MNNMetalContext *)context(); - mFrameEncodeCache = (!ctx.isCommitEachShader && mSupportDeferEncode); return mCurrentAllocator->compute(); } @@ -711,9 +687,8 @@ static void _execute(id encoder, const MetalBackend::C void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst) const { flushEncoder(); auto ctx = (__bridge MNNMetalContext *)context(); - if(!mFrameEncodeCache) { - commit_net(); - } + commit_net(); + _resetDynamicMemory(); onCopyBuffer(src, dst, nil, nil); } @@ -789,9 +764,8 @@ static void _execute(id encoder, const MetalBackend::C int MetalBackend::onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) { flushEncoder(); auto ctx = (__bridge MNNMetalContext *)context(); - if(!mOpEncoderSet) { - commit_net(); - } + commit_net(); + if (toCpu) { wait(); } diff --git a/source/backend/metal/MetalConvolution1x1.mm b/source/backend/metal/MetalConvolution1x1.mm index 33a3eb19d..35e65118d 100644 --- a/source/backend/metal/MetalConvolution1x1.mm +++ b/source/backend/metal/MetalConvolution1x1.mm @@ -87,8 +87,16 @@ std::string name = "conv1x1_g1z4_w8"; mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w8" fp16:backend->useFp16InsteadFp32()]; if (mDequantBits == 4) { - mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w4" fp16:backend->useFp16InsteadFp32()]; - name = "conv1x1_g1z4_w4"; + if(context.isSimdGroupAvailable && ob * ow * oh == 1) { + mPipeline = [context pipelineWithName:@"conv1x1_g1z4_m1w4" fp16:backend->useFp16InsteadFp32()]; + name = "conv1x1_g1z4_m1w4"; + mThreads = std::make_pair(MTLSizeMake(UP_DIV(oc, 8), 1, 1), MTLSizeMake(8, 8, 1)); + + return NO_ERROR; + } else { + mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w4" fp16:backend->useFp16InsteadFp32()]; + name = "conv1x1_g1z4_w4"; + } } NSArray *arr = [NSArray 
arrayWithObjects:(id)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(), (id)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(), diff --git a/source/backend/metal/MetalExecution.mm b/source/backend/metal/MetalExecution.mm index 35de88d24..75a20cbfc 100644 --- a/source/backend/metal/MetalExecution.mm +++ b/source/backend/metal/MetalExecution.mm @@ -18,10 +18,6 @@ ErrorCode MetalExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { auto backend = static_cast(this->backend()); - if(backend->isCommandEncoderSet()) { - return NO_ERROR; - } - auto func = [=](){ auto encoder = backend->encoder_for_net(); this->onEncode(inputs, outputs, encoder); @@ -31,7 +27,6 @@ } }; func(); - backend->addOpEncoder(func); return NO_ERROR; } diff --git a/source/backend/metal/MetalGridSample.mm b/source/backend/metal/MetalGridSample.mm index ed66b6748..22213c0a3 100644 --- a/source/backend/metal/MetalGridSample.mm +++ b/source/backend/metal/MetalGridSample.mm @@ -26,7 +26,7 @@ #endif struct grid_sample_params { - int batches; + int batch; int channels; int inH; int inW; @@ -179,7 +179,7 @@ kernel void main0(const device T *input [[buffer(0)]], device T *output [[buffer(2)]], constant grid_sample_params &p [[buffer(3)]], uint3 gid [[thread_position_in_grid]]) { - if ((int)gid.x >= p.outW || (int)gid.y >= p.outH * p.outD || (int)gid.z >= p.batches) + if ((int)gid.x >= p.outW || (int)gid.y >= p.outH * p.outD || (int)gid.z >= p.batch) return; int gridPos = gid.z*p.outH*p.outW*CON + gid.y*p.outW*CON + gid.x*CON; @@ -191,8 +191,8 @@ kernel void main0(const device T *input [[buffer(0)]], const int channelC4 = (p.channels + 3) / 4; for (int c = 0; c < channelC4; ++ c) { - auto outputPos = gid.z*channelC4*p.outH*p.outW + c*p.outH*p.outW + gid.y*p.outW + gid.x; - auto inputPtr = input + gid.z*channelC4*p.inH*p.inW + c*p.inH*p.inW; + auto outputPos = gid.z*p.outD*p.outH*p.outW + c*p.outD*p.outH*p.outW*p.batch + gid.y*p.outW + gid.x; + auto inputPtr = input + gid.z*p.inD*p.inH*p.inW + c*p.inH*p.inW*p.inD*p.batch; #if GRID3D output[outputPos] = interpolate(z, y, x, inputPtr, p.inD, p.inH, p.inW, p.mode, p.paddingMode); #else diff --git a/source/backend/metal/MetalLayerNorm.mm b/source/backend/metal/MetalLayerNorm.mm index 917d5fe6a..7eaf586f0 100755 --- a/source/backend/metal/MetalLayerNorm.mm +++ b/source/backend/metal/MetalLayerNorm.mm @@ -76,6 +76,7 @@ ((int *)mShapeBuffer.contents)[3] = (int)has_gamma_beta_; + bool parallel = (mInside > 32) && ((mInside & 3) == 0); if(RMSNorm){ mPipeline = [context pipelineWithName:parallel ? @"layernorm_x4_rms" : @"layernorm_x1_rms" fp16:backend->useFp16InsteadFp32()]; @@ -85,10 +86,17 @@ auto inside = parallel ? 
mInside/4 : mInside; mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger)inside, (NSUInteger)mOutside, 1)]; + if(context.isSimdGroupAvailable) { + if(mOutside == 1 && RMSNorm && parallel) { + mPipeline = [context pipelineWithName:@"layernorm_m1x4_rms" fp16:backend->useFp16InsteadFp32()]; + mThreads = std::make_pair(MTLSizeMake((NSUInteger)UP_DIV(inside, 4) * mOutside, 1, 1), MTLSizeMake(128, 1, 1)); + } + } return NO_ERROR; } void MetalLayerNorm::onEncode(const std::vector &inputs, const std::vector &outputs, id encoder) { + auto backend = static_cast(this->backend()); auto context = (__bridge MNNMetalContext *)backend->context(); auto input = inputs[0], output = outputs[0]; diff --git a/source/backend/metal/MetalLoop.mm b/source/backend/metal/MetalLoop.mm index 8f51e8622..85010045d 100644 --- a/source/backend/metal/MetalLoop.mm +++ b/source/backend/metal/MetalLoop.mm @@ -550,6 +550,7 @@ virtual ErrorCode onResize(const std::vector& inputs, const std::vecto } virtual void onEncode(const std::vector& inputs, const std::vector& outputs, id encoder) override { + auto cmd = mLoop->commands()->GetAs(0); auto dstTensor = mTensors[cmd->indexes()->data()[0]]; auto srcTensor = mTensors[cmd->indexes()->data()[1]]; diff --git a/source/backend/metal/MetalRaster.hpp b/source/backend/metal/MetalRaster.hpp index 0d64e0840..23e7e47f3 100644 --- a/source/backend/metal/MetalRaster.hpp +++ b/source/backend/metal/MetalRaster.hpp @@ -28,13 +28,10 @@ class MetalRaster : public MetalExecution { MTLSize global; }; private: - std::map> mTempInput; std::map mTempInputCopy; - std::shared_ptr mTempOutput; bool mNeedZero = false; Tensor* mOutputPtr = nullptr; - id mBlitPipeline; - std::vector> mShapeTemp; + std::vector> mBlitPipeline; id mZeroCopy = nil; id mZeroPipeline; }; diff --git a/source/backend/metal/MetalRaster.mm b/source/backend/metal/MetalRaster.mm index 8383b10e6..788f13087 100644 --- a/source/backend/metal/MetalRaster.mm +++ b/source/backend/metal/MetalRaster.mm @@ -34,6 +34,31 @@ static void writeSamplerInfo(SamplerInfo& info, const Tensor::InsideDescribe::Re info.stride[3] = sampler.src.offset; info.extent[3] = sampler.dst.offset; } + +static std::string getUnitName(int bytes) { + std::string unitName; + switch (bytes) { + case 1: + unitName = "uchar"; + break; + case 2: + unitName = "short"; + break; + case 4: + unitName = "int"; + break; + case 8: + unitName = "short4"; + break; + case 16: + unitName = "int4"; + break; + default: + FUNC_PRINT(bytes); + break; + } + return unitName; +} static const char* gMultiBlitMetal = R"metal( #include @@ -85,6 +110,125 @@ kernel void main0(const device T *in [[buffer(0)]], } )metal"; +static const char* gMultiRasterTemplate = R"metal( +#include +#include +using namespace metal; +struct SamplerInfo { + uint4 stride;//stride[3] + offset + uint4 size;//size[3] + totalSize + uint4 extent;//dstStride[3]+dstOffset +}; +kernel void main0(const device T *in [[buffer(0)]], + device T *out [[buffer(1)]], + const device uint4* buf [[buffer(2)]], + uint3 tgid [[thread_position_in_grid]]) { + + uint4 limit = buf[2]; + const device SamplerInfo* infoP = (const device SamplerInfo*)(buf + 3); + uint3 gid = tgid; + gid.x = tgid.x % limit.x; + uint n = tgid.x / limit.x; + if (n < limit.y) { + SamplerInfo info = infoP[n]; + + if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) { + uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w; + uint srcOffset = gid.x * info.stride.x + 
gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w; + #ifdef INPUT_FORMAT_NCHW + int srcOffsetReal = srcOffset; + #elif INPUT_FORMAT_NHWC + int srcOffsetReal = srcOffset; + #elif INPUT_FORMAT_C4NHW4 + uint4 src_shape = buf[0];//src nchw + int src_batch = src_shape.x; + int src_channel = src_shape.y; + int src_height = src_shape.z; + int src_width = src_shape.w; + int in_w = srcOffset % src_width; srcOffset /= src_width; + int in_h = srcOffset % src_height; srcOffset /= src_height; + int in_c = srcOffset % src_channel; + int in_b = srcOffset / src_channel; + int srcOffsetReal = (((in_b + (in_c / 4) * src_batch) * src_height + in_h) * src_width + in_w) * 4 + (in_c % 4); + #endif + + #ifdef OUTPUT_FORMAT_NCHW + int dstOffsetReal = dstOffset; + #elif OUTPUT_FORMAT_NHWC + int dstOffsetReal = dstOffset; + #elif OUTPUT_FORMAT_C4NHW4 + uint4 dst_shape = buf[1];//dst nchw + int dst_batch = dst_shape.x; + int dst_channel = dst_shape.y; + int dst_height = dst_shape.z; + int dst_width = dst_shape.w; + int out_w = dstOffset % dst_width; dstOffset /= dst_width; + int out_h = dstOffset % dst_height; dstOffset /= dst_height; + int out_c = dstOffset % dst_channel; + int out_b = dstOffset / dst_channel; + int dstOffsetReal = (((out_b + (out_c / 4) * dst_batch) * dst_height + out_h) * dst_width + out_w) * 4 + (out_c % 4); + #endif + out[dstOffsetReal] = in[srcOffsetReal]; + } + } +} +)metal"; + +static const char* gSingleRasterTemplate = R"metal( +#include +#include +using namespace metal; +struct SamplerInfo { + uint4 stride;//stride[3] + offset + uint4 size;//size[3] + totalSize + uint4 extent;//dstStride[3]+dstOffset +}; +kernel void main0(const device T *in [[buffer(0)]], + device T *out [[buffer(1)]], + const device uint4* buf [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + SamplerInfo info = *((const device SamplerInfo*)(buf + 3)); + if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) { + uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w; + uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w; + #ifdef INPUT_FORMAT_NCHW + int srcOffsetReal = srcOffset; + #elif INPUT_FORMAT_NHWC + int srcOffsetReal = srcOffset; + #elif INPUT_FORMAT_C4NHW4 + uint4 src_shape = buf[0];//src nchw + int src_batch = src_shape.x; + int src_channel = src_shape.y; + int src_height = src_shape.z; + int src_width = src_shape.w; + int in_w = srcOffset % src_width; srcOffset /= src_width; + int in_h = srcOffset % src_height; srcOffset /= src_height; + int in_c = srcOffset % src_channel; + int in_b = srcOffset / src_channel; + int srcOffsetReal = (((in_b + (in_c / 4) * src_batch) * src_height + in_h) * src_width + in_w) * 4 + (in_c % 4); + #endif + + #ifdef OUTPUT_FORMAT_NCHW + int dstOffsetReal = dstOffset; + #elif OUTPUT_FORMAT_NHWC + int dstOffsetReal = dstOffset; + #elif OUTPUT_FORMAT_C4NHW4 + uint4 dst_shape = buf[1];//dst nchw + int dst_batch = dst_shape.x; + int dst_channel = dst_shape.y; + int dst_height = dst_shape.z; + int dst_width = dst_shape.w; + int out_w = dstOffset % dst_width; dstOffset /= dst_width; + int out_h = dstOffset % dst_height; dstOffset /= dst_height; + int out_c = dstOffset % dst_channel; + int out_b = dstOffset / dst_channel; + int dstOffsetReal = (((out_b + (out_c / 4) * dst_batch) * dst_height + out_h) * dst_width + out_w) * 4 + (out_c % 4); + #endif + out[dstOffsetReal] = in[srcOffsetReal]; + } +} +)metal"; + static const char* gFillInt4 = R"metal( #include 
#include @@ -105,32 +249,13 @@ kernel void main0(device int4 *out [[buffer(0)]], id MetalRaster::getBlitPipeline(int bytes, Backend* backend, bool multiRegion) { auto mtbn = static_cast(backend); std::string pipelineName; - std::string unitName; + std::string unitName = getUnitName(bytes); if (multiRegion) { pipelineName = "blit_multi"; } else { pipelineName = "blit"; } - switch (bytes) { - case 1: - unitName = "uchar"; - break; - case 2: - unitName = "short"; - break; - case 4: - unitName = "int"; - break; - case 8: - unitName = "short4"; - break; - case 16: - unitName = "int4"; - break; - default: - FUNC_PRINT(bytes); - break; - } + std::vector keys = { unitName, pipelineName @@ -159,9 +284,6 @@ kernel void main0(device int4 *out [[buffer(0)]], if (nil != mZeroCopy) { mtbn->returnConstBuffer(mZeroCopy); } - for (auto b : mShapeTemp) { - mtbn->returnConstBuffer(b); - } } struct MemsetInfo { int value[4]; @@ -197,9 +319,8 @@ kernel void main0(device int4 *out [[buffer(0)]], mZeroCopy = mtbn->getConstBuffer(sizeof(MemsetInfo)); } } - mTempInput.clear(); + mTempInputCopy.clear(); - mTempOutput = nullptr; mOutputPtr = output; #ifndef MNN_METAL_FORBID_RASTER_C4 if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) { @@ -216,7 +337,8 @@ kernel void main0(device int4 *out [[buffer(0)]], } } if (fast) { - mBlitPipeline = getBlitPipeline(bytes * 4, backend(), true); + mBlitPipeline.resize(1); + mBlitPipeline[0] = getBlitPipeline(bytes * 4, backend(), true); std::map> collectForTensor; for (int i=0; i< des->regions.size(); ++i) { auto& slice = des->regions[i]; @@ -249,7 +371,7 @@ kernel void main0(device int4 *out [[buffer(0)]], } ((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[0] = maxSize[0]; ((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[1] = iter.second.size(); - auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])]; + auto local = [context computeBestGroupAndLocal:mBlitPipeline[0] threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])]; blit.global = local.first; blit.local = local.second; mTempInputCopy.insert(std::make_pair(iter.first, blit)); @@ -258,57 +380,14 @@ kernel void main0(device int4 *out [[buffer(0)]], } } #endif - for (int i=0; i< des->regions.size(); ++i) { - auto& slice = des->regions[i]; - auto origin = slice.origin; - if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) { - continue; - } - if (mTempInput.find(origin)!=mTempInput.end()) { - continue; - } - std::shared_ptr newTensor(new Tensor); - TensorUtils::copyShape(origin, newTensor.get()); - TensorUtils::getDescribe(newTensor.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW; - newTensor->buffer().type = origin->getType(); - TensorUtils::setLinearLayout(newTensor.get()); - mTempInput.insert(std::make_pair(origin, newTensor)); - } - if (MNN_DATA_FORMAT_NC4HW4 == outputDes->dimensionFormat) { - mTempOutput.reset(new Tensor); - TensorUtils::setupTensorInfo(output, mTempOutput.get(), MNN_DATA_FORMAT_NCHW); - } - if (nullptr != mTempOutput) { - auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC); - if (!res) { - return OUT_OF_MEMORY; - } - mOutputPtr = mTempOutput.get(); - } - for (auto& iter : mTempInput) { - auto res = backend()->onAcquireBuffer(iter.second.get(), Backend::DYNAMIC); - if (!res) { - return OUT_OF_MEMORY; - } - } - for (auto& iter : mTempInput) { - backend()->onReleaseBuffer(iter.second.get(), Backend::DYNAMIC); - } - if (nullptr 
!= mTempOutput) { - backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC); - } - mBlitPipeline = getBlitPipeline(bytes, backend(), true); + std::map> collectForTensor; for (int i=0; i< des->regions.size(); ++i) { auto& slice = des->regions[i]; if (nullptr == slice.origin) { continue; } - auto iter = mTempInput.find(slice.origin); Tensor* t = slice.origin; - if (iter != mTempInput.end()) { - t = iter->second.get(); - } auto coliter = collectForTensor.find(t); if (coliter == collectForTensor.end()) { collectForTensor.insert(std::make_pair(t, std::vector{i})); @@ -316,15 +395,64 @@ kernel void main0(device int4 *out [[buffer(0)]], coliter->second.emplace_back(i); } } + + NSString* input_format; + NSString* output_format; + if(outputDes->dimensionFormat == MNN_DATA_FORMAT_NCHW) { + output_format = @"OUTPUT_FORMAT_NCHW"; + } else if(outputDes->dimensionFormat == MNN_DATA_FORMAT_NHWC) { + output_format = @"OUTPUT_FORMAT_NHWC"; + } else { + output_format = @"OUTPUT_FORMAT_C4NHW4"; + } + std::string unitName = getUnitName(bytes); + mBlitPipeline.resize(collectForTensor.size()); + int index = 0; for (auto& iter : collectForTensor) { + auto origin = iter.first; + + if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NCHW) { + input_format = @"INPUT_FORMAT_NCHW"; + } else if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NHWC) { + input_format = @"INPUT_FORMAT_NHWC"; + } else { + input_format = @"INPUT_FORMAT_C4NHW4"; + } + std::vector keys = { + std::string([input_format UTF8String]), + std::string([output_format UTF8String]), + unitName, + }; + if(iter.second.size() == 1) { + keys.emplace_back("direct_raster_single"); + } else { + keys.emplace_back("direct_raster_multi"); + } + auto pipeline = mtbn->runtime()->findPipeline(keys); + + if(nullptr == pipeline) { + MTLCompileOptions *options = [[MTLCompileOptions alloc] init]; + options.preprocessorMacros = @{ + input_format : @"1", + output_format : @"1", + @"T" : @(unitName.c_str()), + }; + if(iter.second.size() == 1) { + pipeline = mtbn->makeComputePipelineWithSourceOption(gSingleRasterTemplate, "main0", options); + } else { + pipeline = mtbn->makeComputePipelineWithSourceOption(gMultiRasterTemplate, "main0", options); + } + mtbn->runtime()->insertPipeline(keys, pipeline); + } + mBlitPipeline[index] = pipeline; + BlitInfo blit; - auto memory = bufferAlloc->alloc(sizeof(SamplerInfo) * iter.second.size() + 4 * sizeof(uint32_t)); + auto memory = bufferAlloc->alloc(sizeof(SamplerInfo) * iter.second.size() + 12 * sizeof(uint32_t)); blit.blit = std::make_pair(memory.first, memory.second); auto buffer = ((MetalRuntimeAllocator::MetalBufferAlloc*)memory.first)->getBuffer(); - auto infoP = (SamplerInfo*)((uint8_t*)[buffer contents] + 4 * sizeof(uint32_t) + memory.second); + auto infoP = (SamplerInfo*)((uint8_t*)[buffer contents] + 12 * sizeof(uint32_t) + memory.second); - blit.blit = std::make_pair(memory.first, memory.second); uint32_t maxSize[3] = {1, 1, 1}; for (int v=0; vregions[iter.second[v]]; @@ -333,41 +461,42 @@ kernel void main0(device int4 *out [[buffer(0)]], maxSize[1] = ALIMAX(maxSize[1], slice.size[1]); maxSize[2] = ALIMAX(maxSize[2], slice.size[2]); } - ((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[0] = maxSize[0]; - ((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[1] = iter.second.size(); - auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])]; + + uint32_t* shape = 
(uint32_t*)((uint8_t*)[buffer contents] + memory.second); + int origin_area = 1; + for(int i = 2; i < origin->shape().size(); i++) { + origin_area *= origin->shape()[i]; + } + int output_area = 1; + for(int i = 2; i < output->shape().size(); i++) { + output_area *= output->shape()[i]; + } + shape[0] = ALIMAX(1, origin->shape()[0]); + shape[1] = ALIMAX(1, origin->shape()[1]); + shape[2] = ALIMAX(1, origin_area); + shape[3] = 1; + shape[4] = ALIMAX(1, output->shape()[0]); + shape[5] = ALIMAX(1, output->shape()[1]); + shape[6] = ALIMAX(1, output_area); + shape[7] = 1; + shape[8] = maxSize[0]; + shape[9] = iter.second.size(); + + auto local = [context computeBestGroupAndLocal:mBlitPipeline[index++] threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])]; blit.global = local.first; blit.local = local.second; mTempInputCopy.insert(std::make_pair(iter.first, blit)); } - for (auto b : mShapeTemp) { - mtbn->returnConstBuffer(b); - } - mShapeTemp.clear(); - for (int i = 0; i < mTempInput.size(); ++i) { - id shape = mtbn->getConstBuffer(0); - mShapeTemp.emplace_back(std::move(shape)); - } - if (nullptr != mTempOutput) { - mShapeTemp.emplace_back(mtbn->getConstBuffer(0)); - } return NO_ERROR; } void MetalRaster::onEncode(const std::vector &inputs, const std::vector &outputs, id encoder) { + auto backend = static_cast(this->backend()); auto context = (__bridge MNNMetalContext *)backend->context(); - int out_offset = TensorUtils::getDescribe(outputs[0])->extra.offset; - if (nullptr != mTempOutput) { - out_offset = TensorUtils::getDescribe(mTempOutput.get())->extra.offset; - } + if (mNeedZero) { - size_t sizeInBytes; - if (mTempOutput != nullptr) { - sizeInBytes = backend->getTensorSizeInBytes(mTempOutput.get()); - } else { - sizeInBytes = backend->getTensorSizeInBytes(outputs[0]); - } + size_t sizeInBytes = backend->getTensorSizeInBytes(outputs[0]); size_t size = sizeInBytes / (4 * sizeof(int32_t)); auto ptr = (MemsetInfo*)[mZeroCopy contents]; ptr->size[0] = (uint32_t)size; @@ -376,28 +505,33 @@ kernel void main0(device int4 *out [[buffer(0)]], [encoder setBuffer: mZeroCopy offset:0 atIndex: 1]; [encoder dispatchThreadgroups:MTLSizeMake(UP_DIV(size, 256), 1, 1) threadsPerThreadgroup:MTLSizeMake(256, 1, 1)]; } + + bool singlePipeline = false; int index = 0; - for (auto& iter : mTempInput) { - backend->onCopyBuffer(iter.first, iter.second.get(), encoder, mShapeTemp[index++]); + if(mBlitPipeline.size() == 1) { + singlePipeline = true; + [encoder setComputePipelineState:mBlitPipeline[0]]; + } else { + MNN_ASSERT(mTempInputCopy.size() == mBlitPipeline.size()); } - - [encoder setComputePipelineState:mBlitPipeline]; for (auto& iter : mTempInputCopy) { + if(!singlePipeline) { + [encoder setComputePipelineState:mBlitPipeline[index++]]; + } MetalBackend::setTensor(iter.first, encoder, 0); MetalBackend::setTensor(mOutputPtr, encoder, 1); auto& blit = iter.second; auto buffer = ((MetalRuntimeAllocator::MetalBufferAlloc*)blit.blit.first)->getBuffer(); [encoder setBuffer: buffer offset:blit.blit.second atIndex: 2]; + [encoder dispatchThreadgroups:blit.global threadsPerThreadgroup:blit.local]; } - if (nullptr != mTempOutput) { - backend->onCopyBuffer(mTempOutput.get(), outputs[0], encoder, mShapeTemp[index]); - } } class MetalRasterCreator : public MetalBackend::Creator { public: virtual Execution *onCreate(const std::vector &inputs, const MNN::Op *op, Backend *backend, const std::vector& outputs) const { + return new MetalRaster(backend); } }; diff --git 
a/source/backend/metal/shader/MetalConvolution1x1.metal b/source/backend/metal/shader/MetalConvolution1x1.metal index 21bd0d8d0..80e4d7fb6 100644 --- a/source/backend/metal/shader/MetalConvolution1x1.metal +++ b/source/backend/metal/shader/MetalConvolution1x1.metal @@ -167,6 +167,65 @@ kernel void conv1x1_g1z4_w4(const device ftype4 *in [[buffer(0)]], //if (computeSize > 3) {xy_out[3] = activate(ftype4(result3), cst.activation); } } +kernel void conv1x1_g1z4_m1w4(const device ftype4 *in [[buffer(0)]], + device ftype4 *out [[buffer(1)]], + constant conv1x1_constants& cst [[buffer(2)]], + const device MNN::uchar4x2 *wt [[buffer(3)]], + const device ftype4 *biasTerms [[buffer(4)]], + const device float4 *dequantScale [[buffer(5)]], + uint3 gid[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + int uz = gid.x * 2 + sgitg; + + int rx = gid.y; + auto xy_wt = wt + uz * cst.input_slice; + auto xy_in0 = in + (int)gid.z * cst.input_size + rx + 0; + auto xy_out = out + (int)gid.z * cst.output_size + uz * cst.output_size * cst.batch + rx; + auto biasValue = FLOAT4(biasTerms[uz]); + FLOAT4 result0 = FLOAT4(0); + + int block = (cst.input_slice + cst.block_size - 1) / cst.block_size; + for (int bi=0; bi> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8); + FLOAT4 res = w4 * scale[i] + dequant_bias[i]; + w_dequant[i] = res; + } + + result0 += FLOAT4(in40 * w_dequant); + +// FLOAT4x4 w_dequant; +// for (int i = 0; i < 4; ++i) { +// FLOAT4 w4 = FLOAT4((float)(w_int4[i][0] >> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8); +// FLOAT4 res = w4 * scale[i] + dequant_bias[i]; +// w_dequant[i] = w4; +// } +// +// FLOAT4 temp = FLOAT4(in40 * w_dequant); +// result0 += temp * scale + (in40.x + in40.y + in40.z + in40.w) * dequant_bias; + } + } + FLOAT4 res; + res.x = simd_sum(result0.x); + res.y = simd_sum(result0.y); + res.z = simd_sum(result0.z); + res.w = simd_sum(result0.w); + /* true */ + if (tiisg == 0) { + xy_out[0] = activate(ftype4(res + biasValue), cst.activation); + } +} + kernel void conv1x1_g1z8(const device ftype4 *in [[buffer(0)]], device ftype4 *out [[buffer(1)]], constant conv1x1_constants& cst [[buffer(2)]], diff --git a/source/backend/metal/shader/MetalDefine.metal b/source/backend/metal/shader/MetalDefine.metal index bcf7aa462..bf3f85daf 100644 --- a/source/backend/metal/shader/MetalDefine.metal +++ b/source/backend/metal/shader/MetalDefine.metal @@ -5,6 +5,7 @@ using namespace metal; // Macro // ––––––––––––––––––––––––––––––––––––––––––––––––––– +#define SIMD_GROUP_WIDTH 32 // setting SIMD group size is 32 #define UP_DIV(x, y) ( ((x) + (y) - 1) / (y) ) #define ROUND_UP(x, y) ( ((x) + (y) - 1) / (y) * (y) ) diff --git a/source/backend/metal/shader/MetalLayerNorm.metal b/source/backend/metal/shader/MetalLayerNorm.metal index 626fd9d06..bad927112 100644 --- a/source/backend/metal/shader/MetalLayerNorm.metal +++ b/source/backend/metal/shader/MetalLayerNorm.metal @@ -147,3 +147,46 @@ kernel void layernorm_x4_rms(const device ftype4 *in [[buffer(0)]], out_data[gid.x] = (ftype4)(norm); } } + +kernel void layernorm_m1x4_rms(const device ftype4 *in [[buffer(0)]], + device ftype4 *out [[buffer(1)]], + constant layernorm_constants& cst [[buffer(2)]], + const device float4 *gamma [[buffer(3)]], + const device float4 *beta [[buffer(4)]], + uint gid [[threadgroup_position_in_grid]], + uint 
tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + int total_idx = (gid * 4 + sgitg); + int in_idx = total_idx % (cst.inside/4); + int out_idx = total_idx / (cst.inside/4); + + auto in_data = in + out_idx * cst.inside/4; + auto out_data = out + out_idx * cst.inside/4; + + float square_sum = 0.0f; + + for(int i = tiisg; i < cst.inside/4; i+=SIMD_GROUP_WIDTH) { + ftype4 data = in_data[i]; + float dis = data.x; + square_sum += dis * dis; + dis = data.y; + square_sum += dis * dis; + dis = data.z; + square_sum += dis * dis; + dis = data.w; + square_sum += dis * dis; + } + square_sum = simd_sum(square_sum); + + if(tiisg == 0) { + float var = 1.0 / sqrt(square_sum / cst.inside + cst.eps); + + float4 norm = var * ((float4)in_data[in_idx]); + if(cst.has_gamma_beta) { + out_data[in_idx] = (ftype4)(norm * gamma[in_idx] + beta[in_idx]); + } else { + out_data[in_idx] = (ftype4)(norm); + } + } +} diff --git a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp index d42961c1e..2b45559c4 100644 --- a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp +++ b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp @@ -111,7 +111,7 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const } #endif - if (deviceName.find("QUALCOMM Adreno") != std::string::npos) { + if (deviceName.find("QUALCOMM Adreno") != std::string::npos || deviceName.find("Qualcomm") != std::string::npos) { mGpuType = ADRENO; // if device is QUALCOMM's and version is 2.0 , set spacial optimized param diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp index 8dc9957cf..3e288c1f5 100644 --- a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp +++ b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp @@ -7,7 +7,8 @@ // #include "backend/opencl/core/runtime/OpenCLWrapper.hpp" -#ifdef WIN32 +#ifdef _WIN32 +#include #include #else #include @@ -94,7 +95,7 @@ bool OpenCLSymbols::LoadOpenCLLibrary() { bool OpenCLSymbols::UnLoadOpenCLLibrary() { if (handle_ != nullptr) { -#if defined(WIN32) +#if defined(_WIN32) if (FreeLibrary(handle_) == 0) { #else if (dlclose(handle_) != 0) { @@ -129,7 +130,7 @@ bool OpenCLSymbols::isGlError() { bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) { -#if defined(WIN32) +#if defined(_WIN32) handle_ = LoadLibraryA(library_path.c_str()); if (handle_ == nullptr) { return false; diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.hpp b/source/backend/opencl/core/runtime/OpenCLWrapper.hpp index 561ccde8c..ba39a8c30 100644 --- a/source/backend/opencl/core/runtime/OpenCLWrapper.hpp +++ b/source/backend/opencl/core/runtime/OpenCLWrapper.hpp @@ -9,7 +9,7 @@ #ifndef OpenCLWrapper_hpp #define OpenCLWrapper_hpp -#if defined(WIN32) +#if defined(_WIN32) #include #undef min #undef max @@ -248,7 +248,7 @@ class OpenCLSymbols { private: bool LoadLibraryFromPath(const std::string &path); -#if defined(WIN32) +#if defined(_WIN32) HMODULE handle_ = nullptr; #else void *handle_ = nullptr; diff --git a/source/backend/opencl/execution/buffer/ConvBufExecution.cpp b/source/backend/opencl/execution/buffer/ConvBufExecution.cpp index 8ba800b26..185a25294 100644 --- a/source/backend/opencl/execution/buffer/ConvBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/ConvBufExecution.cpp @@ -324,8 +324,6 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const 
mOpenCLBackend->onAcquireBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); mConvGemmOutTensor.reset(Tensor::createDevice({alignN * alignM})); mOpenCLBackend->onAcquireBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); { std::set buildOptions; @@ -399,6 +397,8 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const mOpenCLBackend->endRecord(mRecording); } + mOpenCLBackend->onReleaseBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); return NO_ERROR; } else if (mResource->mConv1x1Opt) { diff --git a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp index d31462301..3c2a02b9d 100644 --- a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp +++ b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp @@ -398,9 +398,6 @@ void ConvBufLowMemoryExecution::useFPWeightGemmLowMemory(Tensor * input, Tensor mOpenCLBackend->onAcquireBuffer(mConvGemmWeightTensor.get(), Backend::DYNAMIC); mOpenCLBackend->onAcquireBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); mOpenCLBackend->onAcquireBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mConvGemmWeightTensor.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); //weight inverse quantization and rearrange { @@ -508,6 +505,9 @@ void ConvBufLowMemoryExecution::useFPWeightGemmLowMemory(Tensor * input, Tensor unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; } + mOpenCLBackend->onReleaseBuffer(mConvGemmWeightTensor.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); return; } diff --git a/source/backend/opencl/execution/buffer/GridSampleBufExecution.cpp b/source/backend/opencl/execution/buffer/GridSampleBufExecution.cpp index 50b7fd25e..1c11e60e9 100644 --- a/source/backend/opencl/execution/buffer/GridSampleBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/GridSampleBufExecution.cpp @@ -14,78 +14,125 @@ namespace MNN { namespace OpenCL { GridSampleBufExecution::GridSampleBufExecution(const std::vector &inputs, const MNN::Op *op, Backend *backend) : CommonExecution(backend, op) { - mUnits.resize(1); - auto &unit = mUnits[0]; + mOpenCLBackend = static_cast(backend); + mMode = op->main_as_GridSample()->mode(); mPaddingMode = op->main_as_GridSample()->paddingMode(); if (op->main_as_GridSample()->alignCorners()) { mAlignCorners = 1; - } - else { + }else { mAlignCorners = 0; } - - mOpenCLBackend = static_cast(backend); - auto runtime = mOpenCLBackend->getOpenCLRuntime(); - auto gridSampleParam = op->main_as_GridSample(); - - std::set buildOptions; - if (op->main_as_GridSample()->mode() == 0) { - mKernelName = "bilinear_buf"; - unit.kernel = runtime->buildKernel("grid_sample_buf", mKernelName, buildOptions); - } - else { - mKernelName = "nearest_buf"; - unit.kernel = runtime->buildKernel("grid_sample_buf", mKernelName, buildOptions); - } - mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); } 
ErrorCode GridSampleBufExecution::onEncode(const std::vector &inputs, const std::vector &outputs) { - auto &unit = mUnits[0]; + auto runtime = mOpenCLBackend->getOpenCLRuntime(); auto inputTensor = inputs[0]; auto gridTensor = inputs[1]; auto outputTensor = outputs[0]; - auto runtime = ((OpenCLBackend *)backend())->getOpenCLRuntime(); - - const int batches = inputTensor->buffer().dim[0].extent; - const int channels = inputTensor->buffer().dim[1].extent; - const int inH = inputTensor->buffer().dim[2].extent; - const int inW = inputTensor->buffer().dim[3].extent; - const int channelC4 = UP_DIV(channels, 4); - - const int outH = outputTensor->buffer().dim[2].extent; - const int outW = outputTensor->buffer().dim[3].extent; - - mGlobalWorkSize = { - static_cast(channelC4), - static_cast(outW), - static_cast(outH * batches) - }; - - MNN_ASSERT(outW > 0 && outH > 0); - - uint32_t idx = 0; - cl_int ret = CL_SUCCESS; - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(inputTensor)); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(gridTensor)); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(outputTensor)); - ret |= unit.kernel->get().setArg(idx++, static_cast(inH)); - ret |= unit.kernel->get().setArg(idx++, static_cast(inW)); - ret |= unit.kernel->get().setArg(idx++, static_cast(outH)); - ret |= unit.kernel->get().setArg(idx++, static_cast(outW)); - ret |= unit.kernel->get().setArg(idx++, static_cast(batches)); - ret |= unit.kernel->get().setArg(idx++, mPaddingMode); - ret |= unit.kernel->get().setArg(idx++, mAlignCorners); - MNN_CHECK_CL_SUCCESS(ret, "setArg GridSampleBufExecution"); - - mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runtime, mKernelName, unit.kernel).first; - - mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); - unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; - unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + if(outputs[0]->dimensions() > 4){ + mUnits.resize(1); + const int batches = inputTensor->buffer().dim[0].extent; + const int channels = inputTensor->buffer().dim[1].extent; + const int inD = inputTensor->buffer().dim[2].extent; + const int inH = inputTensor->buffer().dim[3].extent; + const int inW = inputTensor->buffer().dim[4].extent; + const int channelC4 = UP_DIV(channels, 4); + const int outD = outputTensor->buffer().dim[2].extent; + const int outH = outputTensor->buffer().dim[3].extent; + const int outW = outputTensor->buffer().dim[4].extent; + + auto &unit = mUnits[0]; + std::set buildOptions; + if (mMode == SampleMode_BILINEAR) { + mKernelName = "bilinear5d_buf"; + unit.kernel = runtime->buildKernel("grid_sample_buf", mKernelName, buildOptions); + } else { + mKernelName = "nearest5d_buf"; + unit.kernel = runtime->buildKernel("grid_sample_buf", mKernelName, buildOptions); + } + mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); + mGlobalWorkSize = { + static_cast(channelC4 * outD), + static_cast(outW), + static_cast(outH * batches) + }; + MNN_ASSERT(outW > 0 && outH > 0); + + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); + ret |= 
unit.kernel->get().setArg(idx++, openCLBuffer(inputTensor)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(gridTensor)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(outputTensor)); + ret |= unit.kernel->get().setArg(idx++, static_cast(inH)); + ret |= unit.kernel->get().setArg(idx++, static_cast(inW)); + ret |= unit.kernel->get().setArg(idx++, static_cast(inD)); + ret |= unit.kernel->get().setArg(idx++, static_cast(outH)); + ret |= unit.kernel->get().setArg(idx++, static_cast(outW)); + ret |= unit.kernel->get().setArg(idx++, static_cast(outD)); + ret |= unit.kernel->get().setArg(idx++, static_cast(batches)); + ret |= unit.kernel->get().setArg(idx++, mPaddingMode); + ret |= unit.kernel->get().setArg(idx++, mAlignCorners); + MNN_CHECK_CL_SUCCESS(ret, "setArg GridSampleBufExecution"); + + mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runtime, mKernelName, unit.kernel).first; + + mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; + unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + }else{ + mUnits.resize(1); + auto &unit = mUnits[0]; + const int batches = inputTensor->buffer().dim[0].extent; + const int channels = inputTensor->buffer().dim[1].extent; + const int inH = inputTensor->buffer().dim[2].extent; + const int inW = inputTensor->buffer().dim[3].extent; + const int channelC4 = UP_DIV(channels, 4); + const int outH = outputTensor->buffer().dim[2].extent; + const int outW = outputTensor->buffer().dim[3].extent; + + std::set buildOptions; + if (mMode == 0) { + mKernelName = "bilinear_buf"; + unit.kernel = runtime->buildKernel("grid_sample_buf", mKernelName, buildOptions); + } + else { + mKernelName = "nearest_buf"; + unit.kernel = runtime->buildKernel("grid_sample_buf", mKernelName, buildOptions); + } + mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); + mGlobalWorkSize = { + static_cast(channelC4), + static_cast(outW), + static_cast(outH * batches) + }; + MNN_ASSERT(outW > 0 && outH > 0); + + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(inputTensor)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(gridTensor)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(outputTensor)); + ret |= unit.kernel->get().setArg(idx++, static_cast(inH)); + ret |= unit.kernel->get().setArg(idx++, static_cast(inW)); + ret |= unit.kernel->get().setArg(idx++, static_cast(outH)); + ret |= unit.kernel->get().setArg(idx++, static_cast(outW)); + ret |= unit.kernel->get().setArg(idx++, static_cast(batches)); + ret |= unit.kernel->get().setArg(idx++, mPaddingMode); + ret |= unit.kernel->get().setArg(idx++, mAlignCorners); + MNN_CHECK_CL_SUCCESS(ret, "setArg GridSampleBufExecution"); + + mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runtime, mKernelName, unit.kernel).first; + + mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; + unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + } return NO_ERROR; } diff --git a/source/backend/opencl/execution/cl/grid_sample.cl 
b/source/backend/opencl/execution/cl/grid_sample.cl index 6a41f0050..67f1a7f94 100644 --- a/source/backend/opencl/execution/cl/grid_sample.cl +++ b/source/backend/opencl/execution/cl/grid_sample.cl @@ -56,6 +56,27 @@ FLOAT4 sample(int h, int w, return RI_F(tmp, SAMPLER, (int2)(w_offset_base + w, h_offset_base + h)); } + +FLOAT4 sample3d(int d, int h, int w, + const int x_offset_base, + const int y_offset_base, + __read_only image2d_t tmp, + int depth, int height, int width, + enum BorderMode paddingMode){ + + if (d < 0 || d >= depth || h < 0 || h >= height || w < 0 || w >= width) { + if(paddingMode == BorderMode_ZEROS) + { + return 0.0f; + } + d = CLAMP(d, 0, depth - 1); + h = CLAMP(h, 0, height - 1); + w = CLAMP(w, 0, width - 1); + } + return RI_F(tmp, SAMPLER, (int2)(x_offset_base + h * width + w, y_offset_base + d)); +} + + __kernel void nearest(GLOBAL_SIZE_3_DIMS __read_only image2d_t input, __read_only image2d_t grid, __write_only image2d_t output, @@ -176,3 +197,148 @@ __kernel void bilinear(GLOBAL_SIZE_3_DIMS __read_only image2d_t input, const int output_h_offset = mad24(output_batch_idx, output_height, output_height_idx); WI_F(output, (int2)(output_w_offset, output_h_offset), value); } + +__kernel void nearest5d(GLOBAL_SIZE_3_DIMS + __read_only image2d_t input, + __read_only image2d_t grid, + __write_only image2d_t output, + __private const int input_height, + __private const int input_width, + __private const int input_depth, + __private const int output_height, + __private const int output_width, + __private const int output_depth, + __private const int batch, + __private const enum BorderMode paddingMode, + __private const int alignCorners){ + + const int output_channel_depth_idx = get_global_id(0); + const int output_width_block_idx = get_global_id(1); + const int output_batch_height_block_idx = get_global_id(2); + + DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx, output_width_block_idx, output_batch_height_block_idx); + + const int output_channel_idx = output_channel_depth_idx / output_depth; + const int output_depth_idx = output_channel_depth_idx % output_depth; + const int output_batch_idx = output_batch_height_block_idx / output_height; + const int output_height_idx = output_batch_height_block_idx % output_height; + + // get grid idx + const int grid_w_offset = (output_depth_idx / 4) * output_width * 3 + output_width_block_idx * 3; + const int grid_h_offset = mad24(output_batch_idx, output_height, output_height_idx); + + FLOAT4 grid_x = RI_F(grid, SAMPLER, (int2)(grid_w_offset, grid_h_offset)); + FLOAT4 grid_y = RI_F(grid, SAMPLER, (int2)(grid_w_offset + 1, grid_h_offset)); + FLOAT4 grid_z = RI_F(grid, SAMPLER, (int2)(grid_w_offset + 2, grid_h_offset)); + + const float arr[12] = {grid_x.x, grid_y.x, grid_z.x, grid_x.y, grid_y.y, grid_z.y, grid_x.z, grid_y.z, grid_z.z, grid_x.w, grid_y.w, grid_z.w}; + + // get grid x,y + const int arr_offset = output_depth_idx % 4; + const float x = arr[3 * arr_offset]; + const float y = arr[3 * arr_offset + 1]; + const float z = arr[3 * arr_offset + 2]; + + float in_grid_x = getPosition(x, input_width, alignCorners); + float in_grid_y = getPosition(y, input_height, alignCorners); + float in_grid_z = getPosition(z, input_depth, alignCorners); + + // get nearest point + int nw = floor(in_grid_x + 0.5f); + int nh = floor(in_grid_y + 0.5f); + int nd = floor(in_grid_z + 0.5f); + + const int inp_w_offset = mul24(output_channel_idx, input_width * input_height); + const int inp_h_offset = mul24(output_batch_idx, input_depth); + FLOAT4 value = 
sample3d(nd, nh, nw, inp_w_offset, inp_h_offset, input, input_depth, input_height, input_width, paddingMode); + + const int output_w_offset = output_channel_idx * output_width * output_height + output_height_idx * output_width + output_width_block_idx; + const int output_h_offset = mad24(output_batch_idx, output_depth, output_depth_idx); + WI_F(output, (int2)(output_w_offset, output_h_offset), value); +} + +__kernel void bilinear5d(GLOBAL_SIZE_3_DIMS + __read_only image2d_t input, + __read_only image2d_t grid, + __write_only image2d_t output, + __private const int input_height, + __private const int input_width, + __private const int input_depth, + __private const int output_height, + __private const int output_width, + __private const int output_depth, + __private const int batch, + __private const enum BorderMode paddingMode, + __private const int alignCorners){ + + const int output_channel_depth_idx = get_global_id(0); + const int output_width_block_idx = get_global_id(1); + const int output_batch_height_block_idx = get_global_id(2); + + DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx, output_width_block_idx, output_batch_height_block_idx); + + const int output_channel_idx = output_channel_depth_idx / output_depth; + const int output_depth_idx = output_channel_depth_idx % output_depth; + const int output_batch_idx = output_batch_height_block_idx / output_height; + const int output_height_idx = output_batch_height_block_idx % output_height; + + // get grid idx + const int grid_w_offset = (output_depth_idx / 4) * output_width * 3 + output_width_block_idx * 3; + const int grid_h_offset = mad24(output_batch_idx, output_height, output_height_idx); + + FLOAT4 grid_x = RI_F(grid, SAMPLER, (int2)(grid_w_offset, grid_h_offset)); + FLOAT4 grid_y = RI_F(grid, SAMPLER, (int2)(grid_w_offset + 1, grid_h_offset)); + FLOAT4 grid_z = RI_F(grid, SAMPLER, (int2)(grid_w_offset + 2, grid_h_offset)); + + const float arr[12] = {grid_x.x, grid_y.x, grid_z.x, grid_x.y, grid_y.y, grid_z.y, grid_x.z, grid_y.z, grid_z.z, grid_x.w, grid_y.w, grid_z.w}; + + // get grid x,y + const int arr_offset = output_depth_idx % 4; + const float x = arr[3 * arr_offset]; + const float y = arr[3 * arr_offset + 1]; + const float z = arr[3 * arr_offset + 2]; + + float in_grid_x = getPosition(x, input_width, alignCorners); + float in_grid_y = getPosition(y, input_height, alignCorners); + float in_grid_z = getPosition(z, input_depth, alignCorners); + + int in_d0 = floor(in_grid_z); + int in_h0 = floor(in_grid_y); + int in_w0 = floor(in_grid_x); + int in_d1 = ceil(in_grid_z); + int in_h1 = ceil(in_grid_y); + int in_w1 = ceil(in_grid_x); + + float x_weight0 = in_grid_x - in_w0; + float x_weight1 = 1 - x_weight0; + float y_weight0 = in_grid_y - in_h0; + float y_weight1 = 1 - y_weight0; + float z_weight0 = in_grid_z - in_d0; + float z_weight1 = 1 - z_weight0; + + // bilinear interpolation + const int inp_x_offset = mul24(output_channel_idx, input_width * input_height); + const int inp_y_offset = mul24(output_batch_idx, input_depth); + FLOAT4 i000 = sample3d(in_d0, in_h0, in_w0, inp_x_offset, inp_y_offset, input, input_depth, input_height, input_width, paddingMode); + FLOAT4 i001 = sample3d(in_d0, in_h0, in_w1, inp_x_offset, inp_y_offset, input, input_depth, input_height, input_width, paddingMode); + FLOAT4 i010 = sample3d(in_d0, in_h1, in_w0, inp_x_offset, inp_y_offset, input, input_depth, input_height, input_width, paddingMode); + FLOAT4 i011 = sample3d(in_d0, in_h1, in_w1, inp_x_offset, inp_y_offset, input, input_depth, input_height, 
input_width, paddingMode); + FLOAT4 i100 = sample3d(in_d1, in_h0, in_w0, inp_x_offset, inp_y_offset, input, input_depth, input_height, input_width, paddingMode); + FLOAT4 i101 = sample3d(in_d1, in_h0, in_w1, inp_x_offset, inp_y_offset, input, input_depth, input_height, input_width, paddingMode); + FLOAT4 i110 = sample3d(in_d1, in_h1, in_w0, inp_x_offset, inp_y_offset, input, input_depth, input_height, input_width, paddingMode); + FLOAT4 i111 = sample3d(in_d1, in_h1, in_w1, inp_x_offset, inp_y_offset, input, input_depth, input_height, input_width, paddingMode); + + + FLOAT4 i00 = (FLOAT4)(x_weight1) * i000 + (FLOAT4)(x_weight0) * i001; + FLOAT4 i01 = (FLOAT4)(x_weight1) * i010 + (FLOAT4)(x_weight0) * i011; + FLOAT4 i10 = (FLOAT4)(x_weight1) * i100 + (FLOAT4)(x_weight0) * i101; + FLOAT4 i11 = (FLOAT4)(x_weight1) * i110 + (FLOAT4)(x_weight0) * i111; + + FLOAT4 i0 = (FLOAT4)(y_weight1) * i00 + (FLOAT4)(y_weight0) * i01; + FLOAT4 i1 = (FLOAT4)(y_weight1) * i10 + (FLOAT4)(y_weight0) * i11; + FLOAT4 interp = (FLOAT4)(z_weight1) * i0 + (FLOAT4)(z_weight0) * i1; + const int output_w_offset = output_channel_idx * output_width * output_height + output_height_idx * output_width + output_width_block_idx; + const int output_h_offset = mad24(output_batch_idx, output_depth, output_depth_idx); + + WI_F(output, (int2)(output_w_offset, output_h_offset), interp); +} diff --git a/source/backend/opencl/execution/cl/grid_sample_buf.cl b/source/backend/opencl/execution/cl/grid_sample_buf.cl index 758cb2295..42ada041f 100644 --- a/source/backend/opencl/execution/cl/grid_sample_buf.cl +++ b/source/backend/opencl/execution/cl/grid_sample_buf.cl @@ -54,6 +54,25 @@ COMPUTE_FLOAT4 sample(int h, int w, return CONVERT_COMPUTE_FLOAT4(vload4(offset, buffer)); } +COMPUTE_FLOAT4 sample3d(int d, int h, int w, + const int offset_base, + __global const FLOAT *buffer, + int depth, int height, int width, + enum BorderMode paddingMode){ + + if (d < 0 || d >= depth || h < 0 || h >= height || w < 0 || w >= width) { + if(paddingMode == BorderMode_ZEROS) + { + return 0.0f; + } + d = CLAMP(d, 0, depth - 1); + h = CLAMP(h, 0, height - 1); + w = CLAMP(w, 0, width - 1); + } + int offset = ((offset_base + d) * height + h) * width + w; + return CONVERT_COMPUTE_FLOAT4(vload4(offset, buffer)); +} + __kernel void nearest_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, __global const FLOAT* grid, __global FLOAT* output, @@ -165,3 +184,126 @@ __kernel void bilinear_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, const int output_offset = ((output_batch_idx + output_channel_block_idx * batch) * output_height + output_height_idx) * output_width + output_width_block_idx; vstore4(CONVERT_FLOAT4(value), output_offset, output); } +__kernel void nearest5d_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, + __global const FLOAT* grid, + __global FLOAT* output, + __private const int input_height, + __private const int input_width, + __private const int input_depth, + __private const int output_height, + __private const int output_width, + __private const int output_depth, + __private const int batch, + __private const enum BorderMode paddingMode, + __private const int alignCorners){ + + const int output_channel_depth_idx = get_global_id(0); + const int output_width_block_idx = get_global_id(1); + const int output_batch_height_block_idx = get_global_id(2); + + DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx, output_width_block_idx, output_batch_height_block_idx); + + const int output_channel_idx = output_channel_depth_idx / output_depth; + const 
int output_depth_idx = output_channel_depth_idx % output_depth; + const int output_batch_idx = output_batch_height_block_idx / output_height; + const int output_height_idx = output_batch_height_block_idx % output_height; + + const int grid_offset = ((output_batch_idx * output_depth + output_depth_idx) * output_height + output_height_idx) * output_width + output_width_block_idx; + float3 grid_xyz = convert_float3(vload3(grid_offset, grid)); + + const float x = grid_xyz.x; + const float y = grid_xyz.y; + const float z = grid_xyz.z; + + float in_grid_x = getPosition(x, input_width, alignCorners); + float in_grid_y = getPosition(y, input_height, alignCorners); + float in_grid_z = getPosition(z, input_depth, alignCorners); + + // get nearest point + int nw = floor(in_grid_x + 0.5f); + int nh = floor(in_grid_y + 0.5f); + int nd = floor(in_grid_z + 0.5f); + + const int inp_offset_base = (output_batch_idx + output_channel_idx * batch) * input_depth; + COMPUTE_FLOAT4 value = sample3d(nd, nh, nw, inp_offset_base, input, input_depth, input_height, input_width, paddingMode); + + const int output_offset = (((output_batch_idx + output_channel_idx * batch) * output_depth + output_depth_idx) * output_height + output_height_idx) * output_width + output_width_block_idx; + vstore4(CONVERT_FLOAT4(value), output_offset, output); +} + +__kernel void bilinear5d_buf(GLOBAL_SIZE_3_DIMS + __global const FLOAT* input, + __global const FLOAT* grid, + __global FLOAT* output, + __private const int input_height, + __private const int input_width, + __private const int input_depth, + __private const int output_height, + __private const int output_width, + __private const int output_depth, + __private const int batch, + __private const enum BorderMode paddingMode, + __private const int alignCorners){ + + const int output_channel_depth_idx = get_global_id(0); + const int output_width_block_idx = get_global_id(1); + const int output_batch_height_block_idx = get_global_id(2); + + DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx, output_width_block_idx, output_batch_height_block_idx); + + const int output_channel_idx = output_channel_depth_idx / output_depth; + const int output_depth_idx = output_channel_depth_idx % output_depth; + const int output_batch_idx = output_batch_height_block_idx / output_height; + const int output_height_idx = output_batch_height_block_idx % output_height; + + const int grid_offset = ((output_batch_idx * output_depth + output_depth_idx) * output_height + output_height_idx) * output_width + output_width_block_idx; + float3 grid_xyz = convert_float3(vload3(grid_offset, grid)); + + + // get grid x,y + const float x = grid_xyz.x; + const float y = grid_xyz.y; + const float z = grid_xyz.z; + + float in_grid_x = getPosition(x, input_width, alignCorners); + float in_grid_y = getPosition(y, input_height, alignCorners); + float in_grid_z = getPosition(z, input_depth, alignCorners); + + int in_d0 = floor(in_grid_z); + int in_h0 = floor(in_grid_y); + int in_w0 = floor(in_grid_x); + int in_d1 = ceil(in_grid_z); + int in_h1 = ceil(in_grid_y); + int in_w1 = ceil(in_grid_x); + + float x_weight0 = in_grid_x - in_w0; + float x_weight1 = 1 - x_weight0; + float y_weight0 = in_grid_y - in_h0; + float y_weight1 = 1 - y_weight0; + float z_weight0 = in_grid_z - in_d0; + float z_weight1 = 1 - z_weight0; + + // bilinear interpolation + const int inp_offset_base = (output_batch_idx + output_channel_idx * batch) * input_depth; + COMPUTE_FLOAT4 i000 = sample3d(in_d0, in_h0, in_w0, inp_offset_base, input, input_depth, 
input_height, input_width, paddingMode); + COMPUTE_FLOAT4 i001 = sample3d(in_d0, in_h0, in_w1, inp_offset_base, input, input_depth, input_height, input_width, paddingMode); + COMPUTE_FLOAT4 i010 = sample3d(in_d0, in_h1, in_w0, inp_offset_base, input, input_depth, input_height, input_width, paddingMode); + COMPUTE_FLOAT4 i011 = sample3d(in_d0, in_h1, in_w1, inp_offset_base, input, input_depth, input_height, input_width, paddingMode); + COMPUTE_FLOAT4 i100 = sample3d(in_d1, in_h0, in_w0, inp_offset_base, input, input_depth, input_height, input_width, paddingMode); + COMPUTE_FLOAT4 i101 = sample3d(in_d1, in_h0, in_w1, inp_offset_base, input, input_depth, input_height, input_width, paddingMode); + COMPUTE_FLOAT4 i110 = sample3d(in_d1, in_h1, in_w0, inp_offset_base, input, input_depth, input_height, input_width, paddingMode); + COMPUTE_FLOAT4 i111 = sample3d(in_d1, in_h1, in_w1, inp_offset_base, input, input_depth, input_height, input_width, paddingMode); + + + COMPUTE_FLOAT4 i00 = (COMPUTE_FLOAT4)(x_weight1) * i000 + (COMPUTE_FLOAT4)(x_weight0) * i001; + COMPUTE_FLOAT4 i01 = (COMPUTE_FLOAT4)(x_weight1) * i010 + (COMPUTE_FLOAT4)(x_weight0) * i011; + COMPUTE_FLOAT4 i10 = (COMPUTE_FLOAT4)(x_weight1) * i100 + (COMPUTE_FLOAT4)(x_weight0) * i101; + COMPUTE_FLOAT4 i11 = (COMPUTE_FLOAT4)(x_weight1) * i110 + (COMPUTE_FLOAT4)(x_weight0) * i111; + + COMPUTE_FLOAT4 i0 = (COMPUTE_FLOAT4)(y_weight1) * i00 + (COMPUTE_FLOAT4)(y_weight0) * i01; + COMPUTE_FLOAT4 i1 = (COMPUTE_FLOAT4)(y_weight1) * i10 + (COMPUTE_FLOAT4)(y_weight0) * i11; + COMPUTE_FLOAT4 interp = (COMPUTE_FLOAT4)(z_weight1) * i0 + (COMPUTE_FLOAT4)(z_weight0) * i1; + + const int output_offset = (((output_batch_idx + output_channel_idx * batch) * output_depth + output_depth_idx) * output_height + output_height_idx) * output_width + output_width_block_idx; + vstore4(CONVERT_FLOAT4(interp), output_offset, output); +} diff --git a/source/backend/opencl/execution/cl/opencl_program.cc b/source/backend/opencl/execution/cl/opencl_program.cc index a4d2cb4f4..ed7ee0cc6 100644 --- a/source/backend/opencl/execution/cl/opencl_program.cc +++ b/source/backend/opencl/execution/cl/opencl_program.cc @@ -1535,6 +1535,23 @@ const char* grid_sample_buf = " int offset=(offset_base+h)*width+w;\n" " return CONVERT_COMPUTE_FLOAT4(vload4(offset,buffer));\n" "}\n" +"COMPUTE_FLOAT4 sample3d(int d,int h,int w,\n" +" const int offset_base,\n" +" __global const FLOAT *buffer,\n" +" int depth,int height,int width,\n" +" enum BorderMode paddingMode){\n" +" if (d<0 || d >= depth || h<0 || h >= height || w<0 || w >= width) {\n" +" if(paddingMode == BorderMode_ZEROS)\n" +" {\n" +" return 0.0f;\n" +" }\n" +" d=CLAMP(d,0,depth-1);\n" +" h=CLAMP(h,0,height-1);\n" +" w=CLAMP(w,0,width-1);\n" +" }\n" +" int offset=((offset_base+d)*height+h)*width+w;\n" +" return CONVERT_COMPUTE_FLOAT4(vload4(offset,buffer));\n" +"}\n" "__kernel void nearest_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input,\n" " __global const FLOAT* grid,\n" " __global FLOAT* output,\n" @@ -1628,6 +1645,117 @@ const char* grid_sample_buf = " const int output_offset=((output_batch_idx+output_channel_block_idx*batch)*output_height+output_height_idx)*output_width+output_width_block_idx;\n" " vstore4(CONVERT_FLOAT4(value),output_offset,output);\n" "}\n" +"__kernel void nearest5d_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input,\n" +" __global const FLOAT* grid,\n" +" __global FLOAT* output,\n" +" __private const int input_height,\n" +" __private const int input_width,\n" +" __private const int input_depth,\n" +" 
__private const int output_height,\n" +" __private const int output_width,\n" +" __private const int output_depth,\n" +" __private const int batch,\n" +" __private const enum BorderMode paddingMode,\n" +" __private const int alignCorners){\n" +" \n" +" const int output_channel_depth_idx=get_global_id(0);\n" +" const int output_width_block_idx=get_global_id(1);\n" +" const int output_batch_height_block_idx=get_global_id(2);\n" +" \n" +" DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx,output_width_block_idx,output_batch_height_block_idx);\n" +" \n" +" const int output_channel_idx=output_channel_depth_idx/output_depth;\n" +" const int output_depth_idx=output_channel_depth_idx % output_depth;\n" +" const int output_batch_idx=output_batch_height_block_idx/output_height;\n" +" const int output_height_idx=output_batch_height_block_idx % output_height;\n" +" \n" +" const int grid_offset=((output_batch_idx*output_depth+output_depth_idx)*output_height+output_height_idx)*output_width+output_width_block_idx;\n" +" float3 grid_xyz=convert_float3(vload3(grid_offset,grid));\n" +" const float x=grid_xyz.x;\n" +" const float y=grid_xyz.y;\n" +" const float z=grid_xyz.z;\n" +" float in_grid_x=getPosition(x,input_width,alignCorners);\n" +" float in_grid_y=getPosition(y,input_height,alignCorners);\n" +" float in_grid_z=getPosition(z,input_depth,alignCorners);\n" +" // get nearest point\n" +" int nw=floor(in_grid_x+0.5f);\n" +" int nh=floor(in_grid_y+0.5f);\n" +" int nd=floor(in_grid_z+0.5f);\n" +" const int inp_offset_base=(output_batch_idx+output_channel_idx*batch)*input_depth;\n" +" COMPUTE_FLOAT4 value=sample3d(nd,nh,nw,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" const int output_offset=(((output_batch_idx+output_channel_idx*batch)*output_depth+output_depth_idx)*output_height+output_height_idx)*output_width+output_width_block_idx;\n" +" vstore4(CONVERT_FLOAT4(value),output_offset,output);\n" +"}\n" +"__kernel void bilinear5d_buf(GLOBAL_SIZE_3_DIMS\n" +" __global const FLOAT* input,\n" +" __global const FLOAT* grid,\n" +" __global FLOAT* output,\n" +" __private const int input_height,\n" +" __private const int input_width,\n" +" __private const int input_depth,\n" +" __private const int output_height,\n" +" __private const int output_width,\n" +" __private const int output_depth,\n" +" __private const int batch,\n" +" __private const enum BorderMode paddingMode,\n" +" __private const int alignCorners){\n" +" const int output_channel_depth_idx=get_global_id(0);\n" +" const int output_width_block_idx=get_global_id(1);\n" +" const int output_batch_height_block_idx=get_global_id(2);\n" +" DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx,output_width_block_idx,output_batch_height_block_idx);\n" +" \n" +" const int output_channel_idx=output_channel_depth_idx/output_depth;\n" +" const int output_depth_idx=output_channel_depth_idx % output_depth;\n" +" const int output_batch_idx=output_batch_height_block_idx/output_height;\n" +" const int output_height_idx=output_batch_height_block_idx % output_height;\n" +" \n" +" const int grid_offset=((output_batch_idx*output_depth+output_depth_idx)*output_height+output_height_idx)*output_width+output_width_block_idx;\n" +" float3 grid_xyz=convert_float3(vload3(grid_offset,grid));\n" +" \n" +" // get grid x,y\n" +" const float x=grid_xyz.x;\n" +" const float y=grid_xyz.y;\n" +" const float z=grid_xyz.z;\n" +" float in_grid_x=getPosition(x,input_width,alignCorners);\n" +" float in_grid_y=getPosition(y,input_height,alignCorners);\n" +" float 
in_grid_z=getPosition(z,input_depth,alignCorners);\n" +" int in_d0=floor(in_grid_z);\n" +" int in_h0=floor(in_grid_y);\n" +" int in_w0=floor(in_grid_x);\n" +" int in_d1=ceil(in_grid_z);\n" +" int in_h1=ceil(in_grid_y);\n" +" int in_w1=ceil(in_grid_x);\n" +" \n" +" float x_weight0=in_grid_x-in_w0;\n" +" float x_weight1=1-x_weight0;\n" +" float y_weight0=in_grid_y-in_h0;\n" +" float y_weight1=1-y_weight0;\n" +" float z_weight0=in_grid_z-in_d0;\n" +" float z_weight1=1-z_weight0;\n" +" // bilinear interpolation\n" +" const int inp_offset_base=(output_batch_idx+output_channel_idx*batch)*input_depth;\n" +" COMPUTE_FLOAT4 i000=sample3d(in_d0,in_h0,in_w0,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" COMPUTE_FLOAT4 i001=sample3d(in_d0,in_h0,in_w1,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" COMPUTE_FLOAT4 i010=sample3d(in_d0,in_h1,in_w0,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" COMPUTE_FLOAT4 i011=sample3d(in_d0,in_h1,in_w1,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" COMPUTE_FLOAT4 i100=sample3d(in_d1,in_h0,in_w0,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" COMPUTE_FLOAT4 i101=sample3d(in_d1,in_h0,in_w1,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" COMPUTE_FLOAT4 i110=sample3d(in_d1,in_h1,in_w0,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" COMPUTE_FLOAT4 i111=sample3d(in_d1,in_h1,in_w1,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" \n" +" \n" +" COMPUTE_FLOAT4 i00=(COMPUTE_FLOAT4)(x_weight1)*i000+(COMPUTE_FLOAT4)(x_weight0)*i001;\n" +" COMPUTE_FLOAT4 i01=(COMPUTE_FLOAT4)(x_weight1)*i010+(COMPUTE_FLOAT4)(x_weight0)*i011;\n" +" COMPUTE_FLOAT4 i10=(COMPUTE_FLOAT4)(x_weight1)*i100+(COMPUTE_FLOAT4)(x_weight0)*i101;\n" +" COMPUTE_FLOAT4 i11=(COMPUTE_FLOAT4)(x_weight1)*i110+(COMPUTE_FLOAT4)(x_weight0)*i111;\n" +" \n" +" COMPUTE_FLOAT4 i0=(COMPUTE_FLOAT4)(y_weight1)*i00+(COMPUTE_FLOAT4)(y_weight0)*i01;\n" +" COMPUTE_FLOAT4 i1=(COMPUTE_FLOAT4)(y_weight1)*i10+(COMPUTE_FLOAT4)(y_weight0)*i11;\n" +" COMPUTE_FLOAT4 interp=(COMPUTE_FLOAT4)(z_weight1)*i0+(COMPUTE_FLOAT4)(z_weight0)*i1;\n" +" \n" +" const int output_offset=(((output_batch_idx+output_channel_idx*batch)*output_depth+output_depth_idx)*output_height+output_height_idx)*output_width+output_width_block_idx;\n" +" vstore4(CONVERT_FLOAT4(interp),output_offset,output);\n" +"}\n" ; #endif const char* interp = @@ -2840,17 +2968,37 @@ const char* raster = " DEAL_NON_UNIFORM_DIM2(x,y);\n" " WI_DATA(output,(int2)(x,y),(OUTPUT_TYPE_I4)(0));\n" "}\n" -"__kernel void raster_buffer_direct(\n" +"__kernel void raster_buffer(\n" " GLOBAL_SIZE_3_DIMS\n" -" __read_only image2d_t input,\n" +" __global INPUT_TYPE *input,\n" +" __private const int inputOffset,\n" +" __private const int inputStride0,\n" +" __private const int inputStride1,\n" +" __private const int inputStride2,\n" +" __global OUTPUT_TYPE *output,\n" +" __private const int outputOffset,\n" +" __private const int outputStride0,\n" +" __private const int outputStride1,\n" +" __private const int outputStride2\n" +" ) {\n" +" const int x=get_global_id(0);\n" +" const int y=get_global_id(1);\n" +" const int z=get_global_id(2);\n" +" \n" +" DEAL_NON_UNIFORM_DIM3(x,y,z);\n" +" \n" +" int inputIndex=inputOffset+z*inputStride0+y*inputStride1+x*inputStride2;\n" +" int outputIndex=outputOffset+z*outputStride0+y*outputStride1+x*outputStride2;\n" +" 
output[outputIndex]=(OUTPUT_TYPE)input[inputIndex];\n" +"}\n" +"__kernel void raster_buffer_combine(\n" +" GLOBAL_SIZE_3_DIMS\n" +" __global INPUT_TYPE *input,\n" " __private const int inputOffset,\n" " __private const int combineSrcOffset,\n" " __private const int inputStride0,\n" " __private const int inputStride1,\n" " __private const int inputStride2,\n" -" __private const int src_width,\n" -" __private const int src_height,\n" -" __private const int src_channel,\n" " __global OUTPUT_TYPE *output,\n" " __private const int outputOffset,\n" " __private const int combineDstOffset,\n" @@ -2869,21 +3017,7 @@ const char* raster = " \n" " int inputIndex=inputOffset+id*combineSrcOffset+z*inputStride0+y*inputStride1+x*inputStride2;\n" " int outputIndex=outputOffset+id*combineDstOffset+z*outputStride0+y*outputStride1+x*outputStride2;\n" -"#ifdef INPUT_DATA_FORMAT_NHWC\n" -" int in_c=inputIndex % src_channel; inputIndex /= src_channel;\n" -" int in_w=inputIndex % src_width; inputIndex /= src_width;\n" -" int in_h=inputIndex % src_height;\n" -" int in_b=inputIndex/src_height;\n" -"#else\n" -" int in_w=inputIndex % src_width; inputIndex /= src_width;\n" -" int in_h=inputIndex % src_height; inputIndex /= src_height;\n" -" int in_c=inputIndex % src_channel;\n" -" int in_b=inputIndex/src_channel;\n" -"#endif\n" -" int2 coord=(int2)((in_c/4)*src_width+in_w,in_b*src_height+in_h);\n" -" INPUT_TYPE_I4 value=RI_DATA(input,SAMPLER,coord);\n" -" INPUT_TYPE_I* value_ptr=(INPUT_TYPE_I*)&value;\n" -" output[outputIndex]=(OUTPUT_TYPE)value_ptr[in_c % 4];\n" +" output[outputIndex]=(OUTPUT_TYPE)input[inputIndex];\n" "}\n" "__kernel void raster_image(\n" " GLOBAL_SIZE_3_DIMS\n" @@ -9825,6 +9959,23 @@ const char* grid_sample = " }\n" " return RI_F(tmp,SAMPLER,(int2)(w_offset_base+w,h_offset_base+h));\n" "}\n" +"FLOAT4 sample3d(int d,int h,int w,\n" +" const int x_offset_base,\n" +" const int y_offset_base,\n" +" __read_only image2d_t tmp,\n" +" int depth,int height,int width,\n" +" enum BorderMode paddingMode){\n" +" if (d<0 || d >= depth || h<0 || h >= height || w<0 || w >= width) {\n" +" if(paddingMode == BorderMode_ZEROS)\n" +" {\n" +" return 0.0f;\n" +" }\n" +" d=CLAMP(d,0,depth-1);\n" +" h=CLAMP(h,0,height-1);\n" +" w=CLAMP(w,0,width-1);\n" +" }\n" +" return RI_F(tmp,SAMPLER,(int2)(x_offset_base+h*width+w,y_offset_base+d));\n" +"}\n" "__kernel void nearest(GLOBAL_SIZE_3_DIMS __read_only image2d_t input,\n" " __read_only image2d_t grid,\n" " __write_only image2d_t output,\n" @@ -9925,6 +10076,139 @@ const char* grid_sample = " const int output_h_offset=mad24(output_batch_idx,output_height,output_height_idx);\n" " WI_F(output,(int2)(output_w_offset,output_h_offset),value);\n" "}\n" +"__kernel void nearest5d(GLOBAL_SIZE_3_DIMS\n" +" __read_only image2d_t input,\n" +" __read_only image2d_t grid,\n" +" __write_only image2d_t output,\n" +" __private const int input_height,\n" +" __private const int input_width,\n" +" __private const int input_depth,\n" +" __private const int output_height,\n" +" __private const int output_width,\n" +" __private const int output_depth,\n" +" __private const int batch,\n" +" __private const enum BorderMode paddingMode,\n" +" __private const int alignCorners){\n" +" \n" +" const int output_channel_depth_idx=get_global_id(0);\n" +" const int output_width_block_idx=get_global_id(1);\n" +" const int output_batch_height_block_idx=get_global_id(2);\n" +" \n" +" DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx,output_width_block_idx,output_batch_height_block_idx);\n" +" \n" +" const int 
output_channel_idx=output_channel_depth_idx/output_depth;\n" +" const int output_depth_idx=output_channel_depth_idx % output_depth;\n" +" const int output_batch_idx=output_batch_height_block_idx/output_height;\n" +" const int output_height_idx=output_batch_height_block_idx % output_height;\n" +" \n" +" // get grid idx\n" +" const int grid_w_offset=(output_depth_idx/4)*output_width*3+output_width_block_idx*3;\n" +" const int grid_h_offset=mad24(output_batch_idx,output_height,output_height_idx);\n" +" \n" +" FLOAT4 grid_x=RI_F(grid,SAMPLER,(int2)(grid_w_offset,grid_h_offset));\n" +" FLOAT4 grid_y=RI_F(grid,SAMPLER,(int2)(grid_w_offset+1,grid_h_offset));\n" +" FLOAT4 grid_z=RI_F(grid,SAMPLER,(int2)(grid_w_offset+2,grid_h_offset));\n" +" const float arr[12]={grid_x.x,grid_y.x,grid_z.x,grid_x.y,grid_y.y,grid_z.y,grid_x.z,grid_y.z,grid_z.z,grid_x.w,grid_y.w,grid_z.w};\n" +" \n" +" // get grid x,y\n" +" const int arr_offset=output_depth_idx % 4;\n" +" const float x=arr[3*arr_offset];\n" +" const float y=arr[3*arr_offset+1];\n" +" const float z=arr[3*arr_offset+2];\n" +" float in_grid_x=getPosition(x,input_width,alignCorners);\n" +" float in_grid_y=getPosition(y,input_height,alignCorners);\n" +" float in_grid_z=getPosition(z,input_depth,alignCorners);\n" +" // get nearest point\n" +" int nw=floor(in_grid_x+0.5f);\n" +" int nh=floor(in_grid_y+0.5f);\n" +" int nd=floor(in_grid_z+0.5f);\n" +" \n" +" const int inp_w_offset=mul24(output_channel_idx,input_width*input_height);\n" +" const int inp_h_offset=mul24(output_batch_idx,input_depth);\n" +" FLOAT4 value=sample3d(nd,nh,nw,inp_w_offset,inp_h_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" \n" +" const int output_w_offset=output_channel_idx*output_width*output_height+output_height_idx*output_width+output_width_block_idx;\n" +" const int output_h_offset=mad24(output_batch_idx,output_depth,output_depth_idx);\n" +" WI_F(output,(int2)(output_w_offset,output_h_offset),value);\n" +"}\n" +"__kernel void bilinear5d(GLOBAL_SIZE_3_DIMS\n" +" __read_only image2d_t input,\n" +" __read_only image2d_t grid,\n" +" __write_only image2d_t output,\n" +" __private const int input_height,\n" +" __private const int input_width,\n" +" __private const int input_depth,\n" +" __private const int output_height,\n" +" __private const int output_width,\n" +" __private const int output_depth,\n" +" __private const int batch,\n" +" __private const enum BorderMode paddingMode,\n" +" __private const int alignCorners){\n" +" const int output_channel_depth_idx=get_global_id(0);\n" +" const int output_width_block_idx=get_global_id(1);\n" +" const int output_batch_height_block_idx=get_global_id(2);\n" +" DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx,output_width_block_idx,output_batch_height_block_idx);\n" +" \n" +" const int output_channel_idx=output_channel_depth_idx/output_depth;\n" +" const int output_depth_idx=output_channel_depth_idx % output_depth;\n" +" const int output_batch_idx=output_batch_height_block_idx/output_height;\n" +" const int output_height_idx=output_batch_height_block_idx % output_height;\n" +" \n" +" // get grid idx\n" +" const int grid_w_offset=(output_depth_idx/4)*output_width*3+output_width_block_idx*3;\n" +" const int grid_h_offset=mad24(output_batch_idx,output_height,output_height_idx);\n" +" \n" +" FLOAT4 grid_x=RI_F(grid,SAMPLER,(int2)(grid_w_offset,grid_h_offset));\n" +" FLOAT4 grid_y=RI_F(grid,SAMPLER,(int2)(grid_w_offset+1,grid_h_offset));\n" +" FLOAT4 grid_z=RI_F(grid,SAMPLER,(int2)(grid_w_offset+2,grid_h_offset));\n" +" const 
float arr[12]={grid_x.x,grid_y.x,grid_z.x,grid_x.y,grid_y.y,grid_z.y,grid_x.z,grid_y.z,grid_z.z,grid_x.w,grid_y.w,grid_z.w};\n" +" \n" +" // get grid x,y\n" +" const int arr_offset=output_depth_idx % 4;\n" +" const float x=arr[3*arr_offset];\n" +" const float y=arr[3*arr_offset+1];\n" +" const float z=arr[3*arr_offset+2];\n" +" float in_grid_x=getPosition(x,input_width,alignCorners);\n" +" float in_grid_y=getPosition(y,input_height,alignCorners);\n" +" float in_grid_z=getPosition(z,input_depth,alignCorners);\n" +" int in_d0=floor(in_grid_z);\n" +" int in_h0=floor(in_grid_y);\n" +" int in_w0=floor(in_grid_x);\n" +" int in_d1=ceil(in_grid_z);\n" +" int in_h1=ceil(in_grid_y);\n" +" int in_w1=ceil(in_grid_x);\n" +" \n" +" float x_weight0=in_grid_x-in_w0;\n" +" float x_weight1=1-x_weight0;\n" +" float y_weight0=in_grid_y-in_h0;\n" +" float y_weight1=1-y_weight0;\n" +" float z_weight0=in_grid_z-in_d0;\n" +" float z_weight1=1-z_weight0;\n" +" // bilinear interpolation\n" +" const int inp_x_offset=mul24(output_channel_idx,input_width*input_height);\n" +" const int inp_y_offset=mul24(output_batch_idx,input_depth);\n" +" FLOAT4 i000=sample3d(in_d0,in_h0,in_w0,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" FLOAT4 i001=sample3d(in_d0,in_h0,in_w1,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" FLOAT4 i010=sample3d(in_d0,in_h1,in_w0,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" FLOAT4 i011=sample3d(in_d0,in_h1,in_w1,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" FLOAT4 i100=sample3d(in_d1,in_h0,in_w0,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" FLOAT4 i101=sample3d(in_d1,in_h0,in_w1,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" FLOAT4 i110=sample3d(in_d1,in_h1,in_w0,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" FLOAT4 i111=sample3d(in_d1,in_h1,in_w1,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" \n" +" \n" +" FLOAT4 i00=(FLOAT4)(x_weight1)*i000+(FLOAT4)(x_weight0)*i001;\n" +" FLOAT4 i01=(FLOAT4)(x_weight1)*i010+(FLOAT4)(x_weight0)*i011;\n" +" FLOAT4 i10=(FLOAT4)(x_weight1)*i100+(FLOAT4)(x_weight0)*i101;\n" +" FLOAT4 i11=(FLOAT4)(x_weight1)*i110+(FLOAT4)(x_weight0)*i111;\n" +" \n" +" FLOAT4 i0=(FLOAT4)(y_weight1)*i00+(FLOAT4)(y_weight0)*i01;\n" +" FLOAT4 i1=(FLOAT4)(y_weight1)*i10+(FLOAT4)(y_weight0)*i11;\n" +" FLOAT4 interp=(FLOAT4)(z_weight1)*i0+(FLOAT4)(z_weight0)*i1;\n" +" const int output_w_offset=output_channel_idx*output_width*output_height+output_height_idx*output_width+output_width_block_idx;\n" +" const int output_h_offset=mad24(output_batch_idx,output_depth,output_depth_idx);\n" +" WI_F(output,(int2)(output_w_offset,output_h_offset),interp);\n" +"}\n" ; const char* buffer_convert_quant = "#ifdef MNN_SUPPORT_FP16\n" diff --git a/source/backend/opencl/execution/cl/raster.cl b/source/backend/opencl/execution/cl/raster.cl index 6033514d5..a7fe7bd5c 100644 --- a/source/backend/opencl/execution/cl/raster.cl +++ b/source/backend/opencl/execution/cl/raster.cl @@ -44,17 +44,38 @@ __kernel void image_set_zero( WI_DATA(output, (int2)(x, y), (OUTPUT_TYPE_I4)(0)); } -__kernel void raster_buffer_direct( +__kernel void raster_buffer( GLOBAL_SIZE_3_DIMS - __read_only image2d_t input, + __global INPUT_TYPE *input, + __private const int inputOffset, + __private const int 
inputStride0, + __private const int inputStride1, + __private const int inputStride2, + __global OUTPUT_TYPE *output, + __private const int outputOffset, + __private const int outputStride0, + __private const int outputStride1, + __private const int outputStride2 + ) { + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + + DEAL_NON_UNIFORM_DIM3(x, y, z); + + int inputIndex = inputOffset + z * inputStride0 + y * inputStride1 + x * inputStride2; + int outputIndex = outputOffset + z * outputStride0 + y * outputStride1 + x * outputStride2; + output[outputIndex] = (OUTPUT_TYPE)input[inputIndex]; +} + +__kernel void raster_buffer_combine( + GLOBAL_SIZE_3_DIMS + __global INPUT_TYPE *input, __private const int inputOffset, __private const int combineSrcOffset, __private const int inputStride0, __private const int inputStride1, __private const int inputStride2, - __private const int src_width, - __private const int src_height, - __private const int src_channel, __global OUTPUT_TYPE *output, __private const int outputOffset, __private const int combineDstOffset, @@ -73,23 +94,10 @@ __kernel void raster_buffer_direct( int inputIndex = inputOffset + id * combineSrcOffset + z * inputStride0 + y * inputStride1 + x * inputStride2; int outputIndex = outputOffset + id * combineDstOffset + z * outputStride0 + y * outputStride1 + x * outputStride2; -#ifdef INPUT_DATA_FORMAT_NHWC - int in_c = inputIndex % src_channel; inputIndex /= src_channel; - int in_w = inputIndex % src_width; inputIndex /= src_width; - int in_h = inputIndex % src_height; - int in_b = inputIndex / src_height; -#else - int in_w = inputIndex % src_width; inputIndex /= src_width; - int in_h = inputIndex % src_height; inputIndex /= src_height; - int in_c = inputIndex % src_channel; - int in_b = inputIndex / src_channel; -#endif - int2 coord = (int2)((in_c / 4) * src_width + in_w, in_b * src_height + in_h); - INPUT_TYPE_I4 value = RI_DATA(input, SAMPLER, coord); - INPUT_TYPE_I* value_ptr = (INPUT_TYPE_I*)&value; - output[outputIndex] = (OUTPUT_TYPE)value_ptr[in_c % 4]; + output[outputIndex] = (OUTPUT_TYPE)input[inputIndex]; } + __kernel void raster_image( GLOBAL_SIZE_3_DIMS __read_only image2d_t input, diff --git a/source/backend/opencl/execution/image/GridSampleExecution.cpp b/source/backend/opencl/execution/image/GridSampleExecution.cpp index 7cc2a0ff1..39369e99e 100644 --- a/source/backend/opencl/execution/image/GridSampleExecution.cpp +++ b/source/backend/opencl/execution/image/GridSampleExecution.cpp @@ -14,77 +14,128 @@ namespace MNN { namespace OpenCL { GridSampleExecution::GridSampleExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) : CommonExecution(backend, op) { - mUnits.resize(1); - auto &unit = mUnits[0]; + mOpenCLBackend = static_cast<OpenCLBackend *>(backend); + mMode = op->main_as_GridSample()->mode(); mPaddingMode = op->main_as_GridSample()->paddingMode(); if (op->main_as_GridSample()->alignCorners()) { mAlignCorners = 1; - } - else { + }else { mAlignCorners = 0; } - - mOpenCLBackend = static_cast<OpenCLBackend *>(backend); - auto runtime = mOpenCLBackend->getOpenCLRuntime(); - auto gridSampleParam = op->main_as_GridSample(); - - std::set<std::string> buildOptions; - if (op->main_as_GridSample()->mode() == 0) { - mKernelName = "bilinear"; - unit.kernel = runtime->buildKernel("grid_sample", mKernelName, buildOptions); - } - else { - mKernelName = "nearest"; - unit.kernel = runtime->buildKernel("grid_sample", mKernelName, buildOptions); - - } - - mMaxWorkGroupSize = 
static_cast<uint32_t>(runtime->getMaxWorkGroupSize(unit.kernel)); } ErrorCode GridSampleExecution::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) { - auto &unit = mUnits[0]; + auto runtime = mOpenCLBackend->getOpenCLRuntime(); auto inputTensor = inputs[0]; auto gridTensor = inputs[1]; auto outputTensor = outputs[0]; - - const int batches = inputTensor->buffer().dim[0].extent; - const int channels = inputTensor->buffer().dim[1].extent; - const int inH = inputTensor->buffer().dim[2].extent; - const int inW = inputTensor->buffer().dim[3].extent; - const int channelC4 = UP_DIV(channels, 4); - - const int outH = outputTensor->buffer().dim[2].extent; - const int outW = outputTensor->buffer().dim[3].extent; - - mGlobalWorkSize = { - static_cast<uint32_t>(channelC4), - static_cast<uint32_t>(outW), - static_cast<uint32_t>(outH * batches) - }; - - MNN_ASSERT(outW > 0 && outH > 0); - - uint32_t idx = 0; - cl_int ret = CL_SUCCESS; - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); - ret |= unit.kernel->get().setArg(idx++, openCLImage(inputTensor)); - ret |= unit.kernel->get().setArg(idx++, openCLImage(gridTensor)); - ret |= unit.kernel->get().setArg(idx++, openCLImage(outputTensor)); - ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(inH)); - ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(inW)); - ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(outH)); - ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(outW)); - ret |= unit.kernel->get().setArg(idx++, mPaddingMode); - ret |= unit.kernel->get().setArg(idx++, mAlignCorners); - MNN_CHECK_CL_SUCCESS(ret, "setArg GridSampleExecution"); - - mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), mKernelName, unit.kernel).first; - mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); - unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; - unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + if(outputs[0]->dimensions() > 4){ + mUnits.resize(1); + const int batches = inputTensor->buffer().dim[0].extent; + const int channels = inputTensor->buffer().dim[1].extent; + const int inD = inputTensor->buffer().dim[2].extent; + const int inH = inputTensor->buffer().dim[3].extent; + const int inW = inputTensor->buffer().dim[4].extent; + const int channelC4 = UP_DIV(channels, 4); + const int outD = outputTensor->buffer().dim[2].extent; + const int outH = outputTensor->buffer().dim[3].extent; + const int outW = outputTensor->buffer().dim[4].extent; + std::vector<int> outputShape = tensorShapeFormat(gridTensor); + auto &unit = mUnits[0]; + std::set<std::string> buildOptions; + if (mMode == 0) { + mKernelName = "bilinear5d"; + unit.kernel = runtime->buildKernel("grid_sample", mKernelName, buildOptions); + } + else { + mKernelName = "nearest5d"; + unit.kernel = runtime->buildKernel("grid_sample", mKernelName, buildOptions); + + } + mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(unit.kernel)); + mGlobalWorkSize = { + static_cast<uint32_t>(channelC4 * outD), + static_cast<uint32_t>(outW), + static_cast<uint32_t>(outH * batches) + }; + MNN_ASSERT(outW > 0 && outH > 0); + + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); + ret |= unit.kernel->get().setArg(idx++, 
openCLBuffer(inputTensor)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(gridTensor)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(outputTensor)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(inH)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(inW)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(inD)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(outH)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(outW)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(outD)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(batches)); + ret |= unit.kernel->get().setArg(idx++, mPaddingMode); + ret |= unit.kernel->get().setArg(idx++, mAlignCorners); + MNN_CHECK_CL_SUCCESS(ret, "setArg GridSampleExecution"); + + mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runtime, mKernelName, unit.kernel).first; + + mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; + unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + }else{ + mUnits.resize(1); + auto &unit = mUnits[0]; + const int batches = inputTensor->buffer().dim[0].extent; + const int channels = inputTensor->buffer().dim[1].extent; + const int inH = inputTensor->buffer().dim[2].extent; + const int inW = inputTensor->buffer().dim[3].extent; + const int channelC4 = UP_DIV(channels, 4); + + const int outH = outputTensor->buffer().dim[2].extent; + const int outW = outputTensor->buffer().dim[3].extent; + + std::set<std::string> buildOptions; + if (mMode == 0) { + mKernelName = "bilinear"; + unit.kernel = runtime->buildKernel("grid_sample", mKernelName, buildOptions); + } + else { + mKernelName = "nearest"; + unit.kernel = runtime->buildKernel("grid_sample", mKernelName, buildOptions); + + } + + mGlobalWorkSize = { + static_cast<uint32_t>(channelC4), + static_cast<uint32_t>(outW), + static_cast<uint32_t>(outH * batches) + }; + + MNN_ASSERT(outW > 0 && outH > 0); + + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); + ret |= unit.kernel->get().setArg(idx++, openCLImage(inputTensor)); + ret |= unit.kernel->get().setArg(idx++, openCLImage(gridTensor)); + ret |= unit.kernel->get().setArg(idx++, openCLImage(outputTensor)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(inH)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(inW)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(outH)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(outW)); + ret |= unit.kernel->get().setArg(idx++, mPaddingMode); + ret |= unit.kernel->get().setArg(idx++, mAlignCorners); + MNN_CHECK_CL_SUCCESS(ret, "setArg GridSampleExecution"); + + mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), mKernelName, unit.kernel).first; + mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; + unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + } return NO_ERROR; } diff --git a/source/backend/opencl/execution/image/RasterExecution.cpp b/source/backend/opencl/execution/image/RasterExecution.cpp index 15cdb6235..47a1af17f 100644 --- a/source/backend/opencl/execution/image/RasterExecution.cpp +++ 
b/source/backend/opencl/execution/image/RasterExecution.cpp @@ -37,15 +37,131 @@ ErrorCode RasterExecution::onEncode(const std::vector<Tensor *> &____inputs, con mNeedZero = !TensorUtils::regionIsFull(output); auto regionNum = des->regions.size(); auto runtime = ((OpenCLBackend *)backend())->getOpenCLRuntime(); - + mFast = false; + if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + mFast = true; + for (int i=0; i< des->regions.size(); ++i) { + auto& slice = des->regions[i]; + if (TensorUtils::getDescribe(slice.origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) { + mFast = false; + break; + } + if (!OpCommonUtils::canBlitFast(slice, output)) { + mFast = false; + break; + } + } + } + + if(mFast) + { + mUnits.resize(regionNum); + int kernel_idx = 0; + + if(mNeedZero) + { + mUnits.resize(regionNum + 1); + auto outputShape = tensorShapeFormat(output); + int region[] = {outputShape[0], UP_DIV(outputShape[3], 4), outputShape[1], outputShape[2]};//nhwc + Unit &unit = mUnits[kernel_idx++]; + unit.kernel = runtime->buildKernel("raster", "image_set_zero", {}, output, output); + unit.localWorkSize = {8, 8}; + unit.globalWorkSize = {(uint32_t)UP_DIV((region[1] * region[3]), 16)*16, + (uint32_t)UP_DIV((region[0] * region[2]), 16)*16}; + + int global_dim0 = region[1] * region[3]; + int global_dim1 = region[0] * region[2]; + + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, global_dim0); + ret |= unit.kernel->get().setArg(idx++, global_dim1); + ret |= unit.kernel->get().setArg(idx++, openCLImage(output)); + if(ret != CL_SUCCESS) + { + MNN_PRINT("setArg err %d\n", (int)ret); + } + mOpenCLBackend->recordKernel2d(unit.kernel, + {(uint32_t)UP_DIV((region[1] * region[3]), 16)*16, + (uint32_t)UP_DIV((region[0] * region[2]), 16)*16}, + {8, 8}); + } + + // image raster + for (auto& slice : des->regions) + { + Tensor::InsideDescribe::Region C4Region; + OpCommonUtils::turnToPackRegion(slice, C4Region, output, 4); + + Unit &unit = mUnits[kernel_idx++]; + unit.kernel = runtime->buildKernel("raster", "raster_image", {}, output, output); + + const std::vector<uint32_t> gws = {(uint32_t)C4Region.size[2], + (uint32_t)C4Region.size[1], + (uint32_t)C4Region.size[0]}; + uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(unit.kernel)); + + auto outputShape = tensorShapeFormat(output); + auto sliceShape = tensorShapeFormat(slice.origin); + + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, gws[0]); + ret |= unit.kernel->get().setArg(idx++, gws[1]); + ret |= unit.kernel->get().setArg(idx++, gws[2]); + ret |= unit.kernel->get().setArg(idx++, openCLImage(slice.origin)); + ret |= unit.kernel->get().setArg(idx++, C4Region.src.offset); + ret |= unit.kernel->get().setArg(idx++, C4Region.src.stride[0]); + ret |= unit.kernel->get().setArg(idx++, C4Region.src.stride[1]); + ret |= unit.kernel->get().setArg(idx++, C4Region.src.stride[2]); + ret |= unit.kernel->get().setArg(idx++, sliceShape[1]); + ret |= unit.kernel->get().setArg(idx++, sliceShape[2]); + ret |= unit.kernel->get().setArg(idx++, sliceShape[3]); + ret |= unit.kernel->get().setArg(idx++, openCLImage(output)); + ret |= unit.kernel->get().setArg(idx++, C4Region.dst.offset); + ret |= unit.kernel->get().setArg(idx++, C4Region.dst.stride[0]); + ret |= unit.kernel->get().setArg(idx++, C4Region.dst.stride[1]); + ret |= unit.kernel->get().setArg(idx++, C4Region.dst.stride[2]); + ret |= unit.kernel->get().setArg(idx++, outputShape[1]); + ret |= unit.kernel->get().setArg(idx++, outputShape[2]); 
ret |= unit.kernel->get().setArg(idx++, outputShape[3]); + if(ret != CL_SUCCESS) + { + MNN_PRINT("setArg err %d\n", (int)ret); + } + std::string name = "rasterImage"; + const std::vector<uint32_t> lws = localWS3DDefault(gws, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), name, unit.kernel).first; + + unit.localWorkSize = {lws[0], lws[1], lws[2]}; + + unit.globalWorkSize = {ROUND_UP(gws[0], std::max((uint32_t)1, lws[0])), + ROUND_UP(gws[1], std::max((uint32_t)1, lws[1])), + ROUND_UP(gws[2], std::max((uint32_t)1, lws[2]))}; + mOpenCLBackend->recordKernel3d(unit.kernel, gws, lws); + } + return NO_ERROR; + } bool cancombine = CanCombine(outputs); // Alloc Temp buffer auto bufferPool = ((OpenCLBackend *)backend())->getBufferPool(); - if(output->getType().code == halide_type_float && runtime->isSupportedFP16()) { - mTempOutput = bufferPool->alloc(output->usize()/2); - }else{ - mTempOutput = bufferPool->alloc(output->usize()); + auto bufferUnitSize = runtime->isSupportedFP16() ? sizeof(half_float::half) : sizeof(float); + for(int i=0; i< regionNum; ++i) + { + auto origin = des->regions[i].origin; + if(mTempInput.find(origin) != mTempInput.end()) + { + continue; + } + + auto buffer = bufferPool->alloc(origin->elementSize()*bufferUnitSize); + mTempInput.insert(std::make_pair(origin, buffer)); + } + mTempOutput = bufferPool->alloc(output->elementSize() * bufferUnitSize); + + for(auto& iter : mTempInput) + { + bufferPool->recycle(iter.second); } bufferPool->recycle(mTempOutput); @@ -53,12 +169,12 @@ ErrorCode RasterExecution::onEncode(const std::vector<Tensor *> &____inputs, con if(cancombine){ regionNum = 1; } - mUnits.resize(regionNum + 1); + mUnits.resize(regionNum + originNum + 1); int kernel_idx = 0; if(mNeedZero) { - mUnits.resize(regionNum + 2); + mUnits.resize(regionNum + originNum + 2); auto outputShape = tensorShapeFormat(output); int region[] = {outputShape[0], outputShape[3], outputShape[1], outputShape[2]};//nhwc Unit &unit = mUnits[kernel_idx++]; @@ -88,23 +204,63 @@ ErrorCode RasterExecution::onEncode(const std::vector<Tensor *> &____inputs, con mOpenCLBackend->recordKernel2d(unit.kernel, gws, lws); } + + //image to buffer + for(auto& iter : mTempInput) + { + Tensor* origin = iter.first; + std::vector<int> regionShape = tensorShapeFormat(origin); + int inputWH[] = {regionShape[2], regionShape[1]}; + int region[] = {regionShape[0], UP_DIV(regionShape[3], 4), regionShape[1], regionShape[2]}; + + Unit &unit = mUnits[kernel_idx++]; + if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NHWC)// Image to nhwc buffer + { + unit.kernel = runtime->buildKernel("buffer_to_image", "image_to_nhwc_buffer", {}, origin, origin); + } + else //Image to nchw buffer + { + unit.kernel = runtime->buildKernel("buffer_to_image", "image_to_nchw_buffer", {}, origin, origin); + } + + std::vector<uint32_t> gws = {(uint32_t)(region[3] * region[1]), + (uint32_t)(region[2] * region[0])}; + //MNN_CHECK_CL_SUCCESS + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, gws[0]); + ret |= unit.kernel->get().setArg(idx++, gws[1]); + ret |= unit.kernel->get().setArg(idx++, *(iter.second)); + ret |= unit.kernel->get().setArg(idx++, inputWH[1]); + ret |= unit.kernel->get().setArg(idx++, inputWH[0]); + ret |= unit.kernel->get().setArg(idx++, regionShape[3]); + ret |= unit.kernel->get().setArg(idx++, openCLImage(origin)); + if(ret != CL_SUCCESS) + { + MNN_PRINT("setArg err %d\n", (int)ret); + } + + uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(unit.kernel)); + + std::string kernelName = 
"raster_image_to_buffer"; + std::vector lws = localWS2DDefault(gws, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, unit.kernel).first; + + unit.localWorkSize = {lws[0], lws[1]}; + unit.globalWorkSize = {ROUND_UP(gws[0], std::max((uint32_t)1, lws[0])), + ROUND_UP(gws[1], std::max((uint32_t)1, lws[1]))}; + mOpenCLBackend->recordKernel2d(unit.kernel, gws, lws); + } // buffer raster if(cancombine){ - std::set buildOptions; auto regions = des->regions; auto slice = regions[0]; - auto origin = slice.origin; - auto inputShape = tensorShapeFormat(origin); int nums = regions.size(); int src_offset = regions[1].src.offset - slice.src.offset; int dst_offset = regions[1].dst.offset - slice.dst.offset; - if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NHWC) { - buildOptions.emplace(" -DINPUT_DATA_FORMAT_NHWC"); - } Unit &unit = mUnits[kernel_idx++]; - unit.kernel = runtime->buildKernel("raster", "raster_buffer_direct", buildOptions, output, output); + unit.kernel = runtime->buildKernel("raster", "raster_buffer_combine", {}, output, output); unit.globalWorkSize = {(uint32_t)slice.size[2] * nums, (uint32_t)slice.size[1], @@ -120,15 +276,12 @@ ErrorCode RasterExecution::onEncode(const std::vector &____inputs, con ret |= unit.kernel->get().setArg(idx++, gws[0]); ret |= unit.kernel->get().setArg(idx++, gws[1]); ret |= unit.kernel->get().setArg(idx++, gws[2]); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(origin)); + ret |= unit.kernel->get().setArg(idx++, *(mTempInput[slice.origin])); ret |= unit.kernel->get().setArg(idx++, slice.src.offset); ret |= unit.kernel->get().setArg(idx++, src_offset); ret |= unit.kernel->get().setArg(idx++, slice.src.stride[0]); ret |= unit.kernel->get().setArg(idx++, slice.src.stride[1]); ret |= unit.kernel->get().setArg(idx++, slice.src.stride[2]); - ret |= unit.kernel->get().setArg(idx++, inputShape[2]); - ret |= unit.kernel->get().setArg(idx++, inputShape[1]); - ret |= unit.kernel->get().setArg(idx++, inputShape[3]); ret |= unit.kernel->get().setArg(idx++, *mTempOutput); ret |= unit.kernel->get().setArg(idx++, slice.dst.offset); ret |= unit.kernel->get().setArg(idx++, dst_offset); @@ -153,16 +306,8 @@ ErrorCode RasterExecution::onEncode(const std::vector &____inputs, con }else{ for (auto& slice : des->regions) { - std::set buildOptions; - auto origin = slice.origin; - auto inputShape = tensorShapeFormat(origin); - int src_offset = 0; - int dst_offset = 0; - if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NHWC) { - buildOptions.emplace(" -DINPUT_DATA_FORMAT_NHWC"); - } Unit &unit = mUnits[kernel_idx++]; - unit.kernel = runtime->buildKernel("raster", "raster_buffer_direct", buildOptions, output, output); + unit.kernel = runtime->buildKernel("raster", "raster_buffer", {}, output, output); unit.globalWorkSize = {(uint32_t)slice.size[2], (uint32_t)slice.size[1], @@ -178,22 +323,16 @@ ErrorCode RasterExecution::onEncode(const std::vector &____inputs, con ret |= unit.kernel->get().setArg(idx++, gws[0]); ret |= unit.kernel->get().setArg(idx++, gws[1]); ret |= unit.kernel->get().setArg(idx++, gws[2]); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(origin)); + ret |= unit.kernel->get().setArg(idx++, *(mTempInput[slice.origin])); ret |= unit.kernel->get().setArg(idx++, slice.src.offset); - ret |= unit.kernel->get().setArg(idx++, src_offset); ret |= unit.kernel->get().setArg(idx++, slice.src.stride[0]); ret |= unit.kernel->get().setArg(idx++, slice.src.stride[1]); ret |= unit.kernel->get().setArg(idx++, 
slice.src.stride[2]); - ret |= unit.kernel->get().setArg(idx++, inputShape[2]); - ret |= unit.kernel->get().setArg(idx++, inputShape[1]); - ret |= unit.kernel->get().setArg(idx++, inputShape[3]); ret |= unit.kernel->get().setArg(idx++, *mTempOutput); ret |= unit.kernel->get().setArg(idx++, slice.dst.offset); - ret |= unit.kernel->get().setArg(idx++, dst_offset); ret |= unit.kernel->get().setArg(idx++, slice.dst.stride[0]); ret |= unit.kernel->get().setArg(idx++, slice.dst.stride[1]); ret |= unit.kernel->get().setArg(idx++, slice.dst.stride[2]); - ret |= unit.kernel->get().setArg(idx++, slice.size[2]); if(ret != CL_SUCCESS) { MNN_PRINT("setArg err %d\n", (int)ret); diff --git a/source/backend/opencl/execution/image/SoftmaxExecution.cpp b/source/backend/opencl/execution/image/SoftmaxExecution.cpp index 125c3c9d2..ad01839cb 100644 --- a/source/backend/opencl/execution/image/SoftmaxExecution.cpp +++ b/source/backend/opencl/execution/image/SoftmaxExecution.cpp @@ -87,8 +87,8 @@ ErrorCode SoftmaxExecution::onEncode(const std::vector &inputs, const std::vector mGlobalWorkSize{1, 1, 1}; if(inputBatch == outside && channel == inputChannels && inside == inputWidth * inputHeight){ mAxis = 1; - mGlobalWorkSize = {(uint32_t)(localSize), (uint32_t)outputWidth, (uint32_t)outputHeight * outputBatch}; localSize = getLocalSize(channelBlocks, MaxLocalSize); + mGlobalWorkSize = {(uint32_t)(localSize), (uint32_t)outputWidth, (uint32_t)outputHeight * outputBatch}; }else if(inputBatch * inputChannels == outside && channel == inputHeight && inside == inputWidth){ mAxis = 2; mGlobalWorkSize = {(uint32_t)(localSize), (uint32_t)channelBlocks*outputWidth, (uint32_t)outputBatch}; diff --git a/source/backend/vulkan/image/backend/VulkanBackend.cpp b/source/backend/vulkan/image/backend/VulkanBackend.cpp index 0663ceba6..4be0fddb4 100644 --- a/source/backend/vulkan/image/backend/VulkanBackend.cpp +++ b/source/backend/vulkan/image/backend/VulkanBackend.cpp @@ -321,6 +321,9 @@ void VulkanBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTenso iter = mConverters.find(key); } mCmdBuffers.push_back(iter->second.second->get()); + if (TensorUtils::getDescribe(srcTensor)->isMutable == false) { + _finish(); + } } else if (dstTensor->host() != nullptr) { // gpu->host auto size = VulkanTensor::getAlignSize(srcTensor) * sizeof(float); diff --git a/source/backend/vulkan/image/compiler/AllShader.cpp b/source/backend/vulkan/image/compiler/AllShader.cpp index e9b7860f1..843d6fe37 100644 --- a/source/backend/vulkan/image/compiler/AllShader.cpp +++ b/source/backend/vulkan/image/compiler/AllShader.cpp @@ -19172,6 +19172,2066 @@ const unsigned char glsl_imageTonc4hw4_comp[] = { }; unsigned int glsl_imageTonc4hw4_comp_len = 2220; +const unsigned char glsl_binary_blit_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0x16, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x3e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xb8, 0x01, 0x00, 0x00, 
0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, + 0x3e, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, + 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, + 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x43, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x31, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x43, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x64, 0x73, 0x74, 0x76, 0x69, 0x65, 0x77, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x73, 0x69, 0x7a, 0x65, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x86, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x88, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x43, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x43, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x45, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x04, 0x00, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x86, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x86, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x88, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0xa1, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 
0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x04, 0x00, 0x39, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x3d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x3d, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x06, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, 0x85, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x86, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x87, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x87, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xa0, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, + 0x3c, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x03, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x15, 0x01, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, + 0x3e, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x39, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x4d, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x4b, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x4d, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x00, 0x00, 
0x45, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x60, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, + 0x60, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, + 0x54, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xd7, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, + 0xd7, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00, + 0xda, 0x00, 0x00, 0x00, 0xdf, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe5, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe6, 0x00, 0x00, 0x00, + 0xe0, 0x00, 0x00, 0x00, 0xe5, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe8, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xec, 0x00, 0x00, 0x00, 0xe6, 0x00, 0x00, 0x00, 0xe8, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x15, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0xec, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0x9f, 0x00, 0x00, 0x00, 0x9e, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x4d, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x4d, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_binary_blit_comp_len = 1760; + +const unsigned char glsl_binary_blit_ADD_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0xf6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 
0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, + 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, + 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x46, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x31, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x64, 0x73, 0x74, 0x76, 0x69, 0x65, 0x77, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x73, 0x69, 0x7a, 0x65, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x89, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, + 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x30, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, + 0x65, 0x72, 0x30, 0x00, 0x06, 0x00, 0x05, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x31, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x88, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x89, 0x00, 0x00, 
    [... remaining SPIR-V words of glsl_binary_blit_ADD_comp elided: descriptor-set/binding and member-offset decorations, type and constant declarations, and the compute-shader body, whose arithmetic instruction is OpFAdd (word 0x00050081) ...]
+};
+unsigned int glsl_binary_blit_ADD_comp_len = 2808;
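Note: each generated array is a complete SPIR-V module, so at runtime it only has to be handed to the Vulkan driver. The call site is not part of this diff; the sketch below is a hypothetical helper, assuming an already-created VkDevice and that the char array is 4-byte aligned (which compilers give these large static arrays in practice), showing how such a blob is typically wrapped into a VkShaderModule.

    #include <stdint.h>
    #include <vulkan/vulkan.h>

    extern const unsigned char glsl_binary_blit_ADD_comp[];
    extern unsigned int glsl_binary_blit_ADD_comp_len;

    /* Hypothetical helper: wrap the generated byte array in a VkShaderModule.
     * codeSize is given in bytes; pCode must point at 32-bit SPIR-V words
     * (2808 bytes = 702 words for every blit variant in this patch). */
    static VkShaderModule createBlitAddModule(VkDevice device) {
        VkShaderModuleCreateInfo info = {0};
        info.sType    = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
        info.codeSize = glsl_binary_blit_ADD_comp_len;
        info.pCode    = (const uint32_t*)glsl_binary_blit_ADD_comp;

        VkShaderModule module = VK_NULL_HANDLE;
        if (vkCreateShaderModule(device, &info, NULL, &module) != VK_SUCCESS) {
            return VK_NULL_HANDLE; /* caller decides how to report the failure */
        }
        return module;
    }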
+
+const unsigned char glsl_binary_blit_SUB_comp[] = {
    [... 2808 bytes of SPIR-V elided: a GLCompute module (entry point main, local_size_x = 256) using the uConstant parameter block {srcview0, srcview1, dstview, size} and the uOutput, uInput0, uInput1 storage buffers; identical to the ADD variant except that the arithmetic instruction is OpFSub (word 0x00050083) ...]
+};
+unsigned int glsl_binary_blit_SUB_comp_len = 2808;
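Note: the ADD and SUB modules above (and the MUL/DIV variants that follow) appear to be the same compiled shader with only that one arithmetic instruction swapped. A quick offline check of this, not part of the patch, is to compare two of the blobs word by word:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    extern const unsigned char glsl_binary_blit_ADD_comp[];
    extern const unsigned char glsl_binary_blit_SUB_comp[];
    extern unsigned int glsl_binary_blit_ADD_comp_len; /* every blit variant is 2808 bytes */

    /* Print each 32-bit word at which two equally sized SPIR-V blobs differ.
     * For ADD vs. SUB the expectation is a single differing word whose low
     * 16 bits change from 0x0081 (OpFAdd) to 0x0083 (OpFSub). */
    static void diffSpirvWords(const unsigned char* a, const unsigned char* b, unsigned int bytes) {
        for (unsigned int off = 0; off + 4 <= bytes; off += 4) {
            uint32_t wa, wb;
            memcpy(&wa, a + off, 4); /* memcpy avoids alignment assumptions */
            memcpy(&wb, b + off, 4);
            if (wa != wb) {
                printf("word %u: 0x%08x vs 0x%08x\n", off / 4, (unsigned)wa, (unsigned)wb);
            }
        }
    }

    int main(void) {
        diffSpirvWords(glsl_binary_blit_ADD_comp, glsl_binary_blit_SUB_comp,
                       glsl_binary_blit_ADD_comp_len);
        return 0;
    }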
+
+const unsigned char glsl_binary_blit_MUL_comp[] = {
    [... 2808 bytes of SPIR-V elided; same module with the arithmetic instruction replaced by OpFMul (word 0x00050085) ...]
+};
+unsigned int glsl_binary_blit_MUL_comp_len = 2808;
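Note: because the variants share one interface and differ only in the arithmetic instruction, selecting one at runtime reduces to a lookup by operation name. How MNN actually registers these arrays is not visible in this diff; the table below is only a hypothetical illustration of such a selection.

    #include <string.h>

    extern const unsigned char glsl_binary_blit_ADD_comp[]; extern unsigned int glsl_binary_blit_ADD_comp_len;
    extern const unsigned char glsl_binary_blit_SUB_comp[]; extern unsigned int glsl_binary_blit_SUB_comp_len;
    extern const unsigned char glsl_binary_blit_MUL_comp[]; extern unsigned int glsl_binary_blit_MUL_comp_len;
    extern const unsigned char glsl_binary_blit_DIV_comp[]; extern unsigned int glsl_binary_blit_DIV_comp_len;

    typedef struct {
        const char*          name;   /* shader key, e.g. "glsl_binary_blit_ADD_comp" */
        const unsigned char* spirv;  /* generated SPIR-V bytes */
        const unsigned int*  length; /* points at the matching *_len variable */
    } BlitShaderEntry;

    static const BlitShaderEntry gBlitShaders[] = {
        { "glsl_binary_blit_ADD_comp", glsl_binary_blit_ADD_comp, &glsl_binary_blit_ADD_comp_len },
        { "glsl_binary_blit_SUB_comp", glsl_binary_blit_SUB_comp, &glsl_binary_blit_SUB_comp_len },
        { "glsl_binary_blit_MUL_comp", glsl_binary_blit_MUL_comp, &glsl_binary_blit_MUL_comp_len },
        { "glsl_binary_blit_DIV_comp", glsl_binary_blit_DIV_comp, &glsl_binary_blit_DIV_comp_len },
    };

    /* Return the entry for a given shader name, or NULL when it is unknown. */
    static const BlitShaderEntry* findBlitShader(const char* name) {
        for (unsigned int i = 0; i < sizeof(gBlitShaders) / sizeof(gBlitShaders[0]); ++i) {
            if (strcmp(gBlitShaders[i].name, name) == 0) {
                return &gBlitShaders[i];
            }
        }
        return NULL;
    }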
+
+const unsigned char glsl_binary_blit_DIV_comp[] = {
    [... 2808 bytes of SPIR-V elided; same module with the arithmetic instruction replaced by OpFDiv (word 0x00050088) ...]
+};
+unsigned int glsl_binary_blit_DIV_comp_len = 2808;
0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x89, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, + 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x30, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, + 0x65, 0x72, 0x30, 0x00, 0x06, 0x00, 0x05, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x31, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x88, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8b, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x90, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x96, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 
0x00, 0x47, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x06, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x02, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x88, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x89, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x8a, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x8d, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x96, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x96, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x98, 0x00, 0x00, 
0x00, 0x02, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x98, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xa3, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, + 0x4d, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x54, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x58, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xa8, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xab, 0x00, 0x00, 0x00, + 0xa8, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb0, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x00, 0x00, + 0xab, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 
0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb6, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0xb1, 0x00, 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xbd, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, + 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc9, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, + 0xc9, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xcc, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, + 0xcc, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, + 0xcf, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xd2, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0xd2, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x73, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xdd, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xdf, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0xdf, 0x00, 0x00, 0x00, + 0x6c, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe3, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe5, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe8, 0x00, 0x00, 0x00, 0xe5, 0x00, 0x00, 0x00, + 0x67, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe9, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, 0xe8, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xeb, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, 0xe9, 0x00, 0x00, 0x00, + 0xeb, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x93, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0xbd, 0x00, 0x00, 
0x00, 0x3d, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x92, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x9d, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x07, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x1a, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x0e, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x92, 0x00, 0x00, 0x00, 0xa2, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, + 0x71, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0xa2, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x50, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x50, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_binary_blit_POW_comp_len = 2816; + +const unsigned char glsl_binary_blit_VMAX_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0xf6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, + 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, + 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x46, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x31, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x64, 0x73, 0x74, 0x76, 0x69, 0x65, 0x77, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x73, 0x69, 0x7a, 0x65, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x89, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 
0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, + 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x30, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, + 0x65, 0x72, 0x30, 0x00, 0x06, 0x00, 0x05, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x31, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x88, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8b, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x90, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x96, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x19, 
0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x06, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x02, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x88, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x89, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x8a, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x8d, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x96, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x96, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x98, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x98, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xa3, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, + 0x3f, 
0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, + 0x4d, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x54, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x58, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xa8, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xab, 0x00, 0x00, 0x00, + 0xa8, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb0, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x00, 0x00, + 0xab, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb6, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0xb1, 0x00, 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 
0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xbd, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, + 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc9, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, + 0xc9, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xcc, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, + 0xcc, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, + 0xcf, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xd2, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0xd2, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x73, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xdd, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xdf, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0xdf, 0x00, 0x00, 0x00, + 0x6c, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe3, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe5, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe8, 0x00, 0x00, 0x00, 0xe5, 0x00, 0x00, 0x00, + 0x67, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe9, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, 0xe8, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xeb, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, 0xe9, 0x00, 0x00, 0x00, + 0xeb, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x93, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0xbd, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x41, 
0x00, 0x06, 0x00, 0x92, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x9d, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x07, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x0e, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x92, 0x00, 0x00, 0x00, 0xa2, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, + 0x71, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0xa2, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x50, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x50, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_binary_blit_VMAX_comp_len = 2816; + +const unsigned char glsl_binary_blit_VMIN_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0xf6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, + 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, + 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x46, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x31, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x64, 0x73, 0x74, 0x76, 0x69, 0x65, 0x77, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x73, 0x69, 0x7a, 0x65, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x89, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, + 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 
0x00, + 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x30, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, + 0x65, 0x72, 0x30, 0x00, 0x06, 0x00, 0x05, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x31, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x88, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8b, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x90, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x96, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 
0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x06, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x02, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x88, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x89, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x8a, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x8d, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x96, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x96, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x98, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x98, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xa3, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 
0x00, + 0x3d, 0x00, 0x04, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, + 0x4d, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x54, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x58, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xa8, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xab, 0x00, 0x00, 0x00, + 0xa8, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb0, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x00, 0x00, + 0xab, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb6, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0xb1, 0x00, 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xbd, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 
0x00, + 0x48, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, + 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc9, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, + 0xc9, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xcc, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, + 0xcc, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, + 0xcf, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xd2, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0xd2, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x73, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xdd, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xdf, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0xdf, 0x00, 0x00, 0x00, + 0x6c, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe3, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe5, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe8, 0x00, 0x00, 0x00, 0xe5, 0x00, 0x00, 0x00, + 0x67, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe9, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, 0xe8, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xeb, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, 0xe9, 0x00, 0x00, 0x00, + 0xeb, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x93, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0xbd, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x92, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, 0x0f, 0x00, 0x00, 
0x00, + 0x9d, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x07, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x0e, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x92, 0x00, 0x00, 0x00, 0xa2, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, + 0x71, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0xa2, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x50, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x50, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_binary_blit_VMIN_comp_len = 2816; + +const unsigned char glsl_binary_blit_SQUDIFF_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, + 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, + 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x31, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x64, 0x73, 0x74, 0x76, 0x69, 0x65, 0x77, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x73, 0x69, 0x7a, 0x65, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x92, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, + 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x30, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 
0x42, 0x75, 0x66, 0x66, + 0x65, 0x72, 0x30, 0x00, 0x06, 0x00, 0x05, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x31, 0x00, 0x47, 0x00, 0x04, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x8d, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x8d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8d, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8f, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x91, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x9d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0xa8, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 
0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x06, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x4b, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x4d, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x02, 0x00, 0x51, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x8d, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x91, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x92, 0x00, 0x00, 0x00, 0x91, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x93, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x93, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x96, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x9a, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xa7, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, + 0x43, 0x00, 0x00, 0x00, 0xa8, 0x00, 0x00, 0x00, 0xa7, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x43, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, 
0x47, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x4d, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, + 0x51, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x52, 0x00, 0x00, 0x00, + 0x53, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x53, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0x58, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x4d, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x4d, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x5d, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x67, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0x67, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x77, 0x00, 0x00, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xac, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xaf, 0x00, 0x00, 0x00, + 0xac, 0x00, 0x00, 0x00, 0x5d, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb4, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb5, 0x00, 0x00, 0x00, + 0xaf, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xba, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xc1, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x77, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, + 0x5d, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xca, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xcd, 0x00, 0x00, 0x00, + 0xca, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, + 0xcd, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xd3, 0x00, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xd4, 0x00, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, + 0xd3, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xd6, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, + 0xd4, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x77, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x88, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe1, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, 0x5d, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, + 0x88, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe6, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe7, 0x00, 0x00, 0x00, 0xe1, 0x00, 0x00, 0x00, 0xe6, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe9, 0x00, 0x00, 0x00, + 0x88, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xec, 0x00, 0x00, 0x00, 0xe9, 0x00, 0x00, 0x00, + 0x6b, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xed, 0x00, 0x00, 0x00, 0xe7, 0x00, 0x00, 0x00, 0xec, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, + 0x88, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xf3, 0x00, 0x00, 0x00, 0xed, 0x00, 0x00, 0x00, + 0xef, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x96, 0x00, 0x00, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0xc1, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x98, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, + 0x98, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x96, 0x00, 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, + 0x9d, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, + 0x9f, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0xa1, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, + 0xa0, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, + 0xa1, 0x00, 0x00, 0x00, 0x85, 0x00, 0x05, 0x00, 
0x0f, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x0e, 0x00, 0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x96, 0x00, 0x00, 0x00, 0xa6, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, + 0x75, 0x00, 0x00, 0x00, 0xf3, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0xa6, 0x00, 0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x54, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x54, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_binary_blit_SQUDIFF_comp_len = 2828; + const unsigned char glsl_matmul_output_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, 0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, diff --git a/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp b/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp index 5575d39eb..915ca987b 100644 --- a/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp +++ b/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp @@ -94,6 +94,15 @@ mMaps.insert(std::make_pair("glsl_col2Im_RELU_comp", std::make_pair(glsl_col2Im_ mMaps.insert(std::make_pair("glsl_col2Im_RELU6_comp", std::make_pair(glsl_col2Im_RELU6_comp,glsl_col2Im_RELU6_comp_len))); mMaps.insert(std::make_pair("glsl_nc4hw4toimage_comp", std::make_pair(glsl_nc4hw4toimage_comp,glsl_nc4hw4toimage_comp_len))); mMaps.insert(std::make_pair("glsl_imageTonc4hw4_comp", std::make_pair(glsl_imageTonc4hw4_comp,glsl_imageTonc4hw4_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_comp", std::make_pair(glsl_binary_blit_comp,glsl_binary_blit_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_ADD_comp", std::make_pair(glsl_binary_blit_ADD_comp,glsl_binary_blit_ADD_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_SUB_comp", std::make_pair(glsl_binary_blit_SUB_comp,glsl_binary_blit_SUB_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_MUL_comp", std::make_pair(glsl_binary_blit_MUL_comp,glsl_binary_blit_MUL_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_DIV_comp", std::make_pair(glsl_binary_blit_DIV_comp,glsl_binary_blit_DIV_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_POW_comp", std::make_pair(glsl_binary_blit_POW_comp,glsl_binary_blit_POW_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_VMAX_comp", std::make_pair(glsl_binary_blit_VMAX_comp,glsl_binary_blit_VMAX_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_VMIN_comp", std::make_pair(glsl_binary_blit_VMIN_comp,glsl_binary_blit_VMIN_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_SQUDIFF_comp", std::make_pair(glsl_binary_blit_SQUDIFF_comp,glsl_binary_blit_SQUDIFF_comp_len))); mMaps.insert(std::make_pair("glsl_matmul_output_comp", std::make_pair(glsl_matmul_output_comp,glsl_matmul_output_comp_len))); mMaps.insert(std::make_pair("glsl_matmul_output_BIAS_comp", std::make_pair(glsl_matmul_output_BIAS_comp,glsl_matmul_output_BIAS_comp_len))); mMaps.insert(std::make_pair("glsl_matmul_output_TRANSPOSE_comp", std::make_pair(glsl_matmul_output_TRANSPOSE_comp,glsl_matmul_output_TRANSPOSE_comp_len))); diff --git a/source/backend/vulkan/image/execution/VulkanLoop.cpp b/source/backend/vulkan/image/execution/VulkanLoop.cpp new file mode 100644 index 000000000..fc86cc58a --- /dev/null +++ b/source/backend/vulkan/image/execution/VulkanLoop.cpp @@ -0,0 +1,232 @@ +#include "VulkanLoop.hpp" +#include "VulkanBinary.hpp" + 
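// Illustrative sketch (not from the patch): the VulkanShaderMap entries added above key each
// fused binary-blit variant as "glsl_binary_blit_<MID>_comp" and store a
// (blob pointer, byte length) pair for the embedded SPIR-V. The standalone snippet below only
// mirrors that naming and lookup convention; the map, the dummy blob and main() are assumptions
// for illustration, not MNN's pipeline-creation API.
#include <cstdio>
#include <map>
#include <string>
#include <utility>

using ShaderBlob = std::pair<const unsigned char*, unsigned int>;

static const unsigned char kDummySpirv[] = {0x03, 0x02, 0x23, 0x07}; // SPIR-V magic number only

int main() {
    std::map<std::string, ShaderBlob> shaderMap;
    shaderMap.emplace("glsl_binary_blit_ADD_comp", ShaderBlob{kDummySpirv, sizeof(kDummySpirv)});

    // "ADD" corresponds to what getMidName(op) returns for BinaryOpOperation_ADD below.
    std::string mid = "ADD";
    auto it = shaderMap.find("glsl_binary_blit_" + mid + "_comp");
    if (it != shaderMap.end()) {
        std::printf("found %s (%u bytes)\n", it->first.c_str(), it->second.second);
    }
    return 0;
}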
+namespace MNN { + +std::string getMidName(const Op* op) { + std::string mid = ""; + if (op->type() == OpType_Eltwise) { + if (op->main_as_Eltwise()->coeff() != nullptr) { + // Don't support + return ""; + } + switch (op->main_as_Eltwise()->type()) { + case EltwiseType_SUB: + mid = "SUB"; + break; + case EltwiseType_MAXIMUM: + mid = "VMAX"; + break; + case EltwiseType_PROD: + mid = "MUL"; + break; + case EltwiseType_SUM: + mid = "ADD"; + break; + default: + break; + } + } else if (op->type() == OpType_BinaryOp) { + switch (op->main_as_BinaryOp()->opType()) { + case BinaryOpOperation_ADD: + mid = "ADD"; + break; + case BinaryOpOperation_SUB: + mid = "SUB"; + break; + case BinaryOpOperation_MAXIMUM: + mid = "VMAX"; + break; + case BinaryOpOperation_MINIMUM: + mid = "VMIN"; + break; + case BinaryOpOperation_MUL: + mid = "MUL"; + break; + case BinaryOpOperation_POW: + mid = "POW"; + break; + case BinaryOpOperation_SquaredDifference: + mid = "SQUDIFF"; + break; + case BinaryOpOperation_DIV: + case BinaryOpOperation_REALDIV: + mid = "DIV"; + break; + default: + break; + } + } + return mid; +} + +static void _setTensorStack(std::vector& result, const std::vector& inputs, const std::vector& outputs, const LoopParam* loop) { + if (loop->inputIndexes() != nullptr) { + for (int i=0; iinputIndexes()->size(); ++i) { + result[loop->inputIndexes()->data()[i]] = inputs[i]; + } + } + for (int i=0; ioutputIndexes()->size(); ++i) { + result[loop->outputIndexes()->data()[i]] = outputs[i]; + } +} + +struct BinaryBroadCastInfo { + ivec4 srcview0; + ivec4 srcview1; + ivec4 dstview; + ivec4 size; +}; + +class VulkanBinaryBroadCast : public VulkanBasicExecution { +public: + VulkanBinaryBroadCast(const LoopParam* loop, Backend *bn, bool isInt) : VulkanBasicExecution(bn) { + mLoop = loop; + auto vkbackend = static_cast(bn); + + std::vector types{ + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }; + + std::string shaderName = "glsl_binary_blit_" + getMidName(mLoop->commands()->GetAs(0)->op()) + "_comp"; + + mLoopPipeline = vkbackend->getPipeline(shaderName, types); + mDescriptorSet.reset(mLoopPipeline->createSet()); + + mGpuLoopParam.reset(new VulkanBuffer(vkbackend->getMemoryPool(), false, sizeof(BinaryBroadCastInfo), nullptr, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT)); + mTensors.resize(mLoop->tensorNumber()); + } + + virtual ~VulkanBinaryBroadCast() = default; + + virtual ErrorCode onEncode(const std::vector& inputs, const std::vector& outputs, + const VulkanCommandPool::Buffer* cmdBuffer) override { + _setTensorStack(mTensors, inputs, outputs, mLoop); + auto cmd = mLoop->commands()->GetAs(0); + auto size = cmd->size()->data(); + auto vkBn = static_cast(backend()); + auto srcStride0 = cmd->view()->GetAs(1)->stride()->data(); + auto srcStride1 = cmd->view()->GetAs(2)->stride()->data(); + auto dstStride = cmd->view()->GetAs(0)->stride()->data(); + int totalSize = size[0] * size[1] * size[2]; + auto param = reinterpret_cast(mGpuLoopParam->map()); + for (int i=0; i<3; ++i) { + param->size[i] = size[i]; + param->srcview0[i] = srcStride0[i]; + param->srcview1[i] = srcStride1[i]; + param->dstview[i] = dstStride[i]; + } + param->srcview0[3] = cmd->view()->GetAs(1)->offset(); + param->srcview1[3] = cmd->view()->GetAs(2)->offset(); + param->dstview[3] = cmd->view()->GetAs(0)->offset(); + param->size[3] = size[0] * size[1] * size[2]; + mGpuLoopParam->unmap(); + auto output = mTensors[cmd->indexes()->data()[0]]; + auto input0 = 
mTensors[cmd->indexes()->data()[1]]; + auto input1 = mTensors[cmd->indexes()->data()[2]]; + + { + int bufferSizeSource0 = sizeof(float); + for (int i=0; idimensions(); ++i) { + bufferSizeSource0 *= input0->length(i); + } + mInput0.buffer.reset(new VulkanBuffer(vkBn->getDynamicMemoryPool(), false, bufferSizeSource0, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT)); + mInput0.convert.reset(new VulkanImageConverter(vkBn)); + } + { + int bufferSizeSource1 = sizeof(float); + for (int i=0; idimensions(); ++i) { + bufferSizeSource1 *= input1->length(i); + } + mInput1.buffer.reset(new VulkanBuffer(vkBn->getDynamicMemoryPool(), false, bufferSizeSource1, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT)); + mInput1.convert.reset(new VulkanImageConverter(vkBn)); + } + { + int bufferSizeOutput = sizeof(float); + for (int i=0; idimensions(); ++i) { + bufferSizeOutput *= output->length(i); + } + mOutput.buffer.reset(new VulkanBuffer(vkBn->getDynamicMemoryPool(), false, bufferSizeOutput, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT)); + mOutput.convert.reset(new VulkanImageConverter(vkBn)); + } + mInput0.convert->encodeTensorToBuffer(input0, mInput0.buffer->buffer(), mInput0.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(inputs[0]), cmdBuffer); + mInput1.convert->encodeTensorToBuffer(input1, mInput1.buffer->buffer(), mInput1.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(inputs[1]), cmdBuffer); + + mDescriptorSet->writeBuffer(mOutput.buffer->buffer(), 0, mOutput.buffer->size()); + mDescriptorSet->writeBuffer(mInput0.buffer->buffer(), 1, mInput0.buffer->size()); + mDescriptorSet->writeBuffer(mInput1.buffer->buffer(), 2, mInput1.buffer->size()); + mDescriptorSet->writeBuffer(mGpuLoopParam->buffer(), 3, mGpuLoopParam->size()); + + cmdBuffer->barrierSource(mInput0.buffer->buffer(), 0, mInput0.buffer->size()); + cmdBuffer->barrierSource(mInput1.buffer->buffer(), 0, mInput1.buffer->size()); + + mLoopPipeline->bind(cmdBuffer->get(), mDescriptorSet->get()); + vkCmdDispatch(cmdBuffer->get(), UP_DIV(totalSize,256), 1, 1); + + cmdBuffer->barrierSource(mOutput.buffer->buffer(), 0, mOutput.buffer->size()); + mOutput.convert->encodeBufferToTensor(mOutput.buffer->buffer(), output, mOutput.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(outputs[0]), cmdBuffer); + mInput0.buffer->release(); + mInput1.buffer->release(); + mOutput.buffer->release(); + + return NO_ERROR; + } + +private: + const LoopParam* mLoop; + const VulkanPipeline* mLoopPipeline; + std::shared_ptr mGpuLoopParam; + std::shared_ptr mDescriptorSet; + std::vector mTensors; + struct ConvertInfo { + std::shared_ptr convert; + std::shared_ptr buffer; + }; + ConvertInfo mInput0; + ConvertInfo mInput1; + ConvertInfo mOutput; +}; + +VulkanBasicExecution* VulkanLoop::create(const std::vector& inputs, const std::vector& outputs, const Op* op, Backend* bn) { + auto loop = op->main_as_LoopParam(); + if (nullptr == loop || loop->commands() == nullptr) { + return nullptr; + } + if (nullptr != loop->initCommand()) { + return nullptr; + } + + if (1 == loop->commands()->size()) { + auto cmd = loop->commands()->GetAs(0); + auto subop = cmd->op(); + if (OpType_BinaryOp == subop->type() && cmd->fuse() < 0 && 1 == loop->loopNumber()) { + std::string shaderMidName = getMidName(loop->commands()->GetAs(0)->op()); + if (shaderMidName.empty()) { + return nullptr; + } + bool isInt = inputs[1]->getType().code == halide_type_int; + if (isInt) { + return nullptr; + } + return new VulkanBinaryBroadCast(loop, bn, isInt); + } + } + return nullptr; +} 
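// Illustrative sketch (not from the patch): a CPU reference of the addressing scheme that
// VulkanBinaryBroadCast passes to the binary_blit shader. Each flat element index is decomposed
// into an (x, y, z) position, extended with w = 1, and dotted with a per-tensor ivec4 view whose
// first three components are strides and whose fourth is the element offset, matching the
// BinaryBroadCastInfo layout filled in onEncode above. The tensor shapes and view values here are
// made-up examples, not data taken from the test suite.
#include <array>
#include <cstdio>
#include <vector>

using ivec4 = std::array<int, 4>;

static int dot4(const ivec4& a, const ivec4& b) {
    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
}

int main() {
    const ivec4 size     = {2, 3, 4, 2 * 3 * 4}; // x, y, z extents and total element count
    const ivec4 dstview  = {12, 4, 1, 0};        // contiguous 2x3x4 output, offset 0
    const ivec4 srcview0 = {12, 4, 1, 0};        // full-size first input
    const ivec4 srcview1 = {0, 4, 1, 0};         // second input broadcast along x (stride 0)

    std::vector<float> in0(24, 1.0f), in1(12, 2.0f), out(24, 0.0f);
    for (int i = 0; i < size[3]; ++i) {
        ivec4 pos;
        pos[0] = i / (size[1] * size[2]);
        const int sub = i % (size[1] * size[2]);
        pos[1] = sub / size[2];
        pos[2] = sub % size[2];
        pos[3] = 1; // multiplies the offset component of each view
        // ADD variant; the shader selects SUB/MUL/DIV/POW/VMAX/VMIN/SQUDIFF via compile-time macros.
        out[dot4(dstview, pos)] = in0[dot4(srcview0, pos)] + in1[dot4(srcview1, pos)];
    }
    std::printf("out[0]=%.1f out[23]=%.1f\n", out[0], out[23]);
    return 0;
}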
+ +class VulkanLoopCreator : public VulkanBackend::Creator { +public: + virtual VulkanBasicExecution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* bn) const override { + return VulkanLoop::create(inputs, outputs, op, bn); + } +}; + +static bool gResistor = []() { + VulkanBackend::addCreator(OpType_While, new VulkanLoopCreator); + return true; +}(); + +} \ No newline at end of file diff --git a/source/backend/vulkan/image/execution/VulkanLoop.hpp b/source/backend/vulkan/image/execution/VulkanLoop.hpp new file mode 100644 index 000000000..2ee524aad --- /dev/null +++ b/source/backend/vulkan/image/execution/VulkanLoop.hpp @@ -0,0 +1,24 @@ +// +// VulkanLoop.cpp +// MNN +// +// Created by MNN on 2024/10/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef VulkanLoop_hpp +#define VulkanLoop_hpp + +#include "VulkanBasicExecution.hpp" +#include "VulkanImageConverter.hpp" + +namespace MNN { + +class VulkanLoop { +public: + static VulkanBasicExecution* create(const std::vector& inputs, const std::vector& outputs, const Op* op, Backend* bn); +}; + +} // namespace MNN + +#endif /* VulkanLoop_hpp */ diff --git a/source/backend/vulkan/image/execution/VulkanRaster.cpp b/source/backend/vulkan/image/execution/VulkanRaster.cpp index d5cc81b07..a5f93fd30 100644 --- a/source/backend/vulkan/image/execution/VulkanRaster.cpp +++ b/source/backend/vulkan/image/execution/VulkanRaster.cpp @@ -244,83 +244,8 @@ class VulkanRasterCreator : public VulkanBackend::Creator { }; -class VulkanLoop : public VulkanBasicExecution { -public: - VulkanLoop(Backend *bn, const LoopParam* loop) : VulkanBasicExecution(bn) { - mLoop = loop; - } - virtual ~VulkanLoop() = default; - - virtual ErrorCode onEncode(const std::vector &inputs, const std::vector &outputs, - const VulkanCommandPool::Buffer *cmdBuffer) override { - mExecutions.clear(); - auto cmd = mLoop->commands()->GetAs(0); - std::vector tensors(mLoop->tensorNumber()); - for (int i=0; iinputIndexes()->size(); ++i) { - tensors[mLoop->inputIndexes()->data()[i]] = inputs[i]; - } - for (int i=0; ioutputIndexes()->size(); ++i) { - tensors[mLoop->outputIndexes()->data()[i]] = outputs[i]; - } - auto C = tensors[cmd->indexes()->data()[0]]; - auto A = tensors[cmd->indexes()->data()[1]]; - auto B = tensors[cmd->indexes()->data()[2]]; - for (int i=0; iloopNumber(); ++i) { - VulkanMatMul::MatMulInfo matInfo; - matInfo.e = cmd->size()->data()[0]; - matInfo.l = cmd->size()->data()[1]; - matInfo.h = cmd->size()->data()[2]; - matInfo.offsetC = cmd->view()->GetAs(0)->offset() + i * cmd->steps()->data()[0]; - matInfo.offsetA = cmd->view()->GetAs(1)->offset() + i * cmd->steps()->data()[1]; - matInfo.offsetB = cmd->view()->GetAs(2)->offset() + i * cmd->steps()->data()[2]; - ::memcpy(matInfo.aStride, cmd->view()->GetAs(1)->stride()->data(), 3 * sizeof(int)); - ::memcpy(matInfo.bStride, cmd->view()->GetAs(2)->stride()->data(), 3 * sizeof(int)); - ::memcpy(matInfo.cStride, cmd->view()->GetAs(0)->stride()->data(), 3 * sizeof(int)); - Tensor* bias = nullptr; - if (cmd->indexes()->size() > 3) { - bias = tensors[cmd->indexes()->data()[3]]; - matInfo.offsetBias = cmd->view()->GetAs(3)->offset() + i * cmd->steps()->data()[3]; - } - auto matmulOp = cmd->op(); - std::shared_ptr exe(new VulkanMatMul(matmulOp->main_as_MatMul()->transposeA(), matmulOp->main_as_MatMul()->transposeB(), backend())); - auto matmulExe = static_cast(exe.get()); - bool res = true; - if (bias == nullptr) { - res = matmulExe->encode({{A, B}}, {C}, cmdBuffer, matInfo); - } 
else { - res = matmulExe->encode({{A, B, bias}}, {C}, cmdBuffer, matInfo); - } - if (!res) { - return NOT_SUPPORT; - } - mExecutions.emplace_back(exe); - } - return NO_ERROR; - } -private: - std::vector> mExecutions; - const LoopParam* mLoop; -}; - -class VulkanLoopCreator : public VulkanBackend::Creator { -public: - virtual VulkanBasicExecution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* bn) const override { - auto loop = op->main_as_LoopParam(); - if (1 != loop->commands()->size()) { - return nullptr; - } - auto cmd = loop->commands()->GetAs(0); - if (OpType_MatMul != cmd->op()->type()) { - return nullptr; - } - return new VulkanLoop(bn, loop); - } -}; - - static bool gResistor = []() { VulkanBackend::addCreator(OpType_Raster, new VulkanRasterCreator); -// VulkanBackend::addCreator(OpType_While, new VulkanLoopCreator); return true; }(); diff --git a/source/backend/vulkan/image/execution/glsl/binary_blit.comp b/source/backend/vulkan/image/execution/glsl/binary_blit.comp new file mode 100644 index 000000000..348318bca --- /dev/null +++ b/source/backend/vulkan/image/execution/glsl/binary_blit.comp @@ -0,0 +1,84 @@ +#version 440 core +#ifdef C4 +#define FLOAT vec4 +#else +#define FLOAT float +#endif + +#define OUTPUT_TYPE float + +#define FLOAT4 vec4 +layout(std430) buffer; +layout(set=0, binding=0) writeonly buffer sourceBuffer{ + OUTPUT_TYPE data[]; +} uOutput; + + +layout(set=0, binding=1) readonly buffer destBuffer{ + FLOAT data[]; +} uInput0; + +layout(set=0, binding=2) readonly buffer destBuffer0{ + FLOAT data[]; +} uInput1; + +layout(set=0, binding=3) uniform constBuffer{ + ivec4 srcview0; + ivec4 srcview1; + ivec4 dstview; + ivec4 size; +} uConstant; + +layout (local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +int computeVec4dot(ivec4 a, ivec4 b) { + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +FLOAT4 binary(FLOAT4 x0, FLOAT4 x1) { + FLOAT4 value; +#ifdef ADD + value = x0 + x1; +#endif +#ifdef SUB + value = x0 - x1; +#endif +#ifdef MUL + value = x0 * x1; +#endif +#ifdef DIV + value = x0 / x1; +#endif +#ifdef POW + value = pow(x0, x1); +#endif +#ifdef VMAX + value = max(x0, x1); +#endif +#ifdef VMIN + value = min(x0, x1); +#endif +#ifdef SQUDIFF + value = (x0 - x1) * (x0 - x1); +#endif + return value; +} + +void main() +{ + ivec3 posTmp = ivec3(gl_GlobalInvocationID); + if (posTmp.x < uConstant.size.w) + { + ivec4 pos; + pos.x = posTmp.x / (uConstant.size.y * uConstant.size.z); + int subIndex = posTmp.x % (uConstant.size.y * uConstant.size.z); + pos.z = subIndex % uConstant.size.z; + pos.y = subIndex / uConstant.size.z; + pos.w = 1; + int s0 = computeVec4dot(uConstant.srcview0, pos); + int s1 = computeVec4dot(uConstant.srcview1, pos); + int d = computeVec4dot(uConstant.dstview, pos); + + uOutput.data[d] = OUTPUT_TYPE(binary(FLOAT4(uInput0.data[s0]), FLOAT4(uInput1.data[s1])).x); + } +} diff --git a/source/backend/vulkan/image/execution/glsl/macro.json b/source/backend/vulkan/image/execution/glsl/macro.json index 3c964eff1..bfc289616 100644 --- a/source/backend/vulkan/image/execution/glsl/macro.json +++ b/source/backend/vulkan/image/execution/glsl/macro.json @@ -107,5 +107,15 @@ ], "resizeNearest.comp":[ "NEAREST_ROUND" + ], + "binary_blit.comp":[ + "ADD", + "SUB", + "MUL", + "DIV", + "POW", + "VMAX", + "VMIN", + "SQUDIFF" ] } diff --git a/source/backend/vulkan/image/shaders/AllShader.h b/source/backend/vulkan/image/shaders/AllShader.h index 4297b2ced..5f52602a3 100644 --- 
a/source/backend/vulkan/image/shaders/AllShader.h +++ b/source/backend/vulkan/image/shaders/AllShader.h @@ -182,6 +182,24 @@ extern const unsigned char glsl_nc4hw4toimage_comp[]; extern unsigned int glsl_nc4hw4toimage_comp_len; extern const unsigned char glsl_imageTonc4hw4_comp[]; extern unsigned int glsl_imageTonc4hw4_comp_len; +extern const unsigned char glsl_binary_blit_comp[]; +extern unsigned int glsl_binary_blit_comp_len; +extern const unsigned char glsl_binary_blit_ADD_comp[]; +extern unsigned int glsl_binary_blit_ADD_comp_len; +extern const unsigned char glsl_binary_blit_SUB_comp[]; +extern unsigned int glsl_binary_blit_SUB_comp_len; +extern const unsigned char glsl_binary_blit_MUL_comp[]; +extern unsigned int glsl_binary_blit_MUL_comp_len; +extern const unsigned char glsl_binary_blit_DIV_comp[]; +extern unsigned int glsl_binary_blit_DIV_comp_len; +extern const unsigned char glsl_binary_blit_POW_comp[]; +extern unsigned int glsl_binary_blit_POW_comp_len; +extern const unsigned char glsl_binary_blit_VMAX_comp[]; +extern unsigned int glsl_binary_blit_VMAX_comp_len; +extern const unsigned char glsl_binary_blit_VMIN_comp[]; +extern unsigned int glsl_binary_blit_VMIN_comp_len; +extern const unsigned char glsl_binary_blit_SQUDIFF_comp[]; +extern unsigned int glsl_binary_blit_SQUDIFF_comp_len; extern const unsigned char glsl_matmul_output_comp[]; extern unsigned int glsl_matmul_output_comp_len; extern const unsigned char glsl_matmul_output_BIAS_comp[]; diff --git a/source/backend/vulkan/runtime/vulkan_wrapper.cpp b/source/backend/vulkan/runtime/vulkan_wrapper.cpp index 88e54f373..1da2353a3 100644 --- a/source/backend/vulkan/runtime/vulkan_wrapper.cpp +++ b/source/backend/vulkan/runtime/vulkan_wrapper.cpp @@ -21,7 +21,7 @@ int InitVulkan(void) { #include #include #include -#ifdef WIN32 +#ifdef _WIN32 #include #include #define MNN_DLSYM(lib, func_name) GetProcAddress(reinterpret_cast(lib), func_name) @@ -32,7 +32,7 @@ int InitVulkan(void) { int InitVulkanOnce(void) { const std::vector gVulkan_library_paths = { -#ifdef WIN32 +#ifdef _WIN32 "vulkan-1.dll", #endif "libvulkan.so", @@ -42,7 +42,7 @@ int InitVulkanOnce(void) { }; void* libvulkan = nullptr; for (const auto& s : gVulkan_library_paths) { -#ifdef WIN32 +#ifdef _WIN32 libvulkan = LoadLibrary(s.c_str()); #else libvulkan = dlopen(s.c_str(), RTLD_NOW | RTLD_LOCAL); @@ -52,7 +52,7 @@ int InitVulkanOnce(void) { } } if (nullptr == libvulkan) { -#ifdef WIN32 +#ifdef _WIN32 MNN_ERROR("Load vulkan library error\n"); #else auto message = dlerror(); diff --git a/source/core/Backend.hpp b/source/core/Backend.hpp index 2e0b2548b..d83b2ba10 100644 --- a/source/core/Backend.hpp +++ b/source/core/Backend.hpp @@ -52,6 +52,9 @@ struct RuntimeHint { std::string midMemoryPath; std::string weightMemoryPath; + + // op encoder number for once commit + int encorderNumForCommit = 10; }; /** abstract backend */ class Backend : public NonCopyable { diff --git a/source/core/Session.cpp b/source/core/Session.cpp index 48148ab28..b8354d53a 100644 --- a/source/core/Session.cpp +++ b/source/core/Session.cpp @@ -97,6 +97,8 @@ void Session::ModeGroup::setHint(Interpreter::HintMode mode, int hint) { case Interpreter::KVCACHE_SIZE_LIMIT: runtimeHint.kvcacheSizeLimit = hint; break; + case Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT: + runtimeHint.encorderNumForCommit = hint; default: break; } diff --git a/source/geometry/GeometryReshape.cpp b/source/geometry/GeometryReshape.cpp index a8d466437..88d98a24c 100644 --- a/source/geometry/GeometryReshape.cpp +++ 
b/source/geometry/GeometryReshape.cpp @@ -107,7 +107,7 @@ static void _create() { std::shared_ptr _comp(new SingleGeometryComputer); GeometryComputer::registerGeometryComputer(_comp, {OpType_Squeeze, OpType_Unsqueeze, OpType_ExpandDims, OpType_Flatten, OpType_QuantizedReshape}); std::shared_ptr copycomp(new CopyGeometryComputer); - GeometryComputer::registerGeometryComputer(comp, {OpType_Identity}); + GeometryComputer::registerGeometryComputer(copycomp, {OpType_Identity}); } REGISTER_GEOMETRY(GeometryReshape, _create); diff --git a/source/math/Vec.hpp b/source/math/Vec.hpp index 5e28154d5..d7636e074 100644 --- a/source/math/Vec.hpp +++ b/source/math/Vec.hpp @@ -234,11 +234,7 @@ struct Vec { value = std::move(lr.value); } float operator[](size_t i) { -#if defined(_MSC_VER) - return value.n128_i32[i]; -#else return value[i]; -#endif } static VecType load(const float* addr) { VecType v = { (int32x4_t)(vld1q_f32(addr)) }; @@ -400,11 +396,7 @@ struct Vec { value = std::move(lr.value); } float operator[](size_t i) { -#if defined(_MSC_VER) - return value.n128_f32[i]; -#else return value[i]; -#endif } static VecType load(const float* addr) { VecType v = { vld1q_f32(addr) }; diff --git a/test.sh b/test.sh index 52bd6c6d3..1ad2ab3ab 100755 --- a/test.sh +++ b/test.sh @@ -206,6 +206,7 @@ android_static_build() { -DMNN_OPENCL=true \ -DMNN_BUILD_MINI=true \ -DMNN_SUPPORT_BF16=true \ + -DMNN_ARM82=false \ -DMNN_OPENCL=true \ -DMNN_SUPPORT_TRANSFORMER_FUSE=ON \ -DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. @@ -657,7 +658,7 @@ android_test() { # 1. build Android32 mkdir build_32 pushd build_32 - ../build_32.sh -DMNN_BUILD_TRAIN=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DMNN_OPENCL=true -DMNN_LOW_MEMORY=ON -DMNN_SUPPORT_TRANSFORMER_FUSE=ON + ../build_32.sh -DMNN_BUILD_TRAIN=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DMNN_OPENCL=true -DMNN_LOW_MEMORY=ON -DMNN_SUPPORT_TRANSFORMER_FUSE=ON -DMNN_ARM82=OFF android32_build_wrong=$[$? 
> 0] mnn32_size=$(ls -lh libMNN.so | awk '{print $5}') expr32_size=$(ls -lh libMNN_Express.so | awk '{print $5}') diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 896089e45..f128825a6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,7 +18,7 @@ endif() add_executable(run_test.out ${Files}) target_link_libraries(run_test.out ${MNN_DEPS}) -if (WIN32) +if (MSVC) target_compile_options(run_test.out PRIVATE /bigobj) endif() if (MNN_SUPPORT_BF16) diff --git a/test/MNNTestSuite.h b/test/MNNTestSuite.h index a7ea46e4a..f961ec7cb 100644 --- a/test/MNNTestSuite.h +++ b/test/MNNTestSuite.h @@ -80,6 +80,12 @@ class MNNTestSuite { * @return shared instance */ static MNNTestSuite* get(); + struct Status { + int precision = 0; + int memory = 0; + int power = 0; + }; + Status pStaus; public: /** diff --git a/test/TestUtils.cpp b/test/TestUtils.cpp index 6719871c3..cfd10eba9 100644 --- a/test/TestUtils.cpp +++ b/test/TestUtils.cpp @@ -15,6 +15,7 @@ #include #include #include "core/TensorUtils.hpp" +#include "RuntimeAttr.hpp" using namespace MNN; @@ -86,4 +87,8 @@ float convertFP32ToFP16(float fp32Value) { } +MNNForwardType getCurrentType() { + auto attr = MNN::Express::ExecutorScope::Current()->getAttr(); + return attr->firstType; +} diff --git a/test/TestUtils.h b/test/TestUtils.h index 6a5dd2c20..ac14eb732 100644 --- a/test/TestUtils.h +++ b/test/TestUtils.h @@ -102,6 +102,7 @@ float convertFP32ToFP16(float fp32Value); inline float keepFP32Precision(float fp32Value) { return fp32Value; } +MNNForwardType getCurrentType(); using ConvertFP32 = float(*)(float fp32Value); diff --git a/test/expr/ExecutorResetTest.cpp b/test/expr/ExecutorResetTest.cpp index 60050c454..c6d5c4249 100644 --- a/test/expr/ExecutorResetTest.cpp +++ b/test/expr/ExecutorResetTest.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include "MNNTestSuite.h" using namespace MNN::Express; @@ -67,6 +68,23 @@ class ExecutorResetTest : public MNNTestCase { x->setName("Prob"); return x; } + bool _runmbv1() { + auto x = _Input({1, 3, 224, 224}, NC4HW4); + auto y = _mobileNetV1Expr(x); + auto buffer = Variable::save({y}); + y = nullptr;x=nullptr; + MNN::BackendConfig bnConfig; + auto exe = Executor::newExecutor(MNN_FORWARD_CPU, bnConfig, 1); + ExecutorScope scope(exe); + std::shared_ptr m(Module::load({"Input"}, {"Prob"}, (const uint8_t*)buffer.data(), buffer.size())); + x = _Input({1, 3, 224, 224}, NC4HW4); + x->writeMap(); + m->onForward({x}); + exe->setGlobalExecutorConfig(MNN_FORWARD_CPU, bnConfig, 4); + m->onForward({x}); + + return true; + } virtual bool run(int precision) { int numberThread = 0; @@ -104,6 +122,9 @@ class ExecutorResetTest : public MNNTestCase { FUNC_PRINT(1); return false; } + if (!_runmbv1()) { + return false; + } return true; } }; diff --git a/test/main.cpp b/test/main.cpp index 862ecdc0c..a23ef8eac 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -67,6 +67,8 @@ int main(int argc, char* argv[]) { } MNN::Express::ExecutorScope scope(exe); exe->setGlobalExecutorConfig(type, config, thread); + MNNTestSuite::get()->pStaus.memory = memory; + MNNTestSuite::get()->pStaus.precision = precision; if (argc > 1) { auto name = argv[1]; if (strcmp(name, "all") == 0) { diff --git a/test/op/BroadcastToTest.cpp b/test/op/BroadcastToTest.cpp index 0be0b54bd..1920468d7 100644 --- a/test/op/BroadcastToTest.cpp +++ b/test/op/BroadcastToTest.cpp @@ -20,6 +20,14 @@ class BroadcastToTest : public MNNTestCase { virtual ~BroadcastToTest() = default; virtual bool run(int precision) { + bool resultNCHW = 
testDimensionformat(NCHW, precision); + bool resultNHWC = testDimensionformat(NHWC, precision); + + return (resultNCHW && resultNHWC); + } + +private: + bool testDimensionformat(Dimensionformat dimensionFormat, int precision) { { const float tensorData[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; const int shapeData[] = {2, 3, 2, 2}; @@ -28,8 +36,8 @@ class BroadcastToTest : public MNNTestCase { 1.0, 2.0, 1.0, 2.0, 3.0, 4.0, 3.0, 4.0, 5.0, 6.0, 5.0, 6.0, }; - auto tensor = _Const(tensorData, {1, 3, 1, 2}, NHWC, halide_type_of()); - auto shape = _Const(shapeData, {4}, NHWC, halide_type_of()); + auto tensor = _Const(tensorData, {1, 3, 1, 2}, dimensionFormat, halide_type_of()); + auto shape = _Const(shapeData, {4}, dimensionFormat, halide_type_of()); auto result = _BroadcastTo(tensor, shape); const int size = result->getInfo()->size; @@ -51,8 +59,8 @@ class BroadcastToTest : public MNNTestCase { const int shapeData[] = {3, 3}; const float expectedData[] = {1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0}; - auto tensor = _Const(tensorData, {1, 3}, NHWC, halide_type_of()); - auto shape = _Const(shapeData, {2}, NHWC, halide_type_of()); + auto tensor = _Const(tensorData, {1, 3}, dimensionFormat, halide_type_of()); + auto shape = _Const(shapeData, {2}, dimensionFormat, halide_type_of()); auto result = _BroadcastTo(tensor, shape); const int size = result->getInfo()->size; @@ -74,8 +82,8 @@ class BroadcastToTest : public MNNTestCase { const int shapeData[] = {3, 3}; const float expectedData[] = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0}; - auto tensor = _Const(tensorData, {3, 1}, NHWC, halide_type_of()); - auto shape = _Const(shapeData, {2}, NHWC, halide_type_of()); + auto tensor = _Const(tensorData, {3, 1}, dimensionFormat, halide_type_of()); + auto shape = _Const(shapeData, {2}, dimensionFormat, halide_type_of()); auto result = _BroadcastTo(tensor, shape); const int size = result->getInfo()->size; @@ -98,8 +106,8 @@ class BroadcastToTest : public MNNTestCase { const float expectedData[] = {1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0}; - auto tensor = _Const(tensorData, {1, 1, 1, 2}, NHWC, halide_type_of()); - auto shape = _Const(shapeData, {4}, NHWC, halide_type_of()); + auto tensor = _Const(tensorData, {1, 1, 1, 2}, dimensionFormat, halide_type_of()); + auto shape = _Const(shapeData, {4}, dimensionFormat, halide_type_of()); auto result = _BroadcastTo(tensor, shape); const int size = result->getInfo()->size; @@ -122,8 +130,8 @@ class BroadcastToTest : public MNNTestCase { const float expectedData[] = {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0}; - auto tensor = _Const(tensorData, {1, 3, 1, 1}, NHWC, halide_type_of()); - auto shape = _Const(shapeData, {4}, NHWC, halide_type_of()); + auto tensor = _Const(tensorData, {1, 3, 1, 1}, dimensionFormat, halide_type_of()); + auto shape = _Const(shapeData, {4}, dimensionFormat, halide_type_of()); auto result = _BroadcastTo(tensor, shape); const int size = result->getInfo()->size; @@ -145,8 +153,8 @@ class BroadcastToTest : public MNNTestCase { const int shapeData[] = {1, 1, 1, 1}; const float expectedData[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - auto tensor = _Const(tensorData, {1, 3, 1, 2}, NHWC, halide_type_of()); - auto shape = _Const(shapeData, {4}, NHWC, halide_type_of()); + auto tensor = _Const(tensorData, {1, 3, 1, 2}, dimensionFormat, halide_type_of()); + auto shape = _Const(shapeData, {4}, dimensionFormat, 
halide_type_of()); auto result = _BroadcastTo(tensor, shape); const int size = result->getInfo()->size; @@ -169,8 +177,8 @@ class BroadcastToTest : public MNNTestCase { const float expectedData[] = {1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0}; - auto tensor = _Const(tensorData, {3, 1}, NHWC, halide_type_of()); - auto shape = _Const(shapeData, {3}, NHWC, halide_type_of()); + auto tensor = _Const(tensorData, {3, 1}, dimensionFormat, halide_type_of()); + auto shape = _Const(shapeData, {3}, dimensionFormat, halide_type_of()); auto result = _BroadcastTo(tensor, shape); const int size = result->getInfo()->size; @@ -194,15 +202,23 @@ class BinaryBroadcastTest : public MNNTestCase { virtual ~BinaryBroadcastTest() = default; virtual bool run(int precision) { - auto X = _Input({2, 5, 2}, NHWC, halide_type_of()); + bool resultNCHW = testDimensionFormat(NCHW, precision); + bool resultNHWC = testDimensionFormat(NHWC, precision); + + return (resultNCHW && resultNHWC); + } + +private: + bool testDimensionFormat(Dimensionformat dimensionFormat, int precision) { + auto X = _Input({2, 5, 2}, dimensionFormat, halide_type_of()); X->setName("X"); - auto y0 = _Input({}, NHWC, halide_type_of()); + auto y0 = _Input({}, dimensionFormat, halide_type_of()); y0->writeMap()[0] = 1.0f; - auto y1 = _Input({1, 1, 2}, NHWC, halide_type_of()); + auto y1 = _Input({1, 1, 2}, dimensionFormat, halide_type_of()); y1->writeMap()[0] = 1.0f; y1->writeMap()[1] = 2.0f; - auto y2 = _Input({2, 1, 2}, NHWC, halide_type_of()); + auto y2 = _Input({2, 1, 2}, dimensionFormat, halide_type_of()); y2->writeMap()[0] = 1.0f; y2->writeMap()[1] = 2.0f; y2->writeMap()[2] = 3.0f; @@ -232,7 +248,7 @@ class BinaryBroadcastTest : public MNNTestCase { std::shared_ptr module(Module::load(std::vector{"X"}, std::vector{"z0", "z1", "z2", "z3"}, bufferOutput, sizeOutput)); // First { - auto x0 = _Input({2, 1, 2}, NHWC, halide_type_of()); + auto x0 = _Input({2, 1, 2}, dimensionFormat, halide_type_of()); auto size = x0->getInfo()->size; auto ptr = x0->writeMap(); for (int i=0; i()); + auto x0 = _Input({2, 5, 2}, dimensionFormat, halide_type_of()); auto size = x0->getInfo()->size; auto ptr = x0->writeMap(); for (int i=0; i()); + auto x0 = _Input({2, 3, 2}, dimensionFormat, halide_type_of()); auto size = x0->getInfo()->size; auto ptr = x0->writeMap(); for (int i=0; i padMap = { {PadMode_CAFFE, CAFFE}, {PadMode_VALID, VALID}, {PadMode_SAME, SAME}}; @@ -361,29 +361,6 @@ class ConvolutionCommonTest : public MNNTestCase { auto floatData = (float)(data % 255) / 255.0f; inputData.push_back(floatData); } - - if (debug) { - std::vector printCache(inputData.size()); - for (int i = 0; i < inputData.size(); ++i) { - printCache[i] = FP32Converter[precision](inputData[i]); - } - MNN_PRINT("input:"); - formatMatrix(printCache.data(), {batch, ic, ih, iw}); - printCache.resize(weightData.size()); - for (int i = 0; i < weightData.size(); ++i) { - printCache[i] = FP32Converter[precision](weightData[i]); - } - MNN_PRINT("weight:"); - formatMatrix(printCache.data(), {oc, ic, kh, kw}); - printCache.resize(biasData.size()); - for (int i = 0; i < biasData.size(); ++i) { - printCache[i] = FP32Converter[precision](biasData[i]); - } - MNN_PRINT("bias:"); - formatMatrix(printCache.data(), {oc}); - - } - reference_conv2d(inputData, weightData, biasData, outputData, outputDataSeparateBias, batch, ic, oc, ih, iw, mode, pad_h, pad_w, kh, kw, stride, dilation, group, FP32Converter[precision]); if (outputData.size() == 0) { @@ -416,54 +393,65 @@ class ConvolutionCommonTest : 
public MNNTestCase { } } // Single Conv - auto output = _Conv(std::move(weightData), std::move(biasData), input, {ic, oc}, {kw, kh}, padMap[mode], - {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false, sparseAlgo, sparseBlockOC, mSparse); - - // difference below 0.5% relative error is considered correct. - auto outputPtr = output->readMap(); - - if (debug) { - MNN_PRINT("\ndata NCHW shape:"); - printDims(input->getInfo()->dim); - MNN_PRINT("\nweight OIHW shape:"); - printDims({oc, ic, kh, kw}); - MNN_PRINT("\noutput NCHW shape:"); - printDims(output->getInfo()->dim); - MNN_PRINT("\nexpected output:"); - formatMatrix(outputData.data(), output->getInfo()->dim); - MNN_PRINT("\nexpected output 2:"); - formatMatrix(outputDataSeparateBias.data(), output->getInfo()->dim); - MNN_PRINT("\nreal output:"); - formatMatrix(outputPtr, output->getInfo()->dim); - } - // when using low precision, im2col or strassen convolution error rate to reference value is about 1e-4, winograd has larger error rate. + std::vector> activations = { + {false, false}, + }; + if (testRelu) { + activations = { + {false, false}, + {true, false}, + {false, true} + }; + } float errorScale = precision <= MNN::BackendConfig::Precision_High ? 1 : 100; // winograd error in 16-bits is relatively large - if (!checkVectorByRelativeError(outputPtr, outputData.data(), outputDataSeparateBias.data(), outputData.size(), 0.001 * errorScale)) { - MNN_PRINT("precision:%d, expect:\t expect2:\t real:\t\n", precision); - for (int i = 0; i < outputData.size(); ++i) - { - MNN_PRINT("%f\t, %f\t, %f\n", outputData[i],outputDataSeparateBias[i], outputPtr[i]); + for (auto activation : activations) { + auto newWeight = weightData; + auto newBias = biasData; + auto toutputData = outputData; + auto toutputBias = outputDataSeparateBias; + float maxV = -10000.0f; + float minV = 10000.0f; + if (activation.first) { + for (auto& t : toutputData) { + maxV = ALIMAX(maxV, t); + minV = ALIMIN(minV, t); + t = ALIMAX(0.0f, t); + } + for (auto& t : toutputBias) { + maxV = ALIMAX(maxV, t); + minV = ALIMIN(minV, t); + t = ALIMAX(0.0f, t); + } } - MNN_ERROR("%s(%s) test failed!\n", test_op_name.c_str(), device_name.c_str()); - return false; - } + if (activation.second) { + for (auto& t : toutputData) { + t = ALIMAX(0.0f, t); + t = ALIMIN(6.0f, t); + } + for (auto& t : toutputBias) { + t = ALIMAX(0.0f, t); + t = ALIMIN(6.0f, t); + } + } + auto output = _Conv(std::move(newWeight), std::move(newBias), input, {ic, oc}, {kw, kh}, padMap[mode], + {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, activation.first, activation.second, sparseAlgo, sparseBlockOC, mSparse); + // difference below 0.5% relative error is considered correct. + auto outputPtr = output->readMap(); - if (mBenchSpeed) { - int oh = output->getInfo()->dim[2], ow = output->getInfo()->dim[3]; - input.fix(VARP::INPUT); - MNN::Timer _t; - const int LOOP = 20; - for (int i = 0; i < LOOP; ++i) { - input->writeMap(); - output->readMap(); + // when using low precision, im2col or strassen convolution error rate to reference value is about 1e-4, winograd has larger error rate. 
+ + if (!checkVectorByRelativeError(outputPtr, toutputData.data(), toutputBias.data(), toutputData.size(), 0.001 * errorScale)) { + MNN_PRINT("precision:%d, expect:\t expect2:\t real:\t\n", precision); + for (int i = 0; i < toutputData.size(); ++i) + { + MNN_PRINT("%f\t, %f\t, %f\n", toutputData[i],toutputBias[i], outputPtr[i]); + } + MNN_ERROR("%s(%s) test failed!\n", test_op_name.c_str(), device_name.c_str()); + return false; } - auto time = (float)_t.durationInUs() / 1000.0f; - MNN_PRINT("kernel=(%dx%d) input=(1x%dx%dx%d) output=(1x%dx%dx%d) stride=(%dx%d), avg time = %f\n", - kh, kw, ic, ih, iw, oc, oh, ow, stride, stride, 1.0 * time / LOOP); } - return true; } }; @@ -905,7 +893,7 @@ class DepthwiseConvolutionTest : public ConvolutionCommonTest { // depthwise <==> group == outputChannel bool succ = ConvolutionCommonTest().test( type, device_name, "DepthwiseConv2D", b, ic, oc, ish, isw, PadMode_CAFFE, - p, p, kh, kw, s, d, oc, precision); + p, p, kh, kw, s, d, oc, precision, MNN::SparseAlgo_RANDOM, 1, false, true); if (!succ) { MNN_ERROR( "Error for dw oc=%d, ic=%d, ih=%d, iw = %d, kw=%d,kh=%d,d=%d,s=%d,p=%d\n", oc, diff --git a/test/op/GridSample3DTest.cpp b/test/op/GridSample3DTest.cpp new file mode 100644 index 000000000..0b96dc494 --- /dev/null +++ b/test/op/GridSample3DTest.cpp @@ -0,0 +1,241 @@ +// +// GridSampler3DTest.cpp +// MNNTests +// +// Created by MNN on 2021/03/11. +// Copyright © 2018, Alibaba Group Holding Limited +// +#include +#include +#include + +#include +#include +#include +#include "MNNTestSuite.h" +#include "TestUtils.h" + +using namespace MNN::Express; + +static float getPosition(float x, int range, bool alignCorners, GridSamplePaddingMode paddingMode) { + if (paddingMode == GRID_SAMPLE_PADDING_REFLECTION) { + // if x is on the left side of -1.0, move it to the right side of 1.0 + if (x < -1.0f) { + x = (x + ::ceil(1 - x) * 4); + } + // reflect + if (x > 1.0f) { + float l = (x - 1.0f); + int reflectionNum = ::floor(l / 2.0); + float offset = (l - reflectionNum * 2.0f); + x = (reflectionNum % 2 == 0) ? (1 - offset) : (-1.0f + offset); + } + } + + float a = alignCorners ? 1.0f : 0.0f; + float b = alignCorners ? 
0.0f : 1.0f; + return (((1 + x) * (range - a) - b) / 2.0f); +} + +static int CLAMP(int v, int min, int max) { + if ((v) < min) { + (v) = min; + } else if ((v) > max) { + (v) = max; + } + return v; +} + +static float sample(int d, int h, int w, const float *buffer, int depth, int height, int width, GridSamplePaddingMode paddingMode) { + if (h < 0 || h >= height || w < 0 || w >= width || d < 0 || d >= depth) { + if (paddingMode == GRID_SAMPLE_PADDING_ZEROS) { + return 0.0f; + } + // Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER + // For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1), + // the leftover reflections degrade to GridSamplePaddingMode_BORDER + h = CLAMP(h, 0, height-1); + w = CLAMP(w, 0, width-1); + d = CLAMP(d, 0, depth-1); + } + + return buffer[d * height * width + h * width + w]; +} + +static float interpolate(float d, float h, float w, const float *buffer, int depth, int height, int width, InterpolationMethod mode, + GridSamplePaddingMode paddingMode) { + if (mode == NEAREST) { + int nh = ::floor(h+0.5f); + int nw = ::floor(w+0.5f); + int nd = ::floor(d+0.5f); + return sample(nd, nh, nw, buffer, depth, height, width, paddingMode); + } + + // mode == GridSampleMode_BILINEAR + int d0 = ::floor(d); + int d1 = ::ceil(d); + int h0 = ::floor(h); + int h1 = ::ceil(h); + int w0 = ::floor(w); + int w1 = ::ceil(w); + float fx2 = w - w0; + float fx1 = 1.0f - fx2; + float fy2 = h - h0; + float fy1 = 1.0f - fy2; + float fz2 = d - d0; + float fz1 = 1.0f - fz2; + + float i000 = sample(d0, h0, w0, buffer, depth, height, width, paddingMode); + float i001 = sample(d0, h0, w1, buffer, depth, height, width, paddingMode); + float i010 = sample(d0, h1, w0, buffer, depth, height, width, paddingMode); + float i011 = sample(d0, h1, w1, buffer, depth, height, width, paddingMode); + float i100 = sample(d1, h0, w0, buffer, depth, height, width, paddingMode); + float i101 = sample(d1, h0, w1, buffer, depth, height, width, paddingMode); + float i110 = sample(d1, h1, w0, buffer, depth, height, width, paddingMode); + float i111 = sample(d1, h1, w1, buffer, depth, height, width, paddingMode); + + float i00 = ((i000) * fx1 + (i001) * fx2); + float i01 = ((i010) * fx1 + (i011) * fx2); + float i10 = ((i100) * fx1 + (i101) * fx2); + float i11 = ((i110) * fx1 + (i111) * fx2); + + float i0 = i00 * fy1 + i01 * fy2; + float i1 = i10 * fy1 + i11 * fy2; + + return ((i0 * fz1) + (i1 * fz2)); +} + +static void reference_grid_sample(const float *inputPtr, const float *gridPtr, std::vector &output, + int batch, int inDepth, int inHeight, int inWidth, int outDepth, int outHeight, int outWidth, int channel, + InterpolationMethod mode, GridSamplePaddingMode paddingMode, bool alignCorners) { + output.resize(batch * outHeight * outWidth * channel * outDepth); + + float *outputPtr = output.data(); + for (auto b = 0; b < batch; ++b) { + const float *_inputPtr = inputPtr + b * inDepth * inHeight * inWidth * channel; + const float *_gridPtr = gridPtr + b * outDepth * outHeight * outWidth * 3; + float *_outputPtr = outputPtr + b * outDepth * outHeight * outWidth * channel; + + for (auto c = 0; c < channel; ++c) { + auto __inputPtr = _inputPtr + c * inDepth * inHeight * inWidth; + auto __outputPtr = _outputPtr + c * outDepth * outHeight * outWidth; + for (int d = 0; d < outDepth; ++d) { + for (auto h = 0; h < outHeight; ++h) { + auto __gridPtr = _gridPtr + (d * outWidth * outHeight + h * outWidth) * 3; + auto ___outputPtr = __outputPtr + d * outHeight * outWidth + h * 
outWidth; + + for (auto w = 0; w < outWidth; ++w) { + auto x = getPosition(__gridPtr[3 * w + 0], inWidth, alignCorners, paddingMode); + auto y = getPosition(__gridPtr[3 * w + 1], inHeight, alignCorners, paddingMode); + auto z = getPosition(__gridPtr[3 * w + 2], inDepth, alignCorners, paddingMode); + + ___outputPtr[w] = interpolate(z, y, x, __inputPtr, inDepth, inHeight, inWidth, mode, paddingMode); + } + } + } + } + } +} + + +class GridSample3DTest : public MNNTestCase { +public: + virtual ~GridSample3DTest() = default; + + virtual bool run(int precision) { + auto type = getCurrentType(); + + const std::vector> configs({ + {1, 3, 5, 10, 5, 10, 3, 5}, + {1, 62, 6, 10, 12, 20, 1, 2}, + {2, 64, 12, 20, 6, 6, 5, 1}, + {1, 3, 384, 640, 384, 640, 2, 2}, + }); + + for (auto config : configs) { + const int batch = config[0]; + const int depth = config[1]; + const int inHeight = config[2]; + const int inWidth = config[3]; + const int outHeight = config[4]; + const int outWidth = config[5]; + const int inDepth = config[6]; + const int outDepth = config[7]; + + std::vector originInputData(batch * depth * inHeight * inWidth * inDepth); + std::vector originGridData(batch * outHeight * outWidth * outDepth * 3); + + auto inputPtr = originInputData.data(); + auto gridPtr = originGridData.data(); + + std::random_device rd{}; + std::mt19937 gen{rd()}; + gen.seed(1024); + std::normal_distribution<> inputDist{0.0f, 1.0}; + std::normal_distribution<> gridDist{0.0f, 3.0f / outWidth}; + + for (int i = 0; i < batch * inHeight * inWidth * inDepth * depth; i++) { + inputPtr[i] = inputDist(gen); + } + for (int b = 0; b < batch; b++) { + for (int d=0; dwriteMap(), inputPtr, originInputData.size() * sizeof(float)); + ::memcpy(grid->writeMap(), gridPtr, originGridData.size() * sizeof(float)); + input = _Convert(input, NC4HW4); + + std::vector modes({BILINEAR}); + std::vector paddingModes({GRID_SAMPLE_PADDING_ZEROS, GRID_SAMPLE_PADDING_BORDER}); + std::vector alignCornersVec = {1, 0}; + std::vector expectedOutput; + for (auto mode : modes) { + for (auto paddingMode : paddingModes) { + for (auto alignCorners : alignCornersVec) { + reference_grid_sample(inputPtr, gridPtr, expectedOutput, + batch, inDepth, inHeight, inWidth, outDepth, outHeight, outWidth, depth, + mode, paddingMode, alignCorners); + auto expectedOutPtr = expectedOutput.data(); + + grid->unMap(); + input->unMap(); + + auto output = _GridSample(input, grid, mode, paddingMode, alignCorners); + output = _Convert(output, NCHW); + auto outputPtr = output->readMap(); +// MNN_PRINT("GridSamplerTest, mode: %d, pad: %d, align: %d\n", mode, paddingMode, alignCorners); + + if (mode == NEAREST) { + if (!checkVector(outputPtr, expectedOutPtr, expectedOutput.size(), 0.01)) { + MNN_ERROR("GridSampleTest NEAREST test %d-%d-%d-%d-%d failed pad mode: %d, align: %d!\n", config[0], config[1], config[2], config[3], config[4], paddingMode, alignCorners); + return false; + } + } else { + if (!checkVector(outputPtr, expectedOutPtr, expectedOutput.size(), 0.01)) { + MNN_ERROR("GridSampleTest BILINEAR test %d-%d-%d-%d-%d failed: pad mode: %d, align: %d!\n", config[0], config[1], config[2], config[3], config[4], paddingMode, alignCorners); + return false; + } + } + } + } + } + } + + return true; + } +}; + +MNNTestSuiteRegister(GridSample3DTest, "op/GridSample3D"); diff --git a/test/op/GridSampleTest.cpp b/test/op/GridSampleTest.cpp index 77fffdc25..d945969f4 100644 --- a/test/op/GridSampleTest.cpp +++ b/test/op/GridSampleTest.cpp @@ -1,5 +1,5 @@ // -// CropAndResizeTest.cpp +// 
GridSamplerTest.cpp // MNNTests // // Created by MNN on 2021/03/11. @@ -149,6 +149,7 @@ class GridSampleTest : public MNNTestCase { std::random_device rd{}; std::mt19937 gen{rd()}; + gen.seed(1024); std::normal_distribution<> inputDist{0.0f, 1.0}; std::normal_distribution<> gridDist{0.0f, 3.0f / outWidth}; @@ -172,7 +173,7 @@ class GridSampleTest : public MNNTestCase { input = _Convert(input, NC4HW4); std::vector modes({BILINEAR}); - std::vector paddingModes({GRID_SAMPLE_PADDING_ZEROS}); + std::vector paddingModes({GRID_SAMPLE_PADDING_ZEROS, GRID_SAMPLE_PADDING_BORDER}); std::vector alignCornersVec = {1, 0}; std::vector expectedOutput(batch * outHeight * outWidth * depth); for (auto mode : modes) { diff --git a/test/op/RasterTest.cpp b/test/op/RasterTest.cpp index 2dd10eb73..72b8baf7a 100644 --- a/test/op/RasterTest.cpp +++ b/test/op/RasterTest.cpp @@ -8,7 +8,6 @@ #include #include -#include "RuntimeAttr.hpp" #include "MNNTestSuite.h" #include "TestUtils.h" @@ -213,8 +212,8 @@ class ReduceBlitTest : public MNNTestCase { } virtual bool run(int precision) { // TODO: Other Backend Support Reduce Blit - auto attr = ExecutorScope::Current()->getAttr(); - if (attr->firstType != MNN_FORWARD_CPU) { + auto type = getCurrentType(); + if (type != MNN_FORWARD_CPU) { MNN_ERROR("Currently only cpu backend support reduce blit\n"); return true; } diff --git a/test/speed/HybridConvSpeedTest.cpp b/test/speed/HybridConvSpeedTest.cpp index 42968330d..fc7a56868 100644 --- a/test/speed/HybridConvSpeedTest.cpp +++ b/test/speed/HybridConvSpeedTest.cpp @@ -17,7 +17,7 @@ using namespace MNN; class HybridConvSpeedTestCommon : public MNNTestCase { protected: - static bool testKernel(std::string title, INTS inputShape, INTS kernel, INTS channel, INTS pad, INTS strides, INTS dilate, int batch = 1, int nbit = 8, int precision = 1, bool testSpeed = false) { + static bool testKernel(std::string title, INTS inputShape, INTS kernel, INTS channel, INTS pad, INTS strides, INTS dilate, int batch = 1, int nbit = 8, int precision = 1, bool testSpeed = false, int block = 0) { float fac = 1.23; int res = 10; float tail = 0.2; @@ -25,8 +25,12 @@ class HybridConvSpeedTestCommon : public MNNTestCase { int iw = inputShape[0], ih = inputShape[1]; std::vector bias(oc), biastest(oc), biasdup(oc); int area = kernel[0] * kernel[1]; + if (0 == block || ic % block != 0 || area > 1) { + block = area * ic; + } + int group = (area * ic) / block; std::vector weightFp32(oc * ic * area); - std::vector wScale(oc); + std::vector wScale(oc * group); float threshold = (float)(1 << (nbit - 1)) - 1.0f; float clampMin = -threshold; @@ -48,14 +52,23 @@ class HybridConvSpeedTestCommon : public MNNTestCase { ::memcpy(biastest.data(), bias.data(), oc * sizeof(float)); ::memcpy(biasdup.data(), bias.data(), oc * sizeof(float)); int kernel_size = ic * area; + auto newWeightFp32 = weightFp32; for (int k = 0; k < oc; ++k) { int beginIndex = k * kernel_size; - auto absMax = findAbsMax(weightFp32.data() + beginIndex, kernel_size); - wScale[k] = absMax / threshold; + for (int v=0; vgetInfo(); auto ow = yInfo->dim[3], oh = yInfo->dim[2]; #if defined (__aarch64__) && (precision == 2) @@ -67,18 +80,22 @@ class HybridConvSpeedTestCommon : public MNNTestCase { yfp32 = _Convert(yfp32, NCHW); auto yPtr = y->readMap(); auto tgPtr = yfp32->readMap(); - auto elesize = batch * oc * oh * ow; + auto elesize = yfp32->getInfo()->size; float limit = 0.1f; + bool correct = true; + float maxValue = 0.001f; + for (int i = 0; i < elesize; ++i) { + maxValue = fmaxf(maxValue, 
+        }
+
         for (int i = 0; i < elesize; ++i) {
             float targetValue = tgPtr[i], computeResult = yPtr[i];
             float diff = targetValue - computeResult;
-            float ratio = fabsf(diff) / fmax(targetValue, computeResult);
-            if (targetValue != 0 && computeResult != 0 && ratio > limit) {
-                MNN_PRINT("%d result Error ratio=%f: right=%f, error=%f\n", i, ratio, targetValue, computeResult);
-                return false;
-            } else if ((targetValue == 0 || computeResult == 0) && fabsf(diff) > limit) {
+            float ratio = fabsf(diff) / maxValue;
+            if (ratio > limit) {
                 MNN_PRINT("%d result Error ratio=%f: right=%f, error=%f\n", i, ratio, targetValue, computeResult);
-                return false;
+                correct = false;
+                break;
             }
         }
         if (testSpeed) {
@@ -93,8 +110,7 @@ class HybridConvSpeedTestCommon : public MNNTestCase {
             MNN_PRINT("%s input=(%dx%dx%dx%d) output=(%dx%dx%dx%d) avg time = %f\n", title.c_str(), batch, ic, 1, 1, batch, oc, 1, 1, 1.0 * time / LOOP);
         }
-
-        return true;
+        return correct;
     }
 };
 
@@ -102,30 +118,36 @@ class HybridConvSpeedInt8Test : public HybridConvSpeedTestCommon {
 public:
     virtual bool run(int precision) {
         INTS strides = {1, 1}, dilate = {1, 1}, pad = {0, 0}, inputShape = {1, 1}; // {w, h}
-        INTS channel0 = {2048, 512}; // {ic, co}
+        INTS channel0 = {4096, 4096}; // {ic, co}
         INTS channel1 = {1496, 256};
-        int batch[2] = {23, 13};
+        int batch[3] = {23, 13, 1};
+        std::vector<int> blocks = {0, 32, 128};
+        std::vector<int> kernels = {1, 1};
         std::vector<int> weightBits = {8, 4};
         bool lowmemory = true;
-        for (auto& bits : weightBits) {
-            MNN_PRINT("Test for %d bits\n", bits);
-            for (int n = 0; n < 2; ++n) {
-                auto res = testKernel("Low memory HybridConv test:", inputShape, kernels, channel0, pad, strides, dilate, batch[n], bits, precision, true);
-                if (!res) {
-                    MNN_ERROR("Error: low memory hybridConv when n=%d, ic=%d, oc=%d\n", batch[n], channel0[0], channel0[1]);
-                    return false;
+        int batchNum = sizeof(batch) / sizeof(int);
+        bool correct = true;
+        for (auto block : blocks) {
+            for (auto& bits : weightBits) {
+                MNN_PRINT("Test for %d bits, block=%d\n", bits, block);
+                for (int n = 0; n < batchNum; ++n) {
+                    auto res = testKernel("Low memory HybridConv test:", inputShape, kernels, channel0, pad, strides, dilate, batch[n], bits, precision, true, block);
+                    if (!res) {
+                        MNN_ERROR("Error: low memory hybridConv when n=%d, ic=%d, oc=%d\n", batch[n], channel0[0], channel0[1]);
+                        correct = false;
+                    }
                 }
-            }
-            for (int n = 0; n < 2; ++n) {
-                auto res = testKernel("Low memory HybridConv test:", inputShape, kernels, channel1, pad, strides, dilate, batch[n], bits, precision, true);
-                if (!res) {
-                    MNN_ERROR("Error: low memory hybridConv when n=%d, ic=%d, oc=%d\n", batch[n], channel1[0], channel1[1]);
-                    return false;
+                for (int n = 0; n < batchNum; ++n) {
+                    auto res = testKernel("Low memory HybridConv test:", inputShape, kernels, channel1, pad, strides, dilate, batch[n], bits, precision, true, block);
+                    if (!res) {
+                        MNN_ERROR("Error: low memory hybridConv when n=%d, ic=%d, oc=%d\n", batch[n], channel1[0], channel1[1]);
+                        correct = false;
+                    }
                 }
             }
         }
-        return true;
+        return correct;
     }
 };
 
@@ -170,20 +192,28 @@ class HybridConvInt8Test : public HybridConvSpeedTestCommon {
 class DenseConvInt8Test : public HybridConvSpeedTestCommon {
 public:
     virtual bool run(int precision) {
-        std::vector< std::vector<int>> channels = {{4, 256}, {2048, 256}, {1, 8}, {7, 9}};
+        std::vector< std::vector<int>> channels = {{4, 256}, {512, 128}, {1, 8}, {7, 9}};
         INTS strides = {1, 1}, dilate = {1, 3}, pad = {0, 3}, inputShape = {1, 2640}; // {w, h}
-        int batch[2] = {1, 13};
-        std::vector<int> kernels = {1, 3};
-        std::vector<int> weightBits = {8};
+        std::vector<int> batch = {1, 13};
+        std::vector<std::vector<int>> kernels = {{1, 1}, {1, 3}};
+        std::vector<int> weightBits = {4, 8};
         bool lowmemory = true;
         int n = 0;
         for (auto& bits : weightBits) {
-            for (int n = 0; n < 2; ++n) {
+            for (int n = 0; n < batch.size(); ++n) {
                 for (int i = 0; i < channels.size(); ++i) {
-                    auto res = testKernel("Low memory ConvInt8 with 1x3 kernel test:", inputShape, kernels, channels[i], pad, strides, dilate, batch[n], bits, precision);
-                    if (!res) {
-                        MNN_ERROR("Error: low memory ConvInt8 with 1x3 kernel when n=%d, ic=%d, oc=%d\n", batch[n], channels[i][0], channels[i][1]);
-                        return false;
+                    for (auto kernel : kernels) {
+                        std::vector<int> blocks = {0};
+                        if (kernel[0] == 1 && kernel[1] == 1) {
+                            blocks = {0, 32};
+                        }
+                        for (auto block : blocks) {
+                            auto res = testKernel("Low memory ConvInt8 with kernel test:", inputShape, kernel, channels[i], pad, strides, dilate, batch[n], bits, precision, false, block);
+                            if (!res) {
+                                MNN_ERROR("Error: low memory ConvInt8 with %dx%d kernel when n=%d, ic=%d, oc=%d, block=%d\n", kernel[0], kernel[1], batch[n], channels[i][0], channels[i][1], block);
+                                return false;
+                            }
+                        }
                     }
                 }
             }
         }
diff --git a/tools/converter/source/optimizer/onnxextra/OnnxRandomUniform.cpp b/tools/converter/source/optimizer/onnxextra/OnnxRandomUniform.cpp
index ed68db4d6..7ed348dc8 100644
--- a/tools/converter/source/optimizer/onnxextra/OnnxRandomUniform.cpp
+++ b/tools/converter/source/optimizer/onnxextra/OnnxRandomUniform.cpp
@@ -16,6 +16,11 @@ namespace Express {
 class OnnxRandomUniformTransform : public OnnxExtraManager::Transform {
 public:
     virtual EXPRP onExecute(EXPRP expr) const override {
+        static bool gInit = false;
+        if (!gInit) {
+            MNN_PRINT("The model has random OP: %s, can't check result with onnxruntime\n", expr->name().c_str());
+            gInit = true;
+        }
         auto op = expr->get();
         auto info = op->main_as_Extra();
         std::unique_ptr<OpT> randomUniform(new OpT);
diff --git a/tools/cpp/GetMNNInfo.cpp b/tools/cpp/GetMNNInfo.cpp
index 039ab9f79..376f265fb 100644
--- a/tools/cpp/GetMNNInfo.cpp
+++ b/tools/cpp/GetMNNInfo.cpp
@@ -105,6 +105,9 @@ int main(int argc, char *argv[]) {
     } else {
         MNN_PRINT("Model Version: %s \n", info->version.c_str());
     }
+    if (!info->bizCode.empty()) {
+        MNN_PRINT("Model bizCode: %s\n", info->bizCode.c_str());
+    }
     return 0;
 }
diff --git a/tools/cv/include/cv/imgproc/structural.hpp b/tools/cv/include/cv/imgproc/structural.hpp
index faf4ff7a3..470ba5a21 100644
--- a/tools/cv/include/cv/imgproc/structural.hpp
+++ b/tools/cv/include/cv/imgproc/structural.hpp
@@ -10,7 +10,7 @@
 #define STRUCTURAL_HPP
 
 #include
-#include "cv/types.hpp"
+#include "../types.hpp"
 
 namespace MNN {
 namespace CV {
diff --git a/tools/cv/include/cv/types.hpp b/tools/cv/include/cv/types.hpp
index 9cc4aee7a..e83c6ba38 100644
--- a/tools/cv/include/cv/types.hpp
+++ b/tools/cv/include/cv/types.hpp
@@ -407,6 +407,9 @@ typedef Scalar_<double> Scalar;
 
 static void getVARPSize(VARP var, int* height, int* width, int* channel) {
     auto info = var->getInfo();
+    if (!info) {
+        return;
+    }
     auto dims = info->dim;
     int num = dims.size();
     if (num < 2) return;
diff --git a/tools/cv/source/imgproc/filter.cpp b/tools/cv/source/imgproc/filter.cpp
index 81b5a32fb..b63b830b7 100644
--- a/tools/cv/source/imgproc/filter.cpp
+++ b/tools/cv/source/imgproc/filter.cpp
@@ -46,9 +46,7 @@ static VARP formatOutput(VARP src, halide_type_t type) {
     if (channel == 1) {
         squeeze_dims.push_back(-1);
     }
-    if (!squeeze_dims.empty()) {
-        src = _Squeeze(src, squeeze_dims);
-    }
+    src = _Squeeze(src, squeeze_dims);
     if (type == halide_type_of()) {
         src = _Minimum(src, _Scalar(255));
         src = _Maximum(src, _Scalar(0));
diff --git a/transformers/llm/engine/include/llm/llm.hpp b/transformers/llm/engine/include/llm/llm.hpp
index 5337afc3c..951a1578e 100644
--- a/transformers/llm/engine/include/llm/llm.hpp
+++ b/transformers/llm/engine/include/llm/llm.hpp
@@ -29,24 +29,6 @@ class Tokenizer;
 class Pipeline;
 class LlmConfig;
 
-// Llm start
-// llm stream buffer with callback
-class MNN_PUBLIC LlmStreamBuffer : public std::streambuf {
-public:
-    using CallBack = std::function<void(const char*, size_t)>;;
-    LlmStreamBuffer(CallBack callback) : callback_(callback) {}
-
-protected:
-    virtual std::streamsize xsputn(const char* s, std::streamsize n) override {
-        if (callback_) {
-            callback_(s, n);
-        }
-        return n;
-    }
-
-private:
-    CallBack callback_ = nullptr;
-};
 class MNN_PUBLIC Llm {
     using PromptItem = std::pair<std::string, std::string>; // <role, content>
 public:
diff --git a/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.mm b/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.mm
index 4d05379a4..4561338f8 100644
--- a/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.mm
+++ b/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.mm
@@ -4,7 +4,7 @@
 //
 //  Created by wangzhaode on 2023/12/14.
 //
-
+#include
 #import "LLMInferenceEngineWrapper.h"
 #include
 using namespace MNN::Transformer;
@@ -44,7 +44,23 @@ - (BOOL)loadModel {
     }
     return YES;
 }
+// Llm start
+// llm stream buffer with callback
+class LlmStreamBuffer : public std::streambuf {
+public:
+    using CallBack = std::function<void(const char*, size_t)>;;
+    LlmStreamBuffer(CallBack callback) : callback_(callback) {}
+protected:
+    virtual std::streamsize xsputn(const char* s, std::streamsize n) override {
+        if (callback_) {
+            callback_(s, n);
+        }
+        return n;
+    }
+private:
+    CallBack callback_ = nullptr;
+};
 - (void)processInput:(NSString *)input withStreamHandler:(StreamOutputHandler)handler {
     LlmStreamBuffer::CallBack callback = [handler](const char* str, size_t len) {
         if (handler) {
diff --git a/transformers/llm/engine/src/llm.cpp b/transformers/llm/engine/src/llm.cpp
index 991c6c7ef..8b836595f 100644
--- a/transformers/llm/engine/src/llm.cpp
+++ b/transformers/llm/engine/src/llm.cpp
@@ -193,8 +193,13 @@ size_t Llm::apply_lora(const std::string& lora_path) {
     module_config.rearrange = true;
     module_config.base = modules_.begin()->get();
     size_t lora_index = modules_.size();
-    modules_.emplace_back(Module::load({"input_ids", "attention_mask", "position_ids", "past_key_values"},
-                                       {"logits", "presents"}, model_path.c_str(), runtime_manager_, &module_config));
+    if (attention_fused_) {
+        modules_.emplace_back(Module::load({"input_ids", "attention_mask", "position_ids"},
+                                           {"logits"}, model_path.c_str(), runtime_manager_, &module_config));
+    } else {
+        modules_.emplace_back(Module::load({"input_ids", "attention_mask", "position_ids", "past_key_values"},
+                                           {"logits", "presents"}, model_path.c_str(), runtime_manager_, &module_config));
+    }
     select_module(lora_index);
     return lora_index;
 }
@@ -244,6 +249,7 @@ void Llm::trace(bool start) {
     for (auto& m : decode_modules_) {
         m->traceOrOptimize(status);
     }
+    runtime_manager_->updateCache();
     mTracing = start;
 }
 
diff --git a/transformers/llm/export/llmexport.py b/transformers/llm/export/llmexport.py
index dacf40d04..904128bb5 100644
--- a/transformers/llm/export/llmexport.py
+++ b/transformers/llm/export/llmexport.py
@@ -495,6 +495,9 @@ def quant(self, weight, quant_bit, quant_block):
             block_size = ic
         else:
             block_size = quant_block
+        if ic % block_size != 0:
+            block_size = ic
+            print('Skip block quant for ic=', ic, ', quant_block:', quant_block)
         block_num = ic // block_size
         weight = weight.reshape(oc, block_num, block_size)
         max_val = np.max(weight, axis=-1, keepdims=True)
@@ -626,7 +629,7 @@ def rebuild_op(self, op, graph):
                 "quanParameter": {
                     "quantScale": 1.0, "scaleIn": 0.0, "scaleOut": 0.0,
                     "useInt32": False, "has_scaleInt": False, "shapeInt32": shape_int32,
-                    "type": 1, "aMax": 0, "aMin": q_min, "readType": oc * (ic // self.quant_block), "weightSize": 0
+                    "type": 1, "aMax": 0, "aMin": q_min, "readType": -1, "weightSize": 0
                 },
                 "external": external
             },
@@ -1864,7 +1867,7 @@ def main():
     parser.add_argument('--export', type=str, default=None, help='export model to an onnx/mnn model.')
     parser.add_argument('--skip_slim', action='store_true', help='Whether or not to skip onnx-slim.')
    parser.add_argument('--quant_bit', type=int, default=4, help='mnn quant bit, 4 or 8, default is 4.')
-    parser.add_argument('--quant_block', type=int, default=128, help='mnn quant block, default is 0 mean channle-wise.')
+    parser.add_argument('--quant_block', type=int, default=0, help='mnn quant block, default is 0 mean channle-wise.')
     parser.add_argument('--lm_quant_bit', type=int, default=None, help='mnn lm_head quant bit, 4 or 8, default is `quant_bit`.')
     parser.add_argument('--mnnconvert', type=str, default='../../../build/MNNConvert', help='local mnnconvert path, if invalid, using pymnn.')
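The llmexport.py hunk above falls back to channel-wise scales whenever ic is not divisible by the requested quant_block, and the HybridConvSpeedTest hunk derives one absMax-based scale per (output channel, block) instead of per channel. The sketch below is a minimal, self-contained reimplementation of that block-wise idea, assuming a symmetric absMax quantizer rather than the exporter's min/max scheme; the name quant_weight_blockwise and the exact rounding are illustrative, not part of the patch.

import numpy as np

def quant_weight_blockwise(weight, quant_bit=4, quant_block=0):
    # Hypothetical helper (not from llmexport.py): symmetric block-wise
    # quantization with one scale per (output channel, block).
    oc, ic = weight.shape
    block_size = ic if quant_block == 0 else quant_block
    if ic % block_size != 0:
        # same fallback as the patch: revert to channel-wise when ic is not divisible
        block_size = ic
    block_num = ic // block_size
    w = weight.reshape(oc, block_num, block_size)
    threshold = float((1 << (quant_bit - 1)) - 1)           # 7 for 4-bit, 127 for 8-bit
    abs_max = np.maximum(np.abs(w).max(axis=-1, keepdims=True), 1e-9)
    scale = abs_max / threshold                              # shape (oc, block_num, 1)
    q = np.clip(np.round(w / scale), -threshold, threshold).astype(np.int8)
    return q.reshape(oc, ic), scale.squeeze(-1), block_size

# quick round-trip check on random weights
w = np.random.randn(8, 96).astype(np.float32)
q, scale, bs = quant_weight_blockwise(w, quant_bit=4, quant_block=32)
dequant = (q.reshape(8, -1, bs) * scale[..., None]).reshape(8, 96)
print(bs, float(np.abs(dequant - w).max()))

Smaller blocks give each scale fewer weights to cover, which is why the tests above sweep block sizes {0, 32, 128} against the channel-wise baseline.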