diff --git a/CMakeLists.txt b/CMakeLists.txt index 7b940476e..deb46775e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,14 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_LIST_DIR}/cmake" ) + +if(WIN32) + if(NOT MSVC) + set(CMAKE_MSVC_RUNTIME_LIBRARY "") + set(MSVC_RUNTIME_LIBRARY "") + endif() +endif() + # build options option(MNN_USE_SYSTEM_LIB "For opencl and vulkan, use system lib or use dlopen" OFF) option(MNN_BUILD_HARD "Build -mfloat-abi=hard or not" OFF) @@ -198,7 +206,7 @@ option(MNN_METAL "Enable Metal" OFF) option(MNN_OPENCL "Enable OpenCL" OFF) option(MNN_OPENGL "Enable OpenGL" OFF) option(MNN_VULKAN "Enable Vulkan" OFF) -option(MNN_ARM82 "Enable ARM82" OFF) +option(MNN_ARM82 "Enable ARMv8.2's FP16 Compute" ON) option(MNN_ONEDNN "Enable oneDNN" OFF) option(MNN_AVX512 "Enable AVX512" OFF) option(MNN_CUDA "Enable CUDA" OFF) @@ -452,6 +460,12 @@ set(MNN_EXTRA_DEPENDS "") # Add Thread dependency find_package(Threads) list(APPEND MNN_EXTRA_DEPENDS ${CMAKE_THREAD_LIBS_INIT}) +if(WIN32) + if(NOT MSVC) + set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=lld-link -lmsvcrt") + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=lld-link -lmsvcrt") + endif() +endif() if (NOT APPLE) if(MNN_OPENMP) diff --git a/MNN.sln b/MNN.sln index 7610b9e73..e69de29bb 100644 --- a/MNN.sln +++ b/MNN.sln @@ -1,36 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 12.00 -# Visual Studio Version 17 -VisualStudioVersion = 17.5.002.0 -MinimumVisualStudioVersion = 10.0.40219.1 -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "3rd_party", "3rd_party", "{5CD18987-C4CA-49D5-942F-14B15F46B1ED}" -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "flatbuffers", "flatbuffers", "{89B04BB7-86CB-4D4F-B65C-C3D0995DBD36}" -EndProject -Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{797AC14A-1653-469D-A240-76EF0F36E60A}" -EndProject -Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "FlatBuffers.Test", "3rd_party\flatbuffers\tests\FlatBuffers.Test\FlatBuffers.Test.csproj", "{E5A80CC7-62B1-4887-B637-455F34CCC9B3}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - Debug|Any CPU = Debug|Any CPU - Release|Any CPU = Release|Any CPU - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Debug|Any CPU.ActiveCfg = Debug|Any CPU - {E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Debug|Any CPU.Build.0 = Debug|Any CPU - {E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Release|Any CPU.ActiveCfg = Release|Any CPU - {E5A80CC7-62B1-4887-B637-455F34CCC9B3}.Release|Any CPU.Build.0 = Release|Any CPU - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection - GlobalSection(NestedProjects) = preSolution - {89B04BB7-86CB-4D4F-B65C-C3D0995DBD36} = {5CD18987-C4CA-49D5-942F-14B15F46B1ED} - {797AC14A-1653-469D-A240-76EF0F36E60A} = {89B04BB7-86CB-4D4F-B65C-C3D0995DBD36} - {E5A80CC7-62B1-4887-B637-455F34CCC9B3} = {797AC14A-1653-469D-A240-76EF0F36E60A} - EndGlobalSection - GlobalSection(ExtensibilityGlobals) = postSolution - SolutionGuid = {11D826E1-518B-4BC2-8E45-03F5F48170D6} - EndGlobalSection -EndGlobal diff --git a/backupcode/cpubackend/arm/arm32/bf16/MNNConvRunForUnitDepthWise_BF16.S b/backupcode/cpubackend/arm/arm32/bf16/MNNConvRunForUnitDepthWise_BF16.S deleted file mode 100644 index 2312a004a..000000000 --- a/backupcode/cpubackend/arm/arm32/bf16/MNNConvRunForUnitDepthWise_BF16.S +++ 
/dev/null @@ -1,77 +0,0 @@ -// -// NEON_MNNConvRunForUnitDepthWise_BF16.S -// MNN -// -// Created by MNN on 2021/03/09. -// Copyright © 2018-2021 Alibaba Group Holding Limited -// - -#ifdef __arm__ -#ifndef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function NEON_MNNConvRunForUnitDepthWise_BF16 -//void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step) - -//Auto: r0:dst, r1:src, r2:weight, r3:fw - -push {r4-r8, lr} - -//Load from sp: -//r5:fh, r6:weight_y_step, r7:dilate_x_step, r8:dilate_y_step -mov r4, r3 -ldr r5, [sp, #24] -ldr r6, [sp, #28] -ldr r7, [sp, #32] -ldr r8, [sp, #36] - -cmp r4, #0 -vmov.i32 q0, #0 -beq UnitEnd -cmp r5, #0 -beq UnitEnd - -mov lr, #2 -mul r6, lr, r6 // x6(weight_y_step in byte) = sizeof(int16_t) * weight_y_step -mul r7, lr, r7 // x7(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step -mul r8, lr, r8 // x8(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step - -//dilate_y_step -> dilate_y_step - dilate_x_step*fw -mul lr, r4, r7 -sub r8, r8, lr - -//weight_y_step -> weight_y_step - 4*sizeof(float)*fw -mov lr, #8 -mul lr, r4, lr -sub r6, r6, lr - - -UnitLoopH: -mov lr, r4 -UnitLoopW: -vld1.16 {d2}, [r1], r7 -vld1.16 {d4}, [r2]! -vshll.s16 q1, d2, #16 -vshll.s16 q2, d4, #16 - -vmla.f32 q0, q1, q2 -subs lr, lr, #1 -bne UnitLoopW -subs r5, r5, #1 -add r1, r1, r8 -add r2, r2, r6 -bne UnitLoopH - - -UnitEnd: -vshrn.i32 d0, q0, #16 -vst1.16 {d0}, [r0] - -pop {r4-r8, pc} - -#endif -#endif diff --git a/backupcode/cpubackend/arm/arm64/bf16/MNNConvRunForUnitDepthWise_BF16.S b/backupcode/cpubackend/arm/arm64/bf16/MNNConvRunForUnitDepthWise_BF16.S deleted file mode 100644 index 75254f555..000000000 --- a/backupcode/cpubackend/arm/arm64/bf16/MNNConvRunForUnitDepthWise_BF16.S +++ /dev/null @@ -1,66 +0,0 @@ -// -// NEON_MNNConvRunForUnitDepthWise_BF16.S -// MNN -// -// Created by MNN on 2021/03/09. 
-// Copyright © 2018-2021 Alibaba Group Holding Limited -// - -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function NEON_MNNConvRunForUnitDepthWise_BF16 -//void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step) - -//Auto: x0:dst, x1:src, x2:weight, x3:fw -//x4:fh, x5:weight_y_step, x6:dilate_x_step, x7:dilate_y_step - -cmp x3, #0 -movi v0.4s, #0 -beq UnitEnd -cmp x4, #0 -beq UnitEnd - -mov x9, #2 -mul x5, x9, x5 // x5(weight_y_step in byte) = sizeof(int16_t) * weight_y_step -mul x6, x9, x6 // x6(dilate_x_step in byte) = sizeof(int16_t) * dilate_x_step -mul x7, x9, x7 // x7(dilate_y_step in byte) = sizeof(int16_t) * dilate_y_step - -//dilate_y_step -> dilate_y_step - dilate_x_step*fw -mul x9, x3, x6 -sub x7, x7, x9 // because x1 has already been auto-increased at 'ld1 {v1.4h}, [x1], x6', here we should rewind by x6*fw - -//weight_y_step -> weight_y_step - 4*sizeof(int16_t)*fw -mov x9, #8 -mul x9, x3, x9 -sub x5, x5, x9 - - -UnitLoopH: -mov x9, x3 -UnitLoopW: -ld1 {v1.4h}, [x1], x6 -ld1 {v2.4h}, [x2], #8 // 4 * sizeof(int16_t) -shll v1.4s, v1.4h, #16 -shll v2.4s, v2.4h, #16 - -fmla v0.4s, v1.4s, v2.4s -subs x9, x9, #1 -bne UnitLoopW -subs x4, x4, #1 -add x1, x1, x7 -add x2, x2, x5 -bne UnitLoopH - - -UnitEnd: -shrn v0.4h, v0.4s, #16 -st1 {v0.4h}, [x0] - -ret - -#endif diff --git a/backupcode/cpubackend/bf16/BF16Functions.cpp b/backupcode/cpubackend/bf16/BF16Functions.cpp index 3f792a3ce..f9986c438 100644 --- a/backupcode/cpubackend/bf16/BF16Functions.cpp +++ b/backupcode/cpubackend/bf16/BF16Functions.cpp @@ -76,23 +76,6 @@ static void _MNNLowpToFp32(const int16_t* src, float* dst, size_t size) { ::memcpy(dst, dstTemp, sizeRemain * sizeof(float)); } } -static void MNNConvRunForUnitDepthWiseBF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { - int fx, fy; - BFVec4 dstValue(0.0f); - const int16_t* src_z = (const int16_t*)src; - const int16_t* weight_z = (const int16_t*)weight; - for (fy = 0; fy < fh; ++fy) { - const auto src_y = src_z + fy * dilateY_step; - const auto weight_y = weight_z + fy * weight_y_step; - for (fx = 0; fx < fw; ++fx) { - const auto weight_x = weight_y + 4 * fx; - const auto src_x = src_y + fx * dilateX_step; - dstValue = dstValue + BFVec4::load(src_x) * BFVec4::load(weight_x); - } - } - BFVec4::save((int16_t*)dst, dstValue); -} static void MNNConvRunForLineDepthwiseBF16(float* dstO, const float* srcO, const float* weightO, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, @@ -823,7 +806,6 @@ static CoreFunctions* gInstance = nullptr; bool BF16Functions::init() { gInstance = new CoreFunctions; gInstance->MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwiseBF16; - gInstance->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWiseBF16; gInstance->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnitBF16; gInstance->MNNFp32ToLowp = _MNNFp32ToLowp; gInstance->MNNLowpToFp32 = _MNNLowpToFp32; @@ -890,7 +872,6 @@ bool BF16Functions::init() { gInstance->MNNPackedMatMul = NEON_MNNPackedMatMul_BF16; gInstance->MNNPackedMatMulRemain = NEON_MNNPackedMatMulRemain_BF16; gInstance->MNNConvRunForLineDepthwise = NEON_MNNConvRunForLineDepthwise_BF16; - gInstance->MNNConvRunForUnitDepthWise = NEON_MNNConvRunForUnitDepthWise_BF16; gInstance->MNNAxByClampBroadcastUnit = 
NEON_MNNAxByClampBroadcastC4_BF16; #ifdef __aarch64__ cpuinfo_arm_isa gCPUInfo; diff --git a/docs/compile/cmake.md b/docs/compile/cmake.md index 95f9d5760..f9927b4f9 100644 --- a/docs/compile/cmake.md +++ b/docs/compile/cmake.md @@ -38,7 +38,7 @@ MNN使用CMake构建项目,CMake中的宏定义列表如下: | MNN_OPENCL | 是否构建`OpenCL`后端,默认为`OFF` | | MNN_OPENGL | 是否构建`OpenGL`后端,默认为`OFF` | | MNN_VULKAN | 是否构建`Vulkan`后端,默认为`OFF` | -| MNN_ARM82 | 是否构建`Armv8.2`后端,默认为`OFF` | +| MNN_ARM82 | 编译ARM架构时,是否构建`Armv8.2`后端,以支持FP16计算,默认为`ON` | | MNN_ONEDNN | 是否使用`oneDNN`,默认为`OFF` | | MNN_AVX512 | 是否构建`avx512`后端,默认为`OFF` | | MNN_CUDA | 是否构建`Cuda`后端,默认为`OFF` | diff --git a/docs/compile/engine.md b/docs/compile/engine.md index eb8eb6503..200124725 100644 --- a/docs/compile/engine.md +++ b/docs/compile/engine.md @@ -22,37 +22,45 @@ ```bash mkdir build && cd build && cmake .. && make -j8 ``` -## Windows +## Windows(非ARM架构) - 环境要求 - Microsoft Visual Studio >= 2017 - cmake >= 3.13 - - powershell - Ninja - 相关编译选项 - 同`Linux/MacOS` - 具体步骤 - 1. opencl/vulkan - - *(可选)*下载GPU Caps Viewer,你可以通过这个工具来查看本机设备的详细信息(opencl、opengl、vulkan等) - - sdk和驱动准备 - - [opencl sdk](https://github.com/GPUOpen-LibrariesAndSDKs/OCL-SDK/releases),将opencl sdk目录的路径加到AMDAPPSDKROOT环境变量 - - [vulkan sdk](https://vulkan.lunarg.com/),将vulkan skd路径加入VULKAN_SDK环境变量,以备cmake查找 - - [AMD opencl驱动](https://www.amd.com/zh-hans/support) - - [NVIDIA opencl驱动](https://developer.nvidia.com/opencl) - - [AMD vulkan驱动](https://community.amd.com/community/gaming/blog/2016/02/16/radeon-gpus-are-ready-for-the-vulkan-graphics-api) - 2. 编译 - - 64位编译:在设置中找到vcvars64.bat(适用于 VS 2017 的 x64 本机工具命令提示)并单击,打开VS编译x64架构程序的虚拟环境 - - 32位编译:在设置中找到vcvarsamd64_x86.bat(VS 2017的 x64_x86 交叉工具命令提示符)并单击,打开VS交叉编译x86架构程序的虚拟环境 - - 在虚拟环境中执行如下编译命令: - ```bash - cd /path/to/MNN - ./schema/generate.ps1 # 非必须 - mkdir build && cd build - cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=OFF - ninja - ``` - - 若需要编译模型转换工具,cmake 命令加上 -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON - - 若需要编译 MNN CUDA,MNN_WIN_RUNTIME_MT 和 MNN_BUILD_SHARED_LIBS 需要设成 ON ,另外加上 -DMNN_CUDA=ON: cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON -DMNN_WIN_RUNTIME_MT=ON -DMNN_CUDA=ON - - Windows 上建议使用 Interpreter::destroy , Tensor::destroy , Module::destroy 等方法进行 MNN 相关内存对象的析构,不要直接使用 delete (直接使用 delete 在 -DMNN_WIN_RUNTIME_MT=ON 时会出问题) + - 64位编译:在设置中找到vcvars64.bat(适用于 VS 2017 的 x64 本机工具命令提示)并单击,打开VS编译x64架构程序的虚拟环境 + - 32位编译:在设置中找到vcvarsamd64_x86.bat(VS 2017的 x64_x86 交叉工具命令提示符)并单击,打开VS交叉编译x86架构程序的虚拟环境 + - 在虚拟环境中执行如下编译命令: + ```bash + cd /path/to/MNN + ./schema/generate.ps1 # 非必须 + mkdir build && cd build + cmake .. -G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=OFF + ninja + ``` + - 若需要编译模型转换工具,cmake 命令加上 -DMNN_BUILD_CONVERTER=ON -DMNN_BUILD_SHARED_LIBS=OFF -DMNN_WIN_RUNTIME_MT=ON + - 若需要编译 MNN CUDA,MNN_WIN_RUNTIME_MT 和 MNN_BUILD_SHARED_LIBS 需要设成 ON ,另外加上 -DMNN_CUDA=ON: cmake .. 
-G Ninja -DCMAKE_BUILD_TYPE=Release -DMNN_BUILD_SHARED_LIBS=ON -DMNN_WIN_RUNTIME_MT=ON -DMNN_CUDA=ON + - Windows 上建议使用 Interpreter::destroy , Tensor::destroy , Module::destroy 等方法进行 MNN 相关内存对象的析构,不要直接使用 delete (直接使用 delete 在 -DMNN_WIN_RUNTIME_MT=ON 时会出问题) + +## Windows(ARM架构) +- 环境要求 + - Microsoft Visual Studio >= 2017 + - cmake >= 3.13 + - Ninja + - Clang + - Clang 安装参考: https://learn.microsoft.com/en-us/cpp/build/clang-support-msbuild?view=msvc-170#install-1 +- 相关编译选项 + - 同`Linux/MacOS` +- 具体步骤 + - 打开vs的ARM64命令行工具 + - 进入 MNN 根目录 + - mkdir build && cd build + - cmake .. -G Ninja -DCMAKE_C_COMPILER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\clang.exe" -DCMAKE_CXX_COMPILER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\clang++.exe"  -DCMAKE_LINKER="C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\Llvm\ARM64\bin\lld.exe" -DCMAKE_BUILD_TYPE=Release + - Visual Studio 安装路径不一致的,可自行修改脚本 + - ninja -j16 + ## Android - 环境要求 - cmake >= 3.10 diff --git a/docs/tools/quant.md b/docs/tools/quant.md index 1a66b6e1b..0e4e733c9 100644 --- a/docs/tools/quant.md +++ b/docs/tools/quant.md @@ -39,8 +39,43 @@ MNN现已推出基于TensorFlow/Pytorch的模型压缩工具mnncompress,请查 | ADMM | 使用ADMM方法进行权值量化 | ## 多输入模型的参数设置的特别说明(MNN现阶段仅支持输入数据类型是非图片的多输入模型) -| input_type | `str` | 输入数据的类型,"sequence" | -| path | `str` | 存放校正特征量化系数的输入数据目录 |,例如该目录下包含2个输入数据集input_0和input_1,子目录input_0和input_1中包含模型的输入数据和一个input.json文件。input_0和input_1分别是两个输入输出信息文件夹,可使用 testMNNFromOnnx.py 等脚本生成,参考模型转换的正确性校验部分。 +| 需要特别指定的参数 | 设置值 | +|--------------------|------| +| input_type | `str`:输入数据的类型,"sequence" | +| path | `str`:存放校正特征量化系数的输入数据目录 |, +例如在quant.json文件中 "path": "/home/data/inputs_dir/",你所构造的矫正数据集有两个,分别存放在input_0和input_1子目录下,即"/home/data/inputs_dir/input_0"和"/home/data/inputs_dir/input_1".由GetMNNInfo工具可以得到模型的输入输出名称,例如该模型的输入有三个:data0, data1, data2,输出有两个:out1, out2. 那么在input_0和input_1子目录下分别有六个文件:data0.txt, data1.txt, data2.txt, out1.txt, out2.txt, input.json. 
其中的五个文件名要和模型的输入输出名对应,最后一个input.json文件则描述的是输入名和对应的shape内容: +```json +{ + "inputs": [ + { + "name": "data0", + "shape": [ + 2, + 4, + 64, + 64 + ] + }, + { + "name": "data1", + "shape": [ + 1 + ] + }, + { + "name": "data2", + "shape": [ + 2, + 512, + 768 + ] + } + ], + "outputs": [ + "out1", "out2" + ] +} +``` ## 量化模型的使用 和浮点模型同样使用方法,输入输出仍然为浮点类型 diff --git a/docs/transformers/llm.md b/docs/transformers/llm.md index 0d00de862..4677821cf 100644 --- a/docs/transformers/llm.md +++ b/docs/transformers/llm.md @@ -40,13 +40,16 @@ python llmexport.py \ ├── llm.mnn ├── llm.mnn.json ├── llm.mnn.weight - ├── llm.onnx + ├── onnx/ + ├──llm.onnx + ├──llm.onnx.data ├── llm_config.json └── tokenizer.txt ``` ### 功能 -- 支持将模型为onnx或mnn模型,使用`--export onnx`或`--export mnn` +- 将模型先转为onnx模型,使用`--export onnx`,然后使用./MNNConvert工具将onnx模型转为mnn模型: ./MNNConvert --modelFile ../transformers/llm/export/model/onnx/llm.onnx --MNNModel llm.mnn --keepInputFormat --weightQuantBits=4 -f ONNX --transformerFuse=1 --allowCustomOp +- 更快的方式:直接转为mnn模型,使用`--export mnn`,注意,你需要先安装pymnn或者通过--mnnconvert选项指定MNNConvert工具的地址,两种条件必须满足其中一个。如果没有安装pymnn并且没有通过--mnnconvert指定MNNConvert工具的地址,那么llmexport.py脚本会在目录"../../../build/"下寻找MNNConvert工具,需保证该目录下存在MNNConvert文件。 - 支持对模型进行对话测试,使用`--test $query`会返回llm的回复内容 - 默认会使用onnx-slim对onnx模型进行优化,跳过该步骤使用`--skip_slim` - 支持合并lora权重后导出,指定lora权重的目录使用`--lora_path` diff --git a/express/Executor.cpp b/express/Executor.cpp index 5f6a6dd48..bb54a393e 100644 --- a/express/Executor.cpp +++ b/express/Executor.cpp @@ -32,80 +32,64 @@ void Executor::setGlobalExecutorConfig(MNNForwardType type, const BackendConfig& ScheduleConfig sConfig; sConfig.type = type; type = Schedule::getApprociateType(sConfig); - auto creator = MNNGetExtraRuntimeCreator(type); - MNN_ASSERT(nullptr != creator); - Backend::Info info; - info.type = type; - info.mode = Backend::Info::DIRECT; - info.numThread = numberThread; - if(type == MNN_FORWARD_OPENCL || type == MNN_FORWARD_METAL) { - info.numThread = 4; - } - mAttr->firstType = type; - auto firstIter = mRuntimes.find(mAttr->firstType); - if (firstIter == mRuntimes.end()) { - info.user = (BackendConfig*)&config; - std::shared_ptr bn(creator->onCreate(info)); - mRuntimes[mAttr->firstType] = bn; - } else { - firstIter->second->onReset(numberThread, &config, true); - } - } else { - auto creator = MNNGetExtraRuntimeCreator(type); - if (nullptr == creator) { - MNN_ERROR("Error to find creator of %d, set CPU default\n", type); - type = MNN_FORWARD_CPU; - creator = MNNGetExtraRuntimeCreator(type); - } - MNN_ASSERT(nullptr != creator); - Backend::Info info; - info.type = type; - mAttr->firstType = type; - auto firstIter = mRuntimes.find(mAttr->firstType); - if (firstIter == mRuntimes.end()) { - info.mode = Backend::Info::DIRECT; - info.numThread = numberThread; - info.user = (BackendConfig*)&config; - std::shared_ptr bn(creator->onCreate(info)); - mRuntimes[mAttr->firstType] = bn; - } else { - firstIter->second->onReset(numberThread, &config, true); - } } - _refreshRuntime(); + auto rt = _getOrCreateRuntime(type, &config, numberThread); + if (rt == nullptr) { + type = MNN_FORWARD_CPU; + numberThread = 1; + rt = _getOrCreateRuntime(type, &config, numberThread); + } + MNN_ASSERT(nullptr != rt); + mAttr->firstType = type; } int Executor::getCurrentRuntimeStatus(RuntimeStatus statusEnum) { - return mRuntimes[mAttr->firstType]->onGetRuntimeStatus(statusEnum); + return mRuntimeInfo.first[mAttr->firstType]->onGetRuntimeStatus(statusEnum); +} +std::shared_ptr Executor::_getOrCreateRuntime(MNNForwardType type, const 
BackendConfig* config, int numberThread, bool reset) { + auto iter = mRuntimeInfo.first.find(type); + if (iter != mRuntimeInfo.first.end()) { + iter->second->onReset(numberThread, config, reset); + return iter->second; + } + // Create Backend + auto cre = MNNGetExtraRuntimeCreator(type); + if (nullptr == cre) { + return nullptr; + } + Backend::Info info; + info.type = type; + info.mode = Backend::Info::DIRECT; + info.numThread = numberThread; + info.user = (BackendConfig*)config; + std::shared_ptr rt(cre->onCreate(info)); + if (nullptr != rt) { + mRuntimeInfo.first.insert(std::make_pair(type, rt)); + } + return rt; } void Executor::gc(GCFlag flag) { int level = flag == FULL ? 100 : 0; - for (auto& iter : mRuntimes) { + for (auto& iter : mRuntimeInfo.first) { iter.second->onGabageCollect(level); } } -Executor::Executor(std::shared_ptr backend, MNNForwardType type, int numberThread) { - mRuntimes.insert(std::make_pair(type, backend)); +Executor::Executor(std::shared_ptr runtime, MNNForwardType type, int numberThread) { + mRuntimeInfo.first.insert(std::make_pair(type, runtime)); mAttr.reset(new ExecutorAttr); mAttr->firstType = type; - if (MNN_FORWARD_CPU != type) { - // Create Backup Backend - Backend::Info info; - info.type = MNN_FORWARD_CPU; - auto cre = MNNGetExtraRuntimeCreator(MNN_FORWARD_CPU); - info.mode = Backend::Info::DIRECT; - info.numThread = 1; - std::shared_ptr backupRt(cre->onCreate(info)); - mRuntimes.insert(std::make_pair(DEFAULT_BACKUP_RUNTIME_KEY, backupRt)); + if (type == MNN_FORWARD_CPU) { + mRuntimeInfo.second = runtime; + } else { + mRuntimeInfo.second = _getOrCreateRuntime(MNN_FORWARD_CPU, nullptr, 1); } mDebug.reset(new DebugTools); BackendConfig defaultConfig; defaultConfig.flags = 4; - std::shared_ptr defaultBackend(mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY]->onCreate(&defaultConfig)); + std::shared_ptr defaultBackend(mRuntimeInfo.second->onCreate(&defaultConfig)); mAttr->constantBackend = defaultBackend; - _refreshRuntime(); } Executor::~Executor(){ // Do nothing @@ -176,21 +160,6 @@ std::shared_ptr Executor::newExecutor(MNNForwardType type, auto executor = new Executor(runtime, type, numberThread); return std::shared_ptr(executor); } -void Executor::_refreshRuntime() { - mRuntimeInfo.first.clear(); - mRuntimeInfo.second = mRuntimes[DEFAULT_BACKUP_RUNTIME_KEY]; - auto firstIter = mRuntimes.find(getAttr()->firstType); - if (firstIter != mRuntimes.end()) { - mRuntimeInfo.first.insert(std::make_pair(firstIter->first, firstIter->second)); - } else { - MNN_ASSERT(false); - } - for (auto& iter : mRuntimes) { - if (iter.first != getAttr()->firstType) { - mRuntimeInfo.first.insert(std::make_pair(iter.first, iter.second)); - } - } -} RuntimeInfo Executor::getRuntime() { auto glo = ExecutorScope::Current(); @@ -297,43 +266,26 @@ Executor::RuntimeManager* Executor::RuntimeManager::createRuntimeManager(const S auto res = new RuntimeManager; auto glo = ExecutorScope::Current(); std::lock_guard _l(glo->mMutex); - auto& originRt = glo->mRuntimes; - Backend::Info compute; - compute.type = Schedule::getApprociateType(config); - compute.numThread = config.numThread; + auto& originRt = glo->mRuntimeInfo; + auto type = Schedule::getApprociateType(config); + int numThread = config.numThread; if(config.type == MNN_FORWARD_AUTO) { - if(compute.type == MNN_FORWARD_OPENCL || compute.type == MNN_FORWARD_METAL) { + if(type == MNN_FORWARD_OPENCL || type == MNN_FORWARD_METAL) { // AUTO set default gpu-mode MNN_GPU_TUNING_FAST - compute.numThread = 16; + numThread = 16; } } - compute.user = 
config.backendConfig; - auto iter = originRt.find(compute.type); - if (iter == originRt.end()) { - auto creator = MNNGetExtraRuntimeCreator(compute.type); - if (nullptr == creator) { - return nullptr; - } - auto newBn = creator->onCreate(compute); - if (nullptr == newBn) { - MNN_ERROR("Can't create Runtime: %s\n", EnumNameForwardType((ForwardType)compute.type)); - return nullptr; - } - originRt.insert(std::make_pair(compute.type, std::shared_ptr(newBn))); - } else { - iter->second->onReset(compute.numThread, compute.user, false); - } - res->mInside->mRuntime.second = originRt[DEFAULT_BACKUP_RUNTIME_KEY]; - res->mInside->mRuntime.first.insert(std::make_pair(compute.type, originRt[compute.type])); - res->mInside->mInfo = originRt[compute.type]; - res->mInside->mNumberThread = compute.numThread; + auto rt = glo->_getOrCreateRuntime(type, config.backendConfig, numThread, false); + res->mInside->mRuntime.second = originRt.second; + res->mInside->mRuntime.first.insert(std::make_pair(type, rt)); + res->mInside->mInfo = rt; + res->mInside->mNumberThread = numThread; if (nullptr != config.backendConfig) { res->mInside->mConfig = *config.backendConfig; res->mInside->mUserConfig = true; } else { res->mInside->mUserConfig = false; } - glo->_refreshRuntime(); return res; } ExecutorAttr* Executor::getAttr() const { diff --git a/express/module/Module.cpp b/express/module/Module.cpp index a0976bd67..d1dea03dc 100644 --- a/express/module/Module.cpp +++ b/express/module/Module.cpp @@ -379,6 +379,9 @@ static Module* loadInternal(const std::vector& inputs, const std::v if (net->extraInfo() && net->extraInfo()->version()) { info->version = net->extraInfo()->version()->str(); } + if (net->bizCode()) { + info->bizCode = net->bizCode()->str(); + } auto rtMgr = _rtMgr; Module::Config defaultConfig; if (nullptr == config) { diff --git a/express/module/StaticModule.cpp b/express/module/StaticModule.cpp index 31a07c632..33fa14afe 100644 --- a/express/module/StaticModule.cpp +++ b/express/module/StaticModule.cpp @@ -598,6 +598,7 @@ std::vector StaticModule::onForward(const std::vectorgetInfo(Interpreter::FLOPS, &flops); glo->getDebugTools()->flops += flops; #endif + return outputs; } diff --git a/include/MNN/Interpreter.hpp b/include/MNN/Interpreter.hpp index edeceb296..19bb95407 100644 --- a/include/MNN/Interpreter.hpp +++ b/include/MNN/Interpreter.hpp @@ -234,6 +234,8 @@ class MNN_PUBLIC Interpreter { // size limit of kvcache in memory (for a single layer) // if the size of kvcache exceeds the limit, it will be moved to disk KVCACHE_SIZE_LIMIT = 8, + // Op encoder number for commit + OP_ENCODER_NUMBER_FOR_COMMIT = 9, }; enum ExternalPathType { diff --git a/include/MNN/MNNDefine.h b/include/MNN/MNNDefine.h index 8a0af32de..33bc515fd 100644 --- a/include/MNN/MNNDefine.h +++ b/include/MNN/MNNDefine.h @@ -69,6 +69,6 @@ MNN_ERROR("Check failed: %s ==> %s\n", #success, #log); \ #define STR(x) STR_IMP(x) #define MNN_VERSION_MAJOR 2 #define MNN_VERSION_MINOR 9 -#define MNN_VERSION_PATCH 5 +#define MNN_VERSION_PATCH 6 #define MNN_VERSION STR(MNN_VERSION_MAJOR) "." STR(MNN_VERSION_MINOR) "." 
STR(MNN_VERSION_PATCH) #endif /* MNNDefine_h */ diff --git a/include/MNN/expr/Executor.hpp b/include/MNN/expr/Executor.hpp index 367c15d03..3022f7b9d 100644 --- a/include/MNN/expr/Executor.hpp +++ b/include/MNN/expr/Executor.hpp @@ -138,12 +138,10 @@ class MNN_PUBLIC Executor { }; static bool getComputeInfo(EXPRP expr, Interpreter::SessionInfoCode code, void* ptr); private: - void _refreshRuntime(); + std::shared_ptr _getOrCreateRuntime(MNNForwardType type, const BackendConfig* config, int numberThread, bool reset = true); Executor(std::shared_ptr backend, MNNForwardType type, int numberThread); void _makeCache(const std::vector& outputs, bool forceCPU); - // TODO: Remove mRuntimes, only use mRuntimeInfo - std::map> mRuntimes; RuntimeInfo mRuntimeInfo; std::shared_ptr mDebug; std::map> mSubGraph; diff --git a/include/MNN/expr/Module.hpp b/include/MNN/expr/Module.hpp index 1e5562de8..a2e5dc41b 100644 --- a/include/MNN/expr/Module.hpp +++ b/include/MNN/expr/Module.hpp @@ -53,7 +53,7 @@ class MNN_PUBLIC Module { MNNForwardType type = MNN_FORWARD_CPU; BackendConfig* config = nullptr; }; - + struct Config { // Load module as dynamic, default static bool dynamic = false; @@ -75,7 +75,7 @@ class MNN_PUBLIC Module { // Shared RuntimeManager static Module* load(const std::vector& inputs, const std::vector& outputs, const char* fileName, const std::shared_ptr rtMgr, const Config* config = nullptr); static Module* load(const std::vector& inputs, const std::vector& outputs, const uint8_t* buffer, size_t length, const std::shared_ptr rtMgr, const Config* config = nullptr); - + static Module* extract(std::vector inputs, std::vector outputs, bool fortrain, const std::map& subGraph = {}); static Module* clone(const Module* module, const bool shareParams = false); @@ -93,6 +93,8 @@ class MNN_PUBLIC Module { std::vector outputNames; // The MNNConvert's Version build the module std::string version; + // The bizCode of MNN model + std::string bizCode; }; const Info* getInfo() const; class CloneContext { diff --git a/project/ios/MNN.xcodeproj/project.pbxproj b/project/ios/MNN.xcodeproj/project.pbxproj index 535f50d27..c8afc9f93 100644 --- a/project/ios/MNN.xcodeproj/project.pbxproj +++ b/project/ios/MNN.xcodeproj/project.pbxproj @@ -158,8 +158,6 @@ 4896D37825FE2A6B00717702 /* MNNExpFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37025FE2A6A00717702 /* MNNExpFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; 4896D37925FE2A6B00717702 /* MNNPackedMatMulFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; 4896D37A25FE2A6B00717702 /* MNNPackedMatMulRemainFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; - 4896D37B25FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; - 4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; 4896D37E25FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */; settings = {COMPILER_FLAGS = 
"-march=armv8.2-a+fp16"; }; }; 4896D37F25FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; 489D7A682550FDC800AD896A /* MetalReduction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 489D7A172550FDC800AD896A /* MetalReduction.hpp */; }; @@ -497,7 +495,6 @@ 92FF02F223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; }; 92FF02F423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; }; 92FF02F523AA0B5A00AC97F6 /* MNNInt8ScaleToFloat.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */; }; - 92FF02F623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */; }; 92FF02F723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */; }; 92FF02F823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */; }; 92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */; }; @@ -542,7 +539,6 @@ 92FF033223AA0B5A00AC97F6 /* MNNBlitC3ToFloatRGBA.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */; }; 92FF033423AA0B5A00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */; }; 92FF033523AA0B5A00AC97F6 /* MNNInt8ScaleToFloat.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */; }; - 92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */; }; 92FF033723AA0B5A00AC97F6 /* MNNConvDwF23MulTransUnit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */; }; 92FF033823AA0B5A00AC97F6 /* MNNConvRunForLineDepthwise.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */; }; 92FF033923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */ = {isa = PBXBuildFile; fileRef = 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */; }; @@ -603,12 +599,10 @@ 92FF03A923AA0B5A00AC97F6 /* ConvolutionGroup.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */; }; 92FF03AA23AA0B5A00AC97F6 /* ConvolutionFloatFactory.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */; }; 92FF03AC23AA0B5A00AC97F6 /* ResizeFunction.h in Headers */ = {isa = PBXBuildFile; fileRef = 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */; }; - 92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */; }; 92FF03AE23AA0B5A00AC97F6 /* ConvolutionIntFactory.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023023AA0B5600AC97F6 /* 
ConvolutionIntFactory.hpp */; }; 92FF03AF23AA0B5A00AC97F6 /* WinogradOptFunction.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */; }; 92FF03B023AA0B5A00AC97F6 /* ConvolutionGroup.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */; }; 92FF03B123AA0B5A00AC97F6 /* ConvolutionFloatFactory.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */; }; - 92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */; }; 92FF03B423AA0B5A00AC97F6 /* Convolution1x1Strassen.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */; }; 92FF03B523AA0B5A00AC97F6 /* ResizeFunction.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */; }; 92FF03B623AA0B5A00AC97F6 /* StrassenMatmulComputor.hpp in Headers */ = {isa = PBXBuildFile; fileRef = 92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */; }; @@ -790,6 +784,8 @@ CE072A262C91AF0700F190FD /* MNNC3ToYUVFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */; }; CE072A272C91AF0700F190FD /* MNNC3ToC4Fast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */; }; CE072A282C91AF0700F190FD /* MNNC3ToXYZFast.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */; }; + CE072A2A2CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */; }; + CE072A2C2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S in Sources */ = {isa = PBXBuildFile; fileRef = CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */; settings = {COMPILER_FLAGS = "-march=armv8.2-a+fp16"; }; }; CE125CC82A52BF6B003698C9 /* MNNBilinearSampleC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */; }; CE125CC92A52BF6B003698C9 /* MNNBilinearLineC8.S in Sources */ = {isa = PBXBuildFile; fileRef = CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */; }; CE7DC00028E2DE6B00797689 /* ShapeConvTranspose3D.cpp in Sources */ = {isa = PBXBuildFile; fileRef = CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */; }; @@ -1005,8 +1001,6 @@ 4896D37025FE2A6A00717702 /* MNNExpFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNExpFP16.S; path = ../../../arm82/asm/arm64/MNNExpFP16.S; sourceTree = ""; }; 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackedMatMulFP16.S; path = ../../../arm82/asm/arm64/MNNPackedMatMulFP16.S; sourceTree = ""; }; 4896D37225FE2A6A00717702 /* MNNPackedMatMulRemainFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNPackedMatMulRemainFP16.S; path = ../../../arm82/asm/arm64/MNNPackedMatMulRemainFP16.S; sourceTree = ""; }; - 4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvDwF23MulTransUnitFP16.S; path = ../../../arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S; sourceTree = ""; }; - 4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */ = {isa = 
PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvDwF23SourceTransUnitFP16.S; path = ../../../arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S; sourceTree = ""; }; 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = Arm82MNNPackForMatMul_A.S; path = ../../../arm82/asm/arm64/Arm82MNNPackForMatMul_A.S; sourceTree = ""; }; 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNConvRunForLineDepthwiseFP16.S; path = ../../../arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S; sourceTree = ""; }; 489D7A172550FDC800AD896A /* MetalReduction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = MetalReduction.hpp; sourceTree = ""; }; @@ -1353,7 +1347,6 @@ 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = ""; }; 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = ""; }; 92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNInt8ScaleToFloat.S; sourceTree = ""; }; - 92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForUnitDepthWise.S; sourceTree = ""; }; 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23MulTransUnit.S; sourceTree = ""; }; 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForLineDepthwise.S; sourceTree = ""; }; 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Unit.S; sourceTree = ""; }; @@ -1398,7 +1391,6 @@ 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBlitC3ToFloatRGBA.S; sourceTree = ""; }; 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNUInt8ToInt16WithOffsetC4Common.S; sourceTree = ""; }; 92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNInt8ScaleToFloat.S; sourceTree = ""; }; - 92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForUnitDepthWise.S; sourceTree = ""; }; 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvDwF23MulTransUnit.S; sourceTree = ""; }; 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNConvRunForLineDepthwise.S; sourceTree = ""; }; 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */ = {isa 
= PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNGemmint8to32_8x4_Unit.S; sourceTree = ""; }; @@ -1459,12 +1451,10 @@ 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionGroup.hpp; sourceTree = ""; }; 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ConvolutionFloatFactory.h; sourceTree = ""; }; 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ResizeFunction.h; sourceTree = ""; }; - 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionDepthwise3x3.cpp; sourceTree = ""; }; 92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionIntFactory.hpp; sourceTree = ""; }; 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = WinogradOptFunction.hpp; sourceTree = ""; }; 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionGroup.cpp; sourceTree = ""; }; 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ConvolutionFloatFactory.cpp; sourceTree = ""; }; - 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = ConvolutionDepthwise3x3.hpp; sourceTree = ""; }; 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = Convolution1x1Strassen.cpp; sourceTree = ""; }; 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ResizeFunction.cpp; sourceTree = ""; }; 92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.h; path = StrassenMatmulComputor.hpp; sourceTree = ""; }; @@ -1647,6 +1637,8 @@ CE072A232C91AF0700F190FD /* MNNC3ToYUVFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToYUVFast.S; path = arm/arm64/MNNC3ToYUVFast.S; sourceTree = ""; }; CE072A242C91AF0700F190FD /* MNNC3ToC4Fast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToC4Fast.S; path = arm/arm64/MNNC3ToC4Fast.S; sourceTree = ""; }; CE072A252C91AF0700F190FD /* MNNC3ToXYZFast.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNC3ToXYZFast.S; path = arm/arm64/MNNC3ToXYZFast.S; sourceTree = ""; }; + CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNDepthwiseConvFastKernel.S; sourceTree = ""; }; + CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; name = MNNDepthwiseConvFastKernelFP16.S; path = ../../../arm82/asm/arm64/MNNDepthwiseConvFastKernelFP16.S; 
sourceTree = ""; }; CE125CC62A52BF6B003698C9 /* MNNBilinearSampleC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearSampleC8.S; sourceTree = ""; }; CE125CC72A52BF6B003698C9 /* MNNBilinearLineC8.S */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.asm; path = MNNBilinearLineC8.S; sourceTree = ""; }; CE7DBFFF28E2DE6B00797689 /* ShapeConvTranspose3D.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ShapeConvTranspose3D.cpp; sourceTree = ""; }; @@ -2648,7 +2640,6 @@ 92FF017223AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */, 92FF017423AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */, 92FF017523AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */, - 92FF017623AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */, 92FF017723AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */, 92FF017823AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */, 92FF017923AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */, @@ -2659,6 +2650,8 @@ 92FF017C23AA0B4E00AC97F6 /* arm64 */ = { isa = PBXGroup; children = ( + CE072A2B2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S */, + CE072A292CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S */, 95772DCD2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM82.S */, 95772DCE2C50F12A000FC1C3 /* MNNPackC4Int8ForMatMulA_ARM86.S */, 4DDD8E0F2B1D70C1005065D1 /* MNNTranspose16Bit8x8.S */, @@ -2688,8 +2681,6 @@ 4D6D7FD02656891400F80814 /* MNNPackedSparseMatMulEpx4.S */, 4D6D7FCE2656890C00F80814 /* MNNPackedSparseMatMulEpx1.S */, 4896D37625FE2A6B00717702 /* Arm82MNNPackForMatMul_A.S */, - 4896D37325FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S */, - 4896D37425FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S */, 4896D37725FE2A6B00717702 /* MNNConvRunForLineDepthwiseFP16.S */, 4896D37025FE2A6A00717702 /* MNNExpFP16.S */, 4896D37125FE2A6A00717702 /* MNNPackedMatMulFP16.S */, @@ -2743,7 +2734,6 @@ 92FF01B323AA0B4E00AC97F6 /* MNNBlitC3ToFloatRGBA.S */, 92FF01B523AA0B4E00AC97F6 /* MNNUInt8ToInt16WithOffsetC4Common.S */, 92FF01B623AA0B4E00AC97F6 /* MNNInt8ScaleToFloat.S */, - 92FF01B723AA0B4E00AC97F6 /* MNNConvRunForUnitDepthWise.S */, 92FF01B823AA0B4E00AC97F6 /* MNNConvDwF23MulTransUnit.S */, 92FF01B923AA0B4E00AC97F6 /* MNNConvRunForLineDepthwise.S */, 92FF01BA23AA0B4E00AC97F6 /* MNNGemmint8to32_8x4_Unit.S */, @@ -2795,12 +2785,10 @@ 92FF022B23AA0B5600AC97F6 /* ConvolutionGroup.hpp */, 92FF022C23AA0B5600AC97F6 /* ConvolutionFloatFactory.h */, 92FF022E23AA0B5600AC97F6 /* ResizeFunction.h */, - 92FF022F23AA0B5600AC97F6 /* ConvolutionDepthwise3x3.cpp */, 92FF023023AA0B5600AC97F6 /* ConvolutionIntFactory.hpp */, 92FF023123AA0B5600AC97F6 /* WinogradOptFunction.hpp */, 92FF023223AA0B5600AC97F6 /* ConvolutionGroup.cpp */, 92FF023323AA0B5600AC97F6 /* ConvolutionFloatFactory.cpp */, - 92FF023523AA0B5600AC97F6 /* ConvolutionDepthwise3x3.hpp */, 92FF023623AA0B5600AC97F6 /* Convolution1x1Strassen.cpp */, 92FF023723AA0B5600AC97F6 /* ResizeFunction.cpp */, 92FF023823AA0B5600AC97F6 /* StrassenMatmulComputor.hpp */, @@ -3036,7 +3024,6 @@ 4D9A937526255BDA00F9B43C /* CoreMLCommonExecution.hpp in Headers */, 4DF87C522887D3F20003E2D4 /* CPUSvd.hpp in Headers */, 48747D4B245D9D24000B9709 /* RuntimeFactory.hpp in Headers */, - 92FF03B323AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.hpp in Headers */, CECF8C77299CAD9400D3875B /* log_builder.h in Headers */, 4D9A937226255BDA00F9B43C /* CoreMLConvolution.hpp in Headers */, 92FF038B23AA0B5A00AC97F6 /* CPUUnravelIndex.hpp in Headers */, @@ -3394,14 +3381,12 @@ 
4A224A1627D0C56E000A9260 /* ConvolutionWinogradBridge.cpp in Sources */, 48747D6A245D9E33000B9709 /* GeometryStridedSlice.cpp in Sources */, 92FF04BE23AA0BFB00AC97F6 /* FileLoader.cpp in Sources */, - 92FF02F623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */, 92FF042323AA0B7100AC97F6 /* ShapeScatterNd.cpp in Sources */, 92FF045A23AA0B7100AC97F6 /* ShapeBinaryOp.cpp in Sources */, 92FF02E523AA0B5A00AC97F6 /* MNNConvDwF23SourceTransUnit.S in Sources */, CE072A192C91AEE700F190FD /* MNNBGRToGRAY.S in Sources */, EBECA37B24643D110062C7A3 /* MNNGemmInt8AddBiasScale_ARMV82_Unit.S in Sources */, 481C2DF525FE2CD6001ED6DF /* Arm82OptFunc.cpp in Sources */, - 92FF033623AA0B5A00AC97F6 /* MNNConvRunForUnitDepthWise.S in Sources */, 4DF87C502887D3E40003E2D4 /* CPUSvd.cpp in Sources */, 92FF043523AA0B7100AC97F6 /* ShapeConvolution3D.cpp in Sources */, 92FF043923AA0B7100AC97F6 /* ShapeDequantize.cpp in Sources */, @@ -3483,6 +3468,7 @@ 92FF043A23AA0B7100AC97F6 /* ShapePermute.cpp in Sources */, 489D7A8E2550FDC900AD896A /* MetalPooling.mm in Sources */, 92FF030823AA0B5A00AC97F6 /* MNNCopyC4WithStride.S in Sources */, + CE072A2A2CAA50DE00F190FD /* MNNDepthwiseConvFastKernel.S in Sources */, 4DDE2019263809920085AC8F /* CoreMLExecutorWrapper.mm in Sources */, EBECA39B24643D320062C7A3 /* Arm82Backend.cpp in Sources */, 4A224A1327D0C56E000A9260 /* ConvolutionWinogradImpl.cpp in Sources */, @@ -3592,7 +3578,6 @@ 4819FB3A24C69E680050BD09 /* GeometryInnerProduct.cpp in Sources */, 92FF037723AA0B5A00AC97F6 /* CPUConvolutionDepthwise.cpp in Sources */, EB45C774244D7C4F00E28F44 /* MNNGemmInt8AddBiasScale_16x4_Unit_FAST.S in Sources */, - 4896D37B25FE2A6B00717702 /* MNNConvDwF23MulTransUnitFP16.S in Sources */, 92FF02DE23AA0B5A00AC97F6 /* MNNSamplerC4BilinearOpt.S in Sources */, 48FD12BF2466A88D009E9102 /* GeometryConv2DBackPropFilter.cpp in Sources */, 92FF02F923AA0B5A00AC97F6 /* MNNGemmint8to32_8x4_Unit.S in Sources */, @@ -3711,6 +3696,7 @@ 4D6D7FCB265688F600F80814 /* MNNPackedSparseMatMulEpx4.S in Sources */, 92FF042123AA0B7100AC97F6 /* ShapeDeconvolution.cpp in Sources */, 92FF027F23AA0B5A00AC97F6 /* CPUDeconvolutionDepthwise.cpp in Sources */, + CE072A2C2CAA510F00F190FD /* MNNDepthwiseConvFastKernelFP16.S in Sources */, EBECA3A724643D5D0062C7A3 /* MNNQuantizeFP16_UNIT4.S in Sources */, 92FF04A423AA0BFB00AC97F6 /* Interpreter.cpp in Sources */, CECF8C5C299CACFD00D3875B /* Log.cpp in Sources */, @@ -3771,7 +3757,6 @@ 48887728215B639F0079B12E /* WingoradGenerater.cpp in Sources */, 950B28F429F629A90002F454 /* CPUBinaryInt8.cpp in Sources */, 92FF045423AA0B7100AC97F6 /* ShapeRNNSequenceGRU.cpp in Sources */, - 4896D37C25FE2A6B00717702 /* MNNConvDwF23SourceTransUnitFP16.S in Sources */, EB8D2ABE246A4975009948D1 /* Arm82OpRegister.cpp in Sources */, CE072A1E2C91AEE700F190FD /* MNNRGBToBGR555.S in Sources */, 48C84B87250F711700EE7666 /* WhileModule.cpp in Sources */, @@ -3800,7 +3785,6 @@ 92FF041C23AA0B7100AC97F6 /* ShapeNonMaxSuppressionV2.cpp in Sources */, 92FF02CE23AA0B5A00AC97F6 /* MNNPackC4.S in Sources */, 92FF037023AA0B5A00AC97F6 /* CPUPool.cpp in Sources */, - 92FF03AD23AA0B5A00AC97F6 /* ConvolutionDepthwise3x3.cpp in Sources */, 92FF031723AA0B5A00AC97F6 /* MNNConvRunForLineDepthWiseInt8.S in Sources */, 4DD1793A2694076700B0098F /* MNNSoftmax.S in Sources */, CE072A1D2C91AEE700F190FD /* MNNRGBAToBGRAFast.S in Sources */, diff --git a/pymnn/pip_package/MNN/nn/__init__.py b/pymnn/pip_package/MNN/nn/__init__.py index 3fc225092..ca9580300 100644 --- a/pymnn/pip_package/MNN/nn/__init__.py +++ 
b/pymnn/pip_package/MNN/nn/__init__.py @@ -13,7 +13,7 @@ def load_module_from_file(file_name, input_names, output_names, **kwargs): memory_mode = kwargs.get('memory_mode', _F.MemoryMode.Normal) power_mode = kwargs.get('power_mode', _F.PowerMode.Normal) precision_mode = kwargs.get('precision_mode', _F.PrecisionMode.Normal) - thread_num = kwargs.get('thread_num', 4) + thread_num = kwargs.get('thread_num', 1) module = _nn.load_module_from_file(runtime_manager, input_names, output_names, file_name, dynamic, shape_mutable, rearrange, backend, memory_mode, power_mode, precision_mode, thread_num) @@ -59,4 +59,4 @@ def __init(self): super(EmptyModule, self).__init__() def forward(self): return None -dummy = EmptyModule() \ No newline at end of file +dummy = EmptyModule() diff --git a/pymnn/pip_package/MNN/tools/mnnconvert.py b/pymnn/pip_package/MNN/tools/mnnconvert.py index a3f773d01..7e347c254 100644 --- a/pymnn/pip_package/MNN/tools/mnnconvert.py +++ b/pymnn/pip_package/MNN/tools/mnnconvert.py @@ -13,6 +13,8 @@ except: mnn_logger = None +def convert(args): + Tools.mnnconvert(args) def parse_args(): arg_dict = {} @@ -28,13 +30,13 @@ def parse_args(): if arg_value.startswith("--") or arg_value.startswith("-"): arg_value = True arg_dict[arg_name] = arg_value - + return arg_dict def main(): """ main funcion """ - Tools.mnnconvert(sys.argv) + convert(sys.argv) arg_dict = parse_args() @@ -52,7 +54,7 @@ def main(): arg_dict.pop("MNNModel") log_dict["detail"] = {"args": arg_dict, "src_model_size": src_model_size, "dst_model_size": dst_model_size, "compress_rate": compress_rate} mnn_logger.put_log(log_dict, "convert") - + return 0 diff --git a/pymnn/pip_package/build_deps.py b/pymnn/pip_package/build_deps.py index 320975bf5..6ee2398a5 100644 --- a/pymnn/pip_package/build_deps.py +++ b/pymnn/pip_package/build_deps.py @@ -17,6 +17,7 @@ IS_WINDOWS = (platform.system() == 'Windows') IS_DARWIN = (platform.system() == 'Darwin') IS_LINUX = (platform.system() == 'Linux') +IS_ARM = ('arm' in platform.processor()) BUILD_DIR = 'pymnn_build' # avoid overwrite temporary product when build pymnn USE_TRT = False @@ -55,8 +56,8 @@ USE_OPENMP = True if "llm" in sys.argv[1]: USE_LLM = True - if "arm82" in sys.argv[1]: - USE_ARM82 = True + +if IS_ARM: USE_ARM82 = True print ("USE_INTERNAL:", USE_INTERNAL) print ("USE_TRT:", USE_TRT) @@ -69,7 +70,6 @@ print ("USE_SSE:", USE_SSE) print ("USE_OPENMP:", USE_OPENMP) print ("USE_LLM:", USE_LLM) -print ("USE_ARM82:", USE_ARM82) def build_deps(): """ build depency """ @@ -92,6 +92,9 @@ def build_deps(): if USE_ARM82: extra_opts += ' -DMNN_ARM82=ON' extra_opts += ' -DMNN_USE_THREAD_POOL=OFF -DMNN_OPENMP=ON' if USE_OPENMP else ' -DMNN_USE_THREAD_POOL=ON -DMNN_OPENMP=OFF' + if IS_DARWIN: + # Mac / iOS System use GCD instead of MNN's thread pool + extra_opts += ' -DMNN_USE_THREAD_POOL=OFF -DMNN_METAL=ON ' if IS_WINDOWS: os.system('cmake -G "Ninja" ' + extra_opts +' -DMNN_BUILD_TRAIN=ON -DMNN_BUILD_CONVERTER=on -DMNN_BUILD_TORCH=OFF\ diff --git a/pymnn/src/llm.h b/pymnn/src/llm.h index 3ade7a17f..fc4e885e2 100644 --- a/pymnn/src/llm.h +++ b/pymnn/src/llm.h @@ -1,3 +1,4 @@ +#include #include "llm/llm.hpp" typedef struct { @@ -38,8 +39,7 @@ static PyObject* PyMNNLLM_response(LLM *self, PyObject *args) { if (!PyArg_ParseTuple(args, "s|p", &query, &stream)) { Py_RETURN_NONE; } - MNN::Transformer::LlmStreamBuffer buffer(nullptr); - std::ostream null_os(&buffer); + std::ostringstream null_os; auto res = self->llm->response(query, stream ? 
&std::cout : &null_os); return string2Object(res); } diff --git a/pymnn/src/nn.h b/pymnn/src/nn.h index c256754f6..a775cb0d6 100644 --- a/pymnn/src/nn.h +++ b/pymnn/src/nn.h @@ -154,6 +154,7 @@ static PyObject* PyMNN_Module_get_info(PyMNN_Module *self, PyObject *args) { } auto res = PyDict_New(); PyDict_SetItemString(res, "version", char2Object(info->version.c_str())); + PyDict_SetItemString(res, "bizCode", char2Object(info->bizCode.c_str())); { auto names = PyList_New(info->inputNames.size()); for (int i=0; iinputNames.size(); ++i) { @@ -379,6 +380,7 @@ static PyObject* PyMNNNN_create_runtime_manager(PyObject *self, PyObject *args) } for (auto i = 0; i < PySequence_Size(dicts); ++i) { backendConfig[i].sharedContext = nullptr; + config[i].numThread = 1; config[i].backendConfig = &backendConfig[i]; bool ret = getScheduleConfig(PySequence_GetItem(dicts, i), config[i]); if (!ret) { @@ -392,7 +394,7 @@ static PyObject* PyMNNNN_create_runtime_manager(PyObject *self, PyObject *args) } else { m_ptr = Executor::RuntimeManager::createRuntimeManager(configs); } - + if (m_ptr == nullptr) { printf("config size:%d\n", configs.size()); std::string mnn_errno = "create_runtime_manager failed "; diff --git a/source/backend/arm82/Arm82Functions.cpp b/source/backend/arm82/Arm82Functions.cpp index 92749c426..ea57b4d9e 100644 --- a/source/backend/arm82/Arm82Functions.cpp +++ b/source/backend/arm82/Arm82Functions.cpp @@ -50,10 +50,10 @@ void MNNQuantSumFP16(float* sum, const float* dequant_scale, size_t thread, size #endif #if defined(__aarch64__) void CountMinMaxValue_FP16(float* source, float* minVal, float* maxVal, size_t sizeQuad); +void MNNDepthwiseConvFastKernelFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, + size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters); #endif -void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, size_t ow); - -void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit); void MNNConvRunForLineDepthwiseFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep); @@ -336,94 +336,6 @@ static void MNNAxByClampBroadcastC8FP16(float* CF, const float* AF, const float* } } -void ARM82MultiAndDestTransformCommon(FLOAT16 **cacheLine, const FLOAT16 *weight, FLOAT16 *dest, int cacheLineSize, int ow, const float* bias, const float* parameters) { - constexpr int pack = 8; - int unit = ow / 2; - auto biasF = Vec::load((const float16_t*)bias); - auto minF = Vec(parameters[2]); - auto maxF = Vec(parameters[3]); - MNN_ASSERT(cacheLineSize >= 1); - for (int x = 0; x < unit; ++x) { - int offset = 4 * pack * x, i = 0; - Vec m0 = Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset); - Vec m1 = Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack * 1); - Vec m2 = Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2); - Vec m3 = Vec::load(weight + (i * 4 + 3) * pack) * Vec::load(cacheLine[i] + offset + pack * 3); - for (i = 1; i < cacheLineSize; ++i) { - m0 = m0 + Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset); - m1 = m1 + Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack * 1); - m2 = m2 + Vec::load(weight + (i * 4 + 2) * pack) 
* Vec::load(cacheLine[i] + offset + pack * 2); - m3 = m3 + Vec::load(weight + (i * 4 + 3) * pack) * Vec::load(cacheLine[i] + offset + pack * 3); - } - auto o0 = m0 + m1 + m2 + biasF; - auto o1 = m1 - m2 + m3 + biasF; - o0 = Vec::min(maxF, o0); - o1 = Vec::min(maxF, o1); - o0 = Vec::max(minF, o0); - o1 = Vec::max(minF, o1); - Vec::save(dest + (2 * x + 0) * pack, o0); - Vec::save(dest + (2 * x + 1) * pack, o1); - } - if (unit * 2 < ow) { - int offset = 4 * pack * unit, i = 0; - Vec m0 = Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset); - Vec m1 = Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack); - Vec m2 = Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2); - for (i = 1; i < cacheLineSize; ++i) { - m0 = m0 + Vec::load(weight + i * 4 * pack) * Vec::load(cacheLine[i] + offset); - m1 = m1 + Vec::load(weight + (i * 4 + 1) * pack) * Vec::load(cacheLine[i] + offset + pack); - m2 = m2 + Vec::load(weight + (i * 4 + 2) * pack) * Vec::load(cacheLine[i] + offset + pack * 2); - } - auto o0 = m0 + m1 + m2 + biasF; - o0 = Vec::min(maxF, o0); - o0 = Vec::max(minF, o0); - Vec::save(dest + 2 * unit * pack, o0); - } -} -// unit: winograd unit (output is w/2) -void ARM82SourceTransformCommon(const FLOAT16 *source, FLOAT16 *dest, int unit, int iw, int pad, int su, int eu) { - constexpr int pack = 8; // float16x8 - for (int x = 0; x < su; ++x) { - auto dstX = dest + 4 * pack * x; - auto sx = x * 2 - (int)pad; - auto ex = sx + 4; - auto clampSx = std::max(sx, 0); - auto clampEx = std::min(ex, (int)iw); - Vec v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = clampSx; i < clampEx; ++i) { - v[i - sx] = Vec::load(source + pack * i); - } - auto m0 = v[0] - v[2]; - auto m1 = v[1] + v[2]; - auto m2 = v[2] - v[1]; - auto m3 = v[3] - v[1]; - Vec::save(dstX + pack * 0, m0); - Vec::save(dstX + pack * 1, m1); - Vec::save(dstX + pack * 2, m2); - Vec::save(dstX + pack * 3, m3); - } - MNNConvDwF23SourceTransUnitFP16(source + pack * (su * 2 - pad), dest + 4 * pack * su, eu - su); - for (int x = eu; x < unit; ++x) { - auto dstX = dest + 4 * pack * x; - auto sx = x * 2 - (int)pad; - auto ex = sx + 4; - auto clampSx = std::max(sx, 0); - auto clampEx = std::min(ex, (int)iw); - Vec v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = clampSx; i < clampEx; ++i) { - v[i - sx] = Vec::load(source + pack * i); - } - auto m0 = v[0] - v[2]; - auto m1 = v[1] + v[2]; - auto m2 = v[2] - v[1]; - auto m3 = v[3] - v[1]; - Vec::save(dstX + pack * 0, m0); - Vec::save(dstX + pack * 1, m1); - Vec::save(dstX + pack * 2, m2); - Vec::save(dstX + pack * 3, m3); - } -} - void ARM82StrassenMerge(FLOAT16* c11, FLOAT16* c12, FLOAT16* c21, FLOAT16* c22, FLOAT16* xAddr, size_t cStride, size_t eSub, size_t hSub) { const int pack = 8; @@ -516,24 +428,6 @@ void MNNPackTransposeInt16C8(int16_t* dst, const int16_t* src, size_t area, size } } -static void MNNConvRunForUnitDepthWiseFP16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { - int fx, fy; - Vec dstValue(0.0f); - auto src_z = (const FLOAT16*)src; - auto weight_z = (const FLOAT16*)weight; - for (fy = 0; fy < fh; ++fy) { - auto src_y = src_z + fy * dilateY_step; - auto weight_y = weight_z + fy * weight_y_step; - for (fx = 0; fx < fw; ++fx) { - auto weight_x = weight_y + 8 * fx; - auto src_x = src_y + fx * dilateX_step; - dstValue = dstValue + Vec::load(src_x) * Vec::load(weight_x); - } - } - Vec::save((FLOAT16*)dst, dstValue); -} - static void 
_MNNDeconvRunForUnitDepthWise(const FLOAT16* dst, FLOAT16* src, const FLOAT16* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { int fx, fy; @@ -706,12 +600,8 @@ bool Arm82Functions::init() { FUNC_PTR_ASSIGN(gInstance->MNNUnpackCUnit, MNNUnPackC8FP16); FUNC_PTR_ASSIGN(gInstance->MNNPackCUnitTranspose, MNNPackTransposeInt16C8); FUNC_PTR_ASSIGN(gInstance->MNNUnpackCUnitTranspose, MNNUnpackTransposeInt16C8); - FUNC_PTR_ASSIGN(gInstance->MNNConvRunForUnitDepthWise, MNNConvRunForUnitDepthWiseFP16); FUNC_PTR_ASSIGN(gInstance->MNNConvRunForLineDepthwise, MNNConvRunForLineDepthwiseFP16); FUNC_PTR_ASSIGN(gInstance->MNNAxByClampBroadcastUnit, MNNAxByClampBroadcastC8FP16); - FUNC_PTR_ASSIGN(gInstance->MNNConvDwF23MulTransUnit, MNNConvDwF23MulTransUnitFP16); - FUNC_PTR_ASSIGN(gInstance->MNNSourceTransformCommonF23, ARM82SourceTransformCommon); - FUNC_PTR_ASSIGN(gInstance->MNNMultiAndDestTransformCommon23, ARM82MultiAndDestTransformCommon); FUNC_PTR_ASSIGN(gInstance->MNNMatrixSub, MNNMatrixSubFP16); FUNC_PTR_ASSIGN(gInstance->MNNMatrixAdd, MNNMatrixAddFP16); FUNC_PTR_ASSIGN(gInstance->MNNStrassenMergeCFunction, ARM82StrassenMerge); @@ -754,6 +644,7 @@ bool Arm82Functions::init() { FUNC_PTR_ASSIGN(gInstance->MNNCountMaxMinValue, ARM82CountMinMaxValue); #endif FUNC_PTR_ASSIGN(gInstance->MNNSumByAxisLForMatmul_A, origin->MNNSumByAxisLForMatmul_A); + FUNC_PTR_ASSIGN(gInstance->MNNDepthwiseConvFastKernel, MNNDepthwiseConvFastKernelFP16); #endif FUNC_PTR_ASSIGN(gInstance->MNNPackC4ForMatMul_A, Arm82MNNPackForMatMul_A); FUNC_PTR_ASSIGN(gInstance->MNNGetMatMulPackMode, Arm82MNNGetMatMulPackMode); diff --git a/source/backend/arm82/CMakeLists.txt b/source/backend/arm82/CMakeLists.txt index afbe55dbb..4f6e5ebd1 100644 --- a/source/backend/arm82/CMakeLists.txt +++ b/source/backend/arm82/CMakeLists.txt @@ -5,7 +5,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?") file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm32/*") add_library(MNN_Arm82 OBJECT ${MNN_ARM82_SRCS} ${MNN_ARM82_SRCS_ASM}) target_compile_options(MNN_Arm82 PRIVATE -march=armv8.2-a+fp16 -mfpu=neon-fp-armv8 -mfloat-abi=softfp -DENABLE_ARMV82) -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64") file(GLOB MNN_ARM82_SRCS_ASM "${CMAKE_CURRENT_LIST_DIR}/asm/arm64/*") if (MNN_LOW_MEMORY) file(GLOB MNN_ARM82_SRCS_ASM ${MNN_ARM82_SRCS_ASM} ${CMAKE_CURRENT_LIST_DIR}/asm/arm64/low_memory/*) diff --git a/source/backend/arm82/asm/arm32/MNNConvDwF23MulTransUnitFP16.S b/source/backend/arm82/asm/arm32/MNNConvDwF23MulTransUnitFP16.S deleted file mode 100644 index 7b2528991..000000000 --- a/source/backend/arm82/asm/arm32/MNNConvDwF23MulTransUnitFP16.S +++ /dev/null @@ -1,147 +0,0 @@ -// -// MNNConvDwF23MulTransUnitFP16.S -// MNN -// -// Created by MNN on 2019/4/4. -// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __arm__ -#ifndef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNConvDwF23MulTransUnitFP16 -//void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weigth, FLOAT16 *dest, size_t ow); -//Auto: r0:cacheLine, r1:weight, r2:dest, r3:ow -push {r4-r11, lr} -ldr r8, [sp, #36] // biasPtr -ldr r9, [sp, #40] // postParameters -ldr r10, [r9, #8] // minF -ldr r11, [r9, #12] // maxF - -vpush {q4-q7} -ldr r4, [r0, #0] -ldr r5, [r0, #4] -ldr r6, [r0, #8] - -vld1.16 {q4, q5}, [r1]! 
-vld1.16 {q6, q7}, [r1]! -vld1.16 {q8, q9}, [r1]! - -L2: -cmp r3, #2 -blt L1 - -LoopL2: -mov r7, r1 - -vld1.16 {q12, q13}, [r4]! -vmul.f16 q0, q4, q12 -vld1.16 {q14, q15}, [r4]! -vmul.f16 q1, q5, q13 -vld1.16 {q10, q11}, [r7]! -vmul.f16 q2, q6, q14 -vld1.16 {q12, q13}, [r5]! -vmul.f16 q3, q7, q15 - -vmla.f16 q0, q8, q12 -vld1.16 {q14, q15}, [r5]! -vmla.f16 q1, q9, q13 -vmla.f16 q2, q10, q14 -vmla.f16 q3, q11, q15 - -vld1.16 {q10, q11}, [r7]! -vld1.16 {q12, q13}, [r6]! -vmla.f16 q0, q10, q12 -vmla.f16 q1, q11, q13 -vld1.16 {q10, q11}, [r7]! -vadd.f16 q0, q1, q0 -vld1.16 {q14, q15}, [r6]! - -vmla.f16 q2, q10, q14 -vmla.f16 q3, q11, q15 -vadd.f16 q0, q0, q2 - -vadd.f16 q3, q3, q1 -vsub.f16 q1, q3, q2 - -vld1.32 {q10}, [r8] -vdup.32 q11, r10 -vdup.32 q12, r11 -vcvt.f16.f32 d22, q11 -vcvt.f16.f32 d24, q12 -vmov.32 d23, d22 -vmov.32 d25, d24 - -vadd.f16 q0, q10, q0 -vadd.f16 q1, q10, q1 - -vmin.f16 q0, q12, q0 -vmin.f16 q1, q12, q1 - -vmax.f16 q0, q11, q0 -vmax.f16 q1, q11, q1 - - -vst1.16 {q0, q1}, [r2]! - -sub r3, r3, #2 -cmp r3, #2 -bge LoopL2 - - -L1: -cmp r3, #0 -beq End -mov r7, r1 -mov r12, #32 -vld1.16 {q12, q13}, [r4]! -vmul.f16 q0, q4, q12 -vld1.16 {q14}, [r4]! -vmul.f16 q1, q5, q13 -vld1.16 {q10}, [r7], r12 -vmul.f16 q2, q6, q14 -vld1.16 {q12, q13}, [r5]! - -vmla.f16 q0, q8, q12 -vld1.16 {q14}, [r5]! -vmla.f16 q1, q9, q13 -vmla.f16 q2, q10, q14 - -vld1.16 {q10, q11}, [r7]! -vld1.16 {q12, q13}, [r6]! -vmla.f16 q0, q10, q12 -vmla.f16 q1, q11, q13 -vld1.16 {q10}, [r7] -vld1.16 {q14}, [r6]! - -vmla.f16 q2, q10, q14 - -vadd.f16 q0, q1, q0 -vadd.f16 q0, q0, q2 - -vld1.32 {q10}, [r8] -vdup.32 q11, r10 -vdup.32 q12, r11 -vcvt.f16.f32 d22, q11 -vcvt.f16.f32 d24, q12 -vmov.32 d23, d22 -vmov.32 d25, d24 - -vadd.f16 q0, q10, q0 - -vmin.f16 q0, q12, q0 - -vmax.f16 q0, q11, q0 - -vst1.16 {q0}, [r2]! -End: - -vpop {q4-q7} -pop {r4-r11, pc} - -#endif -#endif diff --git a/source/backend/arm82/asm/arm32/MNNConvDwF23SourceTransUnitFP16.S b/source/backend/arm82/asm/arm32/MNNConvDwF23SourceTransUnitFP16.S deleted file mode 100644 index f2fb67713..000000000 --- a/source/backend/arm82/asm/arm32/MNNConvDwF23SourceTransUnitFP16.S +++ /dev/null @@ -1,60 +0,0 @@ -// -// MNNConvDwF23SourceTransUnitFP16.S -// MNN -// -// Created by MNN on 2019/4/4. -// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __arm__ -#ifndef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNConvDwF23SourceTransUnitFP16 -// void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit); - -//Auto: -//r0: source, r1:dest, r2:unit - -push {lr} - -L1: -cmp r2, #0 -beq End - -vld1.16 {q8, q9}, [r0]! -vld1.16 {q10, q11}, [r0]! -subs r2, r2, #1 -vsub.f16 q0, q8, q10 -vadd.f16 q1, q9, q10 -beq L1LoopEnd - -L1Loop: - vsub.f16 q2, q10, q9 - vst1.16 {q0, q1}, [r1]! - vsub.f16 q3, q11, q9 - vmov.i32 q8, q10 - vst1.16 {q2, q3}, [r1]! - vmov.i32 q9, q11 - vld1.16 {q10, q11}, [r0]! - vsub.f16 q0, q8, q10 - vadd.f16 q1, q9, q10 - - subs r2, r2, #1 - bne L1Loop -L1LoopEnd: -vsub.f16 q2, q10, q9 -vsub.f16 q3, q11, q9 - -vst1.16 {q0, q1}, [r1]! -vst1.16 {q2, q3}, [r1]! 
- - -End: - -pop {pc} -#endif -#endif diff --git a/source/backend/arm82/asm/arm32/MNNConvRunForLineDepthwiseFP16.S b/source/backend/arm82/asm/arm32/MNNConvRunForLineDepthwiseFP16.S index 240c9b17a..c39406078 100644 --- a/source/backend/arm82/asm/arm32/MNNConvRunForLineDepthwiseFP16.S +++ b/source/backend/arm82/asm/arm32/MNNConvRunForLineDepthwiseFP16.S @@ -16,26 +16,35 @@ asm_function MNNConvRunForLineDepthwiseFP16 //void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup, -// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep) +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep, +// const float* bias, const float* parameters) //Auto Load: //r0:dst, r1:src, r2:weight, r3:width -push {r4-r11, lr} +push {r4-r8, r10, r11, lr} //Load From Sp -//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, r9: height, r10:srcHStep, r11:dstHStep -ldr r4, [sp, #36] -ldr r5, [sp, #40] -ldr r6, [sp, #44] -ldr r7, [sp, #48] -ldr r8, [sp, #52] -ldr r9, [sp, #56] -ldr r10, [sp, #60] -ldr r11, [sp, #64] +//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, lr: height, r10:srcHStep, r11:dstHStep +ldr r4, [sp, #32] +ldr r5, [sp, #36] +ldr r6, [sp, #40] +ldr r7, [sp, #44] +ldr r8, [sp, #48] +ldr lr, [sp, #52] +ldr r10, [sp, #56] +ldr r11, [sp, #60] +ldr r12, [sp, #64] // bias +vld1.32 {q0}, [r12] // bias +ldr r12, [sp, #68] // min,max +vld1.32 {d2[0]}, [r12]! +vld1.32 {d2[1]}, [r12] vpush {q4-q7} +vmov.f32 q5, q0 // bias +vdup.f32 q4, d2[0] // min +vdup.f32 q6, d2[1] // max mov r12, #2 // sizeof(FLOAT16) mul r4, r12, r4 @@ -49,7 +58,7 @@ mul r12, r5, r7 sub r8, r8, r12 LoopDY: -push {r0, r1, r3, r9, r10, r11} +push {r0, r1, r3, r10, r11, lr} L8: cmp r3, #7 @@ -59,18 +68,18 @@ mov r12, #8 mul r12, r4, r12 L8Loop: - vmov.i32 q8, #0 - vmov.i32 q9, #0 - vmov.i32 q10, #0 - vmov.i32 q11, #0 - vmov.i32 q12, #0 - vmov.i32 q13, #0 - vmov.i32 q14, #0 - vmov.i32 q15, #0 + vmov.f32 q8, q5 // use bias to init + vmov.f32 q9, q5 + vmov.f32 q10, q5 + vmov.f32 q11, q5 + vmov.f32 q12, q5 + vmov.f32 q13, q5 + vmov.f32 q14, q5 + vmov.f32 q15, q5 vmov.i32 d14[0], r1 vmov.i32 d14[1], r2 - mov r9, r6 + mov lr, r6 L8LoopH: mov r10, r5 L8LoopW: @@ -98,11 +107,27 @@ L8Loop: bne L8LoopW L8LoopWEnd: - subs r9, r9, #1 + subs lr, lr, #1 add r1, r1, r8 bne L8LoopH sub r3, r3, #8 + vmax.f32 q8, q8, q4 + vmax.f32 q9, q9, q4 + vmax.f32 q10, q10, q4 + vmax.f32 q11, q11, q4 + vmax.f32 q12, q12, q4 + vmax.f32 q13, q13, q4 + vmax.f32 q14, q14, q4 + vmax.f32 q15, q15, q4 + vmin.f32 q8, q8, q6 + vmin.f32 q9, q9, q6 + vmin.f32 q10, q10, q6 + vmin.f32 q11, q11, q6 + vmin.f32 q12, q12, q6 + vmin.f32 q13, q13, q6 + vmin.f32 q14, q14, q6 + vmin.f32 q15, q15, q6 vst1.16 {q8, q9}, [r0]! 
vmov.i32 r1, d14[0] vmov.i32 r2, d14[1] @@ -121,14 +146,14 @@ mov r12, #4 mul r12, r4, r12 L4Loop: - vmov.i32 q8, #0 - vmov.i32 q9, #0 - vmov.i32 q10, #0 - vmov.i32 q11, #0 - - vmov.i32 d8[0], r1 - vmov.i32 d9[0], r2 - mov r9, r6 + vmov.f32 q8, q5 + vmov.f32 q9, q5 + vmov.f32 q10, q5 + vmov.f32 q11, q5 + + vmov.i32 d14[0], r1 + vmov.i32 d14[1], r2 + mov lr, r6 L4LoopH: mov r10, r5 L4LoopW: @@ -147,14 +172,22 @@ L4Loop: add r1, r1, r7 bne L4LoopW - subs r9, r9, #1 + subs lr, lr, #1 add r1, r1, r8 bne L4LoopH + vmax.f32 q8, q8, q4 + vmax.f32 q9, q9, q4 + vmax.f32 q10, q10, q4 + vmax.f32 q11, q11, q4 + vmin.f32 q8, q8, q6 + vmin.f32 q9, q9, q6 + vmin.f32 q10, q10, q6 + vmin.f32 q11, q11, q6 sub r3, r3, #4 vst1.16 {q8, q9}, [r0]! - vmov.i32 r1, d8[0] - vmov.i32 r2, d9[0] + vmov.i32 r1, d14[0] + vmov.i32 r2, d14[1] vst1.16 {q10, q11}, [r0]! add r1, r1, r12 cmp r3, #4 @@ -168,8 +201,8 @@ cmp r3, #0 beq End L1Loop: - vmov.i32 q0, #0 - mov r9, r6 + vmov.f32 q0, q5 + mov lr, r6 mov r11, r1 mov r12, r2 L1LoopH: @@ -180,10 +213,12 @@ L1Loop: vmla.f16 q0, q1, q2 subs r10, r10, #1 bne L1LoopW - subs r9, r9, #1 + subs lr, lr, #1 add r1, r1, r8 bne L1LoopH + vmax.f32 q0, q0, q4 + vmin.f32 q0, q0, q6 subs r3, r3, #1 vst1.16 {q0}, [r0]! mov r2, r12 @@ -193,16 +228,15 @@ L1Loop: End: -pop {r0, r1, r3, r9, r10, r11} +pop {r0, r1, r3, r10, r11, lr} add r0, r0, r11 -subs r9, r9, #1 +subs lr, lr, #1 add r1, r1, r10 bne LoopDY vpop {q4-q7} -pop {r4-r11, pc} - +pop {r4-r8, r10, r11, pc} #endif #endif diff --git a/source/backend/arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S b/source/backend/arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S deleted file mode 100644 index 5585b2cb0..000000000 --- a/source/backend/arm82/asm/arm64/MNNConvDwF23MulTransUnitFP16.S +++ /dev/null @@ -1,122 +0,0 @@ -// -// MNNConvDwF23MulTransUnitFP16.S -// MNN -// -// Created by MNN on 2019/4/4. -// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNConvDwF23MulTransUnitFP16 -//void MNNConvDwF23MulTransUnitFP16(FLOAT16 **cacheLine, const FLOAT16 *weigth, FLOAT16 *dest, size_t ow); -//Auto: x0:cacheLine, x1:weight, x2:dest, x3:ow, x4: bias, x5: parameters - -stp d10, d11, [sp, #-32]! 
-stp d8, d9, [sp, #16] - -ld1 {v8.8h}, [x4] // bias -ldr w9, [x5, #8] -ldr w10, [x5, #12] -dup v9.4s, w9 // min -dup v10.4s, w10 // max -fcvtn v9.4h, v9.4s -fcvtn v10.4h, v10.4s -dup v9.8h, v9.h[0] -dup v10.8h, v10.h[0] - -ldr x4, [x0, #0] -ldr x5, [x0, #8] -ldr x6, [x0, #16] - -ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64 -ld1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x1], #64 -ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x1] - -L2: -cmp x3, #2 -blt L1 - -LoopL2: - -ld1 {v20.8h, v21.8h}, [x4], #32 -fmul v0.8h, v4.8h, v20.8h -ld1 {v22.8h, v23.8h}, [x4], #32 -fmul v1.8h, v5.8h, v21.8h -fmul v2.8h, v6.8h, v22.8h -ld1 {v20.8h, v21.8h}, [x5], #32 -fmul v3.8h, v7.8h, v23.8h - -fmla v0.8h, v16.8h, v20.8h -ld1 {v22.8h, v23.8h}, [x5], #32 -fmla v1.8h, v17.8h, v21.8h -fmla v2.8h, v18.8h, v22.8h -fmla v3.8h, v19.8h, v23.8h - -ld1 {v20.8h, v21.8h}, [x6], #32 -fmla v0.8h, v28.8h, v20.8h -fmla v1.8h, v29.8h, v21.8h -fadd v0.8h, v1.8h, v0.8h -ld1 {v22.8h, v23.8h}, [x6], #32 - -fmla v2.8h, v30.8h, v22.8h -fmla v3.8h, v31.8h, v23.8h -fadd v0.8h, v0.8h, v2.8h - -fadd v3.8h, v3.8h, v1.8h -fsub v1.8h, v3.8h, v2.8h - -fadd v0.8h, v0.8h, v8.8h -fadd v1.8h, v1.8h, v8.8h - -fmin v0.8h, v0.8h, v10.8h -fmin v1.8h, v1.8h, v10.8h - -fmax v0.8h, v0.8h, v9.8h -fmax v1.8h, v1.8h, v9.8h - -st1 {v0.8h, v1.8h}, [x2], #32 - -sub x3, x3, #2 -cmp x3, #2 -bge LoopL2 - - -L1: -cmp x3, #0 -beq End -ld1 {v20.8h, v21.8h, v22.8h}, [x4] -fmul v0.8h, v4.8h, v20.8h -fmul v1.8h, v5.8h, v21.8h -fmul v2.8h, v6.8h, v22.8h -ld1 {v20.8h, v21.8h, v22.8h}, [x5] - -fmla v0.8h, v16.8h, v20.8h -fmla v1.8h, v17.8h, v21.8h -fmla v2.8h, v18.8h, v22.8h - -ld1 {v20.8h, v21.8h, v22.8h}, [x6] -fmla v0.8h, v28.8h, v20.8h -fmla v1.8h, v29.8h, v21.8h -fadd v0.8h, v1.8h, v0.8h - -fmla v2.8h, v30.8h, v22.8h -fadd v0.8h, v0.8h, v2.8h - -fadd v0.8h, v0.8h, v8.8h - -fmin v0.8h, v0.8h, v10.8h - -fmax v0.8h, v0.8h, v9.8h -st1 {v0.8h}, [x2] -End: - -ldp d8, d9, [sp, #16] -ldp d10, d11, [sp], #32 - -ret -#endif diff --git a/source/backend/arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S b/source/backend/arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S deleted file mode 100644 index cac31e53d..000000000 --- a/source/backend/arm82/asm/arm64/MNNConvDwF23SourceTransUnitFP16.S +++ /dev/null @@ -1,56 +0,0 @@ -// -// MNNConvDwF23SourceTransUnitFP16.S -// MNN -// -// Created by MNN on 2019/4/4. 
-// Copyright © 2018, Alibaba Group Holding Limited -// -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNConvDwF23SourceTransUnitFP16 -// void MNNConvDwF23SourceTransUnitFP16(const FLOAT16 *source, FLOAT16 *dest, size_t unit); - -//Auto: -//x0: source, x1:dest, x2:unit - -L1: -cmp x2, #0 -beq End - -ld1 {v16.8h, v17.8h}, [x0], #32 -ld1 {v18.8h, v19.8h}, [x0], #32 -subs x2, x2, #1 -fsub v0.8h, v16.8h, v18.8h -fadd v1.8h, v17.8h, v18.8h -beq L1LoopEnd - -L1Loop: - fsub v2.8h, v18.8h, v17.8h - st1 {v0.8h, v1.8h}, [x1], #32 - fsub v3.8h, v19.8h, v17.8h - mov v16.16b, v18.16b - st1 {v2.8h, v3.8h}, [x1], #32 - mov v17.16b, v19.16b - ld1 {v18.8h, v19.8h}, [x0], #32 - fsub v0.8h, v16.8h, v18.8h - fadd v1.8h, v17.8h, v18.8h - - subs x2, x2, #1 - bne L1Loop -L1LoopEnd: -fsub v2.8h, v18.8h, v17.8h -fsub v3.8h, v19.8h, v17.8h - -st1 {v0.8h, v1.8h}, [x1], #32 -st1 {v2.8h, v3.8h}, [x1], #32 - - -End: -ret - -#endif diff --git a/source/backend/arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S b/source/backend/arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S index 1cb449d27..ada98a9b1 100644 --- a/source/backend/arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S +++ b/source/backend/arm82/asm/arm64/MNNConvRunForLineDepthwiseFP16.S @@ -15,17 +15,24 @@ asm_function MNNConvRunForLineDepthwiseFP16 //void MNNConvRunForLineDepthwiseFP16(FLOAT16* dst, const FLOAT16* src, const FLOAT16* weight, size_t width, size_t src_w_setup, -// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep) +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep, +// const float* bias, float* parameters) //Auto Load: //x0:dst, x1:src, x2:weight, x3:width, x4:src_w_setup, x5:fw, x6:fh, x7:dilate_x_step //Load From sp: -//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep +//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13:parameters ldr x8, [sp, #0] ldr x15, [sp, #8] ldr x10, [sp, #16] ldr x11, [sp, #24] +ldr x12, [sp, #32] +ldr x13, [sp, #40] + +stp d8, d9, [sp, #(-16 * 3)]! 
+stp d10, d11, [sp, #(16 * 2)] +stp x19, x20, [sp, #(16 * 1)] mov x9, #2 // sizeof(FLOAT16) mul x4, x9, x4 @@ -34,15 +41,30 @@ mul x8, x9, x8 mul x10, x9, x10 mul x11, x9, x11 +ld1 {v8.8h}, [x12] // bias +ld1r {v10.8h}, [x13], #2 // min +ld1r {v11.8h}, [x13] + //dilate_y_step -> dilate_y_step - fw*dilate_x_step mul x9, x5, x7 sub x8, x8, x9 -.macro zero_vec x0, x1, x2, x3 - movi \x0\().8h, #0 - movi \x1\().8h, #0 - movi \x2\().8h, #0 - movi \x3\().8h, #0 +.macro assign_bias x0, x1, x2, x3 + mov \x0\().16b, v8.16b + mov \x1\().16b, v8.16b + mov \x2\().16b, v8.16b + mov \x3\().16b, v8.16b +.endm + +.macro compare_min_max x0, x1, x2, x3, xmin, xmax + fmax \x0\().8h, \x0\().8h, \xmin\().8h + fmax \x1\().8h, \x1\().8h, \xmin\().8h + fmax \x2\().8h, \x2\().8h, \xmin\().8h + fmax \x3\().8h, \x3\().8h, \xmin\().8h + fmin \x0\().8h, \x0\().8h, \xmax\().8h + fmin \x1\().8h, \x1\().8h, \xmax\().8h + fmin \x2\().8h, \x2\().8h, \xmax\().8h + fmin \x3\().8h, \x3\().8h, \xmax\().8h .endm LoopDY: @@ -56,16 +78,16 @@ L16: cmp x3, #16 blt L8 -mov x12, #16 -mul x12, x4, x12 +mov x19, #16 +mul x19, x4, x19 L16Loop: - zero_vec v16, v17, v18, v19 - zero_vec v20, v21, v22, v23 - zero_vec v24, v25, v26, v27 - zero_vec v28, v29, v30, v31 + assign_bias v16, v17, v18, v19 + assign_bias v20, v21, v22, v23 + assign_bias v24, v25, v26, v27 + assign_bias v28, v29, v30, v31 - mov x13, x1 + mov x20, x1 mov x14, x2 mov x9, x6 L16LoopH: @@ -106,7 +128,7 @@ L16Loop: ld1 {v3.8h}, [x1], x4 fmla v30.8h, v7.8h, v2.8h fmla v31.8h, v7.8h, v3.8h - sub x1, x1, x12 + sub x1, x1, x19 add x1, x1, x7 bne L16LoopW @@ -115,8 +137,12 @@ L16Loop: bne L16LoopH sub x3, x3, #16 + compare_min_max v16, v17, v18, v19, v10, v11 + compare_min_max v20, v21, v22, v23, v10, v11 + compare_min_max v24, v25, v26, v27, v10, v11 + compare_min_max v28, v29, v30, v31, v10, v11 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 - add x1, x13, x12 + add x1, x20, x19 cmp x3, #16 mov x2, x14 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 @@ -129,14 +155,14 @@ L8: cmp x3, #7 ble L4 -mov x12, #8 -mul x12, x4, x12 +mov x19, #8 +mul x19, x4, x19 L8Loop: - zero_vec v16, v17, v18, v19 - zero_vec v20, v21, v22, v23 + assign_bias v16, v17, v18, v19 + assign_bias v20, v21, v22, v23 - mov x13, x1 + mov x20, x1 mov x14, x2 mov x9, x6 L8LoopH: @@ -161,7 +187,7 @@ L8Loop: ld1 {v1.8h}, [x1], x4 fmla v23.8h, v1.8h, v3.8h - sub x1, x1, x12 + sub x1, x1, x19 add x1, x1, x7 bne L8LoopW @@ -169,9 +195,12 @@ L8Loop: add x1, x1, x8 bne L8LoopH + compare_min_max v16, v17, v18, v19, v10, v11 + compare_min_max v20, v21, v22, v23, v10, v11 + sub x3, x3, #8 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 - add x1, x13, x12 + add x1, x20, x19 mov x2, x14 st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 @@ -180,13 +209,13 @@ L4: cmp x3, #4 ble L1 -mov x12, #4 -mul x12, x4, x12 +mov x19, #4 +mul x19, x4, x19 L4Loop: - zero_vec v16, v17, v18, v19 + assign_bias v16, v17, v18, v19 - mov x13, x1 + mov x20, x1 mov x14, x2 mov x9, x6 L4LoopH: @@ -203,7 +232,7 @@ L4Loop: ld1 {v1.8h}, [x1], x4 fmla v19.8h, v1.8h, v3.8h - sub x1, x1, x12 + sub x1, x1, x19 add x1, x1, x7 bne L4LoopW @@ -211,9 +240,10 @@ L4Loop: add x1, x1, x8 bne L4LoopH + compare_min_max v16, v17, v18, v19, v10, v11 sub x3, x3, #4 st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 - add x1, x13, x12 + add x1, x20, x19 mov x2, x14 L1: @@ -221,10 +251,10 @@ cmp x3, #0 beq End L1Loop: - movi v0.8h, #0 + mov v0.16b, v8.16b mov x9, x6 mov x11, x1 - mov x12, x2 + mov x19, x2 L1LoopH: mov x10, x5 L1LoopW: @@ -238,8 +268,10 @@ L1Loop: bne L1LoopH 
subs x3, x3, #1 + fmax v0.8h, v0.8h, v10.8h + fmin v0.8h, v0.8h, v11.8h st1 {v0.8h}, [x0], #16 - mov x2, x12 + mov x2, x19 add x1, x11, x4 bne L1Loop @@ -257,7 +289,9 @@ add x0, x0, x11 add x1, x1, x10 bne LoopDY - +ldp x19, x20, [sp, #(16 * 1)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d8, d9, [sp], #(16 * 3) ret #endif diff --git a/source/backend/arm82/asm/arm64/MNNDepthwiseConvFastKernelFP16.S b/source/backend/arm82/asm/arm64/MNNDepthwiseConvFastKernelFP16.S new file mode 100644 index 000000000..80accbebc --- /dev/null +++ b/source/backend/arm82/asm/arm64/MNNDepthwiseConvFastKernelFP16.S @@ -0,0 +1,290 @@ +// +// MNNDepthwiseConvFastKernelFP16.S +// MNN +// +// Created by MNN on 2024/09/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNDepthwiseConvFastKernelFP16 + +// void MNNDepthwiseConvFastKernelFP16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, +// size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters); +//Auto Load: +//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_step=pack*1, x5:fw, x6:fh, x7:dilate_x_step + +//Load From sp: +//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13: minmax +ldr x8, [sp, #0] +ldr x15, [sp, #8] +ldr x10, [sp, #16] +ldr x11, [sp, #24] +ldr x12, [sp, #32] +ldr x13, [sp, #40] + +stp d14, d15, [sp, #(-16 * 9)]! +stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] +stp x21, x22, [sp, #(16 * 4)] +stp x19, x20, [sp, #(16 * 5)] +stp x27, x28, [sp, #(16 * 6)] +stp x25, x26, [sp, #(16 * 7)] +stp x23, x24, [sp, #(16 * 8)] + +lsl x4, x4, #1 // src_w_step*sizeof(float) +lsl x7, x7, #1 // dilate_x_step*sizeof(float) +lsl x8, x8, #1 // dilate_y_step*sizeof(float) +lsl x23, x10, #1 // srcHStep*sizeof(float) +lsl x24, x11, #1 // dstHStep*sizeof(float) +mov x20, x12 // bias +mov x26, x13 // min +add x27, x13, #2 // max + +//dilate_y_step -> dilate_y_step - fw*dilate_x_step +mul x9, x5, x7 +sub x8, x8, x9 +mov x25, x3 // width +.macro assign_bias x0, x1, x2, x3, bv + mov \x0\().16b, \bv\().16b + mov \x1\().16b, \bv\().16b + mov \x2\().16b, \bv\().16b + mov \x3\().16b, \bv\().16b +.endm + +.macro compare_min_max x0, x1, x2, x3, xmin, xmax + fmax \x0\().8h, \x0\().8h, \xmin\().8h + fmax \x1\().8h, \x1\().8h, \xmin\().8h + fmax \x2\().8h, \x2\().8h, \xmin\().8h + fmax \x3\().8h, \x3\().8h, \xmin\().8h + fmin \x0\().8h, \x0\().8h, \xmax\().8h + fmin \x1\().8h, \x1\().8h, \xmax\().8h + fmin \x2\().8h, \x2\().8h, \xmax\().8h + fmin \x3\().8h, \x3\().8h, \xmax\().8h +.endm + +LoopDY: +//mov x23, x10 +//mov x24, x11 +mov x21, x0 +mov x22, x1 + +L16: +cmp x3, #16 +blt L8 + +mov x12, #-176 +mov x19, #256 + +L16Loop: + ld1 {v8.8h}, [x20] // load bias + assign_bias v16, v17, v18, v19, v8 + assign_bias v20, v21, v22, v23, v8 + assign_bias v24, v25, v26, v27, v8 + assign_bias v28, v29, v30, v31, v8 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L16LoopH: + mov x10, x5 + L16LoopW: + ld1 {v8.8h}, [x2], #16 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], #64 + ld1 {v9.8h, v10.8h, v11.8h, v12.8h}, [x1], #64 + subs x10, x10, #1 + fmla v16.8h, v8.8h, v0.8h + fmla v17.8h, v8.8h, v1.8h + fmla v18.8h, v8.8h, v2.8h + fmla v19.8h, v8.8h, v3.8h + + fmla v20.8h, v8.8h, v4.8h + fmla v21.8h, v8.8h, v5.8h + fmla v22.8h, v8.8h, v6.8h + fmla v23.8h, v8.8h, v7.8h + + ld1 
{v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x12 + + fmla v24.8h, v8.8h, v9.8h + fmla v25.8h, v8.8h, v10.8h + fmla v26.8h, v8.8h, v11.8h + fmla v27.8h, v8.8h, v12.8h + + fmla v28.8h, v8.8h, v0.8h + fmla v29.8h, v8.8h, v1.8h + fmla v30.8h, v8.8h, v2.8h + fmla v31.8h, v8.8h, v3.8h + + bne L16LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L16LoopH + ld1r {v10.8h}, [x26] // min + ld1r {v11.8h}, [x27] // max + sub x3, x3, #16 + compare_min_max v16, v17, v18, v19, v10, v11 + compare_min_max v20, v21, v22, v23, v10, v11 + compare_min_max v24, v25, v26, v27, v10, v11 + compare_min_max v28, v29, v30, v31, v10, v11 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + add x1, x13, x19 // 16 * pack * sizeof(float) + cmp x3, #16 + mov x2, x14 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], #64 + st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], #64 + bge L16Loop + + +L8: +ld1r {v10.8h}, [x26] // min +ld1r {v11.8h}, [x27] // max +ld1 {v24.8h}, [x20] // load bias +cmp x3, #7 +ble L4 + +mov x12, #-48 +mov x19, #128 + +L8Loop: + assign_bias v16, v17, v18, v19, v24 + assign_bias v20, v21, v22, v23, v24 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L8LoopH: + mov x10, x5 + L8LoopW: + ld1 {v8.8h}, [x2], #16 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], #64 + ld1 {v4.8h, v5.8h, v6.8h, v7.8h}, [x1], x12 + subs x10, x10, #1 + fmla v16.8h, v8.8h, v0.8h + fmla v17.8h, v8.8h, v1.8h + fmla v18.8h, v8.8h, v2.8h + fmla v19.8h, v8.8h, v3.8h + + fmla v20.8h, v8.8h, v4.8h + fmla v21.8h, v8.8h, v5.8h + fmla v22.8h, v8.8h, v6.8h + fmla v23.8h, v8.8h, v7.8h + + bne L8LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L8LoopH + + compare_min_max v16, v17, v18, v19, v10, v11 + compare_min_max v20, v21, v22, v23, v10, v11 + sub x3, x3, #8 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + add x1, x13, x19 // 8 * pack * sizeof(float) + mov x2, x14 + st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64 + + +L4: +cmp x3, #4 +ble L1 + +mov x12, #16 +mov x19, #64 + +L4Loop: + assign_bias v16, v17, v18, v19, v24 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L4LoopH: + mov x10, x5 + L4LoopW: + ld1 {v8.8h}, [x2], #16 + ld1 {v0.8h, v1.8h, v2.8h, v3.8h}, [x1], x12 + subs x10, x10, #1 + fmla v16.8h, v8.8h, v0.8h + fmla v17.8h, v8.8h, v1.8h + fmla v18.8h, v8.8h, v2.8h + fmla v19.8h, v8.8h, v3.8h + + bne L4LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L4LoopH + + compare_min_max v16, v17, v18, v19, v10, v11 + sub x3, x3, #4 + st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64 + add x1, x13, x19 + mov x2, x14 + +L1: +cmp x3, #0 +beq End + +mov x19, #16 + +L1Loop: + ld1 {v16.8h}, [x20] // assign bias + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L1LoopH: + mov x10, x5 + L1LoopW: + ld1 {v8.8h}, [x2], #16 + ld1 {v0.8h}, [x1], #16 + subs x10, x10, #1 + fmla v16.8h, v8.8h, v0.8h + + bne L1LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L1LoopH + + subs x3, x3, #1 + fmax v16.8h, v16.8h, v10.8h + fmin v16.8h, v16.8h, v11.8h + st1 {v16.8h}, [x0], #16 + add x1, x13, x4 + mov x2, x14 + bne L1Loop + + +End: + +//mov x10, x23 +//mov x11, x24 +//mov x0, x21 +//mov x1, x22 +mov x3, x25 + +subs x15, x15, #1 +add x0, x21, x24 +add x1, x22, x23 +bne LoopDY + +ldp x23, x24, [sp, #(16 * 8)] +ldp x25, x26, [sp, #(16 * 7)] +ldp x27, x28, [sp, #(16 * 6)] +ldp x19, x20, [sp, #(16 * 5)] +ldp x21, x22, [sp, #(16 * 4)] +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 9) +ret + +#endif diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S 
b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S index ad9313244..2a7cf474f 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_Unit_FP16.S @@ -108,14 +108,12 @@ stp x23, x24, [sp, #(16 * 8)] ldr x25, [x6, #40] // xKernelSum ldr x26, [x6, #48] // weightQuantBias ldr x23, [x6, #56] // fp32minmax -ldr x27, [x6, #64] // blockNum //add x24, x23, #4 mov x21, #16 // sizeof(float16_t) * PACK -mul x27, x27, x3 Start: -lsl x15, x27, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT +lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT mov x22, #48 // src_steps ldr x27, [x6, #80] // extra scale TILE_12: diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S index dd893b292..decf68d84 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit_FP16.S @@ -109,12 +109,10 @@ stp x23, x24, [sp, #(16 * 8)] ldr x25, [x6, #40] // xKernelSum ldr x26, [x6, #48] // weightQuantBias ldr x23, [x6, #56] // fp32minmax -ldr x27, [x6, #64] // blockNum mov x21, #16 // sizeof(float16_t) * PACK -mul x27, x27, x3 Start: -lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t) +lsl x15, x3, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t) mov x22, #48 // src_steps ldr x27, [x6, #80] // extra scale TILE_12: diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_Unit_FP16.S b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_Unit_FP16.S index 76c79b42e..6602d18b9 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_Unit_FP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_Unit_FP16.S @@ -150,15 +150,13 @@ stp x27, x28, [sp, #(16 * 8)] // ldr w23, [x6, #24] ldr x27, [x6, #40] // srcKernelSum ldr x28, [x6, #48] // weightQuanBias -ldr x23, [x6, #64] // blockNum ldr x14, [x6, #56] // fp32minmax -mul x23, x23, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block mov x22, #80 // GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT = 10 * 8 = 80 mov x21, #16 // sizeof(float16_t) * UNIT Start: -lsl x15, x23, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int8_t) = src_depth_quad * 64 = src_depth_quad << 6 +lsl x15, x3, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int8_t) = src_depth_quad * 64 = src_depth_quad << 6 ldr x23, [x6, #80] // extra scale TILE_10: cmp x7, #10 diff --git a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S index f6f6625d7..ea01fef1a 100644 --- a/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S +++ b/source/backend/arm82/asm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit_FP16.S @@ -130,15 +130,13 @@ stp x27, x28, [sp, #(16 * 8)] // ldr w23, [x6, #24] ldr x27, [x6, #40] // srcKernelSum ldr x28, [x6, #48] // weightQuanBias -ldr x23, [x6, #64] // blockNum ldr x14, [x6, #56] // fp32minmax -mul x23, x23, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block mov x22, #80 // GEMM_INT8_DST_XUNIT * GEMM_INT8_SRC_UNIT = 10 * 8 = 80 mov 
x21, #16 // sizeof(float16_t) * UNIT Start: -lsl x15, x23, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int4_t) = src_depth_quad * 8 * 8 * 0.5 = src_depth_quad << 5 +lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC * sizeof(int4_t) = src_depth_quad * 8 * 8 * 0.5 = src_depth_quad << 5 ldr x23, [x6, #80] // extra scale TILE_10: cmp x7, #10 diff --git a/source/backend/cpu/CMakeLists.txt b/source/backend/cpu/CMakeLists.txt index e37ae3e55..82287d69f 100644 --- a/source/backend/cpu/CMakeLists.txt +++ b/source/backend/cpu/CMakeLists.txt @@ -42,9 +42,11 @@ ENDIF() # ARM82 Assemblies IF(MNN_ARM82) - target_compile_options(MNNCPU PRIVATE -DENABLE_ARMV82) - include(${CMAKE_CURRENT_LIST_DIR}/../arm82/CMakeLists.txt) - list(APPEND MNN_TARGETS MNN_Arm82) - list(APPEND MNN_OBJECTS_TO_LINK $) + IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64") + target_compile_options(MNNCPU PRIVATE -DENABLE_ARMV82) + include(${CMAKE_CURRENT_LIST_DIR}/../arm82/CMakeLists.txt) + list(APPEND MNN_TARGETS MNN_Arm82) + list(APPEND MNN_OBJECTS_TO_LINK $) + ENDIF() ENDIF() diff --git a/source/backend/cpu/CPUBackend.cpp b/source/backend/cpu/CPUBackend.cpp index dd3401dcf..f28ba3e4a 100644 --- a/source/backend/cpu/CPUBackend.cpp +++ b/source/backend/cpu/CPUBackend.cpp @@ -48,7 +48,7 @@ ErrorCode CastWrapExecution::onExecute(const std::vector& inputs, const CPUCastCreator::cast(inputs[0], outputs[0], cpuBackend, convertType); return NO_ERROR; } -void CPURuntime::computeDivideSizes(int size, int* dst) const { +void CPUBackend::computeDivideSizes(int size, int* dst) const { if (mGroupWithComputeRate.size() <= 1) { // Avg divide int length = UP_DIV(size, mThreadNumber); @@ -132,40 +132,6 @@ void CPURuntime::_bindCPUCore() const { #endif } -void CPURuntime::_resetGroupCompute() const { - if (mPastDecreaseHint == hint().cpuDecreaseRate) { - return; - } - mGroupWithComputeRate.clear(); - if (mThreadNumber <= 1 || mPower == BackendConfig::Power_Low) { - return; - } - mPastDecreaseHint = hint().cpuDecreaseRate; - auto cpuInfo = MNNGetCPUInfo(); - if (cpuInfo->groups.size() < 2) { - return; - } - float decreaseRate = (float)(hint().cpuDecreaseRate) / 100.0f; - int validCpuSize = (int)(cpuInfo->groups[cpuInfo->groups.size()-1].ids.size()); - int groupIndex = (int)cpuInfo->groups.size()-2; - float maxFreq = (float)cpuInfo->groups[cpuInfo->groups.size()-1].maxFreq; - validCpuSize = ALIMIN(validCpuSize, mThreadNumber); - float totalComputeRate = 1.0f * validCpuSize; - mGroupWithComputeRate.emplace_back(std::make_pair(totalComputeRate, validCpuSize)); - float currentRate = 1.0f; - while (validCpuSize < mThreadNumber && groupIndex >= 0) { - auto& group = cpuInfo->groups[groupIndex]; - int selectSize = ALIMIN(mThreadNumber - validCpuSize, (int)group.ids.size()); - validCpuSize += group.ids.size(); - currentRate *= decreaseRate; - totalComputeRate += currentRate * selectSize; - mGroupWithComputeRate.emplace_back(std::make_pair(currentRate * selectSize, selectSize)); - } - for (auto& g : mGroupWithComputeRate) { - g.first = g.first / totalComputeRate; - } -} - void CPURuntime::_resetThreadPool() { mThreadNumber = std::max(1, mThreadNumber); mThreadNumber = std::min(mThreadNumber, MAX_THREAD_NUMBER); @@ -179,7 +145,6 @@ void CPURuntime::_resetThreadPool() { } mThreadNumber = ALIMIN(ThreadPool::init(systemThreadNumber), mThreadNumber); } - mGroupWithComputeRate.clear(); if (mThreadNumber > 1) { mTaskIndex = 
ThreadPool::acquireWorkIndex(); if (-1 == mTaskIndex) { @@ -204,8 +169,6 @@ void CPURuntime::onReset(int numberThread, const BackendConfig* config, bool ful } mThreadNumber = numberThread; _resetThreadPool(); - // Mask Group Compute reset - mPastDecreaseHint = -1; } CPURuntime::CPURuntime(const Backend::Info& info) { @@ -280,7 +243,6 @@ Backend* CPURuntime::onCreate(const BackendConfig* config, Backend* origin) cons auto cpuBn = static_cast(origin); mSharedDmaInfo = cpuBn->mDmaInfo; } - _resetGroupCompute(); if (nullptr != config) { precision = config->precision; flags = config->flags; @@ -403,6 +365,41 @@ CPUBackend::CPUBackend(const CPURuntime* runtime, BackendConfig::PrecisionMode p #endif mMemory = memory; mRuntime = const_cast(runtime); + mThreadNumber = mRuntime->mThreadNumber; + // Compute Group Rate + do { + if (mThreadNumber <= 1 || mRuntime->mPower == BackendConfig::Power_Low) { + break; + } + auto rate = mRuntime->hint().cpuDecreaseRate; + if (rate >= 100 || rate <= 0) { + break; + } + auto cpuInfo = MNNGetCPUInfo(); + if (cpuInfo->groups.size() < 2) { + break; + } + mGroupWithComputeRate.clear(); + float decreaseRate = (float)(rate) / 100.0f; + int validCpuSize = (int)(cpuInfo->groups[cpuInfo->groups.size()-1].ids.size()); + int groupIndex = (int)cpuInfo->groups.size()-2; + float maxFreq = (float)cpuInfo->groups[cpuInfo->groups.size()-1].maxFreq; + validCpuSize = ALIMIN(validCpuSize, mThreadNumber); + float totalComputeRate = 1.0f * validCpuSize; + mGroupWithComputeRate.emplace_back(std::make_pair(totalComputeRate, validCpuSize)); + float currentRate = 1.0f; + while (validCpuSize < mThreadNumber && groupIndex >= 0) { + auto& group = cpuInfo->groups[groupIndex]; + int selectSize = ALIMIN(mThreadNumber - validCpuSize, (int)group.ids.size()); + validCpuSize += group.ids.size(); + currentRate *= decreaseRate; + totalComputeRate += currentRate * selectSize; + mGroupWithComputeRate.emplace_back(std::make_pair(currentRate * selectSize, selectSize)); + } + for (auto& g : mGroupWithComputeRate) { + g.first = g.first / totalComputeRate; + } + } while (false); auto dynamicAlloc = mRuntime->mSharedDmaInfo; if (nullptr == dynamicAlloc.get()) { mDmaInfo.reset(new CPURuntime::DynamicAllocator); diff --git a/source/backend/cpu/CPUBackend.hpp b/source/backend/cpu/CPUBackend.hpp index b4c9843d0..00e39fc30 100644 --- a/source/backend/cpu/CPUBackend.hpp +++ b/source/backend/cpu/CPUBackend.hpp @@ -40,9 +40,6 @@ class CPURuntime : public Runtime { void onConcurrencyEnd() const; virtual bool onCheckInfo(Backend::Info& info) const override; - // dividedSize's length should be larger than threadNumber - void computeDivideSizes(int size, int* dst) const; - #ifdef MNN_USE_THREAD_POOL inline bool multiThreadValid() const { return mThreadOpen; @@ -60,9 +57,6 @@ class CPURuntime : public Runtime { mutable int mTaskIndex = -1; mutable bool mThreadOpen = false; #endif - void _resetGroupCompute() const; - mutable std::vector> mGroupWithComputeRate; - mutable int mPastDecreaseHint = -1; BackendConfig::MemoryMode mMemory; BackendConfig::PowerMode mPower; BackendConfig::PrecisionMode mPrecision; @@ -108,6 +102,8 @@ class CPUBackend : public Backend { // Return sizeDivide, scheduleNumber aligned memory std::pair multiThreadDivide(int size) const; virtual bool onSelectDynamicAllocator(int index, int maxIndex) override; + // dividedSize's length should be larger than threadNumber + void computeDivideSizes(int size, int* dst) const; public: virtual MemObj* onAcquire(const Tensor* nativeTensor, StorageType 
storageType) override; @@ -145,7 +141,7 @@ class CPUBackend : public Backend { static bool addCreator(OpType t, Creator* c); inline int threadNumber() const { - return mRuntime->mThreadNumber; + return mThreadNumber; } #ifdef MNN_USE_THREAD_POOL inline bool threadOpen() const { @@ -182,6 +178,9 @@ class CPUBackend : public Backend { CoreFunctions* mCoreFunctions; CoreInt8Functions* mInt8CoreFunctions; private: + int mThreadNumber; + std::vector> mGroupWithComputeRate; + std::shared_ptr mDmaInfo; std::shared_ptr mStaticAllocator; CPURuntime* mRuntime; diff --git a/source/backend/cpu/CPUConvolutionDepthwise.cpp b/source/backend/cpu/CPUConvolutionDepthwise.cpp index f3fdf2cb3..6d6e2df96 100644 --- a/source/backend/cpu/CPUConvolutionDepthwise.cpp +++ b/source/backend/cpu/CPUConvolutionDepthwise.cpp @@ -14,7 +14,6 @@ #include "core/TensorUtils.hpp" #include "backend/cpu/compute/CommonOptFunction.h" #include "backend/cpu/compute/ConvOpt.h" -#include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp" namespace MNN { CPUConvolutionDepthwise::FloatExecution::FloatExecution(const Convolution2DCommon* common, Backend* b, @@ -129,8 +128,7 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect auto core = static_cast(backend())->functions(); int bytes = core->bytes; int unit = core->pack; - auto unitFunc = core->MNNConvRunForUnitDepthWise; - auto lineFunc = core->MNNConvRunForLineDepthwise; + auto kernelFunc = core->MNNConvRunForLineDepthwise; auto postFunc = core->MNNAxByClampBroadcastUnit; auto inputTensor = inputs[0]; auto outputTensor = outputs[0]; @@ -169,72 +167,60 @@ ErrorCode CPUConvolutionDepthwise::BasicFloatExecution::onResize(const std::vect int weight_z_step = kernel_height * kernel_width * unit; int dilateY_step = dilateY * src_width * unit; int dilateX_step = dilateX * unit; - // Compute Mid Rect - int l = 0, t = 0, r = dst_width, b = dst_height; - for (; l * strideX - padX < 0 && l < dst_width; l++) { - // do nothing - } - for (; t * strideY - padY < 0 && t < dst_height; t++) { - // do nothing - } - for (; (r - 1) * strideX - padX + (kernel_width - 1) * dilateX >= src_width && r > l; r--) { - // do nothing - } - for (; (b - 1) * strideY - padY + (kernel_height - 1) * dilateY >= src_height && b > t; b--) { - // do nothing - } - auto postData = getPostParameters(); auto batch = inputs[0]->batch(); int total = batch * dst_depth_quad; int numberThread = ((CPUBackend*)backend())->threadNumber(); - auto rt = static_cast(backend()->getRuntime()); - auto runBasic = [=](uint8_t* dst_z, const uint8_t* src_z, const uint8_t* weight_dz, int L, int T, int R, int B) { - for (int dy = T; dy < B; ++dy) { - auto dst_y = dst_z + dy * dst_y_step * bytes; - int srcStartY = dy * strideY - padY; - const auto src_dy = src_z + srcStartY * src_y_step * bytes; - int sfy = ALIMAX(0, (UP_DIV(-srcStartY, dilateY))); - int efy = ALIMIN(kernel_height, UP_DIV(src_height - srcStartY, dilateY)); - for (int dx = L; dx < R; ++dx) { - auto dst_x = dst_y + unit * dx * bytes; - int srcStartX = dx * strideX - padX; - const auto src_dx = src_dy + srcStartX * unit * bytes; - int sfx = ALIMAX(0, (UP_DIV(-srcStartX, dilateX))); - int efx = ALIMIN(kernel_width, UP_DIV(src_width - srcStartX, dilateX)); - unitFunc((float*)dst_x, (const float*)(src_dx + (sfx * dilateX + sfy * dilateY * src_width) * unit * bytes), - (const float*)(weight_dz + unit * (kernel_width * sfy + sfx) * bytes), efx - sfx, efy - sfy, - unit * kernel_width, dilateX_step, dilateY_step); - } - } - }; std::vector divides(numberThread+1); 
divides[0] = 0; - rt->computeDivideSizes(total, divides.data()+1); - mExecutor = [=](const uint8_t* srcOrigin, uint8_t* dstOrigin, int tId) { + static_cast(backend())->computeDivideSizes(total, divides.data()+1); + mNumber = numberThread; + auto postData = getPostParameters(); + if (static_cast(backend())->functions()->bytes < 4) { + static_cast(backend())->functions()->MNNFp32ToLowp(postData.data() + 2, (int16_t*)(postData.data() + 2), 2); + } + mFastKernelApply = (dilateX == 1 && dilateY == 1 && strideX == 1 && strideY == 1 && core->MNNDepthwiseConvFastKernel); + if (mFastKernelApply ) { // Only support ARM kernel + kernelFunc = core->MNNDepthwiseConvFastKernel; + } + auto pads = ConvolutionCommon::convolutionPadFull(inputs[0], outputs[0], mCommon); + int paddedWidth = std::get<0>(pads) + std::get<2>(pads) + src_width; + int paddedHeight = std::get<1>(pads) + std::get<3>(pads) + src_height; + mInputPad.reset(Tensor::createDevice({mNumber, paddedWidth * paddedHeight * unit})); + bool succ = backend()->onAcquireBuffer(mInputPad.get(), Backend::DYNAMIC); + if (!succ) { + return OUT_OF_MEMORY; + } + if (paddedWidth != src_width) { + dilateY_step = dilateY * paddedWidth * unit; + src_y_step = paddedWidth * unit; + } + mExecutor = [=](const uint8_t* inputPtr, uint8_t* outputPtr, int tId) { + const auto inputPadPtr = mInputPad->host() + mInputPad->stride(0) * tId * bytes; + ::memset(inputPadPtr, 0, mInputPad->stride(0) * bytes); auto biasP = inputs[2]->host(); auto weightP = inputs[1]->host(); for (int index = divides[tId]; index < divides[tId+1]; ++index) { + int dz = index / batch; - auto dst_z = dstOrigin + dst_z_step * index * bytes; - const auto src_z = srcOrigin + src_z_step * index * bytes; + auto dstOrigin = outputPtr + dst_z_step * index * bytes; + const auto srcOrigin = inputPtr + src_z_step * index * bytes; auto bias_z = biasP + unit * dz * bytes; const auto weight_dz = weightP + dz * weight_z_step * bytes; - runBasic(dst_z, src_z, weight_dz, 0, 0, dst_width, t); - runBasic(dst_z, src_z, weight_dz, 0, b, dst_width, dst_height); - runBasic(dst_z, src_z, weight_dz, 0, t, l, b); - runBasic(dst_z, src_z, weight_dz, r, t, dst_width, b); - if (r > l && b > t) { - lineFunc((float*)(dst_z + (t * dst_y_step + l * unit) * bytes), - (const float*)(src_z + ((t * strideY - padY) * src_y_step + (l * strideX - padX) * unit) * bytes), - (const float*)weight_dz, r - l, strideX * unit, kernel_width, kernel_height, dilateX_step, - dilateY_step, b - t, src_y_step * strideY, dst_y_step); + + auto srcPtr = srcOrigin; + // Pad inputs + for (int y = 0; y < src_height; ++y) { + auto src = srcOrigin + y * src_width * unit * bytes; + auto dst = inputPadPtr + ((y + padY) * paddedWidth + padX) * unit * bytes; + ::memcpy(dst, src, src_width * unit * bytes); } - postFunc((float*)dst_z, (float*)dst_z, (const float*)bias_z, dst_width * dst_height, 0, 0, 1, postData.data()); + + // Compute + kernelFunc((float*)dstOrigin, (const float*)(inputPadPtr), (const float*)weight_dz, dst_width, strideX * unit, kernel_width, kernel_height, dilateX_step, dilateY_step, dst_height, src_y_step * strideY, dst_y_step, (const float*)bias_z, postData.data() + 2); } }; - mNumber = numberThread; - + backend()->onReleaseBuffer(mInputPad.get(), Backend::DYNAMIC); return NO_ERROR; } @@ -281,11 +267,6 @@ class CPUConvolutionDepthwiseCreator : public CPUBackend::Creator { if (inputs.empty()) { return new CPUConvolutionDepthwise::FloatExecution(conv2d->common(), backend, originWeight, originWeightSize, originBias, originBiasSize); } - auto 
core = static_cast(backend)->functions(); - if (conv->dilateX() == 1 && conv->dilateY() == 1 && conv->strideX() == 1 && conv->strideY() == 1 && - conv->kernelX() == 3 && conv->kernelY() == 3 && outputs[0]->width() >= 2 && outputs[0]->height() >= 2 && core->MNNMultiAndDestTransformCommon23 != nullptr) { - return new ConvolutionDepthwise3x3(conv, backend, originWeight, originWeightSize, originBias, originBiasSize); - } return new CPUConvolutionDepthwise::FloatExecution(conv2d->common(), backend, originWeight, originWeightSize, originBias, originBiasSize); } }; diff --git a/source/backend/cpu/CPUConvolutionDepthwise.hpp b/source/backend/cpu/CPUConvolutionDepthwise.hpp index 9b7cbecbc..91efb7b01 100644 --- a/source/backend/cpu/CPUConvolutionDepthwise.hpp +++ b/source/backend/cpu/CPUConvolutionDepthwise.hpp @@ -26,7 +26,12 @@ class CPUConvolutionDepthwise { private: std::function mExecutor; + std::function mFastKernel; int mNumber = 1; + std::shared_ptr mInputPad; + bool mFastKernelApply = false; }; class MultiInputFloatExecution : public BasicFloatExecution { public: diff --git a/source/backend/cpu/CPUDepthwiseConvInt8.cpp b/source/backend/cpu/CPUDepthwiseConvInt8.cpp index 0df722bb4..8f94e84fc 100644 --- a/source/backend/cpu/CPUDepthwiseConvInt8.cpp +++ b/source/backend/cpu/CPUDepthwiseConvInt8.cpp @@ -142,7 +142,7 @@ ErrorCode CPUDepthwiseConvInt8::onResize(const std::vector& inputs, con int size_ = mMutableResource.mBiasInt32->length(0); if (core->ConvDepthwise3x3LineInt8_ARM82) { - if (kernel_width == 3 && kernel_height == 3 && strideX == 1 && strideY == 1 && dilateX == 1 && dilateY == 1 && gcore->MNNMultiAndDestTransformCommon23 != nullptr && dst_width >= 2 && dst_height >= 2) { + if (kernel_width == 3 && kernel_height == 3 && strideX == 1 && strideY == 1 && dilateX == 1 && dilateY == 1 && dst_width >= 2 && dst_height >= 2) { mUse3x3Kernel = true; mThreadFunction = core->ConvDepthwise3x3LineInt8_ARM82; UNIT = 4; @@ -247,7 +247,7 @@ class CPUDepthwiseConvInt8Creator : public CPUBackend::Creator { if (core->ConvDepthwise3x3LineInt8_ARM82) { if (common->kernelX() == 3 && common->kernelY() == 3 && common->strideX() == 1 && common->strideY() == 1 && common->dilateX() == 1 - && common->dilateY() == 1 && gcore->MNNMultiAndDestTransformCommon23 != nullptr && outputs[0]->width() >= 2 && outputs[0]->height() >= 2) { + && common->dilateY() == 1 && outputs[0]->width() >= 2 && outputs[0]->height() >= 2) { use3x3kernel = true; UNIT = 4; } diff --git a/source/backend/cpu/CPUGridSample.cpp b/source/backend/cpu/CPUGridSample.cpp index 3cc633d3d..e39c22171 100644 --- a/source/backend/cpu/CPUGridSample.cpp +++ b/source/backend/cpu/CPUGridSample.cpp @@ -98,8 +98,8 @@ ErrorCode CPUGridSample::onExecute(const std::vector &inputs, const st auto outW = outputTensor->buffer().dim[4].extent; auto threadCount = static_cast(backend())->threadNumber(); auto tileCount = outD; - auto inOffset = batches * inH * inW * core->pack; - auto outOffset = batches * outH * outW * core->pack; + auto inOffset = batches * inD * inH * inW * core->pack; + auto outOffset = batches * outD * outH * outW * core->pack; auto cordPtr = mTempCordBuffer->host(); for (auto b = 0; b < batches; ++b) { auto _inputPtr = inputPtr + b * inD * inH * inW * core->pack * core->bytes; @@ -109,10 +109,9 @@ ErrorCode CPUGridSample::onExecute(const std::vector &inputs, const st // Compute cord MNN_CONCURRENCY_BEGIN(tId, threadCount) { for (int index=tId; index < tileCount; index += threadCount) { - auto c = index / outD; - auto d = index % outD; - auto 
inputC = _inputPtr + c * inD * inW * inH * batches * core->pack * core->bytes; - auto outputC = _outputPtr + c * outD * outW * outH * batches * core->pack * core->bytes; + auto d = index; + auto inputC = _inputPtr; + auto outputC = _outputPtr; auto cordD = cordPtr + d * outH * outW * 3 * core->bytes; auto outputD = outputC + d * outH * outW * core->pack * core->bytes; for (int h = 0; h < outH; h++) { diff --git a/source/backend/cpu/CPURuntime.cpp b/source/backend/cpu/CPURuntime.cpp index 41f04fd8b..17a653f52 100644 --- a/source/backend/cpu/CPURuntime.cpp +++ b/source/backend/cpu/CPURuntime.cpp @@ -1373,6 +1373,9 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) { } group.ids = _readNumber((const char*)buffer.get(), buffer.size()); } + if (group.ids.empty()) { + continue; + } std::string minfreq = policyName + "/cpuinfo_min_freq"; { MNN::AutoStorage buffer; @@ -1439,6 +1442,11 @@ static void _fillInfo(MNNCPUInfo* cpuinfo_isa) { _getInfoApple(cpuinfo_isa); #endif +#if defined(__aarch64__) && defined(_WIN32) + cpuinfo_isa->fp16arith = true; + cpuinfo_isa->dot = true; +#endif + MNN_PRINT("The device supports: i8sdot:%d, fp16:%d, i8mm: %d, sve2: %d\n", cpuinfo_isa->dot, cpuinfo_isa->fp16arith, cpuinfo_isa->i8mm, cpuinfo_isa->sve2); return; } diff --git a/source/backend/cpu/GridSampler.hpp b/source/backend/cpu/GridSampler.hpp index e2e738d26..895521349 100644 --- a/source/backend/cpu/GridSampler.hpp +++ b/source/backend/cpu/GridSampler.hpp @@ -138,7 +138,7 @@ static int MNNGridSampleComputeOffset3D(int d, int h, int w, int depth, int heig h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h); w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w); } - return ((d * height + h) * width + w) * 4; + return ((d * height + h) * width + w) * PACK; } static void MNNGridSampleInterp3D(FLOAT* outputPtr, const FLOAT* inputPtr, const FLOAT* cordPtr, size_t inD, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) { diff --git a/source/backend/cpu/arm/CMakeLists.txt b/source/backend/cpu/arm/CMakeLists.txt index d8d06136c..37ae4c6d4 100644 --- a/source/backend/cpu/arm/CMakeLists.txt +++ b/source/backend/cpu/arm/CMakeLists.txt @@ -30,7 +30,7 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "^armv7" OR ARCHS MATCHES "^armv7(;armv7s)?") if (MNN_SUPPORT_BF16) target_compile_options(MNNARM32 PRIVATE -DMNN_SUPPORT_BF16) endif() -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64" OR ARCHS STREQUAL "ARM64") message(STATUS "Enabling AArch64 Assemblies") add_library(MNNARM64 OBJECT ${MNN_AArch64_SRC} ${MNN_NEON_SRC}) target_include_directories(MNNARM64 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/) @@ -42,11 +42,6 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^aarch64" OR ARCHS STREQUAL "arm64") target_compile_options(MNNARM64 PRIVATE -DMNN_SUPPORT_BF16) endif() - if(MNN_ARM82) - message(STATUS "Enable INT8 SDOT") - target_compile_options(MNNARM64 PRIVATE -DENABLE_ARMV82) - endif() - else() # Building fat binary requires multiple separate builds and lipo-by-hand under CMake's design endif() diff --git a/source/backend/cpu/arm/FunctionSummary.hpp b/source/backend/cpu/arm/FunctionSummary.hpp index 4c9a3ad19..be435004d 100644 --- a/source/backend/cpu/arm/FunctionSummary.hpp +++ b/source/backend/cpu/arm/FunctionSummary.hpp @@ -34,9 +34,6 @@ void NEON_MNNPackedMatMul_BF16(float* C, const float* A, const float* B, const s const float* postParameters, const float* bias, const float* 
k, const float* b); void NEON_MNNPackedMatMulRemain_BF16(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter, const float* postParameters, const float* bias, const float* k, const float* b); - -void NEON_MNNConvRunForUnitDepthWise_BF16(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); void NEON_MNNConvRunForLineDepthwise_BF16(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep); diff --git a/source/backend/cpu/arm/arm32/MNNConvRunForLineDepthwise.S b/source/backend/cpu/arm/arm32/MNNConvRunForLineDepthwise.S index 6fde7c37b..2cccf62ea 100644 --- a/source/backend/cpu/arm/arm32/MNNConvRunForLineDepthwise.S +++ b/source/backend/cpu/arm/arm32/MNNConvRunForLineDepthwise.S @@ -34,8 +34,16 @@ ldr r8, [sp, #48] ldr lr, [sp, #52] ldr r10, [sp, #56] ldr r11, [sp, #60] +ldr r12, [sp, #64] // bias +vld1.32 {q0}, [r12] // bias +ldr r12, [sp, #68] // min,max +vld1.32 {d2[0]}, [r12]! +vld1.32 {d2[1]}, [r12] vpush {q4-q7} +vmov.f32 q5, q0 // bias +vdup.f32 q4, d2[0] // min +vdup.f32 q6, d2[1] // max mov r12, #4 mul r4, r12, r4 @@ -59,14 +67,14 @@ mov r12, #8 mul r12, r4, r12 L8Loop: - vmov.i32 q8, #0 - vmov.i32 q9, #0 - vmov.i32 q10, #0 - vmov.i32 q11, #0 - vmov.i32 q12, #0 - vmov.i32 q13, #0 - vmov.i32 q14, #0 - vmov.i32 q15, #0 + vmov.f32 q8, q5 // use bias to init + vmov.f32 q9, q5 + vmov.f32 q10, q5 + vmov.f32 q11, q5 + vmov.f32 q12, q5 + vmov.f32 q13, q5 + vmov.f32 q14, q5 + vmov.f32 q15, q5 vmov.i32 d14[0], r1 vmov.i32 d14[1], r2 @@ -103,6 +111,22 @@ L8Loop: bne L8LoopH sub r3, r3, #8 + vmax.f32 q8, q8, q4 + vmax.f32 q9, q9, q4 + vmax.f32 q10, q10, q4 + vmax.f32 q11, q11, q4 + vmax.f32 q12, q12, q4 + vmax.f32 q13, q13, q4 + vmax.f32 q14, q14, q4 + vmax.f32 q15, q15, q4 + vmin.f32 q8, q8, q6 + vmin.f32 q9, q9, q6 + vmin.f32 q10, q10, q6 + vmin.f32 q11, q11, q6 + vmin.f32 q12, q12, q6 + vmin.f32 q13, q13, q6 + vmin.f32 q14, q14, q6 + vmin.f32 q15, q15, q6 vst1.32 {q8, q9}, [r0]! vmov.i32 r1, d14[0] vmov.i32 r2, d14[1] @@ -121,13 +145,13 @@ mov r12, #4 mul r12, r4, r12 L4Loop: - vmov.i32 q8, #0 - vmov.i32 q9, #0 - vmov.i32 q10, #0 - vmov.i32 q11, #0 + vmov.f32 q8, q5 + vmov.f32 q9, q5 + vmov.f32 q10, q5 + vmov.f32 q11, q5 - vmov.i32 d8[0], r1 - vmov.i32 d9[0], r2 + vmov.i32 d14[0], r1 + vmov.i32 d14[1], r2 mov lr, r6 L4LoopH: mov r10, r5 @@ -151,10 +175,18 @@ L4Loop: add r1, r1, r8 bne L4LoopH + vmax.f32 q8, q8, q4 + vmax.f32 q9, q9, q4 + vmax.f32 q10, q10, q4 + vmax.f32 q11, q11, q4 + vmin.f32 q8, q8, q6 + vmin.f32 q9, q9, q6 + vmin.f32 q10, q10, q6 + vmin.f32 q11, q11, q6 sub r3, r3, #4 vst1.32 {q8, q9}, [r0]! - vmov.i32 r1, d8[0] - vmov.i32 r2, d9[0] + vmov.i32 r1, d14[0] + vmov.i32 r2, d14[1] vst1.32 {q10, q11}, [r0]! add r1, r1, r12 cmp r3, #4 @@ -168,7 +200,7 @@ cmp r3, #0 beq End L1Loop: - vmov.i32 q0, #0 + vmov.f32 q0, q5 mov lr, r6 mov r11, r1 mov r12, r2 @@ -184,6 +216,8 @@ L1Loop: add r1, r1, r8 bne L1LoopH + vmax.f32 q0, q0, q4 + vmin.f32 q0, q0, q6 subs r3, r3, #1 vst1.32 {q0}, [r0]! 
mov r2, r12 @@ -203,6 +237,5 @@ bne LoopDY vpop {q4-q7} pop {r4-r8, r10, r11, pc} - #endif #endif diff --git a/source/backend/cpu/arm/arm32/MNNConvRunForUnitDepthWise.S b/source/backend/cpu/arm/arm32/MNNConvRunForUnitDepthWise.S deleted file mode 100644 index 06c98c03b..000000000 --- a/source/backend/cpu/arm/arm32/MNNConvRunForUnitDepthWise.S +++ /dev/null @@ -1,74 +0,0 @@ -// -// MNNConvRunForUnitDepthWise.S -// MNN -// -// Created by MNN on 2019/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __arm__ -#ifndef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNConvRunForUnitDepthWise -//void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step) - -//Auto: r0:dst, r1:src, r2:weight, r3:fw - -push {r4-r8, lr} - -//Load from sp: -//r5:fh, r6:weight_y_step, r7:dilate_x_step, r8:dilate_y_step -mov r4, r3 -ldr r5, [sp, #24] -ldr r6, [sp, #28] -ldr r7, [sp, #32] -ldr r8, [sp, #36] - -cmp r4, #0 -vmov.i32 q0, #0 -beq UnitEnd -cmp r5, #0 -beq UnitEnd - -mov lr, #4 -mul r6, lr, r6 -mul r7, lr, r7 -mul r8, lr, r8 - -//dilate_y_step -> dilate_y_step - dilate_x_step*fw -mul lr, r4, r7 -sub r8, r8, lr - -//weight_y_step -> weight_y_step - 4*sizeof(float)*fw -mov lr, #16 -mul lr, r4, lr -sub r6, r6, lr - - -UnitLoopH: -mov lr, r4 -UnitLoopW: -vld1.32 {q1}, [r1], r7 -vld1.32 {q2}, [r2]! -vmla.f32 q0, q1, q2 -subs lr, lr, #1 -bne UnitLoopW -subs r5, r5, #1 -add r1, r1, r8 -add r2, r2, r6 -bne UnitLoopH - - -UnitEnd: - -vst1.32 {q0}, [r0] - -pop {r4-r8, pc} - -#endif -#endif diff --git a/source/backend/cpu/arm/arm32/MNNDepthwiseConvFastKernel.S b/source/backend/cpu/arm/arm32/MNNDepthwiseConvFastKernel.S new file mode 100644 index 000000000..3c71c406d --- /dev/null +++ b/source/backend/cpu/arm/arm32/MNNDepthwiseConvFastKernel.S @@ -0,0 +1,221 @@ +// +// MNNDepthwiseConvFastKernel.S +// MNN +// +// Created by MNN on 2019/02/04. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifdef __arm__ +#ifndef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNDepthwiseConvFastKernel +//void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, size_t srcHStep, size_t dstHStep) + + +//Auto Load: +//r0:dst, r1:src, r2:weight, r3:width + +push {r4-r8, r10, r11, lr} + +//Load From Sp +//r4:src_w_setup, r5:fw, r6:fh, r7:dilate_x_step, r8:dilate_y_step, lr: height, r10:srcHStep, r11:dstHStep +ldr r4, [sp, #32] +ldr r5, [sp, #36] +ldr r6, [sp, #40] +ldr r7, [sp, #44] +ldr r8, [sp, #48] +ldr lr, [sp, #52] +ldr r10, [sp, #56] +ldr r11, [sp, #60] +ldr r12, [sp, #64] // bias +vld1.32 {q0}, [r12] // bias +ldr r12, [sp, #68] // min,max +vld1.32 {d2[0]}, [r12]! 
+vld1.32 {d2[1]}, [r12] + +vpush {q4-q7} +vmov.f32 q5, q0 // bias +vdup.f32 q4, d2[0] // min +vdup.f32 q6, d2[1] // max + +mov r12, #4 +mul r4, r12, r4 +mul r7, r12, r7 +mul r8, r12, r8 +mul r10, r12, r10 +mul r11, r12, r11 + +//dilate_y_step -> dilate_y_step - fw*dilate_x_step +mul r12, r5, r7 +sub r8, r8, r12 + +LoopDY: +push {r0, r1, r3, r10, r11, lr} + +L8: +cmp r3, #7 +ble L4 + +L8Loop: + vmov.f32 q8, q5 // use bias to init + vmov.f32 q9, q5 + vmov.f32 q10, q5 + vmov.f32 q11, q5 + vmov.f32 q12, q5 + vmov.f32 q13, q5 + vmov.f32 q14, q5 + vmov.f32 q15, q5 + + mov r12, r1 + mov r4, r2 + mov lr, r6 + L8LoopH: + mov r10, r5 + L8LoopW: + vld1.32 {q7}, [r2]! + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1]! + subs r10, r10, #1 + vmla.f32 q8, q0, q7 + vmla.f32 q9, q1, q7 + vmla.f32 q10, q2, q7 + vmla.f32 q11, q3, q7 + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1] + vmla.f32 q12, q0, q7 + vmla.f32 q13, q1, q7 + vmla.f32 q14, q2, q7 + vmla.f32 q15, q3, q7 + sub r1, r1, #80 + + bne L8LoopW + L8LoopWEnd: + subs lr, lr, #1 + add r1, r1, r8 + bne L8LoopH + + sub r3, r3, #8 + vmax.f32 q8, q8, q4 + vmax.f32 q9, q9, q4 + vmax.f32 q10, q10, q4 + vmax.f32 q11, q11, q4 + vmax.f32 q12, q12, q4 + vmax.f32 q13, q13, q4 + vmax.f32 q14, q14, q4 + vmax.f32 q15, q15, q4 + vmin.f32 q8, q8, q6 + vmin.f32 q9, q9, q6 + vmin.f32 q10, q10, q6 + vmin.f32 q11, q11, q6 + vmin.f32 q12, q12, q6 + vmin.f32 q13, q13, q6 + vmin.f32 q14, q14, q6 + vmin.f32 q15, q15, q6 + vst1.32 {q8, q9}, [r0]! + mov r1, r12 + mov r2, r4 + vst1.32 {q10, q11}, [r0]! + vst1.32 {q12, q13}, [r0]! + vst1.32 {q14, q15}, [r0]! + add r1, r1, #128 + cmp r3, #8 + bge L8Loop + +L4: +cmp r3, #3 +ble L1 + +L4Loop: + vmov.f32 q8, q5 + vmov.f32 q9, q5 + vmov.f32 q10, q5 + vmov.f32 q11, q5 + + mov r12, r1 + mov r4, r2 + mov lr, r6 + L4LoopH: + mov r10, r5 + L4LoopW: + vld1.32 {q12}, [r2]! + vld1.32 {q0, q1}, [r1]! + vld1.32 {q2, q3}, [r1] + sub r1, r1, #16 + subs r10, r10, #1 + vmla.f32 q8, q12, q0 + vmla.f32 q9, q12, q1 + vmla.f32 q10, q12, q2 + vmla.f32 q11, q12, q3 + + bne L4LoopW + subs lr, lr, #1 + add r1, r1, r8 + bne L4LoopH + + vmax.f32 q8, q8, q4 + vmax.f32 q9, q9, q4 + vmax.f32 q10, q10, q4 + vmax.f32 q11, q11, q4 + vmin.f32 q8, q8, q6 + vmin.f32 q9, q9, q6 + vmin.f32 q10, q10, q6 + vmin.f32 q11, q11, q6 + sub r3, r3, #4 + vst1.32 {q8, q9}, [r0]! + mov r1, r12 + mov r2, r4 + vst1.32 {q10, q11}, [r0]! + add r1, r1, #64 + cmp r3, #4 + bge L4Loop + +L1: +cmp r3, #0 +beq End +L1Loop: + vmov.f32 q0, q5 + mov lr, r6 + mov r11, r1 + mov r12, r2 + L1LoopH: + mov r10, r5 + L1LoopW: + vld1.32 {q1}, [r1]! + vld1.32 {q2}, [r2]! + vmla.f32 q0, q1, q2 + subs r10, r10, #1 + bne L1LoopW + subs lr, lr, #1 + add r1, r1, r8 + bne L1LoopH + + vmax.f32 q0, q0, q4 + vmin.f32 q0, q0, q6 + subs r3, r3, #1 + vst1.32 {q0}, [r0]! 
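
The L8 path above accumulates eight horizontally adjacent C4 outputs against a single weight vector per inner step; the back-to-back vld1 loads plus the fixed "sub r1, r1, #80" / "add r1, r1, #128" pointer adjustments only make sense when consecutive taps and consecutive outputs are exactly one pack (16 bytes) apart, so the fast kernel appears to assume stride 1 and X-dilation 1. A scalar model of one 8-output tile under that assumed precondition:

    #include <cstddef>

    // Scalar model of the L8 tile: eight adjacent output pixels in C4 layout share
    // one 4-float weight tap per inner step. Assumes stride 1 and dilation 1 in X
    // (consecutive taps and consecutive outputs are one pack apart), which is the
    // precondition under which this fast kernel is expected to be selected.
    static void FastKernelTile8Ref(float* dst, const float* src, const float* weight,
                                   size_t fw, size_t fh, size_t srcYStep /* floats */,
                                   const float* bias, float minV, float maxV) {
        float acc[8][4];
        for (int o = 0; o < 8; ++o)
            for (int i = 0; i < 4; ++i) acc[o][i] = bias[i];        // seeded with bias
        for (size_t fy = 0; fy < fh; ++fy) {
            for (size_t fx = 0; fx < fw; ++fx) {
                const float* w = weight + 4 * (fy * fw + fx);        // one C4 weight tap
                const float* s = src + fy * srcYStep + 4 * fx;       // contiguous in x
                for (int o = 0; o < 8; ++o)
                    for (int i = 0; i < 4; ++i)
                        acc[o][i] += s[4 * o + i] * w[i];
            }
        }
        for (int o = 0; o < 8; ++o)
            for (int i = 0; i < 4; ++i) {
                float v = acc[o][i] < minV ? minV : acc[o][i];       // fused clamp
                dst[4 * o + i] = v > maxV ? maxV : v;
            }
    }
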
+ mov r2, r12 + add r1, r11, #16 + bne L1Loop + + +End: + +pop {r0, r1, r3, r10, r11, lr} +add r0, r0, r11 +subs lr, lr, #1 +add r1, r1, r10 +bne LoopDY + +vpop {q4-q7} +pop {r4-r8, r10, r11, pc} + + +#endif +#endif diff --git a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S index 8b62af530..9c37ae75d 100644 --- a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S +++ b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_Unit.S @@ -65,9 +65,7 @@ ldr r12, [r6, #8] // int8 max str r12, [sp, #16] ldr r12, [r6, #12] // int8 min str r12, [sp, #20] -ldr r12, [r6, #40] // blockNum -mul r12, r12, r3 // src_depth_quad=src_depth_quad*blockNum -lsl r12, r12, #6 // weight_stride = src_depth_quad*LP*HP +lsl r12, r3, #6 // weight_stride = src_depth_quad*LP*HP str r12, [sp, #24] ldr r12, [r6, #48] // extraScale str r12, [sp, #28] diff --git a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S index 0e3966b9e..f3cdc98f9 100644 --- a/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S +++ b/source/backend/cpu/arm/arm32/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S @@ -65,9 +65,7 @@ ldr r12, [r6, #32] // weightBias str r12, [sp, #8] ldr r12, [r6, #36] // f32minmax str r12, [sp, #12] -ldr r12, [r6, #40] // blockNum -mul r12, r12, r3 // src_depth_quad=src_depth_quad*blockNum -lsl r12, r12, #5 // weight_stride = src_depth_quad*LP*HP +lsl r12, r3, #5 // weight_stride = src_depth_quad*LP*HP str r12, [sp, #16] ldr r12, [r6, #48] // extraScale str r12, [sp, #20] @@ -82,12 +80,14 @@ L2LoopDz: subs r12, r3, #1 // first four output vld1.8 {q2}, [r1]! - vld1.8 {q4}, [r2]! // weight, d8,d9,d10,d11 + vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11 // int4->int8 - vmov.i8 q5, #15 - vand.i8 q5, q5, q4 + vmov.i8 q6, #15 + vmov.i8 q7, #15 + vand.i8 q6, q6, q4 + vand.i8 q7, q7, q5 vshr.u8 q4, q4, #4 - vzip.8 q4, q5 + vshr.u8 q5, q5, #4 vmull.s8 q0, d4, d8 vmull.s8 q1, d4, d10 @@ -95,12 +95,6 @@ L2LoopDz: vmlal.s8 q1, d5, d11 vpaddl.s16 q8, q0 vpaddl.s16 q9, q1 - vld1.8 {q6}, [r2]! // weight,d12,d13,d14,d15 - // int4->int8 - vmov.i8 q7, #15 - vand.i8 q7, q7, q6 - vshr.u8 q6, q6, #4 - vzip.8 q6, q7 vmull.s8 q0, d4, d12 vmull.s8 q1, d4, d14 @@ -129,22 +123,18 @@ L2LoopDz: L2LoopSz: // first four output vld1.8 {q2}, [r1]! - vld1.8 {q4}, [r2]! + vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11 // int4->int8 - vmov.i8 q5, #15 - vand.i8 q5, q5, q4 + vmov.i8 q6, #15 + vmov.i8 q7, #15 + vand.i8 q6, q6, q4 + vand.i8 q7, q7, q5 vshr.u8 q4, q4, #4 - vzip.8 q4, q5 + vshr.u8 q5, q5, #4 vmull.s8 q0, d4, d8 vmull.s8 q1, d4, d10 vmlal.s8 q0, d5, d9 vmlal.s8 q1, d5, d11 - vld1.8 {q6}, [r2]! - // int4->int8 - vmov.i8 q7, #15 - vand.i8 q7, q7, q6 - vshr.u8 q6, q6, #4 - vzip.8 q6, q7 vpadal.s16 q8, q0 vpadal.s16 q9, q1 @@ -269,12 +259,14 @@ L1LoopDz: subs r12, r3, #1 // first four output vld1.8 {q2}, [r1]! - vld1.8 {q4}, [r2]! + vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11 // int4->int8 - vmov.i8 q5, #15 - vand.i8 q5, q5, q4 + vmov.i8 q6, #15 + vmov.i8 q7, #15 + vand.i8 q6, q6, q4 + vand.i8 q7, q7, q5 vshr.u8 q4, q4, #4 - vzip.8 q4, q5 + vshr.u8 q5, q5, #4 vmull.s8 q0, d4, d8 vmull.s8 q1, d4, d10 @@ -282,12 +274,6 @@ L1LoopDz: vmlal.s8 q1, d5, d11 vpaddl.s16 q8, q0 vpaddl.s16 q9, q1 - vld1.8 {q6}, [r2]! 
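
In the int4 GEMM above, the kernel now loads 32 bytes of packed weights at once and keeps the shift/mask results as two separate groups (all high nibbles, then all low nibbles), so the per-iteration vzip.8 disappears. That only works if the host-side weight repacking (the ConvInt8TiledExecutor changes later in this diff) already stores the nibbles in that de-interleaved order; the sketch below contrasts the two expansion orders and is an illustration, not the actual packing code.

    #include <cstdint>

    // Expanding packed int4 weights to one byte per lane. The old kernel produced
    // interleaved order and therefore needed vzip.8 after the shift/mask; the new
    // kernel consumes "grouped" order: per 32-byte load, first the 32 high nibbles,
    // then the 32 low nibbles. Values stay in 0..15 here, as in the assembly
    // (any zero-point/weight-bias correction is applied elsewhere).
    static void UnpackInterleaved16(const uint8_t* in /*16 bytes*/, uint8_t* out /*32*/) {
        for (int i = 0; i < 16; ++i) {
            out[2 * i + 0] = in[i] >> 4;     // high nibble
            out[2 * i + 1] = in[i] & 0xF;    // low nibble
        }
    }
    static void UnpackGrouped32(const uint8_t* in /*32 bytes*/, uint8_t* out /*64*/) {
        for (int i = 0; i < 32; ++i) {
            out[i]      = in[i] >> 4;        // lanes  0..31: high nibbles
            out[32 + i] = in[i] & 0xF;       // lanes 32..63: low nibbles
        }
    }
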
- // int4->int8 - vmov.i8 q7, #15 - vand.i8 q7, q7, q6 - vshr.u8 q6, q6, #4 - vzip.8 q6, q7 vmull.s8 q0, d4, d12 vmull.s8 q1, d4, d14 @@ -302,22 +288,18 @@ L1LoopDz: L1LoopSz: // first four output vld1.8 {q2}, [r1]! - vld1.8 {q4}, [r2]! + vld1.8 {q4, q5}, [r2]! // weight, d8,d9,d10,d11 // int4->int8 - vmov.i8 q5, #15 - vand.i8 q5, q5, q4 + vmov.i8 q6, #15 + vmov.i8 q7, #15 + vand.i8 q6, q6, q4 + vand.i8 q7, q7, q5 vshr.u8 q4, q4, #4 - vzip.8 q4, q5 + vshr.u8 q5, q5, #4 vmull.s8 q0, d4, d8 vmull.s8 q1, d4, d10 vmlal.s8 q0, d5, d9 vmlal.s8 q1, d5, d11 - vld1.8 {q6}, [r2]! - // int4->int8 - vmov.i8 q7, #15 - vand.i8 q7, q7, q6 - vshr.u8 q6, q6, #4 - vzip.8 q6, q7 vpadal.s16 q8, q0 vpadal.s16 q9, q1 diff --git a/source/backend/cpu/arm/arm64/MNNConvRunForLineDepthwise.S b/source/backend/cpu/arm/arm64/MNNConvRunForLineDepthwise.S index 9a1bcbf46..08c174af3 100644 --- a/source/backend/cpu/arm/arm64/MNNConvRunForLineDepthwise.S +++ b/source/backend/cpu/arm/arm64/MNNConvRunForLineDepthwise.S @@ -26,6 +26,12 @@ ldr x8, [sp, #0] ldr x15, [sp, #8] ldr x10, [sp, #16] ldr x11, [sp, #24] +ldr x12, [sp, #32] +ldr x13, [sp, #40] + +stp d8, d9, [sp, #(-16 * 3)]! +stp d10, d11, [sp, #(16 * 2)] +stp x19, x20, [sp, #(16 * 1)] mov x9, #4 mul x4, x9, x4 @@ -34,10 +40,32 @@ mul x8, x9, x8 mul x10, x9, x10 mul x11, x9, x11 +ld1 {v8.4s}, [x12] // bias +ld1r {v10.4s}, [x13], #4 // min +ld1r {v11.4s}, [x13] + //dilate_y_step -> dilate_y_step - fw*dilate_x_step mul x9, x5, x7 sub x8, x8, x9 +.macro assign_bias x0, x1, x2, x3 + mov \x0\().16b, v8.16b + mov \x1\().16b, v8.16b + mov \x2\().16b, v8.16b + mov \x3\().16b, v8.16b +.endm + +.macro compare_min_max x0, x1, x2, x3, xmin, xmax + fmax \x0\().4s, \x0\().4s, \xmin\().4s + fmax \x1\().4s, \x1\().4s, \xmin\().4s + fmax \x2\().4s, \x2\().4s, \xmin\().4s + fmax \x3\().4s, \x3\().4s, \xmin\().4s + fmin \x0\().4s, \x0\().4s, \xmax\().4s + fmin \x1\().4s, \x1\().4s, \xmax\().4s + fmin \x2\().4s, \x2\().4s, \xmax\().4s + fmin \x3\().4s, \x3\().4s, \xmax\().4s +.endm + LoopDY: mov v4.d[0], x10 mov v4.d[1], x11 @@ -53,22 +81,10 @@ mov x12, #16 mul x12, x4, x12 L16Loop: - movi v16.4s, #0 - movi v17.4s, #0 - movi v18.4s, #0 - movi v19.4s, #0 - movi v20.4s, #0 - movi v21.4s, #0 - movi v22.4s, #0 - movi v23.4s, #0 - movi v24.4s, #0 - movi v25.4s, #0 - movi v26.4s, #0 - movi v27.4s, #0 - movi v28.4s, #0 - movi v29.4s, #0 - movi v30.4s, #0 - movi v31.4s, #0 + assign_bias v16, v17, v18, v19 + assign_bias v20, v21, v22, v23 + assign_bias v24, v25, v26, v27 + assign_bias v28, v29, v30, v31 mov x13, x1 mov x14, x2 @@ -120,6 +136,10 @@ L16Loop: bne L16LoopH sub x3, x3, #16 + compare_min_max v16, v17, v18, v19, v10, v11 + compare_min_max v20, v21, v22, v23, v10, v11 + compare_min_max v24, v25, v26, v27, v10, v11 + compare_min_max v28, v29, v30, v31, v10, v11 st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 add x1, x13, x12 cmp x3, #16 @@ -138,14 +158,8 @@ mov x12, #8 mul x12, x4, x12 L8Loop: - movi v16.4s, #0 - movi v17.4s, #0 - movi v18.4s, #0 - movi v19.4s, #0 - movi v20.4s, #0 - movi v21.4s, #0 - movi v22.4s, #0 - movi v23.4s, #0 + assign_bias v16, v17, v18, v19 + assign_bias v20, v21, v22, v23 mov x13, x1 mov x14, x2 @@ -180,6 +194,8 @@ L8Loop: add x1, x1, x8 bne L8LoopH + compare_min_max v16, v17, v18, v19, v10, v11 + compare_min_max v20, v21, v22, v23, v10, v11 sub x3, x3, #8 st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 add x1, x13, x12 @@ -195,10 +211,7 @@ mov x12, #4 mul x12, x4, x12 L4Loop: - movi v16.4s, #0 - movi v17.4s, #0 - movi v18.4s, #0 - movi v19.4s, #0 + assign_bias 
v16, v17, v18, v19 mov x13, x1 mov x14, x2 @@ -225,6 +238,7 @@ L4Loop: add x1, x1, x8 bne L4LoopH + compare_min_max v16, v17, v18, v19, v10, v11 sub x3, x3, #4 st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 add x1, x13, x12 @@ -235,7 +249,7 @@ cmp x3, #0 beq End L1Loop: - movi v0.4s, #0 + mov v0.16b, v8.16b mov x9, x6 mov x11, x1 mov x12, x2 @@ -252,6 +266,8 @@ L1Loop: bne L1LoopH subs x3, x3, #1 + fmax v0.4s, v0.4s, v10.4s + fmin v0.4s, v0.4s, v11.4s st1 {v0.4s}, [x0], #16 mov x2, x12 add x1, x11, x4 @@ -271,7 +287,9 @@ add x0, x0, x11 add x1, x1, x10 bne LoopDY - +ldp x19, x20, [sp, #(16 * 1)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d8, d9, [sp], #(16 * 3) ret //MNNConvRunForLineDepthwise End diff --git a/source/backend/cpu/arm/arm64/MNNConvRunForUnitDepthWise.S b/source/backend/cpu/arm/arm64/MNNConvRunForUnitDepthWise.S deleted file mode 100644 index 1036c90eb..000000000 --- a/source/backend/cpu/arm/arm64/MNNConvRunForUnitDepthWise.S +++ /dev/null @@ -1,63 +0,0 @@ -// -// MNNConvRunForUnitDepthWise.S -// MNN -// -// Created by MNN on 2019/02/04. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifdef __aarch64__ - -#include "MNNAsmGlobal.h" - -.text -.align 5 - -asm_function MNNConvRunForUnitDepthWise -//void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilate_x_step, size_t dilate_y_step) - -//Auto: x0:dst, x1:src, x2:weight, x3:fw -//x4:fh, x5:weight_y_step, x6:dilate_x_step, x7:dilate_y_step - -cmp x3, #0 -movi v0.4s, #0 -beq UnitEnd -cmp x4, #0 -beq UnitEnd - -mov x9, #4 -mul x5, x9, x5 -mul x6, x9, x6 -mul x7, x9, x7 - -//dilate_y_step -> dilate_y_step - dilate_x_step*fw -mul x9, x3, x6 -sub x7, x7, x9 - -//weight_y_step -> weight_y_step - 4*sizeof(float)*fw -mov x9, #16 -mul x9, x3, x9 -sub x5, x5, x9 - - -UnitLoopH: -mov x9, x3 -UnitLoopW: -ld1 {v1.4s}, [x1], x6 -ld1 {v2.4s}, [x2], #16 -fmla v0.4s, v1.4s, v2.4s -subs x9, x9, #1 -bne UnitLoopW -subs x4, x4, #1 -add x1, x1, x7 -add x2, x2, x5 -bne UnitLoopH - - -UnitEnd: - -st1 {v0.4s}, [x0] - -ret - -#endif diff --git a/source/backend/cpu/arm/arm64/MNNDepthwiseConvFastKernel.S b/source/backend/cpu/arm/arm64/MNNDepthwiseConvFastKernel.S new file mode 100644 index 000000000..79770ba09 --- /dev/null +++ b/source/backend/cpu/arm/arm64/MNNDepthwiseConvFastKernel.S @@ -0,0 +1,292 @@ +// +// MNNDepthwiseConvFastKernel.S +// MNN +// +// Created by MNN on 2024/09/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + + +#ifdef __aarch64__ + +#include "MNNAsmGlobal.h" + +.text +.align 5 + +asm_function MNNDepthwiseConvFastKernel + +// void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, +// size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, +// size_t srcHStep, size_t dstHStep); +//Auto Load: +//x0:dst, x1:src, x2:weight, x3:width, x4:src_w_step=pack*1, x5:fw, x6:fh, x7:dilate_x_step + +//Load From sp: +//x8:dilate_y_step, x15: height, x10: srcHStep, x11:dstHStep, x12:bias, x13: minmax +ldr x8, [sp, #0] +ldr x15, [sp, #8] +ldr x10, [sp, #16] +ldr x11, [sp, #24] +ldr x12, [sp, #32] +ldr x13, [sp, #40] + +stp d14, d15, [sp, #(-16 * 9)]! 
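
The arm64 version expresses the same fusion through the assign_bias and compare_min_max macros shown above; because v8-v11 now carry bias and the min/max bounds across the whole loop, the prologue has to preserve d8-d11 (the low halves of v8-v15 are callee-saved under AAPCS64). Below is a C++ analogue of the two macros, a sketch that assumes the Vec4 alias and header already used by CommonOptFunction.cpp elsewhere in this diff.

    #include "math/Vec.hpp"                       // assumed: same header as CommonOptFunction.cpp
    using Vec4 = MNN::Math::Vec<float, 4>;        // assumed alias, as in CommonOptFunction.cpp

    static inline void AssignBias(Vec4& a0, Vec4& a1, Vec4& a2, Vec4& a3, const Vec4& bias) {
        a0 = bias; a1 = bias; a2 = bias; a3 = bias;            // accumulators start at bias
    }
    static inline void CompareMinMax(Vec4& a0, Vec4& a1, Vec4& a2, Vec4& a3,
                                     const Vec4& minF, const Vec4& maxF) {
        a0 = Vec4::min(Vec4::max(a0, minF), maxF);             // clamp to [min, max]
        a1 = Vec4::min(Vec4::max(a1, minF), maxF);
        a2 = Vec4::min(Vec4::max(a2, minF), maxF);
        a3 = Vec4::min(Vec4::max(a3, minF), maxF);
    }
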
+stp d12, d13, [sp, #(16 * 1)] +stp d10, d11, [sp, #(16 * 2)] +stp d8, d9, [sp, #(16 * 3)] +stp x21, x22, [sp, #(16 * 4)] +stp x19, x20, [sp, #(16 * 5)] +stp x27, x28, [sp, #(16 * 6)] +stp x25, x26, [sp, #(16 * 7)] +stp x23, x24, [sp, #(16 * 8)] + +lsl x4, x4, #2 // src_w_step*sizeof(float) +lsl x7, x7, #2 // dilate_x_step*sizeof(float) +lsl x8, x8, #2 // dilate_y_step*sizeof(float) +lsl x23, x10, #2 // srcHStep*sizeof(float) +lsl x24, x11, #2 // dstHStep*sizeof(float) +mov x20, x12 // bias +mov x26, x13 // min +add x27, x13, #4 // max + +//dilate_y_step -> dilate_y_step - fw*dilate_x_step +mul x9, x5, x7 +sub x8, x8, x9 +mov x25, x3 // width +.macro assign_bias x0, x1, x2, x3, bv + mov \x0\().16b, \bv\().16b + mov \x1\().16b, \bv\().16b + mov \x2\().16b, \bv\().16b + mov \x3\().16b, \bv\().16b +.endm + +.macro compare_min_max x0, x1, x2, x3, xmin, xmax + fmax \x0\().4s, \x0\().4s, \xmin\().4s + fmax \x1\().4s, \x1\().4s, \xmin\().4s + fmax \x2\().4s, \x2\().4s, \xmin\().4s + fmax \x3\().4s, \x3\().4s, \xmin\().4s + fmin \x0\().4s, \x0\().4s, \xmax\().4s + fmin \x1\().4s, \x1\().4s, \xmax\().4s + fmin \x2\().4s, \x2\().4s, \xmax\().4s + fmin \x3\().4s, \x3\().4s, \xmax\().4s +.endm + +LoopDY: +//mov x23, x10 +//mov x24, x11 +mov x21, x0 +mov x22, x1 + +L16: +cmp x3, #16 +blt L8 + +mov x12, #-176 +mov x19, #256 + +L16Loop: + ld1 {v8.4s}, [x20] // load bias + assign_bias v16, v17, v18, v19, v8 + assign_bias v20, v21, v22, v23, v8 + assign_bias v24, v25, v26, v27, v8 + assign_bias v28, v29, v30, v31, v8 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L16LoopH: + mov x10, x5 + L16LoopW: + ld1 {v8.4s}, [x2], #16 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64 + ld1 {v9.4s, v10.4s, v11.4s, v12.4s}, [x1], #64 + subs x10, x10, #1 + fmla v16.4s, v8.4s, v0.4s + fmla v17.4s, v8.4s, v1.4s + fmla v18.4s, v8.4s, v2.4s + fmla v19.4s, v8.4s, v3.4s + + fmla v20.4s, v8.4s, v4.4s + fmla v21.4s, v8.4s, v5.4s + fmla v22.4s, v8.4s, v6.4s + fmla v23.4s, v8.4s, v7.4s + + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], x12 + + fmla v24.4s, v8.4s, v9.4s + fmla v25.4s, v8.4s, v10.4s + fmla v26.4s, v8.4s, v11.4s + fmla v27.4s, v8.4s, v12.4s + + fmla v28.4s, v8.4s, v0.4s + fmla v29.4s, v8.4s, v1.4s + fmla v30.4s, v8.4s, v2.4s + fmla v31.4s, v8.4s, v3.4s + + bne L16LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L16LoopH + ld1r {v10.4s}, [x26] // min + ld1r {v11.4s}, [x27] // max + sub x3, x3, #16 + compare_min_max v16, v17, v18, v19, v10, v11 + compare_min_max v20, v21, v22, v23, v10, v11 + compare_min_max v24, v25, v26, v27, v10, v11 + compare_min_max v28, v29, v30, v31, v10, v11 + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 + add x1, x13, x19 // 16 * pack * sizeof(float) + cmp x3, #16 + mov x2, x14 + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 + st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64 + st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], #64 + bge L16Loop + + +L8: +ld1r {v10.4s}, [x26] // min +ld1r {v11.4s}, [x27] // max +ld1 {v24.4s}, [x20] // load bias +cmp x3, #7 +ble L4 + +mov x12, #-48 +mov x19, #128 + + +L8Loop: + assign_bias v16, v17, v18, v19, v24 + assign_bias v20, v21, v22, v23, v24 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L8LoopH: + mov x10, x5 + L8LoopW: + ld1 {v8.4s}, [x2], #16 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], #64 + ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], x12 + subs x10, x10, #1 + fmla v16.4s, v8.4s, v0.4s + fmla v17.4s, v8.4s, v1.4s + fmla v18.4s, v8.4s, v2.4s + fmla v19.4s, v8.4s, v3.4s + + fmla v20.4s, v8.4s, v4.4s + fmla v21.4s, v8.4s, v5.4s + 
fmla v22.4s, v8.4s, v6.4s + fmla v23.4s, v8.4s, v7.4s + + bne L8LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L8LoopH + + compare_min_max v16, v17, v18, v19, v10, v11 + compare_min_max v20, v21, v22, v23, v10, v11 + sub x3, x3, #8 + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 + add x1, x13, x19 // 8 * pack * sizeof(float) + mov x2, x14 + st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64 + + +L4: +cmp x3, #4 +ble L1 + +mov x12, #16 +mov x19, #64 + +L4Loop: + assign_bias v16, v17, v18, v19, v24 + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L4LoopH: + mov x10, x5 + L4LoopW: + ld1 {v8.4s}, [x2], #16 + ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x1], x12 + subs x10, x10, #1 + fmla v16.4s, v8.4s, v0.4s + fmla v17.4s, v8.4s, v1.4s + fmla v18.4s, v8.4s, v2.4s + fmla v19.4s, v8.4s, v3.4s + + bne L4LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L4LoopH + + compare_min_max v16, v17, v18, v19, v10, v11 + sub x3, x3, #4 + st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], #64 + add x1, x13, x19 + mov x2, x14 + +L1: +cmp x3, #0 +beq End + +mov x19, #16 + +L1Loop: + ld1 {v16.4s}, [x20] // assign bias + + mov x13, x1 + mov x14, x2 + mov x9, x6 + L1LoopH: + mov x10, x5 + L1LoopW: + ld1 {v8.4s}, [x2], #16 + ld1 {v0.4s}, [x1], #16 + subs x10, x10, #1 + fmla v16.4s, v8.4s, v0.4s + + bne L1LoopW + subs x9, x9, #1 + add x1, x1, x8 + bne L1LoopH + + subs x3, x3, #1 + fmax v16.4s, v16.4s, v10.4s + fmin v16.4s, v16.4s, v11.4s + st1 {v16.4s}, [x0], #16 + add x1, x13, x4 + mov x2, x14 + bne L1Loop + + +End: + +//mov x10, x23 +//mov x11, x24 +//mov x0, x21 +//mov x1, x22 +mov x3, x25 + +subs x15, x15, #1 +add x0, x21, x24 +add x1, x22, x23 +bne LoopDY + +ldp x23, x24, [sp, #(16 * 8)] +ldp x25, x26, [sp, #(16 * 7)] +ldp x27, x28, [sp, #(16 * 6)] +ldp x19, x20, [sp, #(16 * 5)] +ldp x21, x22, [sp, #(16 * 4)] +ldp d8, d9, [sp, #(16 * 3)] +ldp d10, d11, [sp, #(16 * 2)] +ldp d12, d13, [sp, #(16 * 1)] +ldp d14, d15, [sp], #(16 * 9) +ret +//MNNConvRunForLineDepthwise End + +#endif diff --git a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_16x4_Unit.S b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_16x4_Unit.S index d31d57ad7..b2cf3b215 100644 --- a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_16x4_Unit.S +++ b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_16x4_Unit.S @@ -118,8 +118,7 @@ stp x23, x24, [sp, #(16 * 6)] ldr x19, [x15, #56] // fp32 min max ldr x21, [x15, #64] // blockNum ldr x23, [x15, #80] // extraScale -mul x21, x21, x3 // blockNum * src_depth_quad_perblock -lsl x21, x21, #6 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int8_t) +lsl x21, x3, #6 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int8_t) add x20, x19, #4 Start: diff --git a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S index 339bbd37e..c5203dde4 100644 --- a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S +++ b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV82_Unit.S @@ -125,9 +125,7 @@ stp x27, x28, [sp, #(16 * 6)] stp x25, x26, [sp, #(16 * 7)] stp x23, x24, [sp, #(16 * 8)] -ldr x27, [x6, #64] // blockNum -mul x27, x27, x3 // blockNum * src_depth_quad_perblock -lsl x15, x27, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT +lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * SRC_UNIT ldr w28, [x6, #24] // useInt8 ldr x25, [x6, #40] // xKernelSum diff --git a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S index 0225e0b4e..621f7a84b 100644 --- 
a/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S +++ b/source/backend/cpu/arm/arm64/MNNGemmInt8AddBiasScale_ARMV86_Unit.S @@ -138,9 +138,7 @@ ldr w23, [x6, #24] ldr x27, [x6, #40] // srcKernelSum ldr x28, [x6, #48] // weightQuanBias -ldr x22, [x6, #64] // blockNum -mul x22, x22, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block -lsl x15, x22, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 = src_depth_quad << 6 +lsl x15, x3, #6 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 = src_depth_quad << 6 ldr x10, [x6, #80] // extra scale mov x21, #4 // sizeof(int8_t) * pack diff --git a/source/backend/cpu/arm/arm64/MNNPackC4Int8ForMatMulA_ARM86.S b/source/backend/cpu/arm/arm64/MNNPackC4Int8ForMatMulA_ARM86.S index 803166f17..dde601bfc 100644 --- a/source/backend/cpu/arm/arm64/MNNPackC4Int8ForMatMulA_ARM86.S +++ b/source/backend/cpu/arm/arm64/MNNPackC4Int8ForMatMulA_ARM86.S @@ -55,8 +55,7 @@ mov x9, x6 // blockNum cbnz x12, TILE10_BLOCK_NUM ld1 {v5.4s, v6.4s}, [x2], #32 -ld1 {v7.d}[0], [x2] -sub x2, x2, #32 +ld1 {v7.d}[0], [x2], #8 TILE10_BLOCK_NUM: cbz x9, TILE10_END @@ -315,4 +314,4 @@ ldp d10, d11, [sp, #(16 * 2)] ldp d12, d13, [sp, #(16 * 1)] ldp d14, d15, [sp], #(16 * 4) ret -#endif \ No newline at end of file +#endif diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S index 90ad5673b..01d574fa8 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_16x4_w4_Unit.S @@ -113,10 +113,8 @@ stp x21, x22, [sp, #(16 * 5)] stp x23, x24, [sp, #(16 * 6)] ldr x19, [x15, #56] // fp32 min max -ldr x21, [x15, #64] // blockNum ldr x23, [x15, #80] // extraScale -mul x21, x21, x3 // blockNum * src_depth_quad_perblock -lsl x21, x21, #5 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int4_t) +lsl x21, x3, #5 // src_depth_quad* SRC_UNIT * UNIT * sizeof(int4_t) add x20, x19, #4 Start: diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S index 49b9567cc..4e94c454d 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV82_w4_Unit.S @@ -124,9 +124,7 @@ stp x27, x28, [sp, #(16 * 6)] stp x25, x26, [sp, #(16 * 7)] stp x23, x24, [sp, #(16 * 8)] -ldr x27, [x6, #64] // blockNum -mul x27, x27, x3 // blockNum * src_depth_quad_perblock -lsl x15, x27, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t) +lsl x15, x3, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT * sizeof(int4_t) ldr x25, [x6, #40] // xKernelSum ldr x26, [x6, #48] // weightQuantBias diff --git a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S index 891196103..d6b2c53e2 100644 --- a/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S +++ b/source/backend/cpu/arm/arm64/low_memory/MNNGemmInt8AddBiasScale_ARMV86_w4_Unit.S @@ -116,9 +116,7 @@ stp x27, x28, [sp, #(16 * 8)] ldr x27, [x6, #40] // srcKernelSum ldr x28, [x6, #48] // weightQuanBias -ldr x22, [x6, #64] // blockNum -mul x22, x22, x3 // UP_DIV(ic*ky*kx, SRC_UNIT) = blockNum * src_depth_quad_per_block -lsl x15, x22, #5 // x15 = 
src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 * (sizeof(int4)) = src_depth_quad << 4 +lsl x15, x3, #5 // x15 = src_depth_quad * UNIT * UNIT_SRC = src_depth_quad * 64 * (sizeof(int4)) = src_depth_quad << 4 mov x21, #16 // sizeof(float) * pack ldr x14, [x6, #56] // float32 maxmin ptr diff --git a/source/backend/cpu/compute/CommonOptFunction.cpp b/source/backend/cpu/compute/CommonOptFunction.cpp index df1b70970..f5f1af06a 100644 --- a/source/backend/cpu/compute/CommonOptFunction.cpp +++ b/source/backend/cpu/compute/CommonOptFunction.cpp @@ -3028,203 +3028,6 @@ void MNNSigmoidLowp(float* dst, const float* src, size_t dataSize) { #endif } -void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* parameters) { - int unit = ow / 2; - MNN_ASSERT(cacheLineSize >= 1); - auto biasF = Vec4::load(bias); - auto minF = Vec4(parameters[2]); - auto maxF = Vec4(parameters[3]); - for (int x = 0; x < unit; ++x) { - auto offset = 4 * 4 * x; - int i = 0; - Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); - Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); - Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); - Vec4 m3 = Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3); - - for (i = 1; i < cacheLineSize; ++i) { - m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); - m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); - m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); - m3 = m3 + Vec4::load(weigth + i * 16 + 4 * 3) * Vec4::load(cacheLine[i] + offset + 4 * 3); - } - - auto o0 = m0 + m1 + m2 + biasF; - auto o1 = m1 - m2 + m3 + biasF; - o0 = Vec4::min(maxF, o0); - o1 = Vec4::min(maxF, o1); - o0 = Vec4::max(minF, o0); - o1 = Vec4::max(minF, o1); - Vec4::save(dest + 8 * x + 0 * 4, o0); - Vec4::save(dest + 8 * x + 1 * 4, o1); - } - if (unit * 2 < ow) { - auto offset = 4 * 4 * unit; - int i = 0; - Vec4 m0 = Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); - Vec4 m1 = Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); - Vec4 m2 = Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); - - for (i = 1; i < cacheLineSize; ++i) { - m0 = m0 + Vec4::load(weigth + i * 16 + 4 * 0) * Vec4::load(cacheLine[i] + offset + 4 * 0); - m1 = m1 + Vec4::load(weigth + i * 16 + 4 * 1) * Vec4::load(cacheLine[i] + offset + 4 * 1); - m2 = m2 + Vec4::load(weigth + i * 16 + 4 * 2) * Vec4::load(cacheLine[i] + offset + 4 * 2); - } - auto o0 = m0 + m1 + m2 + biasF; - o0 = Vec4::min(maxF, o0); - o0 = Vec4::max(minF, o0); - Vec4::save(dest + 8 * unit + 0 * 4, o0); - } -} -extern "C" { -void MNNConvDwF23SourceTransUnit(const float *source, float *dest, size_t unit); -} - -void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu) { - for (int x = 0; x < su; ++x) { - auto dstX = dest + 4 * 4 * x; - auto sx = x * 2 - (int)pad; - auto ex = sx + 4; - - auto clampSx = std::max(sx, 0); - auto clampEx = std::min(ex, (int)iw); - - Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = clampSx; i < clampEx; ++i) { - v[i - sx] = Vec4::load(source + 4 * i); - } - auto m0 = v[0] - v[2]; - auto m1 = v[1] + v[2]; - auto m2 = v[2] - v[1]; - auto m3 = v[3] - v[1]; - 
- Vec4::save(dstX + 4 * 0, m0); - Vec4::save(dstX + 4 * 1, m1); - Vec4::save(dstX + 4 * 2, m2); - Vec4::save(dstX + 4 * 3, m3); - } - MNNConvDwF23SourceTransUnit(source + 4 * (su * 2 - pad), dest + 4 * 4 * su, eu - su); - - for (int x = eu; x < unit; ++x) { - auto dstX = dest + 4 * 4 * x; - auto sx = x * 2 - (int)pad; - auto ex = sx + 4; - - auto clampSx = std::max(sx, 0); - auto clampEx = std::min(ex, (int)iw); - - Vec4 v[4] = {0.0f, 0.0f, 0.0f, 0.0f}; - for (int i = clampSx; i < clampEx; ++i) { - v[i - sx] = Vec4::load(source + 4 * i); - } - auto m0 = v[0] - v[2]; - auto m1 = v[1] + v[2]; - auto m2 = v[2] - v[1]; - auto m3 = v[3] - v[1]; - - Vec4::save(dstX + 4 * 0, m0); - Vec4::save(dstX + 4 * 1, m1); - Vec4::save(dstX + 4 * 2, m2); - Vec4::save(dstX + 4 * 3, m3); - } -} - -#ifndef MNN_USE_NEON -void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* parameters) { - int unit = ow / 2; - auto w00 = Vec4::load(weigth + 0 * 16 + 4 * 0); - auto w01 = Vec4::load(weigth + 0 * 16 + 4 * 1); - auto w02 = Vec4::load(weigth + 0 * 16 + 4 * 2); - auto w03 = Vec4::load(weigth + 0 * 16 + 4 * 3); - auto w10 = Vec4::load(weigth + 1 * 16 + 4 * 0); - auto w11 = Vec4::load(weigth + 1 * 16 + 4 * 1); - auto w12 = Vec4::load(weigth + 1 * 16 + 4 * 2); - auto w13 = Vec4::load(weigth + 1 * 16 + 4 * 3); - auto w20 = Vec4::load(weigth + 2 * 16 + 4 * 0); - auto w21 = Vec4::load(weigth + 2 * 16 + 4 * 1); - auto w22 = Vec4::load(weigth + 2 * 16 + 4 * 2); - auto w23 = Vec4::load(weigth + 2 * 16 + 4 * 3); - auto biasF = Vec4::load(bias); - auto minF = Vec4(parameters[2]); - auto maxF = Vec4(parameters[3]); - for (int x = 0; x < unit; ++x) { - auto offset = 4 * 4 * x; - int i = 0; - Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0); - Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1); - Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2); - Vec4 m3 = w03 * Vec4::load(cacheLine[0] + offset + 4 * 3); - - m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0); - m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1); - m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2); - m3 = m3 + w13 * Vec4::load(cacheLine[1] + offset + 4 * 3); - - m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0); - m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1); - m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2); - m3 = m3 + w23 * Vec4::load(cacheLine[2] + offset + 4 * 3); - - auto o0 = m0 + m1 + m2 + biasF; - auto o1 = m1 - m2 + m3 + biasF; - o0 = Vec4::min(maxF, o0); - o1 = Vec4::min(maxF, o1); - o0 = Vec4::max(minF, o0); - o1 = Vec4::max(minF, o1); - Vec4::save(dest + 8 * x + 0 * 4, o0); - Vec4::save(dest + 8 * x + 1 * 4, o1); - } - if (unit * 2 < ow) { - auto offset = 4 * 4 * unit; - Vec4 m0 = w00 * Vec4::load(cacheLine[0] + offset + 4 * 0); - Vec4 m1 = w01 * Vec4::load(cacheLine[0] + offset + 4 * 1); - Vec4 m2 = w02 * Vec4::load(cacheLine[0] + offset + 4 * 2); - - m0 = m0 + w10 * Vec4::load(cacheLine[1] + offset + 4 * 0); - m1 = m1 + w11 * Vec4::load(cacheLine[1] + offset + 4 * 1); - m2 = m2 + w12 * Vec4::load(cacheLine[1] + offset + 4 * 2); - - m0 = m0 + w20 * Vec4::load(cacheLine[2] + offset + 4 * 0); - m1 = m1 + w21 * Vec4::load(cacheLine[2] + offset + 4 * 1); - m2 = m2 + w22 * Vec4::load(cacheLine[2] + offset + 4 * 2); - auto o0 = m0 + m1 + m2 + biasF; - o0 = Vec4::min(maxF, o0); - o0 = Vec4::max(minF, o0); - Vec4::save(dest + 8 * unit + 0 * 4, o0); - } -} -void MNNConvDwF23SourceTransUnit(const float *source, 
float *dest, size_t unit) { - if (unit <= 0) { - return; - } - Vec4 v0 = Vec4::load(source + 4 * 0); - Vec4 v1 = Vec4::load(source + 4 * 1); - Vec4 v2; - Vec4 v3; - source += 8; - - for (int x = 0; x < unit; ++x) { - v2 = Vec4::load(source + 0 * 4); - v3 = Vec4::load(source + 1 * 4); - auto m0 = v0 - v2; - auto m1 = v1 + v2; - auto m2 = v2 - v1; - auto m3 = v3 - v1; - - Vec4::save(dest + 4 * 0, m0); - Vec4::save(dest + 4 * 1, m1); - Vec4::save(dest + 4 * 2, m2); - Vec4::save(dest + 4 * 3, m3); - - source += 8; - dest += 16; - - v0 = v2; - v1 = v3; - } -} -#endif - static void _MNNAdjustOptimalSparseKernel(int& sparseBlockOC, MNN::CoreFunctions::MNNPackedSparseMatMul& packedSparseMatMul) { if(sparseBlockOC == 4) { packedSparseMatMul = MNNPackedSparseMatMulEpx4; @@ -3365,10 +3168,6 @@ void MNNCoreFunctionInit() { gCoreFunction->MNNAxByClampBroadcastUnit = MNNAxByClampBroadcastUnit; gCoreFunction->MNNConvRunForLineDepthwise = MNNConvRunForLineDepthwise; - gCoreFunction->MNNConvRunForUnitDepthWise = MNNConvRunForUnitDepthWise; - gCoreFunction->MNNSourceTransformCommonF23 = MNNSourceTransformCommonF23; - gCoreFunction->MNNConvDwF23MulTransUnit = MNNConvDwF23MulTransUnit; - gCoreFunction->MNNMultiAndDestTransformCommon23 = MNNMultiAndDestTransformCommon23; gCoreFunction->MNNMatrixAdd = MNNMatrixAdd; gCoreFunction->MNNMatrixSub = MNNMatrixSub; gCoreFunction->MNNStrassenMergeCFunction = MNNStrassenMergeCFunction; @@ -3390,6 +3189,9 @@ void MNNCoreFunctionInit() { gCoreFunction->chooseWinoDestUnrollTransform = WinogradFunction::chooseWinoDestUnrollTransform; gCoreFunction->MNNDeconvRunForLineDepthwise = MNNDeconvRunForLineDepthwise; gCoreFunction->MNNDeconvRunForUnitDepthWise = MNNDeconvRunForUnitDepthWise; +#ifdef MNN_USE_NEON + gCoreFunction->MNNDepthwiseConvFastKernel = MNNDepthwiseConvFastKernel; +#endif gCoreFunction->MNNSelectBinaryFunctionForFloat = CPUBinary::selectForFloat; gCoreFunction->MNNSelectUnaryFunctionForFloat = CPUUnary::selectForFloat; gCoreFunction->MNNSelectUnaryFunctionForInt8 = CPUUnary::selectForInt8; @@ -3514,4 +3316,4 @@ void MNNPackInt8C2Origin(float* dst, const float* src, size_t area, size_t depth areaOffset, }; MNNPackInt8C2(dst, src, area, depth, offset); -} \ No newline at end of file +} diff --git a/source/backend/cpu/compute/CommonOptFunction.h b/source/backend/cpu/compute/CommonOptFunction.h index 4af1a81a8..32ebd0c54 100644 --- a/source/backend/cpu/compute/CommonOptFunction.h +++ b/source/backend/cpu/compute/CommonOptFunction.h @@ -170,9 +170,6 @@ struct MatMulParam { void MNNComputeMatMulForE_1(const float* A, const float* B, float* C, const float* biasPtr, const MatMulParam* param, size_t tId); void MNNCopyC4Int16WithStride(const float* sourceF, float* destF, size_t srcStride, size_t dstStride, size_t count); -void MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu); -void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* postParameter); -void MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow); void MNNInt8ToInt16(int16_t* dest, const int8_t* source, size_t count); struct SumByAxisParams { @@ -267,15 +264,10 @@ struct CoreFunctions { void(*MNNUnpackCUnitTranspose)(float* dst, const float* src, size_t area, size_t depth, int* areaOffset); // NC4HW4's compute function - void(*MNNConvRunForUnitDepthWise)(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - 
size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); void(*MNNConvRunForLineDepthwise)(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep); + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters); void(*MNNAxByClampBroadcastUnit)(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters); - void(*MNNMultiAndDestTransformCommon23)(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* post); - void(*MNNSourceTransformCommonF23)(const float *source, float *dest, int unit, int iw, int pad, int su, int eu); - void(*MNNConvDwF23MulTransUnit)(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* post); void(*MNNMatrixAdd)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, size_t bStride, size_t height); void(*MNNMatrixSub)(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, @@ -309,6 +301,9 @@ struct CoreFunctions { size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); void(*MNNDeconvRunForLineDepthwise)(const float* dst, float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step); + void(*MNNDepthwiseConvFastKernel)(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, + size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) = nullptr; void(*MNNReluWithSlopeChannel)(float* dst, const float* src, const float* slope, size_t sizeQuad, size_t depthQuad); void(*MNNPoolingAvg)(const void* channelInput, int inputWidth, int inputHeight, void *channelOutput, int outputWidth, int outputHeight, int kernelWidth, int kernelHeight, int strideWidth, diff --git a/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp b/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp index ae2c1a8ff..bcba4eedb 100644 --- a/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp +++ b/source/backend/cpu/compute/ConvInt8TiledExecutor.cpp @@ -44,10 +44,14 @@ ErrorCode ConvInt8TiledExecutor::onResize(const std::vector& inputs, co return NO_ERROR; } -void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack) { +void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack, int blockNum) { auto weightDst = weight->host(); memset(weightDst, 0, weight->size()); - if (SRC_UNIT > pack) { + int kernelCountUnit = weight->shape()[1]; + int blockL = kernelCountUnit / blockNum; + int strideOutside = ROUND_UP(oc, UNIT) * SRC_UNIT * blockL; + int strideInside = weight->stride(0) / blockNum; + if (SRC_UNIT > pack) { // shape = {blockNum, UP_DIV(oc, UNIT), UP_DIV(UP_DIV(ic, pack) * kernelCount, SRC_UNIT / pack) / blockNum, UNIT, SRC_UNIT}; auto icDivU = UP_DIV(ic, pack); for (int k = 0; k < kernelCount; ++k) { const auto srcK = weightSrc + k; @@ -58,31 +62,37 @@ void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightS const int ySubOutSide = yIndex / (SRC_UNIT / pack); const int ySubInSide = yIndex % (SRC_UNIT / 
pack); - auto dstY = weightDst + ySubOutSide * weight->stride(1) + ySubInSide * pack + yInSide; + int blockId = ySubOutSide / blockL; + int blockInsideId = ySubOutSide % blockL; + + auto dstY = weightDst + blockId * strideOutside + blockInsideId * weight->stride(1) + ySubInSide * pack + yInSide; const auto srcY = srcK + y * kernelCount; for (int x = 0; x < oc; ++x) { const int xOutSide = x / UNIT; const int xInSide = x % UNIT; - const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT; + const int dstIndex = xOutSide * strideInside + xInSide * SRC_UNIT; const int srcIndex = x * kernelCount * ic; dstY[dstIndex] = srcY[srcIndex]; } } } - } else { + } else { // shape = {blockNum, UP_DIV(oc, UNIT), UP_DIV(ic, SRC_UNIT) * kernelCount / blockNum, UNIT, SRC_UNIT}; for (int k = 0; k < kernelCount; ++k) { auto icDivU = UP_DIV(ic, SRC_UNIT); const auto srcK = weightSrc + k; for (int y = 0; y < ic; ++y) { const int yOutSide = y / SRC_UNIT; const int yInSide = y % SRC_UNIT; + + int blockId = (yOutSide + k * icDivU) / blockL; + int blockInsideId = (yOutSide + k * icDivU) % blockL; - auto dstY = weightDst + (yOutSide + k * icDivU) * weight->stride(1) + yInSide; + auto dstY = weightDst + blockId * strideOutside + blockInsideId * weight->stride(1) + yInSide; const auto srcY = srcK + y * kernelCount; for (int x = 0; x < oc; ++x) { const int xOutSide = x / UNIT; const int xInSide = x % UNIT; - const int dstIndex = xOutSide * weight->stride(0) + xInSide * SRC_UNIT; + const int dstIndex = xOutSide * strideInside + xInSide * SRC_UNIT; const int srcIndex = x * kernelCount * ic; dstY[dstIndex] = srcY[srcIndex]; } @@ -93,7 +103,8 @@ void ConvInt8TiledExecutor::reorderWeight(Tensor* weight, const uint8_t* weightS static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common, const std::shared_ptr& weightOrigin, - std::shared_ptr& weight) { + std::shared_ptr& weight, int blockNum) { + MNN_ASSERT(blockNum > 0); auto core = static_cast(bn)->int8Functions(); auto gcore = static_cast(bn)->functions(); int UNIT, SRC_UNIT, DST_XUNIT; @@ -119,11 +130,11 @@ static bool _reorderWeightInside(Backend* bn, const Convolution2DCommon* common, MNN_ERROR("Memory not enough"); return false; } - ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host(), SRC_UNIT, UNIT, ic, oc, kernelCount, pack); + ConvInt8TiledExecutor::reorderWeight(weight.get(), weightOrigin->host(), SRC_UNIT, UNIT, ic, oc, kernelCount, pack, blockNum); return true; } -static void GetResourceInt8(std::shared_ptr resource, std::shared_ptr quantCommon, const Convolution2D* conv2d, Backend* backend) { +static void GetResourceInt8(std::shared_ptr resource, std::shared_ptr quantCommon, const Convolution2D* conv2d, Backend* backend, int32_t* blocknumPtr) { // common parameters int outputCount = conv2d->common()->outputCount(); auto core = static_cast(backend)->functions(); @@ -135,6 +146,7 @@ static void GetResourceInt8(std::shared_ptr resour dequantCnt /= 2; } int blockNum = dequantCnt / outputCount; + blocknumPtr[0] = blockNum; int scaleSize = blockNum * ocUp4; // pack size. 
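
The new blockNum parameter changes the quantized weight layout from one contiguous run per output tile to a block-major arrangement, {blockNum, UP_DIV(oc, UNIT), kernelCountUnit / blockNum, UNIT, SRC_UNIT}, with strideOutside covering one whole block and strideInside one output tile inside a block. This is also why the GEMM kernels earlier in this diff dropped the multiply-by-blockNum from their weight stride: the caller now advances the weight pointer between blocks itself. The helper below spells out the index arithmetic of that assumed layout; it is illustrative, not MNN code.

    #include <cstddef>

    // Offset of element (blockId, hIdx, lIdxInBlock, u, s) in the block-major
    // weight layout {blockNum, hU, blockL, UNIT, SRC_UNIT}, where
    // hU = UP_DIV(oc, UNIT) and blockL = kernelCountUnit / blockNum.
    static size_t WeightOffset(size_t blockId, size_t hIdx, size_t lIdxInBlock,
                               size_t u, size_t s,
                               size_t hU, size_t blockL, size_t UNIT, size_t SRC_UNIT) {
        const size_t strideOutside = hU * blockL * UNIT * SRC_UNIT; // one whole block
        const size_t strideInside  = blockL * UNIT * SRC_UNIT;      // one oc tile in a block
        return blockId * strideOutside + hIdx * strideInside +
               lIdxInBlock * UNIT * SRC_UNIT + u * SRC_UNIT + s;
    }
    // A GEMM kernel therefore only needs the per-block depth (blockL) for its own
    // stride; stepping from block k to block k+1 is a pointer bump of strideOutside,
    // which matches the "k * blockL * weight_step_Y * UP_DIV(output->channel(), UNIT__)"
    // offset used at the call site later in this diff.
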
int blockSize = LSize / blockNum; int originOffset = 0; @@ -244,7 +256,9 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O auto gcore = static_cast(backend)->functions(); mResourceInt8.reset(new CPUConvolution::ResourceInt8); mResourceInt8->mDynamicQuant = true; - GetResourceInt8(mResourceInt8, quanCommon, convOp, backend); + int blockNum = 1; + GetResourceInt8(mResourceInt8, quanCommon, convOp, backend, &blockNum); + mBlockNum = blockNum; // dynamic quant int UNIT, SRC_UNIT, DST_XUNIT; core->MNNGetGemmUnit(&UNIT, &SRC_UNIT, &DST_XUNIT); @@ -285,10 +299,15 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O // Pack two int4-weight to one int8-weight. int cnt = lP * hP / 4; int L = lU * lP; + int blockL = lU / blockNum; + int stride0 = (lP * hP) * hU * blockL; + int stride1 = (lP * hP) * blockL; for (int i = 0; i < hU; ++i) { for (int j = 0; j < lU; ++j) { + int blockId = j / blockL; + int blockkInsideId = j % blockL; for (int k = 0; k < cnt; ++k) { - int dstIndx0 = (i * lU * lP * hP + j * lP * hP) / 2 + (2 * k); + int dstIndx0 = (blockId * stride0 + i * stride1 + blockkInsideId * lP * hP) / 2 + (2 * k); int hpId0 = (2 * k + 1) / lP; int lpId0 = (2 * k) % lP; @@ -322,7 +341,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O tmpWeight[2 * i + 1] = s1; } std::shared_ptr srcWeight(Tensor::create({weightLength * 2}, (void*)tmpWeight.data())); - mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8); + mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8, blockNum); if(!mValid) { return; } @@ -349,7 +368,7 @@ DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const O mResourceInt8->mWeightInt8 = weightLow; } else { std::shared_ptr srcWeight(Tensor::create({weightLength}, (void*)quanCommon->weight.get())); - mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8); + mValid = _reorderWeightInside(backend, convOp->common(), srcWeight, mResourceInt8->mWeightInt8, blockNum); if(!mValid) { return; } @@ -429,7 +448,7 @@ static void _computeAlphaScale(Backend* backend, const Convolution2D* conv2d, st DenseConvInt8TiledExecutor::DenseConvInt8TiledExecutor(Backend* backend, const Op* op, std::shared_ptr res) : ConvInt8TiledExecutor(backend, op, res) { std::shared_ptr weightOrigin = mResourceInt8->mWeightInt8; auto convOp = op->main_as_Convolution2D(); - mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8); + mValid = _reorderWeightInside(backend, convOp->common(), weightOrigin, mResourceInt8->mWeightInt8, mBlockNum); if(!mValid) { return; } @@ -559,7 +578,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector& input mDivides.resize(threads+1); mDivides[0] = 0; - static_cast(backend()->getRuntime())->computeDivideSizes(totalWork, mDivides.data() + 1); + static_cast(backend())->computeDivideSizes(totalWork, mDivides.data() + 1); for (int i = 0; i < mDivides.size(); ++i) { mDivides[i] *= part; } @@ -572,7 +591,7 @@ ErrorCode DenseConvInt8TiledExecutor::onResize(const std::vector& input mThreadNums = ALIMIN(threads, mTileCount); mDivides.resize(threads+1); mDivides[0] = 0; - static_cast(backend()->getRuntime())->computeDivideSizes(mTileCount, mDivides.data() + 1); + static_cast(backend())->computeDivideSizes(mTileCount, mDivides.data() + 1); } int ocUp4 = ROUND_UP(outC, gcore->pack); // int alphaSize = 
mResource->mDequantize.mScaleBias->size() / (sizeof(float) * 2); @@ -663,6 +682,9 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu auto inputDataPtr = input->host(); auto im2colPtr = mTempIm2ColBuffer->host(); + if (SRC_UNIT > PackUnit) { + memset(im2colPtr, 0, mTempIm2ColBuffer->size()); + } const auto weightDataPtr = mResourceInt8->mWeightInt8->host(); auto srcKernelSumPtr = mTempSrcSum.data(); auto weightDequantBias = mResourceInt8->mOriginScale->host() + alphaSize * 4; @@ -736,7 +758,6 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu dequantscale = range / 255.0f; zeropoint = roundf(-minVal * 255.f / range) - 128.0f; } - std::vectorqsVec(PackUnit, quantscale); auto sizeDiv = UP_DIV(inputsize, PackUnit); int inputPlane = input->batch() * mIm2ColParamter.iw * mIm2ColParamter.ih; if (gcore->bytes == 2 && gcore->pack == 8 && inputPlane > 1) { // C8->C4 @@ -867,7 +888,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu const auto biasFloatTid = reinterpret_cast(biasPtr + ocIndex * 4); const auto scaleFloatTid = reinterpret_cast(scalePtr + ocIndex * 4); const auto weightDequanBiasTid = reinterpret_cast(weightDequantBias + ocIndex * 4); - const auto weightPtrTid = weightDataPtr + static_cast(ocIndex * kernelCountUnitDouble * SRC_UNIT * weightBytes); + const auto weightPtrTid = weightDataPtr + static_cast(ocIndex * blockL * SRC_UNIT * weightBytes); if (mBlockNum == 1) { quanParam.biasFloat = biasFloatTid; quanParam.scale = scaleFloatTid; @@ -941,7 +962,7 @@ ErrorCode DenseConvInt8TiledExecutor::onExecute(const std::vector& inpu quanParam.weightQuanBias = weightDequanBiasTid + k * ocUp4; quanParam.scale = (float*)(scaleFloatTid + k * ocUp4); - mGemmKernel(outputInTilePtr, colAddrTemp + k * blockL * src_step_Y, weightPtrTid + k * blockL * weight_step_Y, blockL, dstZStep * dstBytes, ocDivThread, &quanParam, step); + mGemmKernel(outputInTilePtr, colAddrTemp + k * blockL * src_step_Y, weightPtrTid + k * blockL * weight_step_Y * UP_DIV(output->channel(), UNIT__), blockL, dstZStep * dstBytes, ocDivThread, &quanParam, step); } ptrX += (step * mBlockNum); realDstCount-=step; diff --git a/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp b/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp index bebeaa5c4..6c46b9161 100644 --- a/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp +++ b/source/backend/cpu/compute/ConvInt8TiledExecutor.hpp @@ -24,7 +24,7 @@ class ConvInt8TiledExecutor : public CPUConvolution { virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; virtual void getPackParameter(int* Unit, int* SrcUnit, int* DestUnit, const CoreInt8Functions* core) = 0; - static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack); + static void reorderWeight(Tensor* weight, const uint8_t* weightSrc, int SRC_UNIT, int UNIT, int ic, int oc, int kernelCount, int pack, int blockNum = 1); protected: ConvolutionCommon::Im2ColParameter mIm2ColParamter; @@ -74,7 +74,7 @@ class DenseConvInt8TiledExecutor : public ConvInt8TiledExecutor { std::vector mDivides; int mThreadNums; - int mBlockNum; + int mBlockNum = 1; int mOcPerThread; bool mSplitByOc; bool mUseBatchQuan; diff --git a/source/backend/cpu/compute/ConvOpt.cpp b/source/backend/cpu/compute/ConvOpt.cpp index 5f7545c2c..3f209e059 100644 --- a/source/backend/cpu/compute/ConvOpt.cpp +++ 
b/source/backend/cpu/compute/ConvOpt.cpp @@ -39,14 +39,17 @@ void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep) { + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) { int dx, fx, fy; + auto biasValue = Vec4::load(bias); + auto minF = Vec4(parameters[0]); + auto maxF = Vec4(parameters[1]); for (int y = 0; y < height; ++y) { auto srcY = src + y * srcHStep; auto dstY = dst + y * dstHStep; for (dx = 0; dx < width; ++dx) { float* dst_x = dstY + dx * 4; - Vec4 dstValue(0.0f); + auto dstValue = biasValue; const float* src_z = srcY + src_w_setup * dx; const float* weight_z = weight; for (fy = 0; fy < fh; ++fy) { @@ -58,29 +61,13 @@ void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weigh dstValue = dstValue + Vec4::load(src_x) * Vec4::load(weight_x); } } + dstValue = Vec4::min(dstValue, maxF); + dstValue = Vec4::max(dstValue, minF); Vec4::save(dst_x, dstValue); } } } -void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { - int fx, fy; - Vec4 dstValue(0.0f); - const float* src_z = src; - const float* weight_z = weight; - for (fy = 0; fy < fh; ++fy) { - const float* src_y = src_z + fy * dilateY_step; - const float* weight_y = weight_z + fy * weight_y_step; - for (fx = 0; fx < fw; ++fx) { - const float* weight_x = weight_y + 4 * fx; - const float* src_x = src_y + fx * dilateX_step; - dstValue = dstValue + Vec4::load(src_x) * Vec4::load(weight_x); - } - } - Vec4::save(dst, dstValue); -} - void MNNConvRunForUnitint8_t(float* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t src_depth_step, size_t fw, size_t fh, size_t weight_y_step, size_t weight_z_step, size_t dilateX_step, size_t dilateY_step, float* alpha) { diff --git a/source/backend/cpu/compute/ConvOpt.h b/source/backend/cpu/compute/ConvOpt.h index 3d727c98e..bdb96666f 100644 --- a/source/backend/cpu/compute/ConvOpt.h +++ b/source/backend/cpu/compute/ConvOpt.h @@ -16,17 +16,19 @@ extern "C" { #endif -void MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); void MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep); + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters); void MNNDeconvRunForUnitDepthWise(const float* dst, float* src, const float* weight, size_t fw, size_t fh, size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); void MNNDeconvRunForLineDepthwise(const float* dst, float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step); +void MNNDepthwiseConvFastKernel(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, + size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters); + void MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, 
size_t aStride, size_t bStride, size_t height); void MNNMatrixSub(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, diff --git a/source/backend/cpu/compute/Convolution1x1Strassen.cpp b/source/backend/cpu/compute/Convolution1x1Strassen.cpp index 3ed5c0c6e..455e9cb6c 100644 --- a/source/backend/cpu/compute/Convolution1x1Strassen.cpp +++ b/source/backend/cpu/compute/Convolution1x1Strassen.cpp @@ -133,11 +133,10 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, } #endif mWeightBytes = static_cast(dequantBits) / 8.0f; - auto rt = static_cast(backend()->getRuntime()); if (matrixSizeE > CONVOLUTION_TILED_NUMBER * 8 * numberThread && matrixSizeE > ocC4) { std::vector divides(numberThread+1); divides[0] = 0; - rt->computeDivideSizes(matrixSizeE, divides.data()+1); + static_cast(backend())->computeDivideSizes(matrixSizeE, divides.data()+1); mUnits.resize(numberThread); for (int i = 0; i < numberThread; ++i) { int planeStart = divides[i]; @@ -177,7 +176,7 @@ ErrorCode Convolution1x1Strassen::onResize(const std::vector &inputs, auto ocDiv = UP_DIV(ocC4, hDiv); std::vector divides(numberThread+1); divides[0] = 0; - rt->computeDivideSizes(ocDiv, divides.data()+1); + static_cast(backend())->computeDivideSizes(ocDiv, divides.data()+1); mUnits.resize(numberThread); for (int i = 0; i < numberThread; ++i) { int ocStart = divides[i] * hDiv; diff --git a/source/backend/cpu/compute/ConvolutionDepthwise3x3.cpp b/source/backend/cpu/compute/ConvolutionDepthwise3x3.cpp deleted file mode 100644 index 46fc68048..000000000 --- a/source/backend/cpu/compute/ConvolutionDepthwise3x3.cpp +++ /dev/null @@ -1,221 +0,0 @@ -// -// ConvolutionDepthwise3x3.cpp -// MNN -// -// Created by MNN on 2019/4/3. -// Copyright © 2018, Alibaba Group Holding Limited -// - -#include "backend/cpu/compute/ConvolutionDepthwise3x3.hpp" -#include "backend/cpu/CPUBackend.hpp" -#include "CommonOptFunction.h" -#include "core/Concurrency.h" -#include "core/Macro.h" - -namespace MNN { -ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(std::shared_ptr resource, const Convolution2DCommon* common, Backend* b) : CPUConvolution(common, b) { - mResource = resource; -} - -ConvolutionDepthwise3x3::ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b, - const float *originWeight, size_t originWeightSize, const float *bias, - size_t biasSize) - : CPUConvolution(common, b) { - MNN_ASSERT(3 == common->kernelX() && 3 == common->kernelY()); - MNN_ASSERT(1 == common->strideX() && 1 == common->strideY()); - MNN_ASSERT(1 == common->dilateX() && 1 == common->dilateY()); - mResource.reset(new Resource); - mResource->backend = b; - auto core = static_cast(b)->functions(); - auto pack = core->pack; - auto bytes = core->bytes; - auto success = mResource->copyBiasAlign(bias, biasSize); - if (!success) { - mValid = false; - return; - } - auto channel = common->outputCount(); - auto channelC4 = UP_DIV(channel, pack); - auto unitSize = channelC4 * pack * 3 * 4; - mResource->mWeight.reset(Tensor::createDevice({unitSize * bytes})); - mValid = backend()->onAcquireBuffer(mResource->mWeight.get(), Backend::STATIC); - if (!mValid) { - return; - } - AutoStorage tempWeightStorge; - auto weightHost = mResource->mWeight->host(); - if (bytes < 4) { - // Lowp need extra float storage for transform - tempWeightStorge.reset(unitSize); - if (nullptr == tempWeightStorge.get()) { - mValid = false; - return; - } - weightHost = tempWeightStorge.get(); - } - ::memset(weightHost, 0, unitSize * sizeof(float)); - /* 
1D-Winograd F(2,3) and tiling */ - for (int c = 0; c < channel; ++c) { - auto cIndex = c / pack; - auto cRemain = c % pack; - auto weightDstZ = weightHost + cIndex * pack * 4 * 3 + cRemain; - auto weightSrcZ = originWeight + c * 9; - for (int y = 0; y < 3; ++y) { - auto k0 = weightSrcZ[3 * y + 0]; - auto k1 = weightSrcZ[3 * y + 1]; - auto k2 = weightSrcZ[3 * y + 2]; - - auto m0 = k0; - auto m1 = 0.5f * (k0 + k1 + k2); - auto m2 = 0.5f * (k0 - k1 + k2); - auto m3 = k2; - - weightDstZ[(y * 4 + 0) * pack] = m0; - weightDstZ[(y * 4 + 1) * pack] = m1; - weightDstZ[(y * 4 + 2) * pack] = m2; - weightDstZ[(y * 4 + 3) * pack] = m3; - } - } - if (bytes < 4) { - core->MNNFp32ToLowp(weightHost, mResource->mWeight->host(), unitSize); - } -} - -ConvolutionDepthwise3x3::~ConvolutionDepthwise3x3() { - // Do nothing -} - -bool ConvolutionDepthwise3x3::onClone(Backend* bn, const Op* op, Execution** dst) { - if (nullptr == dst) { - return true; - } - auto dstExe = new ConvolutionDepthwise3x3(mResource, op->main_as_Convolution2D()->common(), bn); - *dst = dstExe; - return true; -} - -ErrorCode ConvolutionDepthwise3x3::onResize(const std::vector &inputs, const std::vector &outputs) { - CPUConvolution::onResize(inputs, outputs); - const int numberThread = ((CPUBackend *)backend())->threadNumber(); - auto output = outputs[0]; - auto owUnit = UP_DIV(output->width(), 2); - auto core = static_cast(backend())->functions(); - // 3 cacheline - mCacheLine.reset(Tensor::createDevice({numberThread, 3 * 4 * owUnit * core->pack * core->bytes})); - auto valid = backend()->onAcquireBuffer(mCacheLine.get(), Backend::DYNAMIC); - if (!valid) { - return OUT_OF_MEMORY; - } - backend()->onReleaseBuffer(mCacheLine.get(), Backend::DYNAMIC); - auto iw = inputs[0]->width(); - mSourceStartX = UP_DIV(mPadX, 2); - mSourceEndX = std::max((iw + mPadX - 4) / 2, mSourceStartX); - mPostParameters = getPostParameters(); - // auto rate = (float)(mSourceEndX-mSourceStartX) / (float)owUnit; - // FUNC_PRINT_ALL(rate, f); - - int channelC4 = UP_DIV(inputs[0]->channel(), core->pack); - int batch = inputs[0]->batch(); - auto total = channelC4 * batch; - - mDivides.resize(numberThread+1); - mDivides[0] = 0; - static_cast(backend()->getRuntime())->computeDivideSizes(total, mDivides.data() + 1); - - return NO_ERROR; -} - -ErrorCode ConvolutionDepthwise3x3::onExecute(const std::vector &inputs, - const std::vector &outputs) { - auto input = inputs[0]; - auto output = outputs[0]; - auto core = static_cast(backend())->functions(); - - int channelC4 = UP_DIV(input->channel(), core->pack); - int initSize = std::min(input->height(), 2); - int batch = input->batch(); - int ow = output->width(); - int oh = output->height(); - int owUnit = UP_DIV(ow, 2); - - auto iw = input->width(); - auto ih = input->height(); - auto kernelOrigin = mResource->mWeight->host(); - - /*oy-mPadY>=0*/ - int middelYStart = mPadY; - - /*oy-mPadY+3-1 < ih*/ - int middelYEnd = std::max(ih - 2 + mPadY, middelYStart); - - int threadNumber = ((CPUBackend *)backend())->threadNumber(); - auto maxKernelH = std::min(mPadY + ih, 3); - auto inputOrigin = input->host(); - auto outputOrigin = output->host(); - MNN_CONCURRENCY_BEGIN(tId, threadNumber) { - auto cacheLineStart = mCacheLine->host() + tId * mCacheLine->stride(0); - for (int index = mDivides[tId]; index < mDivides[tId+1]; ++index) { - int z = index / batch; - auto biasPtr = (const float*)(mResource->mBias->host() + core->bytes * core->pack * z); - auto inputZ = inputOrigin + core->pack * index * iw * ih * core->bytes; - auto outputZ = 
outputOrigin + core->pack * index * ow * oh * core->bytes; - auto kernelZ = kernelOrigin + z * core->pack * core->bytes * 4 * 3; - auto cacheLine0 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 0; - auto cacheLine1 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 1; - auto cacheLine2 = cacheLineStart + 4 * core->pack * core->bytes * owUnit * 2; - - float *cacheLine[3] = {(float*)cacheLine0, (float*)cacheLine1, (float*)cacheLine2}; - - // Init - for (int i = 0; i < initSize; ++i) { - core->MNNSourceTransformCommonF23((const float*)(inputZ + i * iw * core->bytes * core->pack), cacheLine[i], owUnit, iw, mPadX, mSourceStartX, - mSourceEndX); - } - - // Compute Top - for (int y = 0; y < middelYStart; ++y) { - auto outputY = outputZ + y * core->bytes * core->pack * ow; - int cacheLineSize = y - mPadY + maxKernelH; - if (cacheLineSize <= 0) { - ::memset(outputY, 0, core->bytes * ow * core->pack); - core->MNNAxByClampBroadcastUnit((float*)outputY, (float*)outputY, biasPtr, ow, 0, 0, 1, mPostParameters.data()); - continue; - } - auto kernelPtr = kernelZ + (maxKernelH - cacheLineSize) * 4 * core->pack * core->bytes; - cacheLineSize = std::min(cacheLineSize, ih); - core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelPtr, (float*)outputY, cacheLineSize, ow, biasPtr, mPostParameters.data()); - } - - // Compute Mid - for (int y = middelYStart; y < middelYEnd; ++y) { - auto outputY = outputZ + y * core->bytes * core->pack * ow; - auto iy = y - mPadY + 2; - core->MNNSourceTransformCommonF23((float*)(inputZ + core->bytes * core->pack * iy * iw), cacheLine[2], owUnit, iw, mPadX, mSourceStartX, - mSourceEndX); - // FUNC_PRINT(ow); - core->MNNConvDwF23MulTransUnit(cacheLine, (float*)kernelZ, (float*)outputY, ow, biasPtr, mPostParameters.data()); - - auto temp = cacheLine[0]; - cacheLine[0] = cacheLine[1]; - cacheLine[1] = cacheLine[2]; - cacheLine[2] = temp; - } - - // Compute Bottom - for (int y = middelYEnd; y < oh; ++y) { - auto outputY = outputZ + y * core->bytes * core->pack * ow; - int cacheLineSize = (ih - y + mPadY); - if (cacheLineSize <= 0) { - ::memset(outputY, 0, ow * core->bytes * core->pack); - core->MNNAxByClampBroadcastUnit((float*)outputY, (float*)outputY, biasPtr, ow, 0, 0, 1, mPostParameters.data()); - continue; - } - core->MNNMultiAndDestTransformCommon23(cacheLine, (float*)kernelZ, (float*)outputY, cacheLineSize, ow, biasPtr, mPostParameters.data()); - cacheLine[0] = cacheLine[1]; - cacheLine[1] = cacheLine[2]; - } - } - } MNN_CONCURRENCY_END(); - return NO_ERROR; -} -} // namespace MNN diff --git a/source/backend/cpu/compute/ConvolutionDepthwise3x3.hpp b/source/backend/cpu/compute/ConvolutionDepthwise3x3.hpp deleted file mode 100644 index 4ff4d4ef0..000000000 --- a/source/backend/cpu/compute/ConvolutionDepthwise3x3.hpp +++ /dev/null @@ -1,37 +0,0 @@ -// -// ConvolutionDepthwise3x3.hpp -// MNN -// -// Created by MNN on 2019/4/3. 
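The ConvolutionDepthwise3x3 implementation removed above evaluated the 3x3 depthwise kernel with a 1D Winograd F(2,3) scheme: each 3-tap kernel row is expanded into 4 transformed taps using G = {{1,0,0},{1/2,1/2,1/2},{1/2,-1/2,1/2},{0,0,1}}, which is exactly the m0..m3 computation in the deleted constructor. A minimal standalone sketch of that row transform (hypothetical helper name, plain floats instead of the packed channel layout):

#include <array>

// F(2,3) weight transform for one 3-tap kernel row: m = G * k, with
// G = { {1, 0, 0}, {0.5, 0.5, 0.5}, {0.5, -0.5, 0.5}, {0, 0, 1} }.
// Reproduces the m0..m3 values the removed constructor wrote into the
// packed weight tensor.
static std::array<float, 4> WinogradF23WeightRow(float k0, float k1, float k2) {
    return { k0,
             0.5f * (k0 + k1 + k2),
             0.5f * (k0 - k1 + k2),
             k2 };
}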
-// Copyright © 2018, Alibaba Group Holding Limited -// - -#ifndef ConvolutionDepthwise3x3_hpp -#define ConvolutionDepthwise3x3_hpp - -#include "backend/cpu/CPUConvolution.hpp" - -namespace MNN { -class ConvolutionDepthwise3x3 : public CPUConvolution { -public: - ConvolutionDepthwise3x3(const Convolution2DCommon *common, Backend *b, const float *originWeight, - size_t originWeightSize, const float *bias, size_t biasSize); - virtual ~ConvolutionDepthwise3x3(); - - virtual ErrorCode onResize(const std::vector &inputs, const std::vector &outputs) override; - virtual ErrorCode onExecute(const std::vector &inputs, const std::vector &outputs) override; - virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override; -private: - ConvolutionDepthwise3x3(std::shared_ptr resource, const Convolution2DCommon* common, Backend* b); - - std::shared_ptr mResource; - - std::unique_ptr mCacheLine; - int mSourceStartX = 0; - int mSourceEndX = 0; - std::vector mPostParameters; - std::vector mDivides; -}; -} // namespace MNN - -#endif /* ConvolutionDepthwise3x3_hpp */ diff --git a/source/backend/cpu/compute/ConvolutionPackWinograd.cpp b/source/backend/cpu/compute/ConvolutionPackWinograd.cpp index 74b23af3e..79ffa1451 100644 --- a/source/backend/cpu/compute/ConvolutionPackWinograd.cpp +++ b/source/backend/cpu/compute/ConvolutionPackWinograd.cpp @@ -262,7 +262,7 @@ ErrorCode ConvolutionPackWinograd::onResize(const std::vector &inputs, // MNN_PRINT("ow=%d, oh=%d\n", ow, oh); std::vector divides(threadNumber+1); - static_cast( static_cast(backend())->getRuntime())->computeDivideSizes(totalCount, divides.data()+1); + static_cast(backend())->computeDivideSizes(totalCount, divides.data()+1); divides[0] = 0; auto midBuffer0Bytes = srcUnit2 * pack * bytes; bool allow_x86_bf16_winograd = true; @@ -542,7 +542,7 @@ ErrorCode ConvolutionPackWinograd::onResize(const std::vector &inputs, } }; std::vector postDivides(threadNumber+1); - static_cast( static_cast(backend())->getRuntime())->computeDivideSizes(dc_4, postDivides.data()+1); + static_cast(backend())->computeDivideSizes(dc_4, postDivides.data()+1); postDivides[0] = 0; mPostFunction.first = threadNumber; diff --git a/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp b/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp index 918f47fa1..fea897d71 100644 --- a/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp +++ b/source/backend/cpu/compute/DenseConvolutionTiledExecutor.cpp @@ -541,7 +541,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs auto rt = static_cast(backend()->getRuntime()); std::vector ocC4ParralSize(threadNumber + 1); ocC4ParralSize[0] = 0; - rt->computeDivideSizes(oC4, ocC4ParralSize.data()+1); + static_cast(backend())->computeDivideSizes(oC4, ocC4ParralSize.data()+1); mFunction.second = [=](int placeholder) { const float* biasPtr = bias ? 
bias->host() : nullptr; auto gemmBuffer = mTempBufferTranspose.host() + mTempBufferTranspose.stride(0) * 0; @@ -583,7 +583,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs } info[0] = 1; int hw4Stride = info[1] * unit * bytes; - rt->computeDivideSizes(number * icC4, im2colParallelSize.data() + 1); + static_cast(backend())->computeDivideSizes(number * icC4, im2colParallelSize.data() + 1); im2colParallelSize[0] = 0; MNN_CONCURRENCY_BEGIN(tId, threadNumber) { int threadEL[4]; @@ -672,7 +672,7 @@ ErrorCode DenseConvolutionTiledImpl::onResize(const std::vector& inputs std::vector divides(threadNumber + 1); divides[0] = 0; - static_cast(static_cast(backend())->getRuntime())->computeDivideSizes(tileCount, divides.data() + 1); + static_cast(backend())->computeDivideSizes(tileCount, divides.data() + 1); mFunction.second = [=](int tId) { const float* biasPtr = bias ? bias->host() : nullptr; diff --git a/source/backend/cpu/compute/Int8FunctionsOpt.cpp b/source/backend/cpu/compute/Int8FunctionsOpt.cpp index 497ef3bf9..5bed95103 100644 --- a/source/backend/cpu/compute/Int8FunctionsOpt.cpp +++ b/source/backend/cpu/compute/Int8FunctionsOpt.cpp @@ -1416,12 +1416,7 @@ static void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, co size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realCount) { const int bytes = ((post->useInt8 == 1) ? 1 : 4); float fp32min = 0, fp32max = 0; -// if (0 == post->useInt8) { -// fp32min = (post->fp32minmax)[0]; -// fp32max = (post->fp32minmax)[1]; -// } - auto blockNum = post->blockNum; - int weight_step_Z = (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); + int weight_step_Z = src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); int weight_step_Y = (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); const auto srcSumPtr = post->srcKernelSum; if (0 == post->useInt8 && post->fp32minmax) { @@ -1486,7 +1481,7 @@ static void MNNGemmInt8AddBiasScale_16x4_w4_Unit(int8_t* dst, const int8_t* src, uint32_t c = 0xf; const int bytes = 4; float fp32min = 0, fp32max = 0; - int weight_step_Z = 0.5 * (post->blockNum * src_depth_quad) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); + int weight_step_Z = 0.5 * (src_depth_quad) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); int weight_step_Y = 0.5 * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); MNN_ASSERT(post->useInt8==0); if (post->fp32minmax) { @@ -1495,7 +1490,6 @@ static void MNNGemmInt8AddBiasScale_16x4_w4_Unit(int8_t* dst, const int8_t* src, } float* biasPtr = (float*)post->biasFloat; - int blockNum = post->blockNum; const auto srcSumPtr = post->srcKernelSum; for (int dz = 0; dz < dst_depth_quad; ++dz) { diff --git a/source/backend/cpu/x86_x64/avx/GemmInt8.cpp b/source/backend/cpu/x86_x64/avx/GemmInt8.cpp index 450714416..ed4226b89 100644 --- a/source/backend/cpu/x86_x64/avx/GemmInt8.cpp +++ b/source/backend/cpu/x86_x64/avx/GemmInt8.cpp @@ -68,13 +68,12 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const fp32min = _mm256_set1_ps((post->fp32minmax)[0]); fp32max = _mm256_set1_ps((post->fp32minmax)[1]); } - int blockNum = post->blockNum; const float* biasPtr = nullptr; if (post->biasFloat) { biasPtr = post->biasFloat; } - int weight_step_Z = 0.5 * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + int weight_step_Z = 0.5 * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); int weight_step_Y = 0.5 * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); const __m128i mask = _mm_set1_epi8(0xf); @@ -506,7 +505,6 @@ void 
_AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons fp32min = _mm256_set1_ps((post->fp32minmax)[0]); fp32max = _mm256_set1_ps((post->fp32minmax)[1]); } - int blockNum = post->blockNum; const float* biasPtr = nullptr; if (post->biasFloat) { biasPtr = post->biasFloat; @@ -554,7 +552,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons //printf("e=%d, sz=%d, dz=%d\n", realDst, src_depth_quad, dst_depth_quad); if (GEMMINT8_AVX2_E == realDst) { for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8; const float* scale_dz = post->scale + dz * AVX2_PACKINT8; auto dst_z = dst + dz * dst_step_tmp; @@ -683,7 +681,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons } if (3 == realDst) { for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8; const float* scale_dz = post->scale + dz * AVX2_PACKINT8; auto dst_z = dst + dz * dst_step_tmp; @@ -791,7 +789,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons } if (2 == realDst) { for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8; const float* scale_dz = post->scale + dz * AVX2_PACKINT8; auto dst_z = dst + dz * dst_step_tmp; @@ -879,7 +877,7 @@ void _AVX_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons } if (1 == realDst) { for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * blockNum * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); + const auto weight_dz = weight + dz * src_depth_quad * (GEMMINT8_AVX2_L * GEMMINT8_AVX2_H); const auto weightBias_dz = post->weightQuanBias + dz * AVX2_PACKINT8; const float* scale_dz = post->scale + dz * AVX2_PACKINT8; auto dst_z = dst + dz * dst_step_tmp; diff --git a/source/backend/cpu/x86_x64/avx/PackedFunction.cpp b/source/backend/cpu/x86_x64/avx/PackedFunction.cpp index eb006312c..3f2ae1291 100644 --- a/source/backend/cpu/x86_x64/avx/PackedFunction.cpp +++ b/source/backend/cpu/x86_x64/avx/PackedFunction.cpp @@ -35,8 +35,6 @@ void _AVX_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int void _AVX_MNNRoiAlignMax(float* dst, const float* src, const std::vector> &vecPos, const std::vector> &vecArea, int samplingRatioArea, int pooledHeight, int pooledWidth); void _AVX_MNNRoiAlignAvg(float* dst, const float* src, const std::vector> &vecPos, const std::vector> &vecArea, int samplingRatioArea, int pooledHeight, int pooledWidth); void _AVX_MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c22, float* xAddr, size_t cStride, size_t eSub, size_t hSub); -void _AVX_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step); void 
_AVX_MNNMultiAndDestTransformCommon23(float **cacheLine, const float *weigth, float *dest, int cacheLineSize, int ow, const float* bias, const float* parameter); void _AVX_MNNSourceTransformCommonF23(const float *source, float *dest, int unit, int iw, int pad, int su, int eu); void _AVX_MNNConvDwF23MulTransUnit(float **cacheLine, const float *weigth, float *dest, size_t ow, const float* bias, const float* parameter); @@ -48,7 +46,7 @@ void _AVX_MNNStrassenMergeCFunction(float* c11, float* c12, float* c21, float* c size_t length, size_t hSub); void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep); + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters); void _AVX_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, size_t width, size_t cStride, size_t aStride, size_t height, const float* parameters); } @@ -108,40 +106,25 @@ void _AVX_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, si } } -void _AVX_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { - int fx, fy; - __m256 dstValue = _mm256_setzero_ps(); - const float* src_z = src; - const float* weight_z = weight; - for (fy = 0; fy < fh; ++fy) { - const float* src_y = src_z + fy * dilateY_step; - const float* weight_y = weight_z + fy * weight_y_step; - for (fx = 0; fx < fw; ++fx) { - const float* weight_x = weight_y + PACK_UNIT * fx; - const float* src_x = src_y + fx * dilateX_step; - dstValue = _mm256_add_ps(dstValue, _mm256_mul_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x))); - } - } - _mm256_storeu_ps(dst, dstValue); -} - void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep) { + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) { int dx, fx, fy; const int unit = 4; int widthUnit = width / unit; int widthRemain = width - widthUnit * unit; const float* weight_z = weight; + auto minF = _mm256_broadcast_ss(parameters + 0); + auto maxF = _mm256_broadcast_ss(parameters + 1); + auto bv = _mm256_loadu_ps(bias); for (int y = 0; y < height; ++y) { auto srcY = src + y * srcHStep; auto dstY = dst + y * dstHStep; for (dx = 0; dx < widthUnit; ++dx) { - auto dstValue0 = _mm256_setzero_ps(); - auto dstValue1 = _mm256_setzero_ps(); - auto dstValue2 = _mm256_setzero_ps(); - auto dstValue3 = _mm256_setzero_ps(); + auto dstValue0 = bv; + auto dstValue1 = bv; + auto dstValue2 = bv; + auto dstValue3 = bv; for (fy = 0; fy < fh; ++fy) { const float* src_y = srcY + fy * dilateY_step; const float* weight_y = weight_z + fy * fw * PACK_UNIT; @@ -155,6 +138,14 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* dstValue3 = _mm256_add_ps(dstValue3, _mm256_mul_ps(_mm256_loadu_ps(src_x + 3 * src_w_setup), weightValue)); } } + dstValue0 = _mm256_min_ps(dstValue0, maxF); + dstValue1 = _mm256_min_ps(dstValue1, maxF); + dstValue2 = _mm256_min_ps(dstValue2, maxF); + dstValue3 = _mm256_min_ps(dstValue3, maxF); + dstValue0 = _mm256_max_ps(dstValue0, minF); + dstValue1 = _mm256_max_ps(dstValue1, minF); + dstValue2 = _mm256_max_ps(dstValue2, minF); + dstValue3 = 
_mm256_max_ps(dstValue3, minF); _mm256_storeu_ps(dstY + PACK_UNIT * 0, dstValue0); _mm256_storeu_ps(dstY + PACK_UNIT * 1, dstValue1); _mm256_storeu_ps(dstY + PACK_UNIT * 2, dstValue2); @@ -164,7 +155,7 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* } for (dx = 0; dx < widthRemain; ++dx) { float* dst_x = dstY + dx * PACK_UNIT; - auto dstValue = _mm256_setzero_ps(); + auto dstValue = bv; const float* src_z = srcY + src_w_setup * dx; const float* weight_z = weight; for (fy = 0; fy < fh; ++fy) { @@ -176,6 +167,8 @@ void _AVX_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* dstValue = _mm256_add_ps(dstValue, _mm256_mul_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x))); } } + dstValue = _mm256_min_ps(dstValue, maxF); + dstValue = _mm256_max_ps(dstValue, minF); _mm256_storeu_ps(dst_x, dstValue); } } @@ -316,68 +309,6 @@ void _AVX_MNNGridSampleComputeCord(float* dst, const float* src, size_t inH, siz } } -static size_t _AVX_MNNGridSampleComputeOffset(int h, int w, int height, int width, bool padMode) { - if (padMode == true) { //padMode == BorderMode_ZEROS - if (h < 0 || h >= height || w < 0 || w >= width) { - return -1; - } - } else { - // Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER - // For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1), - // the leftover reflections degrade to GridSamplePaddingMode_BORDER - h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h); - w = w < 0 ? 0 : ( w > (width - 1) ? (width - 1) : w); - } - return h * width * PACK_UNIT + w * PACK_UNIT; -} - -void _AVX_MNNGridSampleInterp(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) { - for (auto ow = 0; ow < outW; ++ow) { - auto w = cordPtr[2 * ow + 0]; - auto h = cordPtr[2 * ow + 1]; - __m256 interp; - - if (sampleMode == true) { //sampleMode == SampleMode_NEAREST - int nh = ::floor(h + 0.5f); - int nw = ::floor(w + 0.5f); - size_t ns = _AVX_MNNGridSampleComputeOffset(nh, nw, inH, inW, padMode); - for (int k = 0; k < channelCUnit; ++k) { - interp = ns == -1 ? _mm256_set1_ps(0.f) : _mm256_loadu_ps(inputPtr + k * inOffset + ns); - _mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp); - } - } else { //sampleMode == GridSampleMode_BILINEAR - int w0_h = ::floor(h); - int w0_w = ::floor(w); - int w1_h = ::ceil(h); - int w1_w = ::ceil(w); - auto oneV = _mm256_set1_ps(1.0f); - - auto f0 = _mm256_set1_ps((float)w1_w - w); - auto f1 = _mm256_sub_ps(oneV, f0); - auto h0 = _mm256_set1_ps((float)w1_h - h); - auto h1 = _mm256_sub_ps(oneV, h0); - - size_t s00 = _AVX_MNNGridSampleComputeOffset(w0_h, w0_w, inH, inW, padMode); - size_t s01 = _AVX_MNNGridSampleComputeOffset(w0_h, w1_w, inH, inW, padMode); - size_t s10 = _AVX_MNNGridSampleComputeOffset(w1_h, w0_w, inH, inW, padMode); - size_t s11 = _AVX_MNNGridSampleComputeOffset(w1_h, w1_w, inH, inW, padMode); - - for (int k = 0; k < channelCUnit; ++k) { - __m256 i00 = s00 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s00); - __m256 i01 = s01 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s01); - __m256 i10 = s10 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s10); - __m256 i11 = s11 == -1 ? 
_mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s11); - - __m256 i0 = _mm256_add_ps(_mm256_mul_ps(i00, f0), _mm256_mul_ps(i01, f1)); - __m256 i1 = _mm256_add_ps(_mm256_mul_ps(i10, f0), _mm256_mul_ps(i11, f1)); - - interp = _mm256_add_ps(_mm256_mul_ps(i0, h0), _mm256_mul_ps(i1, h1)); - _mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp); - } - } - } -} - void _AVX_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int iw) { Vec8 max = Vec8(-FLT_MAX); for (int h = 0; h < hLen; h++, src += iw * PACK_UNIT) { @@ -524,70 +455,6 @@ static size_t _AVX_MNNGridSampleComputeOffset3D(int d, int h, int w, int depth, return ((d * height + h) * width + w) * PACK_UNIT; } -void _AVX_MNNGridSampleInterp3D(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inD, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) { - for (auto ow = 0; ow < outW; ++ow) { - auto w = cordPtr[3 * ow + 0]; - auto h = cordPtr[3 * ow + 1]; - auto d = cordPtr[3 * ow + 2]; - __m256 interp; - - if (sampleMode == true) { //sampleMode == SampleMode_NEAREST - int nd = ::floor(d + 0.5f); - int nh = ::floor(h + 0.5f); - int nw = ::floor(w + 0.5f); - size_t ns = _AVX_MNNGridSampleComputeOffset3D(nd, nh, nw, inD, inH, inW, padMode); - for (int k = 0; k < channelCUnit; ++k) { - interp = ns == -1 ? _mm256_set1_ps(0.f) : _mm256_loadu_ps(inputPtr + k * inOffset + ns); - _mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp); - } - } else { //sampleMode == GridSampleMode_BILINEAR - int w0_d = ::floor(d); - int w0_h = ::floor(h); - int w0_w = ::floor(w); - int w1_d = ::ceil(d); - int w1_h = ::ceil(h); - int w1_w = ::ceil(w); - auto oneV = _mm256_set1_ps(1.0f); - - auto f0 = _mm256_set1_ps((float)w1_w - w); - auto f1 = _mm256_sub_ps(oneV, f0); - auto h0 = _mm256_set1_ps((float)w1_h - h); - auto h1 = _mm256_sub_ps(oneV, h0); - auto d0 = _mm256_set1_ps((float)w1_d - d); - auto d1 = _mm256_sub_ps(oneV, d0); - - size_t s000 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w0_h, w0_w, inD, inH, inW, padMode); - size_t s001 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w0_h, w1_w, inD, inH, inW, padMode); - size_t s010 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w1_h, w0_w, inD, inH, inW, padMode); - size_t s011 = _AVX_MNNGridSampleComputeOffset3D(w0_d, w1_h, w1_w, inD, inH, inW, padMode); - size_t s100 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w0_h, w0_w, inD, inH, inW, padMode); - size_t s101 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w0_h, w1_w, inD, inH, inW, padMode); - size_t s110 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w1_h, w0_w, inD, inH, inW, padMode); - size_t s111 = _AVX_MNNGridSampleComputeOffset3D(w1_d, w1_h, w1_w, inD, inH, inW, padMode); - - for (int k = 0; k < channelCUnit; ++k) { - __m256 i000 = s000 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s000); - __m256 i001 = s001 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s001); - __m256 i010 = s010 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s010); - __m256 i011 = s011 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s011); - __m256 i100 = s100 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s100); - __m256 i101 = s101 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s101); - __m256 i110 = s110 == -1 ? _mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s110); - __m256 i111 = s111 == -1 ? 
_mm256_setzero_ps() : _mm256_loadu_ps(inputPtr + k * inOffset + s111); - - __m256 i00 = _mm256_add_ps(_mm256_mul_ps(i000, f0), _mm256_mul_ps(i001, f1)); - __m256 i01 = _mm256_add_ps(_mm256_mul_ps(i010, f0), _mm256_mul_ps(i011, f1)); - __m256 i0 = _mm256_add_ps(_mm256_mul_ps(i00, h0), _mm256_mul_ps(i01, h1)); - __m256 i10 = _mm256_add_ps(_mm256_mul_ps(i100, f0), _mm256_mul_ps(i101, f1)); - __m256 i11 = _mm256_add_ps(_mm256_mul_ps(i110, f0), _mm256_mul_ps(i111, f1)); - __m256 i1 = _mm256_add_ps(_mm256_mul_ps(i10, h0), _mm256_mul_ps(i11, h1)); - - interp = _mm256_add_ps(_mm256_mul_ps(i0, d0), _mm256_mul_ps(i1, d1)); - _mm256_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp); - } - } - } -} void _AVX_MNNMatrixAdd(float* C, const float* A, const float* B, size_t widthC4, size_t cStride, size_t aStride, size_t bStride, size_t height) { @@ -867,13 +734,9 @@ void _AVX_ExtraInit(void* functions) { coreFunction->MNNMatrixAdd = _AVX_MNNMatrixAdd; coreFunction->MNNMatrixSub = _AVX_MNNMatrixSub; - coreFunction->MNNConvRunForUnitDepthWise = _AVX_MNNConvRunForUnitDepthWise; coreFunction->MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwise; coreFunction->MNNAxByClampBroadcastUnit = _AVX_MNNAxByClampBroadcastUnit; coreFunction->MNNStrassenMergeCFunction = _AVX_MNNStrassenMergeCFunction; - coreFunction->MNNMultiAndDestTransformCommon23 = _AVX_MNNMultiAndDestTransformCommon23; - coreFunction->MNNSourceTransformCommonF23 = _AVX_MNNSourceTransformCommonF23; - coreFunction->MNNConvDwF23MulTransUnit = _AVX_MNNConvDwF23MulTransUnit; coreFunction->MNNReluWithSlopeChannel = _AVX_MNNReluWithSlopeChannel; coreFunction->MNNDeconvRunForLineDepthwise = _AVX_MNNDeconvRunForLineDepthwise; coreFunction->MNNDeconvRunForUnitDepthWise = _AVX_MNNDeconvRunForUnitDepthWise; @@ -881,7 +744,7 @@ void _AVX_ExtraInit(void* functions) { coreFunction->MNNGridSampleInterp = MNNGridSampleInterp; coreFunction->MNNGridSampleInterpGrad = MNNGridSampleInterpGrad; coreFunction->MNNGridSampleComputeCord3D = _AVX_MNNGridSampleComputeCord3D; - coreFunction->MNNGridSampleInterp3D = _AVX_MNNGridSampleInterp3D; + coreFunction->MNNGridSampleInterp3D = MNNGridSampleInterp3D; coreFunction->MNNRoiPoolingMax = _AVX_MNNRoiPoolingMax; coreFunction->MNNRoiAlignMax = _AVX_MNNRoiAlignMax; coreFunction->MNNRoiAlignAvg = _AVX_MNNRoiAlignAvg; diff --git a/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp b/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp index 31335e2cf..5d73ffc50 100644 --- a/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp +++ b/source/backend/cpu/x86_x64/avx512/GemmInt8_VNNI.cpp @@ -115,7 +115,6 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s fp32min = _mm512_set1_ps((post->fp32minmax)[0]); fp32max = _mm512_set1_ps((post->fp32minmax)[1]); } - auto blockNum = post->blockNum; const float* biasPtr = nullptr; const float* bias_dz = nullptr; const float* extraB_dz = nullptr; @@ -162,7 +161,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_Unit_VNNI(int8_t* dst, const int8_t* s } } } - int weightZStride = blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + int weightZStride = src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); if (realDst == GEMMINT8_AVX512_E) { for (int dz = 0; dz < dzU; ++dz) { auto weight_dz = weight + dz * weightZStride; @@ -1452,7 +1451,6 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t fp32min = _mm512_set1_ps((post->fp32minmax)[0]); fp32max = _mm512_set1_ps((post->fp32minmax)[1]); } - auto blockNum = 
post->blockNum; const float* biasPtr = nullptr; const float* bias_dz = nullptr; const float* extraB_dz = nullptr; @@ -1500,7 +1498,7 @@ void _AVX512_MNNGemmInt8AddBiasScale_16x4_w4_Unit_VNNI(int8_t* dst, const int8_t } } } - int weight_step_Z = static_cast(blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2); // sizeof(int4_t) + int weight_step_Z = static_cast(src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2); // sizeof(int4_t) int weight_step_Y = static_cast(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H / 2); // sizeof(int4_t) if (realDst == GEMMINT8_AVX512_E) { diff --git a/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl b/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl index 5addec946..44e9bc36f 100644 --- a/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl +++ b/source/backend/cpu/x86_x64/avx512/Matmul_4_4_64.inl @@ -105,7 +105,6 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s fp32min = _mm512_set1_ps((post->fp32minmax)[0]); fp32max = _mm512_set1_ps((post->fp32minmax)[1]); } - auto blockNum = post->blockNum; const float* biasPtr = nullptr; const float* bias_dz = nullptr; const float* extraB_dz = nullptr; @@ -113,7 +112,7 @@ void MATMULCOREFUNC_NAME(int8_t* dst, const int8_t* src, const int8_t* weight, s biasPtr = post->biasFloat; } - int weightZStride = blockNum * src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); + int weightZStride = src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H); auto srcKernelSumPtr = post->srcKernelSum; __m512 kernelSum0 = _mm512_setzero_ps(); @@ -1444,7 +1443,6 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight fp32min = _mm512_set1_ps((post->fp32minmax)[0]); fp32max = _mm512_set1_ps((post->fp32minmax)[1]); } - auto blockNum = post->blockNum; const float* biasPtr = nullptr; const float* bias_dz = nullptr; const float* extraB_dz = nullptr; @@ -1458,7 +1456,7 @@ void MATMULCOREFUNC_NAME_W4(int8_t* dst, const int8_t* src, const int8_t* weight __m512 kernelSum2 = _mm512_setzero_ps(); __m512 kernelSum3 = _mm512_setzero_ps(); - int weight_step_Z = static_cast(src_depth_quad * blockNum * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2); + int weight_step_Z = static_cast(src_depth_quad * (GEMMINT8_AVX512_L * GEMMINT8_AVX512_H) / 2); int weight_step_Y = static_cast(GEMMINT8_AVX512_L * GEMMINT8_AVX512_H / 2); const __m512i mask = _mm512_set1_epi8(0xf); if (GEMMINT8_AVX512_E == realDst) { diff --git a/source/backend/cpu/x86_x64/avx512/PackedFunction.cpp b/source/backend/cpu/x86_x64/avx512/PackedFunction.cpp index 047c3dc7a..3542e717c 100644 --- a/source/backend/cpu/x86_x64/avx512/PackedFunction.cpp +++ b/source/backend/cpu/x86_x64/avx512/PackedFunction.cpp @@ -124,40 +124,25 @@ void _AVX512_MNNAxByClampBroadcastUnit(float* C, const float* A, const float* B, } } -void _AVX512_MNNConvRunForUnitDepthWise(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { - int fx, fy; - __m512 dstValue = _mm512_setzero_ps(); - const float* src_z = src; - const float* weight_z = weight; - for (fy = 0; fy < fh; ++fy) { - const float* src_y = src_z + fy * dilateY_step; - const float* weight_y = weight_z + fy * weight_y_step; - for (fx = 0; fx < fw; ++fx) { - const float* weight_x = weight_y + PACK_UNIT * fx; - const float* src_x = src_y + fx * dilateX_step; - dstValue = _mm512_fmadd_ps(_mm512_loadu_ps(src_x), _mm512_loadu_ps(weight_x), dstValue); - } - } - _mm512_storeu_ps(dst, dstValue); -} - 
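Across the SSE, AVX, AVX-FMA and AVX-512 hunks in this patch, MNNConvRunForLineDepthwise gains two extra arguments, bias and parameters, so the bias add and the min/max clamp from the post parameters are applied inside the depthwise kernel itself: every accumulator starts from the broadcast bias vector and is clamped with min/max just before the store, as the _AVX512 variant that follows shows. A scalar reference of that pattern, with a simplified hypothetical signature (the real kernels process PACK_UNIT channels per vector register):

#include <algorithm>
#include <cstddef>

// Illustrative scalar version of the fused depthwise line kernel.
// 'parameters' holds {minValue, maxValue}, matching how the SIMD kernels
// broadcast parameters[0]/parameters[1] before clamping.
static void DepthwiseLineRef(float* dst, const float* src, const float* weight,
                             size_t width, size_t srcWStep, size_t fw, size_t fh,
                             size_t dilateXStep, size_t dilateYStep,
                             float bias, const float* parameters) {
    const float minF = parameters[0];
    const float maxF = parameters[1];
    for (size_t dx = 0; dx < width; ++dx) {
        float acc = bias;                        // accumulator starts from the bias
        const float* srcX = src + dx * srcWStep;
        for (size_t fy = 0; fy < fh; ++fy) {
            const float* srcY = srcX + fy * dilateYStep;
            const float* wY   = weight + fy * fw;
            for (size_t fx = 0; fx < fw; ++fx) {
                acc += srcY[fx * dilateXStep] * wY[fx];
            }
        }
        dst[dx] = std::max(std::min(acc, maxF), minF);  // clamp in-kernel, mirroring min_ps/max_ps before the stores
    }
}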
void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep) { + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) { int dx, fx, fy; const int unit = 4; int widthUnit = width / unit; int widthRemain = width - widthUnit * unit; const float* weight_z = weight; + auto minF = _mm512_broadcastss_ps(_mm_load_ss(parameters + 0)); + auto maxF = _mm512_broadcastss_ps(_mm_load_ss(parameters + 1)); + auto bv = _mm512_loadu_ps(bias); for (int y = 0; y < height; ++y) { auto srcY = src + y * srcHStep; auto dstY = dst + y * dstHStep; for (dx = 0; dx < widthUnit; ++dx) { - auto dstValue0 = _mm512_setzero_ps(); - auto dstValue1 = _mm512_setzero_ps(); - auto dstValue2 = _mm512_setzero_ps(); - auto dstValue3 = _mm512_setzero_ps(); + auto dstValue0 = bv; + auto dstValue1 = bv; + auto dstValue2 = bv; + auto dstValue3 = bv; for (fy = 0; fy < fh; ++fy) { const float* src_y = srcY + fy * dilateY_step; const float* weight_y = weight_z + fy * fw * PACK_UNIT; @@ -171,6 +156,14 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa dstValue3 = _mm512_fmadd_ps(_mm512_loadu_ps(src_x + 3 * src_w_setup), weightValue, dstValue3); } } + dstValue0 = _mm512_min_ps(dstValue0, maxF); + dstValue1 = _mm512_min_ps(dstValue1, maxF); + dstValue2 = _mm512_min_ps(dstValue2, maxF); + dstValue3 = _mm512_min_ps(dstValue3, maxF); + dstValue0 = _mm512_max_ps(dstValue0, minF); + dstValue1 = _mm512_max_ps(dstValue1, minF); + dstValue2 = _mm512_max_ps(dstValue2, minF); + dstValue3 = _mm512_max_ps(dstValue3, minF); _mm512_storeu_ps(dstY + PACK_UNIT * 0, dstValue0); _mm512_storeu_ps(dstY + PACK_UNIT * 1, dstValue1); _mm512_storeu_ps(dstY + PACK_UNIT * 2, dstValue2); @@ -180,7 +173,7 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa } for (dx = 0; dx < widthRemain; ++dx) { float* dst_x = dstY + dx * PACK_UNIT; - auto dstValue = _mm512_setzero_ps(); + auto dstValue = bv; const float* src_z = srcY + src_w_setup * dx; const float* weight_z = weight; for (fy = 0; fy < fh; ++fy) { @@ -192,6 +185,8 @@ void _AVX512_MNNConvRunForLineDepthwise(float* dst, const float* src, const floa dstValue = _mm512_fmadd_ps(_mm512_loadu_ps(src_x), _mm512_loadu_ps(weight_x), dstValue); } } + dstValue = _mm512_min_ps(dstValue, maxF); + dstValue = _mm512_max_ps(dstValue, minF); _mm512_storeu_ps(dst_x, dstValue); } } @@ -307,68 +302,6 @@ void _AVX512_MNNGridSampleComputeCord(float* dst, const float* src, size_t inH, } } -static size_t _AVX512_MNNGridSampleComputeOffset(int h, int w, int height, int width, bool padMode) { - if (padMode == true) { //padMode == BorderMode_ZEROS - if (h < 0 || h >= height || w < 0 || w >= width) { - return -1; - } - } else { - // Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER - // For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1), - // the leftover reflections degrade to GridSamplePaddingMode_BORDER - h = h < 0 ? 0 : ( h > (height - 1) ? (height - 1) : h); - w = w < 0 ? 0 : ( w > (width - 1) ? 
(width - 1) : w); - } - return h * width * PACK_UNIT + w * PACK_UNIT; -} - -void _AVX512_MNNGridSampleInterp(float* outputPtr, const float* inputPtr, const float* cordPtr, size_t inH, size_t inW, size_t outW, size_t channelCUnit, size_t inOffset, size_t outOffset, bool sampleMode, bool padMode) { - for (auto ow = 0; ow < outW; ++ow) { - auto w = cordPtr[2 * ow + 0]; - auto h = cordPtr[2 * ow + 1]; - __m512 interp; - - if (sampleMode == true) { //sampleMode == SampleMode_NEAREST - int nh = ::floor(h + 0.5f); - int nw = ::floor(w + 0.5f); - size_t ns = _AVX512_MNNGridSampleComputeOffset(nh, nw, inH, inW, padMode); - for (int k = 0; k < channelCUnit; ++k) { - interp = ns == -1 ? _mm512_set1_ps(0.f) : _mm512_loadu_ps(inputPtr + k * inOffset + ns); - _mm512_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp); - } - } else { //sampleMode == GridSampleMode_BILINEAR - int w0_h = ::floor(h); - int w0_w = ::floor(w); - int w1_h = ::ceil(h); - int w1_w = ::ceil(w); - auto oneV = _mm512_set1_ps(1.0f); - - auto f0 = _mm512_set1_ps((float)w1_w - w); - auto f1 = _mm512_sub_ps(oneV, f0); - auto h0 = _mm512_set1_ps((float)w1_h - h); - auto h1 = _mm512_sub_ps(oneV, h0); - - size_t s00 = _AVX512_MNNGridSampleComputeOffset(w0_h, w0_w, inH, inW, padMode); - size_t s01 = _AVX512_MNNGridSampleComputeOffset(w0_h, w1_w, inH, inW, padMode); - size_t s10 = _AVX512_MNNGridSampleComputeOffset(w1_h, w0_w, inH, inW, padMode); - size_t s11 = _AVX512_MNNGridSampleComputeOffset(w1_h, w1_w, inH, inW, padMode); - - for (int k = 0; k < channelCUnit; ++k) { - __m512 i00 = s00 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s00); - __m512 i01 = s01 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s01); - __m512 i10 = s10 == -1 ? _mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s10); - __m512 i11 = s11 == -1 ? 
_mm512_setzero_ps() : _mm512_loadu_ps(inputPtr + k * inOffset + s11); - - __m512 i0 = _mm512_add_ps(_mm512_mul_ps(i00, f0), _mm512_mul_ps(i01, f1)); - __m512 i1 = _mm512_add_ps(_mm512_mul_ps(i10, f0), _mm512_mul_ps(i11, f1)); - - interp = _mm512_add_ps(_mm512_mul_ps(i0, h0), _mm512_mul_ps(i1, h1)); - _mm512_storeu_ps(outputPtr + k * outOffset + PACK_UNIT * ow, interp); - } - } - } -} - void _AVX512_MNNRoiPoolingMax(float* dst, const float* src, int hLen, int wLen, int iw) { Vec16 max = Vec16(-FLT_MAX); for (int h = 0; h < hLen; h++, src += iw * PACK_UNIT) { @@ -752,13 +685,9 @@ void _AVX512_ExtraInit(void* functions) { coreFunction->MNNCountMaxMinValue = _AVX512_MNNComputeScaleZeroScalar; coreFunction->MNNAbsMax = _AVX512_MNNAbsMaxFP32; - coreFunction->MNNConvRunForUnitDepthWise = _AVX512_MNNConvRunForUnitDepthWise; coreFunction->MNNConvRunForLineDepthwise = _AVX512_MNNConvRunForLineDepthwise; coreFunction->MNNAxByClampBroadcastUnit = _AVX512_MNNAxByClampBroadcastUnit; coreFunction->MNNStrassenMergeCFunction = _AVX512_MNNStrassenMergeCFunction; - coreFunction->MNNMultiAndDestTransformCommon23 = _AVX512_MNNMultiAndDestTransformCommon23; - coreFunction->MNNSourceTransformCommonF23 = _AVX512_MNNSourceTransformCommonF23; - coreFunction->MNNConvDwF23MulTransUnit = _AVX512_MNNConvDwF23MulTransUnit; coreFunction->MNNReluWithSlopeChannel = _AVX512_MNNReluWithSlopeChannel; coreFunction->MNNDeconvRunForLineDepthwise = _AVX512_MNNDeconvRunForLineDepthwise; coreFunction->MNNDeconvRunForUnitDepthWise = _AVX512_MNNDeconvRunForUnitDepthWise; @@ -767,6 +696,7 @@ void _AVX512_ExtraInit(void* functions) { coreFunction->MNNRoiAlignMax = _AVX512_MNNRoiAlignMax; coreFunction->MNNRoiAlignAvg = _AVX512_MNNRoiAlignAvg; coreFunction->MNNGridSampleInterp = MNNGridSampleInterp; + coreFunction->MNNGridSampleInterp3D = MNNGridSampleInterp3D; coreFunction->MNNGridSampleInterpGrad = MNNGridSampleInterpGrad; coreFunction->MNNGetSparseMatMulPackMode = _AVX512_MNNGetSparseMatMulPackMode; diff --git a/source/backend/cpu/x86_x64/avxfma/PackedFunction.cpp b/source/backend/cpu/x86_x64/avxfma/PackedFunction.cpp index 8b3dc590a..6102508a6 100644 --- a/source/backend/cpu/x86_x64/avxfma/PackedFunction.cpp +++ b/source/backend/cpu/x86_x64/avxfma/PackedFunction.cpp @@ -11,40 +11,25 @@ #define PACK_UNIT 8 -void _AVX_MNNConvRunForUnitDepthWiseFMA(float* dst, const float* src, const float* weight, size_t fw, size_t fh, - size_t weight_y_step, size_t dilateX_step, size_t dilateY_step) { - int fx, fy; - __m256 dstValue = _mm256_setzero_ps(); - const float* src_z = src; - const float* weight_z = weight; - for (fy = 0; fy < fh; ++fy) { - const float* src_y = src_z + fy * dilateY_step; - const float* weight_y = weight_z + fy * weight_y_step; - for (fx = 0; fx < fw; ++fx) { - const float* weight_x = weight_y + PACK_UNIT * fx; - const float* src_x = src_y + fx * dilateX_step; - dstValue = _mm256_fmadd_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x), dstValue); - } - } - _mm256_storeu_ps(dst, dstValue); -} - void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep) { + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) { int dx, fx, fy; const int unit = 4; int widthUnit = width / unit; int widthRemain = width - widthUnit * unit; const float* weight_z = weight; + auto minF = _mm256_broadcast_ss(parameters + 0); + auto maxF = 
_mm256_broadcast_ss(parameters + 1); + auto bv = _mm256_loadu_ps(bias); for (int y = 0; y < height; ++y) { auto srcY = src + y * srcHStep; auto dstY = dst + y * dstHStep; for (dx = 0; dx < widthUnit; ++dx) { - auto dstValue0 = _mm256_setzero_ps(); - auto dstValue1 = _mm256_setzero_ps(); - auto dstValue2 = _mm256_setzero_ps(); - auto dstValue3 = _mm256_setzero_ps(); + auto dstValue0 = bv; + auto dstValue1 = bv; + auto dstValue2 = bv; + auto dstValue3 = bv; for (fy = 0; fy < fh; ++fy) { const float* src_y = srcY + fy * dilateY_step; const float* weight_y = weight_z + fy * fw * PACK_UNIT; @@ -58,6 +43,14 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa dstValue3 = _mm256_fmadd_ps(_mm256_loadu_ps(src_x + 3 * src_w_setup), weightValue, dstValue3); } } + dstValue0 = _mm256_min_ps(dstValue0, maxF); + dstValue1 = _mm256_min_ps(dstValue1, maxF); + dstValue2 = _mm256_min_ps(dstValue2, maxF); + dstValue3 = _mm256_min_ps(dstValue3, maxF); + dstValue0 = _mm256_max_ps(dstValue0, minF); + dstValue1 = _mm256_max_ps(dstValue1, minF); + dstValue2 = _mm256_max_ps(dstValue2, minF); + dstValue3 = _mm256_max_ps(dstValue3, minF); _mm256_storeu_ps(dstY + PACK_UNIT * 0, dstValue0); _mm256_storeu_ps(dstY + PACK_UNIT * 1, dstValue1); _mm256_storeu_ps(dstY + PACK_UNIT * 2, dstValue2); @@ -67,7 +60,7 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa } for (dx = 0; dx < widthRemain; ++dx) { float* dst_x = dstY + dx * PACK_UNIT; - auto dstValue = _mm256_setzero_ps(); + auto dstValue = bv; const float* src_z = srcY + src_w_setup * dx; const float* weight_z = weight; for (fy = 0; fy < fh; ++fy) { @@ -79,6 +72,8 @@ void _AVX_MNNConvRunForLineDepthwiseFMA(float* dst, const float* src, const floa dstValue = _mm256_fmadd_ps(_mm256_loadu_ps(src_x), _mm256_loadu_ps(weight_x), dstValue); } } + dstValue = _mm256_min_ps(dstValue, maxF); + dstValue = _mm256_max_ps(dstValue, minF); _mm256_storeu_ps(dst_x, dstValue); } } @@ -173,8 +168,6 @@ static void _AVXFMA_MNNAdjustOptimalSparseKernel(int& sparseBlockOC, MNN::CoreFu void _AVX_ExtraInitFMA(void* functions) { auto coreFunction = static_cast(functions); coreFunction->MNNConvRunForLineDepthwise = _AVX_MNNConvRunForLineDepthwiseFMA; - coreFunction->MNNConvRunForUnitDepthWise = _AVX_MNNConvRunForUnitDepthWiseFMA; - coreFunction->MNNConvDwF23MulTransUnit = _AVX_MNNConvDwF23MulTransUnitFMA; // sparse conv init coreFunction->MNNAdjustOptimalSparseKernel = _AVXFMA_MNNAdjustOptimalSparseKernel; diff --git a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp index 7e8fff748..a132b48b9 100644 --- a/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp +++ b/source/backend/cpu/x86_x64/sse/FunctionSummary.hpp @@ -68,7 +68,7 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const void _SSE_MNNPackC4ForMatMul_A(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el); void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep); + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters); void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realDst); void _SSE_MNNExpC8(float* dest, 
const float* source, float* offset, const float* parameters, size_t countC8); diff --git a/source/backend/cpu/x86_x64/sse/GemmInt8.cpp b/source/backend/cpu/x86_x64/sse/GemmInt8.cpp index f1fb9b338..d20f3dc23 100644 --- a/source/backend/cpu/x86_x64/sse/GemmInt8.cpp +++ b/source/backend/cpu/x86_x64/sse/GemmInt8.cpp @@ -73,9 +73,8 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, cons if (post->biasFloat) { biasPtr = post->biasFloat; } - auto blockNum = post->blockNum; for (int dz = 0; dz < dst_depth_quad; ++dz) { - const auto weight_dz = weight + dz * (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); + const auto weight_dz = weight + dz * src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); const auto weightBias_dz = post->weightQuanBias + dz * GEMM_INT8_UNIT; const float* scale_dz = nullptr; scale_dz = post->scale + dz * GEMM_INT8_UNIT; @@ -324,8 +323,7 @@ void _SSE_MNNGemmInt8AddBiasScale_16x4_w4(int8_t* dst, const int8_t* src, const if (post->biasFloat) { biasPtr = post->biasFloat; } - int blockNum = post->blockNum; - int weight_step_Z = 0.5 * (src_depth_quad * blockNum) * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); + int weight_step_Z = 0.5 * src_depth_quad * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); int weight_step_Y = 0.5 * (GEMM_INT8_UNIT * GEMM_INT8_SRC_UNIT); auto oneValue = _mm_set1_epi16(1); diff --git a/source/backend/cpu/x86_x64/sse/PackedFunction.cpp b/source/backend/cpu/x86_x64/sse/PackedFunction.cpp index 0006aeb21..ab6d5a705 100644 --- a/source/backend/cpu/x86_x64/sse/PackedFunction.cpp +++ b/source/backend/cpu/x86_x64/sse/PackedFunction.cpp @@ -65,7 +65,7 @@ void _SSE_MNNReluWithSlopeChannel(float* dst, const float* src, const float* slo void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* weight, size_t width, size_t src_w_setup, size_t fw, size_t fh, size_t dilateX_step, size_t dilateY_step, size_t height, - size_t srcHStep, size_t dstHStep) { + size_t srcHStep, size_t dstHStep, const float* bias, const float* parameters) { int dx, fx, fy; const int unit = 8; int widthUnit = width / unit; @@ -75,18 +75,21 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* if (need4) { widthRemain-=4; } + auto minF = _mm_set1_ps(parameters[0]); + auto maxF = _mm_set1_ps(parameters[1]); + auto bv = _mm_loadu_ps(bias); for (int y = 0; y < height; ++y) { auto srcY = src + y * srcHStep; auto dstY = dst + y * dstHStep; for (dx = 0; dx < widthUnit; ++dx) { - auto dstValue0 = _mm_set1_ps(0.0f); - auto dstValue1 = _mm_set1_ps(0.0f); - auto dstValue2 = _mm_set1_ps(0.0f); - auto dstValue3 = _mm_set1_ps(0.0f); - auto dstValue4 = _mm_set1_ps(0.0f); - auto dstValue5 = _mm_set1_ps(0.0f); - auto dstValue6 = _mm_set1_ps(0.0f); - auto dstValue7 = _mm_set1_ps(0.0f); + auto dstValue0 = bv; + auto dstValue1 = bv; + auto dstValue2 = bv; + auto dstValue3 = bv; + auto dstValue4 = bv; + auto dstValue5 = bv; + auto dstValue6 = bv; + auto dstValue7 = bv; for (fy = 0; fy < fh; ++fy) { const float* src_y = srcY + fy * dilateY_step; const float* weight_y = weight_z + fy * fw * 4; @@ -104,6 +107,24 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* dstValue7 = _mm_add_ps(dstValue7, _mm_mul_ps(_mm_loadu_ps(src_x + 7 * src_w_setup), weightValue)); } } + dstValue0 = _mm_min_ps(dstValue0, maxF); + dstValue1 = _mm_min_ps(dstValue1, maxF); + dstValue2 = _mm_min_ps(dstValue2, maxF); + dstValue3 = _mm_min_ps(dstValue3, maxF); + dstValue4 = _mm_min_ps(dstValue4, maxF); + dstValue5 = 
_mm_min_ps(dstValue5, maxF); + dstValue6 = _mm_min_ps(dstValue6, maxF); + dstValue7 = _mm_min_ps(dstValue7, maxF); + + dstValue0 = _mm_max_ps(dstValue0, minF); + dstValue1 = _mm_max_ps(dstValue1, minF); + dstValue2 = _mm_max_ps(dstValue2, minF); + dstValue3 = _mm_max_ps(dstValue3, minF); + dstValue4 = _mm_max_ps(dstValue4, minF); + dstValue5 = _mm_max_ps(dstValue5, minF); + dstValue6 = _mm_max_ps(dstValue6, minF); + dstValue7 = _mm_max_ps(dstValue7, minF); + _mm_storeu_ps(dstY + 4 * 0, dstValue0); _mm_storeu_ps(dstY + 4 * 1, dstValue1); _mm_storeu_ps(dstY + 4 * 2, dstValue2); @@ -116,10 +137,10 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* srcY += unit * src_w_setup; } if (need4) { - auto dstValue0 = _mm_set1_ps(0.0f); - auto dstValue1 = _mm_set1_ps(0.0f); - auto dstValue2 = _mm_set1_ps(0.0f); - auto dstValue3 = _mm_set1_ps(0.0f); + auto dstValue0 = bv; + auto dstValue1 = bv; + auto dstValue2 = bv; + auto dstValue3 = bv; for (fy = 0; fy < fh; ++fy) { const float* src_y = srcY + fy * dilateY_step; const float* weight_y = weight_z + fy * fw * 4; @@ -133,6 +154,15 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* dstValue3 = _mm_add_ps(dstValue3, _mm_mul_ps(_mm_loadu_ps(src_x + 3 * src_w_setup), weightValue)); } } + dstValue0 = _mm_min_ps(dstValue0, maxF); + dstValue1 = _mm_min_ps(dstValue1, maxF); + dstValue2 = _mm_min_ps(dstValue2, maxF); + dstValue3 = _mm_min_ps(dstValue3, maxF); + + dstValue0 = _mm_max_ps(dstValue0, minF); + dstValue1 = _mm_max_ps(dstValue1, minF); + dstValue2 = _mm_max_ps(dstValue2, minF); + dstValue3 = _mm_max_ps(dstValue3, minF); _mm_storeu_ps(dstY + 4 * 0, dstValue0); _mm_storeu_ps(dstY + 4 * 1, dstValue1); _mm_storeu_ps(dstY + 4 * 2, dstValue2); @@ -142,7 +172,7 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* } for (dx = 0; dx < widthRemain; ++dx) { float* dst_x = dstY + dx * 4; - auto dstValue = _mm_set1_ps(0.0f); + auto dstValue = bv; const float* src_z = srcY + src_w_setup * dx; const float* weight_z = weight; for (fy = 0; fy < fh; ++fy) { @@ -154,6 +184,8 @@ void _SSE_MNNConvRunForLineDepthwise(float* dst, const float* src, const float* dstValue = _mm_add_ps(dstValue, _mm_mul_ps(_mm_loadu_ps(src_x), _mm_loadu_ps(weight_x))); } } + dstValue = _mm_min_ps(dstValue, maxF); + dstValue = _mm_max_ps(dstValue, minF); _mm_storeu_ps(dst_x, dstValue); } } diff --git a/source/backend/metal/AllShader.cpp b/source/backend/metal/AllShader.cpp index 3a695797e..16c50153f 100644 --- a/source/backend/metal/AllShader.cpp +++ b/source/backend/metal/AllShader.cpp @@ -792,6 +792,44 @@ const char* shader_MetalLayerNorm_metal = " out_data[gid.x]=(M4)(norm);\n" " }\n" "}\n" +"kernel void layernorm_m1x4_rms(const device M4 *in [[buffer(0)]],\n" +" device M4 *out [[buffer(1)]],\n" +" constant layernorm_constants& cst [[buffer(2)]],\n" +" const device float4 *gamma [[buffer(3)]],\n" +" const device float4 *beta [[buffer(4)]],\n" +" uint gid [[threadgroup_position_in_grid]],\n" +" uint tiisg[[thread_index_in_simdgroup]],\n" +" uint sgitg[[simdgroup_index_in_threadgroup]]) {\n" +" int total_idx=(gid*4+sgitg);\n" +" int in_idx=total_idx % (cst.inside/4);\n" +" int out_idx=total_idx/(cst.inside/4);\n" +" auto in_data=in+out_idx*cst.inside/4;\n" +" auto out_data=out+out_idx*cst.inside/4;\n" +" float square_sum=0.0f;\n" +" for(int i=tiisg; i3) {xy_out[3]=activate(M4(result3),cst.activation); }\n" "}\n" +"kernel void conv1x1_g1z4_m1w4(const device M4 *in [[buffer(0)]],\n" +" device M4 *out 
[[buffer(1)]],\n" +" constant conv1x1_constants& cst [[buffer(2)]],\n" +" const device MNN::uchar4x2 *wt [[buffer(3)]],\n" +" const device M4 *biasTerms [[buffer(4)]],\n" +" const device float4 *dequantScale [[buffer(5)]],\n" +" uint3 gid[[threadgroup_position_in_grid]],\n" +" uint tiisg[[thread_index_in_simdgroup]],\n" +" uint sgitg[[simdgroup_index_in_threadgroup]]) {\n" +" int uz=gid.x*2+sgitg;\n" +" int rx=gid.y;\n" +" auto xy_wt=wt+uz*cst.input_slice;\n" +" auto xy_in0=in+(int)gid.z*cst.input_size+rx+0;\n" +" auto xy_out=out+(int)gid.z*cst.output_size+uz*cst.output_size*cst.batch+rx;\n" +" auto biasValue=FLOAT4(biasTerms[uz]);\n" +" FLOAT4 result0=FLOAT4(0);\n" +" int block=(cst.input_slice+cst.block_size-1)/cst.block_size;\n" +" for (int bi=0; bi> 4)-8,(float)(w_int4[i][0] & 15)-8,(float)(w_int4[i][1] >> 4)-8,(float)(w_int4[i][1] & 15)-8);\n" +" FLOAT4 res=w4*scale[i]+dequant_bias[i];\n" +" w_dequant[i]=res;\n" +" }\n" +" result0 += FLOAT4(in40*w_dequant);\n" +" \n" +"// FLOAT4x4 w_dequant;\n" +"// for (int i=0; i<4; ++i) {\n" +"// FLOAT4 w4=FLOAT4((float)(w_int4[i][0] >> 4)-8,(float)(w_int4[i][0] & 15)-8,(float)(w_int4[i][1] >> 4)-8,(float)(w_int4[i][1] & 15)-8);\n" +"// FLOAT4 res=w4*scale[i]+dequant_bias[i];\n" +"// w_dequant[i]=w4;\n" +"// }\n" +"//\n" +"// FLOAT4 temp=FLOAT4(in40*w_dequant);\n" +"// result0 += temp*scale+(in40.x+in40.y+in40.z+in40.w)*dequant_bias;\n" +" }\n" +" }\n" +" FLOAT4 res;\n" +" res.x=simd_sum(result0.x);\n" +" res.y=simd_sum(result0.y);\n" +" res.z=simd_sum(result0.z);\n" +" res.w=simd_sum(result0.w);\n" +" /* true */\n" +" if (tiisg == 0) {\n" +" xy_out[0]=activate(M4(res+biasValue),cst.activation);\n" +" }\n" +"}\n" "kernel void conv1x1_g1z8(const device M4 *in [[buffer(0)]],\n" " device M4 *out [[buffer(1)]],\n" " constant conv1x1_constants& cst [[buffer(2)]],\n" @@ -1960,6 +2052,7 @@ const char* shader_MetalDefine_metal = "// –––––––––––––––––––––––––––––––––––––––––––––––––––\n" "// Macro\n" "// –––––––––––––––––––––––––––––––––––––––––––––––––––\n" +"#define SIMD_GROUP_WIDTH 32 // setting SIMD group size is 32\n" "#define UP_DIV(x,y) ( ((x)+(y)-1)/(y) )\n" "#define ROUND_UP(x,y) ( ((x)+(y)-1)/(y)*(y) )\n" "// whether computer with float32 when store with float16\n" diff --git a/source/backend/metal/MNNMetalContext.h b/source/backend/metal/MNNMetalContext.h index 2159ccf1f..ca5a589d1 100644 --- a/source/backend/metal/MNNMetalContext.h +++ b/source/backend/metal/MNNMetalContext.h @@ -33,8 +33,8 @@ typedef enum { /** metal device */ @property (strong, nonatomic, readonly) id device; /** max memory length cound be used in threadgroup */ -@property (assign, nonatomic, readonly) BOOL isCommitEachShader; @property (assign, nonatomic, readonly) BOOL isIphone; +@property (assign, nonatomic, readonly) BOOL isSimdGroupAvailable; /** * @brief alloc temp buffer on device diff --git a/source/backend/metal/MNNMetalContext.mm b/source/backend/metal/MNNMetalContext.mm index e23fda331..b271c1243 100644 --- a/source/backend/metal/MNNMetalContext.mm +++ b/source/backend/metal/MNNMetalContext.mm @@ -79,30 +79,17 @@ static void createLibrary(id device, NSMutableDictionarydevice; _cachesFp16 = [NSMutableDictionary dictionary]; _cachesFp32 = [NSMutableDictionary dictionary]; - _isCommitEachShader = self.class.commit_frequent; _isIphone = self.class.isIphone; + _isSimdGroupAvailable = self.class.isSimdGroupAvailable; createLibrary(_device, _cachesFp16, true); createLibrary(_device, _cachesFp32, false); return nil != _device; diff --git 
a/source/backend/metal/MetalAttention.mm b/source/backend/metal/MetalAttention.mm index e1d1ef28f..9679fe1ab 100644 --- a/source/backend/metal/MetalAttention.mm +++ b/source/backend/metal/MetalAttention.mm @@ -39,7 +39,9 @@ kernel void main0(const device T* input0 [[buffer(0)]], const device int* mask [[buffer(4)]], #endif constant Param& param [[buffer(5)]], - uint3 gid[[thread_position_in_grid]]) { + uint3 gid[[thread_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { const int x = gid.x; // query_seq_len const int y = gid.y; // head_num const int z = gid.z; // key_seq_len @@ -102,7 +104,7 @@ kernel void main0(const device T* input0 [[buffer(0)]], } } out *= Vscale; - output[y + z * head_num] = (T)out; + output[y * key_seq_len + z] = (T)out; #endif } @@ -158,18 +160,18 @@ kernel void main0(const device T* input0 [[buffer(0)]], } output[ x * stride * group + (y * head_dim + z)] = out; #else - device const T *A_offset = input0 + y; + device const T *A_offset = input0 + y * value_seq_len; device const T *B_offset = input1 + offset_head; device T *Pastvalue_offset = past_value + offset_head; float out = 0; for(int i = 0; i < value_seq_len - 1; ++i){ - float A = (float)A_offset[i * head_num]; + float A = (float)A_offset[i]; float B = (float)Pastvalue_offset[i * stride]; out += A * B; } - out += (float)A_offset[(value_seq_len - 1)*head_num] * (float)B_offset[0]; + out += (float)A_offset[(value_seq_len - 1)] * (float)B_offset[0]; if (yr == 0) { Pastvalue_offset[(value_seq_len - 1)*stride] = B_offset[0]; } @@ -282,6 +284,7 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { void AttentionBufExecution::onEncode(const std::vector &inputs, const std::vector &outputs, id encoder) { + auto query = inputs[0]; auto key = inputs[1]; auto value = inputs[2]; @@ -407,8 +410,8 @@ virtual bool onClone(Backend* bn, const Op* op, Execution** dst) override { // For softmax parameter int inside, outside; if (mIsDecode) { - inside = mNumHead; - outside = 1; + inside = 1; + outside = mNumHead; } else { inside = 1; outside = mCache->mKv_seq_len * mNumHead; diff --git a/source/backend/metal/MetalBackend.hpp b/source/backend/metal/MetalBackend.hpp index 22eee335f..dfcc571dc 100644 --- a/source/backend/metal/MetalBackend.hpp +++ b/source/backend/metal/MetalBackend.hpp @@ -189,10 +189,7 @@ class MetalBackend : public Backend { id encoder, id shape) const; void flushEncoder() const; - id encoder_for_net() const; - void addOpEncoder(std::function opEncoder); - - bool isCommandEncoderSet(); + id encoder_for_net() const; BufferAllocator* getBufferPool() const; EagerBufferAllocator *getStaticBufferPool() const { @@ -233,11 +230,8 @@ class MetalBackend : public Backend { const MetalRuntime* mRuntime; mutable NSUInteger mEncoderCount = 0; - mutable bool mOpEncoderSet = false;//whether has set encoder mutable bool mSupportDeferEncode = true; - mutable bool mFrameEncodeCache = false; - std::vector> mOpEncoders; mutable id mComputeEncoder = nil; std::shared_ptr mBufferPool; std::shared_ptr mBufferPoolShapeImmutable; diff --git a/source/backend/metal/MetalBackend.mm b/source/backend/metal/MetalBackend.mm index 268db6fde..3d680b65f 100644 --- a/source/backend/metal/MetalBackend.mm +++ b/source/backend/metal/MetalBackend.mm @@ -229,6 +229,7 @@ MemChunk chunk() override { } return NULL; } + //MNN_PRINT("support type [%s]\n", EnumNameOpType(op->type())); auto exe = iter->second->onCreate(inputs, op, this, outputs); if (NULL == exe) { @@ -258,15 
+259,8 @@ MemChunk chunk() override { void MetalBackend::onExecuteEnd() const { flushEncoder(); commit_net(); - - if(mFrameEncodeCache) { - // Prepare for next execute - for(auto opEncoder : mOpEncoders) { - opEncoder(); - } - mOpEncoderSet = true; - } } + BufferAllocator* MetalBackend::getBufferPool() const { return mCurrentAllocator; } @@ -302,18 +296,11 @@ MemChunk chunk() override { return true; } -bool MetalBackend::isCommandEncoderSet() { - return mOpEncoderSet;// !isCommitEachShader & mOpFullSupport -} - bool MetalBackend::isCmdBufferCommit() { auto ctx = (__bridge MNNMetalContext *)context(); - if(!ctx.isCommitEachShader) { - return false; - } //TODO: set magic number - const int magicNum = 2; + const int magicNum = mRuntime->hint().encorderNumForCommit; mEncoderCount++; if(mEncoderCount != 0 && mEncoderCount % magicNum == 0) { return true; @@ -321,12 +308,6 @@ MemChunk chunk() override { return false; } -void MetalBackend::addOpEncoder(std::function opEncoder) { - if(mFrameEncodeCache) { - mOpEncoders.push_back(opEncoder); - } -} - id MetalBackend::getHostBuffer(size_t size) const { size = UP_DIV(size, METAL_CONST_BUFFER_LIMIT) * METAL_CONST_BUFFER_LIMIT; // reuse @@ -534,11 +515,7 @@ kernel void main0(const device IType *in [[buffer(0)]], device OType *out [[buff } })metal"; -void MetalBackend::onResizeBegin() { - mFrameEncodeCache = false; - mOpEncoderSet = false; - mOpEncoders.clear(); - +void MetalBackend::onResizeBegin() { // Abort last inference task if needed flushEncoder(); _commandBuffer_net = nil; @@ -549,7 +526,6 @@ kernel void main0(const device IType *in [[buffer(0)]], device OType *out [[buff ErrorCode MetalBackend::onResizeEnd() { auto ctx = (__bridge MNNMetalContext *)context(); - mFrameEncodeCache = (!ctx.isCommitEachShader && mSupportDeferEncode); return mCurrentAllocator->compute(); } @@ -711,9 +687,8 @@ static void _execute(id encoder, const MetalBackend::C void MetalBackend::onCopyBuffer(const Tensor *src, const Tensor *dst) const { flushEncoder(); auto ctx = (__bridge MNNMetalContext *)context(); - if(!mFrameEncodeCache) { - commit_net(); - } + commit_net(); + _resetDynamicMemory(); onCopyBuffer(src, dst, nil, nil); } @@ -789,9 +764,8 @@ static void _execute(id encoder, const MetalBackend::C int MetalBackend::onSync(Tensor::MapType mtype, bool toCpu, const Tensor* dstTensor) { flushEncoder(); auto ctx = (__bridge MNNMetalContext *)context(); - if(!mOpEncoderSet) { - commit_net(); - } + commit_net(); + if (toCpu) { wait(); } diff --git a/source/backend/metal/MetalConvolution1x1.mm b/source/backend/metal/MetalConvolution1x1.mm index 33a3eb19d..35e65118d 100644 --- a/source/backend/metal/MetalConvolution1x1.mm +++ b/source/backend/metal/MetalConvolution1x1.mm @@ -87,8 +87,16 @@ std::string name = "conv1x1_g1z4_w8"; mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w8" fp16:backend->useFp16InsteadFp32()]; if (mDequantBits == 4) { - mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w4" fp16:backend->useFp16InsteadFp32()]; - name = "conv1x1_g1z4_w4"; + if(context.isSimdGroupAvailable && ob * ow * oh == 1) { + mPipeline = [context pipelineWithName:@"conv1x1_g1z4_m1w4" fp16:backend->useFp16InsteadFp32()]; + name = "conv1x1_g1z4_m1w4"; + mThreads = std::make_pair(MTLSizeMake(UP_DIV(oc, 8), 1, 1), MTLSizeMake(8, 8, 1)); + + return NO_ERROR; + } else { + mPipeline = [context pipelineWithName:@"conv1x1_g1z4_w4" fp16:backend->useFp16InsteadFp32()]; + name = "conv1x1_g1z4_w4"; + } } NSArray *arr = [NSArray 
arrayWithObjects:(id)((MetalRuntimeAllocator::MetalBufferAlloc *)input->deviceId())->getBuffer(), (id)(((MetalRuntimeAllocator::MetalBufferAlloc *)output->deviceId()))->getBuffer(), diff --git a/source/backend/metal/MetalExecution.mm b/source/backend/metal/MetalExecution.mm index 35de88d24..75a20cbfc 100644 --- a/source/backend/metal/MetalExecution.mm +++ b/source/backend/metal/MetalExecution.mm @@ -18,10 +18,6 @@ ErrorCode MetalExecution::onExecute(const std::vector &inputs, const std::vector &outputs) { auto backend = static_cast(this->backend()); - if(backend->isCommandEncoderSet()) { - return NO_ERROR; - } - auto func = [=](){ auto encoder = backend->encoder_for_net(); this->onEncode(inputs, outputs, encoder); @@ -31,7 +27,6 @@ } }; func(); - backend->addOpEncoder(func); return NO_ERROR; } diff --git a/source/backend/metal/MetalGridSample.mm b/source/backend/metal/MetalGridSample.mm index ed66b6748..22213c0a3 100644 --- a/source/backend/metal/MetalGridSample.mm +++ b/source/backend/metal/MetalGridSample.mm @@ -26,7 +26,7 @@ #endif struct grid_sample_params { - int batches; + int batch; int channels; int inH; int inW; @@ -179,7 +179,7 @@ kernel void main0(const device T *input [[buffer(0)]], device T *output [[buffer(2)]], constant grid_sample_params &p [[buffer(3)]], uint3 gid [[thread_position_in_grid]]) { - if ((int)gid.x >= p.outW || (int)gid.y >= p.outH * p.outD || (int)gid.z >= p.batches) + if ((int)gid.x >= p.outW || (int)gid.y >= p.outH * p.outD || (int)gid.z >= p.batch) return; int gridPos = gid.z*p.outH*p.outW*CON + gid.y*p.outW*CON + gid.x*CON; @@ -191,8 +191,8 @@ kernel void main0(const device T *input [[buffer(0)]], const int channelC4 = (p.channels + 3) / 4; for (int c = 0; c < channelC4; ++ c) { - auto outputPos = gid.z*channelC4*p.outH*p.outW + c*p.outH*p.outW + gid.y*p.outW + gid.x; - auto inputPtr = input + gid.z*channelC4*p.inH*p.inW + c*p.inH*p.inW; + auto outputPos = gid.z*p.outD*p.outH*p.outW + c*p.outD*p.outH*p.outW*p.batch + gid.y*p.outW + gid.x; + auto inputPtr = input + gid.z*p.inD*p.inH*p.inW + c*p.inH*p.inW*p.inD*p.batch; #if GRID3D output[outputPos] = interpolate(z, y, x, inputPtr, p.inD, p.inH, p.inW, p.mode, p.paddingMode); #else diff --git a/source/backend/metal/MetalLayerNorm.mm b/source/backend/metal/MetalLayerNorm.mm index 917d5fe6a..7eaf586f0 100755 --- a/source/backend/metal/MetalLayerNorm.mm +++ b/source/backend/metal/MetalLayerNorm.mm @@ -76,6 +76,7 @@ ((int *)mShapeBuffer.contents)[3] = (int)has_gamma_beta_; + bool parallel = (mInside > 32) && ((mInside & 3) == 0); if(RMSNorm){ mPipeline = [context pipelineWithName:parallel ? @"layernorm_x4_rms" : @"layernorm_x1_rms" fp16:backend->useFp16InsteadFp32()]; @@ -85,10 +86,17 @@ auto inside = parallel ? 
mInside/4 : mInside; mThreads = [context computeBestGroupAndLocal:mPipeline threads:MTLSizeMake((NSUInteger)inside, (NSUInteger)mOutside, 1)]; + if(context.isSimdGroupAvailable) { + if(mOutside == 1 && RMSNorm && parallel) { + mPipeline = [context pipelineWithName:@"layernorm_m1x4_rms" fp16:backend->useFp16InsteadFp32()]; + mThreads = std::make_pair(MTLSizeMake((NSUInteger)UP_DIV(inside, 4) * mOutside, 1, 1), MTLSizeMake(128, 1, 1)); + } + } return NO_ERROR; } void MetalLayerNorm::onEncode(const std::vector &inputs, const std::vector &outputs, id encoder) { + auto backend = static_cast(this->backend()); auto context = (__bridge MNNMetalContext *)backend->context(); auto input = inputs[0], output = outputs[0]; diff --git a/source/backend/metal/MetalLoop.mm b/source/backend/metal/MetalLoop.mm index 8f51e8622..85010045d 100644 --- a/source/backend/metal/MetalLoop.mm +++ b/source/backend/metal/MetalLoop.mm @@ -550,6 +550,7 @@ virtual ErrorCode onResize(const std::vector& inputs, const std::vecto } virtual void onEncode(const std::vector& inputs, const std::vector& outputs, id encoder) override { + auto cmd = mLoop->commands()->GetAs(0); auto dstTensor = mTensors[cmd->indexes()->data()[0]]; auto srcTensor = mTensors[cmd->indexes()->data()[1]]; diff --git a/source/backend/metal/MetalRaster.hpp b/source/backend/metal/MetalRaster.hpp index 0d64e0840..23e7e47f3 100644 --- a/source/backend/metal/MetalRaster.hpp +++ b/source/backend/metal/MetalRaster.hpp @@ -28,13 +28,10 @@ class MetalRaster : public MetalExecution { MTLSize global; }; private: - std::map> mTempInput; std::map mTempInputCopy; - std::shared_ptr mTempOutput; bool mNeedZero = false; Tensor* mOutputPtr = nullptr; - id mBlitPipeline; - std::vector> mShapeTemp; + std::vector> mBlitPipeline; id mZeroCopy = nil; id mZeroPipeline; }; diff --git a/source/backend/metal/MetalRaster.mm b/source/backend/metal/MetalRaster.mm index 8383b10e6..788f13087 100644 --- a/source/backend/metal/MetalRaster.mm +++ b/source/backend/metal/MetalRaster.mm @@ -34,6 +34,31 @@ static void writeSamplerInfo(SamplerInfo& info, const Tensor::InsideDescribe::Re info.stride[3] = sampler.src.offset; info.extent[3] = sampler.dst.offset; } + +static std::string getUnitName(int bytes) { + std::string unitName; + switch (bytes) { + case 1: + unitName = "uchar"; + break; + case 2: + unitName = "short"; + break; + case 4: + unitName = "int"; + break; + case 8: + unitName = "short4"; + break; + case 16: + unitName = "int4"; + break; + default: + FUNC_PRINT(bytes); + break; + } + return unitName; +} static const char* gMultiBlitMetal = R"metal( #include @@ -85,6 +110,125 @@ kernel void main0(const device T *in [[buffer(0)]], } )metal"; +static const char* gMultiRasterTemplate = R"metal( +#include +#include +using namespace metal; +struct SamplerInfo { + uint4 stride;//stride[3] + offset + uint4 size;//size[3] + totalSize + uint4 extent;//dstStride[3]+dstOffset +}; +kernel void main0(const device T *in [[buffer(0)]], + device T *out [[buffer(1)]], + const device uint4* buf [[buffer(2)]], + uint3 tgid [[thread_position_in_grid]]) { + + uint4 limit = buf[2]; + const device SamplerInfo* infoP = (const device SamplerInfo*)(buf + 3); + uint3 gid = tgid; + gid.x = tgid.x % limit.x; + uint n = tgid.x / limit.x; + if (n < limit.y) { + SamplerInfo info = infoP[n]; + + if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) { + uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w; + uint srcOffset = gid.x * info.stride.x + 
gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w; + #ifdef INPUT_FORMAT_NCHW + int srcOffsetReal = srcOffset; + #elif INPUT_FORMAT_NHWC + int srcOffsetReal = srcOffset; + #elif INPUT_FORMAT_C4NHW4 + uint4 src_shape = buf[0];//src nchw + int src_batch = src_shape.x; + int src_channel = src_shape.y; + int src_height = src_shape.z; + int src_width = src_shape.w; + int in_w = srcOffset % src_width; srcOffset /= src_width; + int in_h = srcOffset % src_height; srcOffset /= src_height; + int in_c = srcOffset % src_channel; + int in_b = srcOffset / src_channel; + int srcOffsetReal = (((in_b + (in_c / 4) * src_batch) * src_height + in_h) * src_width + in_w) * 4 + (in_c % 4); + #endif + + #ifdef OUTPUT_FORMAT_NCHW + int dstOffsetReal = dstOffset; + #elif OUTPUT_FORMAT_NHWC + int dstOffsetReal = dstOffset; + #elif OUTPUT_FORMAT_C4NHW4 + uint4 dst_shape = buf[1];//dst nchw + int dst_batch = dst_shape.x; + int dst_channel = dst_shape.y; + int dst_height = dst_shape.z; + int dst_width = dst_shape.w; + int out_w = dstOffset % dst_width; dstOffset /= dst_width; + int out_h = dstOffset % dst_height; dstOffset /= dst_height; + int out_c = dstOffset % dst_channel; + int out_b = dstOffset / dst_channel; + int dstOffsetReal = (((out_b + (out_c / 4) * dst_batch) * dst_height + out_h) * dst_width + out_w) * 4 + (out_c % 4); + #endif + out[dstOffsetReal] = in[srcOffsetReal]; + } + } +} +)metal"; + +static const char* gSingleRasterTemplate = R"metal( +#include +#include +using namespace metal; +struct SamplerInfo { + uint4 stride;//stride[3] + offset + uint4 size;//size[3] + totalSize + uint4 extent;//dstStride[3]+dstOffset +}; +kernel void main0(const device T *in [[buffer(0)]], + device T *out [[buffer(1)]], + const device uint4* buf [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + SamplerInfo info = *((const device SamplerInfo*)(buf + 3)); + if (gid.x < info.size.x && gid.y < info.size.y && gid.z < info.size.z) { + uint dstOffset = gid.x * info.extent.x + gid.y * info.extent.y + gid.z * info.extent.z + info.extent.w; + uint srcOffset = gid.x * info.stride.x + gid.y * info.stride.y + gid.z * info.stride.z + info.stride.w; + #ifdef INPUT_FORMAT_NCHW + int srcOffsetReal = srcOffset; + #elif INPUT_FORMAT_NHWC + int srcOffsetReal = srcOffset; + #elif INPUT_FORMAT_C4NHW4 + uint4 src_shape = buf[0];//src nchw + int src_batch = src_shape.x; + int src_channel = src_shape.y; + int src_height = src_shape.z; + int src_width = src_shape.w; + int in_w = srcOffset % src_width; srcOffset /= src_width; + int in_h = srcOffset % src_height; srcOffset /= src_height; + int in_c = srcOffset % src_channel; + int in_b = srcOffset / src_channel; + int srcOffsetReal = (((in_b + (in_c / 4) * src_batch) * src_height + in_h) * src_width + in_w) * 4 + (in_c % 4); + #endif + + #ifdef OUTPUT_FORMAT_NCHW + int dstOffsetReal = dstOffset; + #elif OUTPUT_FORMAT_NHWC + int dstOffsetReal = dstOffset; + #elif OUTPUT_FORMAT_C4NHW4 + uint4 dst_shape = buf[1];//dst nchw + int dst_batch = dst_shape.x; + int dst_channel = dst_shape.y; + int dst_height = dst_shape.z; + int dst_width = dst_shape.w; + int out_w = dstOffset % dst_width; dstOffset /= dst_width; + int out_h = dstOffset % dst_height; dstOffset /= dst_height; + int out_c = dstOffset % dst_channel; + int out_b = dstOffset / dst_channel; + int dstOffsetReal = (((out_b + (out_c / 4) * dst_batch) * dst_height + out_h) * dst_width + out_w) * 4 + (out_c % 4); + #endif + out[dstOffsetReal] = in[srcOffsetReal]; + } +} +)metal"; + static const char* gFillInt4 = R"metal( #include 
#include @@ -105,32 +249,13 @@ kernel void main0(device int4 *out [[buffer(0)]], id MetalRaster::getBlitPipeline(int bytes, Backend* backend, bool multiRegion) { auto mtbn = static_cast(backend); std::string pipelineName; - std::string unitName; + std::string unitName = getUnitName(bytes); if (multiRegion) { pipelineName = "blit_multi"; } else { pipelineName = "blit"; } - switch (bytes) { - case 1: - unitName = "uchar"; - break; - case 2: - unitName = "short"; - break; - case 4: - unitName = "int"; - break; - case 8: - unitName = "short4"; - break; - case 16: - unitName = "int4"; - break; - default: - FUNC_PRINT(bytes); - break; - } + std::vector keys = { unitName, pipelineName @@ -159,9 +284,6 @@ kernel void main0(device int4 *out [[buffer(0)]], if (nil != mZeroCopy) { mtbn->returnConstBuffer(mZeroCopy); } - for (auto b : mShapeTemp) { - mtbn->returnConstBuffer(b); - } } struct MemsetInfo { int value[4]; @@ -197,9 +319,8 @@ kernel void main0(device int4 *out [[buffer(0)]], mZeroCopy = mtbn->getConstBuffer(sizeof(MemsetInfo)); } } - mTempInput.clear(); + mTempInputCopy.clear(); - mTempOutput = nullptr; mOutputPtr = output; #ifndef MNN_METAL_FORBID_RASTER_C4 if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) { @@ -216,7 +337,8 @@ kernel void main0(device int4 *out [[buffer(0)]], } } if (fast) { - mBlitPipeline = getBlitPipeline(bytes * 4, backend(), true); + mBlitPipeline.resize(1); + mBlitPipeline[0] = getBlitPipeline(bytes * 4, backend(), true); std::map> collectForTensor; for (int i=0; i< des->regions.size(); ++i) { auto& slice = des->regions[i]; @@ -249,7 +371,7 @@ kernel void main0(device int4 *out [[buffer(0)]], } ((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[0] = maxSize[0]; ((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[1] = iter.second.size(); - auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])]; + auto local = [context computeBestGroupAndLocal:mBlitPipeline[0] threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])]; blit.global = local.first; blit.local = local.second; mTempInputCopy.insert(std::make_pair(iter.first, blit)); @@ -258,57 +380,14 @@ kernel void main0(device int4 *out [[buffer(0)]], } } #endif - for (int i=0; i< des->regions.size(); ++i) { - auto& slice = des->regions[i]; - auto origin = slice.origin; - if (TensorUtils::getDescribe(origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) { - continue; - } - if (mTempInput.find(origin)!=mTempInput.end()) { - continue; - } - std::shared_ptr newTensor(new Tensor); - TensorUtils::copyShape(origin, newTensor.get()); - TensorUtils::getDescribe(newTensor.get())->dimensionFormat = MNN_DATA_FORMAT_NCHW; - newTensor->buffer().type = origin->getType(); - TensorUtils::setLinearLayout(newTensor.get()); - mTempInput.insert(std::make_pair(origin, newTensor)); - } - if (MNN_DATA_FORMAT_NC4HW4 == outputDes->dimensionFormat) { - mTempOutput.reset(new Tensor); - TensorUtils::setupTensorInfo(output, mTempOutput.get(), MNN_DATA_FORMAT_NCHW); - } - if (nullptr != mTempOutput) { - auto res = backend()->onAcquireBuffer(mTempOutput.get(), Backend::DYNAMIC); - if (!res) { - return OUT_OF_MEMORY; - } - mOutputPtr = mTempOutput.get(); - } - for (auto& iter : mTempInput) { - auto res = backend()->onAcquireBuffer(iter.second.get(), Backend::DYNAMIC); - if (!res) { - return OUT_OF_MEMORY; - } - } - for (auto& iter : mTempInput) { - backend()->onReleaseBuffer(iter.second.get(), Backend::DYNAMIC); - } - if (nullptr 
!= mTempOutput) { - backend()->onReleaseBuffer(mTempOutput.get(), Backend::DYNAMIC); - } - mBlitPipeline = getBlitPipeline(bytes, backend(), true); + std::map> collectForTensor; for (int i=0; i< des->regions.size(); ++i) { auto& slice = des->regions[i]; if (nullptr == slice.origin) { continue; } - auto iter = mTempInput.find(slice.origin); Tensor* t = slice.origin; - if (iter != mTempInput.end()) { - t = iter->second.get(); - } auto coliter = collectForTensor.find(t); if (coliter == collectForTensor.end()) { collectForTensor.insert(std::make_pair(t, std::vector{i})); @@ -316,15 +395,64 @@ kernel void main0(device int4 *out [[buffer(0)]], coliter->second.emplace_back(i); } } + + NSString* input_format; + NSString* output_format; + if(outputDes->dimensionFormat == MNN_DATA_FORMAT_NCHW) { + output_format = @"OUTPUT_FORMAT_NCHW"; + } else if(outputDes->dimensionFormat == MNN_DATA_FORMAT_NHWC) { + output_format = @"OUTPUT_FORMAT_NHWC"; + } else { + output_format = @"OUTPUT_FORMAT_C4NHW4"; + } + std::string unitName = getUnitName(bytes); + mBlitPipeline.resize(collectForTensor.size()); + int index = 0; for (auto& iter : collectForTensor) { + auto origin = iter.first; + + if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NCHW) { + input_format = @"INPUT_FORMAT_NCHW"; + } else if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NHWC) { + input_format = @"INPUT_FORMAT_NHWC"; + } else { + input_format = @"INPUT_FORMAT_C4NHW4"; + } + std::vector keys = { + std::string([input_format UTF8String]), + std::string([output_format UTF8String]), + unitName, + }; + if(iter.second.size() == 1) { + keys.emplace_back("direct_raster_single"); + } else { + keys.emplace_back("direct_raster_multi"); + } + auto pipeline = mtbn->runtime()->findPipeline(keys); + + if(nullptr == pipeline) { + MTLCompileOptions *options = [[MTLCompileOptions alloc] init]; + options.preprocessorMacros = @{ + input_format : @"1", + output_format : @"1", + @"T" : @(unitName.c_str()), + }; + if(iter.second.size() == 1) { + pipeline = mtbn->makeComputePipelineWithSourceOption(gSingleRasterTemplate, "main0", options); + } else { + pipeline = mtbn->makeComputePipelineWithSourceOption(gMultiRasterTemplate, "main0", options); + } + mtbn->runtime()->insertPipeline(keys, pipeline); + } + mBlitPipeline[index] = pipeline; + BlitInfo blit; - auto memory = bufferAlloc->alloc(sizeof(SamplerInfo) * iter.second.size() + 4 * sizeof(uint32_t)); + auto memory = bufferAlloc->alloc(sizeof(SamplerInfo) * iter.second.size() + 12 * sizeof(uint32_t)); blit.blit = std::make_pair(memory.first, memory.second); auto buffer = ((MetalRuntimeAllocator::MetalBufferAlloc*)memory.first)->getBuffer(); - auto infoP = (SamplerInfo*)((uint8_t*)[buffer contents] + 4 * sizeof(uint32_t) + memory.second); + auto infoP = (SamplerInfo*)((uint8_t*)[buffer contents] + 12 * sizeof(uint32_t) + memory.second); - blit.blit = std::make_pair(memory.first, memory.second); uint32_t maxSize[3] = {1, 1, 1}; for (int v=0; vregions[iter.second[v]]; @@ -333,41 +461,42 @@ kernel void main0(device int4 *out [[buffer(0)]], maxSize[1] = ALIMAX(maxSize[1], slice.size[1]); maxSize[2] = ALIMAX(maxSize[2], slice.size[2]); } - ((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[0] = maxSize[0]; - ((uint32_t*)((uint8_t*)[buffer contents] + memory.second))[1] = iter.second.size(); - auto local = [context computeBestGroupAndLocal:mBlitPipeline threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])]; + + uint32_t* shape = 
(uint32_t*)((uint8_t*)[buffer contents] + memory.second); + int origin_area = 1; + for(int i = 2; i < origin->shape().size(); i++) { + origin_area *= origin->shape()[i]; + } + int output_area = 1; + for(int i = 2; i < output->shape().size(); i++) { + output_area *= output->shape()[i]; + } + shape[0] = ALIMAX(1, origin->shape()[0]); + shape[1] = ALIMAX(1, origin->shape()[1]); + shape[2] = ALIMAX(1, origin_area); + shape[3] = 1; + shape[4] = ALIMAX(1, output->shape()[0]); + shape[5] = ALIMAX(1, output->shape()[1]); + shape[6] = ALIMAX(1, output_area); + shape[7] = 1; + shape[8] = maxSize[0]; + shape[9] = iter.second.size(); + + auto local = [context computeBestGroupAndLocal:mBlitPipeline[index++] threads:MTLSizeMake(maxSize[0] * iter.second.size(), maxSize[1], maxSize[2])]; blit.global = local.first; blit.local = local.second; mTempInputCopy.insert(std::make_pair(iter.first, blit)); } - for (auto b : mShapeTemp) { - mtbn->returnConstBuffer(b); - } - mShapeTemp.clear(); - for (int i = 0; i < mTempInput.size(); ++i) { - id shape = mtbn->getConstBuffer(0); - mShapeTemp.emplace_back(std::move(shape)); - } - if (nullptr != mTempOutput) { - mShapeTemp.emplace_back(mtbn->getConstBuffer(0)); - } return NO_ERROR; } void MetalRaster::onEncode(const std::vector &inputs, const std::vector &outputs, id encoder) { + auto backend = static_cast(this->backend()); auto context = (__bridge MNNMetalContext *)backend->context(); - int out_offset = TensorUtils::getDescribe(outputs[0])->extra.offset; - if (nullptr != mTempOutput) { - out_offset = TensorUtils::getDescribe(mTempOutput.get())->extra.offset; - } + if (mNeedZero) { - size_t sizeInBytes; - if (mTempOutput != nullptr) { - sizeInBytes = backend->getTensorSizeInBytes(mTempOutput.get()); - } else { - sizeInBytes = backend->getTensorSizeInBytes(outputs[0]); - } + size_t sizeInBytes = backend->getTensorSizeInBytes(outputs[0]); size_t size = sizeInBytes / (4 * sizeof(int32_t)); auto ptr = (MemsetInfo*)[mZeroCopy contents]; ptr->size[0] = (uint32_t)size; @@ -376,28 +505,33 @@ kernel void main0(device int4 *out [[buffer(0)]], [encoder setBuffer: mZeroCopy offset:0 atIndex: 1]; [encoder dispatchThreadgroups:MTLSizeMake(UP_DIV(size, 256), 1, 1) threadsPerThreadgroup:MTLSizeMake(256, 1, 1)]; } + + bool singlePipeline = false; int index = 0; - for (auto& iter : mTempInput) { - backend->onCopyBuffer(iter.first, iter.second.get(), encoder, mShapeTemp[index++]); + if(mBlitPipeline.size() == 1) { + singlePipeline = true; + [encoder setComputePipelineState:mBlitPipeline[0]]; + } else { + MNN_ASSERT(mTempInputCopy.size() == mBlitPipeline.size()); } - - [encoder setComputePipelineState:mBlitPipeline]; for (auto& iter : mTempInputCopy) { + if(!singlePipeline) { + [encoder setComputePipelineState:mBlitPipeline[index++]]; + } MetalBackend::setTensor(iter.first, encoder, 0); MetalBackend::setTensor(mOutputPtr, encoder, 1); auto& blit = iter.second; auto buffer = ((MetalRuntimeAllocator::MetalBufferAlloc*)blit.blit.first)->getBuffer(); [encoder setBuffer: buffer offset:blit.blit.second atIndex: 2]; + [encoder dispatchThreadgroups:blit.global threadsPerThreadgroup:blit.local]; } - if (nullptr != mTempOutput) { - backend->onCopyBuffer(mTempOutput.get(), outputs[0], encoder, mShapeTemp[index]); - } } class MetalRasterCreator : public MetalBackend::Creator { public: virtual Execution *onCreate(const std::vector &inputs, const MNN::Op *op, Backend *backend, const std::vector& outputs) const { + return new MetalRaster(backend); } }; diff --git 
a/source/backend/metal/shader/MetalConvolution1x1.metal b/source/backend/metal/shader/MetalConvolution1x1.metal index 21bd0d8d0..80e4d7fb6 100644 --- a/source/backend/metal/shader/MetalConvolution1x1.metal +++ b/source/backend/metal/shader/MetalConvolution1x1.metal @@ -167,6 +167,65 @@ kernel void conv1x1_g1z4_w4(const device ftype4 *in [[buffer(0)]], //if (computeSize > 3) {xy_out[3] = activate(ftype4(result3), cst.activation); } } +kernel void conv1x1_g1z4_m1w4(const device ftype4 *in [[buffer(0)]], + device ftype4 *out [[buffer(1)]], + constant conv1x1_constants& cst [[buffer(2)]], + const device MNN::uchar4x2 *wt [[buffer(3)]], + const device ftype4 *biasTerms [[buffer(4)]], + const device float4 *dequantScale [[buffer(5)]], + uint3 gid[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + int uz = gid.x * 2 + sgitg; + + int rx = gid.y; + auto xy_wt = wt + uz * cst.input_slice; + auto xy_in0 = in + (int)gid.z * cst.input_size + rx + 0; + auto xy_out = out + (int)gid.z * cst.output_size + uz * cst.output_size * cst.batch + rx; + auto biasValue = FLOAT4(biasTerms[uz]); + FLOAT4 result0 = FLOAT4(0); + + int block = (cst.input_slice + cst.block_size - 1) / cst.block_size; + for (int bi=0; bi> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8); + FLOAT4 res = w4 * scale[i] + dequant_bias[i]; + w_dequant[i] = res; + } + + result0 += FLOAT4(in40 * w_dequant); + +// FLOAT4x4 w_dequant; +// for (int i = 0; i < 4; ++i) { +// FLOAT4 w4 = FLOAT4((float)(w_int4[i][0] >> 4) - 8, (float)(w_int4[i][0] & 15) - 8, (float)(w_int4[i][1] >> 4) - 8, (float)(w_int4[i][1] & 15) - 8); +// FLOAT4 res = w4 * scale[i] + dequant_bias[i]; +// w_dequant[i] = w4; +// } +// +// FLOAT4 temp = FLOAT4(in40 * w_dequant); +// result0 += temp * scale + (in40.x + in40.y + in40.z + in40.w) * dequant_bias; + } + } + FLOAT4 res; + res.x = simd_sum(result0.x); + res.y = simd_sum(result0.y); + res.z = simd_sum(result0.z); + res.w = simd_sum(result0.w); + /* true */ + if (tiisg == 0) { + xy_out[0] = activate(ftype4(res + biasValue), cst.activation); + } +} + kernel void conv1x1_g1z8(const device ftype4 *in [[buffer(0)]], device ftype4 *out [[buffer(1)]], constant conv1x1_constants& cst [[buffer(2)]], diff --git a/source/backend/metal/shader/MetalDefine.metal b/source/backend/metal/shader/MetalDefine.metal index bcf7aa462..bf3f85daf 100644 --- a/source/backend/metal/shader/MetalDefine.metal +++ b/source/backend/metal/shader/MetalDefine.metal @@ -5,6 +5,7 @@ using namespace metal; // Macro // ––––––––––––––––––––––––––––––––––––––––––––––––––– +#define SIMD_GROUP_WIDTH 32 // setting SIMD group size is 32 #define UP_DIV(x, y) ( ((x) + (y) - 1) / (y) ) #define ROUND_UP(x, y) ( ((x) + (y) - 1) / (y) * (y) ) diff --git a/source/backend/metal/shader/MetalLayerNorm.metal b/source/backend/metal/shader/MetalLayerNorm.metal index 626fd9d06..bad927112 100644 --- a/source/backend/metal/shader/MetalLayerNorm.metal +++ b/source/backend/metal/shader/MetalLayerNorm.metal @@ -147,3 +147,46 @@ kernel void layernorm_x4_rms(const device ftype4 *in [[buffer(0)]], out_data[gid.x] = (ftype4)(norm); } } + +kernel void layernorm_m1x4_rms(const device ftype4 *in [[buffer(0)]], + device ftype4 *out [[buffer(1)]], + constant layernorm_constants& cst [[buffer(2)]], + const device float4 *gamma [[buffer(3)]], + const device float4 *beta [[buffer(4)]], + uint gid [[threadgroup_position_in_grid]], + uint 
tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + int total_idx = (gid * 4 + sgitg); + int in_idx = total_idx % (cst.inside/4); + int out_idx = total_idx / (cst.inside/4); + + auto in_data = in + out_idx * cst.inside/4; + auto out_data = out + out_idx * cst.inside/4; + + float square_sum = 0.0f; + + for(int i = tiisg; i < cst.inside/4; i+=SIMD_GROUP_WIDTH) { + ftype4 data = in_data[i]; + float dis = data.x; + square_sum += dis * dis; + dis = data.y; + square_sum += dis * dis; + dis = data.z; + square_sum += dis * dis; + dis = data.w; + square_sum += dis * dis; + } + square_sum = simd_sum(square_sum); + + if(tiisg == 0) { + float var = 1.0 / sqrt(square_sum / cst.inside + cst.eps); + + float4 norm = var * ((float4)in_data[in_idx]); + if(cst.has_gamma_beta) { + out_data[in_idx] = (ftype4)(norm * gamma[in_idx] + beta[in_idx]); + } else { + out_data[in_idx] = (ftype4)(norm); + } + } +} diff --git a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp index d42961c1e..2b45559c4 100644 --- a/source/backend/opencl/core/runtime/OpenCLRuntime.cpp +++ b/source/backend/opencl/core/runtime/OpenCLRuntime.cpp @@ -111,7 +111,7 @@ OpenCLRuntime::OpenCLRuntime(const BackendConfig::PrecisionMode precision, const } #endif - if (deviceName.find("QUALCOMM Adreno") != std::string::npos) { + if (deviceName.find("QUALCOMM Adreno") != std::string::npos || deviceName.find("Qualcomm") != std::string::npos) { mGpuType = ADRENO; // if device is QUALCOMM's and version is 2.0 , set spacial optimized param diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp index 8dc9957cf..3e288c1f5 100644 --- a/source/backend/opencl/core/runtime/OpenCLWrapper.cpp +++ b/source/backend/opencl/core/runtime/OpenCLWrapper.cpp @@ -7,7 +7,8 @@ // #include "backend/opencl/core/runtime/OpenCLWrapper.hpp" -#ifdef WIN32 +#ifdef _WIN32 +#include #include #else #include @@ -94,7 +95,7 @@ bool OpenCLSymbols::LoadOpenCLLibrary() { bool OpenCLSymbols::UnLoadOpenCLLibrary() { if (handle_ != nullptr) { -#if defined(WIN32) +#if defined(_WIN32) if (FreeLibrary(handle_) == 0) { #else if (dlclose(handle_) != 0) { @@ -129,7 +130,7 @@ bool OpenCLSymbols::isGlError() { bool OpenCLSymbols::LoadLibraryFromPath(const std::string &library_path) { -#if defined(WIN32) +#if defined(_WIN32) handle_ = LoadLibraryA(library_path.c_str()); if (handle_ == nullptr) { return false; diff --git a/source/backend/opencl/core/runtime/OpenCLWrapper.hpp b/source/backend/opencl/core/runtime/OpenCLWrapper.hpp index 561ccde8c..ba39a8c30 100644 --- a/source/backend/opencl/core/runtime/OpenCLWrapper.hpp +++ b/source/backend/opencl/core/runtime/OpenCLWrapper.hpp @@ -9,7 +9,7 @@ #ifndef OpenCLWrapper_hpp #define OpenCLWrapper_hpp -#if defined(WIN32) +#if defined(_WIN32) #include #undef min #undef max @@ -248,7 +248,7 @@ class OpenCLSymbols { private: bool LoadLibraryFromPath(const std::string &path); -#if defined(WIN32) +#if defined(_WIN32) HMODULE handle_ = nullptr; #else void *handle_ = nullptr; diff --git a/source/backend/opencl/execution/buffer/ConvBufExecution.cpp b/source/backend/opencl/execution/buffer/ConvBufExecution.cpp index 8ba800b26..185a25294 100644 --- a/source/backend/opencl/execution/buffer/ConvBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/ConvBufExecution.cpp @@ -324,8 +324,6 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const 
mOpenCLBackend->onAcquireBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); mConvGemmOutTensor.reset(Tensor::createDevice({alignN * alignM})); mOpenCLBackend->onAcquireBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); { std::set buildOptions; @@ -399,6 +397,8 @@ ErrorCode ConvBufExecution::onResize(const std::vector &inputs, const mOpenCLBackend->endRecord(mRecording); } + mOpenCLBackend->onReleaseBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); return NO_ERROR; } else if (mResource->mConv1x1Opt) { diff --git a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp index d31462301..3c2a02b9d 100644 --- a/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp +++ b/source/backend/opencl/execution/buffer/ConvBufLowMemoryExecution.cpp @@ -398,9 +398,6 @@ void ConvBufLowMemoryExecution::useFPWeightGemmLowMemory(Tensor * input, Tensor mOpenCLBackend->onAcquireBuffer(mConvGemmWeightTensor.get(), Backend::DYNAMIC); mOpenCLBackend->onAcquireBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); mOpenCLBackend->onAcquireBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mConvGemmWeightTensor.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); - mOpenCLBackend->onReleaseBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); //weight inverse quantization and rearrange { @@ -508,6 +505,9 @@ void ConvBufLowMemoryExecution::useFPWeightGemmLowMemory(Tensor * input, Tensor unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1]}; unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1]}; } + mOpenCLBackend->onReleaseBuffer(mConvGemmWeightTensor.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mConvGemmInpTensor.get(), Backend::DYNAMIC); + mOpenCLBackend->onReleaseBuffer(mConvGemmOutTensor.get(), Backend::DYNAMIC); return; } diff --git a/source/backend/opencl/execution/buffer/GridSampleBufExecution.cpp b/source/backend/opencl/execution/buffer/GridSampleBufExecution.cpp index 50b7fd25e..1c11e60e9 100644 --- a/source/backend/opencl/execution/buffer/GridSampleBufExecution.cpp +++ b/source/backend/opencl/execution/buffer/GridSampleBufExecution.cpp @@ -14,78 +14,125 @@ namespace MNN { namespace OpenCL { GridSampleBufExecution::GridSampleBufExecution(const std::vector &inputs, const MNN::Op *op, Backend *backend) : CommonExecution(backend, op) { - mUnits.resize(1); - auto &unit = mUnits[0]; + mOpenCLBackend = static_cast(backend); + mMode = op->main_as_GridSample()->mode(); mPaddingMode = op->main_as_GridSample()->paddingMode(); if (op->main_as_GridSample()->alignCorners()) { mAlignCorners = 1; - } - else { + }else { mAlignCorners = 0; } - - mOpenCLBackend = static_cast(backend); - auto runtime = mOpenCLBackend->getOpenCLRuntime(); - auto gridSampleParam = op->main_as_GridSample(); - - std::set buildOptions; - if (op->main_as_GridSample()->mode() == 0) { - mKernelName = "bilinear_buf"; - unit.kernel = runtime->buildKernel("grid_sample_buf", mKernelName, buildOptions); - } - else { - mKernelName = "nearest_buf"; - unit.kernel = runtime->buildKernel("grid_sample_buf", mKernelName, buildOptions); - } - mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); } 
ErrorCode GridSampleBufExecution::onEncode(const std::vector &inputs, const std::vector &outputs) { - auto &unit = mUnits[0]; + auto runtime = mOpenCLBackend->getOpenCLRuntime(); auto inputTensor = inputs[0]; auto gridTensor = inputs[1]; auto outputTensor = outputs[0]; - auto runtime = ((OpenCLBackend *)backend())->getOpenCLRuntime(); - - const int batches = inputTensor->buffer().dim[0].extent; - const int channels = inputTensor->buffer().dim[1].extent; - const int inH = inputTensor->buffer().dim[2].extent; - const int inW = inputTensor->buffer().dim[3].extent; - const int channelC4 = UP_DIV(channels, 4); - - const int outH = outputTensor->buffer().dim[2].extent; - const int outW = outputTensor->buffer().dim[3].extent; - - mGlobalWorkSize = { - static_cast(channelC4), - static_cast(outW), - static_cast(outH * batches) - }; - - MNN_ASSERT(outW > 0 && outH > 0); - - uint32_t idx = 0; - cl_int ret = CL_SUCCESS; - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(inputTensor)); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(gridTensor)); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(outputTensor)); - ret |= unit.kernel->get().setArg(idx++, static_cast(inH)); - ret |= unit.kernel->get().setArg(idx++, static_cast(inW)); - ret |= unit.kernel->get().setArg(idx++, static_cast(outH)); - ret |= unit.kernel->get().setArg(idx++, static_cast(outW)); - ret |= unit.kernel->get().setArg(idx++, static_cast(batches)); - ret |= unit.kernel->get().setArg(idx++, mPaddingMode); - ret |= unit.kernel->get().setArg(idx++, mAlignCorners); - MNN_CHECK_CL_SUCCESS(ret, "setArg GridSampleBufExecution"); - - mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runtime, mKernelName, unit.kernel).first; - - mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); - unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; - unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + if(outputs[0]->dimensions() > 4){ + mUnits.resize(1); + const int batches = inputTensor->buffer().dim[0].extent; + const int channels = inputTensor->buffer().dim[1].extent; + const int inD = inputTensor->buffer().dim[2].extent; + const int inH = inputTensor->buffer().dim[3].extent; + const int inW = inputTensor->buffer().dim[4].extent; + const int channelC4 = UP_DIV(channels, 4); + const int outD = outputTensor->buffer().dim[2].extent; + const int outH = outputTensor->buffer().dim[3].extent; + const int outW = outputTensor->buffer().dim[4].extent; + + auto &unit = mUnits[0]; + std::set buildOptions; + if (mMode == SampleMode_BILINEAR) { + mKernelName = "bilinear5d_buf"; + unit.kernel = runtime->buildKernel("grid_sample_buf", mKernelName, buildOptions); + } else { + mKernelName = "nearest5d_buf"; + unit.kernel = runtime->buildKernel("grid_sample_buf", mKernelName, buildOptions); + } + mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); + mGlobalWorkSize = { + static_cast(channelC4 * outD), + static_cast(outW), + static_cast(outH * batches) + }; + MNN_ASSERT(outW > 0 && outH > 0); + + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); + ret |= 
unit.kernel->get().setArg(idx++, openCLBuffer(inputTensor)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(gridTensor)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(outputTensor)); + ret |= unit.kernel->get().setArg(idx++, static_cast(inH)); + ret |= unit.kernel->get().setArg(idx++, static_cast(inW)); + ret |= unit.kernel->get().setArg(idx++, static_cast(inD)); + ret |= unit.kernel->get().setArg(idx++, static_cast(outH)); + ret |= unit.kernel->get().setArg(idx++, static_cast(outW)); + ret |= unit.kernel->get().setArg(idx++, static_cast(outD)); + ret |= unit.kernel->get().setArg(idx++, static_cast(batches)); + ret |= unit.kernel->get().setArg(idx++, mPaddingMode); + ret |= unit.kernel->get().setArg(idx++, mAlignCorners); + MNN_CHECK_CL_SUCCESS(ret, "setArg GridSampleBufExecution"); + + mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runtime, mKernelName, unit.kernel).first; + + mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; + unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + }else{ + mUnits.resize(1); + auto &unit = mUnits[0]; + const int batches = inputTensor->buffer().dim[0].extent; + const int channels = inputTensor->buffer().dim[1].extent; + const int inH = inputTensor->buffer().dim[2].extent; + const int inW = inputTensor->buffer().dim[3].extent; + const int channelC4 = UP_DIV(channels, 4); + const int outH = outputTensor->buffer().dim[2].extent; + const int outW = outputTensor->buffer().dim[3].extent; + + std::set buildOptions; + if (mMode == 0) { + mKernelName = "bilinear_buf"; + unit.kernel = runtime->buildKernel("grid_sample_buf", mKernelName, buildOptions); + } + else { + mKernelName = "nearest_buf"; + unit.kernel = runtime->buildKernel("grid_sample_buf", mKernelName, buildOptions); + } + mMaxWorkGroupSize = static_cast(runtime->getMaxWorkGroupSize(unit.kernel)); + mGlobalWorkSize = { + static_cast(channelC4), + static_cast(outW), + static_cast(outH * batches) + }; + MNN_ASSERT(outW > 0 && outH > 0); + + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(inputTensor)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(gridTensor)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(outputTensor)); + ret |= unit.kernel->get().setArg(idx++, static_cast(inH)); + ret |= unit.kernel->get().setArg(idx++, static_cast(inW)); + ret |= unit.kernel->get().setArg(idx++, static_cast(outH)); + ret |= unit.kernel->get().setArg(idx++, static_cast(outW)); + ret |= unit.kernel->get().setArg(idx++, static_cast(batches)); + ret |= unit.kernel->get().setArg(idx++, mPaddingMode); + ret |= unit.kernel->get().setArg(idx++, mAlignCorners); + MNN_CHECK_CL_SUCCESS(ret, "setArg GridSampleBufExecution"); + + mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runtime, mKernelName, unit.kernel).first; + + mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; + unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + } return NO_ERROR; } diff --git a/source/backend/opencl/execution/cl/grid_sample.cl 
b/source/backend/opencl/execution/cl/grid_sample.cl index 6a41f0050..67f1a7f94 100644 --- a/source/backend/opencl/execution/cl/grid_sample.cl +++ b/source/backend/opencl/execution/cl/grid_sample.cl @@ -56,6 +56,27 @@ FLOAT4 sample(int h, int w, return RI_F(tmp, SAMPLER, (int2)(w_offset_base + w, h_offset_base + h)); } + +FLOAT4 sample3d(int d, int h, int w, + const int x_offset_base, + const int y_offset_base, + __read_only image2d_t tmp, + int depth, int height, int width, + enum BorderMode paddingMode){ + + if (d < 0 || d >= depth || h < 0 || h >= height || w < 0 || w >= width) { + if(paddingMode == BorderMode_ZEROS) + { + return 0.0f; + } + d = CLAMP(d, 0, depth - 1); + h = CLAMP(h, 0, height - 1); + w = CLAMP(w, 0, width - 1); + } + return RI_F(tmp, SAMPLER, (int2)(x_offset_base + h * width + w, y_offset_base + d)); +} + + __kernel void nearest(GLOBAL_SIZE_3_DIMS __read_only image2d_t input, __read_only image2d_t grid, __write_only image2d_t output, @@ -176,3 +197,148 @@ __kernel void bilinear(GLOBAL_SIZE_3_DIMS __read_only image2d_t input, const int output_h_offset = mad24(output_batch_idx, output_height, output_height_idx); WI_F(output, (int2)(output_w_offset, output_h_offset), value); } + +__kernel void nearest5d(GLOBAL_SIZE_3_DIMS + __read_only image2d_t input, + __read_only image2d_t grid, + __write_only image2d_t output, + __private const int input_height, + __private const int input_width, + __private const int input_depth, + __private const int output_height, + __private const int output_width, + __private const int output_depth, + __private const int batch, + __private const enum BorderMode paddingMode, + __private const int alignCorners){ + + const int output_channel_depth_idx = get_global_id(0); + const int output_width_block_idx = get_global_id(1); + const int output_batch_height_block_idx = get_global_id(2); + + DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx, output_width_block_idx, output_batch_height_block_idx); + + const int output_channel_idx = output_channel_depth_idx / output_depth; + const int output_depth_idx = output_channel_depth_idx % output_depth; + const int output_batch_idx = output_batch_height_block_idx / output_height; + const int output_height_idx = output_batch_height_block_idx % output_height; + + // get grid idx + const int grid_w_offset = (output_depth_idx / 4) * output_width * 3 + output_width_block_idx * 3; + const int grid_h_offset = mad24(output_batch_idx, output_height, output_height_idx); + + FLOAT4 grid_x = RI_F(grid, SAMPLER, (int2)(grid_w_offset, grid_h_offset)); + FLOAT4 grid_y = RI_F(grid, SAMPLER, (int2)(grid_w_offset + 1, grid_h_offset)); + FLOAT4 grid_z = RI_F(grid, SAMPLER, (int2)(grid_w_offset + 2, grid_h_offset)); + + const float arr[12] = {grid_x.x, grid_y.x, grid_z.x, grid_x.y, grid_y.y, grid_z.y, grid_x.z, grid_y.z, grid_z.z, grid_x.w, grid_y.w, grid_z.w}; + + // get grid x,y + const int arr_offset = output_depth_idx % 4; + const float x = arr[3 * arr_offset]; + const float y = arr[3 * arr_offset + 1]; + const float z = arr[3 * arr_offset + 2]; + + float in_grid_x = getPosition(x, input_width, alignCorners); + float in_grid_y = getPosition(y, input_height, alignCorners); + float in_grid_z = getPosition(z, input_depth, alignCorners); + + // get nearest point + int nw = floor(in_grid_x + 0.5f); + int nh = floor(in_grid_y + 0.5f); + int nd = floor(in_grid_z + 0.5f); + + const int inp_w_offset = mul24(output_channel_idx, input_width * input_height); + const int inp_h_offset = mul24(output_batch_idx, input_depth); + FLOAT4 value = 
sample3d(nd, nh, nw, inp_w_offset, inp_h_offset, input, input_depth, input_height, input_width, paddingMode); + + const int output_w_offset = output_channel_idx * output_width * output_height + output_height_idx * output_width + output_width_block_idx; + const int output_h_offset = mad24(output_batch_idx, output_depth, output_depth_idx); + WI_F(output, (int2)(output_w_offset, output_h_offset), value); +} + +__kernel void bilinear5d(GLOBAL_SIZE_3_DIMS + __read_only image2d_t input, + __read_only image2d_t grid, + __write_only image2d_t output, + __private const int input_height, + __private const int input_width, + __private const int input_depth, + __private const int output_height, + __private const int output_width, + __private const int output_depth, + __private const int batch, + __private const enum BorderMode paddingMode, + __private const int alignCorners){ + + const int output_channel_depth_idx = get_global_id(0); + const int output_width_block_idx = get_global_id(1); + const int output_batch_height_block_idx = get_global_id(2); + + DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx, output_width_block_idx, output_batch_height_block_idx); + + const int output_channel_idx = output_channel_depth_idx / output_depth; + const int output_depth_idx = output_channel_depth_idx % output_depth; + const int output_batch_idx = output_batch_height_block_idx / output_height; + const int output_height_idx = output_batch_height_block_idx % output_height; + + // get grid idx + const int grid_w_offset = (output_depth_idx / 4) * output_width * 3 + output_width_block_idx * 3; + const int grid_h_offset = mad24(output_batch_idx, output_height, output_height_idx); + + FLOAT4 grid_x = RI_F(grid, SAMPLER, (int2)(grid_w_offset, grid_h_offset)); + FLOAT4 grid_y = RI_F(grid, SAMPLER, (int2)(grid_w_offset + 1, grid_h_offset)); + FLOAT4 grid_z = RI_F(grid, SAMPLER, (int2)(grid_w_offset + 2, grid_h_offset)); + + const float arr[12] = {grid_x.x, grid_y.x, grid_z.x, grid_x.y, grid_y.y, grid_z.y, grid_x.z, grid_y.z, grid_z.z, grid_x.w, grid_y.w, grid_z.w}; + + // get grid x,y + const int arr_offset = output_depth_idx % 4; + const float x = arr[3 * arr_offset]; + const float y = arr[3 * arr_offset + 1]; + const float z = arr[3 * arr_offset + 2]; + + float in_grid_x = getPosition(x, input_width, alignCorners); + float in_grid_y = getPosition(y, input_height, alignCorners); + float in_grid_z = getPosition(z, input_depth, alignCorners); + + int in_d0 = floor(in_grid_z); + int in_h0 = floor(in_grid_y); + int in_w0 = floor(in_grid_x); + int in_d1 = ceil(in_grid_z); + int in_h1 = ceil(in_grid_y); + int in_w1 = ceil(in_grid_x); + + float x_weight0 = in_grid_x - in_w0; + float x_weight1 = 1 - x_weight0; + float y_weight0 = in_grid_y - in_h0; + float y_weight1 = 1 - y_weight0; + float z_weight0 = in_grid_z - in_d0; + float z_weight1 = 1 - z_weight0; + + // bilinear interpolation + const int inp_x_offset = mul24(output_channel_idx, input_width * input_height); + const int inp_y_offset = mul24(output_batch_idx, input_depth); + FLOAT4 i000 = sample3d(in_d0, in_h0, in_w0, inp_x_offset, inp_y_offset, input, input_depth, input_height, input_width, paddingMode); + FLOAT4 i001 = sample3d(in_d0, in_h0, in_w1, inp_x_offset, inp_y_offset, input, input_depth, input_height, input_width, paddingMode); + FLOAT4 i010 = sample3d(in_d0, in_h1, in_w0, inp_x_offset, inp_y_offset, input, input_depth, input_height, input_width, paddingMode); + FLOAT4 i011 = sample3d(in_d0, in_h1, in_w1, inp_x_offset, inp_y_offset, input, input_depth, input_height, 
input_width, paddingMode); + FLOAT4 i100 = sample3d(in_d1, in_h0, in_w0, inp_x_offset, inp_y_offset, input, input_depth, input_height, input_width, paddingMode); + FLOAT4 i101 = sample3d(in_d1, in_h0, in_w1, inp_x_offset, inp_y_offset, input, input_depth, input_height, input_width, paddingMode); + FLOAT4 i110 = sample3d(in_d1, in_h1, in_w0, inp_x_offset, inp_y_offset, input, input_depth, input_height, input_width, paddingMode); + FLOAT4 i111 = sample3d(in_d1, in_h1, in_w1, inp_x_offset, inp_y_offset, input, input_depth, input_height, input_width, paddingMode); + + + FLOAT4 i00 = (FLOAT4)(x_weight1) * i000 + (FLOAT4)(x_weight0) * i001; + FLOAT4 i01 = (FLOAT4)(x_weight1) * i010 + (FLOAT4)(x_weight0) * i011; + FLOAT4 i10 = (FLOAT4)(x_weight1) * i100 + (FLOAT4)(x_weight0) * i101; + FLOAT4 i11 = (FLOAT4)(x_weight1) * i110 + (FLOAT4)(x_weight0) * i111; + + FLOAT4 i0 = (FLOAT4)(y_weight1) * i00 + (FLOAT4)(y_weight0) * i01; + FLOAT4 i1 = (FLOAT4)(y_weight1) * i10 + (FLOAT4)(y_weight0) * i11; + FLOAT4 interp = (FLOAT4)(z_weight1) * i0 + (FLOAT4)(z_weight0) * i1; + const int output_w_offset = output_channel_idx * output_width * output_height + output_height_idx * output_width + output_width_block_idx; + const int output_h_offset = mad24(output_batch_idx, output_depth, output_depth_idx); + + WI_F(output, (int2)(output_w_offset, output_h_offset), interp); +} diff --git a/source/backend/opencl/execution/cl/grid_sample_buf.cl b/source/backend/opencl/execution/cl/grid_sample_buf.cl index 758cb2295..42ada041f 100644 --- a/source/backend/opencl/execution/cl/grid_sample_buf.cl +++ b/source/backend/opencl/execution/cl/grid_sample_buf.cl @@ -54,6 +54,25 @@ COMPUTE_FLOAT4 sample(int h, int w, return CONVERT_COMPUTE_FLOAT4(vload4(offset, buffer)); } +COMPUTE_FLOAT4 sample3d(int d, int h, int w, + const int offset_base, + __global const FLOAT *buffer, + int depth, int height, int width, + enum BorderMode paddingMode){ + + if (d < 0 || d >= depth || h < 0 || h >= height || w < 0 || w >= width) { + if(paddingMode == BorderMode_ZEROS) + { + return 0.0f; + } + d = CLAMP(d, 0, depth - 1); + h = CLAMP(h, 0, height - 1); + w = CLAMP(w, 0, width - 1); + } + int offset = ((offset_base + d) * height + h) * width + w; + return CONVERT_COMPUTE_FLOAT4(vload4(offset, buffer)); +} + __kernel void nearest_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, __global const FLOAT* grid, __global FLOAT* output, @@ -165,3 +184,126 @@ __kernel void bilinear_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, const int output_offset = ((output_batch_idx + output_channel_block_idx * batch) * output_height + output_height_idx) * output_width + output_width_block_idx; vstore4(CONVERT_FLOAT4(value), output_offset, output); } +__kernel void nearest5d_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input, + __global const FLOAT* grid, + __global FLOAT* output, + __private const int input_height, + __private const int input_width, + __private const int input_depth, + __private const int output_height, + __private const int output_width, + __private const int output_depth, + __private const int batch, + __private const enum BorderMode paddingMode, + __private const int alignCorners){ + + const int output_channel_depth_idx = get_global_id(0); + const int output_width_block_idx = get_global_id(1); + const int output_batch_height_block_idx = get_global_id(2); + + DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx, output_width_block_idx, output_batch_height_block_idx); + + const int output_channel_idx = output_channel_depth_idx / output_depth; + const 
int output_depth_idx = output_channel_depth_idx % output_depth; + const int output_batch_idx = output_batch_height_block_idx / output_height; + const int output_height_idx = output_batch_height_block_idx % output_height; + + const int grid_offset = ((output_batch_idx * output_depth + output_depth_idx) * output_height + output_height_idx) * output_width + output_width_block_idx; + float3 grid_xyz = convert_float3(vload3(grid_offset, grid)); + + const float x = grid_xyz.x; + const float y = grid_xyz.y; + const float z = grid_xyz.z; + + float in_grid_x = getPosition(x, input_width, alignCorners); + float in_grid_y = getPosition(y, input_height, alignCorners); + float in_grid_z = getPosition(z, input_depth, alignCorners); + + // get nearest point + int nw = floor(in_grid_x + 0.5f); + int nh = floor(in_grid_y + 0.5f); + int nd = floor(in_grid_z + 0.5f); + + const int inp_offset_base = (output_batch_idx + output_channel_idx * batch) * input_depth; + COMPUTE_FLOAT4 value = sample3d(nd, nh, nw, inp_offset_base, input, input_depth, input_height, input_width, paddingMode); + + const int output_offset = (((output_batch_idx + output_channel_idx * batch) * output_depth + output_depth_idx) * output_height + output_height_idx) * output_width + output_width_block_idx; + vstore4(CONVERT_FLOAT4(value), output_offset, output); +} + +__kernel void bilinear5d_buf(GLOBAL_SIZE_3_DIMS + __global const FLOAT* input, + __global const FLOAT* grid, + __global FLOAT* output, + __private const int input_height, + __private const int input_width, + __private const int input_depth, + __private const int output_height, + __private const int output_width, + __private const int output_depth, + __private const int batch, + __private const enum BorderMode paddingMode, + __private const int alignCorners){ + + const int output_channel_depth_idx = get_global_id(0); + const int output_width_block_idx = get_global_id(1); + const int output_batch_height_block_idx = get_global_id(2); + + DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx, output_width_block_idx, output_batch_height_block_idx); + + const int output_channel_idx = output_channel_depth_idx / output_depth; + const int output_depth_idx = output_channel_depth_idx % output_depth; + const int output_batch_idx = output_batch_height_block_idx / output_height; + const int output_height_idx = output_batch_height_block_idx % output_height; + + const int grid_offset = ((output_batch_idx * output_depth + output_depth_idx) * output_height + output_height_idx) * output_width + output_width_block_idx; + float3 grid_xyz = convert_float3(vload3(grid_offset, grid)); + + + // get grid x,y + const float x = grid_xyz.x; + const float y = grid_xyz.y; + const float z = grid_xyz.z; + + float in_grid_x = getPosition(x, input_width, alignCorners); + float in_grid_y = getPosition(y, input_height, alignCorners); + float in_grid_z = getPosition(z, input_depth, alignCorners); + + int in_d0 = floor(in_grid_z); + int in_h0 = floor(in_grid_y); + int in_w0 = floor(in_grid_x); + int in_d1 = ceil(in_grid_z); + int in_h1 = ceil(in_grid_y); + int in_w1 = ceil(in_grid_x); + + float x_weight0 = in_grid_x - in_w0; + float x_weight1 = 1 - x_weight0; + float y_weight0 = in_grid_y - in_h0; + float y_weight1 = 1 - y_weight0; + float z_weight0 = in_grid_z - in_d0; + float z_weight1 = 1 - z_weight0; + + // bilinear interpolation + const int inp_offset_base = (output_batch_idx + output_channel_idx * batch) * input_depth; + COMPUTE_FLOAT4 i000 = sample3d(in_d0, in_h0, in_w0, inp_offset_base, input, input_depth, 
input_height, input_width, paddingMode); + COMPUTE_FLOAT4 i001 = sample3d(in_d0, in_h0, in_w1, inp_offset_base, input, input_depth, input_height, input_width, paddingMode); + COMPUTE_FLOAT4 i010 = sample3d(in_d0, in_h1, in_w0, inp_offset_base, input, input_depth, input_height, input_width, paddingMode); + COMPUTE_FLOAT4 i011 = sample3d(in_d0, in_h1, in_w1, inp_offset_base, input, input_depth, input_height, input_width, paddingMode); + COMPUTE_FLOAT4 i100 = sample3d(in_d1, in_h0, in_w0, inp_offset_base, input, input_depth, input_height, input_width, paddingMode); + COMPUTE_FLOAT4 i101 = sample3d(in_d1, in_h0, in_w1, inp_offset_base, input, input_depth, input_height, input_width, paddingMode); + COMPUTE_FLOAT4 i110 = sample3d(in_d1, in_h1, in_w0, inp_offset_base, input, input_depth, input_height, input_width, paddingMode); + COMPUTE_FLOAT4 i111 = sample3d(in_d1, in_h1, in_w1, inp_offset_base, input, input_depth, input_height, input_width, paddingMode); + + + COMPUTE_FLOAT4 i00 = (COMPUTE_FLOAT4)(x_weight1) * i000 + (COMPUTE_FLOAT4)(x_weight0) * i001; + COMPUTE_FLOAT4 i01 = (COMPUTE_FLOAT4)(x_weight1) * i010 + (COMPUTE_FLOAT4)(x_weight0) * i011; + COMPUTE_FLOAT4 i10 = (COMPUTE_FLOAT4)(x_weight1) * i100 + (COMPUTE_FLOAT4)(x_weight0) * i101; + COMPUTE_FLOAT4 i11 = (COMPUTE_FLOAT4)(x_weight1) * i110 + (COMPUTE_FLOAT4)(x_weight0) * i111; + + COMPUTE_FLOAT4 i0 = (COMPUTE_FLOAT4)(y_weight1) * i00 + (COMPUTE_FLOAT4)(y_weight0) * i01; + COMPUTE_FLOAT4 i1 = (COMPUTE_FLOAT4)(y_weight1) * i10 + (COMPUTE_FLOAT4)(y_weight0) * i11; + COMPUTE_FLOAT4 interp = (COMPUTE_FLOAT4)(z_weight1) * i0 + (COMPUTE_FLOAT4)(z_weight0) * i1; + + const int output_offset = (((output_batch_idx + output_channel_idx * batch) * output_depth + output_depth_idx) * output_height + output_height_idx) * output_width + output_width_block_idx; + vstore4(CONVERT_FLOAT4(interp), output_offset, output); +} diff --git a/source/backend/opencl/execution/cl/opencl_program.cc b/source/backend/opencl/execution/cl/opencl_program.cc index a4d2cb4f4..ed7ee0cc6 100644 --- a/source/backend/opencl/execution/cl/opencl_program.cc +++ b/source/backend/opencl/execution/cl/opencl_program.cc @@ -1535,6 +1535,23 @@ const char* grid_sample_buf = " int offset=(offset_base+h)*width+w;\n" " return CONVERT_COMPUTE_FLOAT4(vload4(offset,buffer));\n" "}\n" +"COMPUTE_FLOAT4 sample3d(int d,int h,int w,\n" +" const int offset_base,\n" +" __global const FLOAT *buffer,\n" +" int depth,int height,int width,\n" +" enum BorderMode paddingMode){\n" +" if (d<0 || d >= depth || h<0 || h >= height || w<0 || w >= width) {\n" +" if(paddingMode == BorderMode_ZEROS)\n" +" {\n" +" return 0.0f;\n" +" }\n" +" d=CLAMP(d,0,depth-1);\n" +" h=CLAMP(h,0,height-1);\n" +" w=CLAMP(w,0,width-1);\n" +" }\n" +" int offset=((offset_base+d)*height+h)*width+w;\n" +" return CONVERT_COMPUTE_FLOAT4(vload4(offset,buffer));\n" +"}\n" "__kernel void nearest_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input,\n" " __global const FLOAT* grid,\n" " __global FLOAT* output,\n" @@ -1628,6 +1645,117 @@ const char* grid_sample_buf = " const int output_offset=((output_batch_idx+output_channel_block_idx*batch)*output_height+output_height_idx)*output_width+output_width_block_idx;\n" " vstore4(CONVERT_FLOAT4(value),output_offset,output);\n" "}\n" +"__kernel void nearest5d_buf(GLOBAL_SIZE_3_DIMS __global const FLOAT* input,\n" +" __global const FLOAT* grid,\n" +" __global FLOAT* output,\n" +" __private const int input_height,\n" +" __private const int input_width,\n" +" __private const int input_depth,\n" +" 
__private const int output_height,\n" +" __private const int output_width,\n" +" __private const int output_depth,\n" +" __private const int batch,\n" +" __private const enum BorderMode paddingMode,\n" +" __private const int alignCorners){\n" +" \n" +" const int output_channel_depth_idx=get_global_id(0);\n" +" const int output_width_block_idx=get_global_id(1);\n" +" const int output_batch_height_block_idx=get_global_id(2);\n" +" \n" +" DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx,output_width_block_idx,output_batch_height_block_idx);\n" +" \n" +" const int output_channel_idx=output_channel_depth_idx/output_depth;\n" +" const int output_depth_idx=output_channel_depth_idx % output_depth;\n" +" const int output_batch_idx=output_batch_height_block_idx/output_height;\n" +" const int output_height_idx=output_batch_height_block_idx % output_height;\n" +" \n" +" const int grid_offset=((output_batch_idx*output_depth+output_depth_idx)*output_height+output_height_idx)*output_width+output_width_block_idx;\n" +" float3 grid_xyz=convert_float3(vload3(grid_offset,grid));\n" +" const float x=grid_xyz.x;\n" +" const float y=grid_xyz.y;\n" +" const float z=grid_xyz.z;\n" +" float in_grid_x=getPosition(x,input_width,alignCorners);\n" +" float in_grid_y=getPosition(y,input_height,alignCorners);\n" +" float in_grid_z=getPosition(z,input_depth,alignCorners);\n" +" // get nearest point\n" +" int nw=floor(in_grid_x+0.5f);\n" +" int nh=floor(in_grid_y+0.5f);\n" +" int nd=floor(in_grid_z+0.5f);\n" +" const int inp_offset_base=(output_batch_idx+output_channel_idx*batch)*input_depth;\n" +" COMPUTE_FLOAT4 value=sample3d(nd,nh,nw,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" const int output_offset=(((output_batch_idx+output_channel_idx*batch)*output_depth+output_depth_idx)*output_height+output_height_idx)*output_width+output_width_block_idx;\n" +" vstore4(CONVERT_FLOAT4(value),output_offset,output);\n" +"}\n" +"__kernel void bilinear5d_buf(GLOBAL_SIZE_3_DIMS\n" +" __global const FLOAT* input,\n" +" __global const FLOAT* grid,\n" +" __global FLOAT* output,\n" +" __private const int input_height,\n" +" __private const int input_width,\n" +" __private const int input_depth,\n" +" __private const int output_height,\n" +" __private const int output_width,\n" +" __private const int output_depth,\n" +" __private const int batch,\n" +" __private const enum BorderMode paddingMode,\n" +" __private const int alignCorners){\n" +" const int output_channel_depth_idx=get_global_id(0);\n" +" const int output_width_block_idx=get_global_id(1);\n" +" const int output_batch_height_block_idx=get_global_id(2);\n" +" DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx,output_width_block_idx,output_batch_height_block_idx);\n" +" \n" +" const int output_channel_idx=output_channel_depth_idx/output_depth;\n" +" const int output_depth_idx=output_channel_depth_idx % output_depth;\n" +" const int output_batch_idx=output_batch_height_block_idx/output_height;\n" +" const int output_height_idx=output_batch_height_block_idx % output_height;\n" +" \n" +" const int grid_offset=((output_batch_idx*output_depth+output_depth_idx)*output_height+output_height_idx)*output_width+output_width_block_idx;\n" +" float3 grid_xyz=convert_float3(vload3(grid_offset,grid));\n" +" \n" +" // get grid x,y\n" +" const float x=grid_xyz.x;\n" +" const float y=grid_xyz.y;\n" +" const float z=grid_xyz.z;\n" +" float in_grid_x=getPosition(x,input_width,alignCorners);\n" +" float in_grid_y=getPosition(y,input_height,alignCorners);\n" +" float 
in_grid_z=getPosition(z,input_depth,alignCorners);\n" +" int in_d0=floor(in_grid_z);\n" +" int in_h0=floor(in_grid_y);\n" +" int in_w0=floor(in_grid_x);\n" +" int in_d1=ceil(in_grid_z);\n" +" int in_h1=ceil(in_grid_y);\n" +" int in_w1=ceil(in_grid_x);\n" +" \n" +" float x_weight0=in_grid_x-in_w0;\n" +" float x_weight1=1-x_weight0;\n" +" float y_weight0=in_grid_y-in_h0;\n" +" float y_weight1=1-y_weight0;\n" +" float z_weight0=in_grid_z-in_d0;\n" +" float z_weight1=1-z_weight0;\n" +" // bilinear interpolation\n" +" const int inp_offset_base=(output_batch_idx+output_channel_idx*batch)*input_depth;\n" +" COMPUTE_FLOAT4 i000=sample3d(in_d0,in_h0,in_w0,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" COMPUTE_FLOAT4 i001=sample3d(in_d0,in_h0,in_w1,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" COMPUTE_FLOAT4 i010=sample3d(in_d0,in_h1,in_w0,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" COMPUTE_FLOAT4 i011=sample3d(in_d0,in_h1,in_w1,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" COMPUTE_FLOAT4 i100=sample3d(in_d1,in_h0,in_w0,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" COMPUTE_FLOAT4 i101=sample3d(in_d1,in_h0,in_w1,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" COMPUTE_FLOAT4 i110=sample3d(in_d1,in_h1,in_w0,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" COMPUTE_FLOAT4 i111=sample3d(in_d1,in_h1,in_w1,inp_offset_base,input,input_depth,input_height,input_width,paddingMode);\n" +" \n" +" \n" +" COMPUTE_FLOAT4 i00=(COMPUTE_FLOAT4)(x_weight1)*i000+(COMPUTE_FLOAT4)(x_weight0)*i001;\n" +" COMPUTE_FLOAT4 i01=(COMPUTE_FLOAT4)(x_weight1)*i010+(COMPUTE_FLOAT4)(x_weight0)*i011;\n" +" COMPUTE_FLOAT4 i10=(COMPUTE_FLOAT4)(x_weight1)*i100+(COMPUTE_FLOAT4)(x_weight0)*i101;\n" +" COMPUTE_FLOAT4 i11=(COMPUTE_FLOAT4)(x_weight1)*i110+(COMPUTE_FLOAT4)(x_weight0)*i111;\n" +" \n" +" COMPUTE_FLOAT4 i0=(COMPUTE_FLOAT4)(y_weight1)*i00+(COMPUTE_FLOAT4)(y_weight0)*i01;\n" +" COMPUTE_FLOAT4 i1=(COMPUTE_FLOAT4)(y_weight1)*i10+(COMPUTE_FLOAT4)(y_weight0)*i11;\n" +" COMPUTE_FLOAT4 interp=(COMPUTE_FLOAT4)(z_weight1)*i0+(COMPUTE_FLOAT4)(z_weight0)*i1;\n" +" \n" +" const int output_offset=(((output_batch_idx+output_channel_idx*batch)*output_depth+output_depth_idx)*output_height+output_height_idx)*output_width+output_width_block_idx;\n" +" vstore4(CONVERT_FLOAT4(interp),output_offset,output);\n" +"}\n" ; #endif const char* interp = @@ -2840,17 +2968,37 @@ const char* raster = " DEAL_NON_UNIFORM_DIM2(x,y);\n" " WI_DATA(output,(int2)(x,y),(OUTPUT_TYPE_I4)(0));\n" "}\n" -"__kernel void raster_buffer_direct(\n" +"__kernel void raster_buffer(\n" " GLOBAL_SIZE_3_DIMS\n" -" __read_only image2d_t input,\n" +" __global INPUT_TYPE *input,\n" +" __private const int inputOffset,\n" +" __private const int inputStride0,\n" +" __private const int inputStride1,\n" +" __private const int inputStride2,\n" +" __global OUTPUT_TYPE *output,\n" +" __private const int outputOffset,\n" +" __private const int outputStride0,\n" +" __private const int outputStride1,\n" +" __private const int outputStride2\n" +" ) {\n" +" const int x=get_global_id(0);\n" +" const int y=get_global_id(1);\n" +" const int z=get_global_id(2);\n" +" \n" +" DEAL_NON_UNIFORM_DIM3(x,y,z);\n" +" \n" +" int inputIndex=inputOffset+z*inputStride0+y*inputStride1+x*inputStride2;\n" +" int outputIndex=outputOffset+z*outputStride0+y*outputStride1+x*outputStride2;\n" +" 
output[outputIndex]=(OUTPUT_TYPE)input[inputIndex];\n" +"}\n" +"__kernel void raster_buffer_combine(\n" +" GLOBAL_SIZE_3_DIMS\n" +" __global INPUT_TYPE *input,\n" " __private const int inputOffset,\n" " __private const int combineSrcOffset,\n" " __private const int inputStride0,\n" " __private const int inputStride1,\n" " __private const int inputStride2,\n" -" __private const int src_width,\n" -" __private const int src_height,\n" -" __private const int src_channel,\n" " __global OUTPUT_TYPE *output,\n" " __private const int outputOffset,\n" " __private const int combineDstOffset,\n" @@ -2869,21 +3017,7 @@ const char* raster = " \n" " int inputIndex=inputOffset+id*combineSrcOffset+z*inputStride0+y*inputStride1+x*inputStride2;\n" " int outputIndex=outputOffset+id*combineDstOffset+z*outputStride0+y*outputStride1+x*outputStride2;\n" -"#ifdef INPUT_DATA_FORMAT_NHWC\n" -" int in_c=inputIndex % src_channel; inputIndex /= src_channel;\n" -" int in_w=inputIndex % src_width; inputIndex /= src_width;\n" -" int in_h=inputIndex % src_height;\n" -" int in_b=inputIndex/src_height;\n" -"#else\n" -" int in_w=inputIndex % src_width; inputIndex /= src_width;\n" -" int in_h=inputIndex % src_height; inputIndex /= src_height;\n" -" int in_c=inputIndex % src_channel;\n" -" int in_b=inputIndex/src_channel;\n" -"#endif\n" -" int2 coord=(int2)((in_c/4)*src_width+in_w,in_b*src_height+in_h);\n" -" INPUT_TYPE_I4 value=RI_DATA(input,SAMPLER,coord);\n" -" INPUT_TYPE_I* value_ptr=(INPUT_TYPE_I*)&value;\n" -" output[outputIndex]=(OUTPUT_TYPE)value_ptr[in_c % 4];\n" +" output[outputIndex]=(OUTPUT_TYPE)input[inputIndex];\n" "}\n" "__kernel void raster_image(\n" " GLOBAL_SIZE_3_DIMS\n" @@ -9825,6 +9959,23 @@ const char* grid_sample = " }\n" " return RI_F(tmp,SAMPLER,(int2)(w_offset_base+w,h_offset_base+h));\n" "}\n" +"FLOAT4 sample3d(int d,int h,int w,\n" +" const int x_offset_base,\n" +" const int y_offset_base,\n" +" __read_only image2d_t tmp,\n" +" int depth,int height,int width,\n" +" enum BorderMode paddingMode){\n" +" if (d<0 || d >= depth || h<0 || h >= height || w<0 || w >= width) {\n" +" if(paddingMode == BorderMode_ZEROS)\n" +" {\n" +" return 0.0f;\n" +" }\n" +" d=CLAMP(d,0,depth-1);\n" +" h=CLAMP(h,0,height-1);\n" +" w=CLAMP(w,0,width-1);\n" +" }\n" +" return RI_F(tmp,SAMPLER,(int2)(x_offset_base+h*width+w,y_offset_base+d));\n" +"}\n" "__kernel void nearest(GLOBAL_SIZE_3_DIMS __read_only image2d_t input,\n" " __read_only image2d_t grid,\n" " __write_only image2d_t output,\n" @@ -9925,6 +10076,139 @@ const char* grid_sample = " const int output_h_offset=mad24(output_batch_idx,output_height,output_height_idx);\n" " WI_F(output,(int2)(output_w_offset,output_h_offset),value);\n" "}\n" +"__kernel void nearest5d(GLOBAL_SIZE_3_DIMS\n" +" __read_only image2d_t input,\n" +" __read_only image2d_t grid,\n" +" __write_only image2d_t output,\n" +" __private const int input_height,\n" +" __private const int input_width,\n" +" __private const int input_depth,\n" +" __private const int output_height,\n" +" __private const int output_width,\n" +" __private const int output_depth,\n" +" __private const int batch,\n" +" __private const enum BorderMode paddingMode,\n" +" __private const int alignCorners){\n" +" \n" +" const int output_channel_depth_idx=get_global_id(0);\n" +" const int output_width_block_idx=get_global_id(1);\n" +" const int output_batch_height_block_idx=get_global_id(2);\n" +" \n" +" DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx,output_width_block_idx,output_batch_height_block_idx);\n" +" \n" +" const int 
output_channel_idx=output_channel_depth_idx/output_depth;\n" +" const int output_depth_idx=output_channel_depth_idx % output_depth;\n" +" const int output_batch_idx=output_batch_height_block_idx/output_height;\n" +" const int output_height_idx=output_batch_height_block_idx % output_height;\n" +" \n" +" // get grid idx\n" +" const int grid_w_offset=(output_depth_idx/4)*output_width*3+output_width_block_idx*3;\n" +" const int grid_h_offset=mad24(output_batch_idx,output_height,output_height_idx);\n" +" \n" +" FLOAT4 grid_x=RI_F(grid,SAMPLER,(int2)(grid_w_offset,grid_h_offset));\n" +" FLOAT4 grid_y=RI_F(grid,SAMPLER,(int2)(grid_w_offset+1,grid_h_offset));\n" +" FLOAT4 grid_z=RI_F(grid,SAMPLER,(int2)(grid_w_offset+2,grid_h_offset));\n" +" const float arr[12]={grid_x.x,grid_y.x,grid_z.x,grid_x.y,grid_y.y,grid_z.y,grid_x.z,grid_y.z,grid_z.z,grid_x.w,grid_y.w,grid_z.w};\n" +" \n" +" // get grid x,y\n" +" const int arr_offset=output_depth_idx % 4;\n" +" const float x=arr[3*arr_offset];\n" +" const float y=arr[3*arr_offset+1];\n" +" const float z=arr[3*arr_offset+2];\n" +" float in_grid_x=getPosition(x,input_width,alignCorners);\n" +" float in_grid_y=getPosition(y,input_height,alignCorners);\n" +" float in_grid_z=getPosition(z,input_depth,alignCorners);\n" +" // get nearest point\n" +" int nw=floor(in_grid_x+0.5f);\n" +" int nh=floor(in_grid_y+0.5f);\n" +" int nd=floor(in_grid_z+0.5f);\n" +" \n" +" const int inp_w_offset=mul24(output_channel_idx,input_width*input_height);\n" +" const int inp_h_offset=mul24(output_batch_idx,input_depth);\n" +" FLOAT4 value=sample3d(nd,nh,nw,inp_w_offset,inp_h_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" \n" +" const int output_w_offset=output_channel_idx*output_width*output_height+output_height_idx*output_width+output_width_block_idx;\n" +" const int output_h_offset=mad24(output_batch_idx,output_depth,output_depth_idx);\n" +" WI_F(output,(int2)(output_w_offset,output_h_offset),value);\n" +"}\n" +"__kernel void bilinear5d(GLOBAL_SIZE_3_DIMS\n" +" __read_only image2d_t input,\n" +" __read_only image2d_t grid,\n" +" __write_only image2d_t output,\n" +" __private const int input_height,\n" +" __private const int input_width,\n" +" __private const int input_depth,\n" +" __private const int output_height,\n" +" __private const int output_width,\n" +" __private const int output_depth,\n" +" __private const int batch,\n" +" __private const enum BorderMode paddingMode,\n" +" __private const int alignCorners){\n" +" const int output_channel_depth_idx=get_global_id(0);\n" +" const int output_width_block_idx=get_global_id(1);\n" +" const int output_batch_height_block_idx=get_global_id(2);\n" +" DEAL_NON_UNIFORM_DIM3(output_channel_depth_idx,output_width_block_idx,output_batch_height_block_idx);\n" +" \n" +" const int output_channel_idx=output_channel_depth_idx/output_depth;\n" +" const int output_depth_idx=output_channel_depth_idx % output_depth;\n" +" const int output_batch_idx=output_batch_height_block_idx/output_height;\n" +" const int output_height_idx=output_batch_height_block_idx % output_height;\n" +" \n" +" // get grid idx\n" +" const int grid_w_offset=(output_depth_idx/4)*output_width*3+output_width_block_idx*3;\n" +" const int grid_h_offset=mad24(output_batch_idx,output_height,output_height_idx);\n" +" \n" +" FLOAT4 grid_x=RI_F(grid,SAMPLER,(int2)(grid_w_offset,grid_h_offset));\n" +" FLOAT4 grid_y=RI_F(grid,SAMPLER,(int2)(grid_w_offset+1,grid_h_offset));\n" +" FLOAT4 grid_z=RI_F(grid,SAMPLER,(int2)(grid_w_offset+2,grid_h_offset));\n" +" const 
float arr[12]={grid_x.x,grid_y.x,grid_z.x,grid_x.y,grid_y.y,grid_z.y,grid_x.z,grid_y.z,grid_z.z,grid_x.w,grid_y.w,grid_z.w};\n" +" \n" +" // get grid x,y\n" +" const int arr_offset=output_depth_idx % 4;\n" +" const float x=arr[3*arr_offset];\n" +" const float y=arr[3*arr_offset+1];\n" +" const float z=arr[3*arr_offset+2];\n" +" float in_grid_x=getPosition(x,input_width,alignCorners);\n" +" float in_grid_y=getPosition(y,input_height,alignCorners);\n" +" float in_grid_z=getPosition(z,input_depth,alignCorners);\n" +" int in_d0=floor(in_grid_z);\n" +" int in_h0=floor(in_grid_y);\n" +" int in_w0=floor(in_grid_x);\n" +" int in_d1=ceil(in_grid_z);\n" +" int in_h1=ceil(in_grid_y);\n" +" int in_w1=ceil(in_grid_x);\n" +" \n" +" float x_weight0=in_grid_x-in_w0;\n" +" float x_weight1=1-x_weight0;\n" +" float y_weight0=in_grid_y-in_h0;\n" +" float y_weight1=1-y_weight0;\n" +" float z_weight0=in_grid_z-in_d0;\n" +" float z_weight1=1-z_weight0;\n" +" // bilinear interpolation\n" +" const int inp_x_offset=mul24(output_channel_idx,input_width*input_height);\n" +" const int inp_y_offset=mul24(output_batch_idx,input_depth);\n" +" FLOAT4 i000=sample3d(in_d0,in_h0,in_w0,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" FLOAT4 i001=sample3d(in_d0,in_h0,in_w1,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" FLOAT4 i010=sample3d(in_d0,in_h1,in_w0,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" FLOAT4 i011=sample3d(in_d0,in_h1,in_w1,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" FLOAT4 i100=sample3d(in_d1,in_h0,in_w0,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" FLOAT4 i101=sample3d(in_d1,in_h0,in_w1,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" FLOAT4 i110=sample3d(in_d1,in_h1,in_w0,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" FLOAT4 i111=sample3d(in_d1,in_h1,in_w1,inp_x_offset,inp_y_offset,input,input_depth,input_height,input_width,paddingMode);\n" +" \n" +" \n" +" FLOAT4 i00=(FLOAT4)(x_weight1)*i000+(FLOAT4)(x_weight0)*i001;\n" +" FLOAT4 i01=(FLOAT4)(x_weight1)*i010+(FLOAT4)(x_weight0)*i011;\n" +" FLOAT4 i10=(FLOAT4)(x_weight1)*i100+(FLOAT4)(x_weight0)*i101;\n" +" FLOAT4 i11=(FLOAT4)(x_weight1)*i110+(FLOAT4)(x_weight0)*i111;\n" +" \n" +" FLOAT4 i0=(FLOAT4)(y_weight1)*i00+(FLOAT4)(y_weight0)*i01;\n" +" FLOAT4 i1=(FLOAT4)(y_weight1)*i10+(FLOAT4)(y_weight0)*i11;\n" +" FLOAT4 interp=(FLOAT4)(z_weight1)*i0+(FLOAT4)(z_weight0)*i1;\n" +" const int output_w_offset=output_channel_idx*output_width*output_height+output_height_idx*output_width+output_width_block_idx;\n" +" const int output_h_offset=mad24(output_batch_idx,output_depth,output_depth_idx);\n" +" WI_F(output,(int2)(output_w_offset,output_h_offset),interp);\n" +"}\n" ; const char* buffer_convert_quant = "#ifdef MNN_SUPPORT_FP16\n" diff --git a/source/backend/opencl/execution/cl/raster.cl b/source/backend/opencl/execution/cl/raster.cl index 6033514d5..a7fe7bd5c 100644 --- a/source/backend/opencl/execution/cl/raster.cl +++ b/source/backend/opencl/execution/cl/raster.cl @@ -44,17 +44,38 @@ __kernel void image_set_zero( WI_DATA(output, (int2)(x, y), (OUTPUT_TYPE_I4)(0)); } -__kernel void raster_buffer_direct( +__kernel void raster_buffer( GLOBAL_SIZE_3_DIMS - __read_only image2d_t input, + __global INPUT_TYPE *input, + __private const int inputOffset, + __private const int 
inputStride0, + __private const int inputStride1, + __private const int inputStride2, + __global OUTPUT_TYPE *output, + __private const int outputOffset, + __private const int outputStride0, + __private const int outputStride1, + __private const int outputStride2 + ) { + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + + DEAL_NON_UNIFORM_DIM3(x, y, z); + + int inputIndex = inputOffset + z * inputStride0 + y * inputStride1 + x * inputStride2; + int outputIndex = outputOffset + z * outputStride0 + y * outputStride1 + x * outputStride2; + output[outputIndex] = (OUTPUT_TYPE)input[inputIndex]; +} + +__kernel void raster_buffer_combine( + GLOBAL_SIZE_3_DIMS + __global INPUT_TYPE *input, __private const int inputOffset, __private const int combineSrcOffset, __private const int inputStride0, __private const int inputStride1, __private const int inputStride2, - __private const int src_width, - __private const int src_height, - __private const int src_channel, __global OUTPUT_TYPE *output, __private const int outputOffset, __private const int combineDstOffset, @@ -73,23 +94,10 @@ __kernel void raster_buffer_direct( int inputIndex = inputOffset + id * combineSrcOffset + z * inputStride0 + y * inputStride1 + x * inputStride2; int outputIndex = outputOffset + id * combineDstOffset + z * outputStride0 + y * outputStride1 + x * outputStride2; -#ifdef INPUT_DATA_FORMAT_NHWC - int in_c = inputIndex % src_channel; inputIndex /= src_channel; - int in_w = inputIndex % src_width; inputIndex /= src_width; - int in_h = inputIndex % src_height; - int in_b = inputIndex / src_height; -#else - int in_w = inputIndex % src_width; inputIndex /= src_width; - int in_h = inputIndex % src_height; inputIndex /= src_height; - int in_c = inputIndex % src_channel; - int in_b = inputIndex / src_channel; -#endif - int2 coord = (int2)((in_c / 4) * src_width + in_w, in_b * src_height + in_h); - INPUT_TYPE_I4 value = RI_DATA(input, SAMPLER, coord); - INPUT_TYPE_I* value_ptr = (INPUT_TYPE_I*)&value; - output[outputIndex] = (OUTPUT_TYPE)value_ptr[in_c % 4]; + output[outputIndex] = (OUTPUT_TYPE)input[inputIndex]; } + __kernel void raster_image( GLOBAL_SIZE_3_DIMS __read_only image2d_t input, diff --git a/source/backend/opencl/execution/image/GridSampleExecution.cpp b/source/backend/opencl/execution/image/GridSampleExecution.cpp index 7cc2a0ff1..39369e99e 100644 --- a/source/backend/opencl/execution/image/GridSampleExecution.cpp +++ b/source/backend/opencl/execution/image/GridSampleExecution.cpp @@ -14,77 +14,128 @@ namespace MNN { namespace OpenCL { GridSampleExecution::GridSampleExecution(const std::vector<Tensor *> &inputs, const MNN::Op *op, Backend *backend) : CommonExecution(backend, op) { - mUnits.resize(1); - auto &unit = mUnits[0]; + mOpenCLBackend = static_cast<OpenCLBackend *>(backend); + mMode = op->main_as_GridSample()->mode(); mPaddingMode = op->main_as_GridSample()->paddingMode(); if (op->main_as_GridSample()->alignCorners()) { mAlignCorners = 1; - } - else { + }else { mAlignCorners = 0; } - - mOpenCLBackend = static_cast<OpenCLBackend *>(backend); - auto runtime = mOpenCLBackend->getOpenCLRuntime(); - auto gridSampleParam = op->main_as_GridSample(); - - std::set<std::string> buildOptions; - if (op->main_as_GridSample()->mode() == 0) { - mKernelName = "bilinear"; - unit.kernel = runtime->buildKernel("grid_sample", mKernelName, buildOptions); - } - else { - mKernelName = "nearest"; - unit.kernel = runtime->buildKernel("grid_sample", mKernelName, buildOptions); - - } - - mMaxWorkGroupSize = 
static_cast<uint32_t>(runtime->getMaxWorkGroupSize(unit.kernel)); } ErrorCode GridSampleExecution::onEncode(const std::vector<Tensor *> &inputs, const std::vector<Tensor *> &outputs) { - auto &unit = mUnits[0]; + auto runtime = mOpenCLBackend->getOpenCLRuntime(); auto inputTensor = inputs[0]; auto gridTensor = inputs[1]; auto outputTensor = outputs[0]; - - const int batches = inputTensor->buffer().dim[0].extent; - const int channels = inputTensor->buffer().dim[1].extent; - const int inH = inputTensor->buffer().dim[2].extent; - const int inW = inputTensor->buffer().dim[3].extent; - const int channelC4 = UP_DIV(channels, 4); - - const int outH = outputTensor->buffer().dim[2].extent; - const int outW = outputTensor->buffer().dim[3].extent; - - mGlobalWorkSize = { - static_cast<uint32_t>(channelC4), - static_cast<uint32_t>(outW), - static_cast<uint32_t>(outH * batches) - }; - - MNN_ASSERT(outW > 0 && outH > 0); - - uint32_t idx = 0; - cl_int ret = CL_SUCCESS; - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); - ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); - ret |= unit.kernel->get().setArg(idx++, openCLImage(inputTensor)); - ret |= unit.kernel->get().setArg(idx++, openCLImage(gridTensor)); - ret |= unit.kernel->get().setArg(idx++, openCLImage(outputTensor)); - ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(inH)); - ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(inW)); - ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(outH)); - ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(outW)); - ret |= unit.kernel->get().setArg(idx++, mPaddingMode); - ret |= unit.kernel->get().setArg(idx++, mAlignCorners); - MNN_CHECK_CL_SUCCESS(ret, "setArg GridSampleExecution"); - - mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), mKernelName, unit.kernel).first; - mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); - unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; - unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + if(outputs[0]->dimensions() > 4){ + mUnits.resize(1); + const int batches = inputTensor->buffer().dim[0].extent; + const int channels = inputTensor->buffer().dim[1].extent; + const int inD = inputTensor->buffer().dim[2].extent; + const int inH = inputTensor->buffer().dim[3].extent; + const int inW = inputTensor->buffer().dim[4].extent; + const int channelC4 = UP_DIV(channels, 4); + const int outD = outputTensor->buffer().dim[2].extent; + const int outH = outputTensor->buffer().dim[3].extent; + const int outW = outputTensor->buffer().dim[4].extent; + std::vector<int> outputShape = tensorShapeFormat(gridTensor); + auto &unit = mUnits[0]; + std::set<std::string> buildOptions; + if (mMode == 0) { + mKernelName = "bilinear5d"; + unit.kernel = runtime->buildKernel("grid_sample", mKernelName, buildOptions); + } + else { + mKernelName = "nearest5d"; + unit.kernel = runtime->buildKernel("grid_sample", mKernelName, buildOptions); + + } + mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(unit.kernel)); + mGlobalWorkSize = { + static_cast<uint32_t>(channelC4 * outD), + static_cast<uint32_t>(outW), + static_cast<uint32_t>(outH * batches) + }; + MNN_ASSERT(outW > 0 && outH > 0); + + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); + ret |= unit.kernel->get().setArg(idx++, 
openCLBuffer(inputTensor)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(gridTensor)); + ret |= unit.kernel->get().setArg(idx++, openCLBuffer(outputTensor)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(inH)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(inW)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(inD)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(outH)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(outW)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(outD)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(batches)); + ret |= unit.kernel->get().setArg(idx++, mPaddingMode); + ret |= unit.kernel->get().setArg(idx++, mAlignCorners); + MNN_CHECK_CL_SUCCESS(ret, "setArg GridSampleExecution"); + + mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, runtime, mKernelName, unit.kernel).first; + + mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; + unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + }else{ + mUnits.resize(1); + auto &unit = mUnits[0]; + const int batches = inputTensor->buffer().dim[0].extent; + const int channels = inputTensor->buffer().dim[1].extent; + const int inH = inputTensor->buffer().dim[2].extent; + const int inW = inputTensor->buffer().dim[3].extent; + const int channelC4 = UP_DIV(channels, 4); + + const int outH = outputTensor->buffer().dim[2].extent; + const int outW = outputTensor->buffer().dim[3].extent; + + std::set<std::string> buildOptions; + if (mMode == 0) { + mKernelName = "bilinear"; + unit.kernel = runtime->buildKernel("grid_sample", mKernelName, buildOptions); + } + else { + mKernelName = "nearest"; + unit.kernel = runtime->buildKernel("grid_sample", mKernelName, buildOptions); + + } + + mGlobalWorkSize = { + static_cast<uint32_t>(channelC4), + static_cast<uint32_t>(outW), + static_cast<uint32_t>(outH * batches) + }; + + MNN_ASSERT(outW > 0 && outH > 0); + + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[0]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[1]); + ret |= unit.kernel->get().setArg(idx++, mGlobalWorkSize[2]); + ret |= unit.kernel->get().setArg(idx++, openCLImage(inputTensor)); + ret |= unit.kernel->get().setArg(idx++, openCLImage(gridTensor)); + ret |= unit.kernel->get().setArg(idx++, openCLImage(outputTensor)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(inH)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(inW)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(outH)); + ret |= unit.kernel->get().setArg(idx++, static_cast<int32_t>(outW)); + ret |= unit.kernel->get().setArg(idx++, mPaddingMode); + ret |= unit.kernel->get().setArg(idx++, mAlignCorners); + MNN_CHECK_CL_SUCCESS(ret, "setArg GridSampleExecution"); + + mLocalWorkSize = localWS3DDefault(mGlobalWorkSize, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), mKernelName, unit.kernel).first; + mOpenCLBackend->recordKernel3d(unit.kernel, mGlobalWorkSize, mLocalWorkSize); + unit.globalWorkSize = {mGlobalWorkSize[0], mGlobalWorkSize[1], mGlobalWorkSize[2]}; + unit.localWorkSize = {mLocalWorkSize[0], mLocalWorkSize[1], mLocalWorkSize[2]}; + } return NO_ERROR; } diff --git a/source/backend/opencl/execution/image/RasterExecution.cpp b/source/backend/opencl/execution/image/RasterExecution.cpp index 15cdb6235..47a1af17f 100644 --- a/source/backend/opencl/execution/image/RasterExecution.cpp +++ 
b/source/backend/opencl/execution/image/RasterExecution.cpp @@ -37,15 +37,131 @@ ErrorCode RasterExecution::onEncode(const std::vector<Tensor *> &____inputs, con mNeedZero = !TensorUtils::regionIsFull(output); auto regionNum = des->regions.size(); auto runtime = ((OpenCLBackend *)backend())->getOpenCLRuntime(); - + mFast = false; + if (outputDes->dimensionFormat == MNN_DATA_FORMAT_NC4HW4) { + mFast = true; + for (int i=0; i< des->regions.size(); ++i) { + auto& slice = des->regions[i]; + if (TensorUtils::getDescribe(slice.origin)->dimensionFormat != MNN_DATA_FORMAT_NC4HW4) { + mFast = false; + break; + } + if (!OpCommonUtils::canBlitFast(slice, output)) { + mFast = false; + break; + } + } + } + + if(mFast) + { + mUnits.resize(regionNum); + int kernel_idx = 0; + + if(mNeedZero) + { + mUnits.resize(regionNum + 1); + auto outputShape = tensorShapeFormat(output); + int region[] = {outputShape[0], UP_DIV(outputShape[3], 4), outputShape[1], outputShape[2]};//nhwc + Unit &unit = mUnits[kernel_idx++]; + unit.kernel = runtime->buildKernel("raster", "image_set_zero", {}, output, output); + unit.localWorkSize = {8, 8}; + unit.globalWorkSize = {(uint32_t)UP_DIV((region[1] * region[3]), 16)*16, + (uint32_t)UP_DIV((region[0] * region[2]), 16)*16}; + + int global_dim0 = region[1] * region[3]; + int global_dim1 = region[0] * region[2]; + + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, global_dim0); + ret |= unit.kernel->get().setArg(idx++, global_dim1); + ret |= unit.kernel->get().setArg(idx++, openCLImage(output)); + if(ret != CL_SUCCESS) + { + MNN_PRINT("setArg err %d\n", (int)ret); + } + mOpenCLBackend->recordKernel2d(unit.kernel, + {(uint32_t)UP_DIV((region[1] * region[3]), 16)*16, + (uint32_t)UP_DIV((region[0] * region[2]), 16)*16}, + {8, 8}); + } + + // image raster + for (auto& slice : des->regions) + { + Tensor::InsideDescribe::Region C4Region; + OpCommonUtils::turnToPackRegion(slice, C4Region, output, 4); + + Unit &unit = mUnits[kernel_idx++]; + unit.kernel = runtime->buildKernel("raster", "raster_image", {}, output, output); + + const std::vector<uint32_t> gws = {(uint32_t)C4Region.size[2], + (uint32_t)C4Region.size[1], + (uint32_t)C4Region.size[0]}; + uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(unit.kernel)); + + auto outputShape = tensorShapeFormat(output); + auto sliceShape = tensorShapeFormat(slice.origin); + + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, gws[0]); + ret |= unit.kernel->get().setArg(idx++, gws[1]); + ret |= unit.kernel->get().setArg(idx++, gws[2]); + ret |= unit.kernel->get().setArg(idx++, openCLImage(slice.origin)); + ret |= unit.kernel->get().setArg(idx++, C4Region.src.offset); + ret |= unit.kernel->get().setArg(idx++, C4Region.src.stride[0]); + ret |= unit.kernel->get().setArg(idx++, C4Region.src.stride[1]); + ret |= unit.kernel->get().setArg(idx++, C4Region.src.stride[2]); + ret |= unit.kernel->get().setArg(idx++, sliceShape[1]); + ret |= unit.kernel->get().setArg(idx++, sliceShape[2]); + ret |= unit.kernel->get().setArg(idx++, sliceShape[3]); + ret |= unit.kernel->get().setArg(idx++, openCLImage(output)); + ret |= unit.kernel->get().setArg(idx++, C4Region.dst.offset); + ret |= unit.kernel->get().setArg(idx++, C4Region.dst.stride[0]); + ret |= unit.kernel->get().setArg(idx++, C4Region.dst.stride[1]); + ret |= unit.kernel->get().setArg(idx++, C4Region.dst.stride[2]); + ret |= unit.kernel->get().setArg(idx++, outputShape[1]); + ret |= unit.kernel->get().setArg(idx++, outputShape[2]); 
ret |= unit.kernel->get().setArg(idx++, outputShape[3]); + if(ret != CL_SUCCESS) + { + MNN_PRINT("setArg err %d\n", (int)ret); + } + std::string name = "rasterImage"; + const std::vector<uint32_t> lws = localWS3DDefault(gws, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), name, unit.kernel).first; + + unit.localWorkSize = {lws[0], lws[1], lws[2]}; + + unit.globalWorkSize = {ROUND_UP(gws[0], std::max((uint32_t)1, lws[0])), + ROUND_UP(gws[1], std::max((uint32_t)1, lws[1])), + ROUND_UP(gws[2], std::max((uint32_t)1, lws[2]))}; + mOpenCLBackend->recordKernel3d(unit.kernel, gws, lws); + } + return NO_ERROR; + } bool cancombine = CanCombine(outputs); // Alloc Temp buffer auto bufferPool = ((OpenCLBackend *)backend())->getBufferPool(); - if(output->getType().code == halide_type_float && runtime->isSupportedFP16()) { - mTempOutput = bufferPool->alloc(output->usize()/2); - }else{ - mTempOutput = bufferPool->alloc(output->usize()); + auto bufferUnitSize = runtime->isSupportedFP16() ? sizeof(half_float::half) : sizeof(float); + for(int i=0; i< regionNum; ++i) + { + auto origin = des->regions[i].origin; + if(mTempInput.find(origin) != mTempInput.end()) + { + continue; + } + + auto buffer = bufferPool->alloc(origin->elementSize()*bufferUnitSize); + mTempInput.insert(std::make_pair(origin, buffer)); + } + mTempOutput = bufferPool->alloc(output->elementSize() * bufferUnitSize); + + for(auto& iter : mTempInput) + { + bufferPool->recycle(iter.second); } bufferPool->recycle(mTempOutput); @@ -53,12 +169,12 @@ ErrorCode RasterExecution::onEncode(const std::vector<Tensor *> &____inputs, con if(cancombine){ regionNum = 1; } - mUnits.resize(regionNum + 1); + mUnits.resize(regionNum + originNum + 1); int kernel_idx = 0; if(mNeedZero) { - mUnits.resize(regionNum + 2); + mUnits.resize(regionNum + originNum + 2); auto outputShape = tensorShapeFormat(output); int region[] = {outputShape[0], outputShape[3], outputShape[1], outputShape[2]};//nhwc Unit &unit = mUnits[kernel_idx++]; @@ -88,23 +204,63 @@ ErrorCode RasterExecution::onEncode(const std::vector<Tensor *> &____inputs, con mOpenCLBackend->recordKernel2d(unit.kernel, gws, lws); } + + //image to buffer + for(auto& iter : mTempInput) + { + Tensor* origin = iter.first; + std::vector<int> regionShape = tensorShapeFormat(origin); + int inputWH[] = {regionShape[2], regionShape[1]}; + int region[] = {regionShape[0], UP_DIV(regionShape[3], 4), regionShape[1], regionShape[2]}; + + Unit &unit = mUnits[kernel_idx++]; + if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NHWC)// Image to nhwc buffer + { + unit.kernel = runtime->buildKernel("buffer_to_image", "image_to_nhwc_buffer", {}, origin, origin); + } + else //Image to nchw buffer + { + unit.kernel = runtime->buildKernel("buffer_to_image", "image_to_nchw_buffer", {}, origin, origin); + } + + std::vector<uint32_t> gws = {(uint32_t)(region[3] * region[1]), + (uint32_t)(region[2] * region[0])}; + //MNN_CHECK_CL_SUCCESS + uint32_t idx = 0; + cl_int ret = CL_SUCCESS; + ret |= unit.kernel->get().setArg(idx++, gws[0]); + ret |= unit.kernel->get().setArg(idx++, gws[1]); + ret |= unit.kernel->get().setArg(idx++, *(iter.second)); + ret |= unit.kernel->get().setArg(idx++, inputWH[1]); + ret |= unit.kernel->get().setArg(idx++, inputWH[0]); + ret |= unit.kernel->get().setArg(idx++, regionShape[3]); + ret |= unit.kernel->get().setArg(idx++, openCLImage(origin)); + if(ret != CL_SUCCESS) + { + MNN_PRINT("setArg err %d\n", (int)ret); + } + + uint32_t mMaxWorkGroupSize = static_cast<uint32_t>(runtime->getMaxWorkGroupSize(unit.kernel)); + + std::string kernelName = 
"raster_image_to_buffer"; + std::vector lws = localWS2DDefault(gws, mMaxWorkGroupSize, mOpenCLBackend->getOpenCLRuntime(), kernelName, unit.kernel).first; + + unit.localWorkSize = {lws[0], lws[1]}; + unit.globalWorkSize = {ROUND_UP(gws[0], std::max((uint32_t)1, lws[0])), + ROUND_UP(gws[1], std::max((uint32_t)1, lws[1]))}; + mOpenCLBackend->recordKernel2d(unit.kernel, gws, lws); + } // buffer raster if(cancombine){ - std::set buildOptions; auto regions = des->regions; auto slice = regions[0]; - auto origin = slice.origin; - auto inputShape = tensorShapeFormat(origin); int nums = regions.size(); int src_offset = regions[1].src.offset - slice.src.offset; int dst_offset = regions[1].dst.offset - slice.dst.offset; - if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NHWC) { - buildOptions.emplace(" -DINPUT_DATA_FORMAT_NHWC"); - } Unit &unit = mUnits[kernel_idx++]; - unit.kernel = runtime->buildKernel("raster", "raster_buffer_direct", buildOptions, output, output); + unit.kernel = runtime->buildKernel("raster", "raster_buffer_combine", {}, output, output); unit.globalWorkSize = {(uint32_t)slice.size[2] * nums, (uint32_t)slice.size[1], @@ -120,15 +276,12 @@ ErrorCode RasterExecution::onEncode(const std::vector &____inputs, con ret |= unit.kernel->get().setArg(idx++, gws[0]); ret |= unit.kernel->get().setArg(idx++, gws[1]); ret |= unit.kernel->get().setArg(idx++, gws[2]); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(origin)); + ret |= unit.kernel->get().setArg(idx++, *(mTempInput[slice.origin])); ret |= unit.kernel->get().setArg(idx++, slice.src.offset); ret |= unit.kernel->get().setArg(idx++, src_offset); ret |= unit.kernel->get().setArg(idx++, slice.src.stride[0]); ret |= unit.kernel->get().setArg(idx++, slice.src.stride[1]); ret |= unit.kernel->get().setArg(idx++, slice.src.stride[2]); - ret |= unit.kernel->get().setArg(idx++, inputShape[2]); - ret |= unit.kernel->get().setArg(idx++, inputShape[1]); - ret |= unit.kernel->get().setArg(idx++, inputShape[3]); ret |= unit.kernel->get().setArg(idx++, *mTempOutput); ret |= unit.kernel->get().setArg(idx++, slice.dst.offset); ret |= unit.kernel->get().setArg(idx++, dst_offset); @@ -153,16 +306,8 @@ ErrorCode RasterExecution::onEncode(const std::vector &____inputs, con }else{ for (auto& slice : des->regions) { - std::set buildOptions; - auto origin = slice.origin; - auto inputShape = tensorShapeFormat(origin); - int src_offset = 0; - int dst_offset = 0; - if(TensorUtils::getDescribe(origin)->dimensionFormat == MNN_DATA_FORMAT_NHWC) { - buildOptions.emplace(" -DINPUT_DATA_FORMAT_NHWC"); - } Unit &unit = mUnits[kernel_idx++]; - unit.kernel = runtime->buildKernel("raster", "raster_buffer_direct", buildOptions, output, output); + unit.kernel = runtime->buildKernel("raster", "raster_buffer", {}, output, output); unit.globalWorkSize = {(uint32_t)slice.size[2], (uint32_t)slice.size[1], @@ -178,22 +323,16 @@ ErrorCode RasterExecution::onEncode(const std::vector &____inputs, con ret |= unit.kernel->get().setArg(idx++, gws[0]); ret |= unit.kernel->get().setArg(idx++, gws[1]); ret |= unit.kernel->get().setArg(idx++, gws[2]); - ret |= unit.kernel->get().setArg(idx++, openCLBuffer(origin)); + ret |= unit.kernel->get().setArg(idx++, *(mTempInput[slice.origin])); ret |= unit.kernel->get().setArg(idx++, slice.src.offset); - ret |= unit.kernel->get().setArg(idx++, src_offset); ret |= unit.kernel->get().setArg(idx++, slice.src.stride[0]); ret |= unit.kernel->get().setArg(idx++, slice.src.stride[1]); ret |= unit.kernel->get().setArg(idx++, 
slice.src.stride[2]); - ret |= unit.kernel->get().setArg(idx++, inputShape[2]); - ret |= unit.kernel->get().setArg(idx++, inputShape[1]); - ret |= unit.kernel->get().setArg(idx++, inputShape[3]); ret |= unit.kernel->get().setArg(idx++, *mTempOutput); ret |= unit.kernel->get().setArg(idx++, slice.dst.offset); - ret |= unit.kernel->get().setArg(idx++, dst_offset); ret |= unit.kernel->get().setArg(idx++, slice.dst.stride[0]); ret |= unit.kernel->get().setArg(idx++, slice.dst.stride[1]); ret |= unit.kernel->get().setArg(idx++, slice.dst.stride[2]); - ret |= unit.kernel->get().setArg(idx++, slice.size[2]); if(ret != CL_SUCCESS) { MNN_PRINT("setArg err %d\n", (int)ret); diff --git a/source/backend/opencl/execution/image/SoftmaxExecution.cpp b/source/backend/opencl/execution/image/SoftmaxExecution.cpp index 125c3c9d2..ad01839cb 100644 --- a/source/backend/opencl/execution/image/SoftmaxExecution.cpp +++ b/source/backend/opencl/execution/image/SoftmaxExecution.cpp @@ -87,8 +87,8 @@ ErrorCode SoftmaxExecution::onEncode(const std::vector &inputs, const std::vector mGlobalWorkSize{1, 1, 1}; if(inputBatch == outside && channel == inputChannels && inside == inputWidth * inputHeight){ mAxis = 1; - mGlobalWorkSize = {(uint32_t)(localSize), (uint32_t)outputWidth, (uint32_t)outputHeight * outputBatch}; localSize = getLocalSize(channelBlocks, MaxLocalSize); + mGlobalWorkSize = {(uint32_t)(localSize), (uint32_t)outputWidth, (uint32_t)outputHeight * outputBatch}; }else if(inputBatch * inputChannels == outside && channel == inputHeight && inside == inputWidth){ mAxis = 2; mGlobalWorkSize = {(uint32_t)(localSize), (uint32_t)channelBlocks*outputWidth, (uint32_t)outputBatch}; diff --git a/source/backend/vulkan/image/backend/VulkanBackend.cpp b/source/backend/vulkan/image/backend/VulkanBackend.cpp index 0663ceba6..4be0fddb4 100644 --- a/source/backend/vulkan/image/backend/VulkanBackend.cpp +++ b/source/backend/vulkan/image/backend/VulkanBackend.cpp @@ -321,6 +321,9 @@ void VulkanBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTenso iter = mConverters.find(key); } mCmdBuffers.push_back(iter->second.second->get()); + if (TensorUtils::getDescribe(srcTensor)->isMutable == false) { + _finish(); + } } else if (dstTensor->host() != nullptr) { // gpu->host auto size = VulkanTensor::getAlignSize(srcTensor) * sizeof(float); diff --git a/source/backend/vulkan/image/compiler/AllShader.cpp b/source/backend/vulkan/image/compiler/AllShader.cpp index e9b7860f1..843d6fe37 100644 --- a/source/backend/vulkan/image/compiler/AllShader.cpp +++ b/source/backend/vulkan/image/compiler/AllShader.cpp @@ -19172,6 +19172,2066 @@ const unsigned char glsl_imageTonc4hw4_comp[] = { }; unsigned int glsl_imageTonc4hw4_comp_len = 2220; +const unsigned char glsl_binary_blit_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0x16, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x3e, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xb8, 0x01, 0x00, 0x00, 
0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, + 0x3e, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, + 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, + 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x43, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x43, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x31, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x43, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x64, 0x73, 0x74, 0x76, 0x69, 0x65, 0x77, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x73, 0x69, 0x7a, 0x65, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x86, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x88, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x3e, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x43, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x30, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x43, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x45, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x04, 0x00, 0x86, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x86, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x03, 0x00, 0x86, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x88, 0x00, 0x00, 0x00, 0x22, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0xa1, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, + 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x21, 0x00, 0x03, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x15, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 
0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x17, 0x00, 0x04, 0x00, 0x39, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x3d, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x3d, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x06, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x44, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x14, 0x00, 0x02, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, 0x85, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, 0x86, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x87, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x86, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, + 0x87, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xa0, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, + 0x3c, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x03, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x15, 0x01, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, + 0x3e, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x39, 0x00, 0x00, 0x00, + 0x40, 0x00, 0x00, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x4d, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x4b, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x4d, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x47, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x00, 0x00, 
0x45, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x53, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, + 0x52, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x60, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x55, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, + 0x60, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, 0x60, 0x00, 0x00, 0x00, + 0x54, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x7e, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xd7, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, + 0xd7, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xdf, 0x00, 0x00, 0x00, 0xdc, 0x00, 0x00, 0x00, 0x69, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe0, 0x00, 0x00, 0x00, + 0xda, 0x00, 0x00, 0x00, 0xdf, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe5, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0x64, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe6, 0x00, 0x00, 0x00, + 0xe0, 0x00, 0x00, 0x00, 0xe5, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe8, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xec, 0x00, 0x00, 0x00, 0xe6, 0x00, 0x00, 0x00, 0xe8, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x9e, 0x00, 0x00, 0x00, + 0x15, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x6e, 0x00, 0x00, 0x00, 0xec, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0x9f, 0x00, 0x00, 0x00, 0x9e, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x4d, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x4d, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_binary_blit_comp_len = 1760; + +const unsigned char glsl_binary_blit_ADD_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0xf6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 
0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, + 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, + 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x46, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x31, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x64, 0x73, 0x74, 0x76, 0x69, 0x65, 0x77, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x73, 0x69, 0x7a, 0x65, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x89, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, + 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x30, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, + 0x65, 0x72, 0x30, 0x00, 0x06, 0x00, 0x05, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x31, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x88, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x89, 0x00, 0x00, 
    [... remaining SPIR-V words of glsl_binary_blit_ADD_comp elided: descriptor-set/binding and member-offset decorations, type and constant declarations, and the compute-shader body, whose arithmetic instruction is OpFAdd (word 0x00050081) ...]
+};
+unsigned int glsl_binary_blit_ADD_comp_len = 2808;
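Note: each generated array is a complete SPIR-V module, so at runtime it only has to be handed to the Vulkan driver. The call site is not part of this diff; the sketch below is a hypothetical helper, assuming an already-created VkDevice and that the char array is 4-byte aligned (which compilers give these large static arrays in practice), showing how such a blob is typically wrapped into a VkShaderModule.

    #include <stdint.h>
    #include <vulkan/vulkan.h>

    extern const unsigned char glsl_binary_blit_ADD_comp[];
    extern unsigned int glsl_binary_blit_ADD_comp_len;

    /* Hypothetical helper: wrap the generated byte array in a VkShaderModule.
     * codeSize is given in bytes; pCode must point at 32-bit SPIR-V words
     * (2808 bytes = 702 words for every blit variant in this patch). */
    static VkShaderModule createBlitAddModule(VkDevice device) {
        VkShaderModuleCreateInfo info = {0};
        info.sType    = VK_STRUCTURE_TYPE_SHADER_MODULE_CREATE_INFO;
        info.codeSize = glsl_binary_blit_ADD_comp_len;
        info.pCode    = (const uint32_t*)glsl_binary_blit_ADD_comp;

        VkShaderModule module = VK_NULL_HANDLE;
        if (vkCreateShaderModule(device, &info, NULL, &module) != VK_SUCCESS) {
            return VK_NULL_HANDLE; /* caller decides how to report the failure */
        }
        return module;
    }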
+
+const unsigned char glsl_binary_blit_SUB_comp[] = {
    [... 2808 bytes of SPIR-V elided: a GLCompute module (entry point main, local_size_x = 256) using the uConstant parameter block {srcview0, srcview1, dstview, size} and the uOutput, uInput0, uInput1 storage buffers; identical to the ADD variant except that the arithmetic instruction is OpFSub (word 0x00050083) ...]
+};
+unsigned int glsl_binary_blit_SUB_comp_len = 2808;
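Note: the ADD and SUB modules above (and the MUL/DIV variants that follow) appear to be the same compiled shader with only that one arithmetic instruction swapped. A quick offline check of this, not part of the patch, is to compare two of the blobs word by word:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    extern const unsigned char glsl_binary_blit_ADD_comp[];
    extern const unsigned char glsl_binary_blit_SUB_comp[];
    extern unsigned int glsl_binary_blit_ADD_comp_len; /* every blit variant is 2808 bytes */

    /* Print each 32-bit word at which two equally sized SPIR-V blobs differ.
     * For ADD vs. SUB the expectation is a single differing word whose low
     * 16 bits change from 0x0081 (OpFAdd) to 0x0083 (OpFSub). */
    static void diffSpirvWords(const unsigned char* a, const unsigned char* b, unsigned int bytes) {
        for (unsigned int off = 0; off + 4 <= bytes; off += 4) {
            uint32_t wa, wb;
            memcpy(&wa, a + off, 4); /* memcpy avoids alignment assumptions */
            memcpy(&wb, b + off, 4);
            if (wa != wb) {
                printf("word %u: 0x%08x vs 0x%08x\n", off / 4, (unsigned)wa, (unsigned)wb);
            }
        }
    }

    int main(void) {
        diffSpirvWords(glsl_binary_blit_ADD_comp, glsl_binary_blit_SUB_comp,
                       glsl_binary_blit_ADD_comp_len);
        return 0;
    }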
+
+const unsigned char glsl_binary_blit_MUL_comp[] = {
    [... 2808 bytes of SPIR-V elided; same module with the arithmetic instruction replaced by OpFMul (word 0x00050085) ...]
+};
+unsigned int glsl_binary_blit_MUL_comp_len = 2808;
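Note: because the variants share one interface and differ only in the arithmetic instruction, selecting one at runtime reduces to a lookup by operation name. How MNN actually registers these arrays is not visible in this diff; the table below is only a hypothetical illustration of such a selection.

    #include <string.h>

    extern const unsigned char glsl_binary_blit_ADD_comp[]; extern unsigned int glsl_binary_blit_ADD_comp_len;
    extern const unsigned char glsl_binary_blit_SUB_comp[]; extern unsigned int glsl_binary_blit_SUB_comp_len;
    extern const unsigned char glsl_binary_blit_MUL_comp[]; extern unsigned int glsl_binary_blit_MUL_comp_len;
    extern const unsigned char glsl_binary_blit_DIV_comp[]; extern unsigned int glsl_binary_blit_DIV_comp_len;

    typedef struct {
        const char*          name;   /* shader key, e.g. "glsl_binary_blit_ADD_comp" */
        const unsigned char* spirv;  /* generated SPIR-V bytes */
        const unsigned int*  length; /* points at the matching *_len variable */
    } BlitShaderEntry;

    static const BlitShaderEntry gBlitShaders[] = {
        { "glsl_binary_blit_ADD_comp", glsl_binary_blit_ADD_comp, &glsl_binary_blit_ADD_comp_len },
        { "glsl_binary_blit_SUB_comp", glsl_binary_blit_SUB_comp, &glsl_binary_blit_SUB_comp_len },
        { "glsl_binary_blit_MUL_comp", glsl_binary_blit_MUL_comp, &glsl_binary_blit_MUL_comp_len },
        { "glsl_binary_blit_DIV_comp", glsl_binary_blit_DIV_comp, &glsl_binary_blit_DIV_comp_len },
    };

    /* Return the entry for a given shader name, or NULL when it is unknown. */
    static const BlitShaderEntry* findBlitShader(const char* name) {
        for (unsigned int i = 0; i < sizeof(gBlitShaders) / sizeof(gBlitShaders[0]); ++i) {
            if (strcmp(gBlitShaders[i].name, name) == 0) {
                return &gBlitShaders[i];
            }
        }
        return NULL;
    }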
+
+const unsigned char glsl_binary_blit_DIV_comp[] = {
    [... 2808 bytes of SPIR-V elided; same module with the arithmetic instruction replaced by OpFDiv (word 0x00050088) ...]
+};
+unsigned int glsl_binary_blit_DIV_comp_len = 2808;
0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x89, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, + 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x30, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, + 0x65, 0x72, 0x30, 0x00, 0x06, 0x00, 0x05, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x31, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x88, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8b, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x90, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x96, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 
0x00, 0x47, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x06, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x02, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x88, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x89, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x8a, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x8d, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x96, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x96, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x98, 0x00, 0x00, 
0x00, 0x02, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x98, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xa3, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, + 0x4d, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x54, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x58, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xa8, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xab, 0x00, 0x00, 0x00, + 0xa8, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb0, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x00, 0x00, + 0xab, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 
0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb6, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0xb1, 0x00, 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xbd, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, + 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc9, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, + 0xc9, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xcc, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, + 0xcc, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, + 0xcf, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xd2, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0xd2, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x73, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xdd, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xdf, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0xdf, 0x00, 0x00, 0x00, + 0x6c, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe3, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe5, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe8, 0x00, 0x00, 0x00, 0xe5, 0x00, 0x00, 0x00, + 0x67, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe9, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, 0xe8, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xeb, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, 0xe9, 0x00, 0x00, 0x00, + 0xeb, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x93, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0xbd, 0x00, 0x00, 
0x00, 0x3d, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x92, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x9d, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x07, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x1a, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x0e, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x92, 0x00, 0x00, 0x00, 0xa2, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, + 0x71, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0xa2, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x50, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x50, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_binary_blit_POW_comp_len = 2816; + +const unsigned char glsl_binary_blit_VMAX_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0xf6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, + 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, + 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x46, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x31, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x64, 0x73, 0x74, 0x76, 0x69, 0x65, 0x77, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x73, 0x69, 0x7a, 0x65, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x89, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 
0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, + 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x30, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, + 0x65, 0x72, 0x30, 0x00, 0x06, 0x00, 0x05, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x31, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x88, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8b, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x90, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x96, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x19, 
0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x06, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x02, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x88, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x89, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x8a, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x8d, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x96, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x96, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x98, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x98, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xa3, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, + 0x3f, 
0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, + 0x4d, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x54, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x58, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xa8, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xab, 0x00, 0x00, 0x00, + 0xa8, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb0, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x00, 0x00, + 0xab, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb6, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0xb1, 0x00, 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 
0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xbd, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, + 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc9, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, + 0xc9, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xcc, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, + 0xcc, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, + 0xcf, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xd2, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0xd2, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x73, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xdd, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xdf, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0xdf, 0x00, 0x00, 0x00, + 0x6c, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe3, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe5, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe8, 0x00, 0x00, 0x00, 0xe5, 0x00, 0x00, 0x00, + 0x67, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe9, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, 0xe8, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xeb, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, 0xe9, 0x00, 0x00, 0x00, + 0xeb, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x93, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0xbd, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x41, 
0x00, 0x06, 0x00, 0x92, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0x9d, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x07, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x28, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x0e, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x92, 0x00, 0x00, 0x00, 0xa2, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, + 0x71, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0xa2, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x50, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x50, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_binary_blit_VMAX_comp_len = 2816; + +const unsigned char glsl_binary_blit_VMIN_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0xf6, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, + 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, + 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x46, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x31, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x64, 0x73, 0x74, 0x76, 0x69, 0x65, 0x77, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x73, 0x69, 0x7a, 0x65, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x89, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, + 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 
0x00, + 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x30, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, + 0x65, 0x72, 0x30, 0x00, 0x06, 0x00, 0x05, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x31, 0x00, 0x47, 0x00, 0x04, 0x00, 0x41, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x46, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x48, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x88, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x89, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8b, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x90, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x96, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0xa4, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 
0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x06, 0x00, + 0x46, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x47, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x02, 0x00, 0x4d, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x81, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x88, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x89, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x8a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x89, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x8a, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x8d, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x8e, 0x00, 0x00, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x8f, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x96, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x96, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x98, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x98, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xa3, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, + 0x3f, 0x00, 0x00, 0x00, 0xa4, 0x00, 0x00, 0x00, 0xa3, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 
0x00, + 0x3d, 0x00, 0x04, 0x00, 0x3f, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x3c, 0x00, 0x00, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x42, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x43, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x4b, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, + 0x4d, 0x00, 0x00, 0x00, 0x4e, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x50, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0x50, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x54, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x49, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x56, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0x55, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x58, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x45, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x63, 0x00, 0x00, 0x00, 0x57, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x63, 0x00, 0x00, 0x00, + 0x57, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x74, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xa8, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xab, 0x00, 0x00, 0x00, + 0xa8, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb0, 0x00, 0x00, 0x00, 0xad, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x00, 0x00, + 0xab, 0x00, 0x00, 0x00, 0xb0, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb6, 0x00, 0x00, 0x00, 0xb3, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, + 0xb1, 0x00, 0x00, 0x00, 0xb6, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xbd, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0xb9, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x73, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 
0x00, + 0x48, 0x00, 0x00, 0x00, 0x6e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x7b, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, + 0x7c, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, 0xc1, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc9, 0x00, 0x00, 0x00, + 0xc6, 0x00, 0x00, 0x00, 0x6c, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, 0xc4, 0x00, 0x00, 0x00, + 0xc9, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xcc, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xcf, 0x00, 0x00, 0x00, + 0xcc, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xd0, 0x00, 0x00, 0x00, 0xca, 0x00, 0x00, 0x00, + 0xcf, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xd2, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0xd2, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x73, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x48, 0x00, 0x00, 0x00, + 0x81, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x83, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, 0x84, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xdd, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, 0x59, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xdf, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, 0xdf, 0x00, 0x00, 0x00, + 0x6c, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe3, 0x00, 0x00, 0x00, 0xdd, 0x00, 0x00, 0x00, 0xe2, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe5, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe8, 0x00, 0x00, 0x00, 0xe5, 0x00, 0x00, 0x00, + 0x67, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe9, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, 0xe8, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xeb, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, 0xe9, 0x00, 0x00, 0x00, + 0xeb, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x93, 0x00, 0x00, 0x00, 0x90, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, + 0xbd, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x93, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x92, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x99, 0x00, 0x00, 0x00, 0x71, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, 0x0f, 0x00, 0x00, 
0x00, + 0x9d, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x0c, 0x00, 0x07, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0xf5, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x25, 0x00, 0x00, 0x00, 0x95, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x0e, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, + 0xf5, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x92, 0x00, 0x00, 0x00, 0xa2, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x00, 0x00, + 0x71, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0xa2, 0x00, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x50, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x50, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_binary_blit_VMIN_comp_len = 2816; + +const unsigned char glsl_binary_blit_SQUDIFF_comp[] = { + 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, + 0xfe, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x06, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x4c, 0x53, 0x4c, 0x2e, 0x73, 0x74, 0x64, 0x2e, 0x34, 0x35, 0x30, + 0x00, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x03, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x0f, 0x00, 0x06, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x10, 0x00, 0x06, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x11, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x03, 0x00, 0x03, 0x00, 0x02, 0x00, 0x00, 0x00, + 0xb8, 0x01, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x6d, 0x61, 0x69, 0x6e, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x08, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x67, 0x6c, 0x5f, 0x47, 0x6c, 0x6f, 0x62, 0x61, + 0x6c, 0x49, 0x6e, 0x76, 0x6f, 0x63, 0x61, 0x74, 0x69, 0x6f, 0x6e, 0x49, + 0x44, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x63, 0x6f, 0x6e, 0x73, 0x74, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x30, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x06, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x73, 0x72, 0x63, 0x76, 0x69, 0x65, 0x77, 0x31, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x64, 0x73, 0x74, 0x76, 0x69, 0x65, 0x77, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x73, 0x69, 0x7a, 0x65, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x05, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x75, 0x43, 0x6f, 0x6e, 0x73, 0x74, 0x61, 0x6e, 0x74, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x06, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x73, 0x6f, 0x75, 0x72, + 0x63, 0x65, 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x05, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x75, 0x4f, 0x75, 0x74, 0x70, 0x75, 0x74, 0x00, + 0x05, 0x00, 0x05, 0x00, 0x92, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, + 0x42, 0x75, 0x66, 0x66, 0x65, 0x72, 0x00, 0x00, 0x06, 0x00, 0x05, 0x00, + 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, + 0x00, 0x00, 0x00, 0x00, 0x05, 0x00, 0x04, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x75, 0x49, 0x6e, 0x70, 0x75, 0x74, 0x30, 0x00, 0x05, 0x00, 0x05, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x64, 0x65, 0x73, 0x74, 
0x42, 0x75, 0x66, 0x66, + 0x65, 0x72, 0x30, 0x00, 0x06, 0x00, 0x05, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x64, 0x61, 0x74, 0x61, 0x00, 0x00, 0x00, 0x00, + 0x05, 0x00, 0x04, 0x00, 0x9d, 0x00, 0x00, 0x00, 0x75, 0x49, 0x6e, 0x70, + 0x75, 0x74, 0x31, 0x00, 0x47, 0x00, 0x04, 0x00, 0x45, 0x00, 0x00, 0x00, + 0x0b, 0x00, 0x00, 0x00, 0x1c, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, + 0x48, 0x00, 0x05, 0x00, 0x4a, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x23, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x30, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x8d, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x19, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x8d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x8d, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x8f, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x91, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x92, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x94, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x04, 0x00, 0x00, 0x00, 0x48, 0x00, 0x04, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x18, 0x00, 0x00, 0x00, 0x48, 0x00, 0x05, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x23, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x03, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x22, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x47, 0x00, 0x04, 0x00, + 0x9d, 0x00, 0x00, 0x00, 0x21, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x04, 0x00, 0xa8, 0x00, 0x00, 0x00, 0x0b, 0x00, 0x00, 0x00, + 0x19, 0x00, 0x00, 0x00, 0x13, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x21, 0x00, 0x03, 0x00, 0x03, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x16, 0x00, 0x03, 0x00, + 0x0e, 0x00, 0x00, 0x00, 0x20, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, + 0x15, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 
0x20, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x16, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, 0x2c, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x17, 0x00, 0x04, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x16, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x43, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x44, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x06, 0x00, + 0x4a, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x4b, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x4a, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x4b, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x4d, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x14, 0x00, 0x02, 0x00, 0x51, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x2b, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x77, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x07, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x85, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x1d, 0x00, 0x03, 0x00, 0x8c, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x03, 0x00, 0x8d, 0x00, 0x00, 0x00, 0x8c, 0x00, 0x00, 0x00, + 0x20, 0x00, 0x04, 0x00, 0x8e, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x8d, 0x00, 0x00, 0x00, 0x3b, 0x00, 0x04, 0x00, 0x8e, 0x00, 0x00, 0x00, + 0x8f, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x91, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x92, 0x00, 0x00, 0x00, 0x91, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x93, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x92, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x93, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, 0x96, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1d, 0x00, 0x03, 0x00, + 0x9a, 0x00, 0x00, 0x00, 0x0e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x03, 0x00, + 0x9b, 0x00, 0x00, 0x00, 0x9a, 0x00, 0x00, 0x00, 0x20, 0x00, 0x04, 0x00, + 0x9c, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x9b, 0x00, 0x00, 0x00, + 0x3b, 0x00, 0x04, 0x00, 0x9c, 0x00, 0x00, 0x00, 0x9d, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x2b, 0x00, 0x04, 0x00, 0x16, 0x00, 0x00, 0x00, + 0xa7, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x2c, 0x00, 0x06, 0x00, + 0x43, 0x00, 0x00, 0x00, 0xa8, 0x00, 0x00, 0x00, 0xa7, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x1e, 0x00, 0x00, 0x00, 0x36, 0x00, 0x05, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x05, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x43, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, + 0x45, 0x00, 0x00, 0x00, 0x7c, 0x00, 0x04, 0x00, 0x40, 0x00, 0x00, 0x00, + 0x47, 0x00, 0x00, 0x00, 0x46, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, 
0x47, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0x4f, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x4d, 0x00, 0x00, 0x00, + 0x2c, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, 0x4f, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x05, 0x00, + 0x51, 0x00, 0x00, 0x00, 0x52, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x50, 0x00, 0x00, 0x00, 0xf7, 0x00, 0x03, 0x00, 0x54, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xfa, 0x00, 0x04, 0x00, 0x52, 0x00, 0x00, 0x00, + 0x53, 0x00, 0x00, 0x00, 0x54, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, + 0x53, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x4e, 0x00, 0x00, 0x00, + 0x58, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x4d, 0x00, 0x00, 0x00, + 0x1e, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x58, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x4e, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x4d, 0x00, 0x00, 0x00, 0x25, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x5a, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x59, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x5d, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, + 0x5c, 0x00, 0x00, 0x00, 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0x67, 0x00, 0x00, 0x00, 0x49, 0x00, 0x00, 0x00, 0x5c, 0x00, 0x00, 0x00, + 0x8b, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0x67, 0x00, 0x00, 0x00, 0x5b, 0x00, 0x00, 0x00, 0x87, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x67, 0x00, 0x00, 0x00, + 0x5b, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, 0x77, 0x00, 0x00, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x78, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xac, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xaf, 0x00, 0x00, 0x00, + 0xac, 0x00, 0x00, 0x00, 0x5d, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xb4, 0x00, 0x00, 0x00, 0xb1, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xb5, 0x00, 0x00, 0x00, + 0xaf, 0x00, 0x00, 0x00, 0xb4, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x02, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xba, 0x00, 0x00, 0x00, 0xb7, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, + 0xb5, 0x00, 0x00, 0x00, 0xba, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, 0x79, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xc1, 0x00, 0x00, 0x00, 0xbb, 0x00, 0x00, 0x00, 0xbd, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x05, 0x00, 0x77, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, + 0x4c, 0x00, 0x00, 0x00, 0x72, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, + 0x07, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x7f, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, 0xc5, 0x00, 0x00, 0x00, + 0x5d, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xca, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xcd, 0x00, 0x00, 0x00, + 0xca, 0x00, 0x00, 0x00, 0x70, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, 0xc8, 0x00, 0x00, 0x00, + 0xcd, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, + 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xd3, 0x00, 0x00, 0x00, + 0xd0, 0x00, 0x00, 0x00, 0x6b, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xd4, 0x00, 0x00, 0x00, 0xce, 0x00, 0x00, 0x00, + 0xd3, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xd6, 0x00, 0x00, 0x00, 0x80, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, + 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, + 0xd4, 0x00, 0x00, 0x00, 0xd6, 0x00, 0x00, 0x00, 0x41, 0x00, 0x05, 0x00, + 0x77, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, 0x4c, 0x00, 0x00, 0x00, + 0x85, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x07, 0x00, 0x00, 0x00, + 0x88, 0x00, 0x00, 0x00, 0x87, 0x00, 0x00, 0x00, 0x51, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, 0x88, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe1, 0x00, 0x00, 0x00, 0xde, 0x00, 0x00, 0x00, 0x5d, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, + 0x88, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xe6, 0x00, 0x00, 0x00, 0xe3, 0x00, 0x00, 0x00, + 0x70, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xe7, 0x00, 0x00, 0x00, 0xe1, 0x00, 0x00, 0x00, 0xe6, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xe9, 0x00, 0x00, 0x00, + 0x88, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x84, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xec, 0x00, 0x00, 0x00, 0xe9, 0x00, 0x00, 0x00, + 0x6b, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, + 0xed, 0x00, 0x00, 0x00, 0xe7, 0x00, 0x00, 0x00, 0xec, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x06, 0x00, 0x00, 0x00, 0xef, 0x00, 0x00, 0x00, + 0x88, 0x00, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x80, 0x00, 0x05, 0x00, + 0x06, 0x00, 0x00, 0x00, 0xf3, 0x00, 0x00, 0x00, 0xed, 0x00, 0x00, 0x00, + 0xef, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, 0x96, 0x00, 0x00, 0x00, + 0x97, 0x00, 0x00, 0x00, 0x94, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, + 0xc1, 0x00, 0x00, 0x00, 0x3d, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, + 0x98, 0x00, 0x00, 0x00, 0x97, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, + 0x98, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, 0x98, 0x00, 0x00, 0x00, + 0x41, 0x00, 0x06, 0x00, 0x96, 0x00, 0x00, 0x00, 0x9f, 0x00, 0x00, 0x00, + 0x9d, 0x00, 0x00, 0x00, 0x75, 0x00, 0x00, 0x00, 0xda, 0x00, 0x00, 0x00, + 0x3d, 0x00, 0x04, 0x00, 0x0e, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, + 0x9f, 0x00, 0x00, 0x00, 0x50, 0x00, 0x07, 0x00, 0x0f, 0x00, 0x00, 0x00, + 0xa1, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, + 0xa0, 0x00, 0x00, 0x00, 0xa0, 0x00, 0x00, 0x00, 0x83, 0x00, 0x05, 0x00, + 0x0f, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x00, 0x00, 0x99, 0x00, 0x00, 0x00, + 0xa1, 0x00, 0x00, 0x00, 0x85, 0x00, 0x05, 0x00, 
0x0f, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x00, 0x00, + 0x51, 0x00, 0x05, 0x00, 0x0e, 0x00, 0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x41, 0x00, 0x06, 0x00, + 0x96, 0x00, 0x00, 0x00, 0xa6, 0x00, 0x00, 0x00, 0x8f, 0x00, 0x00, 0x00, + 0x75, 0x00, 0x00, 0x00, 0xf3, 0x00, 0x00, 0x00, 0x3e, 0x00, 0x03, 0x00, + 0xa6, 0x00, 0x00, 0x00, 0xa5, 0x00, 0x00, 0x00, 0xf9, 0x00, 0x02, 0x00, + 0x54, 0x00, 0x00, 0x00, 0xf8, 0x00, 0x02, 0x00, 0x54, 0x00, 0x00, 0x00, + 0xfd, 0x00, 0x01, 0x00, 0x38, 0x00, 0x01, 0x00 +}; +unsigned int glsl_binary_blit_SQUDIFF_comp_len = 2828; + const unsigned char glsl_matmul_output_comp[] = { 0x03, 0x02, 0x23, 0x07, 0x00, 0x00, 0x01, 0x00, 0x0b, 0x00, 0x08, 0x00, 0xb1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x11, 0x00, 0x02, 0x00, diff --git a/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp b/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp index 5575d39eb..915ca987b 100644 --- a/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp +++ b/source/backend/vulkan/image/compiler/VulkanShaderMap.cpp @@ -94,6 +94,15 @@ mMaps.insert(std::make_pair("glsl_col2Im_RELU_comp", std::make_pair(glsl_col2Im_ mMaps.insert(std::make_pair("glsl_col2Im_RELU6_comp", std::make_pair(glsl_col2Im_RELU6_comp,glsl_col2Im_RELU6_comp_len))); mMaps.insert(std::make_pair("glsl_nc4hw4toimage_comp", std::make_pair(glsl_nc4hw4toimage_comp,glsl_nc4hw4toimage_comp_len))); mMaps.insert(std::make_pair("glsl_imageTonc4hw4_comp", std::make_pair(glsl_imageTonc4hw4_comp,glsl_imageTonc4hw4_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_comp", std::make_pair(glsl_binary_blit_comp,glsl_binary_blit_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_ADD_comp", std::make_pair(glsl_binary_blit_ADD_comp,glsl_binary_blit_ADD_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_SUB_comp", std::make_pair(glsl_binary_blit_SUB_comp,glsl_binary_blit_SUB_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_MUL_comp", std::make_pair(glsl_binary_blit_MUL_comp,glsl_binary_blit_MUL_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_DIV_comp", std::make_pair(glsl_binary_blit_DIV_comp,glsl_binary_blit_DIV_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_POW_comp", std::make_pair(glsl_binary_blit_POW_comp,glsl_binary_blit_POW_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_VMAX_comp", std::make_pair(glsl_binary_blit_VMAX_comp,glsl_binary_blit_VMAX_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_VMIN_comp", std::make_pair(glsl_binary_blit_VMIN_comp,glsl_binary_blit_VMIN_comp_len))); +mMaps.insert(std::make_pair("glsl_binary_blit_SQUDIFF_comp", std::make_pair(glsl_binary_blit_SQUDIFF_comp,glsl_binary_blit_SQUDIFF_comp_len))); mMaps.insert(std::make_pair("glsl_matmul_output_comp", std::make_pair(glsl_matmul_output_comp,glsl_matmul_output_comp_len))); mMaps.insert(std::make_pair("glsl_matmul_output_BIAS_comp", std::make_pair(glsl_matmul_output_BIAS_comp,glsl_matmul_output_BIAS_comp_len))); mMaps.insert(std::make_pair("glsl_matmul_output_TRANSPOSE_comp", std::make_pair(glsl_matmul_output_TRANSPOSE_comp,glsl_matmul_output_TRANSPOSE_comp_len))); diff --git a/source/backend/vulkan/image/execution/VulkanLoop.cpp b/source/backend/vulkan/image/execution/VulkanLoop.cpp new file mode 100644 index 000000000..fc86cc58a --- /dev/null +++ b/source/backend/vulkan/image/execution/VulkanLoop.cpp @@ -0,0 +1,232 @@ +#include "VulkanLoop.hpp" +#include "VulkanBinary.hpp" + 
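// Illustrative sketch (not from the patch): the VulkanShaderMap entries added above key each
// fused binary-blit variant as "glsl_binary_blit_<MID>_comp" and store a
// (blob pointer, byte length) pair for the embedded SPIR-V. The standalone snippet below only
// mirrors that naming and lookup convention; the map, the dummy blob and main() are assumptions
// for illustration, not MNN's pipeline-creation API.
#include <cstdio>
#include <map>
#include <string>
#include <utility>

using ShaderBlob = std::pair<const unsigned char*, unsigned int>;

static const unsigned char kDummySpirv[] = {0x03, 0x02, 0x23, 0x07}; // SPIR-V magic number only

int main() {
    std::map<std::string, ShaderBlob> shaderMap;
    shaderMap.emplace("glsl_binary_blit_ADD_comp", ShaderBlob{kDummySpirv, sizeof(kDummySpirv)});

    // "ADD" corresponds to what getMidName(op) returns for BinaryOpOperation_ADD below.
    std::string mid = "ADD";
    auto it = shaderMap.find("glsl_binary_blit_" + mid + "_comp");
    if (it != shaderMap.end()) {
        std::printf("found %s (%u bytes)\n", it->first.c_str(), it->second.second);
    }
    return 0;
}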
+namespace MNN { + +std::string getMidName(const Op* op) { + std::string mid = ""; + if (op->type() == OpType_Eltwise) { + if (op->main_as_Eltwise()->coeff() != nullptr) { + // Don't support + return ""; + } + switch (op->main_as_Eltwise()->type()) { + case EltwiseType_SUB: + mid = "SUB"; + break; + case EltwiseType_MAXIMUM: + mid = "VMAX"; + break; + case EltwiseType_PROD: + mid = "MUL"; + break; + case EltwiseType_SUM: + mid = "ADD"; + break; + default: + break; + } + } else if (op->type() == OpType_BinaryOp) { + switch (op->main_as_BinaryOp()->opType()) { + case BinaryOpOperation_ADD: + mid = "ADD"; + break; + case BinaryOpOperation_SUB: + mid = "SUB"; + break; + case BinaryOpOperation_MAXIMUM: + mid = "VMAX"; + break; + case BinaryOpOperation_MINIMUM: + mid = "VMIN"; + break; + case BinaryOpOperation_MUL: + mid = "MUL"; + break; + case BinaryOpOperation_POW: + mid = "POW"; + break; + case BinaryOpOperation_SquaredDifference: + mid = "SQUDIFF"; + break; + case BinaryOpOperation_DIV: + case BinaryOpOperation_REALDIV: + mid = "DIV"; + break; + default: + break; + } + } + return mid; +} + +static void _setTensorStack(std::vector& result, const std::vector& inputs, const std::vector& outputs, const LoopParam* loop) { + if (loop->inputIndexes() != nullptr) { + for (int i=0; iinputIndexes()->size(); ++i) { + result[loop->inputIndexes()->data()[i]] = inputs[i]; + } + } + for (int i=0; ioutputIndexes()->size(); ++i) { + result[loop->outputIndexes()->data()[i]] = outputs[i]; + } +} + +struct BinaryBroadCastInfo { + ivec4 srcview0; + ivec4 srcview1; + ivec4 dstview; + ivec4 size; +}; + +class VulkanBinaryBroadCast : public VulkanBasicExecution { +public: + VulkanBinaryBroadCast(const LoopParam* loop, Backend *bn, bool isInt) : VulkanBasicExecution(bn) { + mLoop = loop; + auto vkbackend = static_cast(bn); + + std::vector types{ + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, + }; + + std::string shaderName = "glsl_binary_blit_" + getMidName(mLoop->commands()->GetAs(0)->op()) + "_comp"; + + mLoopPipeline = vkbackend->getPipeline(shaderName, types); + mDescriptorSet.reset(mLoopPipeline->createSet()); + + mGpuLoopParam.reset(new VulkanBuffer(vkbackend->getMemoryPool(), false, sizeof(BinaryBroadCastInfo), nullptr, VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT)); + mTensors.resize(mLoop->tensorNumber()); + } + + virtual ~VulkanBinaryBroadCast() = default; + + virtual ErrorCode onEncode(const std::vector& inputs, const std::vector& outputs, + const VulkanCommandPool::Buffer* cmdBuffer) override { + _setTensorStack(mTensors, inputs, outputs, mLoop); + auto cmd = mLoop->commands()->GetAs(0); + auto size = cmd->size()->data(); + auto vkBn = static_cast(backend()); + auto srcStride0 = cmd->view()->GetAs(1)->stride()->data(); + auto srcStride1 = cmd->view()->GetAs(2)->stride()->data(); + auto dstStride = cmd->view()->GetAs(0)->stride()->data(); + int totalSize = size[0] * size[1] * size[2]; + auto param = reinterpret_cast(mGpuLoopParam->map()); + for (int i=0; i<3; ++i) { + param->size[i] = size[i]; + param->srcview0[i] = srcStride0[i]; + param->srcview1[i] = srcStride1[i]; + param->dstview[i] = dstStride[i]; + } + param->srcview0[3] = cmd->view()->GetAs(1)->offset(); + param->srcview1[3] = cmd->view()->GetAs(2)->offset(); + param->dstview[3] = cmd->view()->GetAs(0)->offset(); + param->size[3] = size[0] * size[1] * size[2]; + mGpuLoopParam->unmap(); + auto output = mTensors[cmd->indexes()->data()[0]]; + auto input0 = 
mTensors[cmd->indexes()->data()[1]]; + auto input1 = mTensors[cmd->indexes()->data()[2]]; + + { + int bufferSizeSource0 = sizeof(float); + for (int i=0; idimensions(); ++i) { + bufferSizeSource0 *= input0->length(i); + } + mInput0.buffer.reset(new VulkanBuffer(vkBn->getDynamicMemoryPool(), false, bufferSizeSource0, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT)); + mInput0.convert.reset(new VulkanImageConverter(vkBn)); + } + { + int bufferSizeSource1 = sizeof(float); + for (int i=0; idimensions(); ++i) { + bufferSizeSource1 *= input1->length(i); + } + mInput1.buffer.reset(new VulkanBuffer(vkBn->getDynamicMemoryPool(), false, bufferSizeSource1, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT)); + mInput1.convert.reset(new VulkanImageConverter(vkBn)); + } + { + int bufferSizeOutput = sizeof(float); + for (int i=0; idimensions(); ++i) { + bufferSizeOutput *= output->length(i); + } + mOutput.buffer.reset(new VulkanBuffer(vkBn->getDynamicMemoryPool(), false, bufferSizeOutput, nullptr, VK_BUFFER_USAGE_STORAGE_BUFFER_BIT)); + mOutput.convert.reset(new VulkanImageConverter(vkBn)); + } + mInput0.convert->encodeTensorToBuffer(input0, mInput0.buffer->buffer(), mInput0.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(inputs[0]), cmdBuffer); + mInput1.convert->encodeTensorToBuffer(input1, mInput1.buffer->buffer(), mInput1.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(inputs[1]), cmdBuffer); + + mDescriptorSet->writeBuffer(mOutput.buffer->buffer(), 0, mOutput.buffer->size()); + mDescriptorSet->writeBuffer(mInput0.buffer->buffer(), 1, mInput0.buffer->size()); + mDescriptorSet->writeBuffer(mInput1.buffer->buffer(), 2, mInput1.buffer->size()); + mDescriptorSet->writeBuffer(mGpuLoopParam->buffer(), 3, mGpuLoopParam->size()); + + cmdBuffer->barrierSource(mInput0.buffer->buffer(), 0, mInput0.buffer->size()); + cmdBuffer->barrierSource(mInput1.buffer->buffer(), 0, mInput1.buffer->size()); + + mLoopPipeline->bind(cmdBuffer->get(), mDescriptorSet->get()); + vkCmdDispatch(cmdBuffer->get(), UP_DIV(totalSize,256), 1, 1); + + cmdBuffer->barrierSource(mOutput.buffer->buffer(), 0, mOutput.buffer->size()); + mOutput.convert->encodeBufferToTensor(mOutput.buffer->buffer(), output, mOutput.buffer->size(), 0, VulkanImageConverter::getTensorLinearFormat(outputs[0]), cmdBuffer); + mInput0.buffer->release(); + mInput1.buffer->release(); + mOutput.buffer->release(); + + return NO_ERROR; + } + +private: + const LoopParam* mLoop; + const VulkanPipeline* mLoopPipeline; + std::shared_ptr mGpuLoopParam; + std::shared_ptr mDescriptorSet; + std::vector mTensors; + struct ConvertInfo { + std::shared_ptr convert; + std::shared_ptr buffer; + }; + ConvertInfo mInput0; + ConvertInfo mInput1; + ConvertInfo mOutput; +}; + +VulkanBasicExecution* VulkanLoop::create(const std::vector& inputs, const std::vector& outputs, const Op* op, Backend* bn) { + auto loop = op->main_as_LoopParam(); + if (nullptr == loop || loop->commands() == nullptr) { + return nullptr; + } + if (nullptr != loop->initCommand()) { + return nullptr; + } + + if (1 == loop->commands()->size()) { + auto cmd = loop->commands()->GetAs(0); + auto subop = cmd->op(); + if (OpType_BinaryOp == subop->type() && cmd->fuse() < 0 && 1 == loop->loopNumber()) { + std::string shaderMidName = getMidName(loop->commands()->GetAs(0)->op()); + if (shaderMidName.empty()) { + return nullptr; + } + bool isInt = inputs[1]->getType().code == halide_type_int; + if (isInt) { + return nullptr; + } + return new VulkanBinaryBroadCast(loop, bn, isInt); + } + } + return nullptr; +} 
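// Illustrative sketch (not from the patch): a CPU reference of the addressing scheme that
// VulkanBinaryBroadCast passes to the binary_blit shader. Each flat element index is decomposed
// into an (x, y, z) position, extended with w = 1, and dotted with a per-tensor ivec4 view whose
// first three components are strides and whose fourth is the element offset, matching the
// BinaryBroadCastInfo layout filled in onEncode above. The tensor shapes and view values here are
// made-up examples, not data taken from the test suite.
#include <array>
#include <cstdio>
#include <vector>

using ivec4 = std::array<int, 4>;

static int dot4(const ivec4& a, const ivec4& b) {
    return a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + a[3] * b[3];
}

int main() {
    const ivec4 size     = {2, 3, 4, 2 * 3 * 4}; // x, y, z extents and total element count
    const ivec4 dstview  = {12, 4, 1, 0};        // contiguous 2x3x4 output, offset 0
    const ivec4 srcview0 = {12, 4, 1, 0};        // full-size first input
    const ivec4 srcview1 = {0, 4, 1, 0};         // second input broadcast along x (stride 0)

    std::vector<float> in0(24, 1.0f), in1(12, 2.0f), out(24, 0.0f);
    for (int i = 0; i < size[3]; ++i) {
        ivec4 pos;
        pos[0] = i / (size[1] * size[2]);
        const int sub = i % (size[1] * size[2]);
        pos[1] = sub / size[2];
        pos[2] = sub % size[2];
        pos[3] = 1; // multiplies the offset component of each view
        // ADD variant; the shader selects SUB/MUL/DIV/POW/VMAX/VMIN/SQUDIFF via compile-time macros.
        out[dot4(dstview, pos)] = in0[dot4(srcview0, pos)] + in1[dot4(srcview1, pos)];
    }
    std::printf("out[0]=%.1f out[23]=%.1f\n", out[0], out[23]);
    return 0;
}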
+ +class VulkanLoopCreator : public VulkanBackend::Creator { +public: + virtual VulkanBasicExecution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* bn) const override { + return VulkanLoop::create(inputs, outputs, op, bn); + } +}; + +static bool gResistor = []() { + VulkanBackend::addCreator(OpType_While, new VulkanLoopCreator); + return true; +}(); + +} \ No newline at end of file diff --git a/source/backend/vulkan/image/execution/VulkanLoop.hpp b/source/backend/vulkan/image/execution/VulkanLoop.hpp new file mode 100644 index 000000000..2ee524aad --- /dev/null +++ b/source/backend/vulkan/image/execution/VulkanLoop.hpp @@ -0,0 +1,24 @@ +// +// VulkanLoop.cpp +// MNN +// +// Created by MNN on 2024/10/18. +// Copyright © 2018, Alibaba Group Holding Limited +// + +#ifndef VulkanLoop_hpp +#define VulkanLoop_hpp + +#include "VulkanBasicExecution.hpp" +#include "VulkanImageConverter.hpp" + +namespace MNN { + +class VulkanLoop { +public: + static VulkanBasicExecution* create(const std::vector& inputs, const std::vector& outputs, const Op* op, Backend* bn); +}; + +} // namespace MNN + +#endif /* VulkanLoop_hpp */ diff --git a/source/backend/vulkan/image/execution/VulkanRaster.cpp b/source/backend/vulkan/image/execution/VulkanRaster.cpp index d5cc81b07..a5f93fd30 100644 --- a/source/backend/vulkan/image/execution/VulkanRaster.cpp +++ b/source/backend/vulkan/image/execution/VulkanRaster.cpp @@ -244,83 +244,8 @@ class VulkanRasterCreator : public VulkanBackend::Creator { }; -class VulkanLoop : public VulkanBasicExecution { -public: - VulkanLoop(Backend *bn, const LoopParam* loop) : VulkanBasicExecution(bn) { - mLoop = loop; - } - virtual ~VulkanLoop() = default; - - virtual ErrorCode onEncode(const std::vector &inputs, const std::vector &outputs, - const VulkanCommandPool::Buffer *cmdBuffer) override { - mExecutions.clear(); - auto cmd = mLoop->commands()->GetAs(0); - std::vector tensors(mLoop->tensorNumber()); - for (int i=0; iinputIndexes()->size(); ++i) { - tensors[mLoop->inputIndexes()->data()[i]] = inputs[i]; - } - for (int i=0; ioutputIndexes()->size(); ++i) { - tensors[mLoop->outputIndexes()->data()[i]] = outputs[i]; - } - auto C = tensors[cmd->indexes()->data()[0]]; - auto A = tensors[cmd->indexes()->data()[1]]; - auto B = tensors[cmd->indexes()->data()[2]]; - for (int i=0; iloopNumber(); ++i) { - VulkanMatMul::MatMulInfo matInfo; - matInfo.e = cmd->size()->data()[0]; - matInfo.l = cmd->size()->data()[1]; - matInfo.h = cmd->size()->data()[2]; - matInfo.offsetC = cmd->view()->GetAs(0)->offset() + i * cmd->steps()->data()[0]; - matInfo.offsetA = cmd->view()->GetAs(1)->offset() + i * cmd->steps()->data()[1]; - matInfo.offsetB = cmd->view()->GetAs(2)->offset() + i * cmd->steps()->data()[2]; - ::memcpy(matInfo.aStride, cmd->view()->GetAs(1)->stride()->data(), 3 * sizeof(int)); - ::memcpy(matInfo.bStride, cmd->view()->GetAs(2)->stride()->data(), 3 * sizeof(int)); - ::memcpy(matInfo.cStride, cmd->view()->GetAs(0)->stride()->data(), 3 * sizeof(int)); - Tensor* bias = nullptr; - if (cmd->indexes()->size() > 3) { - bias = tensors[cmd->indexes()->data()[3]]; - matInfo.offsetBias = cmd->view()->GetAs(3)->offset() + i * cmd->steps()->data()[3]; - } - auto matmulOp = cmd->op(); - std::shared_ptr exe(new VulkanMatMul(matmulOp->main_as_MatMul()->transposeA(), matmulOp->main_as_MatMul()->transposeB(), backend())); - auto matmulExe = static_cast(exe.get()); - bool res = true; - if (bias == nullptr) { - res = matmulExe->encode({{A, B}}, {C}, cmdBuffer, matInfo); - } 
else { - res = matmulExe->encode({{A, B, bias}}, {C}, cmdBuffer, matInfo); - } - if (!res) { - return NOT_SUPPORT; - } - mExecutions.emplace_back(exe); - } - return NO_ERROR; - } -private: - std::vector> mExecutions; - const LoopParam* mLoop; -}; - -class VulkanLoopCreator : public VulkanBackend::Creator { -public: - virtual VulkanBasicExecution* onCreate(const std::vector& inputs, const std::vector& outputs, const MNN::Op* op, Backend* bn) const override { - auto loop = op->main_as_LoopParam(); - if (1 != loop->commands()->size()) { - return nullptr; - } - auto cmd = loop->commands()->GetAs(0); - if (OpType_MatMul != cmd->op()->type()) { - return nullptr; - } - return new VulkanLoop(bn, loop); - } -}; - - static bool gResistor = []() { VulkanBackend::addCreator(OpType_Raster, new VulkanRasterCreator); -// VulkanBackend::addCreator(OpType_While, new VulkanLoopCreator); return true; }(); diff --git a/source/backend/vulkan/image/execution/glsl/binary_blit.comp b/source/backend/vulkan/image/execution/glsl/binary_blit.comp new file mode 100644 index 000000000..348318bca --- /dev/null +++ b/source/backend/vulkan/image/execution/glsl/binary_blit.comp @@ -0,0 +1,84 @@ +#version 440 core +#ifdef C4 +#define FLOAT vec4 +#else +#define FLOAT float +#endif + +#define OUTPUT_TYPE float + +#define FLOAT4 vec4 +layout(std430) buffer; +layout(set=0, binding=0) writeonly buffer sourceBuffer{ + OUTPUT_TYPE data[]; +} uOutput; + + +layout(set=0, binding=1) readonly buffer destBuffer{ + FLOAT data[]; +} uInput0; + +layout(set=0, binding=2) readonly buffer destBuffer0{ + FLOAT data[]; +} uInput1; + +layout(set=0, binding=3) uniform constBuffer{ + ivec4 srcview0; + ivec4 srcview1; + ivec4 dstview; + ivec4 size; +} uConstant; + +layout (local_size_x = 256, local_size_y = 1, local_size_z = 1) in; + +int computeVec4dot(ivec4 a, ivec4 b) { + return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; +} + +FLOAT4 binary(FLOAT4 x0, FLOAT4 x1) { + FLOAT4 value; +#ifdef ADD + value = x0 + x1; +#endif +#ifdef SUB + value = x0 - x1; +#endif +#ifdef MUL + value = x0 * x1; +#endif +#ifdef DIV + value = x0 / x1; +#endif +#ifdef POW + value = pow(x0, x1); +#endif +#ifdef VMAX + value = max(x0, x1); +#endif +#ifdef VMIN + value = min(x0, x1); +#endif +#ifdef SQUDIFF + value = (x0 - x1) * (x0 - x1); +#endif + return value; +} + +void main() +{ + ivec3 posTmp = ivec3(gl_GlobalInvocationID); + if (posTmp.x < uConstant.size.w) + { + ivec4 pos; + pos.x = posTmp.x / (uConstant.size.y * uConstant.size.z); + int subIndex = posTmp.x % (uConstant.size.y * uConstant.size.z); + pos.z = subIndex % uConstant.size.z; + pos.y = subIndex / uConstant.size.z; + pos.w = 1; + int s0 = computeVec4dot(uConstant.srcview0, pos); + int s1 = computeVec4dot(uConstant.srcview1, pos); + int d = computeVec4dot(uConstant.dstview, pos); + + uOutput.data[d] = OUTPUT_TYPE(binary(FLOAT4(uInput0.data[s0]), FLOAT4(uInput1.data[s1])).x); + } +} diff --git a/source/backend/vulkan/image/execution/glsl/macro.json b/source/backend/vulkan/image/execution/glsl/macro.json index 3c964eff1..bfc289616 100644 --- a/source/backend/vulkan/image/execution/glsl/macro.json +++ b/source/backend/vulkan/image/execution/glsl/macro.json @@ -107,5 +107,15 @@ ], "resizeNearest.comp":[ "NEAREST_ROUND" + ], + "binary_blit.comp":[ + "ADD", + "SUB", + "MUL", + "DIV", + "POW", + "VMAX", + "VMIN", + "SQUDIFF" ] } diff --git a/source/backend/vulkan/image/shaders/AllShader.h b/source/backend/vulkan/image/shaders/AllShader.h index 4297b2ced..5f52602a3 100644 --- 
a/source/backend/vulkan/image/shaders/AllShader.h +++ b/source/backend/vulkan/image/shaders/AllShader.h @@ -182,6 +182,24 @@ extern const unsigned char glsl_nc4hw4toimage_comp[]; extern unsigned int glsl_nc4hw4toimage_comp_len; extern const unsigned char glsl_imageTonc4hw4_comp[]; extern unsigned int glsl_imageTonc4hw4_comp_len; +extern const unsigned char glsl_binary_blit_comp[]; +extern unsigned int glsl_binary_blit_comp_len; +extern const unsigned char glsl_binary_blit_ADD_comp[]; +extern unsigned int glsl_binary_blit_ADD_comp_len; +extern const unsigned char glsl_binary_blit_SUB_comp[]; +extern unsigned int glsl_binary_blit_SUB_comp_len; +extern const unsigned char glsl_binary_blit_MUL_comp[]; +extern unsigned int glsl_binary_blit_MUL_comp_len; +extern const unsigned char glsl_binary_blit_DIV_comp[]; +extern unsigned int glsl_binary_blit_DIV_comp_len; +extern const unsigned char glsl_binary_blit_POW_comp[]; +extern unsigned int glsl_binary_blit_POW_comp_len; +extern const unsigned char glsl_binary_blit_VMAX_comp[]; +extern unsigned int glsl_binary_blit_VMAX_comp_len; +extern const unsigned char glsl_binary_blit_VMIN_comp[]; +extern unsigned int glsl_binary_blit_VMIN_comp_len; +extern const unsigned char glsl_binary_blit_SQUDIFF_comp[]; +extern unsigned int glsl_binary_blit_SQUDIFF_comp_len; extern const unsigned char glsl_matmul_output_comp[]; extern unsigned int glsl_matmul_output_comp_len; extern const unsigned char glsl_matmul_output_BIAS_comp[]; diff --git a/source/backend/vulkan/runtime/vulkan_wrapper.cpp b/source/backend/vulkan/runtime/vulkan_wrapper.cpp index 88e54f373..1da2353a3 100644 --- a/source/backend/vulkan/runtime/vulkan_wrapper.cpp +++ b/source/backend/vulkan/runtime/vulkan_wrapper.cpp @@ -21,7 +21,7 @@ int InitVulkan(void) { #include #include #include -#ifdef WIN32 +#ifdef _WIN32 #include #include #define MNN_DLSYM(lib, func_name) GetProcAddress(reinterpret_cast(lib), func_name) @@ -32,7 +32,7 @@ int InitVulkan(void) { int InitVulkanOnce(void) { const std::vector gVulkan_library_paths = { -#ifdef WIN32 +#ifdef _WIN32 "vulkan-1.dll", #endif "libvulkan.so", @@ -42,7 +42,7 @@ int InitVulkanOnce(void) { }; void* libvulkan = nullptr; for (const auto& s : gVulkan_library_paths) { -#ifdef WIN32 +#ifdef _WIN32 libvulkan = LoadLibrary(s.c_str()); #else libvulkan = dlopen(s.c_str(), RTLD_NOW | RTLD_LOCAL); @@ -52,7 +52,7 @@ int InitVulkanOnce(void) { } } if (nullptr == libvulkan) { -#ifdef WIN32 +#ifdef _WIN32 MNN_ERROR("Load vulkan library error\n"); #else auto message = dlerror(); diff --git a/source/core/Backend.hpp b/source/core/Backend.hpp index 2e0b2548b..d83b2ba10 100644 --- a/source/core/Backend.hpp +++ b/source/core/Backend.hpp @@ -52,6 +52,9 @@ struct RuntimeHint { std::string midMemoryPath; std::string weightMemoryPath; + + // op encoder number for once commit + int encorderNumForCommit = 10; }; /** abstract backend */ class Backend : public NonCopyable { diff --git a/source/core/Session.cpp b/source/core/Session.cpp index 48148ab28..b8354d53a 100644 --- a/source/core/Session.cpp +++ b/source/core/Session.cpp @@ -97,6 +97,8 @@ void Session::ModeGroup::setHint(Interpreter::HintMode mode, int hint) { case Interpreter::KVCACHE_SIZE_LIMIT: runtimeHint.kvcacheSizeLimit = hint; break; + case Interpreter::OP_ENCODER_NUMBER_FOR_COMMIT: + runtimeHint.encorderNumForCommit = hint; default: break; } diff --git a/source/geometry/GeometryReshape.cpp b/source/geometry/GeometryReshape.cpp index a8d466437..88d98a24c 100644 --- a/source/geometry/GeometryReshape.cpp +++ 
b/source/geometry/GeometryReshape.cpp @@ -107,7 +107,7 @@ static void _create() { std::shared_ptr _comp(new SingleGeometryComputer); GeometryComputer::registerGeometryComputer(_comp, {OpType_Squeeze, OpType_Unsqueeze, OpType_ExpandDims, OpType_Flatten, OpType_QuantizedReshape}); std::shared_ptr copycomp(new CopyGeometryComputer); - GeometryComputer::registerGeometryComputer(comp, {OpType_Identity}); + GeometryComputer::registerGeometryComputer(copycomp, {OpType_Identity}); } REGISTER_GEOMETRY(GeometryReshape, _create); diff --git a/source/math/Vec.hpp b/source/math/Vec.hpp index 5e28154d5..d7636e074 100644 --- a/source/math/Vec.hpp +++ b/source/math/Vec.hpp @@ -234,11 +234,7 @@ struct Vec { value = std::move(lr.value); } float operator[](size_t i) { -#if defined(_MSC_VER) - return value.n128_i32[i]; -#else return value[i]; -#endif } static VecType load(const float* addr) { VecType v = { (int32x4_t)(vld1q_f32(addr)) }; @@ -400,11 +396,7 @@ struct Vec { value = std::move(lr.value); } float operator[](size_t i) { -#if defined(_MSC_VER) - return value.n128_f32[i]; -#else return value[i]; -#endif } static VecType load(const float* addr) { VecType v = { vld1q_f32(addr) }; diff --git a/test.sh b/test.sh index 52bd6c6d3..1ad2ab3ab 100755 --- a/test.sh +++ b/test.sh @@ -206,6 +206,7 @@ android_static_build() { -DMNN_OPENCL=true \ -DMNN_BUILD_MINI=true \ -DMNN_SUPPORT_BF16=true \ + -DMNN_ARM82=false \ -DMNN_OPENCL=true \ -DMNN_SUPPORT_TRANSFORMER_FUSE=ON \ -DNATIVE_LIBRARY_OUTPUT=. -DNATIVE_INCLUDE_OUTPUT=. @@ -657,7 +658,7 @@ android_test() { # 1. build Android32 mkdir build_32 pushd build_32 - ../build_32.sh -DMNN_BUILD_TRAIN=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DMNN_OPENCL=true -DMNN_LOW_MEMORY=ON -DMNN_SUPPORT_TRANSFORMER_FUSE=ON + ../build_32.sh -DMNN_BUILD_TRAIN=OFF -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DMNN_OPENCL=true -DMNN_LOW_MEMORY=ON -DMNN_SUPPORT_TRANSFORMER_FUSE=ON -DMNN_ARM82=OFF android32_build_wrong=$[$? 
> 0] mnn32_size=$(ls -lh libMNN.so | awk '{print $5}') expr32_size=$(ls -lh libMNN_Express.so | awk '{print $5}') diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 896089e45..f128825a6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -18,7 +18,7 @@ endif() add_executable(run_test.out ${Files}) target_link_libraries(run_test.out ${MNN_DEPS}) -if (WIN32) +if (MSVC) target_compile_options(run_test.out PRIVATE /bigobj) endif() if (MNN_SUPPORT_BF16) diff --git a/test/MNNTestSuite.h b/test/MNNTestSuite.h index a7ea46e4a..f961ec7cb 100644 --- a/test/MNNTestSuite.h +++ b/test/MNNTestSuite.h @@ -80,6 +80,12 @@ class MNNTestSuite { * @return shared instance */ static MNNTestSuite* get(); + struct Status { + int precision = 0; + int memory = 0; + int power = 0; + }; + Status pStaus; public: /** diff --git a/test/TestUtils.cpp b/test/TestUtils.cpp index 6719871c3..cfd10eba9 100644 --- a/test/TestUtils.cpp +++ b/test/TestUtils.cpp @@ -15,6 +15,7 @@ #include #include #include "core/TensorUtils.hpp" +#include "RuntimeAttr.hpp" using namespace MNN; @@ -86,4 +87,8 @@ float convertFP32ToFP16(float fp32Value) { } +MNNForwardType getCurrentType() { + auto attr = MNN::Express::ExecutorScope::Current()->getAttr(); + return attr->firstType; +} diff --git a/test/TestUtils.h b/test/TestUtils.h index 6a5dd2c20..ac14eb732 100644 --- a/test/TestUtils.h +++ b/test/TestUtils.h @@ -102,6 +102,7 @@ float convertFP32ToFP16(float fp32Value); inline float keepFP32Precision(float fp32Value) { return fp32Value; } +MNNForwardType getCurrentType(); using ConvertFP32 = float(*)(float fp32Value); diff --git a/test/expr/ExecutorResetTest.cpp b/test/expr/ExecutorResetTest.cpp index 60050c454..c6d5c4249 100644 --- a/test/expr/ExecutorResetTest.cpp +++ b/test/expr/ExecutorResetTest.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #include "MNNTestSuite.h" using namespace MNN::Express; @@ -67,6 +68,23 @@ class ExecutorResetTest : public MNNTestCase { x->setName("Prob"); return x; } + bool _runmbv1() { + auto x = _Input({1, 3, 224, 224}, NC4HW4); + auto y = _mobileNetV1Expr(x); + auto buffer = Variable::save({y}); + y = nullptr;x=nullptr; + MNN::BackendConfig bnConfig; + auto exe = Executor::newExecutor(MNN_FORWARD_CPU, bnConfig, 1); + ExecutorScope scope(exe); + std::shared_ptr m(Module::load({"Input"}, {"Prob"}, (const uint8_t*)buffer.data(), buffer.size())); + x = _Input({1, 3, 224, 224}, NC4HW4); + x->writeMap(); + m->onForward({x}); + exe->setGlobalExecutorConfig(MNN_FORWARD_CPU, bnConfig, 4); + m->onForward({x}); + + return true; + } virtual bool run(int precision) { int numberThread = 0; @@ -104,6 +122,9 @@ class ExecutorResetTest : public MNNTestCase { FUNC_PRINT(1); return false; } + if (!_runmbv1()) { + return false; + } return true; } }; diff --git a/test/main.cpp b/test/main.cpp index 862ecdc0c..a23ef8eac 100644 --- a/test/main.cpp +++ b/test/main.cpp @@ -67,6 +67,8 @@ int main(int argc, char* argv[]) { } MNN::Express::ExecutorScope scope(exe); exe->setGlobalExecutorConfig(type, config, thread); + MNNTestSuite::get()->pStaus.memory = memory; + MNNTestSuite::get()->pStaus.precision = precision; if (argc > 1) { auto name = argv[1]; if (strcmp(name, "all") == 0) { diff --git a/test/op/BroadcastToTest.cpp b/test/op/BroadcastToTest.cpp index 0be0b54bd..1920468d7 100644 --- a/test/op/BroadcastToTest.cpp +++ b/test/op/BroadcastToTest.cpp @@ -20,6 +20,14 @@ class BroadcastToTest : public MNNTestCase { virtual ~BroadcastToTest() = default; virtual bool run(int precision) { + bool resultNCHW = 
testDimensionformat(NCHW, precision); + bool resultNHWC = testDimensionformat(NHWC, precision); + + return (resultNCHW && resultNHWC); + } + +private: + bool testDimensionformat(Dimensionformat dimensionFormat, int precision) { { const float tensorData[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; const int shapeData[] = {2, 3, 2, 2}; @@ -28,8 +36,8 @@ class BroadcastToTest : public MNNTestCase { 1.0, 2.0, 1.0, 2.0, 3.0, 4.0, 3.0, 4.0, 5.0, 6.0, 5.0, 6.0, }; - auto tensor = _Const(tensorData, {1, 3, 1, 2}, NHWC, halide_type_of()); - auto shape = _Const(shapeData, {4}, NHWC, halide_type_of()); + auto tensor = _Const(tensorData, {1, 3, 1, 2}, dimensionFormat, halide_type_of()); + auto shape = _Const(shapeData, {4}, dimensionFormat, halide_type_of()); auto result = _BroadcastTo(tensor, shape); const int size = result->getInfo()->size; @@ -51,8 +59,8 @@ class BroadcastToTest : public MNNTestCase { const int shapeData[] = {3, 3}; const float expectedData[] = {1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0}; - auto tensor = _Const(tensorData, {1, 3}, NHWC, halide_type_of()); - auto shape = _Const(shapeData, {2}, NHWC, halide_type_of()); + auto tensor = _Const(tensorData, {1, 3}, dimensionFormat, halide_type_of()); + auto shape = _Const(shapeData, {2}, dimensionFormat, halide_type_of()); auto result = _BroadcastTo(tensor, shape); const int size = result->getInfo()->size; @@ -74,8 +82,8 @@ class BroadcastToTest : public MNNTestCase { const int shapeData[] = {3, 3}; const float expectedData[] = {1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0}; - auto tensor = _Const(tensorData, {3, 1}, NHWC, halide_type_of()); - auto shape = _Const(shapeData, {2}, NHWC, halide_type_of()); + auto tensor = _Const(tensorData, {3, 1}, dimensionFormat, halide_type_of()); + auto shape = _Const(shapeData, {2}, dimensionFormat, halide_type_of()); auto result = _BroadcastTo(tensor, shape); const int size = result->getInfo()->size; @@ -98,8 +106,8 @@ class BroadcastToTest : public MNNTestCase { const float expectedData[] = {1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0, 1.0, 2.0}; - auto tensor = _Const(tensorData, {1, 1, 1, 2}, NHWC, halide_type_of()); - auto shape = _Const(shapeData, {4}, NHWC, halide_type_of()); + auto tensor = _Const(tensorData, {1, 1, 1, 2}, dimensionFormat, halide_type_of()); + auto shape = _Const(shapeData, {4}, dimensionFormat, halide_type_of()); auto result = _BroadcastTo(tensor, shape); const int size = result->getInfo()->size; @@ -122,8 +130,8 @@ class BroadcastToTest : public MNNTestCase { const float expectedData[] = {1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0}; - auto tensor = _Const(tensorData, {1, 3, 1, 1}, NHWC, halide_type_of()); - auto shape = _Const(shapeData, {4}, NHWC, halide_type_of()); + auto tensor = _Const(tensorData, {1, 3, 1, 1}, dimensionFormat, halide_type_of()); + auto shape = _Const(shapeData, {4}, dimensionFormat, halide_type_of()); auto result = _BroadcastTo(tensor, shape); const int size = result->getInfo()->size; @@ -145,8 +153,8 @@ class BroadcastToTest : public MNNTestCase { const int shapeData[] = {1, 1, 1, 1}; const float expectedData[] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}; - auto tensor = _Const(tensorData, {1, 3, 1, 2}, NHWC, halide_type_of()); - auto shape = _Const(shapeData, {4}, NHWC, halide_type_of()); + auto tensor = _Const(tensorData, {1, 3, 1, 2}, dimensionFormat, halide_type_of()); + auto shape = _Const(shapeData, {4}, dimensionFormat, 
halide_type_of()); auto result = _BroadcastTo(tensor, shape); const int size = result->getInfo()->size; @@ -169,8 +177,8 @@ class BroadcastToTest : public MNNTestCase { const float expectedData[] = {1.0, 1.0, 2.0, 2.0, 3.0, 3.0, 1.0, 1.0, 2.0, 2.0, 3.0, 3.0}; - auto tensor = _Const(tensorData, {3, 1}, NHWC, halide_type_of()); - auto shape = _Const(shapeData, {3}, NHWC, halide_type_of()); + auto tensor = _Const(tensorData, {3, 1}, dimensionFormat, halide_type_of()); + auto shape = _Const(shapeData, {3}, dimensionFormat, halide_type_of()); auto result = _BroadcastTo(tensor, shape); const int size = result->getInfo()->size; @@ -194,15 +202,23 @@ class BinaryBroadcastTest : public MNNTestCase { virtual ~BinaryBroadcastTest() = default; virtual bool run(int precision) { - auto X = _Input({2, 5, 2}, NHWC, halide_type_of()); + bool resultNCHW = testDimensionFormat(NCHW, precision); + bool resultNHWC = testDimensionFormat(NHWC, precision); + + return (resultNCHW && resultNHWC); + } + +private: + bool testDimensionFormat(Dimensionformat dimensionFormat, int precision) { + auto X = _Input({2, 5, 2}, dimensionFormat, halide_type_of()); X->setName("X"); - auto y0 = _Input({}, NHWC, halide_type_of()); + auto y0 = _Input({}, dimensionFormat, halide_type_of()); y0->writeMap()[0] = 1.0f; - auto y1 = _Input({1, 1, 2}, NHWC, halide_type_of()); + auto y1 = _Input({1, 1, 2}, dimensionFormat, halide_type_of()); y1->writeMap()[0] = 1.0f; y1->writeMap()[1] = 2.0f; - auto y2 = _Input({2, 1, 2}, NHWC, halide_type_of()); + auto y2 = _Input({2, 1, 2}, dimensionFormat, halide_type_of()); y2->writeMap()[0] = 1.0f; y2->writeMap()[1] = 2.0f; y2->writeMap()[2] = 3.0f; @@ -232,7 +248,7 @@ class BinaryBroadcastTest : public MNNTestCase { std::shared_ptr module(Module::load(std::vector{"X"}, std::vector{"z0", "z1", "z2", "z3"}, bufferOutput, sizeOutput)); // First { - auto x0 = _Input({2, 1, 2}, NHWC, halide_type_of()); + auto x0 = _Input({2, 1, 2}, dimensionFormat, halide_type_of()); auto size = x0->getInfo()->size; auto ptr = x0->writeMap(); for (int i=0; i()); + auto x0 = _Input({2, 5, 2}, dimensionFormat, halide_type_of()); auto size = x0->getInfo()->size; auto ptr = x0->writeMap(); for (int i=0; i()); + auto x0 = _Input({2, 3, 2}, dimensionFormat, halide_type_of()); auto size = x0->getInfo()->size; auto ptr = x0->writeMap(); for (int i=0; i padMap = { {PadMode_CAFFE, CAFFE}, {PadMode_VALID, VALID}, {PadMode_SAME, SAME}}; @@ -361,29 +361,6 @@ class ConvolutionCommonTest : public MNNTestCase { auto floatData = (float)(data % 255) / 255.0f; inputData.push_back(floatData); } - - if (debug) { - std::vector printCache(inputData.size()); - for (int i = 0; i < inputData.size(); ++i) { - printCache[i] = FP32Converter[precision](inputData[i]); - } - MNN_PRINT("input:"); - formatMatrix(printCache.data(), {batch, ic, ih, iw}); - printCache.resize(weightData.size()); - for (int i = 0; i < weightData.size(); ++i) { - printCache[i] = FP32Converter[precision](weightData[i]); - } - MNN_PRINT("weight:"); - formatMatrix(printCache.data(), {oc, ic, kh, kw}); - printCache.resize(biasData.size()); - for (int i = 0; i < biasData.size(); ++i) { - printCache[i] = FP32Converter[precision](biasData[i]); - } - MNN_PRINT("bias:"); - formatMatrix(printCache.data(), {oc}); - - } - reference_conv2d(inputData, weightData, biasData, outputData, outputDataSeparateBias, batch, ic, oc, ih, iw, mode, pad_h, pad_w, kh, kw, stride, dilation, group, FP32Converter[precision]); if (outputData.size() == 0) { @@ -416,54 +393,65 @@ class ConvolutionCommonTest : 
public MNNTestCase { } } // Single Conv - auto output = _Conv(std::move(weightData), std::move(biasData), input, {ic, oc}, {kw, kh}, padMap[mode], - {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, false, false, sparseAlgo, sparseBlockOC, mSparse); - - // difference below 0.5% relative error is considered correct. - auto outputPtr = output->readMap(); - - if (debug) { - MNN_PRINT("\ndata NCHW shape:"); - printDims(input->getInfo()->dim); - MNN_PRINT("\nweight OIHW shape:"); - printDims({oc, ic, kh, kw}); - MNN_PRINT("\noutput NCHW shape:"); - printDims(output->getInfo()->dim); - MNN_PRINT("\nexpected output:"); - formatMatrix(outputData.data(), output->getInfo()->dim); - MNN_PRINT("\nexpected output 2:"); - formatMatrix(outputDataSeparateBias.data(), output->getInfo()->dim); - MNN_PRINT("\nreal output:"); - formatMatrix(outputPtr, output->getInfo()->dim); - } - // when using low precision, im2col or strassen convolution error rate to reference value is about 1e-4, winograd has larger error rate. + std::vector> activations = { + {false, false}, + }; + if (testRelu) { + activations = { + {false, false}, + {true, false}, + {false, true} + }; + } float errorScale = precision <= MNN::BackendConfig::Precision_High ? 1 : 100; // winograd error in 16-bits is relatively large - if (!checkVectorByRelativeError(outputPtr, outputData.data(), outputDataSeparateBias.data(), outputData.size(), 0.001 * errorScale)) { - MNN_PRINT("precision:%d, expect:\t expect2:\t real:\t\n", precision); - for (int i = 0; i < outputData.size(); ++i) - { - MNN_PRINT("%f\t, %f\t, %f\n", outputData[i],outputDataSeparateBias[i], outputPtr[i]); + for (auto activation : activations) { + auto newWeight = weightData; + auto newBias = biasData; + auto toutputData = outputData; + auto toutputBias = outputDataSeparateBias; + float maxV = -10000.0f; + float minV = 10000.0f; + if (activation.first) { + for (auto& t : toutputData) { + maxV = ALIMAX(maxV, t); + minV = ALIMIN(minV, t); + t = ALIMAX(0.0f, t); + } + for (auto& t : toutputBias) { + maxV = ALIMAX(maxV, t); + minV = ALIMIN(minV, t); + t = ALIMAX(0.0f, t); + } } - MNN_ERROR("%s(%s) test failed!\n", test_op_name.c_str(), device_name.c_str()); - return false; - } + if (activation.second) { + for (auto& t : toutputData) { + t = ALIMAX(0.0f, t); + t = ALIMIN(6.0f, t); + } + for (auto& t : toutputBias) { + t = ALIMAX(0.0f, t); + t = ALIMIN(6.0f, t); + } + } + auto output = _Conv(std::move(newWeight), std::move(newBias), input, {ic, oc}, {kw, kh}, padMap[mode], + {stride, stride}, {dilation, dilation}, group, {pad_w, pad_h}, activation.first, activation.second, sparseAlgo, sparseBlockOC, mSparse); + // difference below 0.5% relative error is considered correct. + auto outputPtr = output->readMap(); - if (mBenchSpeed) { - int oh = output->getInfo()->dim[2], ow = output->getInfo()->dim[3]; - input.fix(VARP::INPUT); - MNN::Timer _t; - const int LOOP = 20; - for (int i = 0; i < LOOP; ++i) { - input->writeMap(); - output->readMap(); + // when using low precision, im2col or strassen convolution error rate to reference value is about 1e-4, winograd has larger error rate. 
+ + if (!checkVectorByRelativeError(outputPtr, toutputData.data(), toutputBias.data(), toutputData.size(), 0.001 * errorScale)) { + MNN_PRINT("precision:%d, expect:\t expect2:\t real:\t\n", precision); + for (int i = 0; i < toutputData.size(); ++i) + { + MNN_PRINT("%f\t, %f\t, %f\n", toutputData[i],toutputBias[i], outputPtr[i]); + } + MNN_ERROR("%s(%s) test failed!\n", test_op_name.c_str(), device_name.c_str()); + return false; } - auto time = (float)_t.durationInUs() / 1000.0f; - MNN_PRINT("kernel=(%dx%d) input=(1x%dx%dx%d) output=(1x%dx%dx%d) stride=(%dx%d), avg time = %f\n", - kh, kw, ic, ih, iw, oc, oh, ow, stride, stride, 1.0 * time / LOOP); } - return true; } }; @@ -905,7 +893,7 @@ class DepthwiseConvolutionTest : public ConvolutionCommonTest { // depthwise <==> group == outputChannel bool succ = ConvolutionCommonTest().test( type, device_name, "DepthwiseConv2D", b, ic, oc, ish, isw, PadMode_CAFFE, - p, p, kh, kw, s, d, oc, precision); + p, p, kh, kw, s, d, oc, precision, MNN::SparseAlgo_RANDOM, 1, false, true); if (!succ) { MNN_ERROR( "Error for dw oc=%d, ic=%d, ih=%d, iw = %d, kw=%d,kh=%d,d=%d,s=%d,p=%d\n", oc, diff --git a/test/op/GridSample3DTest.cpp b/test/op/GridSample3DTest.cpp new file mode 100644 index 000000000..0b96dc494 --- /dev/null +++ b/test/op/GridSample3DTest.cpp @@ -0,0 +1,241 @@ +// +// GridSampler3DTest.cpp +// MNNTests +// +// Created by MNN on 2021/03/11. +// Copyright © 2018, Alibaba Group Holding Limited +// +#include +#include +#include + +#include +#include +#include +#include "MNNTestSuite.h" +#include "TestUtils.h" + +using namespace MNN::Express; + +static float getPosition(float x, int range, bool alignCorners, GridSamplePaddingMode paddingMode) { + if (paddingMode == GRID_SAMPLE_PADDING_REFLECTION) { + // if x is on the left side of -1.0, move it to the right side of 1.0 + if (x < -1.0f) { + x = (x + ::ceil(1 - x) * 4); + } + // reflect + if (x > 1.0f) { + float l = (x - 1.0f); + int reflectionNum = ::floor(l / 2.0); + float offset = (l - reflectionNum * 2.0f); + x = (reflectionNum % 2 == 0) ? (1 - offset) : (-1.0f + offset); + } + } + + float a = alignCorners ? 1.0f : 0.0f; + float b = alignCorners ? 
0.0f : 1.0f; + return (((1 + x) * (range - a) - b) / 2.0f); +} + +static int CLAMP(int v, int min, int max) { + if ((v) < min) { + (v) = min; + } else if ((v) > max) { + (v) = max; + } + return v; +} + +static float sample(int d, int h, int w, const float *buffer, int depth, int height, int width, GridSamplePaddingMode paddingMode) { + if (h < 0 || h >= height || w < 0 || w >= width || d < 0 || d >= depth) { + if (paddingMode == GRID_SAMPLE_PADDING_ZEROS) { + return 0.0f; + } + // Clearly, CLAMP is the right way to go for GridSamplePaddingMode_BORDER + // For GridSamplePaddingMode_REFLECTION, since we have reflected the values into (-1, 1), + // the leftover reflections degrade to GridSamplePaddingMode_BORDER + h = CLAMP(h, 0, height-1); + w = CLAMP(w, 0, width-1); + d = CLAMP(d, 0, depth-1); + } + + return buffer[d * height * width + h * width + w]; +} + +static float interpolate(float d, float h, float w, const float *buffer, int depth, int height, int width, InterpolationMethod mode, + GridSamplePaddingMode paddingMode) { + if (mode == NEAREST) { + int nh = ::floor(h+0.5f); + int nw = ::floor(w+0.5f); + int nd = ::floor(d+0.5f); + return sample(nd, nh, nw, buffer, depth, height, width, paddingMode); + } + + // mode == GridSampleMode_BILINEAR + int d0 = ::floor(d); + int d1 = ::ceil(d); + int h0 = ::floor(h); + int h1 = ::ceil(h); + int w0 = ::floor(w); + int w1 = ::ceil(w); + float fx2 = w - w0; + float fx1 = 1.0f - fx2; + float fy2 = h - h0; + float fy1 = 1.0f - fy2; + float fz2 = d - d0; + float fz1 = 1.0f - fz2; + + float i000 = sample(d0, h0, w0, buffer, depth, height, width, paddingMode); + float i001 = sample(d0, h0, w1, buffer, depth, height, width, paddingMode); + float i010 = sample(d0, h1, w0, buffer, depth, height, width, paddingMode); + float i011 = sample(d0, h1, w1, buffer, depth, height, width, paddingMode); + float i100 = sample(d1, h0, w0, buffer, depth, height, width, paddingMode); + float i101 = sample(d1, h0, w1, buffer, depth, height, width, paddingMode); + float i110 = sample(d1, h1, w0, buffer, depth, height, width, paddingMode); + float i111 = sample(d1, h1, w1, buffer, depth, height, width, paddingMode); + + float i00 = ((i000) * fx1 + (i001) * fx2); + float i01 = ((i010) * fx1 + (i011) * fx2); + float i10 = ((i100) * fx1 + (i101) * fx2); + float i11 = ((i110) * fx1 + (i111) * fx2); + + float i0 = i00 * fy1 + i01 * fy2; + float i1 = i10 * fy1 + i11 * fy2; + + return ((i0 * fz1) + (i1 * fz2)); +} + +static void reference_grid_sample(const float *inputPtr, const float *gridPtr, std::vector &output, + int batch, int inDepth, int inHeight, int inWidth, int outDepth, int outHeight, int outWidth, int channel, + InterpolationMethod mode, GridSamplePaddingMode paddingMode, bool alignCorners) { + output.resize(batch * outHeight * outWidth * channel * outDepth); + + float *outputPtr = output.data(); + for (auto b = 0; b < batch; ++b) { + const float *_inputPtr = inputPtr + b * inDepth * inHeight * inWidth * channel; + const float *_gridPtr = gridPtr + b * outDepth * outHeight * outWidth * 3; + float *_outputPtr = outputPtr + b * outDepth * outHeight * outWidth * channel; + + for (auto c = 0; c < channel; ++c) { + auto __inputPtr = _inputPtr + c * inDepth * inHeight * inWidth; + auto __outputPtr = _outputPtr + c * outDepth * outHeight * outWidth; + for (int d = 0; d < outDepth; ++d) { + for (auto h = 0; h < outHeight; ++h) { + auto __gridPtr = _gridPtr + (d * outWidth * outHeight + h * outWidth) * 3; + auto ___outputPtr = __outputPtr + d * outHeight * outWidth + h * 
outWidth; + + for (auto w = 0; w < outWidth; ++w) { + auto x = getPosition(__gridPtr[3 * w + 0], inWidth, alignCorners, paddingMode); + auto y = getPosition(__gridPtr[3 * w + 1], inHeight, alignCorners, paddingMode); + auto z = getPosition(__gridPtr[3 * w + 2], inDepth, alignCorners, paddingMode); + + ___outputPtr[w] = interpolate(z, y, x, __inputPtr, inDepth, inHeight, inWidth, mode, paddingMode); + } + } + } + } + } +} + + +class GridSample3DTest : public MNNTestCase { +public: + virtual ~GridSample3DTest() = default; + + virtual bool run(int precision) { + auto type = getCurrentType(); + + const std::vector> configs({ + {1, 3, 5, 10, 5, 10, 3, 5}, + {1, 62, 6, 10, 12, 20, 1, 2}, + {2, 64, 12, 20, 6, 6, 5, 1}, + {1, 3, 384, 640, 384, 640, 2, 2}, + }); + + for (auto config : configs) { + const int batch = config[0]; + const int depth = config[1]; + const int inHeight = config[2]; + const int inWidth = config[3]; + const int outHeight = config[4]; + const int outWidth = config[5]; + const int inDepth = config[6]; + const int outDepth = config[7]; + + std::vector originInputData(batch * depth * inHeight * inWidth * inDepth); + std::vector originGridData(batch * outHeight * outWidth * outDepth * 3); + + auto inputPtr = originInputData.data(); + auto gridPtr = originGridData.data(); + + std::random_device rd{}; + std::mt19937 gen{rd()}; + gen.seed(1024); + std::normal_distribution<> inputDist{0.0f, 1.0}; + std::normal_distribution<> gridDist{0.0f, 3.0f / outWidth}; + + for (int i = 0; i < batch * inHeight * inWidth * inDepth * depth; i++) { + inputPtr[i] = inputDist(gen); + } + for (int b = 0; b < batch; b++) { + for (int d=0; dwriteMap(), inputPtr, originInputData.size() * sizeof(float)); + ::memcpy(grid->writeMap(), gridPtr, originGridData.size() * sizeof(float)); + input = _Convert(input, NC4HW4); + + std::vector modes({BILINEAR}); + std::vector paddingModes({GRID_SAMPLE_PADDING_ZEROS, GRID_SAMPLE_PADDING_BORDER}); + std::vector alignCornersVec = {1, 0}; + std::vector expectedOutput; + for (auto mode : modes) { + for (auto paddingMode : paddingModes) { + for (auto alignCorners : alignCornersVec) { + reference_grid_sample(inputPtr, gridPtr, expectedOutput, + batch, inDepth, inHeight, inWidth, outDepth, outHeight, outWidth, depth, + mode, paddingMode, alignCorners); + auto expectedOutPtr = expectedOutput.data(); + + grid->unMap(); + input->unMap(); + + auto output = _GridSample(input, grid, mode, paddingMode, alignCorners); + output = _Convert(output, NCHW); + auto outputPtr = output->readMap(); +// MNN_PRINT("GridSamplerTest, mode: %d, pad: %d, align: %d\n", mode, paddingMode, alignCorners); + + if (mode == NEAREST) { + if (!checkVector(outputPtr, expectedOutPtr, expectedOutput.size(), 0.01)) { + MNN_ERROR("GridSampleTest NEAREST test %d-%d-%d-%d-%d failed pad mode: %d, align: %d!\n", config[0], config[1], config[2], config[3], config[4], paddingMode, alignCorners); + return false; + } + } else { + if (!checkVector(outputPtr, expectedOutPtr, expectedOutput.size(), 0.01)) { + MNN_ERROR("GridSampleTest BILINEAR test %d-%d-%d-%d-%d failed: pad mode: %d, align: %d!\n", config[0], config[1], config[2], config[3], config[4], paddingMode, alignCorners); + return false; + } + } + } + } + } + } + + return true; + } +}; + +MNNTestSuiteRegister(GridSample3DTest, "op/GridSample3D"); diff --git a/test/op/GridSampleTest.cpp b/test/op/GridSampleTest.cpp index 77fffdc25..d945969f4 100644 --- a/test/op/GridSampleTest.cpp +++ b/test/op/GridSampleTest.cpp @@ -1,5 +1,5 @@ // -// CropAndResizeTest.cpp +// 
GridSamplerTest.cpp // MNNTests // // Created by MNN on 2021/03/11. @@ -149,6 +149,7 @@ class GridSampleTest : public MNNTestCase { std::random_device rd{}; std::mt19937 gen{rd()}; + gen.seed(1024); std::normal_distribution<> inputDist{0.0f, 1.0}; std::normal_distribution<> gridDist{0.0f, 3.0f / outWidth}; @@ -172,7 +173,7 @@ class GridSampleTest : public MNNTestCase { input = _Convert(input, NC4HW4); std::vector modes({BILINEAR}); - std::vector paddingModes({GRID_SAMPLE_PADDING_ZEROS}); + std::vector paddingModes({GRID_SAMPLE_PADDING_ZEROS, GRID_SAMPLE_PADDING_BORDER}); std::vector alignCornersVec = {1, 0}; std::vector expectedOutput(batch * outHeight * outWidth * depth); for (auto mode : modes) { diff --git a/test/op/RasterTest.cpp b/test/op/RasterTest.cpp index 2dd10eb73..72b8baf7a 100644 --- a/test/op/RasterTest.cpp +++ b/test/op/RasterTest.cpp @@ -8,7 +8,6 @@ #include #include -#include "RuntimeAttr.hpp" #include "MNNTestSuite.h" #include "TestUtils.h" @@ -213,8 +212,8 @@ class ReduceBlitTest : public MNNTestCase { } virtual bool run(int precision) { // TODO: Other Backend Support Reduce Blit - auto attr = ExecutorScope::Current()->getAttr(); - if (attr->firstType != MNN_FORWARD_CPU) { + auto type = getCurrentType(); + if (type != MNN_FORWARD_CPU) { MNN_ERROR("Currently only cpu backend support reduce blit\n"); return true; } diff --git a/test/speed/HybridConvSpeedTest.cpp b/test/speed/HybridConvSpeedTest.cpp index 42968330d..fc7a56868 100644 --- a/test/speed/HybridConvSpeedTest.cpp +++ b/test/speed/HybridConvSpeedTest.cpp @@ -17,7 +17,7 @@ using namespace MNN; class HybridConvSpeedTestCommon : public MNNTestCase { protected: - static bool testKernel(std::string title, INTS inputShape, INTS kernel, INTS channel, INTS pad, INTS strides, INTS dilate, int batch = 1, int nbit = 8, int precision = 1, bool testSpeed = false) { + static bool testKernel(std::string title, INTS inputShape, INTS kernel, INTS channel, INTS pad, INTS strides, INTS dilate, int batch = 1, int nbit = 8, int precision = 1, bool testSpeed = false, int block = 0) { float fac = 1.23; int res = 10; float tail = 0.2; @@ -25,8 +25,12 @@ class HybridConvSpeedTestCommon : public MNNTestCase { int iw = inputShape[0], ih = inputShape[1]; std::vector bias(oc), biastest(oc), biasdup(oc); int area = kernel[0] * kernel[1]; + if (0 == block || ic % block != 0 || area > 1) { + block = area * ic; + } + int group = (area * ic) / block; std::vector weightFp32(oc * ic * area); - std::vector wScale(oc); + std::vector wScale(oc * group); float threshold = (float)(1 << (nbit - 1)) - 1.0f; float clampMin = -threshold; @@ -48,14 +52,23 @@ class HybridConvSpeedTestCommon : public MNNTestCase { ::memcpy(biastest.data(), bias.data(), oc * sizeof(float)); ::memcpy(biasdup.data(), bias.data(), oc * sizeof(float)); int kernel_size = ic * area; + auto newWeightFp32 = weightFp32; for (int k = 0; k < oc; ++k) { int beginIndex = k * kernel_size; - auto absMax = findAbsMax(weightFp32.data() + beginIndex, kernel_size); - wScale[k] = absMax / threshold; + for (int v=0; vgetInfo(); auto ow = yInfo->dim[3], oh = yInfo->dim[2]; #if defined (__aarch64__) && (precision == 2) @@ -67,18 +80,22 @@ class HybridConvSpeedTestCommon : public MNNTestCase { yfp32 = _Convert(yfp32, NCHW); auto yPtr = y->readMap(); auto tgPtr = yfp32->readMap(); - auto elesize = batch * oc * oh * ow; + auto elesize = yfp32->getInfo()->size; float limit = 0.1f; + bool correct = true; + float maxValue = 0.001f; + for (int i = 0; i < elesize; ++i) { + maxValue = fmaxf(maxValue, 
+        }
+
         for (int i = 0; i < elesize; ++i) {
             float targetValue = tgPtr[i], computeResult = yPtr[i];
             float diff = targetValue - computeResult;
-            float ratio = fabsf(diff) / fmax(targetValue, computeResult);
-            if (targetValue != 0 && computeResult != 0 && ratio > limit) {
-                MNN_PRINT("%d result Error ratio=%f: right=%f, error=%f\n", i, ratio, targetValue, computeResult);
-                return false;
-            } else if ((targetValue == 0 || computeResult == 0) && fabsf(diff) > limit) {
+            float ratio = fabsf(diff) / maxValue;
+            if (ratio > limit) {
                 MNN_PRINT("%d result Error ratio=%f: right=%f, error=%f\n", i, ratio, targetValue, computeResult);
-                return false;
+                correct = false;
+                break;
             }
         }
         if (testSpeed) {
@@ -93,8 +110,7 @@ class HybridConvSpeedTestCommon : public MNNTestCase {
             MNN_PRINT("%s input=(%dx%dx%dx%d) output=(%dx%dx%dx%d) avg time = %f\n", title.c_str(), batch, ic, 1, 1, batch, oc, 1, 1, 1.0 * time / LOOP);
         }
-
-        return true;
+        return correct;
     }
 };
 
@@ -102,30 +118,36 @@ class HybridConvSpeedInt8Test : public HybridConvSpeedTestCommon {
 public:
     virtual bool run(int precision) {
         INTS strides = {1, 1}, dilate = {1, 1}, pad = {0, 0}, inputShape = {1, 1}; // {w, h}
-        INTS channel0 = {2048, 512}; // {ic, co}
+        INTS channel0 = {4096, 4096}; // {ic, co}
         INTS channel1 = {1496, 256};
-        int batch[2] = {23, 13};
+        int batch[3] = {23, 13, 1};
+        std::vector<int> blocks = {0, 32, 128};
+        std::vector<int> kernels = {1, 1};
         std::vector<int> weightBits = {8, 4};
         bool lowmemory = true;
-        for (auto& bits : weightBits) {
-            MNN_PRINT("Test for %d bits\n", bits);
-            for (int n = 0; n < 2; ++n) {
-                auto res = testKernel("Low memory HybridConv test:", inputShape, kernels, channel0, pad, strides, dilate, batch[n], bits, precision, true);
-                if (!res) {
-                    MNN_ERROR("Error: low memory hybridConv when n=%d, ic=%d, oc=%d\n", batch[n], channel0[0], channel0[1]);
-                    return false;
+        int batchNum = sizeof(batch) / sizeof(int);
+        bool correct = true;
+        for (auto block : blocks) {
+            for (auto& bits : weightBits) {
+                MNN_PRINT("Test for %d bits, block=%d\n", bits, block);
+                for (int n = 0; n < batchNum; ++n) {
+                    auto res = testKernel("Low memory HybridConv test:", inputShape, kernels, channel0, pad, strides, dilate, batch[n], bits, precision, true, block);
+                    if (!res) {
+                        MNN_ERROR("Error: low memory hybridConv when n=%d, ic=%d, oc=%d\n", batch[n], channel0[0], channel0[1]);
+                        correct = false;
+                    }
                 }
-            }
-            for (int n = 0; n < 2; ++n) {
-                auto res = testKernel("Low memory HybridConv test:", inputShape, kernels, channel1, pad, strides, dilate, batch[n], bits, precision, true);
-                if (!res) {
-                    MNN_ERROR("Error: low memory hybridConv when n=%d, ic=%d, oc=%d\n", batch[n], channel1[0], channel1[1]);
-                    return false;
+                for (int n = 0; n < batchNum; ++n) {
+                    auto res = testKernel("Low memory HybridConv test:", inputShape, kernels, channel1, pad, strides, dilate, batch[n], bits, precision, true, block);
+                    if (!res) {
+                        MNN_ERROR("Error: low memory hybridConv when n=%d, ic=%d, oc=%d\n", batch[n], channel1[0], channel1[1]);
+                        correct = false;
+                    }
                 }
             }
         }
-        return true;
+        return correct;
     }
 };
 
@@ -170,20 +192,28 @@ class HybridConvInt8Test : public HybridConvSpeedTestCommon {
 class DenseConvInt8Test : public HybridConvSpeedTestCommon {
 public:
     virtual bool run(int precision) {
-        std::vector< std::vector<int>> channels = {{4, 256}, {2048, 256}, {1, 8}, {7, 9}};
+        std::vector< std::vector<int>> channels = {{4, 256}, {512, 128}, {1, 8}, {7, 9}};
         INTS strides = {1, 1}, dilate = {1, 3}, pad = {0, 3}, inputShape = {1, 2640}; // {w, h}
-        int batch[2] = {1, 13};
-        std::vector<int> kernels = {1, 3};
-        std::vector<int> weightBits = {8};
+        std::vector<int> batch = {1, 13};
+        std::vector<std::vector<int>> kernels = {{1, 1}, {1, 3}};
+        std::vector<int> weightBits = {4, 8};
         bool lowmemory = true;
         int n = 0;
         for (auto& bits : weightBits) {
-            for (int n = 0; n < 2; ++n) {
+            for (int n = 0; n < batch.size(); ++n) {
                 for (int i = 0; i < channels.size(); ++i) {
-                    auto res = testKernel("Low memory ConvInt8 with 1x3 kernel test:", inputShape, kernels, channels[i], pad, strides, dilate, batch[n], bits, precision);
-                    if (!res) {
-                        MNN_ERROR("Error: low memory ConvInt8 with 1x3 kernel when n=%d, ic=%d, oc=%d\n", batch[n], channels[i][0], channels[i][1]);
-                        return false;
+                    for (auto kernel : kernels) {
+                        std::vector<int> blocks = {0};
+                        if (kernel[0] == 1 && kernel[1] == 1) {
+                            blocks = {0, 32};
+                        }
+                        for (auto block : blocks) {
+                            auto res = testKernel("Low memory ConvInt8 with kernel test:", inputShape, kernel, channels[i], pad, strides, dilate, batch[n], bits, precision, false, block);
+                            if (!res) {
+                                MNN_ERROR("Error: low memory ConvInt8 with %dx%d kernel when n=%d, ic=%d, oc=%d, block=%d\n", kernel[0], kernel[1], batch[n], channels[i][0], channels[i][1], block);
+                                return false;
+                            }
+                        }
                     }
                 }
             }
         }
diff --git a/tools/converter/source/optimizer/onnxextra/OnnxRandomUniform.cpp b/tools/converter/source/optimizer/onnxextra/OnnxRandomUniform.cpp
index ed68db4d6..7ed348dc8 100644
--- a/tools/converter/source/optimizer/onnxextra/OnnxRandomUniform.cpp
+++ b/tools/converter/source/optimizer/onnxextra/OnnxRandomUniform.cpp
@@ -16,6 +16,11 @@ namespace Express {
 class OnnxRandomUniformTransform : public OnnxExtraManager::Transform {
 public:
     virtual EXPRP onExecute(EXPRP expr) const override {
+        static bool gInit = false;
+        if (!gInit) {
+            MNN_PRINT("The model has random OP: %s, can't check result with onnxruntime\n", expr->name().c_str());
+            gInit = true;
+        }
         auto op = expr->get();
         auto info = op->main_as_Extra();
         std::unique_ptr<OpT> randomUniform(new OpT);
diff --git a/tools/cpp/GetMNNInfo.cpp b/tools/cpp/GetMNNInfo.cpp
index 039ab9f79..376f265fb 100644
--- a/tools/cpp/GetMNNInfo.cpp
+++ b/tools/cpp/GetMNNInfo.cpp
@@ -105,6 +105,9 @@ int main(int argc, char *argv[]) {
     } else {
         MNN_PRINT("Model Version: %s \n", info->version.c_str());
     }
+    if (!info->bizCode.empty()) {
+        MNN_PRINT("Model bizCode: %s\n", info->bizCode.c_str());
+    }
     return 0;
 }
diff --git a/tools/cv/include/cv/imgproc/structural.hpp b/tools/cv/include/cv/imgproc/structural.hpp
index faf4ff7a3..470ba5a21 100644
--- a/tools/cv/include/cv/imgproc/structural.hpp
+++ b/tools/cv/include/cv/imgproc/structural.hpp
@@ -10,7 +10,7 @@
 #define STRUCTURAL_HPP
 
 #include
-#include "cv/types.hpp"
+#include "../types.hpp"
 
 namespace MNN {
 namespace CV {
diff --git a/tools/cv/include/cv/types.hpp b/tools/cv/include/cv/types.hpp
index 9cc4aee7a..e83c6ba38 100644
--- a/tools/cv/include/cv/types.hpp
+++ b/tools/cv/include/cv/types.hpp
@@ -407,6 +407,9 @@ typedef Scalar_<double> Scalar;
 
 static void getVARPSize(VARP var, int* height, int* width, int* channel) {
     auto info = var->getInfo();
+    if (!info) {
+        return;
+    }
     auto dims = info->dim;
     int num = dims.size();
     if (num < 2) return;
diff --git a/tools/cv/source/imgproc/filter.cpp b/tools/cv/source/imgproc/filter.cpp
index 81b5a32fb..b63b830b7 100644
--- a/tools/cv/source/imgproc/filter.cpp
+++ b/tools/cv/source/imgproc/filter.cpp
@@ -46,9 +46,7 @@ static VARP formatOutput(VARP src, halide_type_t type) {
     if (channel == 1) {
         squeeze_dims.push_back(-1);
     }
-    if (!squeeze_dims.empty()) {
-        src = _Squeeze(src, squeeze_dims);
-    }
+    src = _Squeeze(src, squeeze_dims);
     if (type == halide_type_of()) {
         src = _Minimum(src, _Scalar(255));
         src = _Maximum(src, _Scalar(0));
diff --git a/transformers/llm/engine/include/llm/llm.hpp b/transformers/llm/engine/include/llm/llm.hpp
index 5337afc3c..951a1578e 100644
--- a/transformers/llm/engine/include/llm/llm.hpp
+++ b/transformers/llm/engine/include/llm/llm.hpp
@@ -29,24 +29,6 @@ class Tokenizer;
 class Pipeline;
 class LlmConfig;
 
-// Llm start
-// llm stream buffer with callback
-class MNN_PUBLIC LlmStreamBuffer : public std::streambuf {
-public:
-    using CallBack = std::function<void(const char*, size_t)>;;
-    LlmStreamBuffer(CallBack callback) : callback_(callback) {}
-
-protected:
-    virtual std::streamsize xsputn(const char* s, std::streamsize n) override {
-        if (callback_) {
-            callback_(s, n);
-        }
-        return n;
-    }
-
-private:
-    CallBack callback_ = nullptr;
-};
 class MNN_PUBLIC Llm {
     using PromptItem = std::pair<std::string, std::string>; // <role, content>
 public:
diff --git a/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.mm b/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.mm
index 4d05379a4..4561338f8 100644
--- a/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.mm
+++ b/transformers/llm/engine/ios/mnn-llm/mnn-llm/LLMInferenceEngineWrapper.mm
@@ -4,7 +4,7 @@
 //
 //  Created by wangzhaode on 2023/12/14.
 //
-
+#include
 #import "LLMInferenceEngineWrapper.h"
 #include
 using namespace MNN::Transformer;
@@ -44,7 +44,23 @@ - (BOOL)loadModel {
     }
     return YES;
 }
+// Llm start
+// llm stream buffer with callback
+class LlmStreamBuffer : public std::streambuf {
+public:
+    using CallBack = std::function<void(const char*, size_t)>;;
+    LlmStreamBuffer(CallBack callback) : callback_(callback) {}
+protected:
+    virtual std::streamsize xsputn(const char* s, std::streamsize n) override {
+        if (callback_) {
+            callback_(s, n);
+        }
+        return n;
+    }
+private:
+    CallBack callback_ = nullptr;
+};
 - (void)processInput:(NSString *)input withStreamHandler:(StreamOutputHandler)handler {
     LlmStreamBuffer::CallBack callback = [handler](const char* str, size_t len) {
         if (handler) {
diff --git a/transformers/llm/engine/src/llm.cpp b/transformers/llm/engine/src/llm.cpp
index 991c6c7ef..8b836595f 100644
--- a/transformers/llm/engine/src/llm.cpp
+++ b/transformers/llm/engine/src/llm.cpp
@@ -193,8 +193,13 @@ size_t Llm::apply_lora(const std::string& lora_path) {
     module_config.rearrange = true;
     module_config.base = modules_.begin()->get();
     size_t lora_index = modules_.size();
-    modules_.emplace_back(Module::load({"input_ids", "attention_mask", "position_ids", "past_key_values"},
-                                       {"logits", "presents"}, model_path.c_str(), runtime_manager_, &module_config));
+    if (attention_fused_) {
+        modules_.emplace_back(Module::load({"input_ids", "attention_mask", "position_ids"},
+                                           {"logits"}, model_path.c_str(), runtime_manager_, &module_config));
+    } else {
+        modules_.emplace_back(Module::load({"input_ids", "attention_mask", "position_ids", "past_key_values"},
+                                           {"logits", "presents"}, model_path.c_str(), runtime_manager_, &module_config));
+    }
     select_module(lora_index);
     return lora_index;
 }
@@ -244,6 +249,7 @@ void Llm::trace(bool start) {
     for (auto& m : decode_modules_) {
         m->traceOrOptimize(status);
     }
+    runtime_manager_->updateCache();
     mTracing = start;
 }
 
diff --git a/transformers/llm/export/llmexport.py b/transformers/llm/export/llmexport.py
index dacf40d04..904128bb5 100644
--- a/transformers/llm/export/llmexport.py
+++ b/transformers/llm/export/llmexport.py
@@ -495,6 +495,9 @@ def quant(self, weight, quant_bit, quant_block):
             block_size = ic
         else:
             block_size = quant_block
+        if ic % block_size != 0:
+            block_size = ic
+            print('Skip block quant for ic=', ic, ', quant_block:', quant_block)
         block_num = ic // block_size
         weight = weight.reshape(oc, block_num, block_size)
         max_val = np.max(weight, axis=-1, keepdims=True)
@@ -626,7 +629,7 @@ def rebuild_op(self, op, graph):
                 "quanParameter": {
                     "quantScale": 1.0, "scaleIn": 0.0, "scaleOut": 0.0,
                     "useInt32": False, "has_scaleInt": False, "shapeInt32": shape_int32,
-                    "type": 1, "aMax": 0, "aMin": q_min, "readType": oc * (ic // self.quant_block), "weightSize": 0
+                    "type": 1, "aMax": 0, "aMin": q_min, "readType": -1, "weightSize": 0
                 },
                 "external": external
             },
@@ -1864,7 +1867,7 @@ def main():
     parser.add_argument('--export', type=str, default=None, help='export model to an onnx/mnn model.')
     parser.add_argument('--skip_slim', action='store_true', help='Whether or not to skip onnx-slim.')
    parser.add_argument('--quant_bit', type=int, default=4, help='mnn quant bit, 4 or 8, default is 4.')
-    parser.add_argument('--quant_block', type=int, default=128, help='mnn quant block, default is 0 mean channle-wise.')
+    parser.add_argument('--quant_block', type=int, default=0, help='mnn quant block, default is 0 mean channle-wise.')
     parser.add_argument('--lm_quant_bit', type=int, default=None, help='mnn lm_head quant bit, 4 or 8, default is `quant_bit`.')
     parser.add_argument('--mnnconvert', type=str, default='../../../build/MNNConvert', help='local mnnconvert path, if invalid, using pymnn.')
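The llmexport.py hunk above falls back to channel-wise scales whenever ic is not divisible by the requested quant_block, and the HybridConvSpeedTest hunk derives one absMax-based scale per (output channel, block) instead of per channel. The sketch below is a minimal, self-contained reimplementation of that block-wise idea, assuming a symmetric absMax quantizer rather than the exporter's min/max scheme; the name quant_weight_blockwise and the exact rounding are illustrative, not part of the patch.

import numpy as np

def quant_weight_blockwise(weight, quant_bit=4, quant_block=0):
    # Hypothetical helper (not from llmexport.py): symmetric block-wise
    # quantization with one scale per (output channel, block).
    oc, ic = weight.shape
    block_size = ic if quant_block == 0 else quant_block
    if ic % block_size != 0:
        # same fallback as the patch: revert to channel-wise when ic is not divisible
        block_size = ic
    block_num = ic // block_size
    w = weight.reshape(oc, block_num, block_size)
    threshold = float((1 << (quant_bit - 1)) - 1)           # 7 for 4-bit, 127 for 8-bit
    abs_max = np.maximum(np.abs(w).max(axis=-1, keepdims=True), 1e-9)
    scale = abs_max / threshold                              # shape (oc, block_num, 1)
    q = np.clip(np.round(w / scale), -threshold, threshold).astype(np.int8)
    return q.reshape(oc, ic), scale.squeeze(-1), block_size

# quick round-trip check on random weights
w = np.random.randn(8, 96).astype(np.float32)
q, scale, bs = quant_weight_blockwise(w, quant_bit=4, quant_block=32)
dequant = (q.reshape(8, -1, bs) * scale[..., None]).reshape(8, 96)
print(bs, float(np.abs(dequant - w).max()))

Smaller blocks give each scale fewer weights to cover, which is why the tests above sweep block sizes {0, 32, 128} against the channel-wise baseline.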