[fix] Fix the offline compression example for natural language models #1840

Open · wants to merge 7 commits into base: develop
159 changes: 106 additions & 53 deletions example/auto_compression/pytorch_yolo_series/README.md

Large diffs are not rendered by default.

315 changes: 207 additions & 108 deletions example/auto_compression/pytorch_yolo_series/cpp_infer/CMakeLists.txt

Large diffs are not rendered by default.

49 changes: 16 additions & 33 deletions example/auto_compression/pytorch_yolo_series/cpp_infer/README.md
@@ -4,82 +4,65 @@

- CUDA, cuDNN: make sure CUDA and cuDNN are installed in the environment, and note their installation paths in advance.

- TensorRT: download the [TensorRT 8.4.1.5](https://developer.nvidia.com/compute/machine-learning/tensorrt/secure/8.4.1/tars/tensorrt-8.4.1.5.linux.x86_64-gnu.cuda-11.6.cudnn8.4.tar.gz) package, or another version, from the NVIDIA website.
- TensorRT: download the [TensorRT 8.6.1.6](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/secure/8.6.1/tars/TensorRT-8.6.1.6.Linux.x86_64-gnu.cuda-11.8.tar.gz) package, or another version, from the NVIDIA website.

- Paddle Inference C++ library: to build the develop branch, see the [build guide](https://www.paddlepaddle.org.cn/inference/user_guides/source_compile.html). After the build finishes, a `paddle_inference_install_dir` folder is generated under the build directory; this folder is the C++ inference library we need.
- Paddle Inference C++ library: to build the develop branch, see the [build guide](https://www.paddlepaddle.org.cn/inference/user_guides/source_compile.html). After the build finishes, a `paddle_inference_install_dir` folder is generated under the build directory; this folder is the C++ inference library we need. Alternatively, download a prebuilt [C++ inference library](https://www.paddlepaddle.org.cn/inference/v2.6/guides/install/download_lib.html) from the official site (a short extraction sketch follows).
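
A minimal sketch of using the prebuilt library, assuming an archive has been downloaded from the page above (the file name below is illustrative; pick the build matching your CUDA/cuDNN/TensorRT versions):
```shell
# Unpack the prebuilt Paddle Inference C++ library (archive name is hypothetical)
tar -xzf paddle_inference.tgz
# Point LIB_DIR in compile.sh at the extracted directory, e.g. /work/paddle_inference
```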

## Build the executable

- (1) Edit the dependency library paths in `compile.sh`, mainly the entries below (a build sketch follows the snippet):
```shell
# Path to the Paddle Inference C++ library
LIB_DIR=/root/auto_compress/Paddle/build/paddle_inference_install_dir/
LIB_DIR=/work/Paddle/build/paddle_inference_install_dir
# cuDNN path
CUDNN_LIB=/usr/lib/x86_64-linux-gnu/
# CUDA path
CUDA_LIB=/usr/local/cuda/lib64
# TensorRT install path: the absolute path of the extracted TensorRT archive, containing the `lib` and `include` folders
TENSORRT_ROOT=/root/auto_compress/trt/trt8.4/
TENSORRT_ROOT=/work/TensorRT-8.6.1.6
```
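
After the paths are set, building follows the usual flow for this demo (a sketch; the exact steps are whatever `compile.sh` runs):
```shell
# Build; the commands later in this document expect the binary at ./build/trt_run
bash compile.sh
```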

## Paddle TensorRT tests

- YOLOv5
```
# FP32
./build/trt_run --model_file yolov5s_infer/model.pdmodel --params_file yolov5s_infer/model.pdiparams --run_mode=trt_fp32
./build/trt_run --model_file yolov5_model/inference_model/model.pdmodel --params_file yolov5_model/inference_model/model.pdiparams --run_mode=trt_fp32
# FP16
./build/trt_run --model_file yolov5s_infer/model.pdmodel --params_file yolov5s_infer/model.pdiparams --run_mode=trt_fp16
./build/trt_run --model_file yolov5_model/inference_model/model.pdmodel --params_file yolov5_model/inference_model/model.pdiparams --run_mode=trt_fp16
# INT8
./build/trt_run --model_file yolov5s_quant/model.pdmodel --params_file yolov5s_quant/model.pdiparams --run_mode=trt_int8
./build/trt_run --model_file yolov5s_quantaware/model.pdmodel --params_file yolov5s_quantaware/model.pdiparams --run_mode=trt_int8
```

- YOLOv6
```
# FP32
./build/trt_run --arch=YOLOv6 --model_file yolov6s_infer/model.pdmodel --params_file yolov6s_infer/model.pdiparams --run_mode=trt_fp32
./build/trt_run --model_file yolov6_model/inference_model/model.pdmodel --params_file yolov6_model/inference_model/model.pdiparams --run_mode=trt_fp32
# FP16
./build/trt_run --arch=YOLOv6 --model_file yolov6s_infer/model.pdmodel --params_file yolov6s_infer/model.pdiparams --run_mode=trt_fp16
./build/trt_run --model_file yolov6_model/inference_model/model.pdmodel --params_file yolov6_model/inference_model/model.pdiparams --run_mode=trt_fp16
# INT8
./build/trt_run --arch=YOLOv6 --model_file yolov6s_quant/model.pdmodel --params_file yolov6s_quant/model.pdiparams --run_mode=trt_int8
./build/trt_run --model_file yolov6s_quantaware/model.pdmodel --params_file yolov6s_quantaware/model.pdiparams --run_mode=trt_int8
```


- YOLOv7
```
# FP32
./build/trt_run --model_file yolov7_infer/model.pdmodel --params_file yolov7_infer/model.pdiparams --run_mode=trt_fp32
./build/trt_run --model_file yolov7-tiny/inference_model/model.pdmodel --params_file yolov7-tiny/inference_model/model.pdiparams --run_mode=trt_fp32
# FP16
./build/trt_run --model_file yolov7_infer/model.pdmodel --params_file yolov7_infer/model.pdiparams --run_mode=trt_fp16
./build/trt_run --model_file yolov7-tiny/inference_model/model.pdmodel --params_file yolov7-tiny/inference_model/model.pdiparams --run_mode=trt_fp16
# INT8
./build/trt_run --model_file yolov7_quant/model.pdmodel --params_file yolov7_quant/model.pdiparams --run_mode=trt_int8
./build/trt_run --model_file yolov7-quantAware/model.pdmodel --params_file yolov7-quantAware/model.pdiparams --run_mode=trt_int8
```

## Native TensorRT tests

```shell
# FP32
trtexec --onnx=yolov5s.onnx --workspace=1024 --avgRuns=1000 --inputIOFormats=fp16:chw --outputIOFormats=fp16:chw
trtexec --onnx=yolov5s.onnx --workspace=1024 --avgRuns=1000 --inputIOFormats=fp32:chw --outputIOFormats=fp32:chw
# FP16
trtexec --onnx=yolov5s.onnx --workspace=1024 --avgRuns=1000 --inputIOFormats=fp16:chw --outputIOFormats=fp16:chw --fp16
trtexec --onnx=yolov5s.onnx --workspace=1024 --avgRuns=1000 --inputIOFormats=fp32:chw --outputIOFormats=fp32:chw --fp16
# INT8
trtexec --onnx=yolov5s.onnx --workspace=1024 --avgRuns=1000 --inputIOFormats=fp16:chw --outputIOFormats=fp16:chw --int8
trtexec --onnx=yolov5s.onnx --workspace=1024 --avgRuns=1000 --inputIOFormats=fp32:chw --outputIOFormats=fp32:chw --int8
```
- Note: `--onnx=yolov5s.onnx` can be replaced with the yolov6s.onnx or yolov7.onnx model.

## Performance comparison

| Inference library | Model | FP32 latency (ms) | FP16 latency (ms) | INT8 latency (ms) |
| :--------: | :--------: | :--------: | :--------: | :--------: |
| Paddle TensorRT | YOLOv5s | 5.95 | 2.44 | 1.87 |
| TensorRT | YOLOv5s | 6.16 | 2.58 | 2.07 |
| | | | | |
| Paddle TensorRT | YOLOv6s | 9.06 | 2.90 | 1.83 |
| TensorRT | YOLOv6s | 8.59 | 2.83 | 1.87 |
| | | | | |
| Paddle TensorRT | YOLOv7 | 26.84 | 7.44 | 4.55 |
| TensorRT | YOLOv7 | 28.25 | 7.23 | 4.67 |

Environment:
- Tesla T4, TensorRT 8.4.1, CUDA 11.2
- batch_size=1
example/auto_compression/pytorch_yolo_series/cpp_infer/compile.sh
@@ -14,10 +14,10 @@ WITH_MKL=ON
WITH_GPU=ON
USE_TENSORRT=ON

LIB_DIR=/root/auto_compress/Paddle/build/paddle_inference_install_dir/
LIB_DIR=/work/Paddle/build/paddle_inference_install_dir
CUDNN_LIB=/usr/lib/x86_64-linux-gnu/
CUDA_LIB=/usr/local/cuda/lib64
TENSORRT_ROOT=/root/auto_compress/trt/trt8.4/
TENSORRT_ROOT=/work/TensorRT-8.6.1.6

WITH_ROCM=OFF
ROCM_LIB=/opt/rocm/lib
145 changes: 80 additions & 65 deletions example/auto_compression/pytorch_yolo_series/cpp_infer/trt_run.cc
@@ -1,31 +1,46 @@
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <chrono>
#include <iostream>
#include <memory>
#include <numeric>

#include <gflags/gflags.h>
#include <glog/logging.h>
#include <cuda_runtime.h>

#include "paddle/include/paddle_inference_api.h"
#include "paddle/include/experimental/phi/common/float16.h"
#include "paddle_inference_api.h"

using paddle_infer::Config;
using paddle_infer::Predictor;
using paddle_infer::CreatePredictor;
using paddle_infer::PrecisionType;
using phi::dtype::float16;
using paddle_infer::Predictor;

DEFINE_string(model_dir, "", "Directory of the inference model.");
DEFINE_string(model_file, "", "Path of the inference model file.");
DEFINE_string(params_file, "", "Path of the inference params file.");
DEFINE_string(arch, "YOLOv5", "Architectures name, can be: YOLOv5, YOLOv6, YOLOv7.");
DEFINE_string(run_mode, "trt_fp32", "run_mode which can be: trt_fp32, trt_fp16 and trt_int8");
DEFINE_string(
run_mode,
"paddle_gpu",
"run_mode which can be: trt_fp32, trt_fp16 and trt_int8 and paddle_gpu");
DEFINE_int32(batch_size, 1, "Batch size.");
DEFINE_int32(gpu_id, 0, "GPU card ID num.");
DEFINE_int32(trt_min_subgraph_size, 3, "tensorrt min_subgraph_size");
DEFINE_int32(warmup, 50, "warmup");
DEFINE_int32(repeats, 1000, "repeats");
DEFINE_bool(use_dynamic_shape, false, "use trt dynaminc shape.");
DEFINE_bool(use_calib, true, "use trt int8 calibration.");

using Time = decltype(std::chrono::high_resolution_clock::now());
Time time() { return std::chrono::high_resolution_clock::now(); };
@@ -38,89 +53,89 @@ double time_diff(Time t1, Time t2) {

std::shared_ptr<Predictor> InitPredictor() {
Config config;
std::string model_path;
if (FLAGS_model_dir != "") {
config.SetModel(FLAGS_model_dir);
model_path = FLAGS_model_dir.substr(0, FLAGS_model_dir.find_last_of("/"));
} else {
config.SetModel(FLAGS_model_file, FLAGS_params_file);
model_path = FLAGS_model_file.substr(0, FLAGS_model_file.find_last_of("/"));
}
// enable tune
std::cout << "model_path: " << model_path << std::endl;
config.EnableUseGpu(256, FLAGS_gpu_id);
config.SetModel(FLAGS_model_file, FLAGS_params_file);

config.EnableUseGpu(500, FLAGS_gpu_id);

if (FLAGS_run_mode == "trt_fp32") {
config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size, FLAGS_trt_min_subgraph_size,
PrecisionType::kFloat32, false, false);
config.EnableTensorRtEngine((1LL << 30) * FLAGS_batch_size,
FLAGS_batch_size,
FLAGS_trt_min_subgraph_size,
PrecisionType::kFloat32,
false,
false);
} else if (FLAGS_run_mode == "trt_fp16") {
config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size, FLAGS_trt_min_subgraph_size,
PrecisionType::kHalf, false, false);
config.EnableTensorRtEngine((1LL << 30) * FLAGS_batch_size,
FLAGS_batch_size,
FLAGS_trt_min_subgraph_size,
PrecisionType::kHalf,
false,
false);
} else if (FLAGS_run_mode == "trt_int8") {
config.EnableTensorRtEngine(1 << 30, FLAGS_batch_size, FLAGS_trt_min_subgraph_size,
PrecisionType::kInt8, false, false);
config.EnableTensorRtEngine((1LL << 30) * FLAGS_batch_size,
FLAGS_batch_size,
FLAGS_trt_min_subgraph_size,
PrecisionType::kInt8,
false,
FLAGS_use_calib);
}
if (FLAGS_use_dynamic_shape) {
std::map<std::string, std::vector<int>> min_input_shape = {
{"image", {1, 3, 640, 640}}};
std::map<std::string, std::vector<int>> max_input_shape = {
{"image", {4, 3, 640, 640}}};
std::map<std::string, std::vector<int>> opt_input_shape = {
{"image", {2, 3, 640, 640}}};
config.SetTRTDynamicShapeInfo(
min_input_shape, max_input_shape, opt_input_shape);
}
// Open the memory optim.
config.EnableMemoryOptim();
config.SwitchIrDebug(true);
config.SwitchIrOptim(true);
return CreatePredictor(config);
}

template <typename type>
void run(Predictor *predictor, const std::vector<type> &input,
const std::vector<int> &input_shape, type* out_data, std::vector<int> out_shape) {

// prepare input
int input_num = std::accumulate(input_shape.begin(), input_shape.end(), 1,
std::multiplies<int>());

auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputHandle(input_names[0]);
input_t->Reshape(input_shape);
input_t->CopyFromCpu(input.data());

for (int i = 0; i < FLAGS_warmup; ++i)
CHECK(predictor->Run());
void run(Predictor *predictor,
const std::vector<float> &input,
const std::vector<int> &input_shape,
std::vector<float> *out_data) {
int input_num = std::accumulate(
input_shape.begin(), input_shape.end(), 1, std::multiplies<int>());

auto st = time();
for (int i = 0; i < FLAGS_repeats; ++i) {
auto input_names = predictor->GetInputNames();
auto input_t = predictor->GetInputHandle(input_names[0]);
auto input_names = predictor->GetInputNames();
auto output_names = predictor->GetOutputNames();
auto input_t = predictor->GetInputHandle(input_names[0]);
input_t->Reshape(input_shape);
input_t->CopyFromCpu(input.data());

input_t->Reshape(input_shape);
input_t->CopyFromCpu(input.data());
for (size_t i = 0; i < FLAGS_warmup; ++i) CHECK(predictor->Run());

auto st = time();
for (size_t i = 0; i < FLAGS_repeats; ++i) {
CHECK(predictor->Run());

auto output_names = predictor->GetOutputNames();
auto output_t = predictor->GetOutputHandle(output_names[0]);
std::vector<int> output_shape = output_t->shape();
output_t->CopyToCpu(out_data);

int out_num = std::accumulate(
output_shape.begin(), output_shape.end(), 1, std::multiplies<int>());
out_data->resize(out_num);
output_t->CopyToCpu(out_data->data());
}

LOG(INFO) << "[" << FLAGS_run_mode << " bs-" << FLAGS_batch_size << " ] run avg time is " << time_diff(st, time()) / FLAGS_repeats
LOG(INFO) << "run avg time is " << time_diff(st, time()) / FLAGS_repeats
<< " ms";
}

int main(int argc, char *argv[])
{
int main(int argc, char *argv[]) {
google::ParseCommandLineFlags(&argc, &argv, true);
auto predictor = InitPredictor();

std::cout << "====== Use float instead of FP16 data ======" << std::endl;
std::vector<float> input_data(FLAGS_batch_size * 3 * 640 * 640, float(1.0));
std::vector<int> input_shape = {FLAGS_batch_size, 3, 640, 640};
std::vector<float> input_data(FLAGS_batch_size * 3 * 640 * 640);
for (size_t i = 0; i < input_data.size(); ++i) input_data[i] = i % 255 * 0.1;
std::vector<float> out_data;
run(predictor.get(), input_data, input_shape, &out_data);

int out_box_shape = 25200;
if (FLAGS_arch == "YOLOv6"){
out_box_shape = 8400;
}
float* out_data;
std::vector<int> out_shape{ FLAGS_batch_size, 1, out_box_shape, 85};
int out_data_size = FLAGS_batch_size * out_box_shape * 85;

// Only use Pinned mem for D2H.
cudaHostAlloc((void**)&out_data, sizeof(float) * out_data_size, cudaHostAllocMapped);

run<float>(predictor.get(), input_data, input_shape, out_data, out_shape);
return 0;
}
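
For reference, a hypothetical invocation of the rebuilt binary that exercises the flags defined above (model paths follow the README commands; flag values are illustrative):
```shell
# Sketch: INT8 TensorRT run with calibration disabled and tuned dynamic shapes enabled
./build/trt_run \
  --model_file yolov5_model/inference_model/model.pdmodel \
  --params_file yolov5_model/inference_model/model.pdiparams \
  --run_mode=trt_int8 \
  --use_calib=false \
  --use_dynamic_shape=true \
  --batch_size=1 --warmup=50 --repeats=1000
```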
@@ -79,7 +79,8 @@ def argsparser():
"--device",
type=str,
default="GPU",
help="Choose the device you want to run, it can be: CPU/GPU/XPU, default is GPU",
help=
"Choose the device you want to run, it can be: CPU/GPU/XPU, default is GPU",
)
parser.add_argument(
"--arch", type=str, default="YOLOv5", help="architectures name.")
@@ -180,8 +181,9 @@ def draw_box(img, boxes, scores, cls_ids, conf=0.5, class_names=None):

txt_size = cv2.getTextSize(text, font, 0.4, 1)[0]
cv2.rectangle(img, (x0, y0), (x1, y1), color, 2)
cv2.rectangle(img, (x0, y0 + 1), (
x0 + txt_size[0] + 1, y0 + int(1.5 * txt_size[1])), color, -1)
cv2.rectangle(img, (x0, y0 + 1), (x0 + txt_size[0] + 1,
y0 + int(1.5 * txt_size[1])), color,
-1)
cv2.putText(
img,
text, (x0, y0 + txt_size[1]),
@@ -288,8 +290,8 @@ def load_predictor(
dynamic_shape_file = os.path.join(FLAGS.model_path,
"dynamic_shape.txt")
if os.path.exists(dynamic_shape_file):
config.enable_tuned_tensorrt_dynamic_shape(dynamic_shape_file,
True)
config.enable_tuned_tensorrt_dynamic_shape(
dynamic_shape_file, True)
print("trt set dynamic shape done!")
else:
config.collect_shape_range_info(dynamic_shape_file)
@@ -315,7 +317,8 @@ def eval(predictor, val_loader, anno_file, rerun_flag=False):
input_names = predictor.get_input_names()
output_names = predictor.get_output_names()
boxes_tensor = predictor.get_output_handle(output_names[0])
for batch_id, data in enumerate(val_loader):
for batch_id, data in tqdm(
enumerate(val_loader), total=len(val_loader), desc='Evaluating'):
data_all = {k: np.array(v) for k, v in data.items()}
inputs = {}
if FLAGS.arch == "YOLOv6":
@@ -345,7 +348,6 @@ def eval(predictor, val_loader, anno_file, rerun_flag=False):
cpu_mems += cpu_mem
gpu_mems += gpu_mem
if batch_id % 100 == 0:
print("Eval iter:", batch_id)
sys.stdout.flush()
print("[Benchmark]Avg cpu_mem:{} MB, avg gpu_mem: {} MB".format(
cpu_mems / sample_nums, gpu_mems / sample_nums))
@@ -469,4 +471,4 @@ def main():
# DataLoader need run on cpu
paddle.set_device("cpu")

main()
main()