为什么测试解码提升很小？而且与官方demo 使用的api不一样 #8

hacktmz · 2021-06-08T06:16:35Z

环境 20核cpu cuda 10.2 T4 单卡

def get_image(image_url, timeout=10):
    """Download the raw bytes of the image at ``image_url``.

    Args:
        image_url: URL of the image, possibly percent-encoded. It is
            unquoted before the request is made.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout ``requests.get`` can block indefinitely.

    Returns:
        ``None`` when ``image_url`` is empty or ``None``, the string ``" "``
        when the server answers with a status other than 200, and the
        response body otherwise.

    Raises:
        Exception: any error raised while unquoting or fetching is printed
            and then re-raised unchanged.
    """
    if not image_url:
        return None
    try:
        image_url = parse.unquote(image_url)
        response = requests.get(image_url, timeout=timeout)
        if response.status_code != 200:
            print("get image filed!!!!!!")
            # Sentinel value kept unchanged for existing callers.
            return " "
        return response.content
    except Exception as e:
        print(e)
        raise

def test_load_img(image, count):
    """Time ``count`` OpenCV decodes of ``image`` and print the total.

    Args:
        image: encoded image data; it is wrapped with ``bytearray(...)``
            on every iteration before being decoded.
        count: number of decode iterations to time.

    Returns:
        The result of the last ``cv2.imdecode`` call, or ``None`` when
        ``count`` is zero or negative and no decode was performed.
    """
    cv_image = None  # stays None when the loop body never runs
    start = cv2.getTickCount()
    for _ in range(count):
        np_image = np.frombuffer(bytearray(image), np.uint8)
        cv_image = cv2.imdecode(np_image, cv2.IMREAD_COLOR)
    end1 = cv2.getTickCount()
    # Tick difference divided by the tick frequency is the elapsed seconds.
    print("load img1 base line = %s" % ((end1 - start) / cv2.getTickFrequency()))
    return cv_image

def test_load_img_nvjpeg(image, count):
    """Time ``count`` pynvjpeg decodes of ``image`` and print the total.

    The ``NvJpeg`` object is created before the timer starts, so its
    construction is not part of the printed figure.

    Args:
        image: encoded image data; it is wrapped with ``bytearray(...)``
            on every iteration before being decoded.
        count: number of decode iterations to time.

    Returns:
        The result of the last ``nj.decode`` call, or ``None`` when
        ``count`` is zero or negative and no decode was performed.
    """
    from nvjpeg import NvJpeg

    nj = NvJpeg()
    cv_image = None  # stays None when the loop body never runs
    start = cv2.getTickCount()
    for _ in range(count):
        np_image = np.asarray(bytearray(image), dtype="uint8")
        # OpenCV equivalent: cv2.imdecode(np_image, cv2.IMREAD_COLOR)
        cv_image = nj.decode(np_image)
    end1 = cv2.getTickCount()
    # NOTE(review): the first decode presumably includes one-time GPU setup;
    # confirm before comparing this figure against the CPU baseline.
    print("load img nvjpeg base line = %s" % ((end1 - start) / cv2.getTickFrequency()))
    return cv_image

if __name__ == "__main__":
    # Benchmark both decoders on the same downloaded image.
    image = get_image("http://cdn.weipaitang.com/img/20200313rli2rh7p-jdgj-7vyi-91qd-584099256046-W3024H4032")
    # get_image returns None or " " when the download fails, so only
    # continue when real, non-empty image bytes came back.
    if isinstance(image, bytes) and image:
        count = 100
        test_load_img_nvjpeg(image, count)
        cv_image = test_load_img(image, count)

结果为：
opencv 14秒
pynvjpeg 12.6秒（确定观察到GPU的使用率，没有任何报错）

hacktmz · 2021-06-08T06:21:21Z

然后官方的解码demo 是这样写的，里面的api为什么跟pynvjpeg完全不一样

int decode_images(const FileData &img_data, const std::vector<size_t> &img_len,
std::vector<nvjpegImage_t> &out, decode_params_t &params,
double &time) {
CHECK_CUDA(cudaStreamSynchronize(params.stream));
cudaEvent_t startEvent = NULL, stopEvent = NULL;
float loopTime = 0;

CHECK_CUDA(cudaEventCreate(&startEvent, cudaEventBlockingSync));
CHECK_CUDA(cudaEventCreate(&stopEvent, cudaEventBlockingSync));

std::vector<const unsigned char*> batched_bitstreams;
std::vector<size_t> batched_bitstreams_size;
std::vector<nvjpegImage_t> batched_output;

// bit-streams that batched decode cannot handle
std::vector<const unsigned char*> otherdecode_bitstreams;
std::vector<size_t> otherdecode_bitstreams_size;
std::vector<nvjpegImage_t> otherdecode_output;

// if(params.hw_decode_available){
// for(int i = 0; i < params.batch_size; i++){
// // extract bitstream meta data to figure out whether a bit-stream can be decoded
// nvjpegJpegStreamParseHeader(params.nvjpeg_handle, (const unsigned char *)img_data[i].data(), img_len[i], params.jpeg_streams[0]);
// int isSupported = -1;
// nvjpegDecodeBatchedSupported(params.nvjpeg_handle, params.jpeg_streams[0], &isSupported);

// if(isSupported == 0){
// batched_bitstreams.push_back((const unsigned char *)img_data[i].data());
// batched_bitstreams_size.push_back(img_len[i]);
// batched_output.push_back(out[i]);
// } else {
// otherdecode_bitstreams.push_back((const unsigned char *)img_data[i].data());
// otherdecode_bitstreams_size.push_back(img_len[i]);
// otherdecode_output.push_back(out[i]);
// }
// }
// } else {
for(int i = 0; i < params.batch_size; i++) {
otherdecode_bitstreams.push_back((const unsigned char *)img_data[i].data());
otherdecode_bitstreams_size.push_back(img_len[i]);
otherdecode_output.push_back(out[i]);
}
// }

CHECK_CUDA(cudaEventRecord(startEvent, params.stream));

if(batched_bitstreams.size() > 0)
 {
      CHECK_NVJPEG(
           nvjpegDecodeBatchedInitialize(params.nvjpeg_handle, params.nvjpeg_state,
                                        batched_bitstreams.size(), 1, params.fmt));

     CHECK_NVJPEG(nvjpegDecodeBatched(
         params.nvjpeg_handle, params.nvjpeg_state, batched_bitstreams.data(),
         batched_bitstreams_size.data(), batched_output.data(), params.stream));
 }

if(otherdecode_bitstreams.size() > 0)
{
      CHECK_NVJPEG(nvjpegStateAttachDeviceBuffer(params.nvjpeg_decoupled_state, params.device_buffer));
      int buffer_index = 0;
      CHECK_NVJPEG(nvjpegDecodeParamsSetOutputFormat(params.nvjpeg_decode_params, params.fmt));
      for (int i = 0; i < params.batch_size; i++) {
          CHECK_NVJPEG(
              nvjpegJpegStreamParse(params.nvjpeg_handle, otherdecode_bitstreams[i], otherdecode_bitstreams_size[i],
              0, 0, params.jpeg_streams[buffer_index]));

          CHECK_NVJPEG(nvjpegStateAttachPinnedBuffer(params.nvjpeg_decoupled_state,
              params.pinned_buffers[buffer_index]));

          CHECK_NVJPEG(nvjpegDecodeJpegHost(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
              params.nvjpeg_decode_params, params.jpeg_streams[buffer_index]));

          CHECK_CUDA(cudaStreamSynchronize(params.stream));

          CHECK_NVJPEG(nvjpegDecodeJpegTransferToDevice(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
              params.jpeg_streams[buffer_index], params.stream));

          buffer_index = 1 - buffer_index; // switch pinned buffer in pipeline mode to avoid an extra sync

          CHECK_NVJPEG(nvjpegDecodeJpegDevice(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
              &otherdecode_output[i], params.stream));

      }
}

CHECK_CUDA(cudaEventRecord(stopEvent, params.stream));

CHECK_CUDA(cudaEventSynchronize(stopEvent));
CHECK_CUDA(cudaEventElapsedTime(&loopTime, startEvent, stopEvent));
time = static_cast(loopTime);

return EXIT_SUCCESS;
}

zeng-qinghui · 2021-06-08T23:27:08Z

中文原文

为什么测试解码提升很小？

GPU硬解码在编解码速度上是比CPU快的，但是处理过程中多了 host to device 和 device to host 内存复制时间。所以在处理小图片的速度不理想。由于GPU核心数远远多于CPU核心数，所以使用多线程也有利于获得更好的结果。
与官方demo 使用的api不一样

pynvjpeg旨在使用nvjpeg的编解码功能，实现兼容opencv的接口。

English Translation

Why is the decoding time so similar?

GPU hardware decoding is faster than the CPU for the encoding/decoding step itself, but it adds host-to-device and device-to-host memory copies, which take time. As a result the overall speed is not ideal, especially for small images. Because a GPU has far more cores than a CPU, using multi-threading should also help the GPU achieve better results.
The API is different from the official demo

PyNvjpeg aims to use nvjpeg's encoding and decoding functionality while exposing an OpenCV-compatible interface.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

为什么测试解码提升很小？而且与官方demo 使用的api不一样 #8

为什么测试解码提升很小？而且与官方demo 使用的api不一样 #8

hacktmz commented Jun 8, 2021 •

edited

Loading

hacktmz commented Jun 8, 2021

zeng-qinghui commented Jun 8, 2021

为什么测试解码提升很小？而且与官方demo 使用的api不一样 #8

为什么测试解码提升很小？而且与官方demo 使用的api不一样 #8

Comments

hacktmz commented Jun 8, 2021 • edited Loading

hacktmz commented Jun 8, 2021

zeng-qinghui commented Jun 8, 2021

hacktmz commented Jun 8, 2021 •

edited

Loading