Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

为什么测试解码提升很小?而且与官方demo 使用的api不一样 #8

Open
hacktmz opened this issue Jun 8, 2021 · 2 comments

Comments

@hacktmz
Copy link

hacktmz commented Jun 8, 2021

环境 20核cpu cuda 10.2 T4 单卡

def get_image(image_url, timeout=10):
    """Download the raw bytes found at ``image_url``.

    Args:
        image_url: URL of the image; it is passed through ``parse.unquote``
            before the request is made (``parse`` is presumably
            ``urllib.parse`` -- confirm against the file's imports).
        timeout: Seconds handed to ``requests.get`` so a stalled server
            cannot block the benchmark forever.

    Returns:
        ``None`` when ``image_url`` is empty, ``""`` when the server answers
        with a status other than 200, otherwise the response body.

    Raises:
        Whatever ``parse.unquote`` or ``requests.get`` raises; the exception
        is printed and then re-raised unchanged.
    """
    if not image_url:
        return None
    try:
        image_url = parse.unquote(image_url)
        response = requests.get(image_url, timeout=timeout)
        if response.status_code != 200:
            print("get image filed!!!!!!")
            # An empty string is falsy and equal to "", so a caller can detect
            # the failure with either a truthiness test or an `!= ""` test.
            return ""
        return response.content
    except Exception as e:
        print(e)
        raise

def test_load_img(image, count):
    """Time ``count`` OpenCV decodes of the same encoded image.

    Args:
        image: Encoded image data accepted by ``bytearray(...)``.
        count: Number of decode iterations to run.

    Returns:
        The result of the last ``cv2.imdecode`` call, or ``None`` when
        ``count`` is not positive and no decode was performed.

    Prints the elapsed wall time in seconds, computed from OpenCV tick counts.
    """
    # Pre-bind the result so a non-positive count returns None instead of
    # failing on an unbound local.
    cv_image = None
    start = cv2.getTickCount()
    for _ in range(count):
        # The buffer conversion stays inside the loop so it is part of the
        # measured time, as it is in the nvjpeg variant of this benchmark.
        np_image = np.frombuffer(bytearray(image), np.uint8)
        cv_image = cv2.imdecode(np_image, cv2.IMREAD_COLOR)
    end1 = cv2.getTickCount()
    print("load img1 base line = %s" % ((end1 - start) / cv2.getTickFrequency()))
    return cv_image

def test_load_img_nvjpeg(image, count):
    """Time ``count`` pynvjpeg decodes of the same encoded image.

    Args:
        image: Encoded image data accepted by ``bytearray(...)``.
        count: Number of decode iterations to run.

    Returns:
        The result of the last ``NvJpeg.decode`` call, or ``None`` when
        ``count`` is not positive and no decode was performed.

    Prints the elapsed wall time in seconds, computed from OpenCV tick counts.
    The ``NvJpeg`` object is constructed before the timer starts, so its
    construction cost is not part of the reported figure.
    """
    from nvjpeg import NvJpeg

    nj = NvJpeg()
    # Pre-bind the result so a non-positive count returns None instead of
    # failing on an unbound local.
    cv_image = None
    start = cv2.getTickCount()
    for _ in range(count):
        np_image = np.asarray(bytearray(image), dtype="uint8")
        # OpenCV equivalent, kept for comparison:
        # cv_image = cv2.imdecode(np_image, cv2.IMREAD_COLOR)
        cv_image = nj.decode(np_image)
    end1 = cv2.getTickCount()
    print("load img nvjpeg base line = %s" % ((end1 - start) / cv2.getTickFrequency()))
    return cv_image

# Script entry point: download one large JPEG and run both decode benchmarks
# on it with the same iteration count.
if __name__ == "__main__":
    image = get_image("http://cdn.weipaitang.com/img/20200313rli2rh7p-jdgj-7vyi-91qd-584099256046-W3024H4032")
    # get_image signals failure with None or a placeholder string; only a
    # non-empty bytes payload can be handed to the decoders.
    if isinstance(image, (bytes, bytearray)) and image:
        count = 100
        test_load_img_nvjpeg(image, count)
        cv_image = test_load_img(image, count)

结果为:
opencv 14秒
pynvjpeg 12.6秒 (确定观察到GPU的使用率,没有任何报错)

@hacktmz
Copy link
Author

hacktmz commented Jun 8, 2021

然后官方的解码demo 是这样写的,里面的api为什么跟pynvjpeg完全不一样

// Decodes one batch of JPEG bit-streams with nvJPEG and measures the time
// spent between two CUDA events recorded on params.stream.
//
// Parameters:
//   img_data - encoded bit-streams; entry i is read via img_data[i].data().
//   img_len  - byte length of each bit-stream, indexed like img_data.
//   out      - output image descriptors; out[i] is copied into the work list
//              and that copy is what is handed to the decoder.
//   params   - nvJPEG handle/states/decoder, device and pinned buffers,
//              JPEG stream objects, output format, CUDA stream, batch_size.
//   time     - receives the value reported by cudaEventElapsedTime, which
//              the CUDA runtime documents as milliseconds.
//
// Returns EXIT_SUCCESS when the function runs to completion. Failure handling
// is whatever CHECK_CUDA / CHECK_NVJPEG (defined outside this block) do.
int decode_images(const FileData &img_data, const std::vector<size_t> &img_len,
                  std::vector<nvjpegImage_t> &out, decode_params_t &params,
                  double &time) {
    // Drain any earlier work on the stream so it is not included in the timing.
    CHECK_CUDA(cudaStreamSynchronize(params.stream));
    cudaEvent_t startEvent = NULL, stopEvent = NULL;
    float loopTime = 0;

    CHECK_CUDA(cudaEventCreate(&startEvent, cudaEventBlockingSync));
    CHECK_CUDA(cudaEventCreate(&stopEvent, cudaEventBlockingSync));

    // Bit-streams routed to the batched decode API. With the routing code
    // below commented out, these stay empty.
    std::vector<const unsigned char*> batched_bitstreams;
    std::vector<size_t> batched_bitstreams_size;
    std::vector<nvjpegImage_t> batched_output;

    // bit-streams that batched decode cannot handle
    std::vector<const unsigned char*> otherdecode_bitstreams;
    std::vector<size_t> otherdecode_bitstreams_size;
    std::vector<nvjpegImage_t> otherdecode_output;

    // if(params.hw_decode_available){
    //   for(int i = 0; i < params.batch_size; i++){
    //     // extract bitstream meta data to figure out whether a bit-stream can be decoded
    //     nvjpegJpegStreamParseHeader(params.nvjpeg_handle, (const unsigned char *)img_data[i].data(), img_len[i], params.jpeg_streams[0]);
    //     int isSupported = -1;
    //     nvjpegDecodeBatchedSupported(params.nvjpeg_handle, params.jpeg_streams[0], &isSupported);
    //
    //     if(isSupported == 0){
    //       batched_bitstreams.push_back((const unsigned char *)img_data[i].data());
    //       batched_bitstreams_size.push_back(img_len[i]);
    //       batched_output.push_back(out[i]);
    //     } else {
    //       otherdecode_bitstreams.push_back((const unsigned char *)img_data[i].data());
    //       otherdecode_bitstreams_size.push_back(img_len[i]);
    //       otherdecode_output.push_back(out[i]);
    //     }
    //   }
    // } else {
    for (int i = 0; i < params.batch_size; i++) {
        otherdecode_bitstreams.push_back((const unsigned char *)img_data[i].data());
        otherdecode_bitstreams_size.push_back(img_len[i]);
        otherdecode_output.push_back(out[i]);
    }
    // }

    CHECK_CUDA(cudaEventRecord(startEvent, params.stream));

    if (batched_bitstreams.size() > 0) {
        CHECK_NVJPEG(
            nvjpegDecodeBatchedInitialize(params.nvjpeg_handle, params.nvjpeg_state,
                                          batched_bitstreams.size(), 1, params.fmt));

        CHECK_NVJPEG(nvjpegDecodeBatched(
            params.nvjpeg_handle, params.nvjpeg_state, batched_bitstreams.data(),
            batched_bitstreams_size.data(), batched_output.data(), params.stream));
    }

    if (otherdecode_bitstreams.size() > 0) {
        CHECK_NVJPEG(nvjpegStateAttachDeviceBuffer(params.nvjpeg_decoupled_state, params.device_buffer));
        int buffer_index = 0;
        CHECK_NVJPEG(nvjpegDecodeParamsSetOutputFormat(params.nvjpeg_decode_params, params.fmt));
        // Bound the loop by the vectors it indexes rather than by
        // params.batch_size, so re-enabling the routing above (which can put
        // fewer than batch_size entries here) cannot read past the end.
        for (size_t i = 0; i < otherdecode_bitstreams.size(); i++) {
            CHECK_NVJPEG(
                nvjpegJpegStreamParse(params.nvjpeg_handle, otherdecode_bitstreams[i], otherdecode_bitstreams_size[i],
                                      0, 0, params.jpeg_streams[buffer_index]));

            CHECK_NVJPEG(nvjpegStateAttachPinnedBuffer(params.nvjpeg_decoupled_state,
                                                       params.pinned_buffers[buffer_index]));

            CHECK_NVJPEG(nvjpegDecodeJpegHost(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
                                              params.nvjpeg_decode_params, params.jpeg_streams[buffer_index]));

            // Wait for stream work queued by the previous iteration before
            // issuing this image's transfer.
            CHECK_CUDA(cudaStreamSynchronize(params.stream));

            CHECK_NVJPEG(nvjpegDecodeJpegTransferToDevice(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
                                                          params.jpeg_streams[buffer_index], params.stream));

            buffer_index = 1 - buffer_index; // switch pinned buffer in pipeline mode to avoid an extra sync

            CHECK_NVJPEG(nvjpegDecodeJpegDevice(params.nvjpeg_handle, params.nvjpeg_decoder, params.nvjpeg_decoupled_state,
                                                &otherdecode_output[i], params.stream));
        }
    }

    CHECK_CUDA(cudaEventRecord(stopEvent, params.stream));

    CHECK_CUDA(cudaEventSynchronize(stopEvent));
    CHECK_CUDA(cudaEventElapsedTime(&loopTime, startEvent, stopEvent));
    time = static_cast<double>(loopTime);

    // The events are created on every call, so release them here; otherwise
    // each call leaks two event objects.
    CHECK_CUDA(cudaEventDestroy(startEvent));
    CHECK_CUDA(cudaEventDestroy(stopEvent));

    return EXIT_SUCCESS;
}

@zeng-qinghui
Copy link
Contributor

  • 中文原文
  1. 为什么测试解码提升很小?

    GPU硬解码在编解码速度上是比CPU快的,但是处理过程中多了 host to device 和 device to host 内存复制时间。所以在处理小图片的速度不理想。由于GPU核心数远远多于CPU核心数,所以使用多线程也有利于获得更好的结果。

  2. 与官方demo 使用的api不一样

    pynvjpeg旨在使用nvjpeg的编解码功能,实现兼容opencv的接口。

  • English Translation
  1. Why is the decoding speed-up so small?

    GPU hardware decoding is faster than the CPU at the encoding/decoding step itself, but the overall process also includes host-to-device and device-to-host memory copies, which take time. For that reason the total time ends up similar to the CPU's, especially for small images. Because a GPU has far more cores than a CPU, using multiple threads should also help the GPU achieve better results than the CPU.

  2. The API is different from the official demo

    PyNvjpeg is designed to use nvJPEG's encoding and decoding functionality to provide an OpenCV-compatible interface.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants