Add fastdeploy server and client component (#1169)
* add backend support for fastdeploy server
* fix
* add code
* fix
* fix
* add fastdeploy server component
* add fastdeploy server and client
* add exception description
* fix
* add model repository judgement
* add component tab for fastdeploy client
* update more tasks in fastdeploy client
* sort filenames
* backup config
* noqa for autogenerated file
* add data validation
* add __init__ for package
* add calculating layout for frontend
* add alive server detection and optimize client
* add alive server detection and optimize client
* add alive server detection and optimize client
* add metrics in gradio client
* update presentation
* Change return value to None for frontend performance data when server not ready
* add get_server_config and download_pretrain_model api
* add get_server_config and download_pretrain_model api
* add unit for metric table
* add unit for metric table
* fix a bug
* add judgement pretrained model download
* add judgement pretrained model download
* add version info for frontend
* rename download model
* fix a bug
* add fastdeploy model list
* optimize for choose configuration files
* modify according to frontend need
* fix name in config to model name
* optimize for server list and alive judgement
* keep server name as string type
* optimize process judgement logic
* optimize for deleting resource files
* add rename resource file
* fix
* fix a bug
* optimize code structure
* optimize code structure
* remove chinese tips and remove fastdeploy-python in requirements
Showing 14 changed files with 5,145 additions and 3 deletions.
@@ -12,4 +12,8 @@ multiprocess
 packaging
 x2paddle
 rarfile
-onnx >= 1.6.0
+gradio
+tritonclient[all]
+attrdict
+psutil
+onnx >= 1.6.0
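The new entries pull in the client stack: gradio for the web UI, tritonclient[all] for the Triton HTTP/gRPC inference client used below, attrdict for attribute-style access to model metadata, and psutil (presumably for process and resource inspection elsewhere in the component). A quick import sanity check, assuming the requirements are installed:

import gradio
import psutil
import tritonclient.http as httpclient
from attrdict import AttrDict

print(gradio.__version__, psutil.__version__)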
14 changes: 14 additions & 0 deletions in visualdl/component/inference/fastdeploy_client/__init__.py
@@ -0,0 +1,14 @@
# Copyright (c) 2022 VisualDL Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
409 changes: 409 additions & 0 deletions in visualdl/component/inference/fastdeploy_client/client_app.py
Large diffs are not rendered by default.
304 changes: 304 additions & 0 deletions in visualdl/component/inference/fastdeploy_client/http_client_manager.py
@@ -0,0 +1,304 @@
# Copyright (c) 2022 VisualDL Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# =======================================================================
import json
import re

import numpy as np
import requests
import tritonclient.http as httpclient
from attrdict import AttrDict
from tritonclient.utils import InferenceServerException


def convert_http_metadata_config(metadata):
    metadata = AttrDict(metadata)

    return metadata


def prepare_request(inputs_meta, inputs_data, outputs_meta):
    '''
    inputs_meta: input metadata from the model. name: info
    inputs_data: user input data. name: data
    '''
    # Set the input data
    inputs = []
    for input_dict in inputs_meta:
        input_name = input_dict['name']
        if input_name not in inputs_data:
            raise RuntimeError(
                'Error: input name {} required by the model does not exist.'.
                format(input_name))
        if input_dict['datatype'] == 'FP32':
            inputs_data[input_name] = inputs_data[input_name].astype(
                np.float32
            ) / 255  # image data returned by gradio is uint8, convert to fp32
            if len(input_dict['shape']) == 3 and input_dict['shape'][0] == 3:
                # HWC -> CHW, dropping gradio's batch dimension
                inputs_data[input_name] = inputs_data[input_name][0].transpose(
                    2, 0, 1)
            elif len(input_dict['shape']) == 4 and input_dict['shape'][1] == 3:
                # NHWC -> NCHW
                inputs_data[input_name] = inputs_data[input_name].transpose(
                    0, 3, 1, 2)
        infer_input = httpclient.InferInput(
            input_name, inputs_data[input_name].shape, input_dict['datatype'])
        infer_input.set_data_from_numpy(inputs_data[input_name])
        inputs.append(infer_input)
    outputs = []
    for output_dict in outputs_meta:
        infer_output = httpclient.InferRequestedOutput(output_dict.name)
        outputs.append(infer_output)
    return inputs, outputs
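A minimal sketch of how prepare_request is typically driven; the metadata mirrors what the server's model-metadata endpoint returns, and all names here are illustrative:

from attrdict import AttrDict
import numpy as np

inputs_meta = [{'name': 'INPUT_0', 'datatype': 'FP32', 'shape': [3, 224, 224]}]
inputs_data = {'INPUT_0': np.zeros((1, 224, 224, 3), dtype=np.uint8)}  # gradio-style uint8 batch
outputs_meta = [AttrDict({'name': 'OUTPUT_0'})]  # outputs are read by attribute

inputs, outputs = prepare_request(inputs_meta, inputs_data, outputs_meta)
# inputs now holds InferInput objects carrying FP32 CHW data scaled to [0, 1]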
metrics_table_head = """
<style>
table, th {{
border:0.1px solid black;
}}
</style>
<div>
<table style="width:100%">
<tr>
<th rowspan="2">Model name</th>
<th colspan="4">Execution statistics</th>
<th colspan="5">Latency statistics</th>
</tr>
<tr>
<th>Successful requests</th>
<th>Failed requests</th>
<th>Inference batches</th>
<th>Inference samples</th>
<th>Request latency (ms)</th>
<th>Queue wait time (ms)</th>
<th>Input processing time (ms)</th>
<th>Model inference time (ms)</th>
<th>Output processing time (ms)</th>
</tr>
{}
</table>
</div>
<br>
<br>
<br>
<br>
<br>
<div>
<table style="width:100%">
<tr>
<th rowspan="2">GPU</th>
<th colspan="4">Performance metrics</th>
<th colspan="2">GPU memory</th>
</tr>
<tr>
<th>Utilization (%)</th>
<th>Power usage (W)</th>
<th>Power limit (W)</th>
<th>Energy consumption (J)</th>
<th>Total (GB)</th>
<th>Used (GB)</th>
</tr>
{}
</table>
</div>
"""
def get_metric_data(server_addr, metric_port):  # noqa: C901
    '''
    Get metrics data from the fastdeploy server and transform it into an html table.
    Args:
        server_addr(str): fastdeployserver ip address
        metric_port(int): fastdeployserver metrics port
    Returns:
        htmltable(str): html table to show metrics data
    '''
    model_table = {}
    gpu_table = {}
    metric_column_name = {
        "Model": {
            "nv_inference_request_success", "nv_inference_request_failure",
            "nv_inference_count", "nv_inference_exec_count",
            "nv_inference_request_duration_us",
            "nv_inference_queue_duration_us",
            "nv_inference_compute_input_duration_us",
            "nv_inference_compute_infer_duration_us",
            "nv_inference_compute_output_duration_us"
        },
        "GPU": {
            "nv_gpu_power_usage", "nv_gpu_power_limit",
            "nv_energy_consumption", "nv_gpu_utilization",
            "nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes"
        },
        "CPU": {
            "nv_cpu_utilization", "nv_cpu_memory_total_bytes",
            "nv_cpu_memory_used_bytes"
        }
    }
    try:
        res = requests.get("http://{}:{}/metrics".format(
            server_addr, metric_port))
    except Exception:
        return metrics_table_head.format('', '')
    metric_content = res.text
    for content in metric_content.split('\n'):
        if content.startswith('#'):
            continue
        # match 'name{labels} value' lines produced by the metrics endpoint;
        # \S+ (rather than \w+) also accepts float and exponent values
        res = re.match(r'(\w+){(.*)} (\S+)', content)
        if not res:
            continue
        metric_name = res.group(1)
        model = res.group(2)
        value = res.group(3)
        infos = {}
        for info in model.split(','):
            k, v = info.split('=')
            v = v.strip('"')
            infos[k] = v
        if metric_name in [
                "nv_inference_request_duration_us",
                "nv_inference_queue_duration_us",
                "nv_inference_compute_input_duration_us",
                "nv_inference_compute_infer_duration_us",
                "nv_inference_compute_output_duration_us"
        ]:
            value = str(float(value) / 1000)  # us -> ms
        elif metric_name in [
                "nv_gpu_memory_total_bytes", "nv_gpu_memory_used_bytes"
        ]:
            value = str(float(value) / 1024 / 1024 / 1024)  # bytes -> GB
        for key, metric_names in metric_column_name.items():
            if metric_name in metric_names:
                if key == 'Model':
                    model_name = infos['model']
                    if model_name not in model_table:
                        model_table[model_name] = {}
                    model_table[model_name][metric_name] = value
                elif key == 'GPU':
                    gpu_name = infos['gpu_uuid']
                    if gpu_name not in gpu_table:
                        gpu_table[gpu_name] = {}
                    gpu_table[gpu_name][metric_name] = value
                elif key == 'CPU':
                    pass  # CPU metrics are collected but not rendered yet
    model_data_list = []
    gpu_data_list = []
    model_data_metric_names = [
        "nv_inference_request_success", "nv_inference_request_failure",
        "nv_inference_exec_count", "nv_inference_count",
        "nv_inference_request_duration_us", "nv_inference_queue_duration_us",
        "nv_inference_compute_input_duration_us",
        "nv_inference_compute_infer_duration_us",
        "nv_inference_compute_output_duration_us"
    ]
    gpu_data_metric_names = [
        "nv_gpu_utilization", "nv_gpu_power_usage", "nv_gpu_power_limit",
        "nv_energy_consumption", "nv_gpu_memory_total_bytes",
        "nv_gpu_memory_used_bytes"
    ]
    for k, v in model_table.items():
        data = [k]
        for data_metric in model_data_metric_names:
            data.append(v[data_metric])
        model_data_list.append(data)
    for k, v in gpu_table.items():
        data = [k]
        for data_metric in gpu_data_metric_names:
            data.append(v[data_metric])
        gpu_data_list.append(data)
    model_data = '\n'.join([
        "<tr>" + '\n'.join(["<td>" + item + "</td>"
                            for item in data]) + "</tr>"
        for data in model_data_list
    ])
    gpu_data = '\n'.join([
        "<tr>" + '\n'.join(["<td>" + item + "</td>"
                            for item in data]) + "</tr>"
        for data in gpu_data_list
    ])
    return metrics_table_head.format(model_data, gpu_data)
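get_metric_data parses the Prometheus text exposition that the Triton-based fastdeployserver serves on its metrics port. A sketch of one such line and the pieces the regex extracts (the model name and value are made up):

import re

line = 'nv_inference_request_success{model="det_model",version="1"} 42'
m = re.match(r'(\w+){(.*)} (\S+)', line)
print(m.group(1))  # metric name: nv_inference_request_success
print(m.group(2))  # labels: model="det_model",version="1"
print(m.group(3))  # value: 42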
class HttpClientManager:
    def __init__(self):
        self.clients = {}  # server url: httpclient

    def _create_client(self, server_url):
        if server_url in self.clients:
            return self.clients[server_url]
        try:
            fastdeploy_client = httpclient.InferenceServerClient(server_url)
            self.clients[server_url] = fastdeploy_client
            return fastdeploy_client
        except Exception:
            raise RuntimeError(
                'Cannot connect to server {}, please check your '
                'server address.'.format(server_url))

    def infer(self, server_url, model_name, model_version, inputs):
        fastdeploy_client = self._create_client(server_url)
        input_metadata, output_metadata = self.get_model_meta(
            server_url, model_name, model_version)
        inputs, outputs = prepare_request(input_metadata, inputs,
                                          output_metadata)
        response = fastdeploy_client.infer(
            model_name, inputs, model_version=model_version, outputs=outputs)

        results = {}
        for output in output_metadata:
            result = response.as_numpy(output.name)  # datatype: numpy
            if output.datatype == 'BYTES':  # datatype: bytes
                try:
                    value = result
                    if len(result.shape) == 1:
                        value = result[0]
                    elif len(result.shape) == 2:
                        value = result[0][0]
                    elif len(result.shape) == 3:
                        value = result[0][0][0]
                    result = json.loads(value)  # datatype: json
                except Exception:
                    pass
            else:
                result = result[0]
            results[output.name] = result
        return results

    def raw_infer(self, server_url, model_name, model_version, raw_input):
        url = 'http://{}/v2/models/{}/versions/{}/infer'.format(
            server_url, model_name, model_version)
        res = requests.post(url, data=json.dumps(json.loads(raw_input)))
        return json.dumps(res.json())
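    # A hypothetical raw_infer payload, following the KServe v2 REST schema
    # that the URL above targets (names and values are illustrative):
    #   raw_input = ('{"inputs": [{"name": "INPUT_0", "shape": [1, 2], '
    #                '"datatype": "FP32", "data": [1.0, 2.0]}], '
    #                '"outputs": [{"name": "OUTPUT_0"}]}')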
    def get_model_meta(self, server_url, model_name, model_version):
        fastdeploy_client = self._create_client(server_url)
        try:
            model_metadata = fastdeploy_client.get_model_metadata(
                model_name=model_name, model_version=model_version)
        except InferenceServerException as e:
            raise RuntimeError("Failed to retrieve the metadata: " + str(e))

        model_metadata = convert_http_metadata_config(model_metadata)

        input_metadata = model_metadata.inputs
        output_metadata = model_metadata.outputs
        return input_metadata, output_metadata
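Putting the pieces together, a minimal usage sketch of HttpClientManager; the server address, model name, and input name are assumptions for illustration:

import numpy as np

manager = HttpClientManager()
server = 'localhost:8000'  # assumed fastdeployserver HTTP endpoint

# structured path: metadata is fetched first, then prepare_request
# normalizes and transposes the gradio-style uint8 image batch
image = np.zeros((1, 224, 224, 3), dtype=np.uint8)
results = manager.infer(server, 'det_model', '1', {'INPUT_0': image})

# raw path: the JSON body is forwarded to the KServe v2 infer endpoint
raw = manager.raw_infer(server, 'det_model', '1',
                        '{"inputs": [{"name": "INPUT_0", "shape": [1], '
                        '"datatype": "FP32", "data": [1.0]}]}')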