update docker for triton (#1655)
yuekaizhang authored Jan 10, 2023
1 parent 762954b commit 4c59a67
Showing 4 changed files with 11 additions and 7 deletions.
runtime/gpu/Dockerfile/Dockerfile.server (2 additions, 3 deletions)
@@ -1,10 +1,9 @@
-FROM nvcr.io/nvidia/tritonserver:22.03-py3
+FROM nvcr.io/nvidia/tritonserver:22.11-py3
 LABEL maintainer="NVIDIA"
 LABEL repository="tritonserver"
 
-RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub
 RUN apt-get update && apt-get -y install swig && apt-get -y install python3-dev && apt-get install -y cmake
-RUN pip3 install torch==1.12.0 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu113
+RUN pip3 install torch==1.12.0 torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu116
 RUN pip3 install -v kaldifeat
 RUN pip3 install pyyaml onnx
 WORKDIR /workspace
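The two changes above travel together: the newer `22.11` Triton base image ships a newer CUDA runtime, so the torch wheels are now pulled from the `cu116` index instead of `cu113`. A minimal sanity check one might run inside the rebuilt container, assuming the Dockerfile above was used unchanged (the expected version strings come from the cu116 wheel, not from the commit):

```python
# Minimal sanity check, run inside the rebuilt container: verify that the
# installed torch wheel is the cu116 build and that a GPU is visible.
import torch

print("torch:", torch.__version__)               # expected: 1.12.0+cu116
print("compiled for CUDA:", torch.version.cuda)  # expected: 11.6
assert torch.version.cuda == "11.6", "torch is not the cu116 build"
assert torch.cuda.is_available(), "no GPU visible; check `docker run --gpus`"
```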
runtime/gpu/README.md (1 addition, 1 deletion)
@@ -12,7 +12,7 @@ model_dir=<absolute path to>/20211025_conformer_exp
 onnx_model_dir=<absolute path>
 mkdir $onnx_model_dir
 python3 wenet/bin/export_onnx_gpu.py --config=$model_dir/train.yaml --checkpoint=$model_dir/final.pt --cmvn_file=$model_dir/global_cmvn --ctc_weight=0.5 --output_onnx_dir=$onnx_model_dir --fp16
-cp $model_dir/units.txt $model_dir/train.yaml $onnx_model_dir/
+cp $model_dir/words.txt $model_dir/train.yaml $onnx_model_dir/
 ```
 
 If you want to export a streaming model (u2/u2++) for streaming inference (inference chunk by chunk) instead of offline inference (inference on whole audio segments/utterances), add the `--streaming` option:
runtime/gpu/model_repo/scoring/1/model.py (6 additions, 1 deletion)
@@ -1,6 +1,7 @@
 import triton_python_backend_utils as pb_utils
 import numpy as np
 import multiprocessing
+from torch.utils.dlpack import from_dlpack
 from swig_decoders import ctc_beam_search_decoder_batch, \
     Scorer, PathTrie, TrieVector, map_batch
 import json
@@ -254,7 +255,11 @@ def execute(self, requests):
             # Extract the output tensors from the inference response.
             best_index = pb_utils.get_output_tensor_by_name(inference_response,
                                                             'best_index')
-            best_index = best_index.as_numpy()
+            if best_index.is_cpu():
+                best_index = best_index.as_numpy()
+            else:
+                best_index = from_dlpack(best_index.to_dlpack())
+                best_index = best_index.cpu().numpy()
             hyps = []
             idx = 0
             for cands, cand_lens in zip(in_hyps_pad_sos_eos, in_hyps_lens_sos):
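The change above exists because `pb_utils.Tensor.as_numpy()` only works for tensors in host memory; when the upstream model instance runs on GPU, `best_index` arrives in device memory and has to cross through DLPack first. The same pattern, pulled out as a standalone sketch (the helper name `to_host_numpy` is mine, not from the commit):

```python
# Standalone sketch of the CPU/GPU-agnostic extraction pattern used above.
# Assumes the Triton Python backend environment with torch installed;
# `to_host_numpy` is a hypothetical helper name, not part of the commit.
import triton_python_backend_utils as pb_utils
from torch.utils.dlpack import from_dlpack


def to_host_numpy(tensor):
    """Return a host-side numpy array for a pb_utils.Tensor on CPU or GPU."""
    if tensor.is_cpu():
        # Host tensors expose their buffer directly as numpy.
        return tensor.as_numpy()
    # Device tensors: zero-copy into torch via DLPack, then copy to host.
    return from_dlpack(tensor.to_dlpack()).cpu().numpy()
```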
runtime/gpu/scripts/convert_start_server.sh (2 additions, 2 deletions)
@@ -19,8 +19,8 @@ onnx_model_dir=/ws/onnx_model
 model_repo=/ws/model_repo
 
 # Convert config.pbtxt in model_repo and move models
-python3 scripts/convert.py --config=$onnx_model_dir/train.yaml --vocab=$onnx_model_dir/units.txt \
+python3 scripts/convert.py --config=$onnx_model_dir/train.yaml --vocab=$onnx_model_dir/words.txt \
     --model_repo=$model_repo --onnx_model_dir=$onnx_model_dir
 
 # Start server
-tritonserver --model-repository=/ws/model_repo --pinned-memory-pool-byte-size=1024000000 --cuda-memory-pool-byte-size=0:1024000000
+tritonserver --model-repository=${model_repo} --pinned-memory-pool-byte-size=1024000000 --cuda-memory-pool-byte-size=0:1024000000
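Once `tritonserver` is up, a quick probe from a client confirms the converted models actually loaded. A hedged sketch using the `tritonclient` HTTP client, which is an assumption on my part (the script does not install it), against Triton's default HTTP port:

```python
# Hedged readiness probe; assumes `pip3 install tritonclient[http]` and the
# default HTTP endpoint on port 8000 of the serving container.
import tritonclient.http as httpclient

client = httpclient.InferenceServerClient(url="localhost:8000")
print("server ready:", client.is_server_ready())
# List the models Triton found in the repository and their load state.
for model in client.get_model_repository_index():
    print(model["name"], model.get("state", "UNKNOWN"))
```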
