deploy/gke-marketplace-app/trt-engine/README.md
# Launch the TensorRT container interactively with all GPUs, host networking,
# raised shared-memory/ulimit settings, and the home directory mounted at
# /scripts.
docker run \
  --gpus all \
  --interactive --tty \
  --network host \
  --shm-size=1g \
  --ulimit memlock=-1 \
  --ulimit stack=67108864 \
  --volume ~:/scripts \
  nvcr.io/nvidia/tensorrt:26.04-py3
# Install the Python packages needed by the later steps.
pip install \
  onnx \
  six \
  torch \
  tf2onnx \
  tensorflow

# Check out the main branch of the TensorRT repository and pull in its
# submodules.
git clone --branch main https://github.com/NVIDIA/TensorRT.git
cd TensorRT
git submodule update --init --recursive

# Export the source and library locations.
# NOTE(review): the consumers of these variables are not visible here;
# /workspace/TensorRT assumes the clone above was made from /workspace.
export TRT_OSSPATH=/workspace/TensorRT TRT_LIBPATH=/lib/x86_64-linux-gnu
# Download the NGC CLI archive into /usr/local/bin, unpack it, make the binary
# executable, remove the downloaded archive and checksum file, and symlink the
# binary as /usr/local/bin/ngc.  Each step runs only if the previous succeeded.
#
# The final step answers the first two prompts of `ngc config set`
# (presumably API key and output format -- confirm against the NGC CLI docs).
# printf is used instead of echo because bash's builtin echo does not expand
# "\n" by default, which would send one literal line instead of two answers.
pushd /usr/local/bin \
  && wget https://ngc.nvidia.com/downloads/ngccli_cat_linux.zip \
  && unzip ngccli_cat_linux.zip \
  && chmod u+x ngc-cli/ngc \
  && rm ngccli_cat_linux.zip ngc-cli.md5 \
  && ln -s ngc-cli/ngc ngc \
  && printf 'no-apikey\nascii\n' | ngc config set
# Return to the directory that was current before the pushd.
popd
# Work from the BERT demo directory of the TensorRT checkout.
cd /workspace/TensorRT/demo/BERT

# Run the demo's download helper scripts (presumably the SQuAD data and the
# "large 128" fine-tuned checkpoint -- see the scripts themselves).
bash ./scripts/download_squad.sh
bash ./scripts/download_model.sh large 128
# Alternative model download, left disabled:
# bash ./scripts/download_model.sh large 384

# Build the engine into engines/.  The checkpoint, config directory and vocab
# file all come from the same downloaded model directory.
# NOTE(review): flag semantics are defined by builder.py, which is not shown.
mkdir -p engines
python3 builder.py \
  -m models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/model.ckpt \
  -o engines/bert_large_int8_bs1_s128.engine \
  -b 1 \
  -s 128 \
  -c models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/ \
  -v models/fine-tuned/bert_tf_ckpt_large_qa_squad2_amp_128_v19.03.1/vocab.txt \
  --int8 --fp16 --strict \
  --calib-num 1 \
  -iln -imh
# Upload the built engine to GCS as the model plan.  builder.py writes the
# engine under engines/ relative to the demo/BERT working directory, so the
# source path must include that prefix for the copy to find the file.
gsutil cp engines/bert_large_int8_bs1_s128.engine gs://triton_sample_models/26.04/bert/1/model.plan
For each Triton upgrade, both the container version used to generate the model and the model path in GCS (gs://triton_sample_models/26.04/) should be updated to match the new version.