
quickstart/SemanticSegmentation/tutorial-runtime.ipynb

python
#
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

Check the TensorRT version

python
!python3 -c 'import tensorrt; print("TensorRT version: {}".format(tensorrt.__version__))'
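
An optional companion check confirms that the CUDA runtime is reachable from Python. This is a minimal sketch using the cuda-python bindings that the inference code later in this notebook also relies on.

python
# Optional sanity check: query the CUDA runtime version
# (assumes the cuda-python package, imported again below).
from cuda.bindings import runtime as cudart

err, version = cudart.cudaRuntimeGetVersion()
assert err == cudart.cudaError_t.cudaSuccess
print("CUDA runtime version: {}".format(version))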

Prepare the input image and ONNX model file

python
!python3 /workspace/TensorRT/quickstart/SemanticSegmentation/export.py
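
The export script handles both downloading a sample image and exporting torchvision's FCN-ResNet101 to ONNX. For orientation, the sketch below shows what such an export typically looks like; the wrapper module, opset version, and dynamic axes here are illustrative assumptions, so consult export.py for the authoritative details.

python
# Illustrative ONNX export of FCN-ResNet101 (assumes torch and torchvision;
# details such as the wrapper and dynamic axes may differ from export.py).
import torch
import torchvision

class WrappedFCN(torch.nn.Module):
    # Hypothetical wrapper: emit per-pixel class indices rather than raw
    # logits, matching the int64 segmentation map this notebook consumes.
    def __init__(self):
        super().__init__()
        self.model = torchvision.models.segmentation.fcn_resnet101(weights="DEFAULT")

    def forward(self, x):
        return self.model(x)["out"].argmax(1, keepdim=True)

model = WrappedFCN().eval()
dummy = torch.randn(1, 3, 1026, 1282)
torch.onnx.export(
    model, dummy, "fcn-resnet101.onnx",
    input_names=["input"], output_names=["output"],
    dynamic_axes={"input": {0: "batch", 2: "height", 3: "width"},
                  "output": {0: "batch", 2: "height", 3: "width"}},
    opset_version=17,
)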

Build TensorRT engine from the ONNX model

python
!trtexec --onnx=fcn-resnet101.onnx --saveEngine=fcn-resnet101.engine --optShapes=input:1x3x1026x1282 --stronglyTyped
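
trtexec parses the ONNX model, builds an engine tuned for the given input shape, and serializes it to a plan file. Here --optShapes pins the optimization profile to a 1x3x1026x1282 input and --stronglyTyped tells TensorRT to take tensor types directly from the ONNX model. The same build can also be scripted; the following is a rough Python-API equivalent of the command above, not a drop-in replacement.

python
# Sketch of an equivalent engine build via the TensorRT Python API
# (mirrors the trtexec flags above; error handling kept minimal).
import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
# --stronglyTyped: network tensor types come verbatim from the ONNX model.
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.STRONGLY_TYPED))
parser = trt.OnnxParser(network, logger)
with open("fcn-resnet101.onnx", "rb") as f:
    if not parser.parse(f.read()):
        raise RuntimeError(parser.get_error(0))

config = builder.create_builder_config()
# --optShapes: a single optimization profile pinned to 1x3x1026x1282.
profile = builder.create_optimization_profile()
profile.set_shape("input", (1, 3, 1026, 1282), (1, 3, 1026, 1282), (1, 3, 1026, 1282))
config.add_optimization_profile(profile)

with open("fcn-resnet101.engine", "wb") as f:
    f.write(builder.build_serialized_network(network, config))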

Import required modules

python
import numpy as np
import os
import ctypes
from cuda.bindings import runtime as cudart
import tensorrt as trt

import matplotlib.pyplot as plt
from PIL import Image

TRT_LOGGER = trt.Logger()

assert cudart.cudaSetDevice(0) == (cudart.cudaError_t.cudaSuccess,)

# Filenames of TensorRT plan file and input/output images.
engine_file = "/workspace/fcn-resnet101.engine"
input_file  = "/workspace/input.ppm"
output_file = "/workspace/output.ppm"

Utilities for input / output processing

python
# For torchvision models, input images are loaded into a range of [0, 1] and
# normalized using mean = [0.485, 0.456, 0.406] and stddev = [0.229, 0.224, 0.225].
def preprocess(image):
    # Mean normalization
    mean = np.array([0.485, 0.456, 0.406]).astype('float32')
    stddev = np.array([0.229, 0.224, 0.225]).astype('float32')
    data = (np.asarray(image).astype('float32') / float(255.0) - mean) / stddev
    # Switch from HWC to CHW order
    return np.moveaxis(data, 2, 0)

def postprocess(data):
    num_classes = 21
    # create a color palette, selecting a color for each class
    palette = np.array([2 ** 25 - 1, 2 ** 15 - 1, 2 ** 21 - 1])
    colors = np.array([palette*i%255 for i in range(num_classes)]).astype("uint8")
    # plot the segmentation predictions for 21 classes in different colors
    img = Image.fromarray(data.astype('uint8'), mode='P')
    img.putpalette(colors)
    return img
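
As a quick sanity check, the two helpers can be exercised on dummy data: preprocess should produce a CHW float32 array, and postprocess a palettized 'P'-mode image. The shapes below match the 1026x1282 input used elsewhere in this notebook and are otherwise illustrative.

python
# Illustrative shape check for the helpers above (dummy data only).
import numpy as np
from PIL import Image

dummy_img = Image.new("RGB", (1282, 1026))   # PIL sizes are (width, height)
chw = preprocess(dummy_img)
assert chw.shape == (3, 1026, 1282) and chw.dtype == np.float32

dummy_classes = np.zeros((1026, 1282), dtype=np.int64)
seg = postprocess(dummy_classes)             # palettized 'P'-mode PIL image
assert seg.size == (1282, 1026)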

Load TensorRT engine

Deserialize the TensorRT engine from the specified plan file.

python
def load_engine(engine_file_path):
    assert os.path.exists(engine_file_path)
    print("Reading engine from file {}".format(engine_file_path))
    with open(engine_file_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        return runtime.deserialize_cuda_engine(f.read())

Inference pipeline

Starting with a deserialized engine, the TensorRT inference pipeline consists of the following steps:

  • Create an execution context and specify the input shape (based on the image dimensions for inference).
  • Allocate CUDA device memory for input and output.
  • Allocate CUDA page-locked host memory to efficiently copy back the output.
  • Transfer the processed image data into input memory using an asynchronous host-to-device CUDA copy.
  • Kick off the TensorRT inference pipeline using the asynchronous execute API.
  • Transfer the segmentation output back into page-locked host memory using an asynchronous device-to-host CUDA copy.
  • Synchronize the stream used for data transfers and inference execution to ensure all operations are complete.
  • Finally, write out the segmentation output to an image file for visualization.
python
def infer(engine, input_file, output_file):
    print("Reading input image from file {}".format(input_file))
    with Image.open(input_file) as img:
        input_image = preprocess(img)
        image_width = img.width
        image_height = img.height

    with engine.create_execution_context() as context:
        input_buffers = {}
        input_memories = {}

        # Allocate host and device buffers
        tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
        for tensor in tensor_names:
            # Note: for the input tensor this shape may still be dynamic (-1 dims);
            # `size` is only consumed in the output branch, by which point the
            # input shape has been set and the output shape is fully resolved.
            size = trt.volume(context.get_tensor_shape(tensor))
            dtype = trt.nptype(engine.get_tensor_dtype(tensor))

            if engine.get_tensor_mode(tensor) == trt.TensorIOMode.INPUT:
                context.set_input_shape(tensor, (1, 3, image_height, image_width))
                input_buffers[tensor] = np.ascontiguousarray(input_image)
                err, input_memories[tensor] = cudart.cudaMalloc(input_image.nbytes)
                assert err == cudart.cudaError_t.cudaSuccess
                context.set_tensor_address(tensor, input_memories[tensor])
            else:
                err, output_buffer_ptr = cudart.cudaMallocHost(size * dtype().itemsize)
                assert err == cudart.cudaError_t.cudaSuccess
                pointer_type = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))
                output_buffer = np.ctypeslib.as_array(ctypes.cast(output_buffer_ptr, pointer_type), (size,))

                err, output_memory = cudart.cudaMalloc(output_buffer.nbytes)
                assert err == cudart.cudaError_t.cudaSuccess
                context.set_tensor_address(tensor, output_memory)

        err, stream = cudart.cudaStreamCreate()
        assert err == cudart.cudaError_t.cudaSuccess

        # Transfer input data to the GPU for all input tensors
        for tensor_name, input_buffer in input_buffers.items():
            input_memory = input_memories[tensor_name]
            err, = cudart.cudaMemcpyAsync(input_memory, input_buffer.ctypes.data, input_buffer.nbytes,
                                          cudart.cudaMemcpyKind.cudaMemcpyHostToDevice, stream)
            assert err == cudart.cudaError_t.cudaSuccess

        # Run inference
        context.execute_async_v3(stream)

        # Transfer prediction output from the GPU.
        err, = cudart.cudaMemcpyAsync(output_buffer.ctypes.data, output_memory, output_buffer.nbytes,
                                      cudart.cudaMemcpyKind.cudaMemcpyDeviceToHost, stream)
        assert err == cudart.cudaError_t.cudaSuccess
        # Synchronize the stream
        err, = cudart.cudaStreamSynchronize(stream)
        assert err == cudart.cudaError_t.cudaSuccess

        # Dump the raw per-pixel class indices to a text file for inspection.
        output_d64 = np.array(output_buffer, dtype=np.int64)
        np.savetxt('test.out', output_d64.astype(int), fmt='%i', delimiter=' ', newline=' ')

        with postprocess(np.reshape(output_buffer, (image_height, image_width))) as img:
            print("Writing output image to file {}".format(output_file))
            img.convert('RGB').save(output_file, "PPM")

        # cleanup cuda resources for all input tensors
        for input_memory in input_memories.values():
            cudart.cudaFree(input_memory)
        cudart.cudaFree(output_memory)
        cudart.cudaFreeHost(output_buffer_ptr)
        cudart.cudaStreamDestroy(stream)

Plot input image

python
plt.imshow(Image.open(input_file))

Run inference

python
print("Running TensorRT inference for FCN-ResNet101")
with load_engine(engine_file) as engine:
    infer(engine, input_file, output_file)

Plot segmentation output

python
plt.imshow(Image.open(output_file))