Jetson/L4T/TRT Customized Example

This page collects information about deploying customized models with TensorRT, along with answers to some common questions for Jetson.

== TensorRT Python ==

=== OpenCV with ONNX model ===

Below is an example of deploying an ONNX model with TensorRT, using OpenCV images as input.

Verified environment:

* JetPack 4.5.1 + Xavier

<syntaxhighlight lang="python">
import cv2
import time
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(TRT_LOGGER)

host_inputs  = []
cuda_inputs  = []
host_outputs = []
cuda_outputs = []
bindings = []


def Inference(engine):
    image = cv2.imread("/usr/src/tensorrt/data/resnet50/airliner.ppm")
    image = (2.0 / 255.0) * image.transpose((2, 0, 1)) - 1.0

    np.copyto(host_inputs[0], image.ravel())
    stream = cuda.Stream()
    context = engine.create_execution_context()

    start_time = time.time()
    cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
    stream.synchronize()
    print("execute times "+str(time.time()-start_time))

    output = host_outputs[0].reshape(np.concatenate(([1],engine.get_binding_shape(1))))
    print(np.argmax(output))


def PrepareEngine():
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        builder.max_workspace_size = 1 << 30
        with open('/usr/src/tensorrt/data/resnet50/ResNet50.onnx', 'rb') as model:
            if not parser.parse(model.read()):
                print ('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print (parser.get_error(error))
        engine = builder.build_cuda_engine(network)

        # create buffer
        for binding in engine:
            size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
            host_mem = cuda.pagelocked_empty(shape=[size],dtype=np.float32)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)

            bindings.append(int(cuda_mem))
            if engine.binding_is_input(binding):
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        return engine


if __name__ == "__main__":
    engine = PrepareEngine()
    Inference(engine)
</syntaxhighlight>
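
The builder calls above (builder.max_workspace_size, build_cuda_engine) belong to the TensorRT 7.x API that ships with JetPack 4.5.1; TensorRT 8.x (JetPack 4.6 and later) removed them. Below is only a rough sketch of the equivalent build step on TensorRT 8.x, not part of the verified example:

<syntaxhighlight lang="python">
# Sketch only: equivalent build step for TensorRT 8.x (assumption, not verified here).
with trt.Builder(TRT_LOGGER) as builder, \
     builder.create_network(EXPLICIT_BATCH) as network, \
     trt.OnnxParser(network, TRT_LOGGER) as parser, \
     builder.create_builder_config() as config:
    config.max_workspace_size = 1 << 30        # moved from the builder to the builder config
    with open('/usr/src/tensorrt/data/resnet50/ResNet50.onnx', 'rb') as model:
        if not parser.parse(model.read()):
            for i in range(parser.num_errors):
                print(parser.get_error(i))
    # build_cuda_engine() was removed; build a serialized plan and deserialize it instead
    plan = builder.build_serialized_network(network, config)
    engine = trt.Runtime(TRT_LOGGER).deserialize_cuda_engine(plan)
</syntaxhighlight>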


=== OpenCV with PLAN model ===

Below is an example of deploying a serialized TensorRT PLAN (engine) file, again using OpenCV images as input.

Verified environment:

* JetPack 4.5.1 + Xavier

Generate the PLAN file with trtexec first:

$ /usr/src/tensorrt/bin/trtexec --onnx=/usr/src/tensorrt/data/resnet50/ResNet50.onnx --saveEngine=trt.plan

<syntaxhighlight lang="python">
import cv2
import time
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(TRT_LOGGER)

host_inputs  = []
cuda_inputs  = []
host_outputs = []
cuda_outputs = []
bindings = []


def Inference(engine):
    image = cv2.imread("/usr/src/tensorrt/data/resnet50/airliner.ppm")
    image = (2.0 / 255.0) * image.transpose((2, 0, 1)) - 1.0

    np.copyto(host_inputs[0], image.ravel())
    stream = cuda.Stream()
    context = engine.create_execution_context()

    start_time = time.time()
    cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
    context.execute_async(bindings=bindings, stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
    stream.synchronize()
    print("execute times "+str(time.time()-start_time))

    output = host_outputs[0].reshape(np.concatenate(([1],engine.get_binding_shape(1))))
    print(np.argmax(output))


def PrepareEngine():
    runtime = trt.Runtime(TRT_LOGGER)
    with open('./trt.plan', 'rb') as f:
        buf = f.read()
        engine = runtime.deserialize_cuda_engine(buf)

    # create buffer
    for binding in engine:
        size = trt.volume(engine.get_binding_shape(binding)) * engine.max_batch_size
        host_mem = cuda.pagelocked_empty(shape=[size],dtype=np.float32)
        cuda_mem = cuda.mem_alloc(host_mem.nbytes)

        bindings.append(int(cuda_mem))
        if engine.binding_is_input(binding):
            host_inputs.append(host_mem)
            cuda_inputs.append(cuda_mem)
        else:
            host_outputs.append(host_mem)
            cuda_outputs.append(cuda_mem)

    return engine


if __name__ == "__main__":
    engine = PrepareEngine()
    Inference(engine)
</syntaxhighlight>
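
As an alternative to the trtexec command above, the PLAN file can also be written from Python. A minimal sketch, assuming an engine built with the TensorRT 7.x API (for example by the PrepareEngine() of the ONNX example in the previous section):

<syntaxhighlight lang="python">
# Sketch only: serialize an already-built ICudaEngine to a PLAN file.
# 'engine' is assumed to come from the ONNX build flow shown in the previous section.
with open('trt.plan', 'wb') as f:
    f.write(engine.serialize())
</syntaxhighlight>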


=== Multi-threading ===

Below is an example of running TensorRT inference from a worker thread.

Verified environment:

* JetPack 4.5.1 + Xavier

Prepare the engine, the test images, and the TRTInference wrapper first:

$ /usr/src/tensorrt/bin/trtexec --onnx=/usr/src/tensorrt/data/mnist/mnist.onnx --saveEngine=mnist.trt
$ cd /usr/src/tensorrt/data/mnist/
$ sudo pip3 install pillow
$ python3 download_pgms.py
$ wget https://raw.githubusercontent.com/AastaNV/JEP/master/elinux/my_tensorrt_code.py -O my_tensorrt_code.py

<syntaxhighlight lang="python">
import threading
import time
from my_tensorrt_code import TRTInference, trt

exitFlag = 0

class myThread(threading.Thread):
    def __init__(self, func, args):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args

    def run(self):
        print("Starting " + self.args[0])
        self.func(*self.args)
        print("Exiting " + self.args[0])

if __name__ == '__main__':
    # Create new threads
    '''
    myThread arguments:
        - func: the function the thread should run
        - args: the arguments passed to that function
    '''

    trt_engine_path = 'mnist.trt'

    max_batch_size = 1
    trt_inference_wrapper = TRTInference(trt_engine_path,
        trt_engine_datatype=trt.DataType.FLOAT,
        batch_size=max_batch_size)

    # Input image for the MNIST engine
    input_img_path = '/usr/src/tensorrt/data/mnist/3.pgm'

    thread1 = myThread(trt_inference_wrapper.infer, [input_img_path])

    # Start new Threads
    thread1.start()
    thread1.join()
    trt_inference_wrapper.destory()  # method name as provided by my_tensorrt_code.py
    print ("Exiting Main Thread")


== Deepstream ==

=== YoloV4 Tiny ===

Verified environment:

* JetPack 4.5.1 + Xavier

DeepStream can reach 60 fps with four video streams on Xavier:

$ cd /opt/nvidia/deepstream/deepstream-5.1/sources/objectDetector_Yolo
$ wget https://raw.githubusercontent.com/AastaNV/eLinux_data/main/deepstream/yolov4-tiny/yolov4_tiny.patch
$ git apply yolov4_tiny.patch
$ export CUDA_VER=10.2
$ make -C nvdsinfer_custom_impl_Yolo
$ wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-tiny.cfg -q --show-progress
$ wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.weights -q --show-progress
$ wget https://raw.githubusercontent.com/AastaNV/eLinux_data/main/deepstream/yolov4-tiny/deepstream_app_config_yoloV4_tiny.txt
$ wget https://raw.githubusercontent.com/AastaNV/eLinux_data/main/deepstream/yolov4-tiny/config_infer_primary_yoloV4_tiny.txt
$ deepstream-app -c deepstream_app_config_yoloV4_tiny.txt
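
The 60 fps figure above comes from running deepstream-app with four tiled streams. Below is a rough sketch of the multi-stream part of a deepstream-app configuration, with illustrative values only; it is not the exact content of deepstream_app_config_yoloV4_tiny.txt:

<syntaxhighlight lang="ini">
# Illustrative only: standard deepstream-app config groups for four tiled streams.
[tiled-display]
enable=1
rows=2
columns=2

[source0]
enable=1
# type=3: URI source
type=3
uri=file:///opt/nvidia/deepstream/deepstream-5.1/samples/streams/sample_1080p_h264.mp4
# replicate this source four times
num-sources=4
</syntaxhighlight>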


=== Custom Parser for SSD-MobileNet Trained by Jetson-inference ===

Verified environment:

* JetPack 4.5.1 + Xavier

$ cd /opt/nvidia/deepstream/deepstream-5.1/sources/objectDetector_SSD/
$ sudo wget https://raw.githubusercontent.com/AastaNV/eLinux_data/main/deepstream/ssd-jetson_inference/ssd-jetson_inference.patch
$ sudo git apply ssd-jetson_inference.patch
$ sudo CUDA_VER=10.2 make -C nvdsinfer_custom_impl_ssd/

Update config_infer_primary_ssd.txt for your model, for example:

<syntaxhighlight lang="diff">
diff --git a/config_infer_primary_ssd.txt b/config_infer_primary_ssd.txt
index e5bf468..81c52fd 100644
--- a/config_infer_primary_ssd.txt
+++ b/config_infer_primary_ssd.txt
@@ -62,15 +62,13 @@ gpu-id=0
 net-scale-factor=0.0078431372
 offsets=127.5;127.5;127.5
 model-color-format=0
-model-engine-file=sample_ssd_relu6.uff_b1_gpu0_fp32.engine
-labelfile-path=ssd_coco_labels.txt
-uff-file=sample_ssd_relu6.uff
+model-engine-file=ssd-mobilenet.uff_b1_gpu0_fp16.engine
+uff-file=ssd.uff
 infer-dims=3;300;300
 uff-input-order=0
 uff-input-blob-name=Input
-batch-size=1
-## 0=FP32, 1=INT8, 2=FP16 mode
-network-mode=0
+labelfile-path=labels.txt
+network-mode=2
 num-detected-classes=91
 interval=0
 gie-unique-id=1
</syntaxhighlight>

$ deepstream-app -c deepstream_app_config_ssd.txt


== VPI ==

=== VPI with Jetson-utils ===

Below is an example of using VPI together with ''jetson-utils''.

Verified environment:

* JetPack 4.6 + Xavier NX

<syntaxhighlight lang="python">
import numpy as np
import jetson.utils
import vpi


display = jetson.utils.glDisplay()

camera = jetson.utils.gstCamera(1920, 1280, '0')
camera.Open()

while display.IsOpen():
    frame, width, height = camera.CaptureRGBA(zeroCopy=1)
    input = vpi.asimage(np.uint8(jetson.utils.cudaToNumpy(frame)))
    with vpi.Backend.CUDA:
        output = input.convert(vpi.Format.U8) 
        output = output.box_filter(11, border=vpi.Border.ZERO).convert(vpi.Format.RGB8)
        vpi.clear_cache()

    display.RenderOnce(jetson.utils.cudaFromNumpy(output.cpu()), width, height)
    display.SetTitle("{:s} | {:d}x{:d} | {:.1f} FPS".format("Camera Viewer", width, height, display.GetFPS()))

camera.Close()
</syntaxhighlight>
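
The same VPI pipeline can also be tried on a single image file without a camera. A minimal sketch, assuming Pillow is installed and using placeholder file names input.png / output.png:

<syntaxhighlight lang="python">
# Sketch only: the box-filter pipeline from the example above on one image file.
import numpy as np
import vpi
from PIL import Image

frame = np.asarray(Image.open('input.png').convert('RGB'))
with vpi.Backend.CUDA:
    out = vpi.asimage(frame).convert(vpi.Format.U8)
    out = out.box_filter(11, border=vpi.Border.ZERO).convert(vpi.Format.RGB8)

Image.fromarray(np.asarray(out.cpu())).save('output.png')
</syntaxhighlight>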


=== VPI with Deepstream ===

Please see the following link for an example:

https://forums.developer.nvidia.com/t/deepstream-sdk-vpi-on-jetson-tx2/166834/20