Jetson/L4T/TRT Customized Example

This page collects information on deploying customized models with TensorRT, along with answers to some common questions for Jetson.

TensorRT Python

OpenCV with ONNX model

Below is an example that builds a TensorRT engine from an ONNX model and runs inference on OpenCV images.

Verified environment:

  • JetPack5.1 + Orin
import cv2
import time
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
TRT_LOGGER = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(TRT_LOGGER)

batch = 1
host_inputs  = []
cuda_inputs  = []
host_outputs = []
cuda_outputs = []
bindings = []


def Inference(engine):
    image = cv2.imread("/usr/src/tensorrt/data/resnet50/airliner.ppm")
    image = (2.0 / 255.0) * image.transpose((2, 0, 1)) - 1.0

    np.copyto(host_inputs[0], image.ravel())
    stream = cuda.Stream()
    context = engine.create_execution_context()

    start_time = time.time()
    cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
    context.execute_v2(bindings)
    cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
    stream.synchronize()
    print("execute times "+str(time.time()-start_time))

    output = host_outputs[0]
    print(np.argmax(output))


def PrepareEngine():
    with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
        config = builder.create_builder_config()
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
        with open('/usr/src/tensorrt/data/resnet50/ResNet50.onnx', 'rb') as model:
            if not parser.parse(model.read()):
                print ('ERROR: Failed to parse the ONNX file.')
                for error in range(parser.num_errors):
                    print (parser.get_error(error))

        serialized_engine = builder.build_serialized_network(network, config)
        #with open("sample.engine", "wb") as f:
        #    f.write(serialized_engine)
        #with open('sample.engine', 'rb') as f:
        #    serialized_engine = f.read()
        engine = runtime.deserialize_cuda_engine(serialized_engine)

        # create buffer
        for binding in engine:
            size = trt.volume(engine.get_tensor_shape(binding)) * batch
            host_mem = cuda.pagelocked_empty(shape=[size],dtype=np.float32)
            cuda_mem = cuda.mem_alloc(host_mem.nbytes)

            bindings.append(int(cuda_mem))
            if engine.get_tensor_mode(binding)==trt.TensorIOMode.INPUT:
                host_inputs.append(host_mem)
                cuda_inputs.append(cuda_mem)
            else:
                host_outputs.append(host_mem)
                cuda_outputs.append(cuda_mem)

        return engine


if __name__ == "__main__":
    engine = PrepareEngine()
    Inference(engine)

    engine = []
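
The Inference() above enqueues the copies asynchronously but launches the network with the synchronous execute_v2(). A fully stream-ordered variant can use execute_async_v2() instead (available in the TensorRT 8.x Python API); the sketch below reuses the same global buffers and bindings and is otherwise equivalent:

def InferenceAsync(engine):
    image = cv2.imread("/usr/src/tensorrt/data/resnet50/airliner.ppm")
    image = (2.0 / 255.0) * image.transpose((2, 0, 1)) - 1.0

    np.copyto(host_inputs[0], image.ravel())
    stream = cuda.Stream()
    context = engine.create_execution_context()

    # enqueue H2D copy, inference, and D2H copy on one stream, then wait once
    cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
    context.execute_async_v2(bindings=bindings, stream_handle=stream.handle)
    cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
    stream.synchronize()

    print(np.argmax(host_outputs[0]))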


OpenCV with PLAN model

Below is an example that loads a serialized TensorRT engine (PLAN) file and runs inference on OpenCV images.

Verified environment:

  • JetPack5.1 + Orin
$ /usr/src/tensorrt/bin/trtexec --onnx=/usr/src/tensorrt/data/resnet50/ResNet50.onnx --saveEngine=sample.engine
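
trtexec accepts additional builder options on the same command line; for example, the standard --fp16 flag builds a reduced-precision engine that is usually faster on Jetson (accuracy should be re-verified for your model):

$ /usr/src/tensorrt/bin/trtexec --onnx=/usr/src/tensorrt/data/resnet50/ResNet50.onnx --saveEngine=sample.engine --fp16
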
import cv2
import time
import numpy as np
import tensorrt as trt
import pycuda.autoinit
import pycuda.driver as cuda

EXPLICIT_BATCH = 1 << (int)(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
TRT_LOGGER = trt.Logger(trt.Logger.INFO)

batch = 1
host_inputs  = []
cuda_inputs  = []
host_outputs = []
cuda_outputs = []
bindings = []


def Inference(engine):
    image = cv2.imread("/usr/src/tensorrt/data/resnet50/airliner.ppm")
    image = (2.0 / 255.0) * image.transpose((2, 0, 1)) - 1.0

    np.copyto(host_inputs[0], image.ravel())
    stream = cuda.Stream()
    context = engine.create_execution_context()

    start_time = time.time()
    cuda.memcpy_htod_async(cuda_inputs[0], host_inputs[0], stream)
    context.execute_v2(bindings)
    cuda.memcpy_dtoh_async(host_outputs[0], cuda_outputs[0], stream)
    stream.synchronize()
    print("execute times "+str(time.time()-start_time))

    output = host_outputs[0]
    print(np.argmax(output))


def PrepareEngine():
    with open('sample.engine', 'rb') as f:
        serialized_engine = f.read()

    runtime = trt.Runtime(TRT_LOGGER)
    engine = runtime.deserialize_cuda_engine(serialized_engine)

    # create buffer
    for binding in engine:
        size = trt.volume(engine.get_tensor_shape(binding)) * batch
        host_mem = cuda.pagelocked_empty(shape=[size],dtype=np.float32)
        cuda_mem = cuda.mem_alloc(host_mem.nbytes)

        bindings.append(int(cuda_mem))
        if engine.get_tensor_mode(binding)==trt.TensorIOMode.INPUT:
            host_inputs.append(host_mem)
            cuda_inputs.append(cuda_mem)
        else:
            host_outputs.append(host_mem)
            cuda_outputs.append(cuda_mem)

    return engine


if __name__ == "__main__":
    engine = PrepareEngine()
    Inference(engine)

    engine = []
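
The buffer-allocation loop above assumes every binding is float32 and batch 1. When working with an unfamiliar engine file, it can help to list the I/O tensors first. A small helper such as the following (a sketch using the name-based tensor API available since TensorRT 8.5) prints each tensor's mode, shape, and data type:

def PrintIOTensors(engine):
    for i in range(engine.num_io_tensors):
        name = engine.get_tensor_name(i)
        print(name,
              engine.get_tensor_mode(name),
              engine.get_tensor_shape(name),
              engine.get_tensor_dtype(name))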


Multi-threading

Below is an example of running TensorRT inference from a Python thread.

Verified environment:

  • JetPack4.5.1 + Xavier
$ /usr/src/tensorrt/bin/trtexec --onnx=/usr/src/tensorrt/data/mnist/mnist.onnx --saveEngine=mnist.trt
$ cd /usr/src/tensorrt/data/mnist/
$ sudo pip3 install pillow
$ python3 download_pgms.py
$ wget https://raw.githubusercontent.com/AastaNV/JEP/master/elinux/my_tensorrt_code.py -O my_tensorrt_code.py
import threading
import time
from my_tensorrt_code import TRTInference, trt

exitFlag = 0

class myThread(threading.Thread):
   def __init__(self, func, args):
      threading.Thread.__init__(self)
      self.func = func
      self.args = args
   def run(self):
      print ("Starting " + self.args[0])
      self.func(*self.args)
      print ("Exiting " + self.args[0])

if __name__ == '__main__':
    # Create new threads
    '''
    Thread arguments:
        - func: the function that the thread should run
        - args: the arguments passed to func
    '''

    trt_engine_path = 'mnist.trt'

    max_batch_size = 1
    trt_inference_wrapper = TRTInference(trt_engine_path,
        trt_engine_datatype=trt.DataType.FLOAT,
        batch_size=max_batch_size)

    # Get TensorRT SSD model output
    input_img_path = '/usr/src/tensorrt/data/mnist/3.pgm'

    thread1 = myThread(trt_inference_wrapper.infer, [input_img_path])

    # Start new Threads
    thread1.start()
    thread1.join()
    trt_inference_wrapper.destory();
    print ("Exiting Main Thread")


Deepstream

YoloV4 Tiny

Verified environment:

  • JetPack4.5.1 + Xavier

DeepStream can reach 60 fps with 4 video streams on Xavier:

$ cd /opt/nvidia/deepstream/deepstream-5.1/sources/objectDetector_Yolo
$ wget https://raw.githubusercontent.com/AastaNV/eLinux_data/main/deepstream/yolov4-tiny/yolov4_tiny.patch
$ git apply yolov4_tiny.patch
$ export CUDA_VER=10.2
$ make -C nvdsinfer_custom_impl_Yolo
$ wget https://raw.githubusercontent.com/AlexeyAB/darknet/master/cfg/yolov4-tiny.cfg -q --show-progress
$ wget https://github.com/AlexeyAB/darknet/releases/download/darknet_yolo_v4_pre/yolov4-tiny.weights -q --show-progress
$ wget https://raw.githubusercontent.com/AastaNV/eLinux_data/main/deepstream/yolov4-tiny/deepstream_app_config_yoloV4_tiny.txt
$ wget https://raw.githubusercontent.com/AastaNV/eLinux_data/main/deepstream/yolov4-tiny/config_infer_primary_yoloV4_tiny.txt
$ deepstream-app -c deepstream_app_config_yoloV4_tiny.txt


Custom Parser for SSD-MobileNet Trained by Jetson-inference

Verified environment:

  • JetPack4.5.1 + Xavier
$ cd /opt/nvidia/deepstream/deepstream-5.1/sources/objectDetector_SSD/
$ sudo wget https://raw.githubusercontent.com/AastaNV/eLinux_data/main/deepstream/ssd-jetson_inference/ssd-jetson_inference.patch
$ sudo git apply ssd-jetson_inference.patch
$ sudo CUDA_VER=10.2 make -C nvdsinfer_custom_impl_ssd/

Update config_infer_primary_ssd.txt, for example:

diff --git a/config_infer_primary_ssd.txt b/config_infer_primary_ssd.txt
index e5bf468..81c52fd 100644
--- a/config_infer_primary_ssd.txt
+++ b/config_infer_primary_ssd.txt
@@ -62,15 +62,13 @@ gpu-id=0
 net-scale-factor=0.0078431372
 offsets=127.5;127.5;127.5
 model-color-format=0
-model-engine-file=sample_ssd_relu6.uff_b1_gpu0_fp32.engine
-labelfile-path=ssd_coco_labels.txt
-uff-file=sample_ssd_relu6.uff
+model-engine-file=ssd-mobilenet.uff_b1_gpu0_fp16.engine
+uff-file=ssd.uff
 infer-dims=3;300;300
 uff-input-order=0
 uff-input-blob-name=Input
-batch-size=1
-## 0=FP32, 1=INT8, 2=FP16 mode
-network-mode=0
+labelfile-path=labels.txt
+network-mode=2
 num-detected-classes=91
 interval=0
 gie-unique-id=1
$ deepstream-app -c deepstream_app_config_ssd.txt


Detectron2

Please find the following link for an example from the forum community:

https://forums.developer.nvidia.com/t/passing-gstbuffer-to-tensorrt-for-inferencing/267084/18


VPI

VPI with Jetson-utils

Below is an example of using VPI together with jetson-utils.

Verified environment:

  • JetPack4.6 + XavierNX
import numpy as np
import jetson.utils
import vpi


display = jetson.utils.glDisplay()

camera = jetson.utils.gstCamera(1920, 1280, '0')
camera.Open()

while display.IsOpen():
    frame, width, height = camera.CaptureRGBA(zeroCopy=1)
    input = vpi.asimage(np.uint8(jetson.utils.cudaToNumpy(frame)))
    with vpi.Backend.CUDA:
        output = input.convert(vpi.Format.U8) 
        output = output.box_filter(11, border=vpi.Border.ZERO).convert(vpi.Format.RGB8)
        vpi.clear_cache()

    display.RenderOnce(jetson.utils.cudaFromNumpy(output.cpu()), width, height)
    display.SetTitle("{:s} | {:d}x{:d} | {:.1f} FPS".format("Camera Viewer", width, height, display.GetFPS()))

camera.Close()
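
If no camera is attached, the same VPI calls can be exercised on a still image. The sketch below mirrors the processing chain of the loop above; the file names are arbitrary:

import cv2
import numpy as np
import vpi

frame = cv2.imread("input.jpg")                       # any test image
img = vpi.asimage(np.asarray(frame, dtype=np.uint8))
with vpi.Backend.CUDA:
    # grayscale conversion followed by an 11x11 box filter, as in the camera loop
    out = img.convert(vpi.Format.U8).box_filter(11, border=vpi.Border.ZERO)

cv2.imwrite("output.png", out.cpu())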


VPI with Deepstream

Please find the following link for the example:

https://forums.developer.nvidia.com/t/deepstream-sdk-vpi-on-jetson-tx2/166834/20

VPI with Argus Camera - cudaBayerDemosaic

Please find the following link for the example:

https://forums.developer.nvidia.com/t/how-do-i-get-image-from-cudabayerdemosaic-and-connect-to-vpi/213529/18

VPI with Argus Camera - nvarguscamerasrc

Below is a sample that calls VPI Temporal Noise Reduction (TNR) inside the nvarguscamerasrc element.

Verified environment:

  • JetPack4.6.4 + VPI1.2 + Jetson Nano

1. Download the nvarguscamerasrc source code from https://developer.nvidia.com/embedded/linux-tegra-r3274

2. Apply patch

diff --git a/Makefile b/Makefile
index 4f45209..b4ad5fb 100644
--- a/Makefile
+++ b/Makefile
@@ -31,7 +31,7 @@ CC := g++
 GST_INSTALL_DIR?=/usr/lib/aarch64-linux-gnu/gstreamer-1.0/
 LIB_INSTALL_DIR?=/usr/lib/aarch64-linux-gnu/tegra/
 CFLAGS:=
-LIBS:= -lnvbuf_utils -lnvdsbufferpool -lnvargus_socketclient -lpthread
+LIBS:= -lnvbuf_utils -lnvdsbufferpool -lnvargus_socketclient -lpthread -lnvvpi
 
 SRCS := $(wildcard *.cpp)
 
diff --git a/gstnvarguscamerasrc.cpp b/gstnvarguscamerasrc.cpp
index fd6cb4c..25a5dcb 100644
--- a/gstnvarguscamerasrc.cpp
+++ b/gstnvarguscamerasrc.cpp
@@ -169,6 +169,31 @@ bool ThreadArgus::initialize(GstNvArgusCameraSrc *src)
       return false;
   }
 
+  /* VPI */
+  //src->backend = VPI_BACKEND_CUDA;
+  src->backend = VPI_BACKEND_VIC;
+
+  int width = src->width, height = src->height;
+  VPIImageFormat imgFormat = VPI_IMAGE_FORMAT_NV12_ER;
+
+  CHECK_STATUS(vpiStreamCreate(VPI_BACKEND_VIC|src->backend, &src->stream));
+
+  int memFlags = src->backend | VPI_BACKEND_VIC | VPI_EXCLUSIVE_STREAM_ACCESS;
+  CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlags, &src->imgPrev));
+  CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlags, &src->imgCurr));
+  CHECK_STATUS(vpiImageCreate(width, height, imgFormat, memFlags, &src->imgOut));
+
+  CHECK_STATUS(vpiCreateTemporalNoiseReduction(src->backend, width, height, imgFormat, VPI_TNR_DEFAULT, &src->tnr));
+  CHECK_STATUS(vpiInitTemporalNoiseReductionParams(&src->params));
+  src->params.preset   = VPI_TNR_PRESET_INDOOR_LOW_LIGHT;
+  src->params.strength = 1.0f;
+
+  std::cout<<"[VpiLog] Input size: "<<width<<" x "<<height << '\n'
+           <<"[VpiLog] Image format: "<<vpiImageFormatGetName(imgFormat) << '\n'
+           <<"[VpiLog] Algorithm: Temporal Noise Reduction V"<<((src->backend==VPI_BACKEND_CUDA)?"3":"2")<<std::endl;
+
+  CHECK_STATUS(vpiEventCreate(0, &src->evStart));
+  CHECK_STATUS(vpiEventCreate(0, &src->evStop));
   return true;
 }
 
@@ -182,9 +207,18 @@ bool ThreadArgus::shutdown()
     m_threadID = 0;
     m_doShutdown = false;
     m_threadState = THREAD_INACTIVE;
-  }
 
- return true;
+    // Destroy all VPI resources
+    vpiStreamDestroy(src->stream);
+    vpiPayloadDestroy(src->tnr);
+    vpiImageDestroy(src->wrapNvBuff);
+    vpiImageDestroy(src->imgPrev);
+    vpiImageDestroy(src->imgCurr);
+    vpiImageDestroy(src->imgOut);
+    vpiEventDestroy(src->evStart);
+    vpiEventDestroy(src->evStop);
+  }
+  return true;
 }
 
 bool ThreadArgus::waitRunning(useconds_t timeoutUs)
@@ -668,6 +702,29 @@ bool StreamConsumer::threadExecute(GstNvArgusCameraSrc *src)
       src->frameInfo->frameNum = iFrame->getNumber();
       src->frameInfo->frameTime = iFrame->getTime();
 
+      /********************************************************/
+      // VPI TNR
+      /********************************************************/
+      CHECK_STATUS(vpiEventRecord(src->evStart, src->stream));
+      if( src->wrapNvBuff==NULL ) {
+        CHECK_STATUS(vpiImageCreateNvBufferWrapper(src->frameInfo->fd, NULL, \
+              src->backend|VPI_BACKEND_VIC|VPI_EXCLUSIVE_STREAM_ACCESS, &src->wrapNvBuff));
+      }
+
+      CHECK_STATUS(vpiSubmitConvertImageFormat(src->stream, VPI_BACKEND_VIC, src->wrapNvBuff, src->imgCurr, NULL));
+      CHECK_STATUS(vpiSubmitTemporalNoiseReduction(src->stream, src->backend, src->tnr, \
+              src->frameInfo->frameNum==1 ? NULL:src->imgPrev, src->imgCurr, src->imgOut, &src->params));
+      CHECK_STATUS(vpiSubmitConvertImageFormat(src->stream, VPI_BACKEND_VIC, src->imgOut, src->wrapNvBuff, NULL));
+
+      CHECK_STATUS(vpiEventRecord(src->evStop, src->stream));
+      CHECK_STATUS(vpiEventSync(src->evStop));
+      std::swap(src->imgPrev, src->imgCurr);
+
+      float elapsedMS;
+      CHECK_STATUS(vpiEventElapsedTimeMillis(src->evStart, src->evStop, &elapsedMS));
+      std::cout<<"[VpiLog] Elapsed time per call: "<<elapsedMS<<" ms"<<std::endl;
+      /********************************************************/
+
       g_mutex_lock (&src->argus_buffers_queue_lock);
       g_queue_push_tail (src->argus_buffers, (src->frameInfo));
       g_cond_signal (&src->argus_buffers_queue_cond);
diff --git a/gstnvarguscamerasrc.hpp b/gstnvarguscamerasrc.hpp
index 0630f98..0333f62 100644
--- a/gstnvarguscamerasrc.hpp
+++ b/gstnvarguscamerasrc.hpp
@@ -32,11 +32,18 @@
 #include <gst/gst.h>
 #include <condition_variable>
 #include <chrono>
+#include <sstream>
 
 #include "nvbufsurface.h"
 #include "nvbuf_utils.h"
 #include "gstnvarguscamera_utils.h"
 #include "gstnvdsbufferpool.h"
+#include <vpi/Event.h>
+#include <vpi/Image.h>
+#include <vpi/Stream.h>
+#include <vpi/NvBufferInterop.h>
+#include <vpi/algo/TemporalNoiseReduction.h>
+#include <vpi/algo/ConvertImageFormat.h>
 
 G_BEGIN_DECLS
 
@@ -69,6 +76,20 @@ G_BEGIN_DECLS
 #define NVARGUSCAM_DEFAULT_AE_LOCK                   FALSE
 #define NVARGUSCAM_DEFAULT_AWB_LOCK                  FALSE
 
+#define CHECK_STATUS(STMT)                                    \
+    do                                                        \
+    {                                                         \
+        VPIStatus status = (STMT);                            \
+        if (status != VPI_SUCCESS)                            \
+        {                                                     \
+            char buffer[VPI_MAX_STATUS_MESSAGE_LENGTH];       \
+            vpiGetLastStatusMessage(buffer, sizeof(buffer));  \
+            std::ostringstream ss;                            \
+            ss << vpiStatusGetName(status) << ": " << buffer; \
+            throw std::runtime_error(ss.str());               \
+        }                                                     \
+    } while (0);
+
 typedef struct _GstNvArgusCameraSrc      GstNvArgusCameraSrc;
 typedef struct _GstNvArgusCameraSrcClass GstNvArgusCameraSrcClass;
 
@@ -205,6 +226,20 @@ struct _GstNvArgusCameraSrc
   Argus::UniqueObj<Argus::OutputStreamSettings> streamSettings;
   Argus::UniqueObj<Argus::Request> request;
   NvArgusFrameInfo *frameInfo;
+
+  //VPI TNR Controls
+  VPIBackend backend;
+  VPIStream stream = NULL;
+
+  VPIImage wrapNvBuff = NULL;
+  VPIImage imgPrev = NULL;
+  VPIImage imgCurr = NULL;
+  VPIImage imgOut  = NULL;
+  VPITNRParams params;
+  VPIPayload tnr = NULL;
+
+  VPIEvent evStart = NULL;
+  VPIEvent evStop  = NULL;
 };
 
 struct _GstNvArgusCameraSrcClass

3. Build

$ make
$ sudo make install

4. Test

$ gst-launch-1.0 nvarguscamerasrc ! 'video/x-raw(memory:NVMM), width=(int)1920, height=(int)1080, format=(string)NV12, framerate=(fraction)30/1' ! nvv4l2h264enc preset-level=1 control-rate=1 bitrate=40000000 ! h264parse ! matroskamux ! filesink location=out.mp4

VPI with nvivafilter

Please find the following link for the example:

https://forums.developer.nvidia.com/t/using-vpi-in-gstreamer/223334/21

VPI and NPP on the Color Conversion

When converting NV12 to RGBA, the underlying algorithms differ:

  • VPI/NvBufSurfTransform: automatically applies gamma correction
    NvBufSurface* dstSurface = createNVMMSurface(..., NvBufSurfaceColorFormat::NVBUF_COLOR_FORMAT_BGRA);
    NvBufSurfTransform(srcSurface, dstSurface, &transformParams);
  • NPP: only does the format conversion; gamma correction needs to be applied manually (see the sketch after this list)
    auto res = nppiNV12ToBGR_709CSC_8u_P2C3R(...);
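
To make the NPP output comparable with the VPI/NvBufSurfTransform path, the gamma step therefore has to be added after the conversion. Expressed in NumPy purely for illustration (the 1/2.2 exponent is an approximation of the display gamma, not the exact Rec.709 transfer function):

import numpy as np

def apply_gamma(bgr_u8, gamma=2.2):
    # normalize to [0,1], apply the forward gamma curve, and return to 8-bit
    x = bgr_u8.astype(np.float32) / 255.0
    return np.clip(255.0 * np.power(x, 1.0 / gamma), 0, 255).astype(np.uint8)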

Please find the following link for more details:

https://forums.developer.nvidia.com/t/vpi-nvbufsurftransform-and-npp-color-conversion-differences/280135


Stress Test for Orin

This section describes testing tools that can push the Jetson AGX Orin to a full workload.

The expected power consumption with these steps is listed in the table below:

                Jetson AGX Orin 64GB    Jetson AGX Orin 32GB
Maximum Power   60W                     40W

Maximize the Device Performance

$ sudo nvpmodel -m 0
$ sudo jetson_clocks
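
The active power mode can be queried back to confirm the change:

$ sudo nvpmodel -q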

CPU Stress Test

Use the Linux stress tool:

$ sudo apt-get install stress
$ stress --cpu $(nproc)
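
While the stress run is active, power and utilization can be monitored from a second terminal with tegrastats (the 1000 ms sampling interval below is only an example value):

$ sudo tegrastats --interval 1000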

GPU Stress Test

Verified environment:

  • JetPack6DP + Orin

Run the cuBLAS sample with the half data type:

1. Find the matrixMulCUBLAS sample in the cuda-samples v12.2 repository: https://github.com/NVIDIA/cuda-samples/tree/v12.2

2. Apply the following changes to switch the data type from float to half:

diff --git a/Samples/4_CUDA_Libraries/matrixMulCUBLAS/Makefile b/Samples/4_CUDA_Libraries/matrixMulCUBLAS/Makefile
index c781defa..5abf3303 100644
--- a/Samples/4_CUDA_Libraries/matrixMulCUBLAS/Makefile
+++ b/Samples/4_CUDA_Libraries/matrixMulCUBLAS/Makefile
@@ -331,8 +331,6 @@ matrixMulCUBLAS.o:matrixMulCUBLAS.cpp
 
 matrixMulCUBLAS: matrixMulCUBLAS.o
 	$(EXEC) $(NVCC) $(ALL_LDFLAGS) $(GENCODE_FLAGS) -o $@ $+ $(LIBRARIES)
-	$(EXEC) mkdir -p ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
-	$(EXEC) cp $@ ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)
 
 run: build
 	$(EXEC) ./matrixMulCUBLAS
@@ -341,6 +339,5 @@ testrun: build
 
 clean:
 	rm -f matrixMulCUBLAS matrixMulCUBLAS.o
-	rm -rf ../../../bin/$(TARGET_ARCH)/$(TARGET_OS)/$(BUILD_TYPE)/matrixMulCUBLAS
 
 clobber: clean
diff --git a/Samples/4_CUDA_Libraries/matrixMulCUBLAS/matrixMulCUBLAS.cpp b/Samples/4_CUDA_Libraries/matrixMulCUBLAS/matrixMulCUBLAS.cpp
index 0cd33127..fbe4ac30 100644
--- a/Samples/4_CUDA_Libraries/matrixMulCUBLAS/matrixMulCUBLAS.cpp
+++ b/Samples/4_CUDA_Libraries/matrixMulCUBLAS/matrixMulCUBLAS.cpp
@@ -87,6 +87,52 @@ typedef struct _matrixSize {
   unsigned int uiWA, uiHA, uiWB, uiHB, uiWC, uiHC;
 } sMatrixSize;
 
+// Modified from helper_image.h to support half precision arguments
+inline bool sdkCompareL2fe(const __half *reference, const __half *data,
+                           const unsigned int len, const float epsilon) {
+  assert(epsilon >= 0);
+
+  float error = 0;
+  float ref = 0;
+
+  for (unsigned int i = 0; i < len; ++i) {
+    float diff = (float)reference[i] - (float)data[i];
+    error += diff * diff;
+    ref += (float)reference[i] * (float)reference[i];
+  }
+
+  float normRef = sqrtf(ref);
+
+  if (fabs(ref) < 1e-7) {
+#ifdef _DEBUG
+    std::cerr << "ERROR, reference l2-norm is 0\n";
+#endif
+    return false;
+  }
+
+  float normError = sqrtf(error);
+  error = normError / normRef;
+  bool result = error < epsilon;
+#ifdef _DEBUG
+
+  if (!result) {
+    std::cerr << "ERROR, l2-norm error " << error << " is greater than epsilon "
+              << epsilon << "\n";
+  }
+
+#endif
+
+  return result;
+}
+
+inline int idx_n(int row, int col, int height, int width) {
+    return row * width + col;
+}
+
+inline int idx_t(int row, int col, int height, int width) {
+    return col * height + row;
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 //! Compute reference data set matrix multiply on CPU
 //! C = A * B
@@ -96,29 +142,30 @@ typedef struct _matrixSize {
 //! @param hA         height of matrix A
 //! @param wB         width of matrix B
 ////////////////////////////////////////////////////////////////////////////////
-void matrixMulCPU(float *C, const float *A, const float *B, unsigned int hA,
-                  unsigned int wA, unsigned int wB) {
+void matrixMulCPU(__half *C, const __half *A, const __half *B, unsigned int hA, unsigned int wA, unsigned int wB, const bool ta, const bool tb) {
+  auto idx_a = (ta) ? idx_t : idx_n;
+  auto idx_b = (tb) ? idx_t : idx_n;
   for (unsigned int i = 0; i < hA; ++i)
     for (unsigned int j = 0; j < wB; ++j) {
-      double sum = 0;
+      float sum = 0;
 
       for (unsigned int k = 0; k < wA; ++k) {
-        double a = A[i * wA + k];
-        double b = B[k * wB + j];
-        sum += a * b;
+        __half a = A[i * wA + k];
+        __half b = B[k * wB + j];
+        sum += float(a * b);
       }
 
-      C[i * wB + j] = (float)sum;
+      C[i * wB + j] = sum;
     }
 }
 
-// Allocates a matrix with random float entries.
-void randomInit(float *data, int size) {
-  for (int i = 0; i < size; ++i) data[i] = rand() / (float)RAND_MAX;
+// Allocates a matrix with random __half entries.
+void randomInit(__half *data, int size) {
+  for (int i = 0; i < size; ++i) data[i] = (__half)(rand() / (float)(RAND_MAX));
 }
 
-void printDiff(float *data1, float *data2, int width, int height,
-               int iListLength, float fListTol) {
+void printDiff(__half *data1, __half *data2, int width, int height,
+		int iListLength, float fListTol) {
   printf("Listing first %d Differences > %.6f...\n", iListLength, fListTol);
   int i, j, k;
   int error_count = 0;
@@ -130,12 +177,12 @@ void printDiff(float *data1, float *data2, int width, int height,
 
     for (i = 0; i < width; i++) {
       k = j * width + i;
-      float fDiff = fabs(data1[k] - data2[k]);
+      float fDiff = fabs((float)data1[k] - (float)data2[k]);
 
       if (fDiff > fListTol) {
         if (error_count < iListLength) {
           printf("    Loc(%d,%d)\tCPU=%.5f\tGPU=%.5f\tDiff=%.6f\n", i, j,
-                 data1[k], data2[k], fDiff);
+                 (float)data1[k], (float)data2[k], fDiff);
         }
 
         error_count++;
@@ -160,7 +207,7 @@ void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple,
         getCmdLineArgumentInt(argc, (const char **)argv, "sizemult");
   }
 
-  iSizeMultiple = min(iSizeMultiple, 10);
+  iSizeMultiple = min(iSizeMultiple, 8);
   iSizeMultiple = max(iSizeMultiple, 1);
 
   cudaDeviceProp deviceProp;
@@ -178,13 +225,13 @@ void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple,
 
   int block_size = 32;
 
-  matrix_size.uiWA = 3 * block_size * iSizeMultiple;
-  matrix_size.uiHA = 4 * block_size * iSizeMultiple;
-  matrix_size.uiWB = 2 * block_size * iSizeMultiple;
-  matrix_size.uiHB = 3 * block_size * iSizeMultiple;
-  matrix_size.uiWC = 2 * block_size * iSizeMultiple;
-  matrix_size.uiHC = 4 * block_size * iSizeMultiple;
-
+  matrix_size.uiWA = block_size * pow(2, iSizeMultiple);
+  matrix_size.uiHA = block_size * pow(2, iSizeMultiple);
+  matrix_size.uiWB = block_size * pow(2, iSizeMultiple);
+  matrix_size.uiHB = block_size * pow(2, iSizeMultiple);
+  matrix_size.uiWC = block_size * pow(2, iSizeMultiple);
+  matrix_size.uiHC = block_size * pow(2, iSizeMultiple);
+			  //
   printf("MatrixA(%u,%u), MatrixB(%u,%u), MatrixC(%u,%u)\n", matrix_size.uiHA,
          matrix_size.uiWA, matrix_size.uiHB, matrix_size.uiWB, matrix_size.uiHC,
          matrix_size.uiWC);
@@ -200,7 +247,7 @@ void initializeCUDA(int argc, char **argv, int &devID, int &iSizeMultiple,
 ////////////////////////////////////////////////////////////////////////////////
 //! Run a simple test matrix multiply using CUBLAS
 ////////////////////////////////////////////////////////////////////////////////
-int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) {
+int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size, bool ta, bool tb) {
   cudaDeviceProp deviceProp;
 
   checkCudaErrors(cudaGetDeviceProperties(&deviceProp, devID));
@@ -212,11 +259,11 @@ int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) {
 
   // allocate host memory for matrices A and B
   unsigned int size_A = matrix_size.uiWA * matrix_size.uiHA;
-  unsigned int mem_size_A = sizeof(float) * size_A;
-  float *h_A = (float *)malloc(mem_size_A);
+  unsigned int mem_size_A = sizeof(__half) * size_A;
+  __half *h_A = (__half *)malloc(mem_size_A);
   unsigned int size_B = matrix_size.uiWB * matrix_size.uiHB;
-  unsigned int mem_size_B = sizeof(float) * size_B;
-  float *h_B = (float *)malloc(mem_size_B);
+  unsigned int mem_size_B = sizeof(__half) * size_B;
+  __half *h_B = (__half *)malloc(mem_size_B);
 
   // set seed for rand()
   srand(2006);
@@ -226,13 +273,13 @@ int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) {
   randomInit(h_B, size_B);
 
   // allocate device memory
-  float *d_A, *d_B, *d_C;
+  __half *d_A, *d_B, *d_C;
   unsigned int size_C = matrix_size.uiWC * matrix_size.uiHC;
-  unsigned int mem_size_C = sizeof(float) * size_C;
+  unsigned int mem_size_C = sizeof(__half) * size_C;
 
   // allocate host memory for the result
-  float *h_C = (float *)malloc(mem_size_C);
-  float *h_CUBLAS = (float *)malloc(mem_size_C);
+  __half *h_C      = (__half *) malloc(mem_size_C);
+  __half *h_CUBLAS = (__half *) malloc(mem_size_C);
 
   checkCudaErrors(cudaMalloc((void **)&d_A, mem_size_A));
   checkCudaErrors(cudaMalloc((void **)&d_B, mem_size_B));
@@ -248,22 +295,30 @@ int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) {
   printf("Computing result using CUBLAS...");
 
   // execute the kernel
-  int nIter = 30;
+  int nIter = INT_MAX;
 
   // CUBLAS version 2.0
   {
-    const float alpha = 1.0f;
-    const float beta = 0.0f;
+    cublasOperation_t trans_A = (ta) ? CUBLAS_OP_T : CUBLAS_OP_N;
+    cublasOperation_t trans_B = (tb) ? CUBLAS_OP_T : CUBLAS_OP_N;
+    int m = matrix_size.uiWC;
+    int n = matrix_size.uiHC;
+    int k = matrix_size.uiWA;
+    int lda = (trans_A == CUBLAS_OP_N) ? k : n;
+    int ldb = (trans_B == CUBLAS_OP_N) ? m : k;
+    int ldc = m;
+    const __half alpha = 1.0f;
+    const __half beta  = 0.0f;
     cublasHandle_t handle;
     cudaEvent_t start, stop;
 
     checkCudaErrors(cublasCreate(&handle));
 
     // Perform warmup operation with cublas
-    checkCudaErrors(cublasSgemm(
-        handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA,
-        matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A, matrix_size.uiWA,
-        &beta, d_C, matrix_size.uiWB));
+    checkCudaErrors(cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH));
+    cublasGemmAlgo_t algo = cublasGemmAlgo_t::CUBLAS_GEMM_ALGO4_TENSOR_OP;
+    checkCudaErrors(cublasGemmEx(handle, trans_B, trans_A, m, n, k, &alpha, d_B, CUDA_R_16F,
+        ldb, d_A, CUDA_R_16F, lda, &beta, d_C, CUDA_R_16F, ldc, CUBLAS_COMPUTE_16F, algo));
 
     // Allocate CUDA events that we'll use for timing
     checkCudaErrors(cudaEventCreate(&start));
@@ -275,10 +330,8 @@ int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) {
     for (int j = 0; j < nIter; j++) {
       // note cublas is column primary!
       // need to transpose the order
-      checkCudaErrors(cublasSgemm(
-          handle, CUBLAS_OP_N, CUBLAS_OP_N, matrix_size.uiWB, matrix_size.uiHA,
-          matrix_size.uiWA, &alpha, d_B, matrix_size.uiWB, d_A,
-          matrix_size.uiWA, &beta, d_C, matrix_size.uiWB));
+      checkCudaErrors(cublasGemmEx(handle, trans_B, trans_A, m, n, k, &alpha, d_B, CUDA_R_16F,
+          ldb, d_A, CUDA_R_16F, lda, &beta, d_C, CUDA_R_16F, ldc, CUBLAS_COMPUTE_16F, algo));
     }
 
     printf("done.\n");
@@ -312,17 +365,16 @@ int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) {
 
   // compute reference solution
   printf("Computing result using host CPU...");
-  float *reference = (float *)malloc(mem_size_C);
-  matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA,
-               matrix_size.uiWB);
+  __half *reference = (__half *)malloc(mem_size_C);
+  matrixMulCPU(reference, h_A, h_B, matrix_size.uiHA, matrix_size.uiWA, matrix_size.uiWB, ta, tb);
   printf("done.\n");
 
   // check result (CUBLAS)
-  bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-6f);
+  bool resCUBLAS = sdkCompareL2fe(reference, h_CUBLAS, size_C, 1.0e-2f);
 
   if (resCUBLAS != true) {
     printDiff(reference, h_CUBLAS, matrix_size.uiWC, matrix_size.uiHC, 100,
-              1.0e-5f);
+              1.0e-2f);
   }
 
   printf("Comparing CUBLAS Matrix Multiply with CPU results: %s\n",
@@ -354,12 +406,12 @@ int matrixMultiply(int argc, char **argv, int devID, sMatrixSize &matrix_size) {
 int main(int argc, char **argv) {
   printf("[Matrix Multiply CUBLAS] - Starting...\n");
 
-  int devID = 0, sizeMult = 5;
+  int devID = 0, sizeMult = 7;
   sMatrixSize matrix_size;
 
   initializeCUDA(argc, argv, devID, sizeMult, matrix_size);
 
-  int matrix_result = matrixMultiply(argc, argv, devID, matrix_size);
+  int matrix_result = matrixMultiply(argc, argv, devID, matrix_size, true, false);
 
   return matrix_result;
 }

3. Run the sample

$ ./matrixMulCUBLAS

4. Run the sample with loading control

We also provide an option to control the stress-test loading:

$ ./matrixMulCUBLAS --sizemult=[#]

The # ranges from 1 to 8, from the lowest to the full stress level.

$ ./matrixMulCUBLAS --sizemult=1
$ ./matrixMulCUBLAS --sizemult=8

The default # value is 7.
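
For reference, with the modified initializeCUDA() above every matrix is square with dimension 32 * 2^sizemult, so one GEMM iteration performs roughly 2 * N^3 half-precision operations. A small Python sketch of that arithmetic (illustrative only, not part of the sample):

# Rough per-iteration work estimate for a given --sizemult value
def gemm_work(sizemult):
    n = 32 * (2 ** sizemult)   # square matrix dimension, per the modified initializeCUDA()
    flops = 2 * n ** 3         # multiply-add count for one GEMM
    return n, flops

for s in (1, 7, 8):
    n, flops = gemm_work(s)
    print(f"--sizemult={s}: {n}x{n} matrices, ~{flops / 1e9:.1f} GFLOP per GEMM")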


Installation Steps

Darknet with cuDNN-8 Support

Below are the steps to build darknet with cuDNN-8 support.

Verified environment:

  • JetPack4.5.1 + Xavier

1. Get source

$ git clone https://github.com/pjreddie/darknet.git
$ cd darknet/
$ wget https://raw.githubusercontent.com/AastaNV/JEP/master/script/topics/0001-fix-for-cudnn_v8-limited-memory-to-default-darknet-s.patch
$ wget https://raw.githubusercontent.com/AastaNV/JEP/master/elinux/opencv-darknet.patch -O opencv-darknet.patch
$ git am 0001-fix-for-cudnn_v8-limited-memory-to-default-darknet-s.patch
$ git am opencv-darknet.patch

2. Update Makefile based on your device

GPU=1
CUDNN=1
OPENCV=1
  • Xavier & XavierNX:
ARCH= -gencode arch=compute_72,code=sm_72 \
      -gencode arch=compute_72,code=[sm_72,compute_72]
  • TX2:
ARCH= -gencode arch=compute_62,code=sm_62 \
      -gencode arch=compute_62,code=[sm_62,compute_62]
  • Nano:
ARCH= -gencode arch=compute_53,code=sm_53 \
      -gencode arch=compute_53,code=[sm_53,compute_53]

3. Build and Test

$ make -j8
$ wget https://pjreddie.com/media/files/yolov3-tiny.weights
$ ./darknet detector demo cfg/coco.data cfg/yolov3-tiny.cfg yolov3-tiny.weights [video]


TensorRT Python Bindings

Below are the steps to build TensorRT Python 3.9 bindings.

Verified environment:

  • JetPack4.6 + Xavier

1. Build Python 3.9

$ sudo apt install zlib1g-dev libncurses5-dev libgdbm-dev libnss3-dev libssl-dev libreadline-dev libffi-dev libsqlite3-dev libbz2-dev
$ wget https://www.python.org/ftp/python/3.9.1/Python-3.9.1.tar.xz
$ tar xvf Python-3.9.1.tar.xz Python-3.9.1/
$ mkdir build-python-3.9.1
$ cd build-python-3.9.1/
$ ../Python-3.9.1/configure --enable-optimizations
$ make -j $(nproc)
$ sudo -H make altinstall
$ cd ../

2. Build cmake 3.13.5

$ sudo apt-get install -y protobuf-compiler libprotobuf-dev openssl libssl-dev libcurl4-openssl-dev
$ wget https://github.com/Kitware/CMake/releases/download/v3.13.5/cmake-3.13.5.tar.gz
$ tar xvf cmake-3.13.5.tar.gz
$ rm cmake-3.13.5.tar.gz
$ cd cmake-3.13.5/
$ ./bootstrap --system-curl
$ make -j$(nproc)
$ echo 'export PATH='${PWD}'/bin/:$PATH' >> ~/.bashrc
$ source ~/.bashrc
$ cd ../

3. Prepare header

$ mkdir python3.9
$ mkdir python3.9/include
$ wget http://ftp.us.debian.org/debian/pool/main/p/python3.9/libpython3.9-dev_3.9.9-2_arm64.deb
$ ar x libpython3.9-dev_3.9.9-2_arm64.deb
$ tar -xvf data.tar.xz
$ cp ./usr/include/aarch64-linux-gnu/python3.9/pyconfig.h python3.9/include/
$ cp -r Python-3.9.1/Include/* python3.9/include/

4. Build the TensorRT Python bindings

$ git clone https://github.com/pybind/pybind11.git
$ git clone -b release/8.0 https://github.com/NVIDIA/TensorRT.git
$ cd TensorRT
$ git submodule update --init --recursive
$ cd python/
$ TRT_OSSPATH=${PWD}/.. EXT_PATH=${PWD}/../.. TARGET=aarch64 PYTHON_MINOR_VERSION=9 ./build.sh
$ python3.9 -m pip install build/dist/tensorrt-8.0.1.6-cp39-none-linux_aarch64.whl
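
A quick way to check the wheel is to import the module with the newly built interpreter; it should report the 8.0.1 version matching the wheel:

$ python3.9 -c "import tensorrt; print(tensorrt.__version__)"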


Caffe

Below are the steps to build the Caffe library.

Verified environment:

  • JetPack4.6 + Xavier
$ wget https://raw.githubusercontent.com/AastaNV/JEP/master/elinux/install_caffe_jp46.sh -O install_caffe_jp46.sh
$ wget https://raw.githubusercontent.com/AastaNV/JEP/master/elinux/0001-patch-for-jp4.6.patch -O 0001-patch-for-jp4.6.patch
$ ./install_caffe_jp46.sh
$ source ~/.bashrc


MXNet

Below are the steps to build the MXNet 1.8.0 library.

Verified environment:

  • JetPack4.5.1 + Xavier
$ wget https://raw.githubusercontent.com/AastaNV/JEP/master/elinux/mxnet_v1.8.x.patch -O mxnet_v1.8.x.patch
$ wget https://raw.githubusercontent.com/AastaNV/JEP/master/elinux/autobuild_mxnet.sh -O autobuild_mxnet.sh
$ sudo chmod +x autobuild_mxnet.sh
$ ./autobuild_mxnet.sh Xavier
$ cd mxnet/build/
$ pip3 install mxnet-1.8.0-py3-none-any.whl


PyInstaller with OpenCV

Currently, the OpenCV version bundled in JetPack is not consistent with the version that PyInstaller expects.

To solve this issue, you can either upgrade the python-opencv version or downgrade the PyInstaller version.

  • Upgrade python-opencv
$ pip3 install opencv-python
  • Downgrade pyinstaller and pyinstaller-hooks-contrib
$ sudo pip3 install pyinstaller==4.2
$ sudo pip3 install pyinstaller-hooks-contrib==2021.2
$ pyinstaller --onefile --paths="/usr/lib/python3.6/dist-packages/cv2/python-3.6" myfile.py


Cross-compile OpenCV with jetpack-linux-aarch64-crosscompile-x86 container

Please find the following link for the example:

https://forums.developer.nvidia.com/t/cannot-build-opencv-4-8-0-with-nvidia-docker/268951/13


Common Issues

"Unsupported ONNX data type: UINT8 (2)"

This error comes from TensorRT. The root cause is that the ONNX model declares the input image as UINT8, while TensorRT only supports Float32 input.

To solve this issue, you can modify the input data type of the ONNX model with the ONNX GraphSurgeon API.

$ sudo apt-get install python3-pip libprotobuf-dev protobuf-compiler
$ git clone https://github.com/NVIDIA/TensorRT.git
$ cd TensorRT/tools/onnx-graphsurgeon/
$ make install

Then update the input type with the following script:
import onnx_graphsurgeon as gs
import onnx
import numpy as np

graph = gs.import_onnx(onnx.load("model.onnx"))
for inp in graph.inputs:
    inp.dtype = np.float32

onnx.save(gs.export_onnx(graph), "updated_model.onnx")
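
To confirm the change took effect, you can reload updated_model.onnx with the same libraries and print the input dtypes, which should now be float32:

import onnx
import onnx_graphsurgeon as gs

graph = gs.import_onnx(onnx.load("updated_model.onnx"))
for inp in graph.inputs:
    print(inp.name, inp.dtype)  # expected: float32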


"Illegal instruction (core dumped)"

This is a known issue in NumPy v1.19.5.

To solve this issue, you can either downgrade NumPy to 1.19.4 or manually set an environment variable.

  • Downgrade NumPy
$ sudo apt-get install python3-pip
$ pip3 install Cython
$ pip3 install numpy==1.19.4
  • Update environment variable
$ export OPENBLAS_CORETYPE=ARMV8
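
With either option applied, importing NumPy should no longer crash:

$ python3 -c "import numpy; print(numpy.__version__)"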


Long delays when submitting several cudaMemcpy

Please try increasing the number of concurrent connections (work queues) to the device:

$ export CUDA_DEVICE_MAX_CONNECTIONS=32

The related documentation can be found here:

https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars
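
CUDA_DEVICE_MAX_CONNECTIONS mainly helps when many asynchronous copies are issued across separate streams; export it before launching the application. Below is a minimal PyCUDA sketch of that submission pattern (the stream count and buffer size are arbitrary placeholders):

# Several async host-to-device copies on separate streams; this is the kind of
# workload that benefits from more device connections
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

num_streams = 8
streams = [cuda.Stream() for _ in range(num_streams)]
host_bufs = [cuda.pagelocked_empty(1 << 20, dtype=np.float32) for _ in range(num_streams)]
dev_bufs = [cuda.mem_alloc(buf.nbytes) for buf in host_bufs]

for stream, h_buf, d_buf in zip(streams, host_bufs, dev_bufs):
    cuda.memcpy_htod_async(d_buf, h_buf, stream)

for stream in streams:
    stream.synchronize()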


h5py error when installing TensorFlow on the r32 branch

This is a compatibility issue. Please try the commands below to install:

$ sudo apt-get install libhdf5-serial-dev hdf5-tools libhdf5-dev zlib1g-dev zip libjpeg8-dev liblapack-dev libblas-dev gfortran
$ sudo apt-get install python3-pip
$ sudo pip3 install -U pip testresources setuptools
$ sudo ln -s /usr/include/locale.h /usr/include/xlocale.h
$ pip3 install Cython==0.29.36
$ pip3 install pkgconfig
$ git clone https://github.com/h5py/h5py.git
$ cd h5py/
$ git checkout 3.1.0
$ git cherry-pick 3bf862daa4ebeb2eeaf3a0491e05f5415c1818e4
$ H5PY_SETUP_REQUIRES=0 pip3 install . --no-deps --no-build-isolation
$ sudo pip3 install -U numpy==1.19.4 future mock keras_preprocessing keras_applications gast==0.2.1 protobuf pybind11 packaging
$ sudo pip3 install --extra-index-url https://developer.download.nvidia.com/compute/redist/jp/v461 tensorflow
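
Once everything is installed, a quick import check confirms the build:

$ python3 -c "import tensorflow as tf; print(tf.__version__)"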

More info can be found here:

https://forums.developer.nvidia.com/t/failed-building-wheel-of-h5py/263322/5