TensorRT 8.5 inference differs from the original ONNX model

Description

I trained a model with PyTorch and exported it to ONNX format; the ONNX model predicts correct values. But when I convert it to a TensorRT engine, the engine gives wrong inference results.
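
For reference, the export step looked roughly like the following. This is a minimal sketch, not the exact command used: it assumes a BERT-style detector/corrector model with input_ids / attention_mask / token_type_ids inputs and two logits outputs (matching the repro script further down); the checkpoint path, output names, and opset are placeholders.

import torch
from transformers import BertTokenizerFast

# Hypothetical: load the trained detector/corrector model (not part of this thread)
model = torch.load("detector_corrector.pt", map_location="cpu")
model.eval()

tokenizer = BertTokenizerFast.from_pretrained("hfl/chinese-macbert-base")
dummy = tokenizer(["this is a sad thing"] * 5, padding="max_length",
                  max_length=128, truncation=True, return_tensors="pt")

torch.onnx.export(
    model,
    (dummy["input_ids"], dummy["attention_mask"], dummy["token_type_ids"]),
    "detector_corrector.onnx",
    input_names=["input_ids", "attention_mask", "token_type_ids"],
    output_names=["detector_logits", "corrector_logits"],   # assumed output names
    opset_version=13,                                        # assumption: an opset supported by TensorRT 8.5
    do_constant_folding=True,
)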

Environment

TensorRT Version: 8.5.0.12
GPU Type: NVIDIA GeForce RTX 3090
Nvidia Driver Version: 470.129.06
CUDA Version: cuda_11.8.r11.8
CUDNN Version: 8
Operating System + Version: Ubuntu 20.04.5 LTS
Python Version (if applicable): 3.8.10
TensorFlow Version (if applicable):
PyTorch Version (if applicable): 1.12.1
Baremetal or Container (if container which image + tag):

Hi @liwenjudetiankong,
Can you please share the ONNX model and a reproducible script with us, so that we can try reproducing the issue?
Thanks

The model is here: Tensorrt8.5 inference different with origin onnx model

Hi,

Could you please also share the issue repro script with us, so that we can try it from our end for better debugging?

Thank you.

from transformers import BertTokenizerFast
import os
import json
import time
import threading
from typing import Tuple, List

import numpy as np

try:
    import pycuda.driver as cuda

    cuda.init()
    import tensorrt as trt

    TRT_LOGGER = trt.Logger(trt.Logger.VERBOSE)
    TRT_AVAILABLE = True
except ImportError:
    TRT_AVAILABLE = False

try:
    import onnxruntime

    ONNXRUNTIME_AVAILABLE = True
except ImportError:
    ONNXRUNTIME_AVAILABLE = False


tokenizer = BertTokenizerFast.from_pretrained("hfl/chinese-macbert-base")


def preprocess_data(text="this is a sad thing", is_trt=False):
    # Tokenize a small batch of 5 identical sentences, padded to a fixed length of 128
    texts = [text for _ in range(5)]
    context = tokenizer(texts, padding="max_length", return_tensors='pt',
                        max_length=128, truncation=True, return_offsets_mapping=True)
    input_ids = context['input_ids'].detach().cpu().numpy()
    attention_mask = context['attention_mask'].detach().cpu().numpy()
    token_type_ids = context['token_type_ids'].detach().cpu().numpy()

    if is_trt:
        return [input_ids, attention_mask, token_type_ids]
    else:
        return {"input_ids": input_ids, "attention_mask": attention_mask,
                "token_type_ids": token_type_ids}


class TrtModel:
    def __init__(self, model_name="detector_corrector", model_dir=".",
                 cached_engine=True, max_batch_size=5) -> None:
        self.cfx = cuda.Device(0).make_context()
        self.model_dir = model_dir
        self.model_name = model_name
        self.max_batch_size = max_batch_size
        self.cached_engine = cached_engine
        self.engine = self.load_model(
            os.path.join(model_dir, model_name + ".onnx"))
        self.input_binding_idxs, self.output_binding_idxs = self.get_binding_idxs()
        self.input_names = [self.engine.get_binding_name(binding_idx)
                            for binding_idx in self.input_binding_idxs]
        self.output_names = [self.engine.get_binding_name(binding_idx)
                             for binding_idx in self.output_binding_idxs]

    def __del__(self):
        self.cfx.detach()

    def load_model(self, model_path):
        return self.load_engine(model_path)

    def get_context(self):
        return self.engine.create_execution_context()

    def get_stream(self):
        return cuda.Stream()

    def predict(self, host_inputs):
        # Copy inputs to the device, run the engine, copy outputs back to the host
        self.cfx.push()
        context = self.get_context()
        stream = self.get_stream()
        device_inputs = [cuda.mem_alloc(h_input.nbytes)
                         for h_input in host_inputs]
        for h_input, d_input in zip(host_inputs, device_inputs):
            cuda.memcpy_htod_async(d_input, h_input, stream)

        host_outputs, device_outputs = self.gen_output_buffer(host_inputs, context)
        bindings = device_inputs + device_outputs
        exe_res = context.execute_async_v2(
            bindings=bindings, stream_handle=stream.handle)
        if not exe_res:
            print(f"{self.__class__.__name__} execute_async_v2 error")
        for h_output, d_output in zip(host_outputs, device_outputs):
            cuda.memcpy_dtoh_async(h_output, d_output, stream)

        stream.synchronize()

        for b in bindings:
            b.free()

        self.cfx.pop()
        return host_outputs

    def gen_output_buffer(self, host_inputs: List[np.ndarray], context):
        assert context.all_binding_shapes_specified

        host_outputs = []
        device_outputs = []
        for binding_index in self.output_binding_idxs:
            output_shape = context.get_binding_shape(binding_index)
            # Allocate buffers to hold output results after copying back to host
            buffer = np.empty(output_shape, dtype=np.float32)
            host_outputs.append(buffer)
            # Allocate output buffers on device
            device_outputs.append(cuda.mem_alloc(buffer.nbytes))

        return host_outputs, device_outputs

    def get_binding_idxs(self):
        # Separate input and output binding indices for convenience
        input_binding_idxs = []
        output_binding_idxs = []
        for binding_index in range(self.engine.num_bindings):
            if self.engine.binding_is_input(binding_index):
                input_binding_idxs.append(binding_index)
            else:
                output_binding_idxs.append(binding_index)

        return input_binding_idxs, output_binding_idxs

    def load_engine(self, onnx_file_path):
        runtime = trt.Runtime(TRT_LOGGER)
        cached_engine_path = os.path.join(
            self.model_dir, self.model_name + ".engine")
        if self.cached_engine and os.path.exists(cached_engine_path):
            with open(cached_engine_path, "rb") as f:
                serialized_engine = f.read()
                engine = runtime.deserialize_cuda_engine(serialized_engine)
                print(f"loaded engine from cache: {cached_engine_path} successfully")
                return engine

        EXPLICIT_BATCH = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)

        with trt.Builder(TRT_LOGGER) as builder, builder.create_network(EXPLICIT_BATCH) as network, trt.OnnxParser(network, TRT_LOGGER) as parser:
            builder.max_batch_size = self.max_batch_size
            with open(onnx_file_path, 'rb') as model:
                if not parser.parse(model.read()):
                    for error in range(parser.num_errors):
                        print(parser.get_error(error))
            config = builder.create_builder_config()
            config.max_workspace_size = 1 << 30
            # config.set_flag(trt.BuilderFlag.FP16)
            serialized_engine = builder.build_serialized_network(network, config=config)
            engine = runtime.deserialize_cuda_engine(serialized_engine)
            print("built engine from {} successfully".format(onnx_file_path))
            if self.cached_engine:
                with open(cached_engine_path, "wb") as f:
                    f.write(serialized_engine)
                    print(f"cached engine to: {cached_engine_path}")

        return engine


class OnnxModel:
    def __init__(self, model_name="detector_corrector", model_dir="."):
        if not model_name.endswith(".onnx"):
            model_name = model_name + ".onnx"
        model_path = os.path.join(model_dir, model_name)
        print(f"onnx model path is {model_path}")
        self.ort_session = self.load_model(model_path)

    def load_model(self, model_path):
        providers = ['CUDAExecutionProvider']  # onnxruntime
        # sess_options = onnxruntime.SessionOptions()
        # sess_options.intra_op_num_threads = 10
        # sess_options.inter_op_num_threads = 10
        print(f"onnxruntime get device {onnxruntime.get_device()} available providers {onnxruntime.get_available_providers()}")
        ort_session = onnxruntime.InferenceSession(model_path, providers=providers)
        print(f"onnxruntime session providers {ort_session.get_providers()}")

        return ort_session

    def predict(self, inputs):
        ort_outs = self.ort_session.run(None, inputs)
        return ort_outs


# Run the same batch through TensorRT and ONNX Runtime and compare the outputs
trt_model = TrtModel()
onnx_model = OnnxModel()

trt_output = trt_model.predict(preprocess_data(is_trt=True))
onnx_output = onnx_model.predict(preprocess_data(is_trt=False))

trt_detector_logits, trt_corrector_logits = trt_output
onnx_detector_logits, onnx_corrector_logits = onnx_output

assert np.allclose(trt_detector_logits, onnx_detector_logits)
assert np.allclose(trt_corrector_logits, onnx_corrector_logits)

If you run this code, the np.allclose checks always return False.
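
As a side note, printing the actual error magnitude instead of a bare assert makes it easier to tell an FP32 tolerance issue from a real accuracy bug. A small sketch, reusing the trt_*_logits and onnx_*_logits arrays produced by the script above (the report_diff helper and the tolerances are illustrative, not part of the original repro):

def report_diff(name, trt_out, onnx_out, rtol=1e-3, atol=1e-3):
    trt_out = np.asarray(trt_out)
    onnx_out = np.asarray(onnx_out)
    abs_diff = np.abs(trt_out - onnx_out)
    print(f"{name}: max abs diff {abs_diff.max():.6f}, "
          f"mean abs diff {abs_diff.mean():.6f}, "
          f"allclose(rtol={rtol}, atol={atol}) = "
          f"{np.allclose(trt_out, onnx_out, rtol=rtol, atol=atol)}")

report_diff("detector_logits", trt_detector_logits, onnx_detector_logits)
report_diff("corrector_logits", trt_corrector_logits, onnx_corrector_logits)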

Hi,

Sorry for the delayed response. We were able to reproduce the issue.
Please allow us some time to work on this.

Thank you.

Hi,

Please refer to Tensorrt8.5 inference different with origin onnx model - #10 by spolisetty

Thank you.