Hi @RicardoLu, I hit the same problem. My TRT engine returns wrong outputs at inference time: they differ from both the raw model and the ONNX model (the ONNX model works fine).
My environment:
- Docker image: nvcr.io/nvidia/deepstream:6.3-triton-multiarch
- TensorRT: 10.3.0
- CUDA: 12.6
- Python: 3.10.12
- Polygraphy: 0.49.24
- Transformers: 4.53.1
Here are my steps:
1. Export the ONNX model using optimum-cli:

```bash
optimum-cli export onnx --model Qwen/Qwen3-Embedding-0.6B --task feature-extraction --opset 19 models/qwen3_embedding_0.6b_onnx
```
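(Optional) Before building the engine, the export itself can be spot-checked with ONNX Runtime. This is a minimal sketch, not part of my original steps; it assumes the paths from the export command above and that the first model output is `last_hidden_state`:

```python
# Quick ONNX Runtime sanity check of the exported model (sketch).
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-0.6B")
sess = ort.InferenceSession(
    "models/qwen3_embedding_0.6b_onnx/model.onnx",
    providers=["CPUExecutionProvider"],
)

enc = tokenizer(["What is the capital of China?"], padding="max_length",
                max_length=1024, truncation=True, return_tensors="np")
feed = {
    "input_ids": enc["input_ids"].astype(np.int64),
    "attention_mask": enc["attention_mask"].astype(np.int64),
    "position_ids": np.arange(1024, dtype=np.int64)[None, :],
}
# First output is assumed to be last_hidden_state.
last_hidden_state = sess.run(None, feed)[0]
print(last_hidden_state.shape, last_hidden_state.mean(), last_hidden_state.std())
```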
2. Build the TRT engine with trtexec (I tried both `--fp16` and `--best`):

```bash
trtexec --onnx=models/qwen3_embedding_0.6b_onnx/model.onnx \
    --minShapes=input_ids:1x1024,attention_mask:1x1024,position_ids:1x1024 \
    --optShapes=input_ids:4x1024,attention_mask:4x1024,position_ids:4x1024 \
    --maxShapes=input_ids:8x1024,attention_mask:8x1024,position_ids:8x1024 \
    --fp16 \
    --saveEngine=qwen3_embedding_0.6b.engine
```

```bash
trtexec --onnx=models/qwen3_embedding_0.6b_onnx/model.onnx \
    --minShapes=input_ids:1x1024,attention_mask:1x1024,position_ids:1x1024 \
    --optShapes=input_ids:4x1024,attention_mask:4x1024,position_ids:4x1024 \
    --maxShapes=input_ids:8x1024,attention_mask:8x1024,position_ids:8x1024 \
    --best \
    --saveEngine=model_repository/qwen3_embedding_0.6b/1/qwen3_embedding_0.6b.engine
```
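To narrow down where the outputs diverge, Polygraphy's Comparator can run the same tokenized batch through ONNX Runtime and the built engine side by side. A minimal sketch (the tolerances are illustrative, not from my original runs):

```python
# Compare ONNX Runtime vs. the TensorRT engine on identical inputs (sketch).
import numpy as np
from transformers import AutoTokenizer
from polygraphy.backend.onnxrt import OnnxrtRunner, SessionFromOnnx
from polygraphy.backend.trt import EngineFromBytes, TrtRunner
from polygraphy.comparator import Comparator, CompareFunc

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-0.6B")
enc = tokenizer(["What is the capital of China?"], padding="max_length",
                max_length=1024, truncation=True, return_tensors="np")
feed = {
    "input_ids": enc["input_ids"].astype(np.int64),
    "attention_mask": enc["attention_mask"].astype(np.int64),
    "position_ids": np.arange(1024, dtype=np.int64)[None, :],
}

with open("qwen3_embedding_0.6b.engine", "rb") as f:
    engine = EngineFromBytes(f.read())

runners = [
    OnnxrtRunner(SessionFromOnnx("models/qwen3_embedding_0.6b_onnx/model.onnx")),
    TrtRunner(engine),
]
# Feed the same batch to both runners, then check outputs against each other.
results = Comparator.run(runners, data_loader=[feed])
Comparator.compare_accuracy(results,
                            compare_func=CompareFunc.simple(rtol=1e-2, atol=1e-2))
```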
3. Check the TRT engine with a Python script:
```python
from transformers import AutoTokenizer
from polygraphy.backend.trt import EngineFromBytes, TrtRunner
import time
import numpy as np


def run_tensorrt_polygraphy_model(texts, engine_path):
    print("\n" + "=" * 50)
    print("RUNNING TENSORRT MODEL")
    print("=" * 50)

    # Load tokenizer
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Embedding-0.6B")
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    try:
        # Load TensorRT engine using Polygraphy
        with open(engine_path, 'rb') as f:
            engine_bytes = f.read()
        engine = EngineFromBytes(engine_bytes)
        print(f"TensorRT engine loaded from {engine_path}")

        # Create TensorRT runner
        with TrtRunner(engine) as runner:
            # Tokenize inputs
            start_time = time.time()
            inputs = tokenizer(
                texts,
                padding='max_length',
                max_length=1024,
                truncation=True,
                return_tensors="np"
            )
            tokenization_time = time.time() - start_time
            print(f"Tokenization time: {tokenization_time:.4f}s")

            # Print input info
            print("Input shapes:")
            for key, value in inputs.items():
                print(f"  {key}: {value.shape}")

            # Prepare inputs
            input_ids = inputs['input_ids'].astype(np.int64)
            attention_mask = inputs['attention_mask'].astype(np.int64)
            position_ids = np.arange(1024, dtype=np.int64)[np.newaxis, :].repeat(len(texts), axis=0)
            print(f"  position_ids: {position_ids.shape}")

            # Prepare input dictionary for TensorRT
            input_dict = {
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'position_ids': position_ids
            }

            # Run inference
            start_time = time.time()
            outputs = runner.infer(input_dict)
            inference_time = time.time() - start_time
            print(f"Inference time: {inference_time:.4f}s")

            # Get output (assuming first output is last_hidden_state)
            output_names = list(outputs.keys())
            print(f"Available outputs: {output_names}")

            # Get the main output (should be last_hidden_state)
            last_hidden_state = None
            for output_name in output_names:
                if 'last_hidden_state' in output_name.lower() or len(output_names) == 1:
                    last_hidden_state = outputs[output_name]
                    break
            if last_hidden_state is None:
                # Take the first output if no clear match
                last_hidden_state = outputs[output_names[0]]
                print(f"Using output: {output_names[0]}")

            print(f"TensorRT output shape: {last_hidden_state.shape}")
            print(f"TensorRT output range: [{last_hidden_state.min():.6f}, {last_hidden_state.max():.6f}]")
            print(f"TensorRT output mean: {last_hidden_state.mean():.6f}")
            print(f"TensorRT output std: {last_hidden_state.std():.6f}")

            # Pool embeddings
            embeddings_list = pool_embeddings(last_hidden_state, attention_mask)
            embeddings = np.stack(embeddings_list)
            print(f"Embeddings shape after pooling: {embeddings.shape}")

            # Calculate similarity
            similarity_scores, normalized_embeddings = calculate_similarity(embeddings)

            # Calculate norms
            norms = np.linalg.norm(normalized_embeddings, axis=1)
            print(f"Embedding norms: {norms.tolist()}")
            print("Similarity scores:")
            print(f"  TensorRT scores: {similarity_scores}")

            return {
                'embeddings': normalized_embeddings,
                'raw_outputs': last_hidden_state,
                'inputs': {k: v for k, v in inputs.items()},
                'similarity_scores': similarity_scores,
                'inference_time': inference_time,
                'output_stats': {
                    'min': float(last_hidden_state.min()),
                    'max': float(last_hidden_state.max()),
                    'mean': float(last_hidden_state.mean()),
                    'std': float(last_hidden_state.std())
                }
            }
    except Exception as e:
        print(f"TensorRT Polygraphy model failed: {e}")
        import traceback
        traceback.print_exc()
        return None


def main():
    texts = [
        "What is the capital of China?",
        "The capital of China is Beijing.",
        "Gravity is a force that attracts two bodies toward each other."
    ]
    engine_path = "/qwen3_embedding_0.6b/1/qwen3_embedding_0.6b.engine"
    tensorrt_result = run_tensorrt_polygraphy_model(texts, engine_path)


if __name__ == "__main__":
    main()
```
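The script calls `pool_embeddings` and `calculate_similarity`, which are not shown above. A plausible version of them, assuming the last-token pooling that the Qwen3-Embedding model card describes plus plain cosine similarity (these exact implementations are my sketch, not necessarily the originals):

```python
# Hypothetical helpers (sketch): last-token pooling + cosine similarity.
import numpy as np

def pool_embeddings(last_hidden_state, attention_mask):
    """Pick the hidden state of each sequence's final non-padding token
    (assumes right padding, as produced by the tokenizer call above)."""
    seq_lengths = attention_mask.sum(axis=1) - 1  # index of last real token
    return [last_hidden_state[i, seq_lengths[i]]
            for i in range(last_hidden_state.shape[0])]

def calculate_similarity(embeddings):
    """L2-normalize embeddings and return the pairwise cosine-similarity
    matrix along with the normalized embeddings."""
    normalized = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
    return normalized @ normalized.T, normalized
```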