当前位置：首页 > news >正文

jetson orin nano super AI模型部署之路（五）tensorrt C++ api介绍

news 来源：原创 2025/8/3 1:18:53

我们基于tensorrt-cpp-api这个仓库介绍。这个仓库的代码是一个非常不错的tensorrt的cpp api实现，可基于此开发自己的项目。

我们从src/main.cpp开始按顺序说明。

一、首先是声明我们创建tensorrt model的参数。

 // Specify our GPU inference configuration optionsOptions options;// Specify what precision to use for inference// FP16 is approximately twice as fast as FP32.options.precision = Precision::FP16;// If using INT8 precision, must specify path to directory containing// calibration data.options.calibrationDataDirectoryPath = "";// Specify the batch size to optimize for.options.optBatchSize = 1;// Specify the maximum batch size we plan on running.options.maxBatchSize = 1;// Specify the directory where you want the model engine model file saved.options.engineFileDir = ".";

这里Options是在src/engine.h中声明的一个结构体


// 网络的选项配置
struct Options {// 用于 GPU 推理的精度模式// 可选值：FP32（单精度浮点）、FP16（半精度浮点）、INT8（8位量化）Precision precision = Precision::FP16;// 如果选择 INT8 精度，必须提供校准数据集的目录路径std::string calibrationDataDirectoryPath;// 用于计算 INT8 校准数据的批量大小// 应设置为 GPU 能支持的最大批量大小int32_t calibrationBatchSize = 128;// 优化的批量大小// 表示引擎在构建时会针对该批量大小进行优化int32_t optBatchSize = 1;// 最大允许的批量大小// 表示引擎支持的最大批量大小int32_t maxBatchSize = 16;// GPU 设备索引// 用于指定在哪个 GPU 上运行推理int deviceIndex = 0;// 引擎文件保存的目录// 用于存储 TensorRT 引擎文件std::string engineFileDir = ".";// 最大允许的输入宽度// 默认为 -1，表示期望固定的输入大小int32_t maxInputWidth = -1;// 最小允许的输入宽度// 默认为 -1，表示期望固定的输入大小int32_t minInputWidth = -1;// 优化的输入宽度// 默认为 -1，表示期望固定的输入大小int32_t optInputWidth = -1;
};

二、构建tensorrt推理的engine

Engine<float> engine(options);

这里Engine是一个模板类，继承字IEngine类。IEngine 是一个抽象接口类，定义了 TensorRT 推理引擎的核心功能。它包括构建和加载网络、运行推理、获取输入和输出张量维度等功能。通过模板参数 T，可以支持不同的数据类型（如 float 或 int）。具体的实现需要在派生类中完成。

IEngine类的实现在include/interfaces/IEngine.h中。

template <typename T>
class IEngine {
public:// 虚析构函数，确保派生类的资源能够正确释放virtual ~IEngine() = default;// 构建并加载 ONNX 模型到 TensorRT 引擎文件// 参数：// - onnxModelPath: ONNX 模型文件的路径// - subVals: 输入数据的减法均值，用于归一化// - divVals: 输入数据的除法均值，用于归一化// - normalize: 是否对输入数据进行归一化// 返回值：// - 如果成功构建并加载网络，返回 true；否则返回 falsevirtual bool buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) = 0;// 从磁盘加载 TensorRT 引擎文件到内存// 参数：// - trtModelPath: TensorRT 引擎文件的路径// - subVals: 输入数据的减法均值，用于归一化// - divVals: 输入数据的除法均值，用于归一化// - normalize: 是否对输入数据进行归一化// 返回值：// - 如果成功加载网络，返回 true；否则返回 falsevirtual bool loadNetwork(std::string trtModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) = 0;// 运行推理// 参数：// - inputs: 输入数据，格式为 [input][batch][cv::cuda::GpuMat]// - featureVectors: 输出数据，格式为 [batch][output][feature_vector]// 返回值：// - 如果推理成功，返回 true；否则返回 falsevirtual bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &inputs, std::vector<std::vector<std::vector<T>>> &featureVectors) = 0;// 获取输入张量的维度// 返回值：// - 输入张量的维度列表，类型为 nvinfer1::Dims3virtual const std::vector<nvinfer1::Dims3> &getInputDims() const = 0;// 获取输出张量的维度// 返回值：// - 输出张量的维度列表，类型为 nvinfer1::Dimsvirtual const std::vector<nvinfer1::Dims> &getOutputDims() const = 0;
};

再来看Engine类的实现。

template <typename T>
class Engine : public IEngine<T> {
public:// 构造函数，初始化引擎选项Engine(const Options &options);// 析构函数，释放 GPU 缓冲区~Engine();// 构建并加载 ONNX 模型到 TensorRT 引擎文件// 将模型缓存到磁盘以避免重复构建，并加载到内存中// 默认情况下，输入数据会被归一化到 [0.f, 1.f]// 如果需要 [-1.f, 1.f] 的归一化，可以通过 subVals 和 divVals 参数设置bool buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override;// 从磁盘加载 TensorRT 引擎文件到内存// 默认情况下，输入数据会被归一化到 [0.f, 1.f]// 如果需要 [-1.f, 1.f] 的归一化，可以通过 subVals 和 divVals 参数设置bool loadNetwork(std::string trtModelPath, const std::array<float, 3> &subVals = {0.f, 0.f, 0.f},const std::array<float, 3> &divVals = {1.f, 1.f, 1.f}, bool normalize = true) override;// 运行推理// 输入格式：[input][batch][cv::cuda::GpuMat]// 输出格式：[batch][output][feature_vector]bool runInference(const std::vector<std::vector<cv::cuda::GpuMat>> &inputs, std::vector<std::vector<std::vector<T>>> &featureVectors) override;// 工具函数：调整图像大小并保持宽高比，通过在右侧或底部填充来实现// 适用于需要将检测坐标映射回原始图像的场景（例如 YOLO 模型）static cv::cuda::GpuMat resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width,const cv::Scalar &bgcolor = cv::Scalar(0, 0, 0));// 获取输入张量的维度[[nodiscard]] const std::vector<nvinfer1::Dims3> &getInputDims() const override { return m_inputDims; };// 获取输出张量的维度[[nodiscard]] const std::vector<nvinfer1::Dims> &getOutputDims() const override { return m_outputDims; };// 工具函数：将三维嵌套的输出数组转换为二维数组// 适用于批量大小为 1 且有多个输出特征向量的情况static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output);// 工具函数：将三维嵌套的输出数组转换为一维数组// 适用于批量大小为 1 且只有一个输出特征向量的情况static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output);// 工具函数：将输入从 NHWC 格式转换为 NCHW 格式，并应用归一化和均值减法static cv::cuda::GpuMat blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,const std::array<float, 3> &divVals, bool normalize, bool swapRB = false);private:// 构建网络bool build(std::string onnxModelPath, const std::array<float, 3> &subVals, const std::array<float, 3> &divVals, bool normalize);// 将引擎选项序列化为字符串std::string serializeEngineOptions(const Options &options, const std::string &onnxModelPath);// 获取设备名称列表void getDeviceNames(std::vector<std::string> &deviceNames);// 清除 GPU 缓冲区void clearGpuBuffers();// 输入数据的归一化、缩放和均值减法参数std::array<float, 3> m_subVals{};std::array<float, 3> m_divVals{};bool m_normalize;// 存储输入和输出 GPU 缓冲区的指针std::vector<void *> m_buffers;std::vector<uint32_t> m_outputLengths{};std::vector<nvinfer1::Dims3> m_inputDims;std::vector<nvinfer1::Dims> m_outputDims;std::vector<std::string> m_IOTensorNames;int32_t m_inputBatchSize;// TensorRT 运行时和推理上下文std::unique_ptr<nvinfer1::IRuntime> m_runtime = nullptr;std::unique_ptr<Int8EntropyCalibrator2> m_calibrator = nullptr;std::unique_ptr<nvinfer1::ICudaEngine> m_engine = nullptr;std::unique_ptr<nvinfer1::IExecutionContext> m_context = nullptr;// 引擎选项const Options m_options;// TensorRT 日志器Logger m_logger;
};// 构造函数：初始化引擎选项
template <typename T>
Engine<T>::Engine(const Options &options) : m_options(options) {}// 析构函数：清除 GPU 缓冲区
template <typename T>
Engine<T>::~Engine() { clearGpuBuffers(); }

相较于抽象接口类IEngine，Engine类添加了resizeKeepAspectRatioPadRightBottom、transformOutput、blobFromGpuMats这几个static的public函数。

    // 工具函数：调整图像大小并保持宽高比，通过在右侧或底部填充来实现// 适用于需要将检测坐标映射回原始图像的场景（例如 YOLO 模型）static cv::cuda::GpuMat resizeKeepAspectRatioPadRightBottom(const cv::cuda::GpuMat &input, size_t height, size_t width,const cv::Scalar &bgcolor = cv::Scalar(0, 0, 0));

    // 工具函数：将三维嵌套的输出数组转换为二维数组// 适用于批量大小为 1 且有多个输出特征向量的情况static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<std::vector<T>> &output);// 工具函数：将三维嵌套的输出数组转换为一维数组// 适用于批量大小为 1 且只有一个输出特征向量的情况static void transformOutput(std::vector<std::vector<std::vector<T>>> &input, std::vector<T> &output);// 工具函数：将输入从 NHWC 格式转换为 NCHW 格式，并应用归一化和均值减法static cv::cuda::GpuMat blobFromGpuMats(const std::vector<cv::cuda::GpuMat> &batchInput, const std::array<float, 3> &subVals,const std::array<float, 3> &divVals, bool normalize, bool swapRB = false);

在类中，static 修饰的成员函数是静态成员函数，它与普通成员函数有以下区别：

(1)静态成员函数不依赖于类的实例 静态成员函数属于类本身，而不是类的某个实例。调用静态成员函数时，不需要创建类的对象，可以直接通过类名调用。

(2)静态成员函数不能访问非静态成员变量 静态成员函数没有 this 指针，因此无法访问类的非静态成员变量或非静态成员函数。静态成员函数只能访问类的静态成员变量或调用其他静态成员函数。

在这里为什么使用 static？

(1)工具函数的设计 transformOutput 是一个工具函数，用于将三维嵌套的输出数组转换为二维数组。它的功能与类的实例无关，因此设计为静态函数更合理。这样可以直接通过类名调用，而不需要创建类的对象。

(2)提高代码的可读性和效率 将与类实例无关的函数声明为静态函数，可以明确表示该函数不依赖于类的状态。静态函数的调用效率通常比非静态函数更高，因为它不需要隐式传递 this 指针。

第二个修改是对于getInputDims和getOutputDims函数的处理。在抽象接口类IEngine中，其实现方式是

virtual const std::vector<nvinfer1::Dims3> &getInputDims() const = 0;

在Engine类中，其声明变成了

    // 获取输入张量的维度[[nodiscard]] const std::vector<nvinfer1::Dims3> &getInputDims() const override { return m_inputDims; };

[[nodiscard]] 是 C++17 引入的一个属性，用于提示调用者不要忽略函数的返回值。如果调用者忽略了带有 [[nodiscard]] 属性的函数的返回值，编译器会发出警告。

在这段代码中，getInputDims() 返回的是输入张量的维度信息。如果调用者忽略了这个返回值，可能会导致程序逻辑错误。使用 [[nodiscard]] 可以提醒开发者注意返回值的重要性。

例如engine.getInputDims(); // 如果没有使用返回值，编译器会发出警告。

override 是 C++11 引入的关键字，用于显式声明一个函数是从基类继承并重写的虚函数。如果函数没有正确地重写基类中的虚函数（例如函数签名不匹配），编译器会报错。

它可以防止由于函数签名错误而导致的意外行为。在这段代码中，getInputDims() 是从基类 IEngine 中继承并重写的虚函数。

第一个 const: 表示返回的引用是常量，调用者不能修改返回的 std::vector。第二个 const: 表示该成员函数不会修改类的成员变量，保证函数是只读的。

需要注意的是，在src/engine.h中，使用了模版类的实现，这有点要注意：


// 忽略之前的所有代码// 构造函数：初始化引擎选项
template <typename T>
Engine<T>::Engine(const Options &options) : m_options(options) {}// 析构函数：清除 GPU 缓冲区
template <typename T>
Engine<T>::~Engine() { clearGpuBuffers(); }// Include inline implementations
#include "engine/EngineRunInference.inl"
#include "engine/EngineUtilities.inl"
#include "engine/EngineBuildLoadNetwork.inl"

在文件的最后，才include了这几个头文件。

EngineRunInference.inl: 包含 runInference 函数的实现。

EngineUtilities.inl: 包含工具函数（如 transformOutput）的实现。

EngineBuildLoadNetwork.inl: 包含 buildLoadNetwork 和 loadNetwork 函数的实现。

将 #include 头文件放在文件末尾的原因是为了包含内联实现文件（.inl 文件），这种设计通常用于模板类或需要将实现与声明分离的情况下。

.inl 文件的作用 .inl 文件通常用于存放模板类或函数的实现。在 C++ 中，模板类或模板函数的实现必须在编译时可见，因此不能像普通类那样将实现放在 .cpp 文件中。为了保持代码的清晰和模块化，开发者会将模板类的实现从头文件中分离出来，放入 .inl 文件中。
为什么将 .inl 文件放在文件末尾？ 2.1 避免重复定义如果在头文件的开头包含 .inl 文件，可能会导致重复定义的问题，尤其是在头文件被多次包含时。将 .inl 文件放在头文件末尾，可以确保模板类的声明先被处理，然后再处理实现部分。

2.2 确保依赖的声明已完成 .inl 文件中的实现可能依赖于头文件中声明的类或函数。将 .inl 文件放在头文件末尾，可以确保所有必要的声明都已完成，避免编译错误。

2.3 提高代码的可读性将 .inl 文件放在头文件末尾，可以将类的声明和实现逻辑分开，增强代码的可读性和可维护性。开发者可以在头文件中快速查看类的接口，而不需要直接阅读实现细节。 3. 为什么不直接放在 .cpp 文件中？模板类或模板函数的实现不能放在 .cpp 文件中，因为模板的具体类型是在编译时实例化的，而 .cpp 文件是在链接阶段处理的。如果将模板实现放在 .cpp 文件中，编译器在实例化模板时将无法找到实现，导致链接错误。

三、tensorrt推理Engine的具体实现

3.1 buildLoadNetwork实现

buildLoadNetwork在include/engine/EngineBuildLoadNetwork.inl中实现。

template <typename T>
bool Engine<T>::buildLoadNetwork(std::string onnxModelPath, const std::array<float, 3> &subVals, const std::array<float, 3> &divVals,bool normalize) {const auto engineName = serializeEngineOptions(m_options, onnxModelPath);const auto engineDir = std::filesystem::path(m_options.engineFileDir);std::filesystem::path enginePath = engineDir / engineName;spdlog::info("Searching for engine file with name: {}", enginePath.string());if (Util::doesFileExist(enginePath)) {spdlog::info("Engine found, not regenerating...");} else {if (!Util::doesFileExist(onnxModelPath)) {auto msg = "Could not find ONNX model at path: " + onnxModelPath;spdlog::error(msg);throw std::runtime_error(msg);}spdlog::info("Engine not found, generating. This could take a while...");if (!std::filesystem::exists(engineDir)) {std::filesystem::create_directories(engineDir);spdlog::info("Created directory: {}", engineDir.string());}auto ret = build(onnxModelPath, subVals, divVals, normalize);if (!ret) {return false;}}return loadNetwork(enginePath, subVals, divVals, normalize);
}

其中，最重要的是build函数的实现。build的实现比较长，该函数的作用是从一个 ONNX 模型文件构建 TensorRT 引擎。它会解析 ONNX 模型文件，设置优化配置（如动态批量大小、动态输入宽度、精度模式等），并最终生成一个序列化的 TensorRT 引擎文件。

其完整实现如下代码，后面会对其中重要代码展开解释，一并写在代码注释上。

3.2 build函数实现（最重要的函数）

template <typename T>
bool Engine<T>::build(std::string onnxModelPath, const std::array<float, 3> &subVals, const std::array<float, 3> &divVals, bool normalize) {// Create our engine builder.//创建 TensorRT 构建器和网络// nvinfer1::createInferBuilder:// 创建一个 TensorRT 构建器（IBuilder），用于构建和优化网络。// m_logger 是一个自定义的日志器，用于记录 TensorRT 的日志信息。auto builder = std::unique_ptr<nvinfer1::IBuilder>(nvinfer1::createInferBuilder(m_logger));if (!builder) {return false;}// Define an explicit batch size and then create the network (implicit batch// size is deprecated). More info here:// https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#explicit-implicit-batch// builder->createNetworkV2:// 创建一个网络定义（INetworkDefinition），用于描述神经网络的结构。// 使用 kEXPLICIT_BATCH 标志表示显式批量大小（TensorRT 7.0 之后推荐使用）。auto explicitBatch = 1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH);auto network = std::unique_ptr<nvinfer1::INetworkDefinition>(builder->createNetworkV2(explicitBatch));if (!network) {return false;}// Create a parser for reading the onnx file.//  创建 ONNX 解析器并解析模型// nvonnxparser::createParser:// 创建一个 ONNX 解析器（IParser），用于将 ONNX 模型解析为 TensorRT 的网络定义。auto parser = std::unique_ptr<nvonnxparser::IParser>(nvonnxparser::createParser(*network, m_logger));if (!parser) {return false;}// We are going to first read the onnx file into memory, then pass that buffer// to the parser. Had our onnx model file been encrypted, this approach would// allow us to first decrypt the buffer.// 读取 ONNX 文件:// 将 ONNX 模型文件读入内存缓冲区（buffer），然后传递给解析器。// 这种方式允许对模型文件进行预处理（如解密）。std::ifstream file(onnxModelPath, std::ios::binary | std::ios::ate);std::streamsize size = file.tellg();file.seekg(0, std::ios::beg);std::vector<char> buffer(size);if (!file.read(buffer.data(), size)) {auto msg = "Error, unable to read engine file";spdlog::error(msg);throw std::runtime_error(msg);}// parser->parse:// 解析 ONNX 模型文件，将其转换为 TensorRT 的网络定义。// ONNX 是一种常见的模型格式，TensorRT 提供了专门的解析器来支持 ONNX 模型。// 通过将文件读入内存，可以灵活处理加密或压缩的模型文件。// Parse the buffer we read into memory.auto parsed = parser->parse(buffer.data(), buffer.size());if (!parsed) {return false;}// Ensure that all the inputs have the same batch sizeconst auto numInputs = network->getNbInputs();if (numInputs < 1) {auto msg = "Error, model needs at least 1 input!";spdlog::error(msg);throw std::runtime_error(msg);}// 检查输入的批量大小// TensorRT 要求所有输入的批量大小必须一致。const auto input0Batch = network->getInput(0)->getDimensions().d[0];for (int32_t i = 1; i < numInputs; ++i) {if (network->getInput(i)->getDimensions().d[0] != input0Batch) {auto msg = "Error, the model has multiple inputs, each with differing batch sizes!";spdlog::error(msg);throw std::runtime_error(msg);}}// Check to see if the model supports dynamic batch size or notbool doesSupportDynamicBatch = false;if (input0Batch == -1) {doesSupportDynamicBatch = true;spdlog::info("Model supports dynamic batch size");} else {spdlog::info("Model only supports fixed batch size of {}", input0Batch);// If the model supports a fixed batch size, ensure that the maxBatchSize// and optBatchSize were set correctly.if (m_options.optBatchSize != input0Batch || m_options.maxBatchSize != input0Batch) {auto msg = "Error, model only supports a fixed batch size of " + std::to_string(input0Batch) +". Must set Options.optBatchSize and Options.maxBatchSize to 1";spdlog::error(msg);throw std::runtime_error(msg);}}const auto input3Batch = network->getInput(0)->getDimensions().d[3];bool doesSupportDynamicWidth = false;if (input3Batch == -1) {doesSupportDynamicWidth = true;spdlog::info("Model supports dynamic width. Using Options.maxInputWidth, Options.minInputWidth, and Options.optInputWidth to set the input width.");// Check that the values of maxInputWidth, minInputWidth, and optInputWidth are validif (m_options.maxInputWidth < m_options.minInputWidth || m_options.maxInputWidth < m_options.optInputWidth ||m_options.minInputWidth > m_options.optInputWidth|| m_options.maxInputWidth < 1 || m_options.minInputWidth < 1 || m_options.optInputWidth < 1) {auto msg = "Error, invalid values for Options.maxInputWidth, Options.minInputWidth, and Options.optInputWidth";spdlog::error(msg);throw std::runtime_error(msg);}}//设置优化配置auto config = std::unique_ptr<nvinfer1::IBuilderConfig>(builder->createBuilderConfig());if (!config) {return false;}//设置优化配置，包括动态批量大小和动态输入宽度。// 使用 IOptimizationProfile 定义输入的最小、优化和最大维度。// 为什么要这么写？// TensorRT 允许为动态输入设置优化配置，以便在推理时根据实际输入调整性能。// 通过设置优化配置，可以在性能和灵活性之间取得平衡。// Register a single optimization profilenvinfer1::IOptimizationProfile *optProfile = builder->createOptimizationProfile();for (int32_t i = 0; i < numInputs; ++i) {// Must specify dimensions for all the inputs the model expects.const auto input = network->getInput(i);const auto inputName = input->getName();const auto inputDims = input->getDimensions();int32_t inputC = inputDims.d[1];int32_t inputH = inputDims.d[2];int32_t inputW = inputDims.d[3];int32_t minInputWidth = std::max(m_options.minInputWidth, inputW);// Specify the optimization profile`if (doesSupportDynamicBatch) {optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(1, inputC, inputH, minInputWidth));} else {optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMIN,nvinfer1::Dims4(m_options.optBatchSize, inputC, inputH, minInputWidth));}if (doesSupportDynamicWidth) {optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT,nvinfer1::Dims4(m_options.optBatchSize, inputC, inputH, m_options.optInputWidth));optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX,nvinfer1::Dims4(m_options.maxBatchSize, inputC, inputH, m_options.maxInputWidth));} else {optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kOPT,nvinfer1::Dims4(m_options.optBatchSize, inputC, inputH, inputW));optProfile->setDimensions(inputName, nvinfer1::OptProfileSelector::kMAX,nvinfer1::Dims4(m_options.maxBatchSize, inputC, inputH, inputW));}}config->addOptimizationProfile(optProfile);// Set the precision levelconst auto engineName = serializeEngineOptions(m_options, onnxModelPath);if (m_options.precision == Precision::FP16) {// Ensure the GPU supports FP16 inferenceif (!builder->platformHasFastFp16()) {auto msg = "Error: GPU does not support FP16 precision";spdlog::error(msg);throw std::runtime_error(msg);}config->setFlag(nvinfer1::BuilderFlag::kFP16);} else if (m_options.precision == Precision::INT8) {if (numInputs > 1) {auto msg = "Error, this implementation currently only supports INT8 ""quantization for single input models";spdlog::error(msg);throw std::runtime_error(msg);}// Ensure the GPU supports INT8 Quantizationif (!builder->platformHasFastInt8()) {auto msg = "Error: GPU does not support INT8 precision";spdlog::error(msg);throw std::runtime_error(msg);}// Ensure the user has provided path to calibration data directoryif (m_options.calibrationDataDirectoryPath.empty()) {auto msg = "Error: If INT8 precision is selected, must provide path to ""calibration data directory to Engine::build method";throw std::runtime_error(msg);}config->setFlag((nvinfer1::BuilderFlag::kINT8));const auto input = network->getInput(0);const auto inputName = input->getName();const auto inputDims = input->getDimensions();const auto calibrationFileName = engineName + ".calibration";m_calibrator = std::make_unique<Int8EntropyCalibrator2>(m_options.calibrationBatchSize, inputDims.d[3], inputDims.d[2],m_options.calibrationDataDirectoryPath, calibrationFileName, inputName,subVals, divVals, normalize);config->setInt8Calibrator(m_calibrator.get());}// CUDA stream used for profiling by the builder.cudaStream_t profileStream;Util::checkCudaErrorCode(cudaStreamCreate(&profileStream));config->setProfileStream(profileStream);// Build the engine// If this call fails, it is suggested to increase the logger verbosity to// kVERBOSE and try rebuilding the engine. Doing so will provide you with more// information on why exactly it is failing.// 构建引擎并保存到磁盘// 使用 builder->buildSerializedNetwork 构建序列化的 TensorRT 引擎。// 将引擎保存到磁盘，以便后续加载和使用。// 为什么要这么写？// 构建引擎是一个耗时操作，将引擎保存到磁盘可以避免重复构建。// 序列化的引擎文件可以直接加载到内存中，节省时间。std::unique_ptr<nvinfer1::IHostMemory> plan{builder->buildSerializedNetwork(*network, *config)};if (!plan) {return false;}// Write the engine to diskconst auto enginePath = std::filesystem::path(m_options.engineFileDir) / engineName;std::ofstream outfile(enginePath, std::ofstream::binary);outfile.write(reinterpret_cast<const char *>(plan->data()), plan->size());spdlog::info("Success, saved engine to {}", enginePath.string());Util::checkCudaErrorCode(cudaStreamDestroy(profileStream));return true;
}

函数的作用

解析 ONNX 模型文件。

设置优化配置（动态批量大小、动态输入宽度等）。

设置精度模式（FP16 或 INT8）。

构建 TensorRT 引擎并保存到磁盘。

涉及的 TensorRT API

点击jetson orin nano super AI模型部署之路（五）tensorrt C++ api介绍查看全文

一、首先是声明我们创建tensorrt model的参数。

二、构建tensorrt推理的engine

三、tensorrt推理Engine的具体实现

3.1 buildLoadNetwork实现

3.2 build函数实现（最重要的函数）

相关文章：