From 4fa9af81b49e4908802164572786f8f24647478e Mon Sep 17 00:00:00 2001
From: root <huang_zheng_xiang@163.com>
Date: Tue, 9 Jun 2026 01:37:16 +0800
Subject: [PATCH 1/3] [NPU:rknn] initial commit

---
 CMakeLists.txt                               |   8 +
 docs/inference/npu.md                        | 100 +++++
 source/backend/rknn/CMakeLists.txt           |  15 +
 source/backend/rknn/backend/RKNNBackend.cpp  | 379 +++++++++++++++++++
 source/backend/rknn/backend/RKNNBackend.hpp  |  56 +++
 source/core/Backend.cpp                      |   6 +
 tools/converter/CMakeLists.txt               |   4 +
 tools/converter/include/config.hpp           |   6 +
 tools/converter/source/MNNConverter.cpp      |   5 +-
 tools/converter/source/common/RKNNBundle.cpp | 300 +++++++++++++++
 tools/converter/source/common/RKNNBundle.hpp |  23 ++
 tools/converter/source/common/cli.cpp        |  47 ++-
 12 files changed, 940 insertions(+), 9 deletions(-)
 create mode 100644 source/backend/rknn/CMakeLists.txt
 create mode 100644 source/backend/rknn/backend/RKNNBackend.cpp
 create mode 100644 source/backend/rknn/backend/RKNNBackend.hpp
 create mode 100644 tools/converter/source/common/RKNNBundle.cpp
 create mode 100644 tools/converter/source/common/RKNNBundle.hpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e292c1379b..24313ace46 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -282,6 +282,7 @@ option(MNN_COREML "Enable CoreML" OFF)
 option(MNN_NNAPI "Enable NNAPI" OFF)
 option(MNN_QNN "Enable QNN" OFF)
 option(MNN_QNN_ONLINE_FINALIZE "Enable QNN Online Finalize" ON)
+option(MNN_RKNN "Enable RKNN runtime backend" OFF)
 
 option(MNN_GPU_TIME_PROFILE "Enable time profiling for the OpenCL backend and Vulkan backend." OFF)
 option(MNN_GPU_PROFILE_SILENT "When GPU time profiling is enabled, only accumulate total time without printing per-kernel details." OFF)
@@ -680,6 +681,13 @@ IF(MNN_QNN)
     list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_QNN>)
 ENDIF()
 
+# RKNN: define MNN_RKNN_ENABLED, build the backend's object library and add its objects to the MNN link list.
+IF(MNN_RKNN)
+    add_definitions(-DMNN_RKNN_ENABLED=1)
+    add_subdirectory(${CMAKE_CURRENT_LIST_DIR}/source/backend/rknn/)
+    list(APPEND MNN_OBJECTS_TO_LINK $<TARGET_OBJECTS:MNN_RKNN>)
+ENDIF()
+
 # NEUROPILOT
 IF(MNN_NEUROPILOT)
     target_compile_options(MNNCore PRIVATE -DMNN_NEUROPILOT=1)
diff --git a/docs/inference/npu.md b/docs/inference/npu.md
index b740343914..a239d58ad8 100644
--- a/docs/inference/npu.md
+++ b/docs/inference/npu.md
@@ -5,6 +5,7 @@
 - CoreML
 - NNAPI
 - HIAI
+- RKNN
 
 ## QNN
 
@@ -178,3 +179,102 @@ cp -r ${DDK}/include ${MNN}/source/backend/hiai/3rdParty/include
 1. cmake 参数打开npu开关： -DMNN_NPU=ON
 2. backend type设置成：MNN_FORWARD_USER_0
 3. 执行可执行程序（需动态加载：libMNN_NPU.so, libhiai_ir_build.so, libhiai_ir.so, libhiai.so）
+
+## RKNN
+适用于 Rockchip RKNPU 平台。当前接入方式不是在线逐算子构图，而是同一份 ONNX 在 Host 侧同时生成：
+- 包装后的 `.mnn`
+- sidecar `.rknn`
+
+其中 `.mnn` 内部只保留 `Input + Extra(type="RKNN")` 包装图，运行时由 MNN 的 RKNN backend 调用 RKNN C API 执行 `.rknn`。
+
+### RKNN 后端整体介绍
+
+- Host 侧通过 `MNNConvert --rknn` 完成双产物生成，不走 `compilefornpu` 的 `MNN -> NPU` 逐算子编译链路。
+- Device 侧通过 RKNN C API 加载 `.rknn` 并执行，当前 backend 注册为 `MNN_FORWARD_USER_2`。
+- 转换工具（`MNNConvert --rknn`）所需的转换脚本路径、目标平台等信息，以及 RKNN backend 所需的 runtime 库路径，均不做硬编码，全部从环境变量读取；缺失时直接报 `MNN_ERROR`。
+
+### 编译
+
+#### Host，编译带 RKNN 转换能力的 MNNConvert
+
+需要开启：
+- `-DMNN_BUILD_CONVERTER=ON`
+- `-DMNN_RKNN_CONVERT_MODE=ON`
+
+示例：
+
+```bash
+cmake -S ${MNN_ROOT} -B ${BUILD_DIR} \
+  -DMNN_BUILD_CONVERTER=ON \
+  -DMNN_RKNN_CONVERT_MODE=ON
+
+cmake --build ${BUILD_DIR} --target MNNConvert -j8
+```
+
+#### Device/Runtime，编译带 RKNN backend 的 MNN
+
+需要设置：
+- `-DMNN_RKNN=ON`
+- `-DRKNN_API_INCLUDE_DIR=/path/to/rknn_api/include`（包含 `rknn_api.h` 的目录；也可以通过同名环境变量提供）
+
+示例：
+
+```bash
+cmake -S ${MNN_ROOT} -B ${BUILD_DIR} \
+  -DMNN_RKNN=ON \
+  -DRKNN_API_INCLUDE_DIR=/path/to/rknn_api/include
+
+cmake --build ${BUILD_DIR} --target MNN -j8
+```
+
+### Host，生成 RKNN 包装模型
+
+调用 `MNNConvert --rknn` 前，必须设置以下环境变量：
+
+- `MNN_RKNN_TARGET`
+  - 例如 `rv1126b`
+- `MNN_RKNN_PYTHON`
+  - RKNN Toolkit 所在 Python 解释器
+- `MNN_RKNN_SCRIPT`
+  - ONNX 转 `.rknn` 的脚本路径
+- `MNN_RKNN_OUTPUT_DIR`
+  - `.rknn` 和 manifest 的输出目录
+
+示例：
+
+```bash
+export MNN_RKNN_TARGET=rv1126b
+export MNN_RKNN_PYTHON=/path/to/python
+export MNN_RKNN_SCRIPT=/path/to/to_rknn.py
+export MNN_RKNN_OUTPUT_DIR=/path/to/output/sidecar
+
+${BUILD_DIR}/MNNConvert \
+  -f ONNX \
+  --modelFile model.onnx \
+  --MNNModel model.mnn \
+  --rknn
+```
+
+执行成功后会生成：
+- `model.mnn`
+  - RKNN wrapper 模型
+- `${MNN_RKNN_OUTPUT_DIR}/model_<target>.rknn`
+- `${MNN_RKNN_OUTPUT_DIR}/model.rknn.bundle.json`
+
+### Device，运行
+
+运行时必须设置：
+- `MNN_RKNN_RUNTIME_LIB`
+  - 指向目标板上的 `librknnrt.so`
+
+并在创建 Session 时选择：
+- backend type = `MNN_FORWARD_USER_2`
+
+如果 `.rknn` 路径在 wrapper `.mnn` 中是相对路径，则需要确保模型外部路径设置正确，使 MNN 能解析 sidecar 所在目录。
+
+### 当前限制
+
+- 当前 RKNN backend 只执行 `Extra(type="RKNN")` 节点，不支持逐算子 RKNN backend。
+- 当前实现走 host buffer copy 路径，尚未做 zero-copy。
+- 当前输出路径按 `float32` 处理。
+- 当前主目标是板端运行；PC 侧如果没有可用的 x86 `librknnrt.so`，则不能直接用 MNN runtime 在 Host 上模拟执行 RKNN backend。
diff --git a/source/backend/rknn/CMakeLists.txt b/source/backend/rknn/CMakeLists.txt
new file mode 100644
index 0000000000..bb95975eb1
--- /dev/null
+++ b/source/backend/rknn/CMakeLists.txt
@@ -0,0 +1,15 @@
+# Sources of the RKNN backend; compiled as an object library that the top-level CMakeLists links into MNN.
+file(GLOB MNN_RKNN_SRCS ${CMAKE_CURRENT_LIST_DIR}/backend/*.cpp)
+
+# Header directory: the RKNN_API_INCLUDE_DIR CMake variable takes precedence over the environment variable.
+set(_RKNN_API_INCLUDE "$ENV{RKNN_API_INCLUDE_DIR}")
+if (DEFINED RKNN_API_INCLUDE_DIR AND NOT "${RKNN_API_INCLUDE_DIR}" STREQUAL "")
+    set(_RKNN_API_INCLUDE "${RKNN_API_INCLUDE_DIR}")
+endif()
+if ("${_RKNN_API_INCLUDE}" STREQUAL "")
+    message(FATAL_ERROR "MNN_RKNN=ON requires RKNN_API_INCLUDE_DIR (or env RKNN_API_INCLUDE_DIR) to point to the directory containing rknn_api.h")
+elseif (IS_ABSOLUTE "${_RKNN_API_INCLUDE}" AND NOT EXISTS "${_RKNN_API_INCLUDE}/rknn_api.h")
+    message(FATAL_ERROR "MNN_RKNN=ON: rknn_api.h was not found in RKNN_API_INCLUDE_DIR: ${_RKNN_API_INCLUDE}")
+endif()
+add_library(MNN_RKNN OBJECT ${MNN_RKNN_SRCS})
+target_include_directories(MNN_RKNN PRIVATE ${CMAKE_CURRENT_LIST_DIR}/backend/ "${_RKNN_API_INCLUDE}")
diff --git a/source/backend/rknn/backend/RKNNBackend.cpp b/source/backend/rknn/backend/RKNNBackend.cpp
new file mode 100644
index 0000000000..a9999ff726
--- /dev/null
+++ b/source/backend/rknn/backend/RKNNBackend.cpp
@@ -0,0 +1,379 @@
+#include "RKNNBackend.hpp"
+
+#include <cstdlib>
+#include <cstring>
+#include <dlfcn.h>
+#include <memory>
+#include <mutex>
+#include <string>
+#include <vector>
+
+#include "MNN_generated.h"
+#include "core/MNNFileUtils.h"
+#include "core/Macro.h"
+#include "core/TensorUtils.hpp"
+#include "rknn_api.h"
+
+namespace MNN {
+namespace RKNN {
+namespace {
+
+static const char* kRuntimeLibEnv = "MNN_RKNN_RUNTIME_LIB"; // environment variable whose value is passed to dlopen
+static const char* kExtraTypeName = "RKNN";                 // Extra::type value this backend accepts
+static const char* kModelPathAttr = "model_path";           // Extra attribute key that holds the .rknn path
+
+// MemObj backed by a malloc'ed buffer that is freed when the object is destroyed.
+class HostMemObj : public Backend::MemObj {
+public:
+    // Requests `size` bytes; callers must check valid() because the allocation may fail.
+    explicit HostMemObj(size_t size) : mPtr(std::malloc(size)) {}
+    ~HostMemObj() override {
+        std::free(mPtr); // free(nullptr) is a no-op, so a failed allocation needs no special case
+    }
+    // Wraps the buffer in a MemChunk; the second argument is 0 (presumably the offset — confirm).
+    MemChunk chunk() override { return MemChunk(mPtr, 0); }
+    // True when malloc returned a non-null pointer.
+    bool valid() const { return mPtr != nullptr; }
+
+private:
+    void* mPtr = nullptr;
+};
+
+struct RKNNApi { // entry points of the RKNN runtime library, filled in by loadApi() through dlsym
+    using Init = int (*)(rknn_context*, void*, uint32_t, uint32_t, rknn_init_extend*); // rknn_init
+    using Destroy = int (*)(rknn_context); // rknn_destroy
+    using Query = int (*)(rknn_context, rknn_query_cmd, void*, uint32_t); // rknn_query
+    using InputsSet = int (*)(rknn_context, uint32_t, rknn_input[]); // rknn_inputs_set
+    using Run = int (*)(rknn_context, rknn_run_extend*); // rknn_run
+    using OutputsGet = int (*)(rknn_context, uint32_t, rknn_output[], rknn_output_extend*); // rknn_outputs_get
+    using OutputsRelease = int (*)(rknn_context, uint32_t, rknn_output[]); // rknn_outputs_release
+
+    bool loaded = false;    // becomes true only after every function pointer below was resolved
+    void* handle = nullptr; // handle returned by dlopen for the runtime library
+    Init init = nullptr;
+    Destroy destroy = nullptr;
+    Query query = nullptr;
+    InputsSet inputsSet = nullptr;
+    Run run = nullptr;
+    OutputsGet outputsGet = nullptr;
+    OutputsRelease outputsRelease = nullptr;
+};
+
+static const RKNNApi* loadApi() { // one-time dlopen/dlsym of the RKNN runtime; nullptr when anything is missing
+    static std::once_flag sLoadOnce;
+    static RKNNApi sApi;
+    std::call_once(sLoadOnce, []() {
+        const char* runtimeLib = std::getenv(kRuntimeLibEnv);
+        if (nullptr == runtimeLib || '\0' == runtimeLib[0]) {
+            MNN_ERROR("MNN_RKNN: missing environment variable %s\n", kRuntimeLibEnv);
+            return;
+        }
+        sApi.handle = dlopen(runtimeLib, RTLD_NOW | RTLD_LOCAL);
+        if (nullptr == sApi.handle) {
+            MNN_ERROR("MNN_RKNN: dlopen failed for %s, error: %s\n", runtimeLib, dlerror());
+            return;
+        }
+#define MNN_RKNN_LOAD_SYMBOL(typeName, field, symbol)                                               \
+        sApi.field = reinterpret_cast<RKNNApi::typeName>(dlsym(sApi.handle, symbol));               \
+        if (nullptr == sApi.field) {                                                                 \
+            MNN_ERROR("MNN_RKNN: dlsym failed for %s\n", symbol);                                   \
+            return;                                                                                  \
+        }
+        MNN_RKNN_LOAD_SYMBOL(Init, init, "rknn_init");
+        MNN_RKNN_LOAD_SYMBOL(Destroy, destroy, "rknn_destroy");
+        MNN_RKNN_LOAD_SYMBOL(Query, query, "rknn_query");
+        MNN_RKNN_LOAD_SYMBOL(InputsSet, inputsSet, "rknn_inputs_set");
+        MNN_RKNN_LOAD_SYMBOL(Run, run, "rknn_run");
+        MNN_RKNN_LOAD_SYMBOL(OutputsGet, outputsGet, "rknn_outputs_get");
+        MNN_RKNN_LOAD_SYMBOL(OutputsRelease, outputsRelease, "rknn_outputs_release");
+#undef MNN_RKNN_LOAD_SYMBOL
+        sApi.loaded = true; // reached only when all seven symbols were resolved
+    });
+    return sApi.loaded ? &sApi : nullptr;
+}
+
+// Returns the string value of the first attribute whose key equals `key` and that carries a string; "" otherwise.
+static std::string getStringAttr(const Extra* extra, const char* key) {
+    const auto* attrs = (nullptr != extra) ? extra->attr() : nullptr;
+    if (nullptr == attrs) {
+        return "";
+    }
+    for (int index = 0; index < attrs->size(); ++index) {
+        const auto* item = attrs->GetAs<Attribute>(index);
+        // Entries without a key or without a string value are passed over rather than dereferenced.
+        if (nullptr != item && nullptr != item->key() && nullptr != item->s() && item->key()->str() == key) {
+            return item->s()->str();
+        }
+    }
+    return "";
+}
+
+// Resolves the `model_path` attribute: "" stays "", a path starting with '/' is returned unchanged,
+// and anything else is joined onto backend->pNPUModelDirPath.
+static std::string resolveModelPath(const Backend* backend, const std::string& path) {
+    if (path.empty()) {
+        return "";
+    }
+    const bool isAbsolute = ('/' == path.front());
+    return isAbsolute ? path : MNNFilePathConcat(backend->pNPUModelDirPath, path);
+}
+
+// Maps the tensor's halide type to an RKNN tensor type; anything not listed falls back to float32.
+static rknn_tensor_type mapTensorType(const Tensor* tensor) {
+    const auto dataType = tensor->getType();
+    switch (dataType.code) {
+        case halide_type_uint:
+            if (8 == dataType.bits) return RKNN_TENSOR_UINT8;
+            break;
+        case halide_type_int:
+            if (8 == dataType.bits) return RKNN_TENSOR_INT8;
+            if (32 == dataType.bits) return RKNN_TENSOR_INT32;
+            break;
+        default:
+            break;
+    }
+    return RKNN_TENSOR_FLOAT32;
+}
+
+// Reports NHWC tensors as RKNN_TENSOR_NHWC.
+// Every other MNN dimension format is reported as RKNN_TENSOR_NCHW.
+static rknn_tensor_format mapTensorFormat(const Tensor* tensor) {
+    const auto layout = TensorUtils::getDescribe(tensor)->dimensionFormat;
+    const bool isNHWC = (MNN_DATA_FORMAT_NHWC == layout);
+    return isNHWC ? RKNN_TENSOR_NHWC : RKNN_TENSOR_NCHW;
+}
+
+static Tensor::DimensionType getHostTensorDimType(const Tensor* tensor) {
+    return tensor->getDimensionType(); // host staging tensors reuse the source tensor's dimension type
+}
+
+// Executes one Extra(type="RKNN") op by running the referenced .rknn model through the RKNN C API.
+// The constructor loads the model and caches its I/O descriptors; mValid stays false on any failure.
+class RKNNExecution : public Execution {
+public:
+    RKNNExecution(Backend* backend, const Op* op, const RKNNApi* api) : Execution(backend), mApi(api) {
+        mValid = false; // restored to true only after every setup step below has succeeded
+        if (nullptr == op || op->type() != OpType_Extra || nullptr == op->main_as_Extra()) {
+            MNN_ERROR("MNN_RKNN: invalid op for RKNN execution\n");
+            return;
+        }
+        auto extra = op->main_as_Extra();
+        // Extra::type is read through a pointer that may be null; check it before calling str().
+        if (nullptr == extra->type() || extra->type()->str() != kExtraTypeName) {
+            MNN_ERROR("MNN_RKNN: unsupported Extra type\n");
+            return;
+        }
+        mModelPath = resolveModelPath(backend, getStringAttr(extra, kModelPathAttr));
+        if (mModelPath.empty()) {
+            MNN_ERROR("MNN_RKNN: Extra(%s) requires attr '%s'\n", kExtraTypeName, kModelPathAttr);
+            return;
+        }
+        if (!MNNFileExist(mModelPath.c_str())) {
+            MNN_ERROR("MNN_RKNN: model file does not exist: %s\n", mModelPath.c_str());
+            return;
+        }
+        // The path string itself is handed to rknn_init with size 0 (presumably "load from file" — confirm).
+        if (mApi->init(&mContext, (void*)mModelPath.c_str(), 0, 0, nullptr) != RKNN_SUCC) {
+            MNN_ERROR("MNN_RKNN: rknn_init failed for %s\n", mModelPath.c_str());
+            return;
+        }
+        if (mApi->query(mContext, RKNN_QUERY_IN_OUT_NUM, &mIoNum, sizeof(mIoNum)) != RKNN_SUCC) {
+            MNN_ERROR("MNN_RKNN: query in/out num failed\n");
+            return;
+        }
+        mInputAttrs.resize(mIoNum.n_input);
+        mOutputAttrs.resize(mIoNum.n_output);
+        for (uint32_t i = 0; i < mIoNum.n_input; ++i) {
+            std::memset(&mInputAttrs[i], 0, sizeof(rknn_tensor_attr));
+            mInputAttrs[i].index = i;
+            if (mApi->query(mContext, RKNN_QUERY_INPUT_ATTR, &mInputAttrs[i], sizeof(rknn_tensor_attr)) != RKNN_SUCC) {
+                MNN_ERROR("MNN_RKNN: query input attr failed: %u\n", i);
+                return;
+            }
+        }
+        for (uint32_t i = 0; i < mIoNum.n_output; ++i) {
+            std::memset(&mOutputAttrs[i], 0, sizeof(rknn_tensor_attr));
+            mOutputAttrs[i].index = i;
+            if (mApi->query(mContext, RKNN_QUERY_OUTPUT_ATTR, &mOutputAttrs[i], sizeof(rknn_tensor_attr)) != RKNN_SUCC) {
+                MNN_ERROR("MNN_RKNN: query output attr failed: %u\n", i);
+                return;
+            }
+        }
+        mValid = true;
+    }
+
+    // Destroys the RKNN context when one was created.
+    ~RKNNExecution() override {
+        if (mContext != 0 && nullptr != mApi) {
+            mApi->destroy(mContext);
+        }
+    }
+
+    // Verifies that the graph passes as many inputs/outputs as the loaded model declares.
+    ErrorCode onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) override {
+        if ((uint32_t)inputs.size() != mIoNum.n_input || (uint32_t)outputs.size() != mIoNum.n_output) {
+            MNN_ERROR("MNN_RKNN: input/output count mismatch, expect %u/%u, got %zu/%zu\n",
+                      mIoNum.n_input, mIoNum.n_output, inputs.size(), outputs.size());
+            return INVALID_VALUE;
+        }
+        return NO_ERROR;
+    }
+
+    // Copies inputs into host tensors, runs the model and copies the float32 results back.
+    ErrorCode onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) override {
+        for (auto output : outputs) { // fail fast: this type check does not need an inference result
+            if (output->getType().code != halide_type_float || output->getType().bits != 32) {
+                MNN_ERROR("MNN_RKNN: only float32 outputs are supported in the first runtime version\n");
+                return NOT_SUPPORT;
+            }
+        }
+        std::vector<std::unique_ptr<Tensor>> hostInputs;
+        std::vector<rknn_input> rknnInputs(inputs.size());
+        for (size_t i = 0; i < inputs.size(); ++i) {
+            hostInputs.emplace_back(new Tensor(inputs[i], getHostTensorDimType(inputs[i])));
+            if (!MNNCPUCopyBuffer(inputs[i], hostInputs.back().get())) {
+                MNN_ERROR("MNN_RKNN: failed to copy input tensor %zu to host\n", i);
+                return INVALID_VALUE;
+            }
+            std::memset(&rknnInputs[i], 0, sizeof(rknn_input));
+            rknnInputs[i].index = (uint32_t)i;
+            rknnInputs[i].buf = hostInputs.back()->buffer().host;
+            rknnInputs[i].size = hostInputs.back()->size();
+            rknnInputs[i].pass_through = 0;
+            rknnInputs[i].type = mapTensorType(hostInputs.back().get());
+            rknnInputs[i].fmt = mapTensorFormat(hostInputs.back().get());
+        }
+        if (mApi->inputsSet(mContext, (uint32_t)rknnInputs.size(), rknnInputs.data()) != RKNN_SUCC) {
+            MNN_ERROR("MNN_RKNN: rknn_inputs_set failed\n");
+            return INVALID_VALUE;
+        }
+        if (mApi->run(mContext, nullptr) != RKNN_SUCC) {
+            MNN_ERROR("MNN_RKNN: rknn_run failed\n");
+            return INVALID_VALUE;
+        }
+
+        std::vector<rknn_output> rknnOutputs(outputs.size());
+        for (size_t i = 0; i < outputs.size(); ++i) {
+            std::memset(&rknnOutputs[i], 0, sizeof(rknn_output));
+            rknnOutputs[i].index = (uint32_t)i;
+            rknnOutputs[i].want_float = 1;
+            rknnOutputs[i].is_prealloc = 0;
+        }
+        if (mApi->outputsGet(mContext, (uint32_t)rknnOutputs.size(), rknnOutputs.data(), nullptr) != RKNN_SUCC) {
+            MNN_ERROR("MNN_RKNN: rknn_outputs_get failed\n");
+            return INVALID_VALUE;
+        }
+        ErrorCode status = NO_ERROR;
+        for (size_t i = 0; i < outputs.size() && NO_ERROR == status; ++i) {
+            Tensor hostOutput(outputs[i], getHostTensorDimType(outputs[i]));
+            auto copySize = ALIMIN((int)hostOutput.size(), (int)rknnOutputs[i].size); // never read past either buffer
+            std::memcpy(hostOutput.buffer().host, rknnOutputs[i].buf, copySize);
+            if (!MNNCPUCopyBuffer(&hostOutput, outputs[i])) {
+                MNN_ERROR("MNN_RKNN: failed to copy output tensor %zu from host\n", i);
+                status = INVALID_VALUE;
+            }
+        }
+        mApi->outputsRelease(mContext, (uint32_t)rknnOutputs.size(), rknnOutputs.data()); // single release point
+        return status;
+    }
+
+private:
+    const RKNNApi* mApi = nullptr;  // function table handed in by the backend; not owned
+    std::string mModelPath;         // resolved path of the .rknn file
+    rknn_context mContext = 0;      // 0 means no context has been created
+    rknn_input_output_num mIoNum{}; // input/output counts reported by the model
+    std::vector<rknn_tensor_attr> mInputAttrs, mOutputAttrs; // descriptors queried in the constructor
+};
+
+} // namespace
+
+RKNNBackend::RKNNBackend(const RKNNRuntime* runtime) : Backend(MNN_FORWARD_USER_2), mRuntime(runtime) {
+} // identifies itself as MNN_FORWARD_USER_2 and stores the runtime pointer (never deleted by this class)
+
+// Builds an RKNNExecution for Extra(type="RKNN") ops. Returns nullptr when the RKNN runtime could not be
+// loaded, when the op is of any other kind, or when the execution failed to initialise.
+Execution* RKNNBackend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op) {
+    auto api = loadApi();
+    if (nullptr == api || nullptr == op || op->type() != OpType_Extra || nullptr == op->main_as_Extra()) {
+        return nullptr;
+    }
+    // Extra::type is read through a pointer that may be null; check it before calling str().
+    auto typeName = op->main_as_Extra()->type();
+    if (nullptr == typeName || typeName->str() != kExtraTypeName) {
+        return nullptr;
+    }
+    auto exe = new RKNNExecution(this, op, api);
+    if (!exe->valid()) {
+        delete exe;
+        return nullptr;
+    }
+    return exe;
+}
+
+void RKNNBackend::onResizeBegin() { // nothing to prepare before a resize pass
+}
+
+ErrorCode RKNNBackend::onResizeEnd() { // nothing to finalise; always succeeds
+    return NO_ERROR;
+}
+
+void RKNNBackend::onExecuteBegin() const { // no per-run setup
+}
+
+void RKNNBackend::onExecuteEnd() const { // no per-run teardown
+}
+
+// Backs the tensor with a heap buffer of tensor->size() bytes; storageType is not consulted.
+Backend::MemObj* RKNNBackend::onAcquire(const Tensor* tensor, StorageType storageType) {
+    std::unique_ptr<HostMemObj> holder(new HostMemObj(tensor->size()));
+    if (holder->valid()) {
+        return holder.release();
+    }
+    return nullptr; // allocation failed; holder deletes the empty object on scope exit
+}
+
+bool RKNNBackend::onClearBuffer() { // the backend keeps no buffers of its own, so there is nothing to clear
+    return true;
+}
+
+void RKNNBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
+    MNNCPUCopyBuffer(srcTensor, dstTensor); // host-side copy; the helper's return value is ignored here
+}
+
+const Runtime* RKNNBackend::getRuntime() { // returns the runtime passed to the constructor
+    return mRuntime;
+}
+
+RKNNRuntime::RKNNRuntime(const Backend::Info& info) : mInfo(info) { // keeps a copy of the creation info
+}
+
+Backend* RKNNRuntime::onCreate(const BackendConfig* config, Backend* origin) const {
+    return new RKNNBackend(this); // config and origin are not consulted
+}
+
+void RKNNRuntime::onGabageCollect(int level) { // no cached resources to release
+}
+
+Runtime::CompilerType RKNNRuntime::onGetCompilerType() const {
+    return Runtime::Compiler_Origin; // NOTE(review): presumably keeps the graph's ops as authored — confirm
+}
+
+// Creates the RKNN runtime, but only when loadApi() managed to load the RKNN library and resolve
+// all of its symbols; otherwise returns nullptr.
+Runtime* RKNNRuntimeCreator::onCreate(const Backend::Info& info) const {
+    const bool apiReady = (nullptr != loadApi());
+    return apiReady ? new RKNNRuntime(info) : nullptr;
+}
+
+bool RKNNRuntimeCreator::onValid(Backend::Info& info) const {
+    info.mode = Backend::Info::DIRECT; // overwrites the caller-supplied mode with DIRECT
+    return true;                       // always reports the backend as valid
+}
+
+} // namespace RKNN
+
+void registerRKNNRuntimeCreator() { // called from registerBackend() when MNN_RKNN_ENABLED is set
+    MNNInsertExtraRuntimeCreator(MNN_FORWARD_USER_2, new RKNN::RKNNRuntimeCreator, false);
+}
+
+} // namespace MNN
diff --git a/source/backend/rknn/backend/RKNNBackend.hpp b/source/backend/rknn/backend/RKNNBackend.hpp
new file mode 100644
index 0000000000..fcc5f02fa0
--- /dev/null
+++ b/source/backend/rknn/backend/RKNNBackend.hpp
@@ -0,0 +1,56 @@
+#ifndef MNN_RKNNBACKEND_HPP
+#define MNN_RKNNBACKEND_HPP
+
+#include "core/Backend.hpp"
+#include "core/Execution.hpp"
+
+namespace MNN {
+namespace RKNN {
+
+class RKNNRuntime;
+
+class RKNNBackend : public Backend { // Backend that runs Extra(type="RKNN") ops through the RKNN C API
+public:
+    explicit RKNNBackend(const RKNNRuntime* runtime); // stores `runtime`; getRuntime() returns it
+    ~RKNNBackend() override = default;
+
+    Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
+                        const MNN::Op* op) override; // nullptr unless op is Extra(type="RKNN") and setup succeeds
+    void onResizeBegin() override;        // no-op
+    ErrorCode onResizeEnd() override;     // always NO_ERROR
+    void onExecuteBegin() const override; // no-op
+    void onExecuteEnd() const override;   // no-op
+    MemObj* onAcquire(const Tensor* tensor, StorageType storageType) override; // malloc-backed buffer per tensor
+    bool onClearBuffer() override;        // always true
+    void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const override; // forwards to MNNCPUCopyBuffer
+    const Runtime* getRuntime() override;
+
+private:
+    const RKNNRuntime* mRuntime; // set by the constructor; never deleted by this class
+};
+
+class RKNNRuntime : public Runtime { // Runtime that hands out RKNNBackend instances
+public:
+    explicit RKNNRuntime(const Backend::Info& info); // copies `info` into mInfo
+    ~RKNNRuntime() override = default;
+
+    Backend* onCreate(const BackendConfig* config = nullptr, Backend* origin = nullptr) const override; // new RKNNBackend
+    void onGabageCollect(int level) override;        // no-op
+    CompilerType onGetCompilerType() const override; // returns Compiler_Origin
+
+private:
+    Backend::Info mInfo; // creation info as passed to the constructor
+};
+
+class RKNNRuntimeCreator : public RuntimeCreator { // creator registered under MNN_FORWARD_USER_2
+public:
+    Runtime* onCreate(const Backend::Info& info) const override; // nullptr when the RKNN library cannot be loaded
+    bool onValid(Backend::Info& info) const override;            // sets info.mode to DIRECT and returns true
+};
+
+} // namespace RKNN
+
+void registerRKNNRuntimeCreator();
+} // namespace MNN
+
+#endif
diff --git a/source/core/Backend.cpp b/source/core/Backend.cpp
index f5140b35ab..4b89e321f9 100644
--- a/source/core/Backend.cpp
+++ b/source/core/Backend.cpp
@@ -48,6 +48,9 @@ extern void registerNNAPIRuntimeCreator();
 #if MNN_QNN_ENABLED
 extern void registerQNNRuntimeCreator();
 #endif
+#if MNN_RKNN_ENABLED
+extern void registerRKNNRuntimeCreator();
+#endif
 #ifdef MNN_NEUROPILOT
 extern void registerNeuroPilot();
 #endif
@@ -71,6 +74,9 @@ void registerBackend() {
 #if MNN_QNN_ENABLED
     registerQNNRuntimeCreator();
 #endif
+#if MNN_RKNN_ENABLED
+        registerRKNNRuntimeCreator();
+#endif
 #if MNN_OPENCL_ENABLED
         OpenCL::registerOpenCLRuntimeCreator();
 #endif
diff --git a/tools/converter/CMakeLists.txt b/tools/converter/CMakeLists.txt
index 1e757c9f82..6d078f2cf4 100644
--- a/tools/converter/CMakeLists.txt
+++ b/tools/converter/CMakeLists.txt
@@ -1,6 +1,7 @@
 IF(MNN_BUILD_CONVERTER)
   SET( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/../../)
   option(MNN_BUILD_TORCH "Build Converter support TorchScript." OFF)
+  option(MNN_RKNN_CONVERT_MODE "Enable RKNN sidecar generation in MNNConvert." OFF)
   IF(MNN_BUILD_PROTOBUFFER)
       SET(Protobuf_LIBRARIES libprotobuf)
       include_directories(${CMAKE_CURRENT_LIST_DIR}/../../3rd_party/protobuf/src)
@@ -21,6 +22,9 @@ IF(MNN_BUILD_CONVERTER)
   include_directories(${CMAKE_CURRENT_LIST_DIR}/include)
   include_directories(${CMAKE_CURRENT_LIST_DIR}/source/tflite/schema)
   include_directories(${CMAKE_CURRENT_BINARY_DIR})
+  if (MNN_RKNN_CONVERT_MODE)
+    add_definitions(-DENABLE_RKNN_CONVERT_MODE)
+  endif()
   include(${CMAKE_CURRENT_LIST_DIR}/source/compression/CMakeLists.txt)
   include(${CMAKE_CURRENT_LIST_DIR}/source/tensorflow/CMakeLists.txt)
   include(${CMAKE_CURRENT_LIST_DIR}/source/onnx/CMakeLists.txt)
diff --git a/tools/converter/include/config.hpp b/tools/converter/include/config.hpp
index 5f2c9931d6..9fe97393a4 100644
--- a/tools/converter/include/config.hpp
+++ b/tools/converter/include/config.hpp
@@ -69,6 +69,11 @@ class MNN_PUBLIC modelConfig {
     bool mnn2json = false;
     bool dumpInfo = false;
     bool saveExternalData = false;
+    bool rknnSidecar = false;
+    std::string rknnTarget = "";
+    std::string rknnPython = "";
+    std::string rknnScript = "";
+    std::string rknnOutputDir = "";
     bool inSubGraph = false;
     // using external data when convert
     int64_t externalTreshold = 1024 * 64;
@@ -79,6 +84,7 @@ class MNN_PUBLIC modelConfig {
     bool splitQuantBlock = false;
     // Enable verbose output for each optimization pass (like LLVM's -debug-pass)
     bool dumpPass = false;
+    int cliExitCode = 1;
 };
 
 #endif // CONFIG_HPP
diff --git a/tools/converter/source/MNNConverter.cpp b/tools/converter/source/MNNConverter.cpp
index c81c9f2f78..5c2b4ed7ae 100644
--- a/tools/converter/source/MNNConverter.cpp
+++ b/tools/converter/source/MNNConverter.cpp
@@ -14,9 +14,8 @@ int main(int argc, char *argv[]) {
     // parser command line arg
     auto res = MNN::Cli::initializeMNNConvertArgs(modelPath, argc, argv);
     if (!res) {
-        return 0;
+        return modelPath.cliExitCode;
     }
     // Convert
-    MNN::Cli::convertModel(modelPath);
-    return 0;
+    return MNN::Cli::convertModel(modelPath) ? 0 : 1;
 }
diff --git a/tools/converter/source/common/RKNNBundle.cpp b/tools/converter/source/common/RKNNBundle.cpp
new file mode 100644
index 0000000000..4ff59cb4d8
--- /dev/null
+++ b/tools/converter/source/common/RKNNBundle.cpp
@@ -0,0 +1,300 @@
+#include "RKNNBundle.hpp"
+
+#include <cstdlib>
+#include <fstream>
+#include <map>
+#include <memory>
+#include <set>
+#include <sstream>
+#include <vector>
+
+#include "CaffeOp_generated.h"
+#include "CommonUtils.hpp"
+#include "MNN/ErrorCode.hpp"
+#include "MNN_generated.h"
+#include "core/MNNFileUtils.h"
+#include "logkit.h"
+
+namespace {
+static const char* MNN_RKNN_TARGET_ENV = "MNN_RKNN_TARGET";         // value passed to the script as --target
+static const char* MNN_RKNN_PYTHON_ENV = "MNN_RKNN_PYTHON";         // interpreter used to run the conversion script
+static const char* MNN_RKNN_SCRIPT_ENV = "MNN_RKNN_SCRIPT";         // conversion script; must exist on disk
+static const char* MNN_RKNN_OUTPUT_DIR_ENV = "MNN_RKNN_OUTPUT_DIR"; // directory for the .rknn file and the manifest
+
+// Looks up environment variable `name`.
+// An unset variable yields "", so callers cannot tell "unset" apart from "set to empty".
+static std::string getEnvValue(const char* name) {
+    const char* raw = std::getenv(name);
+    const bool present = (nullptr != raw);
+    return present ? std::string(raw) : std::string();
+}
+
+static bool loadRequiredEnv(std::string& dst, const char* name) {
+    dst = getEnvValue(name);
+    if (dst.empty()) {
+        MNN_ERROR("RKNN sidecar requires environment variable %s\n", name);
+        return false;
+    }
+    return true;
+}
+
+// Wraps `input` in single quotes for a POSIX shell; each embedded ' becomes '\'' (close, escaped quote, reopen).
+static std::string shellEscape(const std::string& input) {
+    std::string quoted(1, '\'');
+    for (std::string::size_type pos = 0; pos < input.size(); ++pos) {
+        if (input[pos] != '\'') {
+            quoted += input[pos];
+            continue;
+        }
+        quoted.append("'\\''");
+    }
+    return quoted + '\'';
+}
+
+// Drops the directory part (split on the last '/' or '\') and then the last ".suffix" of what remains.
+// Example: "/a/b/model.v2.mnn" -> "model.v2".
+static std::string basenameWithoutExtension(const std::string& path) {
+    const auto sepPos = path.find_last_of("/\\");
+    const std::string leaf = (std::string::npos == sepPos) ? path : path.substr(sepPos + 1);
+    const auto dotPos = leaf.rfind('.');
+    // substr(0, npos) yields the whole leaf, so a name without '.' passes through untouched.
+    return leaf.substr(0, dotPos);
+}
+
+struct InputInfo { // description of one graph input gathered by collectInputInfos()
+    std::string name;      // tensor name taken from NetT::tensorName
+    std::vector<int> dims; // copied from the Input op's dims
+    MNN::DataType dtype = MNN::DataType_DT_FLOAT;               // overwritten with the Input op's dtype
+    MNN::MNN_DATA_FORMAT dformat = MNN::MNN_DATA_FORMAT_NC4HW4; // overwritten with the Input op's dformat
+};
+
+// Gathers name, dims, dtype and layout for every Input op that has an Input payload and an output.
+// An out-of-range tensor index aborts the scan and yields an empty list.
+static std::vector<InputInfo> collectInputInfos(const MNN::NetT& net) {
+    std::vector<InputInfo> result;
+    for (const auto& op : net.oplists) {
+        const bool isInputOp = nullptr != op && MNN::OpType_Input == op->type && !op->outputIndexes.empty();
+        auto param = isInputOp ? op->main.AsInput() : nullptr;
+        if (nullptr == param) {
+            continue;
+        }
+        const auto tensorIndex = op->outputIndexes[0];
+        if (tensorIndex < 0 || tensorIndex >= net.tensorName.size()) {
+            MNN_ERROR("RKNN wrapper: invalid input tensor index %d\n", tensorIndex);
+            return {};
+        }
+        result.emplace_back();
+        InputInfo& info = result.back();
+        info.name = net.tensorName[tensorIndex];
+        info.dims.assign(param->dims.begin(), param->dims.end());
+        info.dtype = param->dtype;
+        info.dformat = param->dformat;
+    }
+    return result;
+}
+
+// Lists the names of the net's output tensors.
+// When NetT::outputName is populated it is returned as is. Otherwise the outputs are inferred as the
+// tensors that some op produces and no op consumes, reported in ascending tensor-index order.
+static std::vector<std::string> collectOutputNames(const MNN::NetT& net) {
+    if (!net.outputName.empty()) {
+        return net.outputName;
+    }
+
+    // Record every tensor index that appears as an op input or as an op output.
+    std::set<int> consumed;
+    std::set<int> produced;
+    for (const auto& op : net.oplists) {
+        if (nullptr != op) {
+            consumed.insert(op->inputIndexes.begin(), op->inputIndexes.end());
+            produced.insert(op->outputIndexes.begin(), op->outputIndexes.end());
+        }
+    }
+
+    std::vector<std::string> names;
+    for (const int index : produced) {
+        const bool unconsumed = (0 == consumed.count(index));
+        // Indices without a matching tensorName entry are skipped silently.
+        const bool named = index >= 0 && index < net.tensorName.size();
+        if (unconsumed && named) {
+            names.emplace_back(net.tensorName[index]);
+        }
+    }
+    return names;
+}
+
+static std::unique_ptr<MNN::AttributeT> makeStringAttr(const std::string& key, const std::string& value) {
+    std::unique_ptr<MNN::AttributeT> attr(new MNN::AttributeT); // caller receives ownership
+    attr->key = key;
+    attr->s = value;                      // the payload goes into the string slot
+    attr->type = MNN::DataType_DT_STRING; // tag the attribute as a string
+    return attr;
+}
+
+// Returns the index already registered for `name`; an unseen name is appended to `tensorNames`
+// and recorded in `tensorMap` under its new position.
+static int ensureTensorIndex(const std::string& name, std::map<std::string, int>* tensorMap,
+                             std::vector<std::string>* tensorNames) {
+    const int candidate = static_cast<int>(tensorNames->size());
+    const auto slot = tensorMap->insert(std::make_pair(name, candidate));
+    if (slot.second) {
+        tensorNames->emplace_back(name);
+    }
+    return slot.first->second;
+}
+}
+
+namespace MNN {
+
+// Fills the rknn* fields of `modelPath` from the MNN_RKNN_TARGET / _PYTHON / _SCRIPT / _OUTPUT_DIR
+// environment variables. Returns false as soon as one of them is missing or empty, or when the
+// conversion script named by MNN_RKNN_SCRIPT does not exist.
+bool PopulateRKNNConfigFromEnv(modelConfig& modelPath) {
+    // && short-circuits, so later variables are not read once an earlier one is missing.
+    const bool envComplete = loadRequiredEnv(modelPath.rknnTarget, MNN_RKNN_TARGET_ENV) &&
+                             loadRequiredEnv(modelPath.rknnPython, MNN_RKNN_PYTHON_ENV) &&
+                             loadRequiredEnv(modelPath.rknnScript, MNN_RKNN_SCRIPT_ENV) &&
+                             loadRequiredEnv(modelPath.rknnOutputDir, MNN_RKNN_OUTPUT_DIR_ENV);
+    if (!envComplete) {
+        return false;
+    }
+
+    if (CommonKit::FileIsExist(modelPath.rknnScript)) {
+        return true;
+    }
+    MNN_ERROR("RKNN script does not exist: %s\n", modelPath.rknnScript.c_str());
+    return false;
+}
+
+// Runs the conversion script to create the .rknn sidecar, then writes a JSON manifest; false on the first failure.
+bool GenerateRKNNBundle(const modelConfig& modelPath, RKNNBundlePaths* bundlePaths) {
+    if (modelPath.model != modelConfig::ONNX) {
+        MNN_ERROR("RKNN sidecar only supports ONNX source models\n");
+        return false;
+    }
+    if (modelPath.modelFile.empty() || modelPath.MNNModel.empty()) {
+        MNN_ERROR("RKNN sidecar requires both source ONNX path and output MNN path\n");
+        return false;
+    }
+    if (!MNNDirExist(modelPath.rknnOutputDir.c_str()) && !MNNCreateDir(modelPath.rknnOutputDir.c_str())) {
+        MNN_ERROR("Create RKNN output dir failed: %s\n", modelPath.rknnOutputDir.c_str());
+        return false;
+    }
+    const auto baseName = basenameWithoutExtension(modelPath.MNNModel);
+    const auto rknnPath = MNNFilePathConcat(modelPath.rknnOutputDir, baseName + "_" + modelPath.rknnTarget + ".rknn");
+    const auto manifestPath = MNNFilePathConcat(modelPath.rknnOutputDir, baseName + ".rknn.bundle.json");
+    std::ostringstream command;
+    command << shellEscape(modelPath.rknnPython) << " " << shellEscape(modelPath.rknnScript)
+            << " --onnx " << shellEscape(modelPath.modelFile) << " --output " << shellEscape(rknnPath)
+            << " --target " << shellEscape(modelPath.rknnTarget);
+    MNN_PRINT("Generate RKNN sidecar with command: %s\n", command.str().c_str());
+    auto ret = std::system(command.str().c_str());
+    if (ret != 0) {
+        MNN_ERROR("RKNN sidecar generation failed, exit code: %d\n", ret);
+        return false;
+    }
+    if (!MNNFileExist(rknnPath.c_str())) {
+        MNN_ERROR("RKNN sidecar is not generated: %s\n", rknnPath.c_str());
+        return false;
+    }
+    // Emits `text` as a JSON string literal; without escaping, a '"' or '\' in a path breaks the manifest.
+    auto jsonString = [](const std::string& text) -> std::string {
+        static const char kHex[] = "0123456789abcdef";
+        std::string out(1, '"');
+        for (const unsigned char c : text) {
+            if (c < 0x20) out += std::string("\\u00") + kHex[c >> 4] + kHex[c & 0x0F];
+            else out += ('"' == c || '\\' == c) ? std::string("\\") + char(c) : std::string(1, char(c));
+        }
+        return out + '"';
+    };
+    std::ofstream manifest(manifestPath.c_str(), std::ios::out | std::ios::trunc);
+    if (!manifest.good()) {
+        MNN_ERROR("Open RKNN manifest failed: %s\n", manifestPath.c_str());
+        return false;
+    }
+    const auto weightPath = modelPath.MNNModel + ".weight";
+    manifest << "{\n  \"onnx_model\": " << jsonString(modelPath.modelFile)
+             << ",\n  \"mnn_model\": " << jsonString(modelPath.MNNModel)
+             << ",\n  \"rknn_model\": " << jsonString(rknnPath)
+             << ",\n  \"target\": " << jsonString(modelPath.rknnTarget);
+    if (MNNFileExist(weightPath.c_str())) { // the external weight entry is optional
+        manifest << ",\n  \"mnn_external_weight\": " << jsonString(weightPath);
+    }
+    manifest << "\n}\n";
+    manifest.close();
+    if (!manifest.good()) {
+        MNN_ERROR("Write RKNN manifest failed: %s\n", manifestPath.c_str());
+        return false;
+    }
+    MNN_PRINT("RKNN sidecar generated: %s\n", rknnPath.c_str());
+    MNN_PRINT("RKNN manifest generated: %s\n", manifestPath.c_str());
+    if (nullptr != bundlePaths) {
+        bundlePaths->rknnPath = rknnPath;
+        bundlePaths->manifestPath = manifestPath;
+    }
+    return true;
+}
+
+std::unique_ptr<NetT> BuildRKNNWrapperNet(const NetT& sourceNet, const modelConfig& modelPath,
+                                          const RKNNBundlePaths& bundlePaths) {
+    auto inputs = collectInputInfos(sourceNet);
+    if (inputs.empty()) {
+        MNN_ERROR("RKNN wrapper: failed to collect input tensors from source net\n");
+        return nullptr;
+    }
+    auto outputs = collectOutputNames(sourceNet);
+    if (outputs.empty()) {
+        MNN_ERROR("RKNN wrapper: failed to collect output tensors from source net\n");
+        return nullptr;
+    }
+
+    std::unique_ptr<NetT> wrapper(new NetT);
+    wrapper->bizCode = modelPath.bizCode;
+    wrapper->sourceType = NetSource_ONNX;
+    wrapper->usage = Usage_INFERENCE;
+    wrapper->preferForwardType = ForwardType_CPU;
+
+    std::map<std::string, int> tensorMap;
+    std::vector<int> inputIndexes;
+    std::vector<int> outputIndexes;
+
+    for (const auto& input : inputs) {
+        const int tensorIndex = ensureTensorIndex(input.name, &tensorMap, &wrapper->tensorName);
+        inputIndexes.emplace_back(tensorIndex);
+
+        std::unique_ptr<OpT> inputOp(new OpT);
+        inputOp->name = input.name;
+        inputOp->type = OpType_Input;
+        inputOp->main.type = OpParameter_Input;
+        inputOp->main.value = new InputT;
+        inputOp->main.AsInput()->dims.assign(input.dims.begin(), input.dims.end());
+        inputOp->main.AsInput()->dtype = input.dtype;
+        inputOp->main.AsInput()->dformat = input.dformat;
+        inputOp->outputIndexes = {tensorIndex};
+        inputOp->defaultDimentionFormat = input.dformat;
+        wrapper->oplists.emplace_back(std::move(inputOp));
+    }
+
+    for (const auto& output : outputs) {
+        outputIndexes.emplace_back(ensureTensorIndex(output, &tensorMap, &wrapper->tensorName));
+    }
+
+    std::unique_ptr<OpT> rknnOp(new OpT);
+    rknnOp->name = "RKNNSubgraph";
+    rknnOp->type = OpType_Extra;
+    rknnOp->main.type = OpParameter_Extra;
+    rknnOp->main.value = new ExtraT;
+    rknnOp->main.AsExtra()->type = "RKNN";
+    rknnOp->main.AsExtra()->engine = "MNN";
+    rknnOp->main.AsExtra()->attr.emplace_back(makeStringAttr("model_path", bundlePaths.rknnPath));
+    rknnOp->main.AsExtra()->attr.emplace_back(makeStringAttr("bundle_manifest", bundlePaths.manifestPath));
+    rknnOp->main.AsExtra()->attr.emplace_back(makeStringAttr("target", modelPath.rknnTarget));
+    rknnOp->inputIndexes = inputIndexes;
+    rknnOp->outputIndexes = outputIndexes;
+    wrapper->oplists.emplace_back(std::move(rknnOp));
+
+    wrapper->outputName = outputs;
+    wrapper->tensorNumber = static_cast<int>(wrapper->tensorName.size());
+    return wrapper;
+}
+}
diff --git a/tools/converter/source/common/RKNNBundle.hpp b/tools/converter/source/common/RKNNBundle.hpp
new file mode 100644
index 0000000000..faf65ff735
--- /dev/null
+++ b/tools/converter/source/common/RKNNBundle.hpp
@@ -0,0 +1,23 @@
+#ifndef RKNN_BUNDLE_HPP
+#define RKNN_BUNDLE_HPP
+
+#include <memory>
+#include <string>
+
+#include "config.hpp"
+
+namespace MNN {
+struct NetT;  // forward declaration; only used by reference / unique_ptr below
+
+struct RKNNBundlePaths {  // output locations reported by GenerateRKNNBundle
+    std::string rknnPath;      // generated .rknn sidecar file
+    std::string manifestPath;  // JSON manifest describing the bundle
+};
+
+bool PopulateRKNNConfigFromEnv(modelConfig& modelPath);  // loads the rknn* settings from env vars; false on failure
+bool GenerateRKNNBundle(const modelConfig& modelPath, RKNNBundlePaths* bundlePaths);  // bundlePaths may be null
+std::unique_ptr<NetT> BuildRKNNWrapperNet(const NetT& sourceNet, const modelConfig& modelPath,
+                                          const RKNNBundlePaths& bundlePaths);  // nullptr on failure
+}  // namespace MNN
+
+#endif  // RKNN_BUNDLE_HPP
diff --git a/tools/converter/source/common/cli.cpp b/tools/converter/source/common/cli.cpp
index d2dd9ab4d4..802b6e8c6c 100644
--- a/tools/converter/source/common/cli.cpp
+++ b/tools/converter/source/common/cli.cpp
@@ -35,6 +35,9 @@
 #include "CommonUtils.hpp"
 #include "PostConverter.hpp"
 #include "Json2Flatbuffer.hpp"
+#ifdef ENABLE_RKNN_CONVERT_MODE
+#include "RKNNBundle.hpp"
+#endif
 #include <fstream>
 #include <sstream>
 #include <cmath>
@@ -225,15 +228,20 @@ bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv
         "dumpPass",
         "Enable verbose output for each optimization pass, showing what changes each pass made (like LLVM's "
         "-debug-pass)");
+#ifdef ENABLE_RKNN_CONVERT_MODE
+    options.add_options()("rknn", "generate RKNN sidecar from source ONNX and environment variables");
+#endif
 
     auto result = options.parse(argc, argv);
 
     if (result.count("help")) {
+        modelPath.cliExitCode = 0;
         std::cout << options.help({""}) << std::endl;
         return false;
     }
 
     if (result.count("version")) {
+        modelPath.cliExitCode = 0;
         std::cout << MNN_VERSION << std::endl;
         return false;
     }
@@ -269,6 +277,7 @@ bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv
         return false;
     }
     if (result.count("OP")) {
+        modelPath.cliExitCode = 0;
         MNN_PRINT("Dump %s support Ops\n", frameWork.c_str());
         const auto& res = OpCount::get()->getMap().find(frameWork);
         if (res == OpCount::get()->getMap().end()) {
@@ -448,6 +457,14 @@ bool Cli::initializeMNNConvertArgs(modelConfig &modelPath, int argc, char **argv
     if (result.count("dumpPass")) {
         modelPath.dumpPass = true;
     }
+#ifdef ENABLE_RKNN_CONVERT_MODE
+    if (result.count("rknn")) {
+        modelPath.rknnSidecar = true;
+        if (!PopulateRKNNConfigFromEnv(modelPath)) {
+            return false;
+        }
+    }
+#endif
     return true;
 }
 
@@ -651,19 +668,37 @@ bool Cli::convertModel(modelConfig& modelPath) {
         expectedPass.emplace_back("SplitBlockQuantConvolution");
     }
     CommonKit::loadCompress(modelPath);
+    std::unique_ptr<MNN::NetT> finalNet;
     if (needOptimize) {
         std::cout << "Start to Optimize the MNN Net..." << std::endl;
-        std::unique_ptr<MNN::NetT> newNet = optimizeNet(netT, modelPath.forTraining, modelPath, expectedPass);
-        if (newNet->extraTensorDescribe.size()>0 && expectedPass.empty()) {
+        finalNet = optimizeNet(netT, modelPath.forTraining, modelPath, expectedPass);
+        if (finalNet->extraTensorDescribe.size()>0 && expectedPass.empty()) {
             MNN_PRINT("MNN net has tensor quant info\n");
-            computeUnaryBuffer(newNet.get());
+            computeUnaryBuffer(finalNet.get());
         }
-        _reorderInputs(inputNames, newNet.get());
-        error = writeFb(newNet, modelPath, std::move(metaOp));
+        _reorderInputs(inputNames, finalNet.get());
     } else {
         _reorderInputs(inputNames, netT.get());
-        error = writeFb(netT, modelPath, std::move(metaOp));
+        finalNet = std::move(netT);
     }
+
+#ifdef ENABLE_RKNN_CONVERT_MODE
+    if (modelPath.rknnSidecar) {
+        RKNNBundlePaths bundlePaths;
+        if (!GenerateRKNNBundle(modelPath, &bundlePaths)) {
+            return false;
+        }
+        auto wrapperNet = BuildRKNNWrapperNet(*finalNet, modelPath, bundlePaths);
+        if (nullptr == wrapperNet) {
+            return false;
+        }
+        error = writeFb(wrapperNet, modelPath, std::move(metaOp));
+    } else {
+        error = writeFb(finalNet, modelPath, std::move(metaOp));
+    }
+#else
+    error = writeFb(finalNet, modelPath, std::move(metaOp));
+#endif
     if (0 == error) {
         std::cout << "Converted Success!" << std::endl;
     } else {

From 5704365d4acc4e6f4a4c656e0c38b99a6a76c39d Mon Sep 17 00:00:00 2001
From: root <huang_zheng_xiang@163.com>
Date: Tue, 9 Jun 2026 11:19:10 +0800
Subject: [PATCH 2/3] [NPU:rknn] migrate rknn pipeline to CPU Plugin

---
 docs/inference/npu.md                        |   8 +-
 source/backend/rknn/CMakeLists.txt           |   4 +
 source/backend/rknn/backend/RKNNBackend.cpp  | 276 +++++++------------
 source/backend/rknn/backend/RKNNBackend.hpp  |  53 +---
 source/core/Backend.cpp                      |   6 -
 tools/converter/source/common/RKNNBundle.cpp | 154 ++++++++++-
 6 files changed, 244 insertions(+), 257 deletions(-)

diff --git a/docs/inference/npu.md b/docs/inference/npu.md
index a239d58ad8..dd5702cfdc 100644
--- a/docs/inference/npu.md
+++ b/docs/inference/npu.md
@@ -185,12 +185,12 @@ cp -r ${DDK}/include ${MNN}/source/backend/hiai/3rdParty/include
 - 包装后的 `.mnn`
 - sidecar `.rknn`
 
-其中 `.mnn` 内部只保留 `Input + Extra(type="RKNN")` 包装图，运行时由 MNN 的 RKNN backend 调用 RKNN C API 执行 `.rknn`。
+其中 `.mnn` 内部保留 `Input + Plugin(type="RKNN")` 包装图，运行时由 MNN 的 CPU Plugin 框架调用 RKNN C API 执行 `.rknn`。
 
 ### RKNN 后端整体介绍
 
 - Host 侧通过 `MNNConvert --rknn` 完成双产物生成，不走 `compilefornpu` 的 `MNN -> NPU` 逐算子编译链路。
-- Device 侧通过 RKNN C API 加载 `.rknn` 并执行，当前 backend 注册为 `MNN_FORWARD_USER_2`。
+- Device 侧通过 MNN 的 CPU Plugin 框架调用 RKNN C API 加载 `.rknn` 并执行；应用侧 Session backend 仍使用 `MNN_FORWARD_CPU`。
 - RKNN backend 读取 runtime 库路径、转换脚本路径、目标平台等信息时，不做硬编码，全部从环境变量读取；缺失时直接报 `MNN_ERROR`。
 
 ### 编译
@@ -268,13 +268,13 @@ ${BUILD_DIR}/MNNConvert \
   - 指向目标板上的 `librknnrt.so`
 
 并在创建 Session 时选择：
-- backend type = `MNN_FORWARD_USER_2`
+- backend type = `MNN_FORWARD_CPU`
 
 如果 `.rknn` 路径在 wrapper `.mnn` 中是相对路径，则需要确保模型外部路径设置正确，使 MNN 能解析 sidecar 所在目录。
 
 ### 当前限制
 
-- 当前 RKNN backend 只执行 `Extra(type="RKNN")` 节点，不支持逐算子 RKNN backend。
+- 当前 RKNN 路径执行 `Plugin(type="RKNN")` 节点，不支持逐算子 RKNN backend。
 - 当前实现走 host buffer copy 路径，尚未做 zero-copy。
 - 当前输出路径按 `float32` 处理。
 - 当前主目标是板端运行；PC 侧如果没有可用的 x86 `librknnrt.so`，则不能直接用 MNN runtime 在 Host 上模拟执行 RKNN backend。
diff --git a/source/backend/rknn/CMakeLists.txt b/source/backend/rknn/CMakeLists.txt
index bb95975eb1..c5c528ead5 100644
--- a/source/backend/rknn/CMakeLists.txt
+++ b/source/backend/rknn/CMakeLists.txt
@@ -10,6 +10,10 @@ if ("${_RKNN_API_INCLUDE}" STREQUAL "")
     message(FATAL_ERROR "MNN_RKNN=ON requires RKNN_API_INCLUDE_DIR (or env RKNN_API_INCLUDE_DIR) to point to the directory containing rknn_api.h")
 endif()
 
+if (NOT MNN_WITH_PLUGIN)
+    message(FATAL_ERROR "MNN_RKNN=ON requires MNN_WITH_PLUGIN=ON because RKNN is implemented as Plugin(RKNN) + CPU Plugin kernels")
+endif()
+
 add_library(MNN_RKNN OBJECT ${MNN_RKNN_SRCS})
 target_include_directories(MNN_RKNN PRIVATE ${CMAKE_CURRENT_LIST_DIR}/backend/)
 target_include_directories(MNN_RKNN PRIVATE ${_RKNN_API_INCLUDE})
diff --git a/source/backend/rknn/backend/RKNNBackend.cpp b/source/backend/rknn/backend/RKNNBackend.cpp
index a9999ff726..cbe5e37dc4 100644
--- a/source/backend/rknn/backend/RKNNBackend.cpp
+++ b/source/backend/rknn/backend/RKNNBackend.cpp
@@ -1,5 +1,3 @@
-#include "RKNNBackend.hpp"
-
 #include <cstdlib>
 #include <cstring>
 #include <dlfcn.h>
@@ -8,37 +6,25 @@
 #include <string>
 #include <vector>
 
-#include "MNN_generated.h"
+#include "MNN/plugin/PluginContext.hpp"
+#include "MNN/plugin/PluginKernel.hpp"
+#include "MNN/plugin/PluginShapeInference.hpp"
+#include "core/Backend.hpp"
 #include "core/MNNFileUtils.h"
 #include "core/Macro.h"
 #include "core/TensorUtils.hpp"
 #include "rknn_api.h"
+#include "shape/SizeComputer.hpp"
 
+#ifdef MNN_WITH_PLUGIN
 namespace MNN {
 namespace RKNN {
 namespace {
 
 static const char* kRuntimeLibEnv = "MNN_RKNN_RUNTIME_LIB";
-static const char* kExtraTypeName = "RKNN";
+static const char* kPluginTypeName = "RKNN";
 static const char* kModelPathAttr = "model_path";
 
-class HostMemObj : public Backend::MemObj {
-public:
-    explicit HostMemObj(size_t size) : mPtr(std::malloc(size)) {
-    }
-    ~HostMemObj() override {
-        std::free(mPtr);
-    }
-    MemChunk chunk() override {
-        return MemChunk(mPtr, 0);
-    }
-    bool valid() const {
-        return nullptr != mPtr;
-    }
-private:
-    void* mPtr = nullptr;
-};
-
 struct RKNNApi {
     using Init = int (*)(rknn_context*, void*, uint32_t, uint32_t, rknn_init_extend*);
     using Destroy = int (*)(rknn_context);
@@ -64,7 +50,7 @@ static const RKNNApi* loadApi() {
     static RKNNApi api;
     std::call_once(once, []() {
         auto libPath = std::getenv(kRuntimeLibEnv);
-        if (nullptr == libPath || libPath[0] == '\0') {
+        if (nullptr == libPath || libPath[0] == 0) {
             MNN_ERROR("MNN_RKNN: missing environment variable %s\n", kRuntimeLibEnv);
             return;
         }
@@ -76,7 +62,7 @@ static const RKNNApi* loadApi() {
 #define MNN_RKNN_LOAD_SYMBOL(typeName, field, symbol)                                               \
         api.field = reinterpret_cast<RKNNApi::typeName>(dlsym(api.handle, symbol));                 \
         if (nullptr == api.field) {                                                                  \
-            MNN_ERROR("MNN_RKNN: dlsym failed for %s\n", symbol);                                   \
+            MNN_ERROR("MNN_RKNN: dlsym failed for %s\n", symbol);                                 \
             return;                                                                                  \
         }
         MNN_RKNN_LOAD_SYMBOL(Init, init, "rknn_init");
@@ -92,30 +78,22 @@ static const RKNNApi* loadApi() {
     return api.loaded ? &api : nullptr;
 }
 
-static std::string getStringAttr(const Extra* extra, const char* key) {
-    if (nullptr == extra || nullptr == extra->attr()) {
+static std::string getStringAttr(const plugin::PluginContext* ctx, const char* key) {
+    auto attr = ctx->getAttr(key);
+    if (nullptr == attr || nullptr == attr->s()) {
         return "";
     }
-    for (int i = 0; i < extra->attr()->size(); ++i) {
-        auto attr = extra->attr()->GetAs<Attribute>(i);
-        if (nullptr == attr || nullptr == attr->key()) {
-            continue;
-        }
-        if (attr->key()->str() == key && nullptr != attr->s()) {
-            return attr->s()->str();
-        }
-    }
-    return "";
+    return attr->s()->str();
 }
 
-static std::string resolveModelPath(const Backend* backend, const std::string& path) {
+static std::string resolveModelPath(const std::string& dirPath, const std::string& path) {
     if (path.empty()) {
         return "";
     }
-    if (!path.empty() && path[0] == '/') {
+    if (path[0] == '/') {
         return path;
     }
-    return MNNFilePathConcat(backend->pNPUModelDirPath, path);
+    return MNNFilePathConcat(dirPath, path);
 }
 
 static rknn_tensor_type mapTensorType(const Tensor* tensor) {
@@ -147,40 +125,62 @@ static Tensor::DimensionType getHostTensorDimType(const Tensor* tensor) {
     return tensor->getDimensionType();
 }
 
-class RKNNExecution : public Execution {
+class RKNNPluginShape : public plugin::InferShapeKernel {
 public:
-    RKNNExecution(Backend* backend, const Op* op, const RKNNApi* api) : Execution(backend), mApi(api) {
-        if (nullptr == op || op->type() != OpType_Extra || nullptr == op->main_as_Extra()) {
-            MNN_ERROR("MNN_RKNN: invalid op for RKNN execution\n");
-            mValid = false;
-            return;
+    bool compute(plugin::InferShapeContext* ctx) override {
+        for (int i = 0; i < ctx->outputs().size(); ++i) {
+            auto key = std::string("o_") + std::to_string(i);
+            auto attr = ctx->getAttr(key);
+            if (nullptr == attr || nullptr == attr->tensor()) {
+                MNN_ERROR("MNN_RKNN: missing output shape attr %s\n", key.c_str());
+                return false;
+            }
+            auto blob = attr->tensor();
+            auto dst = ctx->output(i);
+            dst->setType(blob->dataType());
+            if (nullptr != blob->dims()) {
+                dst->buffer().dimensions = blob->dims()->size();
+                for (int j = 0; j < blob->dims()->size(); ++j) {
+                    dst->setLength(j, blob->dims()->data()[j]);
+                }
+            } else {
+                dst->buffer().dimensions = 0;
+            }
+            TensorUtils::getDescribe(dst)->dimensionFormat = blob->dataFormat();
         }
-        auto extra = op->main_as_Extra();
-        if (extra->type()->str() != kExtraTypeName) {
-            MNN_ERROR("MNN_RKNN: unsupported Extra type\n");
-            mValid = false;
-            return;
+        return true;
+    }
+};
+
+class RKNNPluginExecute : public plugin::CPUComputeKernel {
+public:
+    ~RKNNPluginExecute() override {
+        if (mContext != 0 && nullptr != mApi) {
+            mApi->destroy(mContext);
+        }
+    }
+
+    bool init(plugin::CPUKernelContext* ctx) override {
+        mApi = loadApi();
+        if (nullptr == mApi) {
+            return false;
         }
-        mModelPath = resolveModelPath(backend, getStringAttr(extra, kModelPathAttr));
+        mModelPath = resolveModelPath(ctx->dir_path(), getStringAttr(ctx, kModelPathAttr));
         if (mModelPath.empty()) {
-            MNN_ERROR("MNN_RKNN: Extra(%s) requires attr '%s'\n", kExtraTypeName, kModelPathAttr);
-            mValid = false;
-            return;
+            MNN_ERROR("MNN_RKNN: Plugin(%s) requires attr %s\n", kPluginTypeName, kModelPathAttr);
+            return false;
         }
         if (!MNNFileExist(mModelPath.c_str())) {
             MNN_ERROR("MNN_RKNN: model file does not exist: %s\n", mModelPath.c_str());
-            mValid = false;
-            return;
+            return false;
         }
         if (mApi->init(&mContext, (void*)mModelPath.c_str(), 0, 0, nullptr) != RKNN_SUCC) {
             MNN_ERROR("MNN_RKNN: rknn_init failed for %s\n", mModelPath.c_str());
-            mValid = false;
-            return;
+            return false;
         }
         if (mApi->query(mContext, RKNN_QUERY_IN_OUT_NUM, &mIoNum, sizeof(mIoNum)) != RKNN_SUCC) {
             MNN_ERROR("MNN_RKNN: query in/out num failed\n");
-            mValid = false;
-            return;
+            return false;
         }
         mInputAttrs.resize(mIoNum.n_input);
         mOutputAttrs.resize(mIoNum.n_output);
@@ -189,8 +189,7 @@ class RKNNExecution : public Execution {
             mInputAttrs[i].index = i;
             if (mApi->query(mContext, RKNN_QUERY_INPUT_ATTR, &mInputAttrs[i], sizeof(rknn_tensor_attr)) != RKNN_SUCC) {
                 MNN_ERROR("MNN_RKNN: query input attr failed: %u\n", i);
-                mValid = false;
-                return;
+                return false;
             }
         }
         for (uint32_t i = 0; i < mIoNum.n_output; ++i) {
@@ -198,35 +197,30 @@ class RKNNExecution : public Execution {
             mOutputAttrs[i].index = i;
             if (mApi->query(mContext, RKNN_QUERY_OUTPUT_ATTR, &mOutputAttrs[i], sizeof(rknn_tensor_attr)) != RKNN_SUCC) {
                 MNN_ERROR("MNN_RKNN: query output attr failed: %u\n", i);
-                mValid = false;
-                return;
+                return false;
             }
         }
+        return true;
     }
 
-    ~RKNNExecution() override {
-        if (mContext != 0 && nullptr != mApi) {
-            mApi->destroy(mContext);
-        }
-    }
-
-    ErrorCode onResize(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) override {
-        if ((uint32_t)inputs.size() != mIoNum.n_input || (uint32_t)outputs.size() != mIoNum.n_output) {
+    bool resize(plugin::CPUKernelContext* ctx) override {
+        if ((uint32_t)ctx->inputs().size() != mIoNum.n_input || (uint32_t)ctx->outputs().size() != mIoNum.n_output) {
             MNN_ERROR("MNN_RKNN: input/output count mismatch, expect %u/%u, got %zu/%zu\n",
-                      mIoNum.n_input, mIoNum.n_output, inputs.size(), outputs.size());
-            return INVALID_VALUE;
+                      mIoNum.n_input, mIoNum.n_output, ctx->inputs().size(), ctx->outputs().size());
+            return false;
         }
-        return NO_ERROR;
+        return true;
     }
 
-    ErrorCode onExecute(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs) override {
+    bool compute(plugin::CPUKernelContext* ctx) override {
         std::vector<std::unique_ptr<Tensor>> hostInputs;
-        std::vector<rknn_input> rknnInputs(inputs.size());
-        for (size_t i = 0; i < inputs.size(); ++i) {
-            hostInputs.emplace_back(new Tensor(inputs[i], getHostTensorDimType(inputs[i])));
-            if (!MNNCPUCopyBuffer(inputs[i], hostInputs.back().get())) {
+        std::vector<rknn_input> rknnInputs(ctx->inputs().size());
+        for (size_t i = 0; i < ctx->inputs().size(); ++i) {
+            auto src = ctx->input((int)i);
+            hostInputs.emplace_back(new Tensor(src, getHostTensorDimType(src)));
+            if (!MNNCPUCopyBuffer(src, hostInputs.back().get())) {
                 MNN_ERROR("MNN_RKNN: failed to copy input tensor %zu to host\n", i);
-                return INVALID_VALUE;
+                return false;
             }
             std::memset(&rknnInputs[i], 0, sizeof(rknn_input));
             rknnInputs[i].index = (uint32_t)i;
@@ -238,15 +232,15 @@ class RKNNExecution : public Execution {
         }
         if (mApi->inputsSet(mContext, (uint32_t)rknnInputs.size(), rknnInputs.data()) != RKNN_SUCC) {
             MNN_ERROR("MNN_RKNN: rknn_inputs_set failed\n");
-            return INVALID_VALUE;
+            return false;
         }
         if (mApi->run(mContext, nullptr) != RKNN_SUCC) {
             MNN_ERROR("MNN_RKNN: rknn_run failed\n");
-            return INVALID_VALUE;
+            return false;
         }
 
-        std::vector<rknn_output> rknnOutputs(outputs.size());
-        for (size_t i = 0; i < outputs.size(); ++i) {
+        std::vector<rknn_output> rknnOutputs(ctx->outputs().size());
+        for (size_t i = 0; i < ctx->outputs().size(); ++i) {
             std::memset(&rknnOutputs[i], 0, sizeof(rknn_output));
             rknnOutputs[i].index = (uint32_t)i;
             rknnOutputs[i].want_float = 1;
@@ -254,26 +248,27 @@ class RKNNExecution : public Execution {
         }
         if (mApi->outputsGet(mContext, (uint32_t)rknnOutputs.size(), rknnOutputs.data(), nullptr) != RKNN_SUCC) {
             MNN_ERROR("MNN_RKNN: rknn_outputs_get failed\n");
-            return INVALID_VALUE;
+            return false;
         }
 
-        for (size_t i = 0; i < outputs.size(); ++i) {
-            if (outputs[i]->getType().code != halide_type_float || outputs[i]->getType().bits != 32) {
-                MNN_ERROR("MNN_RKNN: only float32 outputs are supported in the first runtime version\n");
+        for (size_t i = 0; i < ctx->outputs().size(); ++i) {
+            auto dst = ctx->output((int)i);
+            if (dst->getType().code != halide_type_float || dst->getType().bits != 32) {
+                MNN_ERROR("MNN_RKNN: only float32 outputs are supported in the first plugin version\n");
                 mApi->outputsRelease(mContext, (uint32_t)rknnOutputs.size(), rknnOutputs.data());
-                return NOT_SUPPORT;
+                return false;
             }
-            Tensor hostOutput(outputs[i], getHostTensorDimType(outputs[i]));
+            Tensor hostOutput(dst, getHostTensorDimType(dst));
             auto copySize = ALIMIN((int)hostOutput.size(), (int)rknnOutputs[i].size);
             std::memcpy(hostOutput.buffer().host, rknnOutputs[i].buf, copySize);
-            if (!MNNCPUCopyBuffer(&hostOutput, outputs[i])) {
+            if (!MNNCPUCopyBuffer(&hostOutput, dst)) {
                 MNN_ERROR("MNN_RKNN: failed to copy output tensor %zu from host\n", i);
                 mApi->outputsRelease(mContext, (uint32_t)rknnOutputs.size(), rknnOutputs.data());
-                return INVALID_VALUE;
+                return false;
             }
         }
         mApi->outputsRelease(mContext, (uint32_t)rknnOutputs.size(), rknnOutputs.data());
-        return NO_ERROR;
+        return true;
     }
 
 private:
@@ -285,95 +280,12 @@ class RKNNExecution : public Execution {
     std::vector<rknn_tensor_attr> mOutputAttrs;
 };
 
-} // namespace
-
-RKNNBackend::RKNNBackend(const RKNNRuntime* runtime) : Backend(MNN_FORWARD_USER_2), mRuntime(runtime) {
-}
-
-Execution* RKNNBackend::onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs, const MNN::Op* op) {
-    auto api = loadApi();
-    if (nullptr == api) {
-        return nullptr;
-    }
-    if (nullptr == op || op->type() != OpType_Extra || nullptr == op->main_as_Extra()) {
-        return nullptr;
-    }
-    auto extra = op->main_as_Extra();
-    if (extra->type()->str() != kExtraTypeName) {
-        return nullptr;
-    }
-    auto exe = new RKNNExecution(this, op, api);
-    if (!exe->valid()) {
-        delete exe;
-        return nullptr;
-    }
-    return exe;
-}
-
-void RKNNBackend::onResizeBegin() {
-}
-
-ErrorCode RKNNBackend::onResizeEnd() {
-    return NO_ERROR;
-}
-
-void RKNNBackend::onExecuteBegin() const {
-}
-
-void RKNNBackend::onExecuteEnd() const {
-}
-
-Backend::MemObj* RKNNBackend::onAcquire(const Tensor* tensor, StorageType storageType) {
-    auto mem = new HostMemObj(tensor->size());
-    if (!mem->valid()) {
-        delete mem;
-        return nullptr;
-    }
-    return mem;
-}
-
-bool RKNNBackend::onClearBuffer() {
-    return true;
-}
-
-void RKNNBackend::onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const {
-    MNNCPUCopyBuffer(srcTensor, dstTensor);
-}
-
-const Runtime* RKNNBackend::getRuntime() {
-    return mRuntime;
-}
-
-RKNNRuntime::RKNNRuntime(const Backend::Info& info) : mInfo(info) {
-}
-
-Backend* RKNNRuntime::onCreate(const BackendConfig* config, Backend* origin) const {
-    return new RKNNBackend(this);
-}
-
-void RKNNRuntime::onGabageCollect(int level) {
-}
-
-Runtime::CompilerType RKNNRuntime::onGetCompilerType() const {
-    return Runtime::Compiler_Origin;
-}
-
-Runtime* RKNNRuntimeCreator::onCreate(const Backend::Info& info) const {
-    if (nullptr == loadApi()) {
-        return nullptr;
-    }
-    return new RKNNRuntime(info);
-}
-
-bool RKNNRuntimeCreator::onValid(Backend::Info& info) const {
-    info.mode = Backend::Info::DIRECT;
-    return true;
-}
+static auto _rknn_plugin_shape_registrar __attribute__((unused)) =
+    MNN::plugin::InferShapeKernelRegistrar<RKNNPluginShape>("RKNN");
+static auto _rknn_plugin_compute_registrar __attribute__((unused)) =
+    MNN::plugin::ComputeKernelRegistrar<RKNNPluginExecute>("RKNN");
 
+} // namespace
 } // namespace RKNN
-
-void registerRKNNRuntimeCreator() {
-    MNNInsertExtraRuntimeCreator(MNN_FORWARD_USER_2, new RKNN::RKNNRuntimeCreator, false);
-}
-
 } // namespace MNN
+#endif
diff --git a/source/backend/rknn/backend/RKNNBackend.hpp b/source/backend/rknn/backend/RKNNBackend.hpp
index fcc5f02fa0..4cec68afb1 100644
--- a/source/backend/rknn/backend/RKNNBackend.hpp
+++ b/source/backend/rknn/backend/RKNNBackend.hpp
@@ -1,56 +1,7 @@
 #ifndef MNN_RKNNBACKEND_HPP
 #define MNN_RKNNBACKEND_HPP
 
-#include "core/Backend.hpp"
-#include "core/Execution.hpp"
-
-namespace MNN {
-namespace RKNN {
-
-class RKNNRuntime;
-
-class RKNNBackend : public Backend {
-public:
-    explicit RKNNBackend(const RKNNRuntime* runtime);
-    ~RKNNBackend() override = default;
-
-    Execution* onCreate(const std::vector<Tensor*>& inputs, const std::vector<Tensor*>& outputs,
-                        const MNN::Op* op) override;
-    void onResizeBegin() override;
-    ErrorCode onResizeEnd() override;
-    void onExecuteBegin() const override;
-    void onExecuteEnd() const override;
-    MemObj* onAcquire(const Tensor* tensor, StorageType storageType) override;
-    bool onClearBuffer() override;
-    void onCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor) const override;
-    const Runtime* getRuntime() override;
-
-private:
-    const RKNNRuntime* mRuntime;
-};
-
-class RKNNRuntime : public Runtime {
-public:
-    explicit RKNNRuntime(const Backend::Info& info);
-    ~RKNNRuntime() override = default;
-
-    Backend* onCreate(const BackendConfig* config = nullptr, Backend* origin = nullptr) const override;
-    void onGabageCollect(int level) override;
-    CompilerType onGetCompilerType() const override;
-
-private:
-    Backend::Info mInfo;
-};
-
-class RKNNRuntimeCreator : public RuntimeCreator {
-public:
-    Runtime* onCreate(const Backend::Info& info) const override;
-    bool onValid(Backend::Info& info) const override;
-};
-
-} // namespace RKNN
-
-void registerRKNNRuntimeCreator();
-} // namespace MNN
+// RKNN is implemented as Plugin("RKNN") + CPU Plugin kernels.
+// This header is kept only as a placeholder for the source/backend/rknn tree.
 
 #endif
diff --git a/source/core/Backend.cpp b/source/core/Backend.cpp
index 4b89e321f9..f5140b35ab 100644
--- a/source/core/Backend.cpp
+++ b/source/core/Backend.cpp
@@ -48,9 +48,6 @@ extern void registerNNAPIRuntimeCreator();
 #if MNN_QNN_ENABLED
 extern void registerQNNRuntimeCreator();
 #endif
-#if MNN_RKNN_ENABLED
-extern void registerRKNNRuntimeCreator();
-#endif
 #ifdef MNN_NEUROPILOT
 extern void registerNeuroPilot();
 #endif
@@ -74,9 +71,6 @@ void registerBackend() {
 #if MNN_QNN_ENABLED
     registerQNNRuntimeCreator();
 #endif
-#if MNN_RKNN_ENABLED
-        registerRKNNRuntimeCreator();
-#endif
 #if MNN_OPENCL_ENABLED
         OpenCL::registerOpenCLRuntimeCreator();
 #endif
diff --git a/tools/converter/source/common/RKNNBundle.cpp b/tools/converter/source/common/RKNNBundle.cpp
index 4ff59cb4d8..ca7b5192aa 100644
--- a/tools/converter/source/common/RKNNBundle.cpp
+++ b/tools/converter/source/common/RKNNBundle.cpp
@@ -12,6 +12,7 @@
 #include "CommonUtils.hpp"
 #include "MNN/ErrorCode.hpp"
 #include "MNN_generated.h"
+#include "../optimizer/Program.hpp"
 #include "core/MNNFileUtils.h"
 #include "logkit.h"
 
@@ -41,7 +42,7 @@ static bool loadRequiredEnv(std::string& dst, const char* name) {
 static std::string shellEscape(const std::string& input) {
     std::string escaped = "'";
     for (char c : input) {
-        if ('\'' == c) {
+        if (c == '\'') {
             escaped += "'\\''";
         } else {
             escaped.push_back(c);
@@ -60,7 +61,6 @@ static std::string basenameWithoutExtension(const std::string& path) {
     }
     return name.substr(0, dot);
 }
-
 struct InputInfo {
     std::string name;
     std::vector<int> dims;
@@ -68,6 +68,13 @@ struct InputInfo {
     MNN::MNN_DATA_FORMAT dformat = MNN::MNN_DATA_FORMAT_NC4HW4;
 };
 
+struct OutputInfo {
+    std::string name;
+    std::vector<int> dims;
+    MNN::DataType dtype = MNN::DataType_DT_FLOAT;
+    MNN::MNN_DATA_FORMAT dformat = MNN::MNN_DATA_FORMAT_NC4HW4;
+};
+
 static std::vector<InputInfo> collectInputInfos(const MNN::NetT& net) {
     std::vector<InputInfo> inputs;
     for (const auto& op : net.oplists) {
@@ -123,6 +130,99 @@ static std::vector<std::string> collectOutputNames(const MNN::NetT& net) {
     return outputNames;
 }
 
+static MNN::DataType mapExprDataType(const halide_type_t& type) {
+    if (type.code == halide_type_float) {
+        if (type.bits == 16) {
+            return MNN::DataType_DT_HALF;
+        }
+        if (type.bits == 64) {
+            return MNN::DataType_DT_DOUBLE;
+        }
+        return MNN::DataType_DT_FLOAT;
+    }
+    if (type.code == halide_type_uint) {
+        if (type.bits == 8) {
+            return MNN::DataType_DT_UINT8;
+        }
+        if (type.bits == 16) {
+            return MNN::DataType_DT_UINT16;
+        }
+        // All remaining unsigned widths (32-bit included) are recorded as
+        // DT_INT32, mirroring the fallback of the signed branch below; the
+        // wrapper metadata does not distinguish signedness for them.
+        return MNN::DataType_DT_INT32;
+    }
+    if (type.code == halide_type_int) {
+        if (type.bits == 8) {
+            return MNN::DataType_DT_INT8;
+        }
+        if (type.bits == 16) {
+            return MNN::DataType_DT_INT16;
+        }
+        if (type.bits == 64) {
+            return MNN::DataType_DT_INT64;
+        }
+        return MNN::DataType_DT_INT32;
+    }
+    if (type.code == halide_type_handle) {
+        return MNN::DataType_DT_STRING;
+    }
+    return MNN::DataType_DT_FLOAT;
+}
+
+static MNN::MNN_DATA_FORMAT mapExprFormat(MNN::Express::Dimensionformat format) {
+    switch (format) {
+        case MNN::Express::NHWC:
+            return MNN::MNN_DATA_FORMAT_NHWC;
+        case MNN::Express::NC4HW4:
+            return MNN::MNN_DATA_FORMAT_NC4HW4;
+        case MNN::Express::NCHW:
+        default:
+            return MNN::MNN_DATA_FORMAT_NCHW;
+    }
+}
+
+static std::vector<OutputInfo> collectOutputInfos(const MNN::NetT& net) {
+    auto outputNames = collectOutputNames(net);
+    if (outputNames.empty()) {
+        return {};
+    }
+    auto program = MNN::Express::Program::create(&net, true, true);
+    if (nullptr == program) {
+        MNN_ERROR("RKNN wrapper: failed to build Program for output shape inference\n");
+        return {};
+    }
+
+    std::map<std::string, const MNN::Express::Variable::Info*> infoMap;
+    for (const auto& output : program->outputs()) {
+        if (output == nullptr) {
+            continue;
+        }
+        auto info = output->getInfo();
+        if (nullptr == info) {
+            continue;
+        }
+        infoMap.insert(std::make_pair(output->name(), info));
+    }
+
+    std::vector<OutputInfo> outputs;
+    outputs.reserve(outputNames.size());
+    for (const auto& name : outputNames) {
+        auto infoIter = infoMap.find(name);
+        if (infoIter == infoMap.end() || nullptr == infoIter->second) {
+            MNN_ERROR("RKNN wrapper: failed to infer output info for tensor %s\n", name.c_str());
+            return {};
+        }
+        OutputInfo info;
+        info.name = name;
+        info.dims.assign(infoIter->second->dim.begin(), infoIter->second->dim.end());
+        info.dtype = mapExprDataType(infoIter->second->type);
+        info.dformat = mapExprFormat(infoIter->second->order);
+        outputs.emplace_back(std::move(info));
+    }
+    return outputs;
+}
+
 static std::unique_ptr<MNN::AttributeT> makeStringAttr(const std::string& key, const std::string& value) {
     std::unique_ptr<MNN::AttributeT> attr(new MNN::AttributeT);
     attr->key = key;
@@ -131,6 +231,24 @@ static std::unique_ptr<MNN::AttributeT> makeStringAttr(const std::string& key, c
     return attr;
 }
 
+static std::unique_ptr<MNN::AttributeT> makeStringListAttr(const std::string& key, const std::vector<std::string>& values) {
+    std::unique_ptr<MNN::AttributeT> attr(new MNN::AttributeT);
+    attr->key = key;
+    attr->list.reset(new MNN::ListValueT);
+    attr->list->s = values;
+    return attr;
+}
+
+static std::unique_ptr<MNN::AttributeT> makeBlobAttr(const std::string& key, const OutputInfo& info) {
+    std::unique_ptr<MNN::AttributeT> attr(new MNN::AttributeT);
+    attr->key = key;
+    attr->tensor.reset(new MNN::BlobT);
+    attr->tensor->dataType = info.dtype;
+    attr->tensor->dims = info.dims;
+    attr->tensor->dataFormat = info.dformat;
+    return attr;
+}
+
 static int ensureTensorIndex(const std::string& name, std::map<std::string, int>* tensorMap,
                              std::vector<std::string>* tensorNames) {
     auto iter = tensorMap->find(name);
@@ -242,7 +360,7 @@ std::unique_ptr<NetT> BuildRKNNWrapperNet(const NetT& sourceNet, const modelConf
         MNN_ERROR("RKNN wrapper: failed to collect input tensors from source net\n");
         return nullptr;
     }
-    auto outputs = collectOutputNames(sourceNet);
+    auto outputs = collectOutputInfos(sourceNet);
     if (outputs.empty()) {
         MNN_ERROR("RKNN wrapper: failed to collect output tensors from source net\n");
         return nullptr;
@@ -257,10 +375,13 @@ std::unique_ptr<NetT> BuildRKNNWrapperNet(const NetT& sourceNet, const modelConf
     std::map<std::string, int> tensorMap;
     std::vector<int> inputIndexes;
     std::vector<int> outputIndexes;
+    std::vector<std::string> inputNames;
+    std::vector<std::string> outputNames;
 
     for (const auto& input : inputs) {
         const int tensorIndex = ensureTensorIndex(input.name, &tensorMap, &wrapper->tensorName);
         inputIndexes.emplace_back(tensorIndex);
+        inputNames.emplace_back(input.name);
 
         std::unique_ptr<OpT> inputOp(new OpT);
         inputOp->name = input.name;
@@ -276,25 +397,30 @@ std::unique_ptr<NetT> BuildRKNNWrapperNet(const NetT& sourceNet, const modelConf
     }
 
     for (const auto& output : outputs) {
-        outputIndexes.emplace_back(ensureTensorIndex(output, &tensorMap, &wrapper->tensorName));
+        outputIndexes.emplace_back(ensureTensorIndex(output.name, &tensorMap, &wrapper->tensorName));
+        outputNames.emplace_back(output.name);
     }
 
     std::unique_ptr<OpT> rknnOp(new OpT);
     rknnOp->name = "RKNNSubgraph";
-    rknnOp->type = OpType_Extra;
-    rknnOp->main.type = OpParameter_Extra;
-    rknnOp->main.value = new ExtraT;
-    rknnOp->main.AsExtra()->type = "RKNN";
-    rknnOp->main.AsExtra()->engine = "MNN";
-    rknnOp->main.AsExtra()->attr.emplace_back(makeStringAttr("model_path", bundlePaths.rknnPath));
-    rknnOp->main.AsExtra()->attr.emplace_back(makeStringAttr("bundle_manifest", bundlePaths.manifestPath));
-    rknnOp->main.AsExtra()->attr.emplace_back(makeStringAttr("target", modelPath.rknnTarget));
+    rknnOp->type = OpType_Plugin;
+    rknnOp->main.type = OpParameter_Plugin;
+    rknnOp->main.value = new PluginT;
+    rknnOp->main.AsPlugin()->type = "RKNN";
+    rknnOp->main.AsPlugin()->attr.emplace_back(makeStringAttr("model_path", bundlePaths.rknnPath));
+    rknnOp->main.AsPlugin()->attr.emplace_back(makeStringAttr("bundle_manifest", bundlePaths.manifestPath));
+    rknnOp->main.AsPlugin()->attr.emplace_back(makeStringAttr("target", modelPath.rknnTarget));
+    rknnOp->main.AsPlugin()->attr.emplace_back(makeStringListAttr("inputs", inputNames));
+    rknnOp->main.AsPlugin()->attr.emplace_back(makeStringListAttr("outputs", outputNames));
+    for (size_t i = 0; i < outputs.size(); ++i) { // attr "o_<i>" carries dtype/dims/layout of output i
+        rknnOp->main.AsPlugin()->attr.emplace_back(makeBlobAttr("o_" + std::to_string(i), outputs[i]));
+    }
     rknnOp->inputIndexes = inputIndexes;
     rknnOp->outputIndexes = outputIndexes;
     wrapper->oplists.emplace_back(std::move(rknnOp));
 
-    wrapper->outputName = outputs;
+    wrapper->outputName = outputNames;
     wrapper->tensorNumber = static_cast<int>(wrapper->tensorName.size());
     return wrapper;
 }
-}
+} // namespace MNN

From a4132d7161b6fb8341dafa58b82cb994e02f4c89 Mon Sep 17 00:00:00 2001
From: root <huang_zheng_xiang@163.com>
Date: Wed, 10 Jun 2026 12:27:38 +0800
Subject: [PATCH 3/3] [NPU:rknn] fix a layout bug and update profiling
 interface.

---
 docs/inference/npu.md                        |  23 ++
 express/Executor.cpp                         |  11 +-
 include/MNN/Interpreter.hpp                  |   8 +-
 source/backend/rknn/README.md                | 298 +++++++++++++++++++
 source/backend/rknn/backend/RKNNBackend.cpp  | 109 ++++++-
 source/core/Backend.hpp                      |  30 +-
 source/core/Pipeline.hpp                     |   3 +
 source/core/Session.cpp                      |  17 ++
 tools/converter/source/common/RKNNBundle.cpp |  20 +-
 9 files changed, 506 insertions(+), 13 deletions(-)
 create mode 100644 source/backend/rknn/README.md

diff --git a/docs/inference/npu.md b/docs/inference/npu.md
index dd5702cfdc..1c8f6ef366 100644
--- a/docs/inference/npu.md
+++ b/docs/inference/npu.md
@@ -193,6 +193,10 @@ cp -r ${DDK}/include ${MNN}/source/backend/hiai/3rdParty/include
 - Device 侧通过 MNN 的 CPU Plugin 框架调用 RKNN C API 加载 `.rknn` 并执行；应用侧 Session backend 仍使用 `MNN_FORWARD_CPU`。
 - RKNN backend 读取 runtime 库路径、转换脚本路径、目标平台等信息时，不做硬编码，全部从环境变量读取；缺失时直接报 `MNN_ERROR`。
 
+更完整的 RKNN 说明、包内容、示例代码与板端运行方式，请参考：
+- `source/backend/rknn/README.md`
+  - 包含 Host 转换、aarch64 交叉编译、Plugin 运行机制、独立示例代码、板端包内容与运行方式。
+
 ### 编译
 
 #### Host，编译带 RKNN 转换能力的 MNNConvert
@@ -270,8 +274,27 @@ ${BUILD_DIR}/MNNConvert \
 并在创建 Session 时选择：
 - backend type = `MNN_FORWARD_CPU`
 
+**注意：在 RK 板上执行任何真正调用 NPU 的命令时，必须使用 `sudo`。**
+
 如果 `.rknn` 路径在 wrapper `.mnn` 中是相对路径，则需要确保模型外部路径设置正确，使 MNN 能解析 sidecar 所在目录。
 
+### Device，Profiling
+
+RKNN internal profiling 通过 MNN 的公开 hint / info 接口暴露：
+
+- 开启 profiling：
+  - `Interpreter::setSessionHint(Interpreter::RKNN_PROFILE, 1)`
+  - 或 `Executor::RuntimeManager::setHint(Interpreter::RKNN_PROFILE, 1)`
+- 读取 profiling 文本：
+  - `Interpreter::getSessionInfo(session, Interpreter::BACKEND_PROFILE, &ptr)`
+  - 或 `Executor::RuntimeManager::getInfo(Interpreter::BACKEND_PROFILE, &ptr)`
+
+其中：
+- `RKNN_PROFILE` 会在 RKNN plugin 内部打开 `RKNN_FLAG_COLLECT_PERF_MASK`
+- `BACKEND_PROFILE` 返回的是 `const char*`，内容包含 RKNN 导出的 `npu_run` 和 `perf_detail` 文本
+- 因为它是普通文本，所以应用层可以直接打印，也可以原样写入文件做持久化；返回的指针指向 runtime 内部持有的字符串，下一次推理会覆盖其内容，如需长期保留请先自行拷贝
+- 如果当前 backend 不支持 profiling，或者尚未产生 profile，返回值可能为空
+
 ### 当前限制
 
 - 当前 RKNN 路径执行 `Plugin(type="RKNN")` 节点，不支持逐算子 RKNN backend。
diff --git a/express/Executor.cpp b/express/Executor.cpp
index bcfc01becf..772027c688 100644
--- a/express/Executor.cpp
+++ b/express/Executor.cpp
@@ -283,11 +283,20 @@ bool Executor::RuntimeManager::getInfo(Interpreter::SessionInfoCode code, void*
             auto dst = (int*)ptr;
             if (!mInside->mRuntime.first.empty()) {
                 *dst = mInside->mRuntime.first.begin()->first;
+                return true;
             }
         } break;
         case Interpreter::RESIZE_STATUS: {
             auto dst = (int*)ptr;
             *dst = mInside->mResizeStatus;
+            return true;
+        } break;
+        case Interpreter::BACKEND_PROFILE: {
+            for (auto& r : mInside->mRuntime.first) {
+                if (r.second != nullptr && r.second->onGetRuntimeInfo((int)code, ptr)) {
+                    return true;
+                }
+            }
         } break;
         default: {
             // Do nothing
@@ -694,4 +703,4 @@ void Executor::setLazyComputeMode(uint32_t mode) {
 }
 
 } // namespace Express
-} // namespace MNN
\ No newline at end of file
+} // namespace MNN
diff --git a/include/MNN/Interpreter.hpp b/include/MNN/Interpreter.hpp
index dfc3f11d78..53620644f8 100644
--- a/include/MNN/Interpreter.hpp
+++ b/include/MNN/Interpreter.hpp
@@ -264,7 +264,10 @@ class MNN_PUBLIC Interpreter {
         CPU_SME2_NEON_DIVISION_RATIO = 17,
 
         // Set SME cores, default is 2, if supports sme
-        CPU_SME_CORES = 18
+        CPU_SME_CORES = 18,
+
+        // Enable backend-side profiling export (currently honored by the RKNN plugin); read the text back via BACKEND_PROFILE.
+        RKNN_PROFILE = 19
     };
 
     enum ExternalPathType {
@@ -463,6 +466,9 @@ class MNN_PUBLIC Interpreter {
         /** Mode / NumberThread, int* */
         THREAD_NUMBER = 4,
 
+        /** Backend-specific profile text, const char** */
+        BACKEND_PROFILE = 5,
+
         ALL
     };
 
diff --git a/source/backend/rknn/README.md b/source/backend/rknn/README.md
new file mode 100644
index 0000000000..7e3f70845b
--- /dev/null
+++ b/source/backend/rknn/README.md
@@ -0,0 +1,298 @@
+# RKNN Backend
+
+This directory contains the RKNN integration for MNN.
+
+This file intentionally keeps the instructions generic.
+For one machine-specific, real-path compilation and deployment example, see the external project README used in this integration workflow.
+
+Current design:
+- Converter side generates two artifacts from the same ONNX model:
+  - a wrapper `.mnn` model containing `Plugin(type="RKNN")`
+  - a sidecar `.rknn` model plus bundle manifest
+- Runtime side executes `Plugin("RKNN")` through the MNN CPU Plugin framework.
+- There is no `MNN_FORWARD_USER_2` RKNN runtime path anymore.
+- Application-side session backend remains `MNN_FORWARD_CPU`.
+
+## 1. Host build for `MNNConvert --rknn`
+
+Build a host `MNNConvert` with plugin support and RKNN converter support enabled:
+
+```bash
+cmake -S /path/to/MNN-Agent -B /path/to/MNN-Agent/build-linux \
+  -DMNN_BUILD_CONVERTER=ON \
+  -DMNN_WITH_PLUGIN=ON \
+  -DMNN_RKNN=ON \
+  -DMNN_RKNN_CONVERT_MODE=ON \
+  -DRKNN_API_INCLUDE_DIR=/path/to/rknn-toolkit2/rknpu2/runtime/Linux/librknn_api/include
+
+cmake --build /path/to/MNN-Agent/build-linux --target MNN MNNConvert -j8
+```
+
+## 2. Generate wrapper `.mnn` + sidecar `.rknn`
+
+Before running `MNNConvert --rknn`, export these environment variables:
+
+```bash
+export MNN_RKNN_TARGET=rv1126b
+export MNN_RKNN_PYTHON=/path/to/python
+export MNN_RKNN_SCRIPT=/path/to/to_rknn.py
+export MNN_RKNN_OUTPUT_DIR=/path/to/output/sidecar
+```
+
+Example:
+
+```bash
+/path/to/MNN-Agent/build-linux/MNNConvert \
+  -f ONNX \
+  --modelFile /path/to/model.onnx \
+  --MNNModel /path/to/model.mnn \
+  --rknn
+```
+
+Expected outputs:
+- `/path/to/model.mnn`
+- `${MNN_RKNN_OUTPUT_DIR}/model_<target>.rknn`
+- `${MNN_RKNN_OUTPUT_DIR}/model.rknn.bundle.json`
+
+The generated wrapper `.mnn` contains:
+- `Input` ops for original inputs
+- one `Plugin(type="RKNN")` op
+- plugin attrs including:
+  - `model_path`
+  - `bundle_manifest`
+  - `target`
+  - `inputs`
+  - `outputs`
+  - `o_0`, `o_1`, ... for output shape metadata
+
+Important:
+- `model_path` and `bundle_manifest` are emitted as relative file names.
+- The validated deployment layout is: wrapper `.mnn`, sidecar `.rknn`, and bundle `.json` in the same target directory.
+
+## 3. Cross compile runtime for Linux aarch64 / ARMv8
+
+Example cross build using the system `aarch64-linux-gnu` toolchain.
+This builds the target-side runtime libraries; `MNNConvert` itself is usually only needed on the host.
+
+```bash
+cmake -S /path/to/MNN-Agent -B /path/to/MNN-Agent/build-linux-aarch64-gnu \
+  -DCMAKE_SYSTEM_NAME=Linux \
+  -DCMAKE_SYSTEM_PROCESSOR=aarch64 \
+  -DCMAKE_C_COMPILER=/usr/bin/aarch64-linux-gnu-gcc \
+  -DCMAKE_CXX_COMPILER=/usr/bin/aarch64-linux-gnu-g++ \
+  -DCMAKE_C_FLAGS='-march=armv8-a' \
+  -DCMAKE_CXX_FLAGS='-march=armv8-a' \
+  -DMNN_WITH_PLUGIN=ON \
+  -DMNN_RKNN=ON \
+  -DMNN_BUILD_CONVERTER=OFF \
+  -DMNN_BUILD_DEMO=OFF \
+  -DMNN_BUILD_TOOLS=ON \
+  -DRKNN_API_INCLUDE_DIR=/path/to/rknn-toolkit2/rknpu2/runtime/Linux/librknn_api/include
+
+cmake --build /path/to/MNN-Agent/build-linux-aarch64-gnu --target MNN MNN_Express -j8
+```
+
+Notes:
+- `MNN_WITH_PLUGIN=ON` is required because RKNN is implemented as a Plugin op.
+- `MNN_RKNN=ON` pulls in the RKNN Plugin kernels.
+- `RKNN_API_INCLUDE_DIR` must point to the directory containing `rknn_api.h`.
+- The RKNN runtime library is loaded at runtime via `dlopen`, not linked as a hard dependency.
+
+## 4. Target runtime usage
+
+On the target board, export the RKNN runtime library path:
+
+```bash
+export MNN_RKNN_RUNTIME_LIB=/path/to/librknnrt.so
+```
+
+The wrapper `.mnn` should be deployed together with its sidecar `.rknn` and bundle manifest in the same directory on target.
+
+Important:
+- On RK boards, commands that actually execute NPU code should be run with `sudo`.
+
+Runtime behavior:
+- MNN loads the wrapper `.mnn`
+- `Plugin(type="RKNN")` is created by the CPU Plugin framework
+- the plugin loads the `.rknn` sidecar using RKNN C API
+- application-side MNN backend is still `MNN_FORWARD_CPU`
+- if the RKNN model expects `NHWC` but the incoming MNN tensor is `NCHW`, the plugin converts layout automatically
+- if the incoming tensor is already `NHWC`, no extra layout conversion is done
+- backend-side RKNN profiling can be enabled through the public hint path:
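+  - `Interpreter::setSessionHint(Interpreter::RKNN_PROFILE, 1)` or `RuntimeManager::setHint(Interpreter::RKNN_PROFILE, 1)`; set the hint before the session / module is created, because the plugin reads it only once, when it initializes the RKNN context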
+  - retrieve the exported profile text through `getSessionInfo(..., Interpreter::BACKEND_PROFILE, &ptr)` or `RuntimeManager::getInfo(Interpreter::BACKEND_PROFILE, &ptr)`
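+  - because the profile is exposed as plain text, applications can print it or write it directly to a file; the returned pointer refers to a string owned by the runtime that is replaced on the next inference, so copy the text if it must be kept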
+
+## 5. Current limitations
+
+- This is a sidecar-subgraph path, not a per-op RKNN backend.
+- Current implementation uses host buffer copies; zero-copy is not implemented.
+- Current output copy path assumes float32 outputs from RKNN runtime.
+- Input layout auto-conversion currently handles the common `NCHW -> NHWC` case for 4D tensors only, and only when the RKNN model explicitly expects `NHWC`.
+- Host-side PC simulation through MNN runtime requires an x86 RKNN runtime library; usually this path is meant for target boards.
+
+## 6. Code examples
+
+### 6.1 Minimal C++ example with `Interpreter`
+
+This example loads the wrapper `.mnn` generated by `MNNConvert --rknn` and runs it through the normal CPU backend. Internally, the `Plugin("RKNN")` node will call the RKNN C API.
+
+```cpp
+#include <MNN/Interpreter.hpp>
+#include <MNN/Tensor.hpp>
+#include <MNN/ImageProcess.hpp>
+
+#include <cstdio>
+#include <cstring>
+#include <memory>
+#include <vector>
+
+int main() {
+    const char* model_path = "/data/local/tmp/rejshand_epoch200_b1_nogridsample.mnn";
+
+    std::shared_ptr<MNN::Interpreter> net(MNN::Interpreter::createFromFile(model_path));
+    if (!net) {
+        std::fprintf(stderr, "createFromFile failed\n");
+        return 1;
+    }
+
+    MNN::ScheduleConfig config;
+    config.type = MNN_FORWARD_CPU;
+    config.numThread = 1;
+
+    MNN::BackendConfig backendConfig;
+    config.backendConfig = &backendConfig;
+
+    auto session = net->createSession(config);
+    if (!session) {
+        std::fprintf(stderr, "createSession failed\n");
+        return 1;
+    }
+
+    auto input = net->getSessionInput(session, "image");
+    if (!input) {
+        std::fprintf(stderr, "getSessionInput failed\n");
+        return 1;
+    }
+
+    net->resizeTensor(input, {1, 3, 224, 224});
+    net->resizeSession(session);
+
+    MNN::Tensor hostInput(input, MNN::Tensor::CAFFE);
+    std::memset(hostInput.host<float>(), 0, hostInput.size());
+    input->copyFromHostTensor(&hostInput);
+
+    if (net->runSession(session) != 0) {
+        std::fprintf(stderr, "runSession failed\n");
+        return 1;
+    }
+
+    auto uv = net->getSessionOutput(session, "uv");
+    auto vertices = net->getSessionOutput(session, "vertices");
+    if (!uv || !vertices) {
+        std::fprintf(stderr, "getSessionOutput failed\n");
+        return 1;
+    }
+
+    MNN::Tensor uvHost(uv, MNN::Tensor::CAFFE);
+    MNN::Tensor verticesHost(vertices, MNN::Tensor::CAFFE);
+    uv->copyToHostTensor(&uvHost);
+    vertices->copyToHostTensor(&verticesHost);
+
+    auto uvPtr = uvHost.host<float>();
+    auto vPtr = verticesHost.host<float>();
+    std::printf("uv[0] = %f, %f\n", uvPtr[0], uvPtr[1]);
+    std::printf("vertices[0] = %f, %f, %f\n", vPtr[0], vPtr[1], vPtr[2]);
+    return 0;
+}
+```
+
+Typical cross-compile command (run on the host; copy the resulting binary to the target):
+
+```bash
+aarch64-linux-gnu-g++ -O2 -std=c++11 demo_rknn_mnn.cpp \
+  -I/path/to/MNN-Agent/include \
+  -L/path/to/mnn/libs -lMNN -o demo_rknn_mnn
+```
+
+At runtime on board:
+
+```bash
+export LD_LIBRARY_PATH=/path/to/mnn/libs:$LD_LIBRARY_PATH
+export MNN_RKNN_RUNTIME_LIB=/path/to/librknnrt.so
+./demo_rknn_mnn
+```
+
+### 6.2 Minimal `Module` example
+
+If you prefer the Express / Module API, load the same wrapper `.mnn` with `MNN_FORWARD_CPU`.
+
+```cpp
+#include <MNN/expr/Module.hpp>
+#include <MNN/expr/Expr.hpp>
+#include <MNN/expr/Executor.hpp>
+
+#include <cstdio>
+#include <memory>
+#include <vector>
+
+using namespace MNN::Express;
+
+int main() {
+    MNN::ScheduleConfig config;
+    config.type = MNN_FORWARD_CPU;
+    config.numThread = 1;
+
+    std::shared_ptr<MNN::Executor::RuntimeManager> rtmgr(MNN::Executor::RuntimeManager::createRuntimeManager(config));
+    if (!rtmgr) {
+        std::fprintf(stderr, "createRuntimeManager failed\n");
+        return 1;
+    }
+
+    std::vector<std::string> inputs = {"image"};
+    std::vector<std::string> outputs = {"uv", "vertices"};
+    auto module = Module::load(inputs, outputs, "/data/local/tmp/rejshand_epoch200_b1_nogridsample.mnn", rtmgr);
+    if (!module) {
+        std::fprintf(stderr, "Module::load failed\n");
+        return 1;
+    }
+
+    auto image = _Input({1, 3, 224, 224}, NCHW, halide_type_of<float>());
+    auto imagePtr = image->writeMap<float>();
+    for (int i = 0; i < 1 * 3 * 224 * 224; ++i) {
+        imagePtr[i] = 0.0f;
+    }
+
+    auto outputsVar = module->onForward({image});
+    if (outputsVar.size() != 2) {
+        std::fprintf(stderr, "unexpected output size: %zu\n", outputsVar.size());
+        return 1;
+    }
+
+    auto uvInfo = outputsVar[0]->getInfo();
+    auto verticesInfo = outputsVar[1]->getInfo();
+    if (!uvInfo || !verticesInfo) {
+        std::fprintf(stderr, "output info is null\n");
+        return 1;
+    }
+
+    auto uv = outputsVar[0]->readMap<float>();
+    auto vertices = outputsVar[1]->readMap<float>();
+    std::printf("uv[0] = %f, %f\n", uv[0], uv[1]);
+    std::printf("vertices[0] = %f, %f, %f\n", vertices[0], vertices[1], vertices[2]);
+    return 0;
+}
+```
+
+Runtime requirements are the same:
+
+```bash
+export LD_LIBRARY_PATH=/path/to/mnn/libs:$LD_LIBRARY_PATH
+export MNN_RKNN_RUNTIME_LIB=/path/to/librknnrt.so
+./demo_rknn_module
+```
+
+## 7. Notes
+
+- Keep this README generic. Put machine-specific paths, standalone example source files, and one-off deployment commands in the external example project README instead.
+- The standalone example program is intentionally kept outside the MNN source tree.
diff --git a/source/backend/rknn/backend/RKNNBackend.cpp b/source/backend/rknn/backend/RKNNBackend.cpp
index cbe5e37dc4..9d3b3661c2 100644
--- a/source/backend/rknn/backend/RKNNBackend.cpp
+++ b/source/backend/rknn/backend/RKNNBackend.cpp
@@ -3,6 +3,7 @@
 #include <dlfcn.h>
 #include <memory>
 #include <mutex>
+#include <sstream>
 #include <string>
 #include <vector>
 
@@ -93,6 +94,9 @@ static std::string resolveModelPath(const std::string& dirPath, const std::strin
     if (path[0] == '/') {
         return path;
     }
+    if (dirPath.empty() || dirPath == ".") {
+        return path;
+    }
     return MNNFilePathConcat(dirPath, path);
 }
 
@@ -124,6 +128,80 @@ static rknn_tensor_format mapTensorFormat(const Tensor* tensor) {
 static Tensor::DimensionType getHostTensorDimType(const Tensor* tensor) {
     return tensor->getDimensionType();
 }
+static bool convertLayoutIfNeeded(const Tensor* tensor, rknn_tensor_format expectFormat,
+                                  std::vector<uint8_t>* converted, void** buf, uint32_t* size,
+                                  rknn_tensor_format* actualFormat) {
+    auto currentFormat = mapTensorFormat(tensor);
+    *actualFormat = currentFormat;
+    *buf = tensor->buffer().host;
+    *size = (uint32_t)tensor->size();
+
+    if (expectFormat == currentFormat) {
+        return true;
+    }
+    if (expectFormat != RKNN_TENSOR_NHWC || currentFormat != RKNN_TENSOR_NCHW) {
+        return true;
+    }
+    if (tensor->dimensions() != 4) {
+        MNN_ERROR("MNN_RKNN: unsupported layout conversion for %dD tensor\n", tensor->dimensions());
+        return false;
+    }
+
+    const int batch = tensor->batch();
+    const int channel = tensor->channel();
+    const int height = tensor->height();
+    const int width = tensor->width();
+    const int elementBytes = tensor->getType().bytes();
+    if (batch <= 0 || channel <= 0 || height <= 0 || width <= 0 || elementBytes <= 0) {
+        MNN_ERROR("MNN_RKNN: invalid tensor shape for layout conversion\n");
+        return false;
+    }
+
+    converted->resize((size_t)tensor->size());
+    auto src = reinterpret_cast<const uint8_t*>(tensor->buffer().host);
+    auto dst = converted->data();
+    for (int n = 0; n < batch; ++n) {
+        for (int h = 0; h < height; ++h) {
+            for (int w = 0; w < width; ++w) {
+                for (int c = 0; c < channel; ++c) {
+                    const size_t srcIndex = ((((size_t)n * (size_t)channel + (size_t)c) * (size_t)height + (size_t)h) * (size_t)width + (size_t)w) * (size_t)elementBytes;
+                    const size_t dstIndex = ((((size_t)n * (size_t)height + (size_t)h) * (size_t)width + (size_t)w) * (size_t)channel + (size_t)c) * (size_t)elementBytes;
+                    ::memcpy(dst + dstIndex, src + srcIndex, (size_t)elementBytes);
+                }
+            }
+        }
+    }
+    *buf = converted->data();
+    *size = (uint32_t)converted->size();
+    *actualFormat = expectFormat;
+    return true;
+}
+
+static std::string buildProfileString(const RKNNApi* api, rknn_context context) {
+    std::ostringstream oss;
+    rknn_perf_run perfRun;
+    std::memset(&perfRun, 0, sizeof(perfRun));
+    auto ret = api->query(context, RKNN_QUERY_PERF_RUN, &perfRun, sizeof(perfRun));
+    if (ret == RKNN_SUCC) {
+        oss << "npu_run   : " << (double)perfRun.run_duration / 1000.0 << " ms\n";
+    } else {
+        oss << "npu_run   : unavailable\n";
+    }
+
+    rknn_perf_detail perfDetail;
+    std::memset(&perfDetail, 0, sizeof(perfDetail));
+    ret = api->query(context, RKNN_QUERY_PERF_DETAIL, &perfDetail, sizeof(perfDetail));
+    if (ret == RKNN_SUCC && perfDetail.perf_data != nullptr && perfDetail.data_len > 0) {
+        oss << "perf_detail:\n";
+        oss.write(perfDetail.perf_data, perfDetail.data_len);
+        if (perfDetail.perf_data[perfDetail.data_len - 1] != '\n') {
+            oss << '\n';
+        }
+    } else {
+        oss << "perf_detail: unavailable\n";
+    }
+    return oss.str();
+}
 
 class RKNNPluginShape : public plugin::InferShapeKernel {
 public:
@@ -165,6 +243,8 @@ class RKNNPluginExecute : public plugin::CPUComputeKernel {
         if (nullptr == mApi) {
             return false;
         }
+        auto runtime = ctx->backend() == nullptr ? nullptr : ctx->backend()->getRuntime();
+        mEnableProfile = runtime != nullptr && runtime->hint().enableBackendProfile;
         mModelPath = resolveModelPath(ctx->dir_path(), getStringAttr(ctx, kModelPathAttr));
         if (mModelPath.empty()) {
             MNN_ERROR("MNN_RKNN: Plugin(%s) requires attr %s\n", kPluginTypeName, kModelPathAttr);
@@ -174,7 +254,11 @@ class RKNNPluginExecute : public plugin::CPUComputeKernel {
             MNN_ERROR("MNN_RKNN: model file does not exist: %s\n", mModelPath.c_str());
             return false;
         }
-        if (mApi->init(&mContext, (void*)mModelPath.c_str(), 0, 0, nullptr) != RKNN_SUCC) {
+        uint32_t initFlags = 0;
+        if (mEnableProfile) {
+            initFlags |= RKNN_FLAG_COLLECT_PERF_MASK;
+        }
+        if (mApi->init(&mContext, (void*)mModelPath.c_str(), 0, initFlags, nullptr) != RKNN_SUCC) {
             MNN_ERROR("MNN_RKNN: rknn_init failed for %s\n", mModelPath.c_str());
             return false;
         }
@@ -213,7 +297,9 @@ class RKNNPluginExecute : public plugin::CPUComputeKernel {
     }
 
     bool compute(plugin::CPUKernelContext* ctx) override {
+        auto runtime = ctx->backend() == nullptr ? nullptr : ctx->backend()->getRuntime();
         std::vector<std::unique_ptr<Tensor>> hostInputs;
+        std::vector<std::vector<uint8_t>> convertedInputs(ctx->inputs().size());
         std::vector<rknn_input> rknnInputs(ctx->inputs().size());
         for (size_t i = 0; i < ctx->inputs().size(); ++i) {
             auto src = ctx->input((int)i);
@@ -222,13 +308,20 @@ class RKNNPluginExecute : public plugin::CPUComputeKernel {
                 MNN_ERROR("MNN_RKNN: failed to copy input tensor %zu to host\n", i);
                 return false;
             }
+            void* inputBuf = hostInputs.back()->buffer().host;
+            uint32_t inputSize = (uint32_t)hostInputs.back()->size();
+            auto inputFormat = mapTensorFormat(hostInputs.back().get());
+            if (!convertLayoutIfNeeded(hostInputs.back().get(), mInputAttrs[i].fmt, &convertedInputs[i], &inputBuf, &inputSize, &inputFormat)) {
+                MNN_ERROR("MNN_RKNN: failed to convert input tensor %zu layout\n", i);
+                return false;
+            }
             std::memset(&rknnInputs[i], 0, sizeof(rknn_input));
             rknnInputs[i].index = (uint32_t)i;
-            rknnInputs[i].buf = hostInputs.back()->buffer().host;
-            rknnInputs[i].size = hostInputs.back()->size();
+            rknnInputs[i].buf = inputBuf;
+            rknnInputs[i].size = inputSize;
             rknnInputs[i].pass_through = 0;
             rknnInputs[i].type = mapTensorType(hostInputs.back().get());
-            rknnInputs[i].fmt = mapTensorFormat(hostInputs.back().get());
+            rknnInputs[i].fmt = inputFormat;
         }
         if (mApi->inputsSet(mContext, (uint32_t)rknnInputs.size(), rknnInputs.data()) != RKNN_SUCC) {
             MNN_ERROR("MNN_RKNN: rknn_inputs_set failed\n");
@@ -250,6 +343,13 @@ class RKNNPluginExecute : public plugin::CPUComputeKernel {
             MNN_ERROR("MNN_RKNN: rknn_outputs_get failed\n");
             return false;
         }
+        if (nullptr != runtime) {
+            if (mEnableProfile) {
+                runtime->setLastBackendProfile(buildProfileString(mApi, mContext));
+            } else {
+                runtime->setLastBackendProfile("");
+            }
+        }
 
         for (size_t i = 0; i < ctx->outputs().size(); ++i) {
             auto dst = ctx->output((int)i);
@@ -278,6 +378,7 @@ class RKNNPluginExecute : public plugin::CPUComputeKernel {
     rknn_input_output_num mIoNum{};
     std::vector<rknn_tensor_attr> mInputAttrs;
     std::vector<rknn_tensor_attr> mOutputAttrs;
+    bool mEnableProfile = false;
 };
 
 static auto _rknn_plugin_shape_registrar __attribute__((unused)) =
diff --git a/source/core/Backend.hpp b/source/core/Backend.hpp
index eeebb6f645..40d93f221c 100644
--- a/source/core/Backend.hpp
+++ b/source/core/Backend.hpp
@@ -76,6 +76,9 @@ struct RuntimeHint {
     // Use CPU Ids
     std::vector<int> cpuIds;
 
+    // Enable backend-side profiling export for runtimes that support it.
+    bool enableBackendProfile = false;
+
     // Division ration between SME and NEON when runtime threads>=4
     // Default: 41, which means that in LLM inference,
     // during the Prefill stage the workload
@@ -228,6 +231,7 @@ class Backend : public NonCopyable {
     virtual bool onGetTensorInfo(const Tensor* tensor, void* dstInfo) {
         return false;
     }
+    virtual bool onGetSessionInfo(int code, void* ptr) const;
 
     /**
      * @brief clear all dynamic buffers.
@@ -285,6 +289,7 @@ class Backend : public NonCopyable {
 /** Each backend belong to a runtime*/
 class Runtime : public NonCopyable {
 public:
+    static constexpr int kSessionInfoBackendProfile = 5;
     /**
      Origin Op -> (Compiler) -> New Op -> Backend
      Default use Compiler_Geometry, Origin Op -> Compiler_Geometry -> Little Op
@@ -306,6 +311,20 @@ class Runtime : public NonCopyable {
     const RuntimeHint& hint() const {
         return mHint;
     }
+    // Stores `profile` as the text that onGetRuntimeInfo(kSessionInfoBackendProfile, ...) hands out.
+    // Callable on a const Runtime because the target member is mutable; "" leaves no profile to report.
+    void setLastBackendProfile(std::string profile) const { mLastBackendProfile = std::move(profile); }
+    // Writes the last backend profile to *ptr (a const char**); false for other codes, null ptr, or no profile.
+    bool onGetRuntimeInfo(int code, void* ptr) const {
+        auto dst = reinterpret_cast<const char**>(ptr);
+        if (code != kSessionInfoBackendProfile || nullptr == dst) {
+            return false;
+        }
+        // The pointer aliases mLastBackendProfile, so a later setLastBackendProfile() may invalidate it.
+        *dst = mLastBackendProfile.empty() ? nullptr : mLastBackendProfile.c_str();
+        // An empty profile counts as "not available" so Session::getInfo keeps probing other backends/runtimes.
+        return nullptr != *dst;
+    }
 
     virtual CompilerType onGetCompilerType() const {
         return Compiler_Loop;
@@ -406,6 +425,7 @@ class Runtime : public NonCopyable {
 private:
     std::future<int> mFuture;
     RuntimeHint mHint;
+    mutable std::string mLastBackendProfile;
 };
 
 /** abstract Runtime register */
@@ -440,6 +460,14 @@ class RuntimeCreator {
     RuntimeCreator() = default;
 };
 
+// Base implementation of the virtual query: forwards `code`/`ptr` to getRuntime(), false when it is null.
+inline bool Backend::onGetSessionInfo(int code, void* ptr) const {
+    if (auto owner = const_cast<Backend*>(this)->getRuntime()) {
+        return owner->onGetRuntimeInfo(code, ptr);
+    }
+    return false;
+}
+
 /**
  * @brief get registered backend creator for given forward type.
  * @param type  given forward type.
@@ -459,4 +487,4 @@ MNN_PUBLIC bool MNNInsertExtraRuntimeCreator(MNNForwardType type, const RuntimeC
 MNN_PUBLIC bool MNNCPUCopyBuffer(const Tensor* srcTensor, const Tensor* dstTensor);
 } // namespace MNN
 
-#endif /* Backend_hpp */
\ No newline at end of file
+#endif /* Backend_hpp */
diff --git a/source/core/Pipeline.hpp b/source/core/Pipeline.hpp
index 87cd1686b6..d2a96b1fb6 100644
--- a/source/core/Pipeline.hpp
+++ b/source/core/Pipeline.hpp
@@ -62,6 +62,9 @@ class Pipeline : public NonCopyable {
     MNNForwardType getMainForwardType() const  {
         return mInfo.first.cache.first->type();
     }
+    // Returns the backend held in the second slot of mInfo.first.cache; getMainForwardType() reads the first.
+    // NOTE(review): the pointer comes from .get(), so ownership presumably stays with the cache - confirm.
+    Backend* getMainBackend() const { return mInfo.first.cache.second.get(); }
     typedef std::map<std::pair<Tensor::InsideDescribe::NativeInsideDescribe*, Backend*>, std::pair<std::weak_ptr<Tensor::InsideDescribe::NativeInsideDescribe>, std::shared_ptr<Tensor>>> WrapTensorCache;
 private:
     ErrorCode _allocForTensor(int index, bool allocInput);
diff --git a/source/core/Session.cpp b/source/core/Session.cpp
index bffbf08d54..18646475da 100644
--- a/source/core/Session.cpp
+++ b/source/core/Session.cpp
@@ -121,6 +121,9 @@ void Session::ModeGroup::setHint(Interpreter::HintMode hint, int value) {
         case Interpreter::CPU_SME_CORES:
             runtimeHint.smeCores = value;
             break;
+        case Interpreter::HintMode::RKNN_PROFILE:
+            runtimeHint.enableBackendProfile = value > 0;
+            break;
         default:
             break;
     }
@@ -396,6 +399,20 @@ bool Session::getInfo(Interpreter::SessionInfoCode code, void* ptr) const {
             *dst = mPipelines[0]->getPipelineInfo().first.info.numThread;
             return true;
         }
+        case Interpreter::BACKEND_PROFILE: {
+            for (auto& pipeline : mPipelines) {
+                auto backend = pipeline->getMainBackend();
+                if (nullptr != backend && backend->onGetSessionInfo((int)code, ptr)) {
+                    return true;
+                }
+            }
+            for (auto& r : mRuntime.first) {
+                if (r.second != nullptr && r.second->onGetRuntimeInfo((int)code, ptr)) {
+                    return true;
+                }
+            }
+            return false;
+        }
         // TODO: Support other debug info
         default:
             break;
diff --git a/tools/converter/source/common/RKNNBundle.cpp b/tools/converter/source/common/RKNNBundle.cpp
index ca7b5192aa..140cffa7c9 100644
--- a/tools/converter/source/common/RKNNBundle.cpp
+++ b/tools/converter/source/common/RKNNBundle.cpp
@@ -325,14 +325,22 @@ bool GenerateRKNNBundle(const modelConfig& modelPath, RKNNBundlePaths* bundlePat
         MNN_ERROR("Open RKNN manifest failed: %s\n", manifestPath.c_str());
         return false;
     }
+    auto onnxSlash = modelPath.modelFile.find_last_of("/\\");
+    auto mnnSlash = modelPath.MNNModel.find_last_of("/\\");
+    auto rknnSlash = rknnPath.find_last_of("/\\");
+    const auto onnxModelName = (onnxSlash == std::string::npos) ? modelPath.modelFile : modelPath.modelFile.substr(onnxSlash + 1);
+    const auto mnnModelName = (mnnSlash == std::string::npos) ? modelPath.MNNModel : modelPath.MNNModel.substr(mnnSlash + 1);
+    const auto rknnModelName = (rknnSlash == std::string::npos) ? rknnPath : rknnPath.substr(rknnSlash + 1);
     manifest << "{\n";
-    manifest << "  \"onnx_model\": \"" << modelPath.modelFile << "\",\n";
-    manifest << "  \"mnn_model\": \"" << modelPath.MNNModel << "\",\n";
-    manifest << "  \"rknn_model\": \"" << rknnPath << "\",\n";
+    manifest << "  \"onnx_model\": \"" << onnxModelName << "\",\n";
+    manifest << "  \"mnn_model\": \"" << mnnModelName << "\",\n";
+    manifest << "  \"rknn_model\": \"" << rknnModelName << "\",\n";
     manifest << "  \"target\": \"" << modelPath.rknnTarget << "\"";
     const auto weightPath = modelPath.MNNModel + ".weight";
     if (MNNFileExist(weightPath.c_str())) {
-        manifest << ",\n  \"mnn_external_weight\": \"" << weightPath << "\"\n";
+        auto weightSlash = weightPath.find_last_of("/\\");
+        const auto weightName = (weightSlash == std::string::npos) ? weightPath : weightPath.substr(weightSlash + 1);
+        manifest << ",\n  \"mnn_external_weight\": \"" << weightName << "\"\n";
     } else {
         manifest << "\n";
     }
@@ -407,8 +415,8 @@ std::unique_ptr<NetT> BuildRKNNWrapperNet(const NetT& sourceNet, const modelConf
     rknnOp->main.type = OpParameter_Plugin;
     rknnOp->main.value = new PluginT;
     rknnOp->main.AsPlugin()->type = "RKNN";
-    rknnOp->main.AsPlugin()->attr.emplace_back(makeStringAttr("model_path", bundlePaths.rknnPath));
-    rknnOp->main.AsPlugin()->attr.emplace_back(makeStringAttr("bundle_manifest", bundlePaths.manifestPath));
+    rknnOp->main.AsPlugin()->attr.emplace_back(makeStringAttr("model_path", basenameWithoutExtension(bundlePaths.rknnPath) + ".rknn"));
+    rknnOp->main.AsPlugin()->attr.emplace_back(makeStringAttr("bundle_manifest", basenameWithoutExtension(bundlePaths.manifestPath) + ".json"));
     rknnOp->main.AsPlugin()->attr.emplace_back(makeStringAttr("target", modelPath.rknnTarget));
     rknnOp->main.AsPlugin()->attr.emplace_back(makeStringListAttr("inputs", inputNames));
     rknnOp->main.AsPlugin()->attr.emplace_back(makeStringListAttr("outputs", outputNames));