alibaba · wangzhaode · Jun 18, 2026 · Jun 17, 2026
diff --git a/source/backend/cpu/compute/CommonOptFunction.cpp b/source/backend/cpu/compute/CommonOptFunction.cpp
@@ -48,6 +48,19 @@ extern void MNNSumByAxisLForMatmul_A_RVV(float* dest, int8_t* source, const floa
                                          SumByAxisParams sumParams);
 extern void MNNSumWeightInt8_RVV(float* kernelsum, int8_t* source, size_t outside, size_t reduceAxis, size_t hP,
                                  size_t lP);
+extern void generalIm2col_RVV(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el,
+                              int LP, int pack);
+extern void MNNDynamicUpdateConvBiasScale_RVV(float* newbias, float* oldbias, float* weightKernelSum, float* inputBias,
+                                              size_t ocQuad);
+extern void MNNPackedMatMulFP32_RVV(float* C, const float* A, const float* B, const size_t* parameter,
+                                    const float* postParameters, const float* bias, const float* k, const float* b);
+extern void MNNPackedMatMulRemainFP32_RVV(float* C, const float* A, const float* B, size_t eSize,
+                                          const size_t* parameter, const float* postParameters, const float* bias,
+                                          const float* k, const float* b);
+extern void MNNPackForMatMul_B_RVV(float* destC, const float* sourceC, size_t h, size_t kernelsize, size_t ic,
+                                   bool transpose);
+extern void MNNQuantScaleFP32_RVV(float* absmax, float* quant_scale, float* dequant_scale, size_t thread, size_t batch);
+extern void MNNGetMatMulPackMode_RVV(int* eP, int* lP, int* hP);
 #endif
 
 #ifndef MNN_USE_SSE
@@ -4804,11 +4817,18 @@ void MNNCoreFunctionInit() {
         gCoreFunction->MNNSumByAxisLForMatmul_A = MNNSumByAxisLForMatmul_A_RVV;
         gCoreFunction->MNNReorderWeightInt4 = MNNReorderWeightInt4_RVV;
         gCoreFunction->MNNSumWeightInt8 = MNNSumWeightInt8_RVV;
+        gCoreFunction->MNNPackedMatMul = MNNPackedMatMulFP32_RVV;
+        gCoreFunction->MNNPackedMatMulRemain = MNNPackedMatMulRemainFP32_RVV;
+        gCoreFunction->MNNPackForMatMul_B = MNNPackForMatMul_B_RVV;
+        gCoreFunction->MNNGetMatMulPackMode = MNNGetMatMulPackMode_RVV;
 #ifdef MNN_LOW_MEMORY
         gCoreFunction->MNNAbsMax = MNNAbsMaxFP32_RVV;
         gCoreFunction->MNNDynamicQuant = MNNDynamicQuantFP32_RVV;
         gCoreFunction->MNNAsyQuantFunc = MNNAsyQuantFunc_RVV;
         gCoreFunction->MNNAsyQuantInfo = MNNAsyQuantInfo_FP32_RVV;
+        gCoreFunction->MNNGeneralIm2Col = generalIm2col_RVV;
+        gCoreFunction->MNNDynamicUpdateConvBiasScale = MNNDynamicUpdateConvBiasScale_RVV;
+        gCoreFunction->MNNQuantScale = MNNQuantScaleFP32_RVV;
 #endif
     }
 #endif

diff --git a/source/backend/cpu/riscv/rvv/MNNDynamicUpdateConvBiasScale.cpp b/source/backend/cpu/riscv/rvv/MNNDynamicUpdateConvBiasScale.cpp
@@ -0,0 +1,21 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <riscv_vector.h>
+
+void MNNDynamicUpdateConvBiasScale_RVV(float* newbias, float* oldbias, float* weightKernelSum, float* inputBias,
+                                       size_t ocQuad) {
+    int ocUp4 = 4 * ocQuad;
+    float ib = inputBias[0];
+
+    size_t vl;
+    for (size_t i = 0; i < ocUp4; i += vl) {
+        vl = __riscv_vsetvl_e32m4(ocUp4 - i);
+
+        vfloat32m4_t v_old = __riscv_vle32_v_f32m4(oldbias + i, vl);
+        vfloat32m4_t v_wks = __riscv_vle32_v_f32m4(weightKernelSum + i, vl);
+
+        vfloat32m4_t v_new = __riscv_vfmacc_vf_f32m4(v_old, ib, v_wks, vl);
+
+        __riscv_vse32_v_f32m4(newbias + i, v_new, vl);
+    }
+}
diff --git a/source/backend/cpu/riscv/rvv/MNNGetMatMulPackMode.cpp b/source/backend/cpu/riscv/rvv/MNNGetMatMulPackMode.cpp
@@ -0,0 +1,6 @@
+#include <stdint.h>
+void MNNGetMatMulPackMode_RVV(int* eP, int* lP, int* hP) {
+    *eP = 16;
+    *lP = 1;
+    *hP = 4;
+}
diff --git a/source/backend/cpu/riscv/rvv/MNNPackForMatMul_B.cpp b/source/backend/cpu/riscv/rvv/MNNPackForMatMul_B.cpp
@@ -0,0 +1,68 @@
+#include <riscv_vector.h>
+#include <cstring>
+#include <stdint.h>
+#include <stddef.h>
+
+#define RVV_MATMUL_LP 1
+#define RVV_MATMUL_HP 4
+
+void MNNPackForMatMul_B_RVV(float* destC, const float* sourceC, size_t h, size_t kernelsize, size_t ic,
+                            bool transpose) {
+    auto dest = (int32_t*)destC;
+    auto source = (int32_t*)sourceC;
+
+    int LP = RVV_MATMUL_LP;
+    int HP = RVV_MATMUL_HP;
+    auto l = kernelsize * ic;
+
+    size_t dest_size = ROUND_UP(h, HP) * ROUND_UP(ic, LP) * kernelsize * 4;
+    memset(dest, 0, dest_size);
+
+    auto stride0 = kernelsize * ROUND_UP(ic, LP) * HP;
+    auto stride1 = ROUND_UP(ic, LP) * HP;
+    auto stride2 = HP * LP;
+
+    auto srcStride0 = l;
+    auto srcStride1 = 1;
+    if (!transpose) {
+        srcStride0 = 1;
+        srcStride1 = h;
+    }
+
+    size_t h_blocks = ROUND_UP(h, HP) / HP;
+
+    for (size_t yHu = 0; yHu < h_blocks; ++yHu) {
+        size_t y_start = yHu * HP;
+        size_t y_end = (y_start + HP < h) ? (y_start + HP) : h;
+        size_t y_len = y_end - y_start;
+
+        if (y_len == 0)
+            break;
+
+        for (size_t k = 0; k < kernelsize; ++k) {
+            for (size_t x = 0; x < ic; ++x) {
+                auto xLu = x / LP;
+                auto xLp = x % LP;
+
+                int32_t* dst_ptr = dest + yHu * stride0 + k * stride1 + xLu * stride2 + xLp;
+
+                size_t l_idx = x + k * ic;
+                const int32_t* src_ptr = source + y_start * srcStride0 + l_idx * srcStride1;
+
+                size_t vl;
+                for (size_t yHp = 0; yHp < y_len; yHp += vl) {
+                    vl = __riscv_vsetvl_e32m1(y_len - yHp);
+
+                    vint32m1_t v_src;
+                    if (!transpose) {
+                        v_src = __riscv_vle32_v_i32m1(src_ptr + yHp, vl);
+                    } else {
+                        v_src = __riscv_vlse32_v_i32m1(src_ptr + yHp * l, l * sizeof(int32_t), vl);
+                    }
+
+                    __riscv_vse32_v_i32m1(dst_ptr + yHp, v_src, vl);
+                }
+            }
+        }
+    }
+}
diff --git a/source/backend/cpu/riscv/rvv/MNNPackedMatMulFP32.cpp b/source/backend/cpu/riscv/rvv/MNNPackedMatMulFP32.cpp
@@ -0,0 +1,8 @@
+#include <stddef.h>
+
+void MNNPackedMatMulRemainFP32_RVV(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter,
+                                   const float* postParameters, const float* bias, const float* k, const float* b);
+void MNNPackedMatMulFP32_RVV(float* C, const float* A, const float* B, const size_t* parameter,
+                             const float* postParameters, const float* bias, const float* k, const float* b) {
+    MNNPackedMatMulRemainFP32_RVV(C, A, B, 16, parameter, postParameters, bias, k, b);
+}
diff --git a/source/backend/cpu/riscv/rvv/MNNPackedMatMulRemainFP32.cpp b/source/backend/cpu/riscv/rvv/MNNPackedMatMulRemainFP32.cpp
@@ -0,0 +1,69 @@
+#include <riscv_vector.h>
+#include <algorithm>
+#include <limits>
+#include <stddef.h>
+void MNNPackedMatMulRemainFP32_RVV(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter,
+                                   const float* postParameters, const float* bias, const float* k, const float* b) {
+    if (eSize == 0)
+        return;
+
+    size_t aStride = parameter[0] / sizeof(float);
+    size_t l = parameter[1];
+    size_t h = parameter[2];
+    size_t cStride = parameter[3] / sizeof(float);
+    size_t bExtraStride = parameter[5] / sizeof(float);
+    size_t bStride = bExtraStride + l * 4;
+
+    size_t hC4 = UP_DIV(h, 4);
+
+    float minValue = -std::numeric_limits<float>::max();
+    float maxValue = std::numeric_limits<float>::max();
+    if (postParameters != nullptr) {
+        minValue = postParameters[2];
+        maxValue = postParameters[3];
+    }
+
+    size_t vl = __riscv_vsetvl_e32m4(eSize);
+    MNN_ASSERT(vl >= eSize);
+
+    for (size_t y = 0; y < hC4; ++y) {
+        float* c_base = C + y * cStride;
+        const float* b_base = B + y * bStride;
+        const float* bias_y = bias ? bias + 4 * y : nullptr;
+
+        vfloat32m4_t acc0, acc1, acc2, acc3;
+        if (bias_y) {
+            acc0 = __riscv_vfmv_v_f_f32m4(bias_y[0], vl);
+            acc1 = __riscv_vfmv_v_f_f32m4(bias_y[1], vl);
+            acc2 = __riscv_vfmv_v_f_f32m4(bias_y[2], vl);
+            acc3 = __riscv_vfmv_v_f_f32m4(bias_y[3], vl);
+        } else {
+            acc0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+            acc1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+            acc2 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+            acc3 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+        }
+
+        for (size_t z = 0; z < l; ++z) {
+            vfloat32m4_t a_vec = __riscv_vle32_v_f32m4(A + z * aStride, vl);
+            const float* w_ptr = b_base + z * 4;
+
+            acc0 = __riscv_vfmacc_vf_f32m4(acc0, w_ptr[0], a_vec, vl);
+            acc1 = __riscv_vfmacc_vf_f32m4(acc1, w_ptr[1], a_vec, vl);
+            acc2 = __riscv_vfmacc_vf_f32m4(acc2, w_ptr[2], a_vec, vl);
+            acc3 = __riscv_vfmacc_vf_f32m4(acc3, w_ptr[3], a_vec, vl);
+        }
+
+        acc0 = __riscv_vfmin_vf_f32m4(__riscv_vfmax_vf_f32m4(acc0, minValue, vl), maxValue, vl);
+        acc1 = __riscv_vfmin_vf_f32m4(__riscv_vfmax_vf_f32m4(acc1, minValue, vl), maxValue, vl);
+        acc2 = __riscv_vfmin_vf_f32m4(__riscv_vfmax_vf_f32m4(acc2, minValue, vl), maxValue, vl);
+        acc3 = __riscv_vfmin_vf_f32m4(__riscv_vfmax_vf_f32m4(acc3, minValue, vl), maxValue, vl);
+
+        ptrdiff_t stride = 4 * sizeof(float);
+
+        __riscv_vsse32_v_f32m4(c_base + 0, stride, acc0, vl);
+        __riscv_vsse32_v_f32m4(c_base + 1, stride, acc1, vl);
+        __riscv_vsse32_v_f32m4(c_base + 2, stride, acc2, vl);
+        __riscv_vsse32_v_f32m4(c_base + 3, stride, acc3, vl);
+    }
+}
diff --git a/source/backend/cpu/riscv/rvv/MNNQuantScaleFP32.cpp b/source/backend/cpu/riscv/rvv/MNNQuantScaleFP32.cpp
@@ -0,0 +1,29 @@
+#include <stdint.h>
+#include <stddef.h>
+#include <riscv_vector.h>
+
+void MNNQuantScaleFP32_RVV(float* absmax, float* quant_scale, float* dequant_scale, size_t thread, size_t batch) {
+    size_t vl;
+    for (size_t i = 0; i < batch; i += vl) {
+        vl = __riscv_vsetvl_e32m4(batch - i);
+
+        vfloat32m4_t v_max = __riscv_vfmv_v_f_f32m4(0.0f, vl);
+
+        for (size_t t = 0; t < thread; ++t) {
+            vfloat32m4_t v_val = __riscv_vle32_v_f32m4(absmax + t * batch + i, vl);
+            v_max = __riscv_vfmax_vv_f32m4(v_max, v_val, vl);
+        }
+
+        vbool8_t mask = __riscv_vmflt_vf_f32m4_b8(v_max, 1e-7f, vl);
+
+        vfloat32m4_t v_127 = __riscv_vfmv_v_f_f32m4(127.0f, vl);
+        vfloat32m4_t v_qscale = __riscv_vfdiv_vv_f32m4(v_127, v_max, vl);
+        vfloat32m4_t v_dqscale = __riscv_vfdiv_vf_f32m4(v_max, 127.0f, vl);
+
+        v_qscale = __riscv_vfmerge_vfm_f32m4(v_qscale, 1.0f, mask, vl);
+        v_dqscale = __riscv_vfmerge_vfm_f32m4(v_dqscale, 1.0f, mask, vl);
+
+        __riscv_vse32_v_f32m4(quant_scale + i, v_qscale, vl);
+        __riscv_vse32_v_f32m4(dequant_scale + i, v_dqscale, vl);
+    }
+}
diff --git a/source/backend/cpu/riscv/rvv/generalIm2col.cpp b/source/backend/cpu/riscv/rvv/generalIm2col.cpp
@@ -0,0 +1,62 @@
+#include <riscv_vector.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <algorithm>
+
+void generalIm2col_RVV(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el, int LP,
+                       int pack) {
+    int number = info[0];
+    int eReal = info[1];
+    int eDest = info[2];
+    int offset = info[3];
+
+    int eReal_pack = eReal * pack;
+    int eDest_LP = eDest * LP;
+
+    for (int n = 0; n < number; ++n) {
+        int e = el[4 * n + 0];
+        int l = el[4 * n + 1];
+        int eOffset = el[4 * n + 2];
+        int lOffset = el[4 * n + 3];
+
+        auto dest = destOrigin + eOffset * LP;
+        auto source = sourceGroup[n];
+
+        for (int y = 0; y < e; ++y) {
+            auto yR = y % eDest;
+            float* dst_y = dest + yR * LP;
+            const float* src_y = source + y * pack * offset;
+
+            int x_total = lOffset;
+            int xC = 0;
+
+            for (int x = 0; x < l; x += pack) {
+                int current_pack = std::min(pack, l - x);
+
+                int xOut = x_total / LP;
+                int xIn = x_total % LP;
+
+                const float* s_ptr = src_y + xC * eReal_pack;
+                float* d_ptr = dst_y + xOut * eDest_LP + xIn;
+
+                // Note: This RVV path is currently a placeholder.
+                // Vectorization along this dimension is limited when LP=1 and pack=4,
+                // causing it to fall through to the scalar loop below.
+                // For now, we use a scalar loop to handle this case.
+                // In the future, we can explore more efficient vectorization strategies.
+                if (xIn + current_pack <= LP) {
+                    size_t vl = __riscv_vsetvl_e32m1(current_pack);
+                    vfloat32m1_t v_val = __riscv_vle32_v_f32m1(s_ptr, vl);
+                    __riscv_vse32_v_f32m1(d_ptr, v_val, vl);
+                } else {
+                    for (int i = 0; i < current_pack; ++i) {
+                        int temp_x = x_total + i;
+                        dst_y[(temp_x / LP) * eDest_LP + (temp_x % LP)] = s_ptr[i];
+                    }
+                }
+                xC++;
+                x_total += current_pack;
+            }
+        }
+    }
+}