Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions source/backend/cpu/compute/CommonOptFunction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,19 @@ extern void MNNSumByAxisLForMatmul_A_RVV(float* dest, int8_t* source, const floa
SumByAxisParams sumParams);
extern void MNNSumWeightInt8_RVV(float* kernelsum, int8_t* source, size_t outside, size_t reduceAxis, size_t hP,
size_t lP);
extern void generalIm2col_RVV(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el,
int LP, int pack);
extern void MNNDynamicUpdateConvBiasScale_RVV(float* newbias, float* oldbias, float* weightKernelSum, float* inputBias,
size_t ocQuad);
extern void MNNPackedMatMulFP32_RVV(float* C, const float* A, const float* B, const size_t* parameter,
const float* postParameters, const float* bias, const float* k, const float* b);
extern void MNNPackedMatMulRemainFP32_RVV(float* C, const float* A, const float* B, size_t eSize,
const size_t* parameter, const float* postParameters, const float* bias,
const float* k, const float* b);
extern void MNNPackForMatMul_B_RVV(float* destC, const float* sourceC, size_t h, size_t kernelsize, size_t ic,
bool transpose);
extern void MNNQuantScaleFP32_RVV(float* absmax, float* quant_scale, float* dequant_scale, size_t thread, size_t batch);
extern void MNNGetMatMulPackMode_RVV(int* eP, int* lP, int* hP);
#endif

#ifndef MNN_USE_SSE
Expand Down Expand Up @@ -4804,11 +4817,18 @@ void MNNCoreFunctionInit() {
gCoreFunction->MNNSumByAxisLForMatmul_A = MNNSumByAxisLForMatmul_A_RVV;
gCoreFunction->MNNReorderWeightInt4 = MNNReorderWeightInt4_RVV;
gCoreFunction->MNNSumWeightInt8 = MNNSumWeightInt8_RVV;
gCoreFunction->MNNPackedMatMul = MNNPackedMatMulFP32_RVV;
gCoreFunction->MNNPackedMatMulRemain = MNNPackedMatMulRemainFP32_RVV;
gCoreFunction->MNNPackForMatMul_B = MNNPackForMatMul_B_RVV;
gCoreFunction->MNNGetMatMulPackMode = MNNGetMatMulPackMode_RVV;
#ifdef MNN_LOW_MEMORY
gCoreFunction->MNNAbsMax = MNNAbsMaxFP32_RVV;
gCoreFunction->MNNDynamicQuant = MNNDynamicQuantFP32_RVV;
gCoreFunction->MNNAsyQuantFunc = MNNAsyQuantFunc_RVV;
gCoreFunction->MNNAsyQuantInfo = MNNAsyQuantInfo_FP32_RVV;
gCoreFunction->MNNGeneralIm2Col = generalIm2col_RVV;
gCoreFunction->MNNDynamicUpdateConvBiasScale = MNNDynamicUpdateConvBiasScale_RVV;
gCoreFunction->MNNQuantScale = MNNQuantScaleFP32_RVV;
#endif
}
#endif
Expand Down
21 changes: 21 additions & 0 deletions source/backend/cpu/riscv/rvv/MNNDynamicUpdateConvBiasScale.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#include <stdint.h>
#include <stddef.h>
#include <riscv_vector.h>

void MNNDynamicUpdateConvBiasScale_RVV(float* newbias, float* oldbias, float* weightKernelSum, float* inputBias,
size_t ocQuad) {
int ocUp4 = 4 * ocQuad;
float ib = inputBias[0];

size_t vl;
for (size_t i = 0; i < ocUp4; i += vl) {
vl = __riscv_vsetvl_e32m4(ocUp4 - i);

vfloat32m4_t v_old = __riscv_vle32_v_f32m4(oldbias + i, vl);
vfloat32m4_t v_wks = __riscv_vle32_v_f32m4(weightKernelSum + i, vl);

vfloat32m4_t v_new = __riscv_vfmacc_vf_f32m4(v_old, ib, v_wks, vl);

__riscv_vse32_v_f32m4(newbias + i, v_new, vl);
}
}
6 changes: 6 additions & 0 deletions source/backend/cpu/riscv/rvv/MNNGetMatMulPackMode.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#include <stdint.h>
void MNNGetMatMulPackMode_RVV(int* eP, int* lP, int* hP) {
*eP = 16;
*lP = 1;
*hP = 4;
}
68 changes: 68 additions & 0 deletions source/backend/cpu/riscv/rvv/MNNPackForMatMul_B.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#include <riscv_vector.h>
#include <cstring>
#include <stdint.h>
#include <stddef.h>

#define RVV_MATMUL_LP 1
#define RVV_MATMUL_HP 4

void MNNPackForMatMul_B_RVV(float* destC, const float* sourceC, size_t h, size_t kernelsize, size_t ic,
bool transpose) {
auto dest = (int32_t*)destC;
auto source = (int32_t*)sourceC;

int LP = RVV_MATMUL_LP;
int HP = RVV_MATMUL_HP;
auto l = kernelsize * ic;

size_t dest_size = ROUND_UP(h, HP) * ROUND_UP(ic, LP) * kernelsize * 4;
memset(dest, 0, dest_size);

auto stride0 = kernelsize * ROUND_UP(ic, LP) * HP;
auto stride1 = ROUND_UP(ic, LP) * HP;
auto stride2 = HP * LP;

auto srcStride0 = l;
auto srcStride1 = 1;
if (!transpose) {
srcStride0 = 1;
srcStride1 = h;
}

size_t h_blocks = ROUND_UP(h, HP) / HP;

for (size_t yHu = 0; yHu < h_blocks; ++yHu) {
size_t y_start = yHu * HP;
size_t y_end = (y_start + HP < h) ? (y_start + HP) : h;
size_t y_len = y_end - y_start;

if (y_len == 0)
break;

for (size_t k = 0; k < kernelsize; ++k) {
for (size_t x = 0; x < ic; ++x) {
auto xLu = x / LP;
auto xLp = x % LP;

int32_t* dst_ptr = dest + yHu * stride0 + k * stride1 + xLu * stride2 + xLp;

size_t l_idx = x + k * ic;
const int32_t* src_ptr = source + y_start * srcStride0 + l_idx * srcStride1;

size_t vl;
for (size_t yHp = 0; yHp < y_len; yHp += vl) {
vl = __riscv_vsetvl_e32m1(y_len - yHp);

vint32m1_t v_src;
if (!transpose) {
v_src = __riscv_vle32_v_i32m1(src_ptr + yHp, vl);
} else {
v_src = __riscv_vlse32_v_i32m1(src_ptr + yHp * l, l * sizeof(int32_t), vl);
}

__riscv_vse32_v_i32m1(dst_ptr + yHp, v_src, vl);
}
}
}
}
}
8 changes: 8 additions & 0 deletions source/backend/cpu/riscv/rvv/MNNPackedMatMulFP32.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#include <stddef.h>

void MNNPackedMatMulRemainFP32_RVV(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter,
const float* postParameters, const float* bias, const float* k, const float* b);
void MNNPackedMatMulFP32_RVV(float* C, const float* A, const float* B, const size_t* parameter,
const float* postParameters, const float* bias, const float* k, const float* b) {
MNNPackedMatMulRemainFP32_RVV(C, A, B, 16, parameter, postParameters, bias, k, b);
}
69 changes: 69 additions & 0 deletions source/backend/cpu/riscv/rvv/MNNPackedMatMulRemainFP32.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#include <riscv_vector.h>
#include <algorithm>
#include <limits>
#include <stddef.h>
void MNNPackedMatMulRemainFP32_RVV(float* C, const float* A, const float* B, size_t eSize, const size_t* parameter,
const float* postParameters, const float* bias, const float* k, const float* b) {
if (eSize == 0)
return;

size_t aStride = parameter[0] / sizeof(float);
size_t l = parameter[1];
size_t h = parameter[2];
size_t cStride = parameter[3] / sizeof(float);
size_t bExtraStride = parameter[5] / sizeof(float);
size_t bStride = bExtraStride + l * 4;

size_t hC4 = UP_DIV(h, 4);

float minValue = -std::numeric_limits<float>::max();
float maxValue = std::numeric_limits<float>::max();
if (postParameters != nullptr) {
minValue = postParameters[2];
maxValue = postParameters[3];
}

size_t vl = __riscv_vsetvl_e32m4(eSize);
MNN_ASSERT(vl >= eSize);

for (size_t y = 0; y < hC4; ++y) {
float* c_base = C + y * cStride;
const float* b_base = B + y * bStride;
const float* bias_y = bias ? bias + 4 * y : nullptr;

vfloat32m4_t acc0, acc1, acc2, acc3;
if (bias_y) {
acc0 = __riscv_vfmv_v_f_f32m4(bias_y[0], vl);
acc1 = __riscv_vfmv_v_f_f32m4(bias_y[1], vl);
acc2 = __riscv_vfmv_v_f_f32m4(bias_y[2], vl);
acc3 = __riscv_vfmv_v_f_f32m4(bias_y[3], vl);
} else {
acc0 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
acc1 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
acc2 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
acc3 = __riscv_vfmv_v_f_f32m4(0.0f, vl);
}

for (size_t z = 0; z < l; ++z) {
vfloat32m4_t a_vec = __riscv_vle32_v_f32m4(A + z * aStride, vl);
const float* w_ptr = b_base + z * 4;

acc0 = __riscv_vfmacc_vf_f32m4(acc0, w_ptr[0], a_vec, vl);
acc1 = __riscv_vfmacc_vf_f32m4(acc1, w_ptr[1], a_vec, vl);
acc2 = __riscv_vfmacc_vf_f32m4(acc2, w_ptr[2], a_vec, vl);
acc3 = __riscv_vfmacc_vf_f32m4(acc3, w_ptr[3], a_vec, vl);
}

acc0 = __riscv_vfmin_vf_f32m4(__riscv_vfmax_vf_f32m4(acc0, minValue, vl), maxValue, vl);
acc1 = __riscv_vfmin_vf_f32m4(__riscv_vfmax_vf_f32m4(acc1, minValue, vl), maxValue, vl);
acc2 = __riscv_vfmin_vf_f32m4(__riscv_vfmax_vf_f32m4(acc2, minValue, vl), maxValue, vl);
acc3 = __riscv_vfmin_vf_f32m4(__riscv_vfmax_vf_f32m4(acc3, minValue, vl), maxValue, vl);

ptrdiff_t stride = 4 * sizeof(float);

__riscv_vsse32_v_f32m4(c_base + 0, stride, acc0, vl);
__riscv_vsse32_v_f32m4(c_base + 1, stride, acc1, vl);
__riscv_vsse32_v_f32m4(c_base + 2, stride, acc2, vl);
__riscv_vsse32_v_f32m4(c_base + 3, stride, acc3, vl);
}
}
29 changes: 29 additions & 0 deletions source/backend/cpu/riscv/rvv/MNNQuantScaleFP32.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#include <stdint.h>
#include <stddef.h>
#include <riscv_vector.h>

void MNNQuantScaleFP32_RVV(float* absmax, float* quant_scale, float* dequant_scale, size_t thread, size_t batch) {
size_t vl;
for (size_t i = 0; i < batch; i += vl) {
vl = __riscv_vsetvl_e32m4(batch - i);

vfloat32m4_t v_max = __riscv_vfmv_v_f_f32m4(0.0f, vl);

for (size_t t = 0; t < thread; ++t) {
vfloat32m4_t v_val = __riscv_vle32_v_f32m4(absmax + t * batch + i, vl);
v_max = __riscv_vfmax_vv_f32m4(v_max, v_val, vl);
}

vbool8_t mask = __riscv_vmflt_vf_f32m4_b8(v_max, 1e-7f, vl);

vfloat32m4_t v_127 = __riscv_vfmv_v_f_f32m4(127.0f, vl);
vfloat32m4_t v_qscale = __riscv_vfdiv_vv_f32m4(v_127, v_max, vl);
vfloat32m4_t v_dqscale = __riscv_vfdiv_vf_f32m4(v_max, 127.0f, vl);

v_qscale = __riscv_vfmerge_vfm_f32m4(v_qscale, 1.0f, mask, vl);
v_dqscale = __riscv_vfmerge_vfm_f32m4(v_dqscale, 1.0f, mask, vl);

__riscv_vse32_v_f32m4(quant_scale + i, v_qscale, vl);
__riscv_vse32_v_f32m4(dequant_scale + i, v_dqscale, vl);
}
}
62 changes: 62 additions & 0 deletions source/backend/cpu/riscv/rvv/generalIm2col.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
#include <riscv_vector.h>
#include <stdint.h>
#include <stddef.h>
#include <algorithm>

void generalIm2col_RVV(float* destOrigin, float const** sourceGroup, const int32_t* info, const int32_t* el, int LP,
int pack) {
int number = info[0];
int eReal = info[1];
int eDest = info[2];
int offset = info[3];

int eReal_pack = eReal * pack;
int eDest_LP = eDest * LP;

for (int n = 0; n < number; ++n) {
int e = el[4 * n + 0];
int l = el[4 * n + 1];
int eOffset = el[4 * n + 2];
int lOffset = el[4 * n + 3];

auto dest = destOrigin + eOffset * LP;
auto source = sourceGroup[n];

for (int y = 0; y < e; ++y) {
auto yR = y % eDest;
float* dst_y = dest + yR * LP;
const float* src_y = source + y * pack * offset;

int x_total = lOffset;
int xC = 0;

for (int x = 0; x < l; x += pack) {
int current_pack = std::min(pack, l - x);

int xOut = x_total / LP;
int xIn = x_total % LP;

const float* s_ptr = src_y + xC * eReal_pack;
float* d_ptr = dst_y + xOut * eDest_LP + xIn;

// Note: This RVV path is currently a placeholder.
// Vectorization along this dimension is limited when LP=1 and pack=4,
// causing it to fall through to the scalar loop below.
// For now, we use a scalar loop to handle this case.
// In the future, we can explore more efficient vectorization strategies.
if (xIn + current_pack <= LP) {
size_t vl = __riscv_vsetvl_e32m1(current_pack);
vfloat32m1_t v_val = __riscv_vle32_v_f32m1(s_ptr, vl);
__riscv_vse32_v_f32m1(d_ptr, v_val, vl);
} else {
for (int i = 0; i < current_pack; ++i) {
int temp_x = x_total + i;
dst_y[(temp_x / LP) * eDest_LP + (temp_x % LP)] = s_ptr[i];
}
}
xC++;
x_total += current_pack;
}
}
}
}