RVV指令集加速运行效率，比如模型预处理后处理-次世代BUG池

版权声明：本文为 neucrack 的原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接及本声明。
原文链接(持续更新）：https://neucrack.com/p/551

资源

RVV 指令集文档（不同版本之间互不兼容，需要注意）：

玄铁c906 用的 RVV0.7.1

使用场景

RVV 即RV 矢量加速指令集，可以加速矢量运算或者数据并行运算。
最简单的应用场景就是将计算批量化，因为一个指令可以计算多个数据，类似 openmp 多核并行运行一样，最简单的场景就是加速 for 循环减少计算指令数量来达到加速效果，甚至可以优化内存拷贝比如 hwc 转 chw 内存拷贝可以使用RVV批量拷贝比for循环快。
只要指令集支持的运算就可以加速，比如常见的加减乘除和逻辑运算等，具体需要看芯片支持的RVV指令集文档（注意要对应版本，比如 RVV 0.7.1 和 RVV 1.0.0 就是不兼容的，以及芯片是否支持）。

如果要加速没有直接指令的算法，可以基于基本指令进行运算，比如ncnn 中的tanh 函数的实现。

例子

#if __riscv_vector
        int n = size;
        while (n > 0)
        {
            size_t vl = vsetvl_e32m8(n);
            vfloat32m8_t _p = vle32_v_f32m8(ptr, vl);
            _p = tanh_ps(_p, vl);
            vse32_v_f32m8(ptr, _p, vl);
            ptr += vl;
            n -= vl;
        }
#else  // __riscv_vector
        for (int i = 0; i < size; i++)
        {
            *ptr = tanh(*ptr);
            ptr++;
        }
#endif // __riscv_vector

vsetvl_e32m8 获取处理 32 位数据（元素宽度 SEW）并且 8 倍寄存器宽度倍数（LMUL）的数据长度，得到一次性能处理的 32 位数据个数。如果硬件矢量寄存器的长度（VLEN）为 128，则这里 vl 为 min(n, VLEN / SEW * LMUL)，即 min(n, 128 / 32 * 8) = min(n, 32) 个 32 位数据。如果是 vsetvl_e16m1，那就是获取一次能处理的 16 位数据并且 1 倍寄存器宽度的长度，即 128 / 16 * 1 = 8 个 16 位宽数据。
这里寄存器宽度倍率使用1 2 4 8 的决定方式为你的数据长度，比如有 n = 18 个 32位数据，我是先vle32_v_f32m4(ptr, 16) 再调用 vle32_v_f32m1(ptr+16, 2) 好，还是直接调用 vle32_v_f32m8(ptr, 18) 好，比较两者效率：一次性加载只需要一次指令，比分开多次指令好，虽然 vle32_v_f32m8 支持最大 32 个32位数据计算，指定18个硬件会自动调整有效长度（VL）为18,即只会计算这18个元素，但是仍然会分配32个元素的寄存器空间，所以从运算速度上来说分配更多寄存器减少指令更好，只是会有空闲的寄存器被分配。

vle32_v_f32m8 将数据提取 vl 个存到矢量寄存器中得到变量 _p，然后调用 tanh_ps 进行批量计算，并且通过 vse32_v_f32m8 将结果拷贝回 ptr 内存区域，即覆盖原值。

相比用CPU循环一个一个计算，使用RVV矢量运算效率会更高。

用RVV加速模型输入预处理(x-mean)*scale

对于标准化输入的代码就是将输入图像所有像素的值减 mean 再乘以scale 或者除以std，这个步骤在不同平台加速方法不一样，比如多核可以用Openmp并行运行, arm 可以SIMD指令加速等，对于RISCV可以用RVV加速，实测在 SG2002(c906)上从 9~14ms 变成 1~2ms，速度提升明显。

// Scalar reference: each pixel supplies three consecutive input values; every
// value is normalised as (x - mean[c]) * scales[c] and written through the
// output pointer of its own channel.
for (int i = 0; i < img_h * img_w; ++i, p += 3)
{
    *ptr_ch0++ = ((float)p[0] - mean[0]) * scales[0];
    *ptr_ch1++ = ((float)p[1] - mean[1]) * scales[1];
    *ptr_ch2++ = ((float)p[2] - mean[2]) * scales[2];
}

RVV加速代码，核心思想就是用RVV指令进行批量运算：

/**
 * Normalise an interleaved 3-channel 8-bit image with RVV and store the
 * result as three consecutive int8 planes.
 *
 * For pixel i and channel c the stored value is
 *     (img_data[3 * i + c] - mean[c]) * scale[c]
 * converted from float to a signed integer and narrowed to int8_t.
 *
 * @param img_data  Interleaved input, img_h * img_w pixels, 3 bytes each.
 * @param output    Destination of 3 * img_h * img_w bytes; channel c starts
 *                  at output + c * img_h * img_w.
 * @param img_h     Image height in pixels.
 * @param img_w     Image width in pixels.
 * @param mean      Per-channel value subtracted from each input sample.
 * @param scale     Per-channel factor applied after the subtraction.
 *
 * Returns without touching output when either dimension is not positive.
 */
static inline void process_image_rvv(const uint8_t *img_data, int8_t *output, int img_h, int img_w, const float mean[3], const float scale[3]) {
    // An empty or negative-sized image has nothing to process; bailing out
    // here also keeps a negative dimension from turning into a huge size_t.
    if (img_h <= 0 || img_w <= 0) {
        return;
    }
    // Widen before multiplying so the pixel count cannot overflow int.
    const size_t total_pixels = (size_t)img_h * (size_t)img_w;

    const uint8_t *p = img_data;
    int8_t *ptr_ch0 = output;
    int8_t *ptr_ch1 = ptr_ch0 + total_pixels;
    int8_t *ptr_ch2 = ptr_ch1 + total_pixels;

    // Largest element count for 8-bit elements at LMUL = 2. The same count is
    // reused for the e16/m4 and e32/m8 operations below.
    size_t vl = vsetvlmax_e8m2();
    // All-zero shift amounts for the i32 -> i16 narrowing: no bits are
    // shifted out, the value is only narrowed.
    vuint16m4_t v_zero = vmv_v_x_u16m4(0, vl);

    for (size_t n = total_pixels; n > 0;) {
        // The final pass may be shorter than a full vector.
        if (vl > n) vl = n;
        n -= vl;

        // Step 1: gather each channel from the interleaved input with a
        // strided load (stride = 3 bytes, i.e. one pixel).
        vuint8m2_t v_r_u8 = vlse8_v_u8m2(p, 3, vl);
        vuint8m2_t v_g_u8 = vlse8_v_u8m2(p + 1, 3, vl);
        vuint8m2_t v_b_u8 = vlse8_v_u8m2(p + 2, 3, vl);

        // Step 2: widen u8 -> u16 -> u32 so the values can be converted to
        // 32-bit floats.
        vuint16m4_t v_r_u16 = vwcvtu_x_x_v_u16m4(v_r_u8, vl);
        vuint16m4_t v_g_u16 = vwcvtu_x_x_v_u16m4(v_g_u8, vl);
        vuint16m4_t v_b_u16 = vwcvtu_x_x_v_u16m4(v_b_u8, vl);
        vuint32m8_t v_r_u32 = vwcvtu_x_x_v_u32m8(v_r_u16, vl);
        vuint32m8_t v_g_u32 = vwcvtu_x_x_v_u32m8(v_g_u16, vl);
        vuint32m8_t v_b_u32 = vwcvtu_x_x_v_u32m8(v_b_u16, vl);

        // Step 3: convert u32 -> f32.
        vfloat32m8_t v_r_f32 = vfcvt_f_xu_v_f32m8(v_r_u32, vl);
        vfloat32m8_t v_g_f32 = vfcvt_f_xu_v_f32m8(v_g_u32, vl);
        vfloat32m8_t v_b_f32 = vfcvt_f_xu_v_f32m8(v_b_u32, vl);

        // Step 4: apply (x - mean) * scale per channel.
        v_r_f32 = vfmul_vf_f32m8(vfsub_vf_f32m8(v_r_f32, mean[0], vl), scale[0], vl);
        v_g_f32 = vfmul_vf_f32m8(vfsub_vf_f32m8(v_g_f32, mean[1], vl), scale[1], vl);
        v_b_f32 = vfmul_vf_f32m8(vfsub_vf_f32m8(v_b_f32, mean[2], vl), scale[2], vl);

        // Step 5: convert f32 -> signed i32, then narrow i32 -> i16 -> i8.
        // NOTE(review): per the RVV spec vnclip saturates when narrowing and
        // vfcvt rounds using the current FP rounding mode - confirm on target.
        vint32m8_t v_r_i32 = vfcvt_x_f_v_i32m8(v_r_f32, vl);
        vint32m8_t v_g_i32 = vfcvt_x_f_v_i32m8(v_g_f32, vl);
        vint32m8_t v_b_i32 = vfcvt_x_f_v_i32m8(v_b_f32, vl);
        vint16m4_t v_r_i16 = vnclip_wv_i16m4(v_r_i32, v_zero, vl);
        vint16m4_t v_g_i16 = vnclip_wv_i16m4(v_g_i32, v_zero, vl);
        vint16m4_t v_b_i16 = vnclip_wv_i16m4(v_b_i32, v_zero, vl);
        vint8m2_t v_r_i8 = vnclip_wx_i8m2(v_r_i16, 0, vl);
        vint8m2_t v_g_i8 = vnclip_wx_i8m2(v_g_i16, 0, vl);
        vint8m2_t v_b_i8 = vnclip_wx_i8m2(v_b_i16, 0, vl);

        // Step 6: store each channel contiguously into its own output plane.
        vse8_v_i8m2(ptr_ch0, v_r_i8, vl);
        vse8_v_i8m2(ptr_ch1, v_g_i8, vl);
        vse8_v_i8m2(ptr_ch2, v_b_i8, vl);

        // Step 7: advance past the vl pixels just handled (3 input bytes per
        // pixel, 1 output byte per plane).
        p += vl * 3;
        ptr_ch0 += vl;
        ptr_ch1 += vl;
        ptr_ch2 += vl;
    }
}

灰度图

static inline void process_image_gray_rvv(const uint8_t *img_data, int8_t *output, int img_h, int img_w, const float &mean, const float &scale) {
    size_t total_pixels = img_h * img_w;
    const uint8_t *p = img_data;
    int8_t *ptr_ch0 = output;
    size_t vl = vsetvlmax_e8m2();
    vuint16m4_t v_zero = vmv_v_x_u16m4(0, vl);
    for (size_t n = total_pixels; n > 0;) {
        if (vl > n) vl = n;
        n -= vl;
        vuint8m2_t v_r_u8 = vle8_v_u8m2(p, vl);
        vuint16m4_t v_r_u16 = vwcvtu_x_x_v_u16m4(v_r_u8, vl);
        vuint32m8_t v_r_u32 = vwcvtu_x_x_v_u32m8(v_r_u16, vl);
        vfloat32m8_t v_r_f32 = vfcvt_f_xu_v_f32m8(v_r_u32, vl); // Convert uint32 to float32
        v_r_f32 = vfmul_vf_f32m8(vfsub_vf_f32m8(v_r_f32, mean[0], vl), scale[0], vl);
        vint32m8_t v_r_i32 = vfcvt_x_f_v_i32m8(v_r_f32, vl);
        vint16m4_t v_r_i16 = vnclip_wv_i16m4(v_r_i32, v_zero, vl);
        vint8m2_t v_r_i8 = vnclip_wx_i8m2(v_r_i16, 0, vl);
        vse8_v_i8m2(ptr_ch0, v_r_i8, vl);
        p += vl;
        ptr_ch0 += vl;
    }
}