add RVV optimization for ZSTD_row_getMatchMask

Co-authored-by: gong-flying <gongxiaofei24@iscas.ac.cn>
This commit is contained in:
w1m024 2025-09-09 06:20:55 +00:00
parent 98d2b90e82
commit c9d2cbd5ba

View File

@ -1050,6 +1050,36 @@ ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag
}
}
#endif
#if defined(ZSTD_ARCH_RISCV_RVV) && (__riscv_xlen == 64)
FORCE_INLINE_TEMPLATE ZSTD_VecMask
ZSTD_row_getRVVMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head)
{
U16 matches[4] = {0};
int i;
assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4);
size_t vl = __riscv_vsetvl_e8m1(16);
for (i = 0; i < nbChunks; i++) {
vuint8m1_t chunk = __riscv_vle8_v_u8m1((const uint8_t*)(src + 16 * i), vl);
vbool8_t equalMask = __riscv_vmseq_vx_u8m1_b8(chunk, tag, vl);
size_t vl_w = __riscv_vsetvl_e16m2(16);
vuint16m2_t one = __riscv_vmv_v_x_u16m2(1, vl_w);
vuint16m2_t indices = __riscv_vid_v_u16m2(vl_w);
vuint16m2_t powers_of_2 = __riscv_vsll_vv_u16m2(one, indices, vl_w);
vuint16m2_t zero = __riscv_vmv_v_x_u16m2(0, vl_w);
vuint16m2_t selected_bits = __riscv_vmerge_vvm_u16m2(zero, powers_of_2, equalMask, vl_w);
vuint16m1_t reduction = __riscv_vredor_vs_u16m2_u16m1(selected_bits, __riscv_vmv_s_x_u16m1(0, vl_w), vl_w);
matches[i] = __riscv_vmv_x_s_u16m1_u16(reduction);
}
if (nbChunks == 1) return ZSTD_rotateRight_U16(matches[0], head);
if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head);
assert(nbChunks == 4);
return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);
}
#endif
/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
* ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
@ -1069,14 +1099,20 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGr
return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);
#else /* SW or NEON-LE */
#elif defined(ZSTD_ARCH_RISCV_RVV) && (__riscv_xlen == 64)
# if defined(ZSTD_ARCH_ARM_NEON)
return ZSTD_row_getRVVMask(rowEntries, src, tag, headGrouped);
#else
#if defined(ZSTD_ARCH_ARM_NEON)
/* This NEON path only works for little endian - otherwise use SWAR below */
if (MEM_isLittleEndian()) {
return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
}
# endif /* ZSTD_ARCH_ARM_NEON */
#endif
/* SWAR */
{ const int chunkSize = sizeof(size_t);
const size_t shiftAmount = ((chunkSize * 8) - chunkSize);