From c9d2cbd5bacf493362aa9a81057938a11961369a Mon Sep 17 00:00:00 2001
From: w1m024
Date: Tue, 9 Sep 2025 06:20:55 +0000
Subject: [PATCH 1/2] add RVV optimization for ZSTD_row_getMatchMask

Co-authored-by: gong-flying
---
 lib/compress/zstd_lazy.c | 42 +++++++++++++++++++++++++++++++++++++---
 1 file changed, 39 insertions(+), 3 deletions(-)

diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index 272ebe0ec..ca8fb4194 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -1050,6 +1050,36 @@ ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag
     }
 }
 #endif
+#if defined(ZSTD_ARCH_RISCV_RVV) && (__riscv_xlen == 64)
+FORCE_INLINE_TEMPLATE ZSTD_VecMask
+ZSTD_row_getRVVMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 head)
+{
+    U16 matches[4] = {0};
+    U32 i;
+    assert(rowEntries == 16 || rowEntries == 32 || rowEntries == 64);
+
+    size_t vl = __riscv_vsetvl_e8m1(16);
+
+    for (i = 0; i < rowEntries / 16; i++) {
+        vuint8m1_t chunk = __riscv_vle8_v_u8m1((const uint8_t*)(src + 16 * i), vl);
+        vbool8_t equalMask = __riscv_vmseq_vx_u8m1_b8(chunk, tag, vl);
+
+        size_t vl_w = __riscv_vsetvl_e16m2(16);
+        vuint16m2_t one = __riscv_vmv_v_x_u16m2(1, vl_w);
+        vuint16m2_t indices = __riscv_vid_v_u16m2(vl_w);
+        vuint16m2_t powers_of_2 = __riscv_vsll_vv_u16m2(one, indices, vl_w);
+        vuint16m2_t zero = __riscv_vmv_v_x_u16m2(0, vl_w);
+        vuint16m2_t selected_bits = __riscv_vmerge_vvm_u16m2(zero, powers_of_2, equalMask, vl_w);
+        vuint16m1_t reduction = __riscv_vredor_vs_u16m2_u16m1(selected_bits, __riscv_vmv_s_x_u16m1(0, vl_w), vl_w);
+        matches[i] = __riscv_vmv_x_s_u16m1_u16(reduction);
+    }
+
+    if (rowEntries == 16) return ZSTD_rotateRight_U16(matches[0], head);
+    if (rowEntries == 32) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head);
+    assert(rowEntries == 64);
+    return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);
+}
+#endif
 
 /* Returns a ZSTD_VecMask (U64) that has the nth group (determined by
  * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag"
@@ -1069,14 +1099,20 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGr
         return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped);
 
-#else /* SW or NEON-LE */
+#elif defined(ZSTD_ARCH_RISCV_RVV) && (__riscv_xlen == 64)
 
-# if defined(ZSTD_ARCH_ARM_NEON)
+        return ZSTD_row_getRVVMask(rowEntries, src, tag, headGrouped);
+
+#else
+
+#if defined(ZSTD_ARCH_ARM_NEON)
         /* This NEON path only works for little endian - otherwise use SWAR below */
         if (MEM_isLittleEndian()) {
             return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped);
         }
-# endif /* ZSTD_ARCH_ARM_NEON */
+
+
+#endif
 
         /* SWAR */
         {   const int chunkSize = sizeof(size_t);
            const size_t shiftAmount = ((chunkSize * 8) - chunkSize);
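Note on the first patch: ZSTD_row_getRVVMask builds the row match mask 16 bytes at a time. For each chunk, vmseq.vx compares the tag bytes against `tag`, the resulting mask selects per-lane powers of two (vid.v + vsll.vv + vmerge), and a vredor.vs OR-reduction collapses them into a 16-bit bitmap; the chunk bitmaps are concatenated so that bit i of the final value corresponds to row entry i, and the result is rotated right by `head` before returning, matching the other vectorized helpers. A minimal plain-C model of that bit layout (not part of the patch; file and function names are illustrative):

    /* scalar_row_mask.c - illustrative scalar model of the mask layout that
     * ZSTD_row_getRVVMask produces in PATCH 1/2; not zstd code. */
    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t scalar_row_mask(const uint8_t* src, uint8_t tag, unsigned rowEntries)
    {
        uint64_t mask = 0;
        unsigned chunk, lane;
        assert(rowEntries == 16 || rowEntries == 32 || rowEntries == 64);
        for (chunk = 0; chunk < rowEntries / 16; chunk++) {
            uint16_t chunkMask = 0;                       /* plays the role of matches[chunk] */
            for (lane = 0; lane < 16; lane++) {
                if (src[chunk * 16 + lane] == tag)
                    chunkMask |= (uint16_t)(1u << lane);  /* lane index becomes the bit position */
            }
            mask |= (uint64_t)chunkMask << (16 * chunk);  /* concatenate chunks, lowest chunk first */
        }
        return mask;  /* the patch then rotates this right by `head` */
    }

    int main(void)
    {
        uint8_t row[16] = { 9, 0, 0, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 9 };
        /* matches at entries 0, 3 and 15 -> bits 0, 3, 15 -> 0x8009 */
        printf("mask = 0x%04x\n", (unsigned)scalar_row_mask(row, 9, 16));
        return 0;
    }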
From fb7a86f20f7b3825ccf2a77153d4fb104733b2f4 Mon Sep 17 00:00:00 2001
From: w1m024
Date: Thu, 11 Sep 2025 20:42:40 +0000
Subject: [PATCH 2/2] Refactor ZSTD_row_getMatchMask for RVV optimization

Performance (vs. SWAR):
- 16-byte rows: 5.87x speedup
- 32-byte rows: 9.63x speedup
- 64-byte rows: 17.98x speedup

Co-authored-by: gong-flying
---
 lib/compress/zstd_lazy.c | 44 +++++++++++++++++++++++-----------------
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index ca8fb4194..f5efa8d8a 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -1054,30 +1054,32 @@ ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag
 FORCE_INLINE_TEMPLATE ZSTD_VecMask
 ZSTD_row_getRVVMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 head)
 {
-    U16 matches[4] = {0};
-    U32 i;
-    assert(rowEntries == 16 || rowEntries == 32 || rowEntries == 64);
+    ZSTD_VecMask matches;
+    size_t vl;
 
-    size_t vl = __riscv_vsetvl_e8m1(16);
+    if (rowEntries == 16) {
+        vl = __riscv_vsetvl_e8m1(16);
+        vuint8m1_t chunk = __riscv_vle8_v_u8m1(src, vl);
+        vbool8_t mask = __riscv_vmseq_vx_u8m1_b8(chunk, tag, vl);
+        vuint16m1_t mask_u16 = __riscv_vreinterpret_v_b8_u16m1(mask);
+        matches = __riscv_vmv_x_s_u16m1_u16(mask_u16);
+        return ZSTD_rotateRight_U16((U16)matches, head);
 
-    for (i = 0; i < rowEntries / 16; i++) {
-        vuint8m1_t chunk = __riscv_vle8_v_u8m1((const uint8_t*)(src + 16 * i), vl);
-        vbool8_t equalMask = __riscv_vmseq_vx_u8m1_b8(chunk, tag, vl);
-
-        size_t vl_w = __riscv_vsetvl_e16m2(16);
-        vuint16m2_t one = __riscv_vmv_v_x_u16m2(1, vl_w);
-        vuint16m2_t indices = __riscv_vid_v_u16m2(vl_w);
-        vuint16m2_t powers_of_2 = __riscv_vsll_vv_u16m2(one, indices, vl_w);
-        vuint16m2_t zero = __riscv_vmv_v_x_u16m2(0, vl_w);
-        vuint16m2_t selected_bits = __riscv_vmerge_vvm_u16m2(zero, powers_of_2, equalMask, vl_w);
-        vuint16m1_t reduction = __riscv_vredor_vs_u16m2_u16m1(selected_bits, __riscv_vmv_s_x_u16m1(0, vl_w), vl_w);
-        matches[i] = __riscv_vmv_x_s_u16m1_u16(reduction);
+    } else if (rowEntries == 32) {
+        vl = __riscv_vsetvl_e8m2(32);
+        vuint8m2_t chunk = __riscv_vle8_v_u8m2(src, vl);
+        vbool4_t mask = __riscv_vmseq_vx_u8m2_b4(chunk, tag, vl);
+        vuint32m1_t mask_u32 = __riscv_vreinterpret_v_b4_u32m1(mask);
+        matches = __riscv_vmv_x_s_u32m1_u32(mask_u32);
+        return ZSTD_rotateRight_U32((U32)matches, head);
+    } else { /* rowEntries == 64 */
+        vl = __riscv_vsetvl_e8m4(64);
+        vuint8m4_t chunk = __riscv_vle8_v_u8m4(src, vl);
+        vbool2_t mask = __riscv_vmseq_vx_u8m4_b2(chunk, tag, vl);
+        vuint64m1_t mask_u64 = __riscv_vreinterpret_v_b2_u64m1(mask);
+        matches = __riscv_vmv_x_s_u64m1_u64(mask_u64);
+        return ZSTD_rotateRight_U64(matches, head);
     }
-
-    if (rowEntries == 16) return ZSTD_rotateRight_U16(matches[0], head);
-    if (rowEntries == 32) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head);
-    assert(rowEntries == 64);
-    return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head);
 }
 #endif
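The refactor in the second patch replaces the per-chunk vid/vsll/vmerge/vredor reduction with a direct reinterpretation of the vmseq mask register as an integer vector, so the 16-, 32- or 64-bit match mask is read out with a single vmv.x.s per row; the commit-message numbers compare this path against the scalar SWAR fallback. A rough standalone check of the 16-entry case, assuming a toolchain that provides the v1.0 RVV C intrinsics (<riscv_vector.h>) spelled as in the patch and a machine or QEMU with the V extension; the file name, helper names and build line are illustrative, not part of zstd:

    /* rvv_mask_check.c - compare the 16-entry mask extraction from PATCH 2/2
     * against a scalar reference. Example build: gcc -O2 -march=rv64gcv rvv_mask_check.c */
    #include <riscv_vector.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint16_t rvv_mask16(const uint8_t* src, uint8_t tag)
    {
        /* Same intrinsic sequence as the rowEntries == 16 branch in the patch. */
        size_t vl = __riscv_vsetvl_e8m1(16);
        vuint8m1_t chunk = __riscv_vle8_v_u8m1(src, vl);
        vbool8_t mask = __riscv_vmseq_vx_u8m1_b8(chunk, tag, vl);
        vuint16m1_t mask_u16 = __riscv_vreinterpret_v_b8_u16m1(mask);
        return __riscv_vmv_x_s_u16m1_u16(mask_u16);
    }

    static uint16_t scalar_mask16(const uint8_t* src, uint8_t tag)
    {
        uint16_t m = 0;
        int i;
        for (i = 0; i < 16; i++)
            if (src[i] == tag) m |= (uint16_t)(1u << i);
        return m;
    }

    int main(void)
    {
        uint8_t row[16] = { 7, 0, 7, 1, 2, 7, 3, 4, 5, 6, 7, 8, 9, 10, 11, 7 };
        uint16_t a = rvv_mask16(row, 7);
        uint16_t b = scalar_mask16(row, 7);
        printf("rvv=0x%04x scalar=0x%04x %s\n", (unsigned)a, (unsigned)b, a == b ? "OK" : "MISMATCH");
        return a != b;
    }

The 32- and 64-entry branches can be checked the same way with the e8m2/e8m4 intrinsics from the patch.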