Merge pull request #4413 from arpadpanyik-arm/huf_decode2x

AArch64: Enhance struct access in Huffman decode 2X
This commit is contained in:
Yann Collet 2025-07-23 15:03:37 -08:00 committed by GitHub
commit 34f3a0ab11
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -785,19 +785,19 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
}
#endif
#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \
do { \
int const index = (int)(bits[(_stream)] >> 53); \
int const entry = (int)dtable[index]; \
bits[(_stream)] <<= (entry & 0x3F); \
op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \
#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \
do { \
U64 const index = bits[(_stream)] >> 53; \
U16 const entry = dtable[index]; \
bits[(_stream)] <<= entry & 0x3F; \
op[(_stream)][(_symbol)] = (BYTE)(entry >> 8); \
} while (0)
#define HUF_4X1_RELOAD_STREAM(_stream) \
#define HUF_5X1_RELOAD_STREAM(_stream) \
do { \
int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
int const nbBits = ctz & 7; \
int const nbBytes = ctz >> 3; \
U64 const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
U64 const nbBits = ctz & 7; \
U64 const nbBytes = ctz >> 3; \
op[(_stream)] += 5; \
ip[(_stream)] -= nbBytes; \
bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
@ -816,11 +816,11 @@ void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4);
/* Reload each of the 4 the bitstreams */
HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM);
HUF_4X_FOR_EACH_STREAM(HUF_5X1_RELOAD_STREAM);
} while (op[3] < olimit);
#undef HUF_4X1_DECODE_SYMBOL
#undef HUF_4X1_RELOAD_STREAM
#undef HUF_5X1_RELOAD_STREAM
}
_out:
@ -1603,57 +1603,65 @@ void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs*
}
#endif
#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \
do { \
if ((_decode3) || (_stream) != 3) { \
int const index = (int)(bits[(_stream)] >> 53); \
HUF_DEltX2 const entry = dtable[index]; \
MEM_write16(op[(_stream)], entry.sequence); \
bits[(_stream)] <<= (entry.nbBits) & 0x3F; \
op[(_stream)] += (entry.length); \
} \
#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \
do { \
if ((_decode3) || (_stream) != 3) { \
U64 const index = bits[(_stream)] >> 53; \
size_t const entry = MEM_readLE32(&dtable[index]); \
MEM_write16(op[(_stream)], (U16)entry); \
bits[(_stream)] <<= (entry >> 16) & 0x3F; \
op[(_stream)] += entry >> 24; \
} \
} while (0)
#define HUF_4X2_RELOAD_STREAM(_stream) \
#define HUF_5X2_RELOAD_STREAM(_stream, _decode3) \
do { \
HUF_4X2_DECODE_SYMBOL(3, 1); \
if (_decode3) HUF_4X2_DECODE_SYMBOL(3, 1); \
{ \
int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
int const nbBits = ctz & 7; \
int const nbBytes = ctz >> 3; \
U64 const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \
U64 const nbBits = ctz & 7; \
U64 const nbBytes = ctz >> 3; \
ip[(_stream)] -= nbBytes; \
bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \
bits[(_stream)] <<= nbBits; \
} \
} while (0)
#if defined(__aarch64__)
# define HUF_4X2_4WAY 1
#else
# define HUF_4X2_4WAY 0
#endif
#define HUF_4X2_3WAY !HUF_4X2_4WAY
/* Manually unroll the loop because compilers don't consistently
* unroll the inner loops, which destroys performance.
*/
do {
/* Decode 5 symbols from each of the first 3 streams.
* The final stream will be decoded during the reload phase
* to reduce register pressure.
/* Decode 5 symbols from each of the first 3 or 4 streams.
* In the 3-way case the final stream will be decoded during
* the reload phase to reduce register pressure.
*/
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0);
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY);
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY);
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY);
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY);
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, HUF_4X2_4WAY);
/* Decode one symbol from the final stream */
HUF_4X2_DECODE_SYMBOL(3, 1);
/* In the 3-way case decode one symbol from the final stream. */
HUF_4X2_DECODE_SYMBOL(3, HUF_4X2_3WAY);
/* Decode 4 symbols from the final stream & reload bitstreams.
* The final stream is reloaded last, meaning that all 5 symbols
* are decoded from the final stream before it is reloaded.
/* In the 3-way case decode 4 symbols from the final stream &
* reload bitstreams. The final stream is reloaded last, meaning
* that all 5 symbols are decoded from the final stream before
* it is reloaded.
*/
HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM);
HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_5X2_RELOAD_STREAM, HUF_4X2_3WAY);
} while (op[3] < olimit);
}
#undef HUF_4X2_DECODE_SYMBOL
#undef HUF_4X2_RELOAD_STREAM
#undef HUF_5X2_RELOAD_STREAM
_out: