Merge pull request #2258 from Niadb/dev

Added STATIC_BMI2 for compile time detection of BMI2 on MSVC, when enabled various intrinsics are used
This commit is contained in:
Yann Collet 2020-08-04 09:43:59 -07:00 committed by GitHub
commit 38e38546a4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 47 additions and 15 deletions

View File

@ -17,7 +17,6 @@
#if defined (__cplusplus) #if defined (__cplusplus)
extern "C" { extern "C" {
#endif #endif
/* /*
* This API consists of small unitary functions, which must be inlined for best performance. * This API consists of small unitary functions, which must be inlined for best performance.
* Since link-time-optimization is not available for all compilers, * Since link-time-optimization is not available for all compilers,
@ -141,8 +140,12 @@ MEM_STATIC unsigned BIT_highbit32 (U32 val)
assert(val != 0); assert(val != 0);
{ {
# if defined(_MSC_VER) /* Visual */ # if defined(_MSC_VER) /* Visual */
unsigned long r=0; # if STATIC_BMI2 == 1
return _BitScanReverse ( &r, val ) ? (unsigned)r : 0; return _lzcnt_u32(val) ^ 31;
# else
unsigned long r = 0;
return _BitScanReverse(&r, val) ? (unsigned)r : 0;
# endif
# elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */ # elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */
return __builtin_clz (val) ^ 31; return __builtin_clz (val) ^ 31;
# elif defined(__ICCARM__) /* IAR Intrinsic */ # elif defined(__ICCARM__) /* IAR Intrinsic */
@ -317,12 +320,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si
return srcSize; return srcSize;
} }
MEM_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start) MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start)
{ {
return bitContainer >> start; return bitContainer >> start;
} }
MEM_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits)
{ {
U32 const regMask = sizeof(bitContainer)*8 - 1; U32 const regMask = sizeof(bitContainer)*8 - 1;
/* if start > regMask, bitstream is corrupted, and result is undefined */ /* if start > regMask, bitstream is corrupted, and result is undefined */
@ -330,10 +333,14 @@ MEM_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 co
return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; return (bitContainer >> (start & regMask)) & BIT_mask[nbBits];
} }
MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
{ {
#if STATIC_BMI2
return _bzhi_u64(bitContainer, nbBits);
#else
assert(nbBits < BIT_MASK_SIZE); assert(nbBits < BIT_MASK_SIZE);
return bitContainer & BIT_mask[nbBits]; return bitContainer & BIT_mask[nbBits];
#endif
} }
/*! BIT_lookBits() : /*! BIT_lookBits() :
@ -342,7 +349,7 @@ MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits)
* On 32-bits, maxNbBits==24. * On 32-bits, maxNbBits==24.
* On 64-bits, maxNbBits==56. * On 64-bits, maxNbBits==56.
* @return : value extracted */ * @return : value extracted */
MEM_STATIC size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits)
{ {
/* arbitrate between double-shift and shift+mask */ /* arbitrate between double-shift and shift+mask */
#if 1 #if 1
@ -365,7 +372,7 @@ MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits)
return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask);
} }
MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
{ {
bitD->bitsConsumed += nbBits; bitD->bitsConsumed += nbBits;
} }
@ -374,7 +381,7 @@ MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits)
* Read (consume) next n bits from local register and update. * Read (consume) next n bits from local register and update.
* Pay attention to not read more than nbBits contained into local register. * Pay attention to not read more than nbBits contained into local register.
* @return : extracted value. */ * @return : extracted value. */
MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits)
{ {
size_t const value = BIT_lookBits(bitD, nbBits); size_t const value = BIT_lookBits(bitD, nbBits);
BIT_skipBits(bitD, nbBits); BIT_skipBits(bitD, nbBits);

View File

@ -183,4 +183,17 @@
# pragma warning(disable : 4324) /* disable: C4324: padded structure */ # pragma warning(disable : 4324) /* disable: C4324: padded structure */
#endif #endif
/*Like DYNAMIC_BMI2 but for compile time determination of BMI2 support*/
#ifndef STATIC_BMI2
# if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))
# ifdef __AVX2__ //MSVC does not have a BMI2 specific flag, but every CPU that supports AVX2 also supports BMI2
# define STATIC_BMI2 1
# endif
# endif
#endif
#ifndef STATIC_BMI2
#define STATIC_BMI2 0
#endif
#endif /* ZSTD_COMPILER_H */ #endif /* ZSTD_COMPILER_H */

View File

@ -396,8 +396,12 @@ MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus
assert(val != 0); assert(val != 0);
{ {
# if defined(_MSC_VER) /* Visual */ # if defined(_MSC_VER) /* Visual */
# if STATIC_BMI2 == 1
return _lzcnt_u32(val)^31;
# else
unsigned long r=0; unsigned long r=0;
return _BitScanReverse(&r, val) ? (unsigned)r : 0; return _BitScanReverse(&r, val) ? (unsigned)r : 0;
# endif
# elif defined(__GNUC__) && (__GNUC__ >= 3) /* GCC Intrinsic */ # elif defined(__GNUC__) && (__GNUC__ >= 3) /* GCC Intrinsic */
return __builtin_clz (val) ^ 31; return __builtin_clz (val) ^ 31;
# elif defined(__ICCARM__) /* IAR Intrinsic */ # elif defined(__ICCARM__) /* IAR Intrinsic */

View File

@ -498,8 +498,12 @@ static unsigned ZSTD_NbCommonBytes (size_t val)
if (MEM_isLittleEndian()) { if (MEM_isLittleEndian()) {
if (MEM_64bits()) { if (MEM_64bits()) {
# if defined(_MSC_VER) && defined(_WIN64) # if defined(_MSC_VER) && defined(_WIN64)
# if STATIC_BMI2
return _tzcnt_u64(val) >> 3;
# else
unsigned long r = 0; unsigned long r = 0;
return _BitScanForward64( &r, (U64)val ) ? (unsigned)(r >> 3) : 0; return _BitScanForward64( &r, (U64)val ) ? (unsigned)(r >> 3) : 0;
# endif
# elif defined(__GNUC__) && (__GNUC__ >= 4) # elif defined(__GNUC__) && (__GNUC__ >= 4)
return (__builtin_ctzll((U64)val) >> 3); return (__builtin_ctzll((U64)val) >> 3);
# else # else
@ -530,8 +534,12 @@ static unsigned ZSTD_NbCommonBytes (size_t val)
} else { /* Big Endian CPU */ } else { /* Big Endian CPU */
if (MEM_64bits()) { if (MEM_64bits()) {
# if defined(_MSC_VER) && defined(_WIN64) # if defined(_MSC_VER) && defined(_WIN64)
# if STATIC_BMI2
return _lzcnt_u64(val) >> 3;
# else
unsigned long r = 0; unsigned long r = 0;
return _BitScanReverse64( &r, val ) ? (unsigned)(r >> 3) : 0; return _BitScanReverse64(&r, (U64)val) ? (unsigned)(r >> 3) : 0;
# endif
# elif defined(__GNUC__) && (__GNUC__ >= 4) # elif defined(__GNUC__) && (__GNUC__ >= 4)
return (__builtin_clzll(val) >> 3); return (__builtin_clzll(val) >> 3);
# else # else