From a95a88af57a9e20adc872483d71e667ad5c64762 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 13 Mar 2018 13:44:10 -0700 Subject: [PATCH 1/2] removed huf_compress_impl.h re-imported all functions inside huf_compress.c for easier source editing. Also updated a bunch of code comments for clarification. --- lib/common/fse.h | 2 +- lib/common/huf.h | 184 +++++++++++++++++-------------- lib/compress/fse_compress.c | 22 ++-- lib/compress/huf_compress.c | 172 +++++++++++++++++++++++------ lib/compress/huf_compress_impl.h | 120 -------------------- lib/compress/zstd_compress.c | 4 +- lib/decompress/huf_decompress.c | 25 +++-- 7 files changed, 272 insertions(+), 257 deletions(-) delete mode 100644 lib/compress/huf_compress_impl.h diff --git a/lib/common/fse.h b/lib/common/fse.h index afd780196..6a1d272be 100644 --- a/lib/common/fse.h +++ b/lib/common/fse.h @@ -345,7 +345,7 @@ size_t FSE_countFast(unsigned* count, unsigned* maxSymbolValuePtr, const void* s */ size_t FSE_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* workSpace); -/*! FSE_count_simple +/*! FSE_count_simple() : * Same as FSE_countFast(), but does not use any additional memory (not even on stack). * This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr` (presuming it's also the size of `count`). */ diff --git a/lib/common/huf.h b/lib/common/huf.h index 4e7eee0cb..b4645b4e5 100644 --- a/lib/common/huf.h +++ b/lib/common/huf.h @@ -58,31 +58,32 @@ extern "C" { #endif -/* *** simple functions *** */ -/** -HUF_compress() : - Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. - 'dst' buffer must be already allocated. - Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). - `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. - @return : size of compressed data (<= `dstCapacity`). - Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! - if HUF_isError(return), compression failed (more details using HUF_getErrorName()) -*/ +/* ========================== */ +/* *** simple functions *** */ +/* ========================== */ + +/** HUF_compress() : + * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. + * 'dst' buffer must be already allocated. + * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). + * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. + * @return : size of compressed data (<= `dstCapacity`). + * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! + * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) + */ HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, const void* src, size_t srcSize); -/** -HUF_decompress() : - Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', - into already allocated buffer 'dst', of minimum size 'dstSize'. - `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. - Note : in contrast with FSE, HUF_decompress can regenerate - RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, - because it knows size to regenerate (originalSize). - @return : size of regenerated data (== originalSize), - or an error code, which can be tested using HUF_isError() -*/ +/** HUF_decompress() : + * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', + * into already allocated buffer 'dst', of minimum size 'dstSize'. + * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. + * Note : in contrast with FSE, HUF_decompress can regenerate + * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, + * because it knows size to regenerate (originalSize). + * @return : size of regenerated data (== originalSize), + * or an error code, which can be tested using HUF_isError() + */ HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, const void* cSrc, size_t cSrcSize); @@ -99,30 +100,22 @@ HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /**< provides error c /* *** Advanced function *** */ /** HUF_compress2() : - * Same as HUF_compress(), but offers direct control over `maxSymbolValue` and `tableLog`. - * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . - * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ -HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); + * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. + * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . + * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog); /** HUF_compress4X_wksp() : * Same as HUF_compress2(), but uses externally allocated `workSpace`. - * `workspace` must have minimum alignment of 4, and be at least as large as following macro */ + * `workspace` must have minimum alignment of 4, and be at least as large as HUF_WORKSPACE_SIZE */ #define HUF_WORKSPACE_SIZE (6 << 10) #define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32)) -HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); - -/** - * The minimum workspace size for the `workSpace` used in - * HUF_readDTableX2_wksp() and HUF_readDTableX4_wksp(). - * - * The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when - * HUF_TABLE_LOG_MAX=12 to ~1850 bytes when HUF_TABLE_LOG_MAX=15. - * Buffer overflow errors may potentially occur if code modifications result in - * a required workspace size greater than that specified in the following - * macro. - */ -#define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10) -#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) +HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize); #endif /* HUF_H_298734234 */ @@ -132,7 +125,7 @@ HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, const * which shall never be used in the context of a dynamic library, * because they are not guaranteed to remain stable in the future. * Only consider them in association with static linking. - *******************************************************************/ + * *****************************************************************/ #if defined(HUF_STATIC_LINKING_ONLY) && !defined(HUF_H_HUF_STATIC_LINKING_ONLY) #define HUF_H_HUF_STATIC_LINKING_ONLY @@ -192,24 +185,23 @@ size_t HUF_decompress4X4_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, /* **************************************** -* HUF detailed API -******************************************/ -/*! -HUF_compress() does the following: -1. count symbol occurrence from source[] into table count[] using FSE_count() -2. (optional) refine tableLog using HUF_optimalTableLog() -3. build Huffman table from count using HUF_buildCTable() -4. save Huffman table to memory buffer using HUF_writeCTable() -5. encode the data stream using HUF_compress4X_usingCTable() + * HUF detailed API + * ****************************************/ -The following API allows targeting specific sub-functions for advanced tasks. -For example, it's possible to compress several blocks using the same 'CTable', -or to save and regenerate 'CTable' using external methods. -*/ -/* FSE_count() : exposed within "fse.h" */ +/*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") + * 2. (optional) refine tableLog using HUF_optimalTableLog() + * 3. build Huffman table from count using HUF_buildCTable() + * 4. save Huffman table to memory buffer using HUF_writeCTable() + * 5. encode the data stream using HUF_compress4X_usingCTable() + * + * The following API allows targeting specific sub-functions for advanced tasks. + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */ -size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap, in which case, CTable will overwrite count content */ +size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); @@ -219,47 +211,65 @@ typedef enum { HUF_repeat_valid /**< Can use the previous table and it is asumed to be valid */ } HUF_repeat; /** HUF_compress4X_repeat() : -* Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. -* If it uses hufTable it does not modify hufTable or repeat. -* If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. -* If preferRepeat then the old table will always be used if valid. */ -size_t HUF_compress4X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. + * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. + * If preferRepeat then the old table will always be used if valid. */ +size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); /** HUF_buildCTable_wksp() : * Same as HUF_buildCTable(), but using externally allocated scratch buffer. - * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as a table of HUF_CTABLE_WORKSPACE_SIZE_U32 unsigned. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. */ - #define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) +#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) +#define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) size_t HUF_buildCTable_wksp (HUF_CElt* tree, const U32* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize); /*! HUF_readStats() : - Read compact Huffman tree, saved by HUF_writeCTable(). - `huffWeight` is destination buffer. - @return : size read from `src` , or an error Code . - Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */ + * Read compact Huffman tree, saved by HUF_writeCTable(). + * `huffWeight` is destination buffer. + * @return : size read from `src` , or an error Code . + * Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, const void* src, size_t srcSize); /** HUF_readCTable() : -* Loading a CTable saved with HUF_writeCTable() */ + * Loading a CTable saved with HUF_writeCTable() */ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize); /* -HUF_decompress() does the following: -1. select the decompression algorithm (X2, X4) based on pre-computed heuristics -2. build Huffman table from save, using HUF_readDTableXn() -3. decode 1 or 4 segments in parallel using HUF_decompressSXn_usingDTable -*/ + * HUF_decompress() does the following: + * 1. select the decompression algorithm (X2, X4) based on pre-computed heuristics + * 2. build Huffman table from save, using HUF_readDTableX?() + * 3. decode 1 or 4 segments in parallel using HUF_decompress?X?_usingDTable() + */ /** HUF_selectDecoder() : -* Tells which decoder is likely to decode faster, -* based on a set of pre-determined metrics. -* @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 . -* Assumption : 0 < cSrcSize < dstSize <= 128 KB */ + * Tells which decoder is likely to decode faster, + * based on a set of pre-computed metrics. + * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 . + * Assumption : 0 < dstSize <= 128 KB */ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); +/** + * The minimum workspace size for the `workSpace` used in + * HUF_readDTableX2_wksp() and HUF_readDTableX4_wksp(). + * + * The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when + * HUF_TABLE_LOG_MAX=12 to ~1850 bytes when HUF_TABLE_LOG_MAX=15. + * Buffer overflow errors may potentially occur if code modifications result in + * a required workspace size greater than that specified in the following + * macro. + */ +#define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10) +#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); size_t HUF_readDTableX4 (HUF_DTable* DTable, const void* src, size_t srcSize); @@ -270,17 +280,23 @@ size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* c size_t HUF_decompress4X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +/* ====================== */ /* single stream variants */ +/* ====================== */ size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); /** HUF_compress1X_repeat() : -* Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. -* If it uses hufTable it does not modify hufTable or repeat. -* If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. -* If preferRepeat then the old table will always be used if valid. */ -size_t HUF_compress1X_repeat(void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize, HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. + * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. + * If preferRepeat then the old table will always be used if valid. */ +size_t HUF_compress1X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ size_t HUF_decompress1X4 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ @@ -297,7 +313,7 @@ size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* c size_t HUF_decompress1X4_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /* BMI2 variants. - * If the CPU has BMI2 support pass bmi2=1, otherwise pass bmi2=0. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. */ size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); size_t HUF_decompress1X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); diff --git a/lib/compress/fse_compress.c b/lib/compress/fse_compress.c index ee5eb81ab..cb8f1fa32 100644 --- a/lib/compress/fse_compress.c +++ b/lib/compress/fse_compress.c @@ -292,7 +292,7 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, const short* normalized It doesn't use any additional memory. But this function is unsafe : it doesn't check that all values within `src` can fit into `count`. For this reason, prefer using a table `count` with 256 elements. - @return : count of most numerous element + @return : count of most numerous element. */ size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize) @@ -305,7 +305,10 @@ size_t FSE_count_simple(unsigned* count, unsigned* maxSymbolValuePtr, memset(count, 0, (maxSymbolValue+1)*sizeof(*count)); if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; } - while (ip 255) maxSymbolValue = 255; + for (s=0; s<=maxSymbolValue; s++) { count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s]; if (count[s] > max) max = count[s]; } } @@ -393,9 +399,11 @@ static size_t FSE_count_parallel_wksp( * Same as FSE_countFast(), but using an externally provided scratch buffer. * `workSpace` size must be table of >= `1024` unsigned */ size_t FSE_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, - const void* source, size_t sourceSize, unsigned* workSpace) + const void* source, size_t sourceSize, + unsigned* workSpace) { - if (sourceSize < 1500) return FSE_count_simple(count, maxSymbolValuePtr, source, sourceSize); + if (sourceSize < 1500) /* heuristic threshold */ + return FSE_count_simple(count, maxSymbolValuePtr, source, sourceSize); return FSE_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, 0, workSpace); } diff --git a/lib/compress/huf_compress.c b/lib/compress/huf_compress.c index 69007cd92..83230b415 100644 --- a/lib/compress/huf_compress.c +++ b/lib/compress/huf_compress.c @@ -440,59 +440,169 @@ static int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, uns size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } - -#define FUNCTION(fn) fn##_default -#define TARGET -#include "huf_compress_impl.h" -#undef TARGET -#undef FUNCTION - -#if DYNAMIC_BMI2 - -#define FUNCTION(fn) fn##_bmi2 -#define TARGET TARGET_ATTRIBUTE("bmi2") -#include "huf_compress_impl.h" -#undef TARGET -#undef FUNCTION - -#endif - -static size_t HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, - const void* src, size_t srcSize, - const HUF_CElt* CTable, const int bmi2) +FORCE_INLINE_TEMPLATE void +HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable) { + BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits); +} + +#define HUF_FLUSHBITS(s) BIT_flushBits(s) + +#define HUF_FLUSHBITS_1(stream) \ + if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream) + +#define HUF_FLUSHBITS_2(stream) \ + if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream) + +FORCE_INLINE_TEMPLATE size_t +HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + size_t n; + BIT_CStream_t bitC; + + /* init */ + if (dstSize < 8) return 0; /* not enough space to compress */ + { size_t const initErr = BIT_initCStream(&bitC, op, oend-op); + if (HUF_isError(initErr)) return 0; } + + n = srcSize & ~3; /* join to mod 4 */ + switch (srcSize & 3) + { + case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable); + HUF_FLUSHBITS_2(&bitC); + /* fall-through */ + case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable); + HUF_FLUSHBITS_1(&bitC); + /* fall-through */ + case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable); + HUF_FLUSHBITS(&bitC); + /* fall-through */ + case 0 : /* fall-through */ + default: break; + } + + for (; n>0; n-=4) { /* note : n&3==0 at this stage */ + HUF_encodeSymbol(&bitC, ip[n- 1], CTable); + HUF_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n- 2], CTable); + HUF_FLUSHBITS_2(&bitC); + HUF_encodeSymbol(&bitC, ip[n- 3], CTable); + HUF_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n- 4], CTable); + HUF_FLUSHBITS(&bitC); + } + + return BIT_closeCStream(&bitC); +} + #if DYNAMIC_BMI2 + +static TARGET_ATTRIBUTE("bmi2") size_t +HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + +static size_t +HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + +static size_t +HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, const int bmi2) +{ if (bmi2) { return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); } -#endif - (void)bmi2; return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); } -static size_t HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, - const void* src, size_t srcSize, - const HUF_CElt* CTable, const int bmi2) +#else + +static size_t +HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, const int bmi2) { -#if DYNAMIC_BMI2 - if (bmi2) { - return HUF_compress4X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); - } -#endif (void)bmi2; - return HUF_compress4X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); } +#endif + size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) { return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); } + +static size_t +HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, int bmi2) +{ + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + + if (dstSize < 6 + 1 + 1 + 1 + 8) return 0; /* minimum space to compress successfully */ + if (srcSize < 12) return 0; /* no saving possible : too small input */ + op += 6; /* jumpTable */ + + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) ); + if (cSize==0) return 0; + assert(cSize <= 65535); + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) ); + if (cSize==0) return 0; + assert(cSize <= 65535); + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, segmentSize, CTable, bmi2) ); + if (cSize==0) return 0; + assert(cSize <= 65535); + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, oend-op, ip, iend-ip, CTable, bmi2) ); + if (cSize==0) return 0; + op += cSize; + } + + return op-ostart; +} + size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) { return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); } + static size_t HUF_compressCTable_internal( BYTE* const ostart, BYTE* op, BYTE* const oend, const void* src, size_t srcSize, diff --git a/lib/compress/huf_compress_impl.h b/lib/compress/huf_compress_impl.h deleted file mode 100644 index fa045399f..000000000 --- a/lib/compress/huf_compress_impl.h +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Copyright (c) 2018-present, Facebook, Inc. - * All rights reserved. - * - * This source code is licensed under both the BSD-style license (found in the - * LICENSE file in the root directory of this source tree) and the GPLv2 (found - * in the COPYING file in the root directory of this source tree). - * You may select, at your option, one of the above-listed licenses. - */ - -#ifndef FUNCTION -# error "FUNCTION(name) must be defined" -#endif - -#ifndef TARGET -# error "TARGET must be defined" -#endif - - -static void FUNCTION(HUF_encodeSymbol)(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable) -{ - BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits); -} - -#define HUF_FLUSHBITS(s) BIT_flushBits(s) - -#define HUF_FLUSHBITS_1(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream) - -#define HUF_FLUSHBITS_2(stream) \ - if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7) HUF_FLUSHBITS(stream) - -static TARGET -size_t FUNCTION(HUF_compress1X_usingCTable_internal)(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) -{ - const BYTE* ip = (const BYTE*) src; - BYTE* const ostart = (BYTE*)dst; - BYTE* const oend = ostart + dstSize; - BYTE* op = ostart; - size_t n; - BIT_CStream_t bitC; - - /* init */ - if (dstSize < 8) return 0; /* not enough space to compress */ - { size_t const initErr = BIT_initCStream(&bitC, op, oend-op); - if (HUF_isError(initErr)) return 0; } - - n = srcSize & ~3; /* join to mod 4 */ - switch (srcSize & 3) - { - case 3 : FUNCTION(HUF_encodeSymbol)(&bitC, ip[n+ 2], CTable); - HUF_FLUSHBITS_2(&bitC); - /* fall-through */ - case 2 : FUNCTION(HUF_encodeSymbol)(&bitC, ip[n+ 1], CTable); - HUF_FLUSHBITS_1(&bitC); - /* fall-through */ - case 1 : FUNCTION(HUF_encodeSymbol)(&bitC, ip[n+ 0], CTable); - HUF_FLUSHBITS(&bitC); - /* fall-through */ - case 0 : /* fall-through */ - default: break; - } - - for (; n>0; n-=4) { /* note : n&3==0 at this stage */ - FUNCTION(HUF_encodeSymbol)(&bitC, ip[n- 1], CTable); - HUF_FLUSHBITS_1(&bitC); - FUNCTION(HUF_encodeSymbol)(&bitC, ip[n- 2], CTable); - HUF_FLUSHBITS_2(&bitC); - FUNCTION(HUF_encodeSymbol)(&bitC, ip[n- 3], CTable); - HUF_FLUSHBITS_1(&bitC); - FUNCTION(HUF_encodeSymbol)(&bitC, ip[n- 4], CTable); - HUF_FLUSHBITS(&bitC); - } - - return BIT_closeCStream(&bitC); -} - - -static TARGET -size_t FUNCTION(HUF_compress4X_usingCTable_internal)(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) -{ - size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ - const BYTE* ip = (const BYTE*) src; - const BYTE* const iend = ip + srcSize; - BYTE* const ostart = (BYTE*) dst; - BYTE* const oend = ostart + dstSize; - BYTE* op = ostart; - - if (dstSize < 6 + 1 + 1 + 1 + 8) return 0; /* minimum space to compress successfully */ - if (srcSize < 12) return 0; /* no saving possible : too small input */ - op += 6; /* jumpTable */ - - { CHECK_V_F(cSize, FUNCTION(HUF_compress1X_usingCTable_internal)(op, oend-op, ip, segmentSize, CTable) ); - if (cSize==0) return 0; - MEM_writeLE16(ostart, (U16)cSize); - op += cSize; - } - - ip += segmentSize; - { CHECK_V_F(cSize, FUNCTION(HUF_compress1X_usingCTable_internal)(op, oend-op, ip, segmentSize, CTable) ); - if (cSize==0) return 0; - MEM_writeLE16(ostart+2, (U16)cSize); - op += cSize; - } - - ip += segmentSize; - { CHECK_V_F(cSize, FUNCTION(HUF_compress1X_usingCTable_internal)(op, oend-op, ip, segmentSize, CTable) ); - if (cSize==0) return 0; - MEM_writeLE16(ostart+4, (U16)cSize); - op += cSize; - } - - ip += segmentSize; - { CHECK_V_F(cSize, FUNCTION(HUF_compress1X_usingCTable_internal)(op, oend-op, ip, iend-ip, CTable) ); - if (cSize==0) return 0; - op += cSize; - } - - return op-ostart; -} diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index 706845eb7..b7f4f7811 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -1394,8 +1394,8 @@ static size_t ZSTD_compressLiterals (ZSTD_entropyCTables_t const* prevEntropy, sizeof(prevEntropy->hufCTable)); /* small ? don't even attempt compression (speed opt) */ -# define LITERAL_NOENTROPY 63 - { size_t const minLitSize = prevEntropy->hufCTable_repeatMode == HUF_repeat_valid ? 6 : LITERAL_NOENTROPY; +# define COMPRESS_LITERALS_SIZE_MIN 63 + { size_t const minLitSize = (prevEntropy->hufCTable_repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); } diff --git a/lib/decompress/huf_decompress.c b/lib/decompress/huf_decompress.c index 8b292e792..e06b44533 100644 --- a/lib/decompress/huf_decompress.c +++ b/lib/decompress/huf_decompress.c @@ -958,21 +958,22 @@ static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, qu }; /** HUF_selectDecoder() : -* Tells which decoder is likely to decode faster, -* based on a set of pre-determined metrics. -* @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 . -* Assumption : 0 < cSrcSize, dstSize <= 128 KB */ + * Tells which decoder is likely to decode faster, + * based on a set of pre-computed metrics. + * @return : 0==HUF_decompress4X2, 1==HUF_decompress4X4 . + * Assumption : 0 < dstSize <= 128 KB */ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) { + assert(dstSize > 0); + assert(dstSize <= 128 KB); /* decoder timing evaluation */ - U32 const Q = cSrcSize >= dstSize ? 15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */ - U32 const D256 = (U32)(dstSize >> 8); - U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); - U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); - DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, for cache eviction */ - - return DTime1 < DTime0; -} + { U32 const Q = (cSrcSize >= dstSize) ? 15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */ + U32 const D256 = (U32)(dstSize >> 8); + U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); + U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); + DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */ + return DTime1 < DTime0; +} } typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); From 4af1fafeb80db846d7931a5fdfd6c24108761770 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Thu, 15 Mar 2018 18:52:38 -0700 Subject: [PATCH 2/2] Restore setting loadedDictEnd Setting `loadedDictEnd` was accidently removed from `ZSTD_loadDictionaryContent()`, which means that dictionary compression will only be able to reference the parts of the dictionary within the window. The spec allows us to reference the entire dictionary so long as even one byte is in the window. `ZSTD_enforceMaxDist()` incorrectly always allowed offsets up to `loadedDictEnd` beyond the window, even once the dictionary was out of range. When overflow protection kicked in, the check `current > loadedDictEnd + maxDist` is incorrect if `loadedDictEnd` isn't reset back to zero. `current` could be reset below the value, which would incorrectly allow references beyond the window. This bug is present in `master`, but is very hard to trigger, since it requires both dictionaries and data which triggers overflow correction. --- lib/compress/zstd_compress.c | 4 +++- lib/compress/zstd_compress_internal.h | 21 ++++++++++++++++----- lib/compress/zstd_ldm.c | 2 +- 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index e70c2a54d..db8d7a8c3 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -1931,8 +1931,9 @@ static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx, ZSTD_reduceIndex(cctx, correction); if (ms->nextToUpdate < correction) ms->nextToUpdate = 0; else ms->nextToUpdate -= correction; + ms->loadedDictEnd = 0; } - ZSTD_window_enforceMaxDist(&ms->window, ip + blockSize, ms->loadedDictEnd + maxDist); + ZSTD_window_enforceMaxDist(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd); if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit; { size_t cSize = ZSTD_compressBlock_internal(cctx, @@ -2118,6 +2119,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, ZSTD_CCtx_params ZSTD_compressionParameters const* cParams = ¶ms->cParams; ZSTD_window_update(&ms->window, src, srcSize); + ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); if (srcSize <= HASH_READ_SIZE) return 0; diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index 0d8bbf296..865c080cc 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -562,15 +562,24 @@ MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, /** * ZSTD_window_enforceMaxDist(): - * Sets lowLimit such that indices earlier than (srcEnd - base) - lowLimit are - * invalid. This allows a simple check index >= lowLimit to see if it is valid. - * Source pointers past srcEnd are not guaranteed to be valid. + * Updates lowLimit so that: + * (srcEnd - base) - lowLimit == maxDist + loadedDictEnd + * This allows a simple check that index >= lowLimit to see if index is valid. + * This must be called before a block compression call, with srcEnd as the block + * source end. + * If loadedDictEndPtr is not NULL, we set it to zero once we update lowLimit. + * This is because dictionaries are allowed to be referenced as long as the last + * byte of the dictionary is in the window, but once they are out of range, + * they cannot be referenced. If loadedDictEndPtr is NULL, we use + * loadedDictEnd == 0. */ MEM_STATIC void ZSTD_window_enforceMaxDist(ZSTD_window_t* window, - void const* srcEnd, U32 maxDist) + void const* srcEnd, U32 maxDist, + U32* loadedDictEndPtr) { U32 const current = (U32)((BYTE const*)srcEnd - window->base); - if (current > maxDist) { + U32 loadedDictEnd = loadedDictEndPtr != NULL ? *loadedDictEndPtr : 0; + if (current > maxDist + loadedDictEnd) { U32 const newLowLimit = current - maxDist; if (window->lowLimit < newLowLimit) window->lowLimit = newLowLimit; if (window->dictLimit < window->lowLimit) { @@ -578,6 +587,8 @@ MEM_STATIC void ZSTD_window_enforceMaxDist(ZSTD_window_t* window, window->lowLimit); window->dictLimit = window->lowLimit; } + if (loadedDictEndPtr) + *loadedDictEndPtr = 0; } } diff --git a/lib/compress/zstd_ldm.c b/lib/compress/zstd_ldm.c index fa395ed7d..5c9c0d2b6 100644 --- a/lib/compress/zstd_ldm.c +++ b/lib/compress/zstd_ldm.c @@ -514,7 +514,7 @@ size_t ZSTD_ldm_generateSequences( * * Try invalidation after the sequence generation and test the * the offset against maxDist directly. */ - ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist); + ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, NULL); /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */ newLeftoverSize = ZSTD_ldm_generateSequences_internal( ldmState, sequences, params, chunkStart, chunkSize);