From e717a5b0ddb478108f826076f3efef3a17735735 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Sat, 18 Nov 2017 16:24:02 -0800
Subject: [PATCH 1/9] zstd_opt: minor speed optimization

Calculate reference log2sums only once per series of sequences
(as opposed to once per sequence).

Also : improved code comments

---
 lib/compress/zstd_compress_internal.h | 19 +++++++++---------
 lib/compress/zstd_opt.c               | 28 +++++++++++++--------------
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h
index efcbc4501..862558b50 100644
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -72,26 +72,27 @@ typedef struct {
 } ZSTD_optimal_t;
 
 typedef struct {
-    U32* litFreq;
-    U32* litLengthFreq;
-    U32* matchLengthFreq;
-    U32* offCodeFreq;
-    ZSTD_match_t* matchTable;
-    ZSTD_optimal_t* priceTable;
+    /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */
+    U32* litFreq;               /* table of literals statistics, of size 256 */
+    U32* litLengthFreq;         /* table of litLength statistics, of size (MaxLL+1) */
+    U32* matchLengthFreq;       /* table of matchLength statistics, of size (MaxML+1) */
+    U32* offCodeFreq;           /* table of offCode statistics, of size (MaxOff+1) */
+    ZSTD_match_t* matchTable;   /* list of found matches, of size ZSTD_OPT_NUM+1 */
+    ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */
 
     U32  litSum;                /* nb of literals */
     U32  litLengthSum;          /* nb of litLength codes */
     U32  matchLengthSum;        /* nb of matchLength codes */
-    U32  matchSum;              /* one argument to calculate `factor` */
+    U32  matchSum;              /* a strange argument used in calculating `factor` */
    U32  offCodeSum;            /* nb of offset codes */
    /* begin updated by ZSTD_setLog2Prices */
    U32  log2litSum;            /* pow2 to compare log2(litfreq) to */
    U32  log2litLengthSum;      /* pow2 to compare log2(llfreq) to */
    U32  log2matchLengthSum;    /* pow2 to compare log2(mlfreq) to */
    U32  log2offCodeSum;        /* pow2 to compare log2(offreq) to */
-    U32  factor;                /* added to calculate ZSTD_getPrice() (but why?) */
+    U32  factor;                /* fixed cost added when calculating ZSTD_getPrice() (but why ? seems to favor less sequences) */
    /* end : updated by ZSTD_setLog2Prices */
-    U32  staticPrices;          /* prices follow a static cost structure, statistics are irrelevant */
+    U32  staticPrices;          /* prices follow a pre-defined cost structure, statistics are irrelevant */
    U32  cachedPrice;
    U32  cachedLitLength;
    const BYTE* cachedLiterals;

diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index 04f03ebcd..a626a3d13 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -13,7 +13,7 @@
 #include "zstd_lazy.h"   /* ZSTD_updateTree, ZSTD_updateTree_extDict */
 
-#define ZSTD_LITFREQ_ADD    2   /* sort of scaling factor for litSum and litFreq (why the need ?), but also used for matchSum ? */
+#define ZSTD_LITFREQ_ADD    2   /* scaling factor for litFreq, so that frequencies adapt faster to new stats. Also used for matchSum (?) */
 #define ZSTD_FREQ_DIV       4   /* log factor when using previous stats to init next stats */
 #define ZSTD_MAX_PRICE      (1<<30)
 
@@ -161,25 +161,26 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_getPrice(optState_t* optPtr, U32 litLength, const
 }
 
-static void ZSTD_updatePrice(optState_t* optPtr, U32 litLength, const BYTE* literals, U32 offset, U32 matchLength)
+static void ZSTD_updateStats(optState_t* optPtr, U32 litLength, const BYTE* literals, U32 offsetCode, U32 matchLength)
 {
-    U32 u;
-
     /* literals */
-    optPtr->litSum += litLength*ZSTD_LITFREQ_ADD;
-    for (u=0; u < litLength; u++)
-        optPtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD;
+    { U32 u;
+      optPtr->litSum += litLength*ZSTD_LITFREQ_ADD;
+      for (u=0; u < litLength; u++)
+          optPtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD;
+    }
 
     /* literal Length */
-    { const U32 llCode = ZSTD_LLcode(litLength);
+    { U32 const llCode = ZSTD_LLcode(litLength);
       optPtr->litLengthFreq[llCode]++;
       optPtr->litLengthSum++;
     }
 
-    /* match offset */
-    {   BYTE const offCode = (BYTE)ZSTD_highbit32(offset+1);
-        optPtr->offCodeSum++;
+    /* match offset code (0-2=>repCode; 3+=>offset+2) */
+    {   U32 const offCode = ZSTD_highbit32(offsetCode+1);
+        assert(offCode <= MaxOff);
         optPtr->offCodeFreq[offCode]++;
+        optPtr->offCodeSum++;
     }
 
     /* match Length */
@@ -188,8 +189,6 @@ static void ZSTD_updatePrice(optState_t* optPtr, U32 litLength, const BYTE* lite
         optPtr->matchLengthFreq[mlCode]++;
         optPtr->matchLengthSum++;
     }
-
-    ZSTD_setLog2Prices(optPtr);
 }
 
@@ -646,10 +645,11 @@ _shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
             }   }
 
-            ZSTD_updatePrice(optStatePtr, llen, anchor, offset, mlen);
+            ZSTD_updateStats(optStatePtr, llen, anchor, offset, mlen);
             ZSTD_storeSeq(seqStorePtr, llen, anchor, offset, mlen-MINMATCH);
             anchor = ip;
         }   }
+        ZSTD_setLog2Prices(optStatePtr);
     }   /* for (cur=0; cur < last_pos; ) */
 
     /* Save reps for next block */

From e6da37c4305be931ada88a8242b0d2f71ad73b16 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Sun, 19 Nov 2017 10:21:21 -0800
Subject: [PATCH 2/9] created (hidden) new strategy btopt0

about +10% faster, but losing ~0.01 compression ratio
(note : amplitude varies a lot depending on files, but the direction remains the same)

---
 lib/compress/zstd_opt.c | 43 ++++++++++++++++++++++++-----------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index a626a3d13..ed8527989 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -96,7 +96,8 @@ static void ZSTD_rescaleFreqs(optState_t* optPtr, const BYTE* src, size_t srcSiz
 }
 
-static U32 ZSTD_getLiteralPrice(optState_t* optPtr, U32 const litLength, const BYTE* literals)
+static U32 ZSTD_getLiteralPrice(optState_t* const optPtr,
+                                U32 const litLength, const BYTE* const literals)
 {
     U32 price;
@@ -138,17 +139,22 @@ static U32 ZSTD_getLiteralPrice(optState_t* optPtr, U32 const litLength, const B
 }
 
-FORCE_INLINE_TEMPLATE U32 ZSTD_getPrice(optState_t* optPtr, U32 litLength, const BYTE* literals, U32 offset, U32 matchLength, const int ultra)
+/* ZSTD_getPrice() :
+ * optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) */
+FORCE_INLINE_TEMPLATE U32 ZSTD_getPrice(optState_t* optPtr,
+                                        U32 const litLength, const BYTE* const literals, U32 const offset, U32 const matchLength,
+                                        int const optLevel)
 {
-    BYTE const offCode = (BYTE)ZSTD_highbit32(offset+1);
-    U32 const mlBase = matchLength - MINMATCH;
     U32 price;
+    U32 const offCode = ZSTD_highbit32(offset+1);
+    U32 const mlBase = matchLength - MINMATCH;
+    assert(matchLength >= MINMATCH);
 
     if (optPtr->staticPrices)  /* fixed scheme, do not use statistics */
         return ZSTD_getLiteralPrice(optPtr, litLength, literals) + ZSTD_highbit32((U32)mlBase+1) + 16 + offCode;
 
     price = offCode + optPtr->log2offCodeSum - ZSTD_highbit32(optPtr->offCodeFreq[offCode]+1);
-    if (!ultra /*static*/ && offCode >= 20) price += (offCode-19)*2;   /* handicap for long distance offsets, favor decompression speed */
+    if ((optLevel<2) /*static*/ && offCode >= 20) price += (offCode-19)*2;   /* handicap for long distance offsets, favor decompression speed */
 
     /* match Length */
     {   U32 const mlCode = ZSTD_MLcode(mlBase);
@@ -165,9 +171,9 @@ static void ZSTD_updateStats(optState_t* optPtr, U32 litLength, const BYTE* lite
 {
     /* literals */
     { U32 u;
-      optPtr->litSum += litLength*ZSTD_LITFREQ_ADD;
       for (u=0; u < litLength; u++)
           optPtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD;
+      optPtr->litSum += litLength*ZSTD_LITFREQ_ADD;
    }
 
    /* literal Length */
@@ -451,7 +457,7 @@ repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0)
 FORCE_INLINE_TEMPLATE
 size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
                                       const void* src, size_t srcSize,
-                                      const int ultra, const int extDict)
+                                      const int optLevel, const int extDict)
 {
     seqStore_t* const seqStorePtr = &(ctx->seqStore);
     optState_t* const optStatePtr = &(ctx->optState);
@@ -518,7 +524,7 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
                 U32 const end = matches[matchNb].len;
                 repcodes_t const repHistory = ZSTD_updateRep(rep, offset, ll0);
                 while (pos <= end) {
-                    U32 const matchPrice = ZSTD_getPrice(optStatePtr, litlen, anchor, offset, pos, ultra);
+                    U32 const matchPrice = ZSTD_getPrice(optStatePtr, litlen, anchor, offset, pos, optLevel);
                     if (pos > last_pos || matchPrice < opt[pos].price) {
                         DEBUGLOG(7, "rPos:%u => set initial price : %u",
                                     pos, matchPrice);
@@ -576,25 +582,26 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
                 /* set prices using matches found at position == cur */
                 for (matchNb = 0; matchNb < nbMatches; matchNb++) {
-                    U32 mlen = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
-                    U32 const lastML = matches[matchNb].len;
                     U32 const offset = matches[matchNb].off;
                     repcodes_t const repHistory = ZSTD_updateRep(opt[cur].rep, offset, ll0);
+                    U32 const lastML = matches[matchNb].len;
+                    U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
+                    U32 mlen;
 
                     DEBUGLOG(7, "testing match %u => offCode=%u, mlen=%u, llen=%u",
                                 matchNb, matches[matchNb].off, lastML, litlen);
 
-                    while (mlen <= lastML) {
+                    for (mlen = lastML; mlen >= startML; mlen--) {
                         U32 const pos = cur + mlen;
-                        U32 const price = basePrice + ZSTD_getPrice(optStatePtr, litlen, baseLiterals, offset, mlen, ultra);
+                        U32 const price = basePrice + ZSTD_getPrice(optStatePtr, litlen, baseLiterals, offset, mlen, optLevel);
 
                         if ((pos > last_pos) || (price < opt[pos].price)) {
                             DEBUGLOG(7, "rPos:%u => new better price (%u<%u)",
                                         pos, price, opt[pos].price);
                             SET_PRICE(pos, mlen, offset, litlen, price, repHistory);   /* note : macro modifies last_pos */
+                        } else {
+                            if (optLevel==0) break;  /* gets ~+10% speed for about 0.01 ratio loss */
                         }
-
-                        mlen++;
         }   }   }   }
 
         best_mlen = opt[last_pos].mlen;
@@ -663,20 +670,20 @@
 size_t ZSTD_compressBlock_btopt(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
 {
     DEBUGLOG(5, "ZSTD_compressBlock_btopt");
-    return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 0 /*ultra*/, 0 /*extDict*/);
+    return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 0 /*optLevel*/, 0 /*extDict*/);
 }
 
 size_t ZSTD_compressBlock_btultra(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 1 /*ultra*/, 0 /*extDict*/);
+    return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 2 /*optLevel*/, 0 /*extDict*/);
 }
 
 size_t ZSTD_compressBlock_btopt_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 0 /*ultra*/, 1 /*extDict*/);
+    return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 1 /*optLevel*/, 1 /*extDict*/);
 }
 
 size_t ZSTD_compressBlock_btultra_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 1 /*ultra*/, 1 /*extDict*/);
+    return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 2 /*optLevel*/, 1 /*extDict*/);
 }

From d100670045fb39adcce509d4bbb3ceb4a0091975 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Sun, 19 Nov 2017 10:38:02 -0800
Subject: [PATCH 3/9] btopt0 : a bit faster and weaker

---
 lib/compress/zstd_opt.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index ed8527989..c62b8897c 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -553,11 +553,15 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
                     SET_PRICE(cur, 1/*mlen*/, 0/*offset*/, litlen, price, opt[cur-1].rep);
             }   }
 
-        if (cur == last_pos) break;
-
         /* last match must start at a minimum distance of 8 from oend */
         if (inr > ilimit) continue;
 
+        if (cur == last_pos) break;
+
+        if ( (optLevel==0) /*static*/
+          && (opt[cur+1].price <= opt[cur].price) )
+            continue;  /* skip unpromising positions; about ~+6% speed, -0.01 ratio */
+
        {   U32 const ll0 = (opt[cur].mlen != 1);
            U32 const litlen = (opt[cur].mlen == 1) ? opt[cur].litlen : 0;
            U32 const basePrice = (cur > litlen) ? opt[cur-litlen].price : 0;
@@ -600,7 +604,7 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
                                         pos, price, opt[pos].price);
                             SET_PRICE(pos, mlen, offset, litlen, price, repHistory);   /* note : macro modifies last_pos */
                         } else {
-                            if (optLevel==0) break;  /* gets ~+10% speed for about 0.01 ratio loss */
+                            if (optLevel==0) break;  /* gets ~+10% speed for about -0.01 ratio loss */
                         }
             }   }   }   }

From 99435dbbab0361c105b1bc6f5c6aa4ab39c89e96 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Sun, 19 Nov 2017 12:58:04 -0800
Subject: [PATCH 4/9] minor : search early-out on sufficient_len for hc3 and rep

very small speed and ratio increases

---
 lib/compress/zstd_opt.c | 28 ++++++++++++++--------------
 tests/paramgrill.c      | 12 ++++--------
 2 files changed, 18 insertions(+), 22 deletions(-)

diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index c62b8897c..7fd866f0a 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -240,9 +240,9 @@ FORCE_INLINE_TEMPLATE
 U32 ZSTD_insertBtAndGetAllMatches (
                         ZSTD_CCtx* zc,
                         const BYTE* const ip, const BYTE* const iLimit, const int extDict,
-                        U32 nbCompares, U32 const mls,
+                        U32 nbCompares, U32 const mls, U32 const sufficient_len,
                         U32 rep[ZSTD_REP_NUM], U32 const ll0,
-                        ZSTD_match_t* matches, const U32 minMatchLen)
+                        ZSTD_match_t* matches, const U32 lengthToBeat)
 {
     const BYTE* const base = zc->base;
     U32 const current = (U32)(ip-base);
@@ -267,7 +267,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
     U32 dummy32;   /* to be nullified at the end */
     U32 mnum = 0;
 
-    size_t bestLength = minMatchLen-1;
+    size_t bestLength = lengthToBeat-1;
     DEBUGLOG(7, "ZSTD_insertBtAndGetAllMatches");
 
     /* check repCode */
@@ -299,7 +299,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
                     matches[mnum].off = repCode - ll0;
                     matches[mnum].len = (U32)repLen;
                     mnum++;
-                    if ( (repLen > ZSTD_OPT_NUM)
+                    if ( (repLen > sufficient_len)
                       | (ip+repLen == iLimit) ) {  /* best possible */
                        return mnum;
    }   }   }   }
@@ -328,7 +328,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
            matches[0].off = (current - matchIndex3) + ZSTD_REP_MOVE;
            matches[0].len = (U32)mlen;
            mnum = 1;
-            if ( (mlen > ZSTD_OPT_NUM)
+            if ( (mlen > sufficient_len)
              | (ip+mlen == iLimit) ) {  /* best possible */
                return 1;
    }   }   }
@@ -394,10 +394,10 @@ U32 ZSTD_insertBtAndGetAllMatches (
 
 FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (
                        ZSTD_CCtx* zc,   /* Index table will be updated */
-                        const BYTE* ip, const BYTE* const iHighLimit, const int extDict,
-                        const U32 maxNbAttempts, const U32 matchLengthSearch,
+                        const BYTE* ip, const BYTE* const iHighLimit, int const extDict,
+                        U32 const maxNbAttempts, U32 const matchLengthSearch, U32 const sufficient_len,
                        U32 rep[ZSTD_REP_NUM], U32 const ll0,
-                        ZSTD_match_t* matches, const U32 minMatchLen)
+                        ZSTD_match_t* matches, U32 const lengthToBeat)
 {
     DEBUGLOG(7, "ZSTD_BtGetAllMatches");
     if (ip < zc->base + zc->nextToUpdate) return 0;   /* skipped area */
@@ -405,12 +405,12 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (
     else ZSTD_updateTree(zc, ip, iHighLimit, maxNbAttempts, matchLengthSearch);
     switch(matchLengthSearch)
     {
-    case 3 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 3, rep, ll0, matches, minMatchLen);
+    case 3 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 3, sufficient_len, rep, ll0, matches, lengthToBeat);
     default :
-    case 4 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 4, rep, ll0, matches, minMatchLen);
-    case 5 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 5, rep, ll0, matches, minMatchLen);
+    case 4 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 4, sufficient_len, rep, ll0, matches, lengthToBeat);
+    case 5 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 5, sufficient_len, rep, ll0, matches, lengthToBeat);
     case 7 :
-    case 6 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 6, rep, ll0, matches, minMatchLen);
+    case 6 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 6, sufficient_len, rep, ll0, matches, lengthToBeat);
     }
 }
@@ -493,7 +493,7 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
         /* find first match */
         {   U32 const litlen = (U32)(ip - anchor);
             U32 const ll0 = !litlen;
-            U32 const nbMatches = ZSTD_BtGetAllMatches(ctx, ip, iend, extDict, maxSearches, mls, rep, ll0, matches, minMatch);
+            U32 const nbMatches = ZSTD_BtGetAllMatches(ctx, ip, iend, extDict, maxSearches, mls, sufficient_len, rep, ll0, matches, minMatch);
             if (!nbMatches) { ip++; continue; }
 
             /* initialize opt[0] */
@@ -566,7 +566,7 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
             U32 const litlen = (opt[cur].mlen == 1) ? opt[cur].litlen : 0;
             U32 const basePrice = (cur > litlen) ? opt[cur-litlen].price : 0;
             const BYTE* const baseLiterals = ip + cur - litlen;
-            U32 const nbMatches = ZSTD_BtGetAllMatches(ctx, inr, iend, extDict, maxSearches, mls, opt[cur].rep, ll0, matches, minMatch);
+            U32 const nbMatches = ZSTD_BtGetAllMatches(ctx, inr, iend, extDict, maxSearches, mls, sufficient_len, opt[cur].rep, ll0, matches, minMatch);
             U32 matchNb;
             if (!nbMatches) continue;
             assert(baseLiterals >= prefixStart);

diff --git a/tests/paramgrill.c b/tests/paramgrill.c
index a387a44c6..5560e82f6 100644
--- a/tests/paramgrill.c
+++ b/tests/paramgrill.c
@@ -310,14 +310,10 @@ static size_t BMK_benchParam(BMK_result_t* resultPtr,
 }
 
-const char* g_stratName[] = { "ZSTD_fast   ",
-                              "ZSTD_dfast  ",
-                              "ZSTD_greedy ",
-                              "ZSTD_lazy   ",
-                              "ZSTD_lazy2  ",
-                              "ZSTD_btlazy2",
-                              "ZSTD_btopt  ",
-                              "ZSTD_btultra"};
+const char* g_stratName[ZSTD_btultra] = {
+                "ZSTD_fast   ", "ZSTD_dfast  ",
+                "ZSTD_greedy ", "ZSTD_lazy   ", "ZSTD_lazy2  ",
+                "ZSTD_btlazy2", "ZSTD_btopt  ", "ZSTD_btultra"};
 
 static void BMK_printWinner(FILE* f, U32 cLevel, BMK_result_t result, ZSTD_compressionParameters params, size_t srcSize)
 {

From 42c1e642704c2c4d1f402296376b59393c25ca1c Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Sun, 19 Nov 2017 13:39:12 -0800
Subject: [PATCH 5/9] slightly improved ratio at -22

merging of repcode search into btsearch introduced
a small compression ratio regression at max level :
1.3.2 : 52728769
after repMerge patch : 52760789 (+32020)

A few minor changes have produced this difference.
They can be hard to spot.

This patch buys back about half of the difference,
by no longer inserting a position at hc3
when a long match is found there.
It feels strangely counter-intuitive, but works :
after this patch : 52742555 (-18234)

---
 lib/compress/zstd_lazy.c | 15 ++++++++-------
 lib/compress/zstd_opt.c  | 10 ++++++----
 tests/paramgrill.c       |  4 ++--
 3 files changed, 16 insertions(+), 13 deletions(-)

diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index cd3d1530a..a2c658bd3 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -16,8 +16,8 @@
 *  Binary Tree search
 ***************************************/
 /** ZSTD_insertBt1() : add one or multiple positions to tree.
-* ip : assumed <= iend-8 .
-* @return : nb of positions added */
+ * ip : assumed <= iend-8 .
+ * @return : nb of positions added */
 static U32 ZSTD_insertBt1(ZSTD_CCtx* zc,
                 const BYTE* const ip, const BYTE* const iend,
                 U32 nbCompares, U32 const mls, U32 const extDict)
@@ -42,7 +42,7 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc,
     U32* largerPtr  = smallerPtr + 1;
     U32 dummy32;   /* to be nullified at the end */
     U32 const windowLow = zc->lowLimit;
-    U32 matchEndIdx = current+8;
+    U32 matchEndIdx = current+8+1;
     size_t bestLength = 8;
 #ifdef ZSTD_C_PREDICT
     U32 predictedSmall = *(bt + 2*((current-1)&btMask) + 0);
@@ -122,8 +122,8 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc,
     *smallerPtr = *largerPtr = 0;
     if (bestLength > 384) return MIN(192, (U32)(bestLength - 384));   /* speed optimization */
-    if (matchEndIdx > current + 8) return matchEndIdx - (current + 8);
-    return 1;
+    assert(matchEndIdx > current + 8);
+    return matchEndIdx - (current + 8);
 }
 
 FORCE_INLINE_TEMPLATE
@@ -182,7 +182,7 @@ static size_t ZSTD_insertBtAndFindBestMatch (
     const U32 windowLow = zc->lowLimit;
     U32* smallerPtr = bt + 2*(current&btMask);
     U32* largerPtr  = bt + 2*(current&btMask) + 1;
-    U32 matchEndIdx = current+8;
+    U32 matchEndIdx = current+8+1;
     U32 dummy32;   /* to be nullified at the end */
     size_t bestLength = 0;
@@ -233,7 +233,8 @@ static size_t ZSTD_insertBtAndFindBestMatch (
 
     *smallerPtr = *largerPtr = 0;
 
-    zc->nextToUpdate = (matchEndIdx > current + 8) ? matchEndIdx - 8 : current+1;   /* skip repetitive patterns */
+    assert(matchEndIdx > current+8);
+    zc->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
     return bestLength;
 }

diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index 7fd866f0a..7508aa1bc 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -263,7 +263,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
     U32 const windowLow = zc->lowLimit;
     U32* smallerPtr = bt + 2*(current&btMask);
     U32* largerPtr  = bt + 2*(current&btMask) + 1;
-    U32 matchEndIdx = current+8;     /* farthest referenced position of any match => detects repetitive patterns */
+    U32 matchEndIdx = current+8+1;   /* farthest referenced position of any match => detects repetitive patterns */
     U32 dummy32;   /* to be nullified at the end */
     U32 mnum = 0;
@@ -328,8 +328,9 @@ U32 ZSTD_insertBtAndGetAllMatches (
             matches[0].off = (current - matchIndex3) + ZSTD_REP_MOVE;
             matches[0].len = (U32)mlen;
             mnum = 1;
-            if ( (mlen > sufficient_len)
-               | (ip+mlen == iLimit) ) {  /* best possible */
+            if ( (mlen > sufficient_len) |
+                 (ip+mlen == iLimit) ) {  /* best possible length */
+                zc->nextToUpdate = current+1;  /* skip insertion */
                 return 1;
    }   }   }
@@ -387,7 +388,8 @@ U32 ZSTD_insertBtAndGetAllMatches (
 
     *smallerPtr = *largerPtr = 0;
 
-    zc->nextToUpdate = (matchEndIdx > current + 8) ? matchEndIdx - 8 : current+1;   /* skip repetitive patterns */
+    assert(matchEndIdx > current+8);
+    zc->nextToUpdate = matchEndIdx - 8;   /* skip repetitive patterns */
     return mnum;
 }

diff --git a/tests/paramgrill.c b/tests/paramgrill.c
index 5560e82f6..081a284e4 100644
--- a/tests/paramgrill.c
+++ b/tests/paramgrill.c
@@ -310,8 +310,8 @@ static size_t BMK_benchParam(BMK_result_t* resultPtr,
 }
 
-const char* g_stratName[ZSTD_btultra] = {
-                "ZSTD_fast   ", "ZSTD_dfast  ",
+const char* g_stratName[ZSTD_btultra+1] = {
+                "(none)      ", "ZSTD_fast   ", "ZSTD_dfast  ",
                 "ZSTD_greedy ", "ZSTD_lazy   ", "ZSTD_lazy2  ",
                 "ZSTD_btlazy2", "ZSTD_btopt  ", "ZSTD_btultra"};

From 3f457264d153e1a5e08cc2403eb5d6f2887b6c44 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Sun, 19 Nov 2017 14:40:21 -0800
Subject: [PATCH 6/9] slightly improved compression speed

---
 lib/compress/zstd_compress_internal.h | 18 +++++++++-------
 lib/compress/zstd_lazy.c              | 30 +++++++++++++--------------
 lib/compress/zstd_opt.c               |  5 ++---
 3 files changed, 27 insertions(+), 26 deletions(-)

diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h
index 862558b50..4fd55c438 100644
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -347,13 +347,17 @@ MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* co
     const BYTE* const pStart = pIn;
     const BYTE* const pInLoopLimit = pInLimit - (sizeof(size_t)-1);
 
-    while (pIn < pInLoopLimit) {
-        size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
-        if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }
-        pIn += ZSTD_NbCommonBytes(diff);
-        return (size_t)(pIn - pStart);
-    }
-    if (MEM_64bits()) if ((pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; }
+    if (pIn < pInLoopLimit) {
+        { size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+          if (diff) return ZSTD_NbCommonBytes(diff); }
+        pIn+=sizeof(size_t); pMatch+=sizeof(size_t);
+        while (pIn < pInLoopLimit) {
+            size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+            if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }
+            pIn += ZSTD_NbCommonBytes(diff);
+            return (size_t)(pIn - pStart);
+    }   }
+    if (MEM_64bits() && (pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; }
     if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; }
     if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
     return (size_t)(pIn - pStart);
 }

diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index a2c658bd3..9937e24b5 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -82,8 +82,8 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc,
         if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
             assert(matchIndex+matchLength >= dictLimit);   /* might be wrong if extDict is incorrectly set to 0 */
             match = base + matchIndex;
-            if (match[matchLength] == ip[matchLength])
-                matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
+            //if (match[matchLength] == ip[matchLength]) matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
+            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
         } else {
             match = dictBase + matchIndex;
             matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
@@ -196,8 +196,8 @@ static size_t ZSTD_insertBtAndFindBestMatch (
         if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
             match = base + matchIndex;
-            if (match[matchLength] == ip[matchLength])
-                matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
+            //if (match[matchLength] == ip[matchLength]) matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
+            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
         } else {
             match = dictBase + matchIndex;
             matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
@@ -355,14 +355,14 @@ size_t ZSTD_HcFindBestMatch_generic (
     U32 matchIndex = ZSTD_insertAndFindFirstIndex (zc, ip, mls);
     for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) {
-        const BYTE* match;
         size_t currentMl=0;
         if ((!extDict) || matchIndex >= dictLimit) {
-            match = base + matchIndex;
+            const BYTE* const match = base + matchIndex;
             if (match[ml] == ip[ml])   /* potentially better */
                 currentMl = ZSTD_count(ip, match, iLimit);
         } else {
-            match = dictBase + matchIndex;
+            const BYTE* const match = dictBase + matchIndex;
+            assert(match+4 <= dictEnd);
             if (MEM_read32(match) == MEM_read32(ip))   /* assumption : matchIndex <= dictLimit-4 (by table construction) */
                 currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
         }
@@ -400,10 +400,10 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
 
 FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
-                        ZSTD_CCtx* zc,
+                        ZSTD_CCtx* const zc,
                         const BYTE* ip, const BYTE* const iLimit,
-                        size_t* offsetPtr,
-                        const U32 maxNbAttempts, const U32 matchLengthSearch)
+                        size_t* const offsetPtr,
+                        U32 const maxNbAttempts, U32 const matchLengthSearch)
 {
     switch(matchLengthSearch)
     {
@@ -522,9 +522,8 @@ size_t ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
              */
             /* catch up */
             if (offset) {
-                while ( (start > anchor)
-                     && (start > base+offset-ZSTD_REP_MOVE)
-                     && (start[-1] == (start-offset+ZSTD_REP_MOVE)[-1]) )  /* only search for offset within prefix */
+                while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > base))
+                     && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) )  /* only search for offset within prefix */
                     { start--; matchLength++; }
                 offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
             }
@@ -536,9 +535,8 @@ _storeSequence:
         }
 
         /* check immediate repcode */
-        while ( (ip <= ilimit)
-             && ((offset_2>0)
-               & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+        while ( ((ip <= ilimit) & (offset_2>0))
+             && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
             /* store sequence */
             matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
             offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset;   /* swap repcodes */

diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index 7508aa1bc..9eb338e18 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -345,9 +345,8 @@ U32 ZSTD_insertBtAndGetAllMatches (
         if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
             assert(matchIndex+matchLength >= dictLimit);  /* ensure the condition is correct when !extDict */
             match = base + matchIndex;
-            if (match[matchLength] == ip[matchLength]) {
-                matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iLimit) +1;
-            }
+            //if (match[matchLength] == ip[matchLength]) { matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iLimit) +1; }
+            matchLength += ZSTD_count(ip+matchLength, match+matchLength, iLimit);
         } else {
             match = dictBase + matchIndex;
             matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dictEnd, prefixStart);

From 899f2a29f6f70f7812c3f79259b593c3dde67401 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Mon, 20 Nov 2017 11:53:20 -0800
Subject: [PATCH 7/9] strategy ZSTD_btopt pinned to (0) variant (faster one)

---
 lib/compress/zstd_lazy.c | 2 --
 lib/compress/zstd_opt.c  | 3 +--
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index 9937e24b5..6d4804961 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -85,7 +85,6 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc,
         if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
             assert(matchIndex+matchLength >= dictLimit);   /* might be wrong if extDict is incorrectly set to 0 */
             match = base + matchIndex;
-            //if (match[matchLength] == ip[matchLength]) matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
             matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
         } else {
             match = dictBase + matchIndex;
@@ -196,7 +195,6 @@ static size_t ZSTD_insertBtAndFindBestMatch (
         if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
             match = base + matchIndex;
-            //if (match[matchLength] == ip[matchLength]) matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
             matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
         } else {
             match = dictBase + matchIndex;

diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index 9eb338e18..b9a2cf85f 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -345,7 +345,6 @@ U32 ZSTD_insertBtAndGetAllMatches (
         if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
             assert(matchIndex+matchLength >= dictLimit);  /* ensure the condition is correct when !extDict */
             match = base + matchIndex;
-            //if (match[matchLength] == ip[matchLength]) { matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iLimit) +1; }
             matchLength += ZSTD_count(ip+matchLength, match+matchLength, iLimit);
         } else {
             match = dictBase + matchIndex;
@@ -685,7 +684,7 @@ size_t ZSTD_compressBlock_btultra(ZSTD_CCtx* ctx, const void* src, size_t srcSiz
 
 size_t ZSTD_compressBlock_btopt_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
 {
-    return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 1 /*optLevel*/, 1 /*extDict*/);
+    return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 0 /*optLevel*/, 1 /*extDict*/);
 }
 
 size_t ZSTD_compressBlock_btultra_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)

From 4154aec679e98b76c3e19ccdcb105e89faf9af37 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 21 Nov 2017 10:26:17 -0800
Subject: [PATCH 8/9] fixed comment, as suggested by @terrelln

---
 doc/zstd_manual.html    | 5 +++--
 lib/compress/zstd_opt.c | 5 +++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/doc/zstd_manual.html b/doc/zstd_manual.html
index 9697ebd57..e50f46661 100644
--- a/doc/zstd_manual.html
+++ b/doc/zstd_manual.html
@@ -111,7 +111,7 @@ unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
 @return : content size to be decompressed, as a 64-bits value _if known and not empty_, 0 otherwise.


-Helper functions
-
-#define ZSTD_COMPRESSBOUND(srcSize)   ((srcSize) + ((srcSize)>>8) + (((srcSize) < 128 KB) ? ((128 KB - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+Helper functions
+
+#define ZSTD_COMPRESSBOUND(srcSize)   ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
 size_t      ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case scenario */
 unsigned    ZSTD_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
 const char* ZSTD_getErrorName(size_t code);     /*!< provides readable string from an error code */
@@ -906,7 +906,8 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long
 
size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned value);
 

 Set one compression parameter, selected by enum ZSTD_cParameter.
 Note : when `value` is an enum, cast it to unsigned for proper type checking.
-@result : 0, or an error code (which can be tested with ZSTD_isError()).
+@result : informational value (typically, the one being set, possibly corrected),
+          or an error code (which can be tested with ZSTD_isError()).


size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize);
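
As a usage note on the corrected @result contract above : the sketch below (not part of the patch) shows how a caller may exploit the informational return value. It assumes the v1.3.x advanced API exposed under ZSTD_STATIC_LINKING_ONLY, where the parameter enum is spelled ZSTD_p_compressionLevel; the helper name setLevel is hypothetical.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>
    #include <stdio.h>

    /* set the compression level on a context, checking the informational result */
    static void setLevel(ZSTD_CCtx* cctx, unsigned level)
    {
        size_t const r = ZSTD_CCtx_setParameter(cctx, ZSTD_p_compressionLevel, level);
        if (ZSTD_isError(r)) {
            fprintf(stderr, "setParameter error : %s \n", ZSTD_getErrorName(r));
        } else {
            /* with this patch, r is the value actually set (possibly corrected),
             * no longer always 0 */
            printf("compression level set to %u \n", (unsigned)r);
        }
    }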
diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index b9a2cf85f..1defcb8fe 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -606,7 +606,8 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
                         } else {
                             if (optLevel==0) break;  /* gets ~+10% speed for about -0.01 ratio loss */
                         }
-        }   }   }   }
+            }   }   }
+        }  /* for (cur = 1; cur <= last_pos; cur++) */
 
         best_mlen = opt[last_pos].mlen;
         best_off = opt[last_pos].off;
@@ -661,7 +662,7 @@ _shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
                 anchor = ip;
         }   }
         ZSTD_setLog2Prices(optStatePtr);
-    }   /* for (cur=0; cur < last_pos; ) */
+    }   while (ip < ilimit)
 
     /* Save reps for next block */
     { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqStorePtr->repToConfirm[i] = rep[i]; }
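
For readers following the optimal parser work in patches 1-3 : the statistical price model refreshed by ZSTD_setLog2Prices() can be summarized as cost ~= log2(sum) - log2(freq), so frequent symbols are cheap. The sketch below is a simplification with stand-in names (highbit32, literalPrice); it is not the library implementation, which works on the optState_t tables shown in patch 1.

    #include <stdint.h>

    /* position of the highest set bit : floor(log2(v)), for v >= 1 */
    static unsigned highbit32(uint32_t v)
    {
        unsigned r = 0;
        while (v >>= 1) r++;
        return r;
    }

    /* approximate cost, in bits, of one literal `lit` :
     * price ~= log2(litSum) - log2(litFreq[lit]).
     * log2litSum is the reference log2sum which patch 1 now recomputes only
     * once per series of sequences, instead of after every sequence. */
    static uint32_t literalPrice(const uint32_t litFreq[256], uint32_t log2litSum, uint8_t lit)
    {
        return log2litSum - highbit32(litFreq[lit] + 1);
    }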

From f8d5c478afe143ffff63ac67c3ade593974efa27 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 21 Nov 2017 10:36:14 -0800
Subject: [PATCH 9/9] fixed comment, reported by @gyscos

---
 lib/compress/zstd_opt.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index 1defcb8fe..66a7da19c 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -662,7 +662,7 @@ _shortestPath:   /* cur, last_pos, best_mlen, best_off have to be set */
                 anchor = ip;
         }   }
         ZSTD_setLog2Prices(optStatePtr);
-    }   while (ip < ilimit)
+    }   /* while (ip < ilimit) */
 
     /* Save reps for next block */
     { int i; for (i=0; i<ZSTD_REP_NUM; i++) seqStorePtr->repToConfirm[i] = rep[i]; }
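
Closing note on patch 6 : the ZSTD_count() rewrite keeps the classic match-length technique of XORing both streams one word at a time, then locating the first mismatching byte with a count-trailing-zeros. The standalone sketch below assumes a little-endian 64-bit target and the GCC/Clang builtin __builtin_ctzll; commonLength is a hypothetical name, and the real function uses MEM_readST() / ZSTD_NbCommonBytes() instead of memcpy and a builtin.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* length of the common prefix of pIn and pMatch, reading pIn at most up to pInLimit */
    static size_t commonLength(const uint8_t* pIn, const uint8_t* pMatch, const uint8_t* pInLimit)
    {
        const uint8_t* const pStart = pIn;
        while ((size_t)(pInLimit - pIn) >= sizeof(uint64_t)) {
            uint64_t a, b;
            memcpy(&a, pIn, sizeof(a));
            memcpy(&b, pMatch, sizeof(b));
            uint64_t const diff = a ^ b;   /* first set bit marks the first mismatch */
            if (diff) return (size_t)(pIn - pStart) + ((size_t)__builtin_ctzll(diff) >> 3);
            pIn += sizeof(uint64_t); pMatch += sizeof(uint64_t);
        }
        /* tail : compare the remaining bytes one by one */
        while ((pIn < pInLimit) && (*pMatch == *pIn)) { pIn++; pMatch++; }
        return (size_t)(pIn - pStart);
    }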