From e717a5b0ddb478108f826076f3efef3a17735735 Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Sat, 18 Nov 2017 16:24:02 -0800
Subject: [PATCH 1/9] zstd_opt: minor speed optimization
Calculate reference log2sums only once per series of sequences
(as opposed to once per sequence)
Also: improved code comments
---
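[Editor's note, not part of the original patch: a minimal sketch of the cost model involved, with hypothetical names. A symbol's price is roughly its Shannon cost in bits, log2(sum) - log2(freq). The log2 of each table total only changes when the statistics change, so ZSTD_setLog2Prices() can cache it once per series of sequences instead of recomputing it after every sequence.]

    /* sketch only : floor(log2(v)) for v >= 1, like ZSTD_highbit32() */
    static unsigned log2floor(unsigned v) { unsigned n=0; while (v>>=1) n++; return n; }

    /* hypothetical helper : price of one symbol, in bits */
    static unsigned symbolPrice(unsigned freq, unsigned cachedLog2Sum)
    {
        return cachedLog2Sum - log2floor(freq + 1);   /* +1 guards against log2(0) */
    }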
lib/compress/zstd_compress_internal.h | 19 +++++++++---------
lib/compress/zstd_opt.c | 28 +++++++++++++--------------
2 files changed, 24 insertions(+), 23 deletions(-)
diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h
index efcbc4501..862558b50 100644
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -72,26 +72,27 @@ typedef struct {
} ZSTD_optimal_t;
typedef struct {
- U32* litFreq;
- U32* litLengthFreq;
- U32* matchLengthFreq;
- U32* offCodeFreq;
- ZSTD_match_t* matchTable;
- ZSTD_optimal_t* priceTable;
+ /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */
+ U32* litFreq; /* table of literals statistics, of size 256 */
+ U32* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */
+ U32* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */
+ U32* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */
+ ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */
+ ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */
U32 litSum; /* nb of literals */
U32 litLengthSum; /* nb of litLength codes */
U32 matchLengthSum; /* nb of matchLength codes */
- U32 matchSum; /* one argument to calculate `factor` */
+ U32 matchSum; /* a strange argument used in calculating `factor` */
U32 offCodeSum; /* nb of offset codes */
/* begin updated by ZSTD_setLog2Prices */
U32 log2litSum; /* pow2 to compare log2(litfreq) to */
U32 log2litLengthSum; /* pow2 to compare log2(llfreq) to */
U32 log2matchLengthSum; /* pow2 to compare log2(mlfreq) to */
U32 log2offCodeSum; /* pow2 to compare log2(offreq) to */
- U32 factor; /* added to calculate ZSTD_getPrice() (but why?) */
+ U32 factor; /* fixed cost added when calculating ZSTD_getPrice() (but why ? seems to favor fewer sequences) */
/* end : updated by ZSTD_setLog2Prices */
- U32 staticPrices; /* prices follow a static cost structure, statistics are irrelevant */
+ U32 staticPrices; /* prices follow a pre-defined cost structure, statistics are irrelevant */
U32 cachedPrice;
U32 cachedLitLength;
const BYTE* cachedLiterals;
diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index 04f03ebcd..a626a3d13 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -13,7 +13,7 @@
#include "zstd_lazy.h" /* ZSTD_updateTree, ZSTD_updateTree_extDict */
-#define ZSTD_LITFREQ_ADD 2 /* sort of scaling factor for litSum and litFreq (why the need ?), but also used for matchSum ? */
+#define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats. Also used for matchSum (?) */
#define ZSTD_FREQ_DIV 4 /* log factor when using previous stats to init next stats */
#define ZSTD_MAX_PRICE (1<<30)
@@ -161,25 +161,26 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_getPrice(optState_t* optPtr, U32 litLength, const
}
-static void ZSTD_updatePrice(optState_t* optPtr, U32 litLength, const BYTE* literals, U32 offset, U32 matchLength)
+static void ZSTD_updateStats(optState_t* optPtr, U32 litLength, const BYTE* literals, U32 offsetCode, U32 matchLength)
{
- U32 u;
-
/* literals */
- optPtr->litSum += litLength*ZSTD_LITFREQ_ADD;
- for (u=0; u < litLength; u++)
- optPtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD;
+ { U32 u;
+ optPtr->litSum += litLength*ZSTD_LITFREQ_ADD;
+ for (u=0; u < litLength; u++)
+ optPtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD;
+ }
/* literal Length */
- { const U32 llCode = ZSTD_LLcode(litLength);
+ { U32 const llCode = ZSTD_LLcode(litLength);
optPtr->litLengthFreq[llCode]++;
optPtr->litLengthSum++;
}
- /* match offset */
- { BYTE const offCode = (BYTE)ZSTD_highbit32(offset+1);
- optPtr->offCodeSum++;
+ /* match offset code (0-2=>repCode; 3+=>offset+2) */
+ { U32 const offCode = ZSTD_highbit32(offsetCode+1);
+ assert(offCode <= MaxOff);
optPtr->offCodeFreq[offCode]++;
+ optPtr->offCodeSum++;
}
/* match Length */
@@ -188,8 +189,6 @@ static void ZSTD_updatePrice(optState_t* optPtr, U32 litLength, const BYTE* lite
optPtr->matchLengthFreq[mlCode]++;
optPtr->matchLengthSum++;
}
-
- ZSTD_setLog2Prices(optPtr);
}
@@ -646,10 +645,11 @@ _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */
}
}
- ZSTD_updatePrice(optStatePtr, llen, anchor, offset, mlen);
+ ZSTD_updateStats(optStatePtr, llen, anchor, offset, mlen);
ZSTD_storeSeq(seqStorePtr, llen, anchor, offset, mlen-MINMATCH);
anchor = ip;
} }
+ ZSTD_setLog2Prices(optStatePtr);
} /* for (cur=0; cur < last_pos; ) */
/* Save reps for next block */
From e6da37c4305be931ada88a8242b0d2f71ad73b16 Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Sun, 19 Nov 2017 10:21:21 -0800
Subject: [PATCH 2/9] created (hidden) new strategy btopt0
about +10% faster, but losing ~0.01 compression ratio
(note : amplitude varies a lot depending on the file, but the direction remains the same)
---
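[Editor's note, illustration only: the boolean `ultra` parameter becomes an integer `optLevel` (0 for the new btopt0 variant, up to 2 for btultra), leaving room for intermediate grades. One effect visible in the diff: at optLevel < 2, long-distance offsets get a price handicap, steering the parser toward closer matches for better decompression cache efficiency. A sketch of that handicap, mirroring the constants in the diff:]

    static unsigned offsetHandicap(unsigned offCode, int optLevel)
    {
        return ((optLevel < 2) && (offCode >= 20)) ? (offCode - 19) * 2 : 0;
    }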
lib/compress/zstd_opt.c | 43 ++++++++++++++++++++++++-----------------
1 file changed, 25 insertions(+), 18 deletions(-)
diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index a626a3d13..ed8527989 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -96,7 +96,8 @@ static void ZSTD_rescaleFreqs(optState_t* optPtr, const BYTE* src, size_t srcSiz
}
-static U32 ZSTD_getLiteralPrice(optState_t* optPtr, U32 const litLength, const BYTE* literals)
+static U32 ZSTD_getLiteralPrice(optState_t* const optPtr,
+ U32 const litLength, const BYTE* const literals)
{
U32 price;
@@ -138,17 +139,22 @@ static U32 ZSTD_getLiteralPrice(optState_t* optPtr, U32 const litLength, const B
}
-FORCE_INLINE_TEMPLATE U32 ZSTD_getPrice(optState_t* optPtr, U32 litLength, const BYTE* literals, U32 offset, U32 matchLength, const int ultra)
+/* ZSTD_getPrice() :
+ * optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) */
+FORCE_INLINE_TEMPLATE U32 ZSTD_getPrice(optState_t* optPtr,
+ U32 const litLength, const BYTE* const literals, U32 const offset, U32 const matchLength,
+ int const optLevel)
{
- BYTE const offCode = (BYTE)ZSTD_highbit32(offset+1);
- U32 const mlBase = matchLength - MINMATCH;
U32 price;
+ U32 const offCode = ZSTD_highbit32(offset+1);
+ U32 const mlBase = matchLength - MINMATCH;
+ assert(matchLength >= MINMATCH);
if (optPtr->staticPrices) /* fixed scheme, do not use statistics */
return ZSTD_getLiteralPrice(optPtr, litLength, literals) + ZSTD_highbit32((U32)mlBase+1) + 16 + offCode;
price = offCode + optPtr->log2offCodeSum - ZSTD_highbit32(optPtr->offCodeFreq[offCode]+1);
- if (!ultra /*static*/ && offCode >= 20) price += (offCode-19)*2; /* handicap for long distance offsets, favor decompression speed */
+ if ((optLevel<2) /*static*/ && offCode >= 20) price += (offCode-19)*2; /* handicap for long distance offsets, favor decompression speed */
/* match Length */
{ U32 const mlCode = ZSTD_MLcode(mlBase);
@@ -165,9 +171,9 @@ static void ZSTD_updateStats(optState_t* optPtr, U32 litLength, const BYTE* lite
{
/* literals */
{ U32 u;
- optPtr->litSum += litLength*ZSTD_LITFREQ_ADD;
for (u=0; u < litLength; u++)
optPtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD;
+ optPtr->litSum += litLength*ZSTD_LITFREQ_ADD;
}
/* literal Length */
@@ -451,7 +457,7 @@ repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0)
FORCE_INLINE_TEMPLATE
size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
const void* src, size_t srcSize,
- const int ultra, const int extDict)
+ const int optLevel, const int extDict)
{
seqStore_t* const seqStorePtr = &(ctx->seqStore);
optState_t* const optStatePtr = &(ctx->optState);
@@ -518,7 +524,7 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
U32 const end = matches[matchNb].len;
repcodes_t const repHistory = ZSTD_updateRep(rep, offset, ll0);
while (pos <= end) {
- U32 const matchPrice = ZSTD_getPrice(optStatePtr, litlen, anchor, offset, pos, ultra);
+ U32 const matchPrice = ZSTD_getPrice(optStatePtr, litlen, anchor, offset, pos, optLevel);
if (pos > last_pos || matchPrice < opt[pos].price) {
DEBUGLOG(7, "rPos:%u => set initial price : %u",
pos, matchPrice);
@@ -576,25 +582,26 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
/* set prices using matches found at position == cur */
for (matchNb = 0; matchNb < nbMatches; matchNb++) {
- U32 mlen = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
- U32 const lastML = matches[matchNb].len;
U32 const offset = matches[matchNb].off;
repcodes_t const repHistory = ZSTD_updateRep(opt[cur].rep, offset, ll0);
+ U32 const lastML = matches[matchNb].len;
+ U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch;
+ U32 mlen;
DEBUGLOG(7, "testing match %u => offCode=%u, mlen=%u, llen=%u",
matchNb, matches[matchNb].off, lastML, litlen);
- while (mlen <= lastML) {
+ for (mlen = lastML; mlen >= startML; mlen--) {
U32 const pos = cur + mlen;
- U32 const price = basePrice + ZSTD_getPrice(optStatePtr, litlen, baseLiterals, offset, mlen, ultra);
+ U32 const price = basePrice + ZSTD_getPrice(optStatePtr, litlen, baseLiterals, offset, mlen, optLevel);
if ((pos > last_pos) || (price < opt[pos].price)) {
DEBUGLOG(7, "rPos:%u => new better price (%u<%u)",
pos, price, opt[pos].price);
SET_PRICE(pos, mlen, offset, litlen, price, repHistory); /* note : macro modifies last_pos */
+ } else {
+ if (optLevel==0) break; /* gets ~+10% speed for about 0.01 ratio loss */
}
-
- mlen++;
} } } }
best_mlen = opt[last_pos].mlen;
@@ -663,20 +670,20 @@ _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */
size_t ZSTD_compressBlock_btopt(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
{
DEBUGLOG(5, "ZSTD_compressBlock_btopt");
- return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 0 /*ultra*/, 0 /*extDict*/);
+ return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 0 /*optLevel*/, 0 /*extDict*/);
}
size_t ZSTD_compressBlock_btultra(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
{
- return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 1 /*ultra*/, 0 /*extDict*/);
+ return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 2 /*optLevel*/, 0 /*extDict*/);
}
size_t ZSTD_compressBlock_btopt_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
{
- return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 0 /*ultra*/, 1 /*extDict*/);
+ return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 1 /*optLevel*/, 1 /*extDict*/);
}
size_t ZSTD_compressBlock_btultra_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
{
- return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 1 /*ultra*/, 1 /*extDict*/);
+ return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 2 /*optLevel*/, 1 /*extDict*/);
}
From d100670045fb39adcce509d4bbb3ceb4a0091975 Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Sun, 19 Nov 2017 10:38:02 -0800
Subject: [PATCH 3/9] btopt0 : a bit faster and weaker
---
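[Editor's note, illustrative sketch with a hypothetical helper: the optLevel==0 shortcut below skips match search at positions that are already at least as expensive to reach as their successor, on the heuristic that matches starting there are unlikely to improve the final path:]

    /* price[] : cheapest known cost to reach each position */
    static int worthSearching(const unsigned* price, size_t cur)
    {
        return price[cur] < price[cur + 1];   /* else skip : ~+6% speed, -0.01 ratio */
    }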
lib/compress/zstd_opt.c | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index ed8527989..c62b8897c 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -553,11 +553,15 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
SET_PRICE(cur, 1/*mlen*/, 0/*offset*/, litlen, price, opt[cur-1].rep);
} }
- if (cur == last_pos) break;
-
/* last match must start at a minimum distance of 8 from oend */
if (inr > ilimit) continue;
+ if (cur == last_pos) break;
+
+ if ( (optLevel==0) /*static*/
+ && (opt[cur+1].price <= opt[cur].price) )
+ continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */
+
{ U32 const ll0 = (opt[cur].mlen != 1);
U32 const litlen = (opt[cur].mlen == 1) ? opt[cur].litlen : 0;
U32 const basePrice = (cur > litlen) ? opt[cur-litlen].price : 0;
@@ -600,7 +604,7 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
pos, price, opt[pos].price);
SET_PRICE(pos, mlen, offset, litlen, price, repHistory); /* note : macro modifies last_pos */
} else {
- if (optLevel==0) break; /* gets ~+10% speed for about 0.01 ratio loss */
+ if (optLevel==0) break; /* gets ~+10% speed for about -0.01 ratio loss */
}
} } } }
From 99435dbbab0361c105b1bc6f5c6aa4ab39c89e96 Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Sun, 19 Nov 2017 12:58:04 -0800
Subject: [PATCH 4/9] minor : search early-out on sufficient_len for hc3 and
rep
very small speed and ratio increases
---
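[Editor's note, a sketch under assumptions, not the actual helper: the early-outs added for hc3 and repcodes all test the same predicate, "stop searching as soon as a match is already good enough, or cannot grow any further":]

    static int bestPossible(size_t matchLen, size_t sufficientLen,
                            const unsigned char* matchEnd, const unsigned char* iLimit)
    {
        return (matchLen > sufficientLen) || (matchEnd == iLimit);
    }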
lib/compress/zstd_opt.c | 28 ++++++++++++++--------------
tests/paramgrill.c | 12 ++++--------
2 files changed, 18 insertions(+), 22 deletions(-)
diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index c62b8897c..7fd866f0a 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -240,9 +240,9 @@ FORCE_INLINE_TEMPLATE
U32 ZSTD_insertBtAndGetAllMatches (
ZSTD_CCtx* zc,
const BYTE* const ip, const BYTE* const iLimit, const int extDict,
- U32 nbCompares, U32 const mls,
+ U32 nbCompares, U32 const mls, U32 const sufficient_len,
U32 rep[ZSTD_REP_NUM], U32 const ll0,
- ZSTD_match_t* matches, const U32 minMatchLen)
+ ZSTD_match_t* matches, const U32 lengthToBeat)
{
const BYTE* const base = zc->base;
U32 const current = (U32)(ip-base);
@@ -267,7 +267,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
U32 dummy32; /* to be nullified at the end */
U32 mnum = 0;
- size_t bestLength = minMatchLen-1;
+ size_t bestLength = lengthToBeat-1;
DEBUGLOG(7, "ZSTD_insertBtAndGetAllMatches");
/* check repCode */
@@ -299,7 +299,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
matches[mnum].off = repCode - ll0;
matches[mnum].len = (U32)repLen;
mnum++;
- if ( (repLen > ZSTD_OPT_NUM)
+ if ( (repLen > sufficient_len)
| (ip+repLen == iLimit) ) { /* best possible */
return mnum;
} } } }
@@ -328,7 +328,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
matches[0].off = (current - matchIndex3) + ZSTD_REP_MOVE;
matches[0].len = (U32)mlen;
mnum = 1;
- if ( (mlen > ZSTD_OPT_NUM)
+ if ( (mlen > sufficient_len)
| (ip+mlen == iLimit) ) { /* best possible */
return 1;
} } } }
@@ -394,10 +394,10 @@ U32 ZSTD_insertBtAndGetAllMatches (
FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (
ZSTD_CCtx* zc, /* Index table will be updated */
- const BYTE* ip, const BYTE* const iHighLimit, const int extDict,
- const U32 maxNbAttempts, const U32 matchLengthSearch,
+ const BYTE* ip, const BYTE* const iHighLimit, int const extDict,
+ U32 const maxNbAttempts, U32 const matchLengthSearch, U32 const sufficient_len,
U32 rep[ZSTD_REP_NUM], U32 const ll0,
- ZSTD_match_t* matches, const U32 minMatchLen)
+ ZSTD_match_t* matches, U32 const lengthToBeat)
{
DEBUGLOG(7, "ZSTD_BtGetAllMatches");
if (ip < zc->base + zc->nextToUpdate) return 0; /* skipped area */
@@ -405,12 +405,12 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches (
else ZSTD_updateTree(zc, ip, iHighLimit, maxNbAttempts, matchLengthSearch);
switch(matchLengthSearch)
{
- case 3 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 3, rep, ll0, matches, minMatchLen);
+ case 3 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 3, sufficient_len, rep, ll0, matches, lengthToBeat);
default :
- case 4 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 4, rep, ll0, matches, minMatchLen);
- case 5 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 5, rep, ll0, matches, minMatchLen);
+ case 4 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 4, sufficient_len, rep, ll0, matches, lengthToBeat);
+ case 5 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 5, sufficient_len, rep, ll0, matches, lengthToBeat);
case 7 :
- case 6 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 6, rep, ll0, matches, minMatchLen);
+ case 6 : return ZSTD_insertBtAndGetAllMatches(zc, ip, iHighLimit, extDict, maxNbAttempts, 6, sufficient_len, rep, ll0, matches, lengthToBeat);
}
}
@@ -493,7 +493,7 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
/* find first match */
{ U32 const litlen = (U32)(ip - anchor);
U32 const ll0 = !litlen;
- U32 const nbMatches = ZSTD_BtGetAllMatches(ctx, ip, iend, extDict, maxSearches, mls, rep, ll0, matches, minMatch);
+ U32 const nbMatches = ZSTD_BtGetAllMatches(ctx, ip, iend, extDict, maxSearches, mls, sufficient_len, rep, ll0, matches, minMatch);
if (!nbMatches) { ip++; continue; }
/* initialize opt[0] */
@@ -566,7 +566,7 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
U32 const litlen = (opt[cur].mlen == 1) ? opt[cur].litlen : 0;
U32 const basePrice = (cur > litlen) ? opt[cur-litlen].price : 0;
const BYTE* const baseLiterals = ip + cur - litlen;
- U32 const nbMatches = ZSTD_BtGetAllMatches(ctx, inr, iend, extDict, maxSearches, mls, opt[cur].rep, ll0, matches, minMatch);
+ U32 const nbMatches = ZSTD_BtGetAllMatches(ctx, inr, iend, extDict, maxSearches, mls, sufficient_len, opt[cur].rep, ll0, matches, minMatch);
U32 matchNb;
if (!nbMatches) continue;
assert(baseLiterals >= prefixStart);
diff --git a/tests/paramgrill.c b/tests/paramgrill.c
index a387a44c6..5560e82f6 100644
--- a/tests/paramgrill.c
+++ b/tests/paramgrill.c
@@ -310,14 +310,10 @@ static size_t BMK_benchParam(BMK_result_t* resultPtr,
}
-const char* g_stratName[] = { "ZSTD_fast ",
- "ZSTD_dfast ",
- "ZSTD_greedy ",
- "ZSTD_lazy ",
- "ZSTD_lazy2 ",
- "ZSTD_btlazy2 ",
- "ZSTD_btopt ",
- "ZSTD_btultra "};
+const char* g_stratName[ZSTD_btultra] = {
+ "ZSTD_fast ", "ZSTD_dfast ",
+ "ZSTD_greedy ", "ZSTD_lazy ", "ZSTD_lazy2 ",
+ "ZSTD_btlazy2 ", "ZSTD_btopt ", "ZSTD_btultra "};
static void BMK_printWinner(FILE* f, U32 cLevel, BMK_result_t result, ZSTD_compressionParameters params, size_t srcSize)
{
From 42c1e642704c2c4d1f402296376b59393c25ca1c Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Sun, 19 Nov 2017 13:39:12 -0800
Subject: [PATCH 5/9] slightly improved ratio at -22
merging of repcode search into btsearch introduced a small compression ratio regression at max level :
1.3.2 : 52728769
after repMerge patch : 52760789 (+32020)
A few minor changes have produced this difference.
They can be hard to spot.
This patch buys back about half of the difference,
by no longer inserting the position when a long match is found at hc3.
It feels strangely counter-intuitive, but works :
after this patch : 52742555 (-18234)
---
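[Editor's note, illustration only: one of the "hard to spot" changes below initializes matchEndIdx to current+8+1 instead of current+8, which makes the invariant matchEndIdx > current+8 hold unconditionally, so the final nextToUpdate computation can drop its branch:]

    #include <assert.h>

    static unsigned nextToUpdateFrom(unsigned current, unsigned matchEndIdx)
    {
        assert(matchEndIdx > current + 8);   /* guaranteed by the +8+1 init */
        return matchEndIdx - 8;              /* skip repetitive patterns */
    }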
lib/compress/zstd_lazy.c | 15 ++++++++-------
lib/compress/zstd_opt.c | 10 ++++++----
tests/paramgrill.c | 4 ++--
3 files changed, 16 insertions(+), 13 deletions(-)
diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index cd3d1530a..a2c658bd3 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -16,8 +16,8 @@
* Binary Tree search
***************************************/
/** ZSTD_insertBt1() : add one or multiple positions to tree.
-* ip : assumed <= iend-8 .
-* @return : nb of positions added */
+ * ip : assumed <= iend-8 .
+ * @return : nb of positions added */
static U32 ZSTD_insertBt1(ZSTD_CCtx* zc,
const BYTE* const ip, const BYTE* const iend,
U32 nbCompares, U32 const mls, U32 const extDict)
@@ -42,7 +42,7 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc,
U32* largerPtr = smallerPtr + 1;
U32 dummy32; /* to be nullified at the end */
U32 const windowLow = zc->lowLimit;
- U32 matchEndIdx = current+8;
+ U32 matchEndIdx = current+8+1;
size_t bestLength = 8;
#ifdef ZSTD_C_PREDICT
U32 predictedSmall = *(bt + 2*((current-1)&btMask) + 0);
@@ -122,8 +122,8 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc,
*smallerPtr = *largerPtr = 0;
if (bestLength > 384) return MIN(192, (U32)(bestLength - 384)); /* speed optimization */
- if (matchEndIdx > current + 8) return matchEndIdx - (current + 8);
- return 1;
+ assert(matchEndIdx > current + 8);
+ return matchEndIdx - (current + 8);
}
FORCE_INLINE_TEMPLATE
@@ -182,7 +182,7 @@ static size_t ZSTD_insertBtAndFindBestMatch (
const U32 windowLow = zc->lowLimit;
U32* smallerPtr = bt + 2*(current&btMask);
U32* largerPtr = bt + 2*(current&btMask) + 1;
- U32 matchEndIdx = current+8;
+ U32 matchEndIdx = current+8+1;
U32 dummy32; /* to be nullified at the end */
size_t bestLength = 0;
@@ -233,7 +233,8 @@ static size_t ZSTD_insertBtAndFindBestMatch (
*smallerPtr = *largerPtr = 0;
- zc->nextToUpdate = (matchEndIdx > current + 8) ? matchEndIdx - 8 : current+1; /* skip repetitive patterns */
+ assert(matchEndIdx > current+8);
+ zc->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
return bestLength;
}
diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index 7fd866f0a..7508aa1bc 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -263,7 +263,7 @@ U32 ZSTD_insertBtAndGetAllMatches (
U32 const windowLow = zc->lowLimit;
U32* smallerPtr = bt + 2*(current&btMask);
U32* largerPtr = bt + 2*(current&btMask) + 1;
- U32 matchEndIdx = current+8; /* farthest referenced position of any match => detects repetitive patterns */
+ U32 matchEndIdx = current+8+1; /* farthest referenced position of any match => detects repetitive patterns */
U32 dummy32; /* to be nullified at the end */
U32 mnum = 0;
@@ -328,8 +328,9 @@ U32 ZSTD_insertBtAndGetAllMatches (
matches[0].off = (current - matchIndex3) + ZSTD_REP_MOVE;
matches[0].len = (U32)mlen;
mnum = 1;
- if ( (mlen > sufficient_len)
- | (ip+mlen == iLimit) ) { /* best possible */
+ if ( (mlen > sufficient_len) |
+ (ip+mlen == iLimit) ) { /* best possible length */
+ zc->nextToUpdate = current+1; /* skip insertion */
return 1;
} } } }
@@ -387,7 +388,8 @@ U32 ZSTD_insertBtAndGetAllMatches (
*smallerPtr = *largerPtr = 0;
- zc->nextToUpdate = (matchEndIdx > current + 8) ? matchEndIdx - 8 : current+1; /* skip repetitive patterns */
+ assert(matchEndIdx > current+8);
+ zc->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */
return mnum;
}
diff --git a/tests/paramgrill.c b/tests/paramgrill.c
index 5560e82f6..081a284e4 100644
--- a/tests/paramgrill.c
+++ b/tests/paramgrill.c
@@ -310,8 +310,8 @@ static size_t BMK_benchParam(BMK_result_t* resultPtr,
}
-const char* g_stratName[ZSTD_btultra] = {
- "ZSTD_fast ", "ZSTD_dfast ",
+const char* g_stratName[ZSTD_btultra+1] = {
+ "(none) ", "ZSTD_fast ", "ZSTD_dfast ",
"ZSTD_greedy ", "ZSTD_lazy ", "ZSTD_lazy2 ",
"ZSTD_btlazy2 ", "ZSTD_btopt ", "ZSTD_btultra "};
From 3f457264d153e1a5e08cc2403eb5d6f2887b6c44 Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Sun, 19 Nov 2017 14:40:21 -0800
Subject: [PATCH 6/9] slightly improved compression speed
---
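[Editor's note, a sketch under assumptions; nb_common_bytes() stands in for ZSTD_NbCommonBytes() in the little-endian case: the ZSTD_count() rewrite below peels the first word comparison out of the loop, since most match extensions stop within the first size_t, and the early return can then skip the (pIn - pStart) subtraction:]

    #include <string.h>

    static size_t nb_common_bytes(size_t diff)   /* diff != 0 assumed */
    {
        size_t n = 0;
        while ((diff & 0xFF) == 0) { diff >>= 8; n++; }
        return n;
    }

    static size_t count_first_word(const unsigned char* in, const unsigned char* match)
    {
        size_t x, y;
        memcpy(&x, in, sizeof x);      /* stands in for MEM_readST() */
        memcpy(&y, match, sizeof y);
        if (x != y) return nb_common_bytes(x ^ y);   /* early exit, no subtraction needed */
        return sizeof x;               /* whole word equal : caller keeps looping */
    }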
lib/compress/zstd_compress_internal.h | 18 +++++++++-------
lib/compress/zstd_lazy.c | 30 +++++++++++++--------------
lib/compress/zstd_opt.c | 5 ++---
3 files changed, 27 insertions(+), 26 deletions(-)
diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h
index 862558b50..4fd55c438 100644
--- a/lib/compress/zstd_compress_internal.h
+++ b/lib/compress/zstd_compress_internal.h
@@ -347,13 +347,17 @@ MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* co
const BYTE* const pStart = pIn;
const BYTE* const pInLoopLimit = pInLimit - (sizeof(size_t)-1);
- while (pIn < pInLoopLimit) {
- size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
- if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }
- pIn += ZSTD_NbCommonBytes(diff);
- return (size_t)(pIn - pStart);
- }
- if (MEM_64bits()) if ((pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; }
+ if (pIn < pInLoopLimit) {
+ { size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+ if (diff) return ZSTD_NbCommonBytes(diff); }
+ pIn+=sizeof(size_t); pMatch+=sizeof(size_t);
+ while (pIn < pInLoopLimit) {
+ size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+ if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }
+ pIn += ZSTD_NbCommonBytes(diff);
+ return (size_t)(pIn - pStart);
+ } }
+ if (MEM_64bits() && (pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; }
if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; }
if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
return (size_t)(pIn - pStart);
}
diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index a2c658bd3..9937e24b5 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -85,8 +85,8 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc,
if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
assert(matchIndex+matchLength >= dictLimit); /* might be wrong if extDict is incorrectly set to 0 */
match = base + matchIndex;
- if (match[matchLength] == ip[matchLength])
- matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
+ //if (match[matchLength] == ip[matchLength]) matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
+ matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
} else {
match = dictBase + matchIndex;
matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
@@ -196,8 +196,8 @@ static size_t ZSTD_insertBtAndFindBestMatch (
if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
match = base + matchIndex;
- if (match[matchLength] == ip[matchLength])
- matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
+ //if (match[matchLength] == ip[matchLength]) matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
+ matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
} else {
match = dictBase + matchIndex;
matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart);
@@ -355,14 +355,14 @@ size_t ZSTD_HcFindBestMatch_generic (
U32 matchIndex = ZSTD_insertAndFindFirstIndex (zc, ip, mls);
for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) {
- const BYTE* match;
size_t currentMl=0;
if ((!extDict) || matchIndex >= dictLimit) {
- match = base + matchIndex;
+ const BYTE* const match = base + matchIndex;
if (match[ml] == ip[ml]) /* potentially better */
currentMl = ZSTD_count(ip, match, iLimit);
} else {
- match = dictBase + matchIndex;
+ const BYTE* const match = dictBase + matchIndex;
+ assert(match+4 <= dictEnd);
if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */
currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4;
}
@@ -400,10 +400,10 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS (
FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
- ZSTD_CCtx* zc,
+ ZSTD_CCtx* const zc,
const BYTE* ip, const BYTE* const iLimit,
- size_t* offsetPtr,
- const U32 maxNbAttempts, const U32 matchLengthSearch)
+ size_t* const offsetPtr,
+ U32 const maxNbAttempts, U32 const matchLengthSearch)
{
switch(matchLengthSearch)
{
@@ -522,9 +522,8 @@ size_t ZSTD_compressBlock_lazy_generic(ZSTD_CCtx* ctx,
*/
/* catch up */
if (offset) {
- while ( (start > anchor)
- && (start > base+offset-ZSTD_REP_MOVE)
- && (start[-1] == (start-offset+ZSTD_REP_MOVE)[-1]) ) /* only search for offset within prefix */
+ while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > base))
+ && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */
{ start--; matchLength++; }
offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE);
}
@@ -536,9 +535,8 @@ _storeSequence:
}
/* check immediate repcode */
- while ( (ip <= ilimit)
- && ((offset_2>0)
- & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) {
+ while ( ((ip <= ilimit) & (offset_2>0))
+ && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) {
/* store sequence */
matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4;
offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */
diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index 7508aa1bc..9eb338e18 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -345,9 +345,8 @@ U32 ZSTD_insertBtAndGetAllMatches (
if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
assert(matchIndex+matchLength >= dictLimit); /* ensure the condition is correct when !extDict */
match = base + matchIndex;
- if (match[matchLength] == ip[matchLength]) {
- matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iLimit) +1;
- }
+ //if (match[matchLength] == ip[matchLength]) { matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iLimit) +1; }
+ matchLength += ZSTD_count(ip+matchLength, match+matchLength, iLimit);
} else {
match = dictBase + matchIndex;
matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dictEnd, prefixStart);
From 899f2a29f6f70f7812c3f79259b593c3dde67401 Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Mon, 20 Nov 2017 11:53:20 -0800
Subject: [PATCH 7/9] strategy ZSTD_btopt pinned to (0) variant (faster one)
---
lib/compress/zstd_lazy.c | 2 --
lib/compress/zstd_opt.c | 3 +--
2 files changed, 1 insertion(+), 4 deletions(-)
diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index 9937e24b5..6d4804961 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -85,7 +85,6 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc,
if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
assert(matchIndex+matchLength >= dictLimit); /* might be wrong if extDict is incorrectly set to 0 */
match = base + matchIndex;
- //if (match[matchLength] == ip[matchLength]) matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
} else {
match = dictBase + matchIndex;
@@ -196,7 +195,6 @@ static size_t ZSTD_insertBtAndFindBestMatch (
if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
match = base + matchIndex;
- //if (match[matchLength] == ip[matchLength]) matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iend) +1;
matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend);
} else {
match = dictBase + matchIndex;
diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index 9eb338e18..b9a2cf85f 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -345,7 +345,6 @@ U32 ZSTD_insertBtAndGetAllMatches (
if ((!extDict) || (matchIndex+matchLength >= dictLimit)) {
assert(matchIndex+matchLength >= dictLimit); /* ensure the condition is correct when !extDict */
match = base + matchIndex;
- //if (match[matchLength] == ip[matchLength]) { matchLength += ZSTD_count(ip+matchLength+1, match+matchLength+1, iLimit) +1; }
matchLength += ZSTD_count(ip+matchLength, match+matchLength, iLimit);
} else {
match = dictBase + matchIndex;
@@ -685,7 +684,7 @@ size_t ZSTD_compressBlock_btultra(ZSTD_CCtx* ctx, const void* src, size_t srcSiz
size_t ZSTD_compressBlock_btopt_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
{
- return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 1 /*optLevel*/, 1 /*extDict*/);
+ return ZSTD_compressBlock_opt_generic(ctx, src, srcSize, 0 /*optLevel*/, 1 /*extDict*/);
}
size_t ZSTD_compressBlock_btultra_extDict(ZSTD_CCtx* ctx, const void* src, size_t srcSize)
From 4154aec679e98b76c3e19ccdcb105e89faf9af37 Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Tue, 21 Nov 2017 10:26:17 -0800
Subject: [PATCH 8/9] fixed comment, as suggested by @terrelln
---
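[Editor's check of the formula in the manual hunk below: the margin term (((128<<10) - srcSize) >> 11) does run "from 64 to 0" : at srcSize = 0 it is 131072 >> 11 = 64, and it reaches 0 at srcSize = 128 KB.]

    #include <stddef.h>

    /* sketch matching the documented formula */
    static size_t compressBound(size_t srcSize)
    {
        return srcSize + (srcSize >> 8)
             + ((srcSize < (size_t)(128 << 10)) ? (((size_t)(128 << 10) - srcSize) >> 11) : 0);
    }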
doc/zstd_manual.html | 5 +++--
lib/compress/zstd_opt.c | 5 +++--
2 files changed, 6 insertions(+), 4 deletions(-)
diff --git a/doc/zstd_manual.html b/doc/zstd_manual.html
index 9697ebd57..e50f46661 100644
--- a/doc/zstd_manual.html
+++ b/doc/zstd_manual.html
@@ -111,7 +111,7 @@ unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
@return : content size to be decompressed, as a 64-bits value _if known and not empty_, 0 otherwise.
-Helper functions
-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < 128 KB) ? ((128 KB - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+Helper functions
+#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case scenario */
unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */
const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */
@@ -906,7 +906,8 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long
size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, unsigned value);
Set one compression parameter, selected by enum ZSTD_cParameter.
Note : when `value` is an enum, cast it to unsigned for proper type checking.
- @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ @result : informational value (typically, the one being set, possibly corrected),
+ or an error code (which can be tested with ZSTD_isError()).
size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize);
diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index b9a2cf85f..1defcb8fe 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -606,7 +606,8 @@ size_t ZSTD_compressBlock_opt_generic(ZSTD_CCtx* ctx,
} else {
if (optLevel==0) break; /* gets ~+10% speed for about -0.01 ratio loss */
}
- } } } }
+ } } }
+ } /* for (cur = 1; cur <= last_pos; cur++) */
best_mlen = opt[last_pos].mlen;
best_off = opt[last_pos].off;
@@ -661,7 +662,7 @@ _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */
anchor = ip;
} }
ZSTD_setLog2Prices(optStatePtr);
- } /* for (cur=0; cur < last_pos; ) */
+ } while (ip < ilimit)
/* Save reps for next block */
{ int i; for (i=0; i<ZSTD_REP_NUM; i++) seqStorePtr->repToConfirm[i] = rep[i]; }
From f8d5c478afe143ffff63ac67c3ade593974efa27 Mon Sep 17 00:00:00 2001
From: Yann Collet
Date: Tue, 21 Nov 2017 10:36:14 -0800
Subject: [PATCH 9/9] fixed comment, reported by @gyscos
---
lib/compress/zstd_opt.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/lib/compress/zstd_opt.c b/lib/compress/zstd_opt.c
index 1defcb8fe..66a7da19c 100644
--- a/lib/compress/zstd_opt.c
+++ b/lib/compress/zstd_opt.c
@@ -662,7 +662,7 @@ _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */
anchor = ip;
} }
ZSTD_setLog2Prices(optStatePtr);
- } while (ip < ilimit)
+ } /* while (ip < ilimit) */
/* Save reps for next block */
{ int i; for (i=0; i<ZSTD_REP_NUM; i++) seqStorePtr->repToConfirm[i] = rep[i]; }