From 50524bf0da482e4b340b709eaa3ef0e4a83d17ba Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 23 Nov 2016 15:11:07 -0800 Subject: [PATCH 01/13] delayed decompression --- lib/decompress/zstd_decompress.c | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index cc92ca276..1890a8530 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -1013,7 +1013,9 @@ static size_t ZSTD_decompressSequences( /* Regen sequences */ if (nbSeq) { + seq_t sequences[4]; seqState_t seqState; + int seqNb; dctx->fseEntropy = 1; { U32 i; for (i=0; irep[i]; } CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected); @@ -1021,16 +1023,31 @@ static size_t ZSTD_decompressSequences( FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); - for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; ) { - nbSeq--; - { seq_t const sequence = ZSTD_decodeSequence(&seqState); - size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd); + /* prepare in advance */ + int const seqAdvance = MIN(nbSeq, 3); + for (seqNb=0; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && seqNbrep[i] = (U32)(seqState.prevOffset[i]); } } From 73f88a66f17539f86f03fa0ecf7c0e56c23d02da Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Wed, 23 Nov 2016 15:43:30 -0800 Subject: [PATCH 02/13] added prefetch --- lib/decompress/zstd_decompress.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index 1890a8530..d85c803b1 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -775,6 +775,7 @@ typedef struct { size_t litLength; size_t matchLength; size_t offset; + const BYTE* match; } seq_t; typedef struct { @@ -783,6 +784,7 @@ typedef struct { FSE_DState_t stateOffb; FSE_DState_t stateML; size_t prevOffset[ZSTD_REP_NUM]; + const BYTE* ptr; } seqState_t; @@ -851,11 +853,14 @@ static seq_t ZSTD_decodeSequence(seqState_t* seqState) if (MEM_32bits() || (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BIT_reloadDStream(&seqState->DStream); + seq.match = seqState->ptr + seq.litLength - seq.offset; /* only for single memory segment ! */ + /* ANS state update */ FSE_updateState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ FSE_updateState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ FSE_updateState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ + seqState->ptr += seq.matchLength + seq.litLength; return seq; } @@ -919,7 +924,7 @@ size_t ZSTD_execSequence(BYTE* op, BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; const BYTE* const iLitEnd = *litPtr + sequence.litLength; - const BYTE* match = oLitEnd - sequence.offset; + const BYTE* match = sequence.match; /* check */ if (oMatchEnd>oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */ @@ -987,6 +992,8 @@ size_t ZSTD_execSequence(BYTE* op, return sequenceLength; } +#include +#define PREFETCH(ptr) _mm_prefetch(ptr, _MM_HINT_T0); static size_t ZSTD_decompressSequences( ZSTD_DCtx* dctx, @@ -1018,6 +1025,7 @@ static size_t ZSTD_decompressSequences( int seqNb; dctx->fseEntropy = 1; { U32 i; for (i=0; irep[i]; } + seqState.ptr = op; CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected); FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); @@ -1025,28 +1033,28 @@ static size_t ZSTD_decompressSequences( /* prepare in advance */ int const seqAdvance = MIN(nbSeq, 3); - for (seqNb=0; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && seqNbrep[i] = (U32)(seqState.prevOffset[i]); } From 764e70a4f35c3f84a5bc61e42a1c856809d46eb5 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 28 Nov 2016 15:50:16 -0800 Subject: [PATCH 03/13] added decodeSequencesLong --- lib/decompress/zstd_decompress.c | 80 ++++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 8 deletions(-) diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index d85c803b1..ea5722e2a 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -927,9 +927,11 @@ size_t ZSTD_execSequence(BYTE* op, const BYTE* match = sequence.match; /* check */ +#if 1 if (oMatchEnd>oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */ if (iLitEnd > litLimit) return ERROR(corruption_detected); /* over-read beyond lit buffer */ if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd); +#endif /* copy Literals */ ZSTD_copy8(op, *litPtr); @@ -939,6 +941,7 @@ size_t ZSTD_execSequence(BYTE* op, *litPtr = iLitEnd; /* update for next sequence */ /* copy Match */ +#if 1 if (sequence.offset > (size_t)(oLitEnd - base)) { /* offset beyond prefix */ if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected); @@ -960,6 +963,7 @@ size_t ZSTD_execSequence(BYTE* op, } } } /* Requirement: op <= oend_w */ +#endif /* match within prefix */ if (sequence.offset < 8) { @@ -992,8 +996,6 @@ size_t ZSTD_execSequence(BYTE* op, return sequenceLength; } -#include -#define PREFETCH(ptr) _mm_prefetch(ptr, _MM_HINT_T0); static size_t ZSTD_decompressSequences( ZSTD_DCtx* dctx, @@ -1020,7 +1022,68 @@ static size_t ZSTD_decompressSequences( /* Regen sequences */ if (nbSeq) { - seq_t sequences[4]; + seqState_t seqState; + dctx->fseEntropy = 1; + { U32 i; for (i=0; irep[i]; } + CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected); + FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + for ( ; (BIT_reloadDStream(&(seqState.DStream)) <= BIT_DStream_completed) && nbSeq ; ) { + nbSeq--; + { seq_t const sequence = ZSTD_decodeSequence(&seqState); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, base, vBase, dictEnd); + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; + } } + + /* check if reached exact end */ + if (nbSeq) return ERROR(corruption_detected); + /* save reps for next block */ + { U32 i; for (i=0; irep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ + { size_t const lastLLSize = litEnd - litPtr; + if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall); + memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } + + return op-ostart; +} + +#define ZSTD_PREFETCH(ptr) __builtin_prefetch(ptr, 0, 0); +static size_t ZSTD_decompressSequencesLong( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize) +{ + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE* const)dst; + BYTE* const oend = ostart + maxDstSize; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const base = (const BYTE*) (dctx->base); + const BYTE* const vBase = (const BYTE*) (dctx->vBase); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); + int nbSeq; + + /* Build Decoding Tables */ + { size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, seqSize); + if (ZSTD_isError(seqHSize)) return seqHSize; + ip += seqHSize; + } + + /* Regen sequences */ + if (nbSeq) { +#define STORED_SEQS 8 +#define STOSEQ_MASK (STORED_SEQS-1) +#define ADVANCED_SEQS 5 + seq_t sequences[STORED_SEQS]; seqState_t seqState; int seqNb; dctx->fseEntropy = 1; @@ -1032,7 +1095,7 @@ static size_t ZSTD_decompressSequences( FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); /* prepare in advance */ - int const seqAdvance = MIN(nbSeq, 3); + int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && seqNb Date: Mon, 28 Nov 2016 16:11:30 -0800 Subject: [PATCH 04/13] restored normal mode --- lib/decompress/zstd_decompress.c | 321 +++++++++++++++++++++++-------- 1 file changed, 238 insertions(+), 83 deletions(-) diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index ea5722e2a..d96dd7841 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -788,7 +788,7 @@ typedef struct { } seqState_t; -static seq_t ZSTD_decodeSequence(seqState_t* seqState) +static seq_t ZSTD_decodeSequenceLong(seqState_t* seqState) { seq_t seq; @@ -914,7 +914,7 @@ size_t ZSTD_execSequenceLast7(BYTE* op, FORCE_INLINE -size_t ZSTD_execSequence(BYTE* op, +size_t ZSTD_execSequenceLong(BYTE* op, BYTE* const oend, seq_t sequence, const BYTE** litPtr, const BYTE* const litLimit, const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd) @@ -997,6 +997,242 @@ size_t ZSTD_execSequence(BYTE* op, } +#define ZSTD_PREFETCH(ptr) __builtin_prefetch(ptr, 0, 0); +static size_t ZSTD_decompressSequencesLong( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize) +{ + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE* const)dst; + BYTE* const oend = ostart + maxDstSize; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const base = (const BYTE*) (dctx->base); + const BYTE* const vBase = (const BYTE*) (dctx->vBase); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); + int nbSeq; + + /* Build Decoding Tables */ + { size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, seqSize); + if (ZSTD_isError(seqHSize)) return seqHSize; + ip += seqHSize; + } + + /* Regen sequences */ + if (nbSeq) { +#define STORED_SEQS 8 +#define STOSEQ_MASK (STORED_SEQS-1) +#define ADVANCED_SEQS 5 + seq_t sequences[STORED_SEQS]; + int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); + seqState_t seqState; + int seqNb; + dctx->fseEntropy = 1; + { U32 i; for (i=0; irep[i]; } + seqState.ptr = op; + CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected); + FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + /* prepare in advance */ + for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && seqNbrep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ + { size_t const lastLLSize = litEnd - litPtr; + if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall); + memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } + + return op-ostart; +} + + +static seq_t ZSTD_decodeSequence(seqState_t* seqState) +{ + seq_t seq; + + U32 const llCode = FSE_peekSymbol(&seqState->stateLL); + U32 const mlCode = FSE_peekSymbol(&seqState->stateML); + U32 const ofCode = FSE_peekSymbol(&seqState->stateOffb); /* <= maxOff, by table construction */ + + U32 const llBits = LL_bits[llCode]; + U32 const mlBits = ML_bits[mlCode]; + U32 const ofBits = ofCode; + U32 const totalBits = llBits+mlBits+ofBits; + + static const U32 LL_base[MaxLL+1] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, + 0x2000, 0x4000, 0x8000, 0x10000 }; + + static const U32 ML_base[MaxML+1] = { + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, + 35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, + 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 }; + + static const U32 OF_base[MaxOff+1] = { + 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, + 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, + 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, + 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD }; + + /* sequence */ + { size_t offset; + if (!ofCode) + offset = 0; + else { + offset = OF_base[ofCode] + BIT_readBits(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); + } + + if (ofCode <= 1) { + offset += (llCode==0); + if (offset) { + size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; + temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; + } else { + offset = seqState->prevOffset[0]; + } + } else { + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; + } + seq.offset = offset; + } + + seq.matchLength = ML_base[mlCode] + ((mlCode>31) ? BIT_readBits(&seqState->DStream, mlBits) : 0); /* <= 16 bits */ + if (MEM_32bits() && (mlBits+llBits>24)) BIT_reloadDStream(&seqState->DStream); + + seq.litLength = LL_base[llCode] + ((llCode>15) ? BIT_readBits(&seqState->DStream, llBits) : 0); /* <= 16 bits */ + if (MEM_32bits() || + (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BIT_reloadDStream(&seqState->DStream); + + /* ANS state update */ + FSE_updateState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ + FSE_updateState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + FSE_updateState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ + + return seq; +} + + +FORCE_INLINE +size_t ZSTD_execSequence(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ + BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + + /* check */ + if (oMatchEnd>oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */ + if (iLitEnd > litLimit) return ERROR(corruption_detected); /* over-read beyond lit buffer */ + if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd); + + /* copy Literals */ + ZSTD_copy8(op, *litPtr); + if (sequence.litLength > 8) + ZSTD_wildcopy(op+8, (*litPtr)+8, sequence.litLength - 8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */ + op = oLitEnd; + *litPtr = iLitEnd; /* update for next sequence */ + + /* copy Match */ + if (sequence.offset > (size_t)(oLitEnd - base)) { + /* offset beyond prefix */ + if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected); + match = dictEnd - (base-match); + if (match + sequence.matchLength <= dictEnd) { + memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = base; + if (op > oend_w) { + U32 i; + for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i]; + return sequenceLength; + } + } } + /* Requirement: op <= oend_w */ + + /* match within prefix */ + if (sequence.offset < 8) { + /* close range match, overlap */ + static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */ + static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* substracted */ + int const sub2 = dec64table[sequence.offset]; + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += dec32table[sequence.offset]; + ZSTD_copy4(op+4, match); + match -= sub2; + } else { + ZSTD_copy8(op, match); + } + op += 8; match += 8; + + if (oMatchEnd > oend-(16-MINMATCH)) { + if (op < oend_w) { + ZSTD_wildcopy(op, match, oend_w - op); + match += oend_w - op; + op = oend_w; + } + while (op < oMatchEnd) *op++ = *match++; + } else { + ZSTD_wildcopy(op, match, sequence.matchLength-8); /* works even if matchLength < 8 */ + } + return sequenceLength; +} + + static size_t ZSTD_decompressSequences( ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, @@ -1054,86 +1290,6 @@ static size_t ZSTD_decompressSequences( return op-ostart; } -#define ZSTD_PREFETCH(ptr) __builtin_prefetch(ptr, 0, 0); -static size_t ZSTD_decompressSequencesLong( - ZSTD_DCtx* dctx, - void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize) -{ - const BYTE* ip = (const BYTE*)seqStart; - const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE* const)dst; - BYTE* const oend = ostart + maxDstSize; - BYTE* op = ostart; - const BYTE* litPtr = dctx->litPtr; - const BYTE* const litEnd = litPtr + dctx->litSize; - const BYTE* const base = (const BYTE*) (dctx->base); - const BYTE* const vBase = (const BYTE*) (dctx->vBase); - const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); - int nbSeq; - - /* Build Decoding Tables */ - { size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, seqSize); - if (ZSTD_isError(seqHSize)) return seqHSize; - ip += seqHSize; - } - - /* Regen sequences */ - if (nbSeq) { -#define STORED_SEQS 8 -#define STOSEQ_MASK (STORED_SEQS-1) -#define ADVANCED_SEQS 5 - seq_t sequences[STORED_SEQS]; - seqState_t seqState; - int seqNb; - dctx->fseEntropy = 1; - { U32 i; for (i=0; irep[i]; } - seqState.ptr = op; - CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected); - FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); - FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); - FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); - - /* prepare in advance */ - int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); - for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && seqNbrep[i] = (U32)(seqState.prevOffset[i]); } - } - - /* last literal segment */ - { size_t const lastLLSize = litEnd - litPtr; - if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall); - memcpy(op, litPtr, lastLLSize); - op += lastLLSize; - } - - return op-ostart; -} - static void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst) { @@ -1145,7 +1301,6 @@ static void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst) } } - static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) From ce3527ca0c11bce66b5d64d36a3f1f0355d4c10c Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 28 Nov 2016 18:38:52 -0800 Subject: [PATCH 05/13] combined normal and long decoder --- lib/decompress/zstd_decompress.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index d96dd7841..2d4087335 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -56,6 +56,15 @@ #endif +#if defined(_MSC_VER) +# include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ +# define ZSTD_PREFETCH(ptr) _mm_prefetch(ptr, _MM_HINT_T0) +#elif defined(__GNUC__) +# define ZSTD_PREFETCH(ptr) __builtin_prefetch(ptr, 0, 0) +#else +# define ZSTD_PREFETCH(ptr) /* disabled */ +#endif + /*-************************************* * Macros ***************************************/ @@ -997,7 +1006,6 @@ size_t ZSTD_execSequenceLong(BYTE* op, } -#define ZSTD_PREFETCH(ptr) __builtin_prefetch(ptr, 0, 0); static size_t ZSTD_decompressSequencesLong( ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, @@ -1023,9 +1031,9 @@ static size_t ZSTD_decompressSequencesLong( /* Regen sequences */ if (nbSeq) { -#define STORED_SEQS 8 +#define STORED_SEQS 4 #define STOSEQ_MASK (STORED_SEQS-1) -#define ADVANCED_SEQS 5 +#define ADVANCED_SEQS 4 seq_t sequences[STORED_SEQS]; int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); seqState_t seqState; @@ -1315,6 +1323,8 @@ static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, ip += litCSize; srcSize -= litCSize; } + if (dctx->fParams.windowSize > 23) + return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize); return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize); } From 52e136ed3d174d134500fbf679bec9ddcbdff490 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Mon, 28 Nov 2016 19:59:11 -0800 Subject: [PATCH 06/13] long decoder compatible with round and separate buffers --- lib/common/mem.h | 4 +++- lib/decompress/zstd_decompress.c | 8 ++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/lib/common/mem.h b/lib/common/mem.h index 681dd35d2..32c63dd17 100644 --- a/lib/common/mem.h +++ b/lib/common/mem.h @@ -55,14 +55,16 @@ MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (size typedef int32_t S32; typedef uint64_t U64; typedef int64_t S64; + typedef intptr_t iPtrDiff; #else - typedef unsigned char BYTE; + typedef unsigned char BYTE; typedef unsigned short U16; typedef signed short S16; typedef unsigned int U32; typedef signed int S32; typedef unsigned long long U64; typedef signed long long S64; + typedef ptrdiff_t iPtrDiff; #endif diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index 2d4087335..da3cf56e2 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -794,6 +794,8 @@ typedef struct { FSE_DState_t stateML; size_t prevOffset[ZSTD_REP_NUM]; const BYTE* ptr; + const BYTE* base; + iPtrDiff gotoDict; } seqState_t; @@ -862,7 +864,8 @@ static seq_t ZSTD_decodeSequenceLong(seqState_t* seqState) if (MEM_32bits() || (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BIT_reloadDStream(&seqState->DStream); - seq.match = seqState->ptr + seq.litLength - seq.offset; /* only for single memory segment ! */ + seq.match = seqState->ptr + seq.litLength - seq.offset; /* single memory segment ! */ + if (seq.match < seqState->base) seq.match += seqState->gotoDict; /* what if seq.match underflows ? prefetch is a non issue, but match has now a wrong address */ /* ANS state update */ FSE_updateState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ @@ -954,7 +957,6 @@ size_t ZSTD_execSequenceLong(BYTE* op, if (sequence.offset > (size_t)(oLitEnd - base)) { /* offset beyond prefix */ if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected); - match = dictEnd - (base-match); if (match + sequence.matchLength <= dictEnd) { memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; @@ -1041,6 +1043,8 @@ static size_t ZSTD_decompressSequencesLong( dctx->fseEntropy = 1; { U32 i; for (i=0; irep[i]; } seqState.ptr = op; + seqState.base = base; + seqState.gotoDict = (iPtrDiff)(dictEnd - base); CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected); FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); From 05c00f2ff771bf80488b8c106de5dd2d275b2512 Mon Sep 17 00:00:00 2001 From: Nick Terrell Date: Tue, 29 Nov 2016 11:46:37 -0800 Subject: [PATCH 07/13] Fix ZSTD_STATIC_LINKING_ONLY with double include --- lib/zstd.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/lib/zstd.h b/lib/zstd.h index 7db135b5b..607612026 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -7,13 +7,13 @@ * of patent rights can be found in the PATENTS file in the same directory. */ -#ifndef ZSTD_H_235446 -#define ZSTD_H_235446 - #if defined (__cplusplus) extern "C" { #endif +#ifndef ZSTD_H_235446 +#define ZSTD_H_235446 + /* ====== Dependency ======*/ #include /* size_t */ @@ -313,9 +313,11 @@ ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* outp ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ ZSTDLIB_API size_t ZSTD_DStreamOutSize(void); /*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */ +#endif /* ZSTD_H_235446 */ -#ifdef ZSTD_STATIC_LINKING_ONLY +#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) +#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY /**************************************************************************************** * START OF ADVANCED AND EXPERIMENTAL FUNCTIONS @@ -642,10 +644,8 @@ ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCa ZSTDLIB_API size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /**< insert block into `dctx` history. Useful for uncompressed blocks */ -#endif /* ZSTD_STATIC_LINKING_ONLY */ +#endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ #if defined (__cplusplus) } #endif - -#endif /* ZSTD_H_235446 */ From 4f5350f610a93817554bf402946ae9203a6ca7b7 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 29 Nov 2016 13:12:24 -0800 Subject: [PATCH 08/13] long matches support overflow --- lib/decompress/zstd_decompress.c | 14 ++++++++------ programs/bench.c | 6 +++--- tests/playTests.sh | 2 +- 3 files changed, 12 insertions(+), 10 deletions(-) diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index da3cf56e2..6539eb41e 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -793,8 +793,8 @@ typedef struct { FSE_DState_t stateOffb; FSE_DState_t stateML; size_t prevOffset[ZSTD_REP_NUM]; - const BYTE* ptr; const BYTE* base; + size_t pos; iPtrDiff gotoDict; } seqState_t; @@ -864,15 +864,17 @@ static seq_t ZSTD_decodeSequenceLong(seqState_t* seqState) if (MEM_32bits() || (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BIT_reloadDStream(&seqState->DStream); - seq.match = seqState->ptr + seq.litLength - seq.offset; /* single memory segment ! */ - if (seq.match < seqState->base) seq.match += seqState->gotoDict; /* what if seq.match underflows ? prefetch is a non issue, but match has now a wrong address */ + { size_t const pos = seqState->pos + seq.litLength; + seq.match = seqState->base + pos - seq.offset; /* single memory segment */ + if (seq.offset > pos) seq.match += seqState->gotoDict; /* separate memory segment */ + seqState->pos = pos + seq.matchLength; + } /* ANS state update */ FSE_updateState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ FSE_updateState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ FSE_updateState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ - seqState->ptr += seq.matchLength + seq.litLength; return seq; } @@ -1042,8 +1044,8 @@ static size_t ZSTD_decompressSequencesLong( int seqNb; dctx->fseEntropy = 1; { U32 i; for (i=0; irep[i]; } - seqState.ptr = op; seqState.base = base; + seqState.pos = (size_t)(op-base); seqState.gotoDict = (iPtrDiff)(dictEnd - base); CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected); FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); @@ -1194,7 +1196,7 @@ size_t ZSTD_execSequence(BYTE* op, if (sequence.offset > (size_t)(oLitEnd - base)) { /* offset beyond prefix */ if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected); - match = dictEnd - (base-match); + match += (dictEnd-base); if (match + sequence.matchLength <= dictEnd) { memmove(oLitEnd, match, sequence.matchLength); return sequenceLength; diff --git a/programs/bench.c b/programs/bench.c index df68d0ada..4efba160a 100644 --- a/programs/bench.c +++ b/programs/bench.c @@ -175,7 +175,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize, { U64 fastestC = (U64)(-1LL), fastestD = (U64)(-1LL); U64 const crcOrig = XXH64(srcBuffer, srcSize, 0); UTIL_time_t coolTime; - U64 const maxTime = (g_nbSeconds * TIMELOOP_MICROSEC) + 100; + U64 const maxTime = (g_nbSeconds * TIMELOOP_MICROSEC) + 1; U64 totalCTime=0, totalDTime=0; U32 cCompleted=0, dCompleted=0; # define NB_MARKS 4 @@ -234,7 +234,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize, { U64 const clockSpan = UTIL_clockSpanMicro(clockStart, ticksPerSecond); if (clockSpan < fastestC*nbLoops) fastestC = clockSpan / nbLoops; totalCTime += clockSpan; - cCompleted = totalCTime>maxTime; + cCompleted = (totalCTime >= maxTime); } } cSize = 0; @@ -279,7 +279,7 @@ static int BMK_benchMem(const void* srcBuffer, size_t srcSize, { U64 const clockSpan = UTIL_clockSpanMicro(clockStart, ticksPerSecond); if (clockSpan < fastestD*nbLoops) fastestD = clockSpan / nbLoops; totalDTime += clockSpan; - dCompleted = totalDTime>maxTime; + dCompleted = (totalDTime >= maxTime); } } markNb = (markNb+1) % NB_MARKS; diff --git a/tests/playTests.sh b/tests/playTests.sh index a50604b7e..c539d567f 100755 --- a/tests/playTests.sh +++ b/tests/playTests.sh @@ -16,7 +16,7 @@ roundTripTest() { rm -f tmp1 tmp2 $ECHO "roundTripTest: ./datagen $1 $p | $ZSTD -v$c | $ZSTD -d" ./datagen $1 $p | $MD5SUM > tmp1 - ./datagen $1 $p | $ZSTD -v$c | $ZSTD -d | $MD5SUM > tmp2 + ./datagen $1 $p | $ZSTD --ultra -v$c | $ZSTD -d | $MD5SUM > tmp2 diff -q tmp1 tmp2 } From 37870d7a668f9cf5f5d7b4d4b4c85ee700fe1830 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 29 Nov 2016 14:31:57 -0800 Subject: [PATCH 09/13] fixed minor visual warning --- lib/decompress/zstd_decompress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index 6539eb41e..d6d97c1d4 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -58,7 +58,7 @@ #if defined(_MSC_VER) # include /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */ -# define ZSTD_PREFETCH(ptr) _mm_prefetch(ptr, _MM_HINT_T0) +# define ZSTD_PREFETCH(ptr) _mm_prefetch((const char*)ptr, _MM_HINT_T0) #elif defined(__GNUC__) # define ZSTD_PREFETCH(ptr) __builtin_prefetch(ptr, 0, 0) #else From a56ac2815c971c98051e80a9d0d7828038b9d837 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 29 Nov 2016 15:30:23 -0800 Subject: [PATCH 10/13] restored normal decoder speed --- lib/decompress/zstd_decompress.c | 503 +++++++++++++++---------------- 1 file changed, 251 insertions(+), 252 deletions(-) diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index d6d97c1d4..3e9be3f6a 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -799,87 +799,6 @@ typedef struct { } seqState_t; -static seq_t ZSTD_decodeSequenceLong(seqState_t* seqState) -{ - seq_t seq; - - U32 const llCode = FSE_peekSymbol(&seqState->stateLL); - U32 const mlCode = FSE_peekSymbol(&seqState->stateML); - U32 const ofCode = FSE_peekSymbol(&seqState->stateOffb); /* <= maxOff, by table construction */ - - U32 const llBits = LL_bits[llCode]; - U32 const mlBits = ML_bits[mlCode]; - U32 const ofBits = ofCode; - U32 const totalBits = llBits+mlBits+ofBits; - - static const U32 LL_base[MaxLL+1] = { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, - 0x2000, 0x4000, 0x8000, 0x10000 }; - - static const U32 ML_base[MaxML+1] = { - 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, - 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, - 35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, - 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 }; - - static const U32 OF_base[MaxOff+1] = { - 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, - 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, - 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, - 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD }; - - /* sequence */ - { size_t offset; - if (!ofCode) - offset = 0; - else { - offset = OF_base[ofCode] + BIT_readBits(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); - } - - if (ofCode <= 1) { - offset += (llCode==0); - if (offset) { - size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; - temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ - if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset = temp; - } else { - offset = seqState->prevOffset[0]; - } - } else { - seqState->prevOffset[2] = seqState->prevOffset[1]; - seqState->prevOffset[1] = seqState->prevOffset[0]; - seqState->prevOffset[0] = offset; - } - seq.offset = offset; - } - - seq.matchLength = ML_base[mlCode] + ((mlCode>31) ? BIT_readBits(&seqState->DStream, mlBits) : 0); /* <= 16 bits */ - if (MEM_32bits() && (mlBits+llBits>24)) BIT_reloadDStream(&seqState->DStream); - - seq.litLength = LL_base[llCode] + ((llCode>15) ? BIT_readBits(&seqState->DStream, llBits) : 0); /* <= 16 bits */ - if (MEM_32bits() || - (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BIT_reloadDStream(&seqState->DStream); - - { size_t const pos = seqState->pos + seq.litLength; - seq.match = seqState->base + pos - seq.offset; /* single memory segment */ - if (seq.offset > pos) seq.match += seqState->gotoDict; /* separate memory segment */ - seqState->pos = pos + seq.matchLength; - } - - /* ANS state update */ - FSE_updateState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ - FSE_updateState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ - if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ - FSE_updateState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ - - return seq; -} - - FORCE_NOINLINE size_t ZSTD_execSequenceLast7(BYTE* op, BYTE* const oend, seq_t sequence, @@ -927,169 +846,6 @@ size_t ZSTD_execSequenceLast7(BYTE* op, } -FORCE_INLINE -size_t ZSTD_execSequenceLong(BYTE* op, - BYTE* const oend, seq_t sequence, - const BYTE** litPtr, const BYTE* const litLimit, - const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd) -{ - BYTE* const oLitEnd = op + sequence.litLength; - size_t const sequenceLength = sequence.litLength + sequence.matchLength; - BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ - BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; - const BYTE* const iLitEnd = *litPtr + sequence.litLength; - const BYTE* match = sequence.match; - - /* check */ -#if 1 - if (oMatchEnd>oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */ - if (iLitEnd > litLimit) return ERROR(corruption_detected); /* over-read beyond lit buffer */ - if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd); -#endif - - /* copy Literals */ - ZSTD_copy8(op, *litPtr); - if (sequence.litLength > 8) - ZSTD_wildcopy(op+8, (*litPtr)+8, sequence.litLength - 8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */ - op = oLitEnd; - *litPtr = iLitEnd; /* update for next sequence */ - - /* copy Match */ -#if 1 - if (sequence.offset > (size_t)(oLitEnd - base)) { - /* offset beyond prefix */ - if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected); - if (match + sequence.matchLength <= dictEnd) { - memmove(oLitEnd, match, sequence.matchLength); - return sequenceLength; - } - /* span extDict & currentPrefixSegment */ - { size_t const length1 = dictEnd - match; - memmove(oLitEnd, match, length1); - op = oLitEnd + length1; - sequence.matchLength -= length1; - match = base; - if (op > oend_w) { - U32 i; - for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i]; - return sequenceLength; - } - } } - /* Requirement: op <= oend_w */ -#endif - - /* match within prefix */ - if (sequence.offset < 8) { - /* close range match, overlap */ - static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */ - static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* substracted */ - int const sub2 = dec64table[sequence.offset]; - op[0] = match[0]; - op[1] = match[1]; - op[2] = match[2]; - op[3] = match[3]; - match += dec32table[sequence.offset]; - ZSTD_copy4(op+4, match); - match -= sub2; - } else { - ZSTD_copy8(op, match); - } - op += 8; match += 8; - - if (oMatchEnd > oend-(16-MINMATCH)) { - if (op < oend_w) { - ZSTD_wildcopy(op, match, oend_w - op); - match += oend_w - op; - op = oend_w; - } - while (op < oMatchEnd) *op++ = *match++; - } else { - ZSTD_wildcopy(op, match, sequence.matchLength-8); /* works even if matchLength < 8 */ - } - return sequenceLength; -} - - -static size_t ZSTD_decompressSequencesLong( - ZSTD_DCtx* dctx, - void* dst, size_t maxDstSize, - const void* seqStart, size_t seqSize) -{ - const BYTE* ip = (const BYTE*)seqStart; - const BYTE* const iend = ip + seqSize; - BYTE* const ostart = (BYTE* const)dst; - BYTE* const oend = ostart + maxDstSize; - BYTE* op = ostart; - const BYTE* litPtr = dctx->litPtr; - const BYTE* const litEnd = litPtr + dctx->litSize; - const BYTE* const base = (const BYTE*) (dctx->base); - const BYTE* const vBase = (const BYTE*) (dctx->vBase); - const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); - int nbSeq; - - /* Build Decoding Tables */ - { size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, seqSize); - if (ZSTD_isError(seqHSize)) return seqHSize; - ip += seqHSize; - } - - /* Regen sequences */ - if (nbSeq) { -#define STORED_SEQS 4 -#define STOSEQ_MASK (STORED_SEQS-1) -#define ADVANCED_SEQS 4 - seq_t sequences[STORED_SEQS]; - int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); - seqState_t seqState; - int seqNb; - dctx->fseEntropy = 1; - { U32 i; for (i=0; irep[i]; } - seqState.base = base; - seqState.pos = (size_t)(op-base); - seqState.gotoDict = (iPtrDiff)(dictEnd - base); - CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected); - FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); - FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); - FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); - - /* prepare in advance */ - for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && seqNbrep[i] = (U32)(seqState.prevOffset[i]); } - } - - /* last literal segment */ - { size_t const lastLLSize = litEnd - litPtr; - if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall); - memcpy(op, litPtr, lastLLSize); - op += lastLLSize; - } - - return op-ostart; -} static seq_t ZSTD_decodeSequence(seqState_t* seqState) @@ -1305,16 +1061,250 @@ static size_t ZSTD_decompressSequences( } -static void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst) +static seq_t ZSTD_decodeSequenceLong(seqState_t* seqState) { - if (dst != dctx->previousDstEnd) { /* not contiguous */ - dctx->dictEnd = dctx->previousDstEnd; - dctx->vBase = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base)); - dctx->base = dst; - dctx->previousDstEnd = dst; + seq_t seq; + + U32 const llCode = FSE_peekSymbol(&seqState->stateLL); + U32 const mlCode = FSE_peekSymbol(&seqState->stateML); + U32 const ofCode = FSE_peekSymbol(&seqState->stateOffb); /* <= maxOff, by table construction */ + + U32 const llBits = LL_bits[llCode]; + U32 const mlBits = ML_bits[mlCode]; + U32 const ofBits = ofCode; + U32 const totalBits = llBits+mlBits+ofBits; + + static const U32 LL_base[MaxLL+1] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 18, 20, 22, 24, 28, 32, 40, 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, + 0x2000, 0x4000, 0x8000, 0x10000 }; + + static const U32 ML_base[MaxML+1] = { + 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, + 35, 37, 39, 41, 43, 47, 51, 59, 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, + 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 }; + + static const U32 OF_base[MaxOff+1] = { + 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, + 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, + 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, + 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD }; + + /* sequence */ + { size_t offset; + if (!ofCode) + offset = 0; + else { + offset = OF_base[ofCode] + BIT_readBits(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); + } + + if (ofCode <= 1) { + offset += (llCode==0); + if (offset) { + size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; + temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; + } else { + offset = seqState->prevOffset[0]; + } + } else { + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; + } + seq.offset = offset; } + + seq.matchLength = ML_base[mlCode] + ((mlCode>31) ? BIT_readBits(&seqState->DStream, mlBits) : 0); /* <= 16 bits */ + if (MEM_32bits() && (mlBits+llBits>24)) BIT_reloadDStream(&seqState->DStream); + + seq.litLength = LL_base[llCode] + ((llCode>15) ? BIT_readBits(&seqState->DStream, llBits) : 0); /* <= 16 bits */ + if (MEM_32bits() || + (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BIT_reloadDStream(&seqState->DStream); + + { size_t const pos = seqState->pos + seq.litLength; + seq.match = seqState->base + pos - seq.offset; /* single memory segment */ + if (seq.offset > pos) seq.match += seqState->gotoDict; /* separate memory segment */ + seqState->pos = pos + seq.matchLength; + } + + /* ANS state update */ + FSE_updateState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ + FSE_updateState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + FSE_updateState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ + + return seq; } +FORCE_INLINE +size_t ZSTD_execSequenceLong(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const base, const BYTE* const vBase, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ + BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = sequence.match; + + /* check */ +#if 1 + if (oMatchEnd>oend) return ERROR(dstSize_tooSmall); /* last match must start at a minimum distance of WILDCOPY_OVERLENGTH from oend */ + if (iLitEnd > litLimit) return ERROR(corruption_detected); /* over-read beyond lit buffer */ + if (oLitEnd>oend_w) return ZSTD_execSequenceLast7(op, oend, sequence, litPtr, litLimit, base, vBase, dictEnd); +#endif + + /* copy Literals */ + ZSTD_copy8(op, *litPtr); + if (sequence.litLength > 8) + ZSTD_wildcopy(op+8, (*litPtr)+8, sequence.litLength - 8); /* note : since oLitEnd <= oend-WILDCOPY_OVERLENGTH, no risk of overwrite beyond oend */ + op = oLitEnd; + *litPtr = iLitEnd; /* update for next sequence */ + + /* copy Match */ +#if 1 + if (sequence.offset > (size_t)(oLitEnd - base)) { + /* offset beyond prefix */ + if (sequence.offset > (size_t)(oLitEnd - vBase)) return ERROR(corruption_detected); + if (match + sequence.matchLength <= dictEnd) { + memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = base; + if (op > oend_w) { + U32 i; + for (i = 0; i < sequence.matchLength; ++i) op[i] = match[i]; + return sequenceLength; + } + } } + /* Requirement: op <= oend_w */ +#endif + + /* match within prefix */ + if (sequence.offset < 8) { + /* close range match, overlap */ + static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */ + static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* substracted */ + int const sub2 = dec64table[sequence.offset]; + op[0] = match[0]; + op[1] = match[1]; + op[2] = match[2]; + op[3] = match[3]; + match += dec32table[sequence.offset]; + ZSTD_copy4(op+4, match); + match -= sub2; + } else { + ZSTD_copy8(op, match); + } + op += 8; match += 8; + + if (oMatchEnd > oend-(16-MINMATCH)) { + if (op < oend_w) { + ZSTD_wildcopy(op, match, oend_w - op); + match += oend_w - op; + op = oend_w; + } + while (op < oMatchEnd) *op++ = *match++; + } else { + ZSTD_wildcopy(op, match, sequence.matchLength-8); /* works even if matchLength < 8 */ + } + return sequenceLength; +} + +static size_t ZSTD_decompressSequencesLong( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize) +{ + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE* const)dst; + BYTE* const oend = ostart + maxDstSize; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const base = (const BYTE*) (dctx->base); + const BYTE* const vBase = (const BYTE*) (dctx->vBase); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); + int nbSeq; + + /* Build Decoding Tables */ + { size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, seqSize); + if (ZSTD_isError(seqHSize)) return seqHSize; + ip += seqHSize; + } + + /* Regen sequences */ + if (nbSeq) { +#define STORED_SEQS 4 +#define STOSEQ_MASK (STORED_SEQS-1) +#define ADVANCED_SEQS 4 + seq_t sequences[STORED_SEQS]; + int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); + seqState_t seqState; + int seqNb; + dctx->fseEntropy = 1; + { U32 i; for (i=0; irep[i]; } + seqState.base = base; + seqState.pos = (size_t)(op-base); + seqState.gotoDict = (iPtrDiff)(dictEnd - base); + CHECK_E(BIT_initDStream(&seqState.DStream, ip, iend-ip), corruption_detected); + FSE_initDState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + FSE_initDState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + FSE_initDState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + /* prepare in advance */ + for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && seqNbrep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ + { size_t const lastLLSize = litEnd - litPtr; + if (lastLLSize > (size_t)(oend-op)) return ERROR(dstSize_tooSmall); + memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } + + return op-ostart; +} + + static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) @@ -1329,12 +1319,21 @@ static size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, ip += litCSize; srcSize -= litCSize; } - if (dctx->fParams.windowSize > 23) - return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize); + if (dctx->fParams.windowSize > (1<<23)) return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize); return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize); } +static void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst) +{ + if (dst != dctx->previousDstEnd) { /* not contiguous */ + dctx->dictEnd = dctx->previousDstEnd; + dctx->vBase = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->base)); + dctx->base = dst; + dctx->previousDstEnd = dst; + } +} + size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) From 25f46dcc0fa4b3336d65dbf636824dc76f5278bf Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 29 Nov 2016 16:59:27 -0800 Subject: [PATCH 11/13] minor const --- lib/compress/zstd_compress.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index f7286ae8f..b2aabb94e 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -1482,8 +1482,9 @@ static U32 ZSTD_insertBt1(ZSTD_CCtx* zc, const BYTE* const ip, const U32 mls, co hashTable[h] = current; /* Update Hash Table */ while (nbCompares-- && (matchIndex > windowLow)) { - U32* nextPtr = bt + 2*(matchIndex & btMask); + U32* const nextPtr = bt + 2*(matchIndex & btMask); size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + #ifdef ZSTD_C_PREDICT /* note : can create issues when hlog small <= 11 */ const U32* predictPtr = bt + 2*((matchIndex-1) & btMask); /* written this way, as bt is a roll buffer */ if (matchIndex == predictedSmall) { @@ -1579,7 +1580,7 @@ static size_t ZSTD_insertBtAndFindBestMatch ( hashTable[h] = current; /* Update Hash Table */ while (nbCompares-- && (matchIndex > windowLow)) { - U32* nextPtr = bt + 2*(matchIndex & btMask); + U32* const nextPtr = bt + 2*(matchIndex & btMask); size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ const BYTE* match; From b766c8029c503e48d208922042a58f083ad07bc1 Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 29 Nov 2016 17:11:01 -0800 Subject: [PATCH 12/13] update NEWS --- NEWS | 1 + 1 file changed, 1 insertion(+) diff --git a/NEWS b/NEWS index e6fb2be6c..28ea1c85d 100644 --- a/NEWS +++ b/NEWS @@ -1,4 +1,5 @@ v1.1.2 +Improved : better decompression speed at high compression settings cli : new : preserve file attributes, by Przemyslaw Skibinski cli : fixed : status displays total amount decoded when stream/file consists of multiple appended frames (like pzstd) API : changed : zbuff prototypes now generate deprecation warnings From ff504de39127f1d738bafb3de9d033193470de3a Mon Sep 17 00:00:00 2001 From: Yann Collet Date: Tue, 29 Nov 2016 17:42:46 -0800 Subject: [PATCH 13/13] minor decompression speed improvement --- lib/decompress/zstd_decompress.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/lib/decompress/zstd_decompress.c b/lib/decompress/zstd_decompress.c index 3e9be3f6a..3106fa014 100644 --- a/lib/decompress/zstd_decompress.c +++ b/lib/decompress/zstd_decompress.c @@ -873,17 +873,17 @@ static seq_t ZSTD_decodeSequence(seqState_t* seqState) 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 }; static const U32 OF_base[MaxOff+1] = { - 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, - 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, - 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, - 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD }; + 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, + 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, + 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, + 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD }; /* sequence */ { size_t offset; if (!ofCode) offset = 0; else { - offset = OF_base[ofCode] + BIT_readBits(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); } @@ -906,10 +906,10 @@ static seq_t ZSTD_decodeSequence(seqState_t* seqState) seq.offset = offset; } - seq.matchLength = ML_base[mlCode] + ((mlCode>31) ? BIT_readBits(&seqState->DStream, mlBits) : 0); /* <= 16 bits */ + seq.matchLength = ML_base[mlCode] + ((mlCode>31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */ if (MEM_32bits() && (mlBits+llBits>24)) BIT_reloadDStream(&seqState->DStream); - seq.litLength = LL_base[llCode] + ((llCode>15) ? BIT_readBits(&seqState->DStream, llBits) : 0); /* <= 16 bits */ + seq.litLength = LL_base[llCode] + ((llCode>15) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */ if (MEM_32bits() || (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BIT_reloadDStream(&seqState->DStream); @@ -1086,17 +1086,17 @@ static seq_t ZSTD_decodeSequenceLong(seqState_t* seqState) 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 }; static const U32 OF_base[MaxOff+1] = { - 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, - 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, - 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, - 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD }; + 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, + 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, + 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, + 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD }; /* sequence */ { size_t offset; if (!ofCode) offset = 0; else { - offset = OF_base[ofCode] + BIT_readBits(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + offset = OF_base[ofCode] + BIT_readBitsFast(&seqState->DStream, ofBits); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); } @@ -1119,10 +1119,10 @@ static seq_t ZSTD_decodeSequenceLong(seqState_t* seqState) seq.offset = offset; } - seq.matchLength = ML_base[mlCode] + ((mlCode>31) ? BIT_readBits(&seqState->DStream, mlBits) : 0); /* <= 16 bits */ + seq.matchLength = ML_base[mlCode] + ((mlCode>31) ? BIT_readBitsFast(&seqState->DStream, mlBits) : 0); /* <= 16 bits */ if (MEM_32bits() && (mlBits+llBits>24)) BIT_reloadDStream(&seqState->DStream); - seq.litLength = LL_base[llCode] + ((llCode>15) ? BIT_readBits(&seqState->DStream, llBits) : 0); /* <= 16 bits */ + seq.litLength = LL_base[llCode] + ((llCode>15) ? BIT_readBitsFast(&seqState->DStream, llBits) : 0); /* <= 16 bits */ if (MEM_32bits() || (totalBits > 64 - 7 - (LLFSELog+MLFSELog+OffFSELog)) ) BIT_reloadDStream(&seqState->DStream);