Improves decompression speed when using a cold dictionary,
by triggering the prefetching decoder path
(which used to be dedicated to long-range offsets only).

Figures on my laptop :
no content prefetch : ~300 MB/s (for reference)
full content prefetch : ~325 MB/s (before this patch)
new prefetch path : ~375 MB/s (after this patch)

The benchmark speed gain is already significant,
but another side effect is that this version
prefetches less data into memory,
since it only prefetches what's needed, instead of the full dictionary.
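For illustration, the scenario that benefits looks like the sketch below : many pre-digested dictionaries (ZSTD_DDict) in rotation, so each dictionary's content is likely cold in cache by the time it is used again. Function and variable names are placeholders added for this sketch; only the zstd API calls are real :

    #include <zstd.h>   /* ZSTD_DCtx, ZSTD_DDict, ZSTD_decompress_usingDDict, ZSTD_isError */

    /* Decompress a batch of frames, each tied to its own pre-digested dictionary.
     * With many dictionaries in rotation, a DDict's content is usually evicted
     * from CPU cache ("cold") by the time its turn comes again :
     * that is the case targeted by the new prefetch path. */
    static void decompress_batch(ZSTD_DCtx* dctx,
                                 void* const* dstPtrs, const size_t* dstCapacities,
                                 const void* const* srcPtrs, const size_t* srcSizes,
                                 const ZSTD_DDict* const* ddicts,
                                 size_t nbFrames)
    {
        size_t n;
        for (n = 0; n < nbFrames; n++) {
            size_t const r = ZSTD_decompress_usingDDict(dctx,
                                    dstPtrs[n], dstCapacities[n],
                                    srcPtrs[n], srcSizes[n],
                                    ddicts[n]);
            if (ZSTD_isError(r)) break;   /* error handling elided in this sketch */
        }
    }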

This is expected to help highly active environments,
such as busy databases,
an effect that can't be properly measured in a benchmark environment (too clean).

Also :
fixed the largeNbDicts test program,
which was working improperly when setting nbBlocks > nbFiles.
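To see why this mattered : when the benchmark replicates the loaded files to fill the requested number of blocks, the compression ratio must be computed against the total size of the slices actually compressed, not against the original file size. A toy illustration (all numbers made up for this example) :

    #include <stdio.h>

    int main(void)
    {
        size_t const originalSrcSize    = 100000;   /* bytes loaded from the input files */
        size_t const totalSrcSlicesSize = 400000;   /* same content replicated into 4x more blocks */
        size_t const cTotalSize         = 200000;   /* compressed size of all blocks */

        /* previous computation : reports 0.50, as if the data had expanded */
        printf("old ratio : %.2f \n", (double)originalSrcSize / cTotalSize);

        /* corrected computation : 2.00, the ratio over what was actually compressed */
        printf("new ratio : %.2f \n", (double)totalSrcSlicesSize / cTotalSize);
        return 0;
    }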
Yann Collet 2018-11-08 17:00:23 -08:00
parent 20fb9e7f36
commit 483759a3de
5 changed files with 21 additions and 25 deletions


@@ -33,7 +33,7 @@ largeNbDicts: util.o bench.o datagen.o xxhash.o largeNbDicts.c $(LIBZSTD)
 .PHONY: $(LIBZSTD)
 $(LIBZSTD):
-	$(MAKE) -C $(LIBDIR) libzstd.a
+	$(MAKE) -C $(LIBDIR) libzstd.a CFLAGS="$(CFLAGS)"
 bench.o : $(PROGDIR)/bench.c
 	$(CC) $(CPPFLAGS) $(CFLAGS) $^ -c
@@ -50,4 +50,5 @@ xxhash.o : $(LIBDIR)/common/xxhash.c
 clean:
 	$(RM) *.o
+	$(MAKE) -C $(LIBDIR) clean > /dev/null
 	$(RM) largeNbDicts


@@ -49,6 +49,7 @@
 /*--- Macros ---*/
 #define CONTROL(c) { if (!(c)) abort(); }
 #undef MIN
 #define MIN(a,b) ((a) < (b) ? (a) : (b))
@@ -594,6 +595,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
     if (blockSize)
         DISPLAYLEVEL(3, "of max size %u bytes ", (unsigned)blockSize);
     DISPLAYLEVEL(3, "\n");
+    size_t const totalSrcSlicesSize = sliceCollection_totalCapacity(srcSlices);
     size_t* const dstCapacities = malloc(nbBlocks * sizeof(*dstCapacities));
@@ -625,8 +627,8 @@ int bench(const char** fileNameTable, unsigned nbFiles,
     /* dictionary determination */
     buffer_t const dictBuffer = createDictionaryBuffer(dictionary,
-                                    srcBuffer.ptr,
-                                    srcSlices.capacities, nbBlocks,
+                                    srcs.buffer.ptr,
+                                    srcs.slices.capacities, srcs.slices.nbSlices,
                                     DICTSIZE);
     CONTROL(dictBuffer.ptr != NULL);
@@ -637,7 +639,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
     CONTROL(cTotalSizeNoDict != 0);
     DISPLAYLEVEL(3, "compressing at level %u without dictionary : Ratio=%.2f (%u bytes) \n",
                     clevel,
-                    (double)srcSize / cTotalSizeNoDict, (unsigned)cTotalSizeNoDict);
+                    (double)totalSrcSlicesSize / cTotalSizeNoDict, (unsigned)cTotalSizeNoDict);
     size_t* const cSizes = malloc(nbBlocks * sizeof(size_t));
     CONTROL(cSizes != NULL);
@@ -646,7 +648,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
     CONTROL(cTotalSize != 0);
     DISPLAYLEVEL(3, "compressed using a %u bytes dictionary : Ratio=%.2f (%u bytes) \n",
                     (unsigned)dictBuffer.size,
-                    (double)srcSize / cTotalSize, (unsigned)cTotalSize);
+                    (double)totalSrcSlicesSize / cTotalSize, (unsigned)cTotalSize);
     /* now dstSlices contain the real compressed size of each block, instead of the maximum capacity */
     shrinkSizes(dstSlices, cSizes);


@@ -15,7 +15,6 @@
 * Dependencies
 *********************************************************/
 #include <string.h> /* memcpy, memmove, memset */
-#include "compiler.h" /* prefetch */
 #include "cpu.h" /* bmi2 */
 #include "mem.h" /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY


@@ -56,7 +56,6 @@
 * Dependencies
 *********************************************************/
 #include <string.h> /* memcpy, memmove, memset */
-#include "compiler.h" /* prefetch */
 #include "cpu.h" /* bmi2 */
 #include "mem.h" /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY


@@ -507,16 +507,6 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
         }
     }
-    /* prefetch dictionary content */
-    if (dctx->ddictIsCold) {
-        size_t const dictSize = (const char*)dctx->prefixStart - (const char*)dctx->virtualStart;
-        size_t const psmin = MIN(dictSize, (size_t)(64*nbSeq) /* heuristic */ );
-        size_t const pSize = MIN(psmin, 128 KB /* protection */ );
-        const void* const pStart = (const char*)dctx->dictEnd - pSize;
-        PREFETCH_AREA(pStart, pSize);
-        dctx->ddictIsCold = 0;
-    }
-
     return ip-istart;
 }
@@ -1046,6 +1036,7 @@ ZSTD_decompressSequencesLong_body(
         /* prepare in advance */
         for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
             sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
-            PREFETCH_L1(sequences[seqNb].match);
+            PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
         }
         if (seqNb<seqAdvance) return ERROR(corruption_detected);
@@ -1070,9 +1061,6 @@ ZSTD_decompressSequencesLong_body(
         /* save reps for next block */
         {   U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
-#undef STORED_SEQS
-#undef STORED_SEQS_MASK
-#undef ADVANCED_SEQS
     }
     /* last literal segment */
@@ -1213,20 +1201,27 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
     }
     /* Build Decoding Tables */
-    {   int nbSeq;
+    {   int usePrefetchDecoder = dctx->ddictIsCold;
+        int nbSeq;
         size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
         if (ZSTD_isError(seqHSize)) return seqHSize;
         ip += seqHSize;
         srcSize -= seqHSize;
-        if ( (!frame || (dctx->fParams.windowSize > (1<<24)))
-          && (nbSeq>0) ) {  /* could probably use a larger nbSeq limit */
+        if ( !usePrefetchDecoder
+          && (!frame || (dctx->fParams.windowSize > (1<<24)))
+          && (nbSeq>ADVANCED_SEQS) ) {  /* could probably use a larger nbSeq limit */
             U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
             U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
-            if (shareLongOffsets >= minShare)
-                return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+            usePrefetchDecoder = (shareLongOffsets >= minShare);
         }
+        dctx->ddictIsCold = 0;
+        if (usePrefetchDecoder)
+            return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+        /* else */
         return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
     }
 }
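For readers who only want the resulting control flow, the decoder selection now amounts to the condensed sketch below. The struct, the ADVANCED_SEQS value and the omission of the frame/window-size gate are simplifications for illustration, not the library's exact definitions :

    /* Condensed sketch of the decoder selection introduced by this patch. */
    #define ADVANCED_SEQS_SKETCH 4   /* placeholder for the long decoder's look-ahead depth */

    typedef struct {
        int ddictIsCold;   /* a dictionary is referenced but likely evicted from CPU cache */
        int is64bit;       /* stand-in for MEM_64bits() */
    } dctx_sketch;

    /* Returns non-zero when the prefetching (long) decoder should be used.
     * shareLongOffsets is a share out of 256, as returned by ZSTD_getLongOffsetsShare()
     * (thresholds 7 and 20 correspond to ~2.73% and ~7.81%). */
    static int usePrefetchDecoder_sketch(const dctx_sketch* dctx, int nbSeq, unsigned shareLongOffsets)
    {
        if (dctx->ddictIsCold) return 1;              /* cold dictionary : prefetch match content on the fly */
        if (nbSeq <= ADVANCED_SEQS_SKETCH) return 0;  /* too few sequences to amortize the look-ahead */
        {   unsigned const minShare = dctx->is64bit ? 7 : 20;   /* heuristic thresholds from the patch */
            return shareLongOffsets >= minShare;
        }
    }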