mirror of
https://github.com/facebook/zstd.git
synced 2025-11-22 00:10:22 -05:00
Improves decompression speed when using a cold dictionary
by triggering the prefetching decoder path (which used to be dedicated to long-range offsets only). Figures on my laptop: no content prefetch: ~300 MB/s (for reference); full content prefetch: ~325 MB/s (before this patch); new prefetch path: ~375 MB/s (after this patch). The benchmark speed gain is already significant, but another side effect is that this version prefetches less data into memory, since it only prefetches what's needed instead of the full dictionary. This is supposed to help highly active environments, such as active databases, an effect that can't be properly measured in a benchmark environment (too clean). Also: fixed the largeNbDicts test program, which was working improperly when setting nbBlocks > nbFiles.
This commit is contained in:
parent
20fb9e7f36
commit
483759a3de
@ -33,7 +33,7 @@ largeNbDicts: util.o bench.o datagen.o xxhash.o largeNbDicts.c $(LIBZSTD)
|
|||||||
|
|
||||||
.PHONY: $(LIBZSTD)
|
.PHONY: $(LIBZSTD)
|
||||||
$(LIBZSTD):
|
$(LIBZSTD):
|
||||||
$(MAKE) -C $(LIBDIR) libzstd.a
|
$(MAKE) -C $(LIBDIR) libzstd.a CFLAGS="$(CFLAGS)"
|
||||||
|
|
||||||
bench.o : $(PROGDIR)/bench.c
|
bench.o : $(PROGDIR)/bench.c
|
||||||
$(CC) $(CPPFLAGS) $(CFLAGS) $^ -c
|
$(CC) $(CPPFLAGS) $(CFLAGS) $^ -c
|
||||||
@ -50,4 +50,5 @@ xxhash.o : $(LIBDIR)/common/xxhash.c
|
|||||||
|
|
||||||
clean:
|
clean:
|
||||||
$(RM) *.o
|
$(RM) *.o
|
||||||
|
$(MAKE) -C $(LIBDIR) clean > /dev/null
|
||||||
$(RM) largeNbDicts
|
$(RM) largeNbDicts
|
||||||
|
|||||||
@ -49,6 +49,7 @@
|
|||||||
|
|
||||||
|
|
||||||
/*--- Macros ---*/
|
/*--- Macros ---*/
|
||||||
|
|
||||||
#define CONTROL(c) { if (!(c)) abort(); }
|
#define CONTROL(c) { if (!(c)) abort(); }
|
||||||
#undef MIN
|
#undef MIN
|
||||||
#define MIN(a,b) ((a) < (b) ? (a) : (b))
|
#define MIN(a,b) ((a) < (b) ? (a) : (b))
|
||||||
@ -594,6 +595,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
|
|||||||
if (blockSize)
|
if (blockSize)
|
||||||
DISPLAYLEVEL(3, "of max size %u bytes ", (unsigned)blockSize);
|
DISPLAYLEVEL(3, "of max size %u bytes ", (unsigned)blockSize);
|
||||||
DISPLAYLEVEL(3, "\n");
|
DISPLAYLEVEL(3, "\n");
|
||||||
|
size_t const totalSrcSlicesSize = sliceCollection_totalCapacity(srcSlices);
|
||||||
|
|
||||||
|
|
||||||
size_t* const dstCapacities = malloc(nbBlocks * sizeof(*dstCapacities));
|
size_t* const dstCapacities = malloc(nbBlocks * sizeof(*dstCapacities));
|
||||||
@ -625,8 +627,8 @@ int bench(const char** fileNameTable, unsigned nbFiles,
|
|||||||
|
|
||||||
/* dictionary determination */
|
/* dictionary determination */
|
||||||
buffer_t const dictBuffer = createDictionaryBuffer(dictionary,
|
buffer_t const dictBuffer = createDictionaryBuffer(dictionary,
|
||||||
srcBuffer.ptr,
|
srcs.buffer.ptr,
|
||||||
srcSlices.capacities, nbBlocks,
|
srcs.slices.capacities, srcs.slices.nbSlices,
|
||||||
DICTSIZE);
|
DICTSIZE);
|
||||||
CONTROL(dictBuffer.ptr != NULL);
|
CONTROL(dictBuffer.ptr != NULL);
|
||||||
|
|
||||||
@ -637,7 +639,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
|
|||||||
CONTROL(cTotalSizeNoDict != 0);
|
CONTROL(cTotalSizeNoDict != 0);
|
||||||
DISPLAYLEVEL(3, "compressing at level %u without dictionary : Ratio=%.2f (%u bytes) \n",
|
DISPLAYLEVEL(3, "compressing at level %u without dictionary : Ratio=%.2f (%u bytes) \n",
|
||||||
clevel,
|
clevel,
|
||||||
(double)srcSize / cTotalSizeNoDict, (unsigned)cTotalSizeNoDict);
|
(double)totalSrcSlicesSize / cTotalSizeNoDict, (unsigned)cTotalSizeNoDict);
|
||||||
|
|
||||||
size_t* const cSizes = malloc(nbBlocks * sizeof(size_t));
|
size_t* const cSizes = malloc(nbBlocks * sizeof(size_t));
|
||||||
CONTROL(cSizes != NULL);
|
CONTROL(cSizes != NULL);
|
||||||
@ -646,7 +648,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
|
|||||||
CONTROL(cTotalSize != 0);
|
CONTROL(cTotalSize != 0);
|
||||||
DISPLAYLEVEL(3, "compressed using a %u bytes dictionary : Ratio=%.2f (%u bytes) \n",
|
DISPLAYLEVEL(3, "compressed using a %u bytes dictionary : Ratio=%.2f (%u bytes) \n",
|
||||||
(unsigned)dictBuffer.size,
|
(unsigned)dictBuffer.size,
|
||||||
(double)srcSize / cTotalSize, (unsigned)cTotalSize);
|
(double)totalSrcSlicesSize / cTotalSize, (unsigned)cTotalSize);
|
||||||
|
|
||||||
/* now dstSlices contain the real compressed size of each block, instead of the maximum capacity */
|
/* now dstSlices contain the real compressed size of each block, instead of the maximum capacity */
|
||||||
shrinkSizes(dstSlices, cSizes);
|
shrinkSizes(dstSlices, cSizes);
|
||||||
|
|||||||
@ -15,7 +15,6 @@
|
|||||||
* Dependencies
|
* Dependencies
|
||||||
*********************************************************/
|
*********************************************************/
|
||||||
#include <string.h> /* memcpy, memmove, memset */
|
#include <string.h> /* memcpy, memmove, memset */
|
||||||
#include "compiler.h" /* prefetch */
|
|
||||||
#include "cpu.h" /* bmi2 */
|
#include "cpu.h" /* bmi2 */
|
||||||
#include "mem.h" /* low level memory routines */
|
#include "mem.h" /* low level memory routines */
|
||||||
#define FSE_STATIC_LINKING_ONLY
|
#define FSE_STATIC_LINKING_ONLY
|
||||||
|
|||||||
@ -56,7 +56,6 @@
|
|||||||
* Dependencies
|
* Dependencies
|
||||||
*********************************************************/
|
*********************************************************/
|
||||||
#include <string.h> /* memcpy, memmove, memset */
|
#include <string.h> /* memcpy, memmove, memset */
|
||||||
#include "compiler.h" /* prefetch */
|
|
||||||
#include "cpu.h" /* bmi2 */
|
#include "cpu.h" /* bmi2 */
|
||||||
#include "mem.h" /* low level memory routines */
|
#include "mem.h" /* low level memory routines */
|
||||||
#define FSE_STATIC_LINKING_ONLY
|
#define FSE_STATIC_LINKING_ONLY
|
||||||
|
|||||||
@ -507,16 +507,6 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* prefetch dictionary content */
|
|
||||||
if (dctx->ddictIsCold) {
|
|
||||||
size_t const dictSize = (const char*)dctx->prefixStart - (const char*)dctx->virtualStart;
|
|
||||||
size_t const psmin = MIN(dictSize, (size_t)(64*nbSeq) /* heuristic */ );
|
|
||||||
size_t const pSize = MIN(psmin, 128 KB /* protection */ );
|
|
||||||
const void* const pStart = (const char*)dctx->dictEnd - pSize;
|
|
||||||
PREFETCH_AREA(pStart, pSize);
|
|
||||||
dctx->ddictIsCold = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
return ip-istart;
|
return ip-istart;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -1046,6 +1036,7 @@ ZSTD_decompressSequencesLong_body(
|
|||||||
/* prepare in advance */
|
/* prepare in advance */
|
||||||
for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
|
for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
|
||||||
sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
|
sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
|
||||||
|
PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
|
||||||
}
|
}
|
||||||
if (seqNb<seqAdvance) return ERROR(corruption_detected);
|
if (seqNb<seqAdvance) return ERROR(corruption_detected);
|
||||||
|
|
||||||
@ -1070,9 +1061,6 @@ ZSTD_decompressSequencesLong_body(
|
|||||||
|
|
||||||
/* save reps for next block */
|
/* save reps for next block */
|
||||||
{ U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
|
{ U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
|
||||||
#undef STORED_SEQS
|
|
||||||
#undef STORED_SEQS_MASK
|
|
||||||
#undef ADVANCED_SEQS
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* last literal segment */
|
/* last literal segment */
|
||||||
@ -1213,20 +1201,27 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* Build Decoding Tables */
|
/* Build Decoding Tables */
|
||||||
{ int nbSeq;
|
{ int usePrefetchDecoder = dctx->ddictIsCold;
|
||||||
|
int nbSeq;
|
||||||
size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
|
size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
|
||||||
if (ZSTD_isError(seqHSize)) return seqHSize;
|
if (ZSTD_isError(seqHSize)) return seqHSize;
|
||||||
ip += seqHSize;
|
ip += seqHSize;
|
||||||
srcSize -= seqHSize;
|
srcSize -= seqHSize;
|
||||||
|
|
||||||
if ( (!frame || (dctx->fParams.windowSize > (1<<24)))
|
if ( !usePrefetchDecoder
|
||||||
&& (nbSeq>0) ) { /* could probably use a larger nbSeq limit */
|
&& (!frame || (dctx->fParams.windowSize > (1<<24)))
|
||||||
|
&& (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */
|
||||||
U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
|
U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
|
||||||
U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
|
U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
|
||||||
if (shareLongOffsets >= minShare)
|
usePrefetchDecoder = (shareLongOffsets >= minShare);
|
||||||
return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
dctx->ddictIsCold = 0;
|
||||||
|
|
||||||
|
if (usePrefetchDecoder)
|
||||||
|
return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
|
||||||
|
|
||||||
|
/* else */
|
||||||
return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
|
return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user