mirror of
https://github.com/facebook/zstd.git
synced 2025-11-22 00:10:22 -05:00
Improves decompression speed when using a cold dictionary

by triggering the prefetching decoder path
(which used to be dedicated to long-range offsets only).

Figures on my laptop:
  no content prefetch   : ~300 MB/s (for reference)
  full content prefetch : ~325 MB/s (before this patch)
  new prefetch path     : ~375 MB/s (after this patch)

The benchmark speedup is already significant, but another side effect is
that this version prefetches less data into memory, since it prefetches
only what is needed instead of the full dictionary. This should help
highly active environments such as busy databases, which can't be
properly measured in a benchmark setting (too clean).

Also: fixed the largeNbDicts test program, which behaved improperly
when setting nbBlocks > nbFiles.
parent 20fb9e7f36
commit 483759a3de
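
For context, here is roughly how the improved path gets exercised from the public API. This is a minimal sketch, not part of the patch: the buffers and sizes are placeholders, while ZSTD_createDDict, ZSTD_createDCtx, ZSTD_decompress_usingDDict, ZSTD_freeDCtx and ZSTD_freeDDict are the real zstd API. The first decompression after referencing a dictionary typically finds the DDict content cold in cache, which is exactly the case this patch accelerates.

#include <stddef.h>
#include <zstd.h>

/* Decompress one frame against a shared dictionary.
 * On first use, the DDict content is cold in CPU cache;
 * this patch routes such blocks through the prefetching decoder. */
static size_t decompress_with_ddict(void* dst, size_t dstCapacity,
                                    const void* src, size_t srcSize,
                                    const void* dictBuf, size_t dictSize)
{
    ZSTD_DDict* const ddict = ZSTD_createDDict(dictBuf, dictSize);
    ZSTD_DCtx*  const dctx  = ZSTD_createDCtx();
    size_t const dSize = ZSTD_decompress_usingDDict(dctx, dst, dstCapacity,
                                                    src, srcSize, ddict);
    ZSTD_freeDCtx(dctx);
    ZSTD_freeDDict(ddict);
    return dSize;   /* callers should check ZSTD_isError(dSize) */
}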
@@ -33,7 +33,7 @@ largeNbDicts: util.o bench.o datagen.o xxhash.o largeNbDicts.c $(LIBZSTD)
 
 .PHONY: $(LIBZSTD)
 $(LIBZSTD):
-	$(MAKE) -C $(LIBDIR) libzstd.a
+	$(MAKE) -C $(LIBDIR) libzstd.a CFLAGS="$(CFLAGS)"
 
 bench.o : $(PROGDIR)/bench.c
 	$(CC) $(CPPFLAGS) $(CFLAGS) $^ -c
@@ -50,4 +50,5 @@ xxhash.o : $(LIBDIR)/common/xxhash.c
 
 clean:
 	$(RM) *.o
+	$(MAKE) -C $(LIBDIR) clean > /dev/null
 	$(RM) largeNbDicts
@@ -49,6 +49,7 @@
 
 
 /*---  Macros  ---*/
 #define CONTROL(c)   { if (!(c)) abort(); }
+#undef MIN
 #define MIN(a,b)   ((a) < (b) ? (a) : (b))
@@ -594,6 +595,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
     if (blockSize)
         DISPLAYLEVEL(3, "of max size %u bytes ", (unsigned)blockSize);
     DISPLAYLEVEL(3, "\n");
+    size_t const totalSrcSlicesSize = sliceCollection_totalCapacity(srcSlices);
 
 
     size_t* const dstCapacities = malloc(nbBlocks * sizeof(*dstCapacities));
@@ -625,8 +627,8 @@ int bench(const char** fileNameTable, unsigned nbFiles,
 
     /* dictionary determination */
     buffer_t const dictBuffer = createDictionaryBuffer(dictionary,
-                                    srcBuffer.ptr,
-                                    srcSlices.capacities, nbBlocks,
+                                    srcs.buffer.ptr,
+                                    srcs.slices.capacities, srcs.slices.nbSlices,
                                     DICTSIZE);
     CONTROL(dictBuffer.ptr != NULL);
 
@@ -637,7 +639,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
     CONTROL(cTotalSizeNoDict != 0);
     DISPLAYLEVEL(3, "compressing at level %u without dictionary : Ratio=%.2f (%u bytes) \n",
                     clevel,
-                    (double)srcSize / cTotalSizeNoDict, (unsigned)cTotalSizeNoDict);
+                    (double)totalSrcSlicesSize / cTotalSizeNoDict, (unsigned)cTotalSizeNoDict);
 
     size_t* const cSizes = malloc(nbBlocks * sizeof(size_t));
     CONTROL(cSizes != NULL);
@@ -646,7 +648,7 @@ int bench(const char** fileNameTable, unsigned nbFiles,
     CONTROL(cTotalSize != 0);
     DISPLAYLEVEL(3, "compressed using a %u bytes dictionary : Ratio=%.2f (%u bytes) \n",
                     (unsigned)dictBuffer.size,
-                    (double)srcSize / cTotalSize, (unsigned)cTotalSize);
+                    (double)totalSrcSlicesSize / cTotalSize, (unsigned)cTotalSize);
 
     /* now dstSlices contain the real compressed size of each block, instead of the maximum capacity */
     shrinkSizes(dstSlices, cSizes);
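
The four bench hunks above are the largeNbDicts fix mentioned in the commit message: ratios are now computed against totalSrcSlicesSize, the total capacity of the slices actually compressed, instead of srcSize, and createDictionaryBuffer now receives the real file slices (srcs.slices.capacities, srcs.slices.nbSlices) rather than assuming the slice count equals nbBlocks. Illustrative numbers: if the loaded files hold 5,000,000 bytes but the benchmark operates on 8,000,000 bytes of slices, a 2,000,000-byte compressed total should print Ratio = 8000000 / 2000000 = 4.00, where the old code would have printed 5000000 / 2000000 = 2.50.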
@@ -15,7 +15,6 @@
 *  Dependencies
 *********************************************************/
 #include <string.h>      /* memcpy, memmove, memset */
-#include "compiler.h"    /* prefetch */
 #include "cpu.h"         /* bmi2 */
 #include "mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
@@ -56,7 +56,6 @@
 *  Dependencies
 *********************************************************/
 #include <string.h>      /* memcpy, memmove, memset */
-#include "compiler.h"    /* prefetch */
 #include "cpu.h"         /* bmi2 */
 #include "mem.h"         /* low level memory routines */
 #define FSE_STATIC_LINKING_ONLY
@@ -507,16 +507,6 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
         }
     }
 
-    /* prefetch dictionary content */
-    if (dctx->ddictIsCold) {
-        size_t const dictSize = (const char*)dctx->prefixStart - (const char*)dctx->virtualStart;
-        size_t const psmin = MIN(dictSize, (size_t)(64*nbSeq) /* heuristic */ );
-        size_t const pSize = MIN(psmin, 128 KB /* protection */ );
-        const void* const pStart = (const char*)dctx->dictEnd - pSize;
-        PREFETCH_AREA(pStart, pSize);
-        dctx->ddictIsCold = 0;
-    }
-
     return ip-istart;
 }
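
The block removed above was the previous cold-dictionary strategy: prefetch up to 128 KB of the dictionary tail in bulk (roughly 64 bytes per sequence, per the heuristic) before decoding, regardless of which parts the block actually references. The rest of the patch replaces it with routing through the prefetching decoder, which touches only the matches that will be copied. For reference, an area prefetch such as PREFETCH_AREA boils down to walking the range one cache line at a time; a sketch, assuming 64-byte cache lines and the GCC/Clang __builtin_prefetch intrinsic (not the exact zstd macro):

/* Bulk prefetch of a memory range, one cache line at a time. */
#define CACHE_LINE_SIZE 64
static void prefetch_area(const void* p, size_t size)
{
    const char* const ptr = (const char*)p;
    size_t pos;
    for (pos = 0; pos < size; pos += CACHE_LINE_SIZE)
        __builtin_prefetch(ptr + pos, 0 /* read */, 2 /* moderate locality */);
}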
@@ -1046,6 +1036,7 @@ ZSTD_decompressSequencesLong_body(
         /* prepare in advance */
         for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNb<seqAdvance); seqNb++) {
             sequences[seqNb] = ZSTD_decodeSequenceLong(&seqState, isLongOffset);
+            PREFETCH_L1(sequences[seqNb].match); PREFETCH_L1(sequences[seqNb].match + sequences[seqNb].matchLength - 1); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */
         }
         if (seqNb<seqAdvance) return ERROR(corruption_detected);
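
Prefetching both the first and the last byte of each upcoming match warms the cache lines at both ends of the copy source; for the short matches that dominate, that covers the whole match. On GCC/Clang, zstd's PREFETCH_L1 is expected to compile down to __builtin_prefetch with read intent and high temporal locality; a sketch of the equivalent operation (the helper name is illustrative, not a zstd macro):

/* Warm both ends of a match before its sequence is executed. */
static void prefetch_match(const void* match, size_t matchLength)
{
    __builtin_prefetch(match, 0, 3);                                /* first cache line */
    __builtin_prefetch((const char*)match + matchLength - 1, 0, 3); /* last cache line */
}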
@@ -1070,9 +1061,6 @@ ZSTD_decompressSequencesLong_body(
 
     /* save reps for next block */
     { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); }
-#undef STORED_SEQS
-#undef STORED_SEQS_MASK
-#undef ADVANCED_SEQS
     }
 
     /* last literal segment */
@@ -1213,20 +1201,27 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx,
     }
 
     /* Build Decoding Tables */
-    {   int nbSeq;
+    {   int usePrefetchDecoder = dctx->ddictIsCold;
+        int nbSeq;
         size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize);
         if (ZSTD_isError(seqHSize)) return seqHSize;
         ip += seqHSize;
         srcSize -= seqHSize;
 
-        if ( (!frame || (dctx->fParams.windowSize > (1<<24)))
-          && (nbSeq>0) ) {  /* could probably use a larger nbSeq limit */
+        if ( !usePrefetchDecoder
+          && (!frame || (dctx->fParams.windowSize > (1<<24)))
+          && (nbSeq>ADVANCED_SEQS) ) {  /* could probably use a larger nbSeq limit */
             U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr);
             U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */
-            if (shareLongOffsets >= minShare)
-                return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+            usePrefetchDecoder = (shareLongOffsets >= minShare);
         }
 
+        dctx->ddictIsCold = 0;
+
+        if (usePrefetchDecoder)
+            return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
+
+        /* else */
         return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset);
     }
 }
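
Condensed, the new dispatch reads: a cold dictionary now forces the prefetching decoder outright, the long-offset heuristic is consulted only otherwise, and the ddictIsCold flag is consumed so only the first block after a dictionary switch pays this routing. Note also that the trigger tightened from nbSeq>0 to nbSeq>ADVANCED_SEQS, which is why the #undef lines were dropped in the earlier hunk: the macro must remain visible here. A paraphrase of the logic above (bigWindow stands in for the !frame / windowSize test):

int usePrefetchDecoder = dctx->ddictIsCold;               /* cold dict => prefetch path */
if (!usePrefetchDecoder && bigWindow && nbSeq > ADVANCED_SEQS)
    usePrefetchDecoder = (shareLongOffsets >= minShare);  /* long-offset heuristic */
dctx->ddictIsCold = 0;                                    /* pay the routing only once */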