Merge pull request #2925 from embg/dict_training_sample_limit_size

Allow user to specify memory limit for dictionary training
Elliot Gorokhovsky, 2021-12-15 15:58:17 -05:00, committed by GitHub
commit c5f1e826ca
5 changed files with 23 additions and 5 deletions

programs/dibio.c

@@ -309,7 +309,7 @@ static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t
 int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                        const char** fileNamesTable, int nbFiles, size_t chunkSize,
                        ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
-                       ZDICT_fastCover_params_t* fastCoverParams, int optimize)
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
 {
     fileStats fs;
     size_t* sampleSizes;  /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
@@ -341,6 +341,11 @@ int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
     /* Limit the size of the training data to 2GB */
     /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
     loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
+    if (memLimit != 0) {
+        DISPLAYLEVEL(2, "! Warning : setting manual memory limit for dictionary training data at %u MB \n",
+                     (unsigned)(memLimit / (1 MB)));
+        loadedSize = (size_t)MIN(loadedSize, memLimit);
+    }
     srcBuffer = malloc(loadedSize+NOISELENGTH);
     sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
 }

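The effect of the dibio.c change is easiest to see in isolation: the user's limit is applied only after the existing clamps (available memory, corpus size, and the 2 GB default). Below is a minimal, self-contained C sketch of that ordering. The names S64, MIN, and MAX_SAMPLES_SIZE mirror those in dibio.c, but their definitions here are stand-ins, and the maxMem value in main() is invented for the example.

#include <stdint.h>
#include <stdio.h>

typedef int64_t S64;

#define MB ((size_t)1 << 20)
#define MIN(a, b) ((a) < (b) ? (a) : (b))
/* Stand-in for the 2 GB default cap described in the man-page change. */
#define MAX_SAMPLES_SIZE ((S64)2 << 30)

/* Mirrors the clamping order in DiB_trainFromFiles(): available memory
 * and corpus size first, then the 2 GB ceiling, and finally the user's
 * --memory limit whenever it is nonzero. */
static size_t effectiveLoadedSize(S64 maxMem, S64 totalSizeToLoad, unsigned memLimit)
{
    size_t loadedSize = (size_t)MIN(MIN(maxMem, totalSizeToLoad), MAX_SAMPLES_SIZE);
    if (memLimit != 0)
        loadedSize = MIN(loadedSize, (size_t)memLimit);
    return loadedSize;
}

int main(void)
{
    /* The scenario exercised by the playTests.sh addition below:
     * a 12 MB corpus trained with --memory=5MB loads only 5 MB. */
    size_t const loaded = effectiveLoadedSize((S64)512 * (S64)MB,  /* hypothetical RAM budget */
                                              (S64)12 * (S64)MB,   /* fs.totalSizeToLoad */
                                              (unsigned)(5 * MB)); /* --memory=5MB */
    printf("loaded: %u MB\n", (unsigned)(loaded / MB));  /* prints: loaded: 5 MB */
    return 0;
}

A memLimit of zero leaves the computation untouched, which is why the existing call sites in zstdcli.c below can pass the CLI's memLimit variable through unconditionally.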
programs/dibio.h

@@ -34,6 +34,6 @@
 int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                        const char** fileNamesTable, int nbFiles, size_t chunkSize,
                        ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
-                       ZDICT_fastCover_params_t* fastCoverParams, int optimize);
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit);
 #endif

programs/zstd.1.md

@@ -190,6 +190,10 @@ the last one takes effect.
     This is also used during compression when using with --patch-from=. In this case,
     this parameter overrides that maximum size allowed for a dictionary. (128 MB).
+    Additionally, this can be used to limit memory for dictionary training. This parameter
+    overrides the default limit of 2 GB. zstd will load training samples up to the memory limit
+    and ignore the rest.
+
 * `--stream-size=#` :
     Sets the pledged source size of input coming from a stream. This value must be exact, as it
     will be included in the produced frame header. Incorrect stream sizes will cause an error.
@@ -329,6 +333,8 @@ Compression of small files similar to the sample set will be greatly improved.
     resulting in a _small_ compression ratio improvement for this level.
 * `-B#`:
     Split input files into blocks of size # (default: no split)
+* `-M#`, `--memory=#`:
+    Limit the amount of sample data loaded for training (default: 2 GB). See above for details.
 * `--dictID=#`:
     A dictionary ID is a locally unique ID
     that a decoder can use to verify it is using the right dictionary.

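Note that the man-page addition reuses the pre-existing `-M#`/`--memory=#` option rather than introducing a new flag. As an illustrative (hypothetical) invocation, `zstd --train samples/* -o dictionary --memory=5MB` would load at most 5 MB of the `samples/*` corpus for training and ignore the rest, while a limit as small as `--memory=10K` is expected to make training fail for lack of data, which the new playTests.sh case below checks.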
programs/zstdcli.c

@@ -1327,18 +1327,18 @@ int main(int argCount, const char* argv[])
             int const optimize = !coverParams.k || !coverParams.d;
             coverParams.nbThreads = (unsigned)nbWorkers;
             coverParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize, memLimit);
         } else if (dict == fastCover) {
             int const optimize = !fastCoverParams.k || !fastCoverParams.d;
             fastCoverParams.nbThreads = (unsigned)nbWorkers;
             fastCoverParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize, memLimit);
         } else {
             ZDICT_legacy_params_t dictParams;
             memset(&dictParams, 0, sizeof(dictParams));
             dictParams.selectivityLevel = dictSelect;
             dictParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0, memLimit);
         }
 #else
         (void)dictCLevel; (void)dictSelect; (void)dictID; (void)maxDictSize;  /* not used when ZSTD_NODICT set */

tests/playTests.sh

@@ -1051,6 +1051,13 @@ then
 fi
 rm -f tmp* dictionary
+println "- Test --memory for dictionary compression"
+datagen -g12M -P90 > tmpCorpusHighCompress
+zstd --train -B2K tmpCorpusHighCompress -o tmpDictHighCompress --memory=10K && die "Dictionary training should fail : --memory too low (10K)"
+zstd --train -B2K tmpCorpusHighCompress -o tmpDictHighCompress --memory=5MB 2> zstTrainWithMemLimitStdErr
+cat zstTrainWithMemLimitStdErr | grep "setting manual memory limit for dictionary training data at 5 MB"
+cat zstTrainWithMemLimitStdErr | grep "Training samples set too large (12 MB); training on 5 MB only..."
+rm zstTrainWithMemLimitStdErr
 println "\n===> fastCover dictionary builder : advanced options "
 TESTFILE="$PRGDIR"/zstdcli.c