Merge pull request #2925 from embg/dict_training_sample_limit_size

Allow user to specify memory limit for dictionary training
Elliot Gorokhovsky, 2021-12-15 15:58:17 -05:00, committed by GitHub
commit c5f1e826ca
5 changed files with 23 additions and 5 deletions

programs/dibio.c

@@ -309,7 +309,7 @@ static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t
 int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                        const char** fileNamesTable, int nbFiles, size_t chunkSize,
                        ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
-                       ZDICT_fastCover_params_t* fastCoverParams, int optimize)
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit)
 {
     fileStats fs;
     size_t* sampleSizes;  /* vector of sample sizes. Each sample can be up to SAMPLESIZE_MAX */
@@ -341,6 +341,11 @@ int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
     /* Limit the size of the training data to 2GB */
     /* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
     loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
+    if (memLimit != 0) {
+        DISPLAYLEVEL(2, "! Warning : setting manual memory limit for dictionary training data at %u MB \n",
+                     (unsigned)(memLimit / (1 MB)));
+        loadedSize = (size_t)MIN(loadedSize, memLimit);
+    }
     srcBuffer = malloc(loadedSize+NOISELENGTH);
     sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
 }

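The effect of the dibio.c change is easiest to see in isolation: the user's limit is applied only after the existing clamps (available memory, corpus size, and the 2 GB default). Below is a minimal, self-contained C sketch of that ordering. The names S64, MIN, and MAX_SAMPLES_SIZE mirror those in dibio.c, but their definitions here are stand-ins, and the maxMem value in main() is invented for the example.

#include <stdint.h>
#include <stdio.h>

typedef int64_t S64;

#define MB ((size_t)1 << 20)
#define MIN(a, b) ((a) < (b) ? (a) : (b))
/* Stand-in for the 2 GB default cap described in the man-page change. */
#define MAX_SAMPLES_SIZE ((S64)2 << 30)

/* Mirrors the clamping order in DiB_trainFromFiles(): available memory
 * and corpus size first, then the 2 GB ceiling, and finally the user's
 * --memory limit whenever it is nonzero. */
static size_t effectiveLoadedSize(S64 maxMem, S64 totalSizeToLoad, unsigned memLimit)
{
    size_t loadedSize = (size_t)MIN(MIN(maxMem, totalSizeToLoad), MAX_SAMPLES_SIZE);
    if (memLimit != 0)
        loadedSize = MIN(loadedSize, (size_t)memLimit);
    return loadedSize;
}

int main(void)
{
    /* The scenario exercised by the playTests.sh addition below:
     * a 12 MB corpus trained with --memory=5MB loads only 5 MB. */
    size_t const loaded = effectiveLoadedSize((S64)512 * (S64)MB,  /* hypothetical RAM budget */
                                              (S64)12 * (S64)MB,   /* fs.totalSizeToLoad */
                                              (unsigned)(5 * MB)); /* --memory=5MB */
    printf("loaded: %u MB\n", (unsigned)(loaded / MB));  /* prints: loaded: 5 MB */
    return 0;
}

A memLimit of zero leaves the computation untouched, which is why the existing call sites in zstdcli.c below can pass the CLI's memLimit variable through unconditionally.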
programs/dibio.h

@@ -34,6 +34,6 @@
 int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
                        const char** fileNamesTable, int nbFiles, size_t chunkSize,
                        ZDICT_legacy_params_t* params, ZDICT_cover_params_t* coverParams,
-                       ZDICT_fastCover_params_t* fastCoverParams, int optimize);
+                       ZDICT_fastCover_params_t* fastCoverParams, int optimize, unsigned memLimit);
 #endif

programs/zstd.1.md

@@ -190,6 +190,10 @@ the last one takes effect.
     This is also used during compression when using with --patch-from=. In this case,
     this parameter overrides that maximum size allowed for a dictionary. (128 MB).
+    Additionally, this can be used to limit memory for dictionary training. This parameter
+    overrides the default limit of 2 GB. zstd will load training samples up to the memory limit
+    and ignore the rest.
+
 * `--stream-size=#` :
     Sets the pledged source size of input coming from a stream. This value must be exact, as it
     will be included in the produced frame header. Incorrect stream sizes will cause an error.
@@ -329,6 +333,8 @@ Compression of small files similar to the sample set will be greatly improved.
     resulting in a _small_ compression ratio improvement for this level.
 * `-B#`:
     Split input files into blocks of size # (default: no split)
+* `-M#`, `--memory=#`:
+    Limit the amount of sample data loaded for training (default: 2 GB). See above for details.
 * `--dictID=#`:
     A dictionary ID is a locally unique ID
     that a decoder can use to verify it is using the right dictionary.

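Note that the man-page addition reuses the pre-existing `-M#`/`--memory=#` option rather than introducing a new flag. As an illustrative (hypothetical) invocation, `zstd --train samples/* -o dictionary --memory=5MB` would load at most 5 MB of the `samples/*` corpus for training and ignore the rest, while a limit as small as `--memory=10K` is expected to make training fail for lack of data, which the new playTests.sh case below checks.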
programs/zstdcli.c

@@ -1327,18 +1327,18 @@ int main(int argCount, const char* argv[])
             int const optimize = !coverParams.k || !coverParams.d;
             coverParams.nbThreads = (unsigned)nbWorkers;
             coverParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, &coverParams, NULL, optimize, memLimit);
         } else if (dict == fastCover) {
             int const optimize = !fastCoverParams.k || !fastCoverParams.d;
             fastCoverParams.nbThreads = (unsigned)nbWorkers;
             fastCoverParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, NULL, NULL, &fastCoverParams, optimize, memLimit);
         } else {
             ZDICT_legacy_params_t dictParams;
             memset(&dictParams, 0, sizeof(dictParams));
             dictParams.selectivityLevel = dictSelect;
             dictParams.zParams = zParams;
-            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0);
+            operationResult = DiB_trainFromFiles(outFileName, maxDictSize, filenames->fileNames, (int)filenames->tableSize, blockSize, &dictParams, NULL, NULL, 0, memLimit);
         }
 #else
         (void)dictCLevel; (void)dictSelect; (void)dictID; (void)maxDictSize;  /* not used when ZSTD_NODICT set */

tests/playTests.sh

@@ -1051,6 +1051,13 @@ then
 fi
 rm -f tmp* dictionary
+println "- Test --memory for dictionary compression"
+datagen -g12M -P90 > tmpCorpusHighCompress
+zstd --train -B2K tmpCorpusHighCompress -o tmpDictHighCompress --memory=10K && die "Dictionary training should fail : --memory too low (10K)"
+zstd --train -B2K tmpCorpusHighCompress -o tmpDictHighCompress --memory=5MB 2> zstTrainWithMemLimitStdErr
+cat zstTrainWithMemLimitStdErr | grep "setting manual memory limit for dictionary training data at 5 MB"
+cat zstTrainWithMemLimitStdErr | grep "Training samples set too large (12 MB); training on 5 MB only..."
+rm zstTrainWithMemLimitStdErr
 println "\n===> fastCover dictionary builder : advanced options "
 TESTFILE="$PRGDIR"/zstdcli.c