diff --git a/programs/dibio.c b/programs/dibio.c
index b95bab34e..ba15d2106 100644
--- a/programs/dibio.c
+++ b/programs/dibio.c
@@ -42,6 +42,7 @@
 
 #define SAMPLESIZE_MAX (128 KB)
 #define MEMMULT 11 /* rough estimation : memory cost to analyze 1 byte of sample */
+#define COVER_MEMMULT 9 /* rough estimation : memory cost to analyze 1 byte of sample */
 static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_t)(512 MB) << sizeof(size_t));
 
 #define NOISELENGTH 32
@@ -118,10 +119,44 @@ static unsigned DiB_loadFiles(void* buffer, size_t* bufferSizePtr,
             fileSizes[n] = fileSize;
             fclose(f);
     }   }
+    DISPLAYLEVEL(2, "\r%79s\r", "");
     *bufferSizePtr = pos;
     return n;
 }
 
+#define DiB_rotl32(x,r) (((x) << (r)) | ((x) >> (32 - (r))))
+/* DiB_rand() :
+ * advances the generator state stored in *src (multiply, xor, rotate),
+ * and returns the new state shifted right by 5 bits */
+static U32 DiB_rand(U32* src)
+{
+    static const U32 prime1 = 2654435761U;
+    static const U32 prime2 = 2246822519U;
+    U32 rand32 = *src;
+    rand32 *= prime1;
+    rand32 ^= prime2;
+    rand32 = DiB_rotl32(rand32, 13);
+    *src = rand32;
+    return rand32 >> 5;
+}
+
+/* DiB_shuffle() :
+ * reorders the entries of fileNamesTable in place (Fisher-Yates style swaps).
+ * The seed is a constant, so a given table is always shuffled into the same order.
+ * Tables holding fewer than 2 entries are left untouched. */
+static void DiB_shuffle(const char** fileNamesTable, unsigned nbFiles) {
+  /* Initialize the pseudorandom number generator */
+  U32 seed = 0xFD2FB528;
+  unsigned i;
+  if (nbFiles < 2) return;   /* nothing to swap; also prevents nbFiles - 1 from wrapping around when nbFiles == 0 */
+  for (i = nbFiles - 1; i > 0; --i) {
+    unsigned const j = DiB_rand(&seed) % (i + 1);
+    const char* tmp = fileNamesTable[j];
+    fileNamesTable[j] = fileNamesTable[i];
+    fileNamesTable[i] = tmp;
+  }
+}
+
 
 /*-********************************************************
 *  Dictionary training functions
@@ -202,7 +237,13 @@ size_t ZDICT_trainFromBuffer_unsafe(void* dictBuffer, size_t dictBufferCapacity,
 
+/* DiB_trainFromFiles() :
+ * `params` selects ZDICT_trainFromBuffer_unsafe(), `coverParams` selects the COVER_* trainers;
+ * when both are non-NULL, `params` takes precedence; passing NULL for both is an error.
+ * `optimizeCover` is only consulted when `params` is NULL :
+ * non-zero calls COVER_optimizeTrainFromBuffer(), zero calls COVER_trainFromBuffer(). */
 int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
                        const char** fileNamesTable, unsigned nbFiles,
-                       ZDICT_params_t params)
+                       ZDICT_params_t *params, COVER_params_t *coverParams,
+                       int optimizeCover)
 {
     void* const dictBuffer = malloc(maxDictSize);
     size_t* const fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
@@ -213,8 +254,10 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
     int result = 0;
 
     /* Checks */
+    if (params) g_displayLevel = params->notificationLevel;
+    else if (coverParams) g_displayLevel = coverParams->notificationLevel;
+    else EXM_THROW(13, "Neither dictionary algorithm selected"); /* should not happen */
     if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
-    g_displayLevel = params.notificationLevel;
     if (g_tooLargeSamples) {
         DISPLAYLEVEL(2, "! Warning : some samples are very large \n");
         DISPLAYLEVEL(2, "! Note that dictionary is only useful for small files or beginning of large files. \n");
@@ -233,12 +276,31 @@ int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
         DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
 
     /* Load input buffer */
+    DISPLAYLEVEL(3, "Shuffling input files\n");
+    DiB_shuffle(fileNamesTable, nbFiles);
     nbFiles = DiB_loadFiles(srcBuffer, &benchedSize, fileSizes, fileNamesTable, nbFiles);
-    DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
 
-    {   size_t const dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
-                            srcBuffer, fileSizes, nbFiles,
-                            params);
+    {
+        size_t dictSize;
+        if (params) {
+            DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
+            dictSize = ZDICT_trainFromBuffer_unsafe(dictBuffer, maxDictSize,
+                                                    srcBuffer, fileSizes, nbFiles,
+                                                    *params);
+        } else if (optimizeCover) {
+            dictSize = COVER_optimizeTrainFromBuffer(
+                dictBuffer, maxDictSize, srcBuffer, fileSizes, nbFiles,
+                coverParams);
+            if (!ZDICT_isError(dictSize)) {
+              DISPLAYLEVEL(2, "smoothing=%d\nkMin=%d\nkStep=%d\nkMax=%d\nd=%d\n",
+                           coverParams->smoothing, coverParams->kMin,
+                           coverParams->kStep, coverParams->kMax, coverParams->d);
+            }
+        } else {
+            dictSize = COVER_trainFromBuffer(dictBuffer, maxDictSize,
+                                             srcBuffer, fileSizes, nbFiles,
+                                             *coverParams);
+        }
         if (ZDICT_isError(dictSize)) {
             DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */
             result = 1;
diff --git a/programs/dibio.h b/programs/dibio.h
index 6780d8698..e61d0042c 100644
--- a/programs/dibio.h
+++ b/programs/dibio.h
@@ -32,7 +32,7 @@
 */
 int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
                        const char** fileNamesTable, unsigned nbFiles,
-                       ZDICT_params_t parameters);
-
+                       ZDICT_params_t *params, COVER_params_t *coverParams,
+                       int optimizeCover);
 
 #endif
diff --git a/programs/zstdcli.c b/programs/zstdcli.c
index 978ffcfe0..f4d33d3f8 100644
--- a/programs/zstdcli.c
+++ b/programs/zstdcli.c
@@ -127,6 +127,8 @@ static int usage_advanced(const char* programName)
     DISPLAY( "\n");
     DISPLAY( "Dictionary builder :\n");
     DISPLAY( "--train ## : create a dictionary from a training set of files \n");
+    DISPLAY( "--cover=k=#,d=# : use the cover algorithm with parameters k and d \n");
+    DISPLAY( "--optimize-cover[=kmin=#,kstep=#,kmax=#,d=#] : optimize cover parameters with optional parameters\n");
     DISPLAY( " -o file : `file` is dictionary name (default: %s) \n", g_defaultDictName);
     DISPLAY( "--maxdict ## : limit dictionary to specified size (default : %u) \n", g_defaultMaxDictSize);
     DISPLAY( " -s# : dictionary selectivity level (default: %u)\n", g_defaultSelectivityLevel);
@@ -192,6 +194,32 @@ static unsigned longCommandWArg(const char** stringPtr, const char* longCommand)
 }
 
 
+#ifndef ZSTD_NODICT
+/**
+ * parseCoverParameters() :
+ * reads cover parameters from stringPtr (e.g. "smoothing=100,kmin=48,kstep=4,kmax=64,d=8",
+ * i.e. the text which follows "--cover=" or "--optimize-cover=") into *params.
+ * *params is zeroed first, so any parameter which is not listed is left at 0.
+ * Accepted keys : smoothing, k (same as kMin), kMin/kmin, kStep/kstep, kMax/kmax, d.
+ * @return 1 means that cover parameters were correct
+ * @return 0 in case of malformed parameters
+ */
+static unsigned parseCoverParameters(const char* stringPtr, COVER_params_t *params)
+{
+    memset(params, 0, sizeof(*params));
+    for (; ;) {
+        if (longCommandWArg(&stringPtr, "smoothing=")) { params->smoothing = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "k=") || longCommandWArg(&stringPtr, "kMin=") || longCommandWArg(&stringPtr, "kmin=")) { params->kMin = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "kStep=") || longCommandWArg(&stringPtr, "kstep=")) { params->kStep = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "kMax=") || longCommandWArg(&stringPtr, "kmax=")) { params->kMax = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        if (longCommandWArg(&stringPtr, "d=")) { params->d = readU32FromChar(&stringPtr); if (stringPtr[0]==',') { stringPtr++; continue; } else break; }
+        return 0;
+    }
+    if (stringPtr[0] != 0) return 0;
+    DISPLAYLEVEL(4, "smoothing=%d\nkMin=%d\nkStep=%d\nkMax=%d\nd=%d\n", params->smoothing, params->kMin, params->kStep, params->kMax, params->d);
+    return 1;
+}
+#endif
 /** parseCompressionParameters() :
  *  reads compression parameters from *stringPtr (e.g. "--zstd=wlog=23,clog=23,hlog=22,slog=6,slen=3,tlen=48,strat=6") into *params
  *  @return 1 means that compression parameters were correct
@@ -254,6 +282,10 @@ int main(int argCount, const char* argv[])
     char* fileNamesBuf = NULL;
     unsigned fileNamesNb;
 #endif
+#ifndef ZSTD_NODICT
+    COVER_params_t coverParams;
+    int cover = 0;
+#endif
 
     /* init */
     (void)recursive; (void)cLevelLast; /* not used when ZSTD_NOBENCH set */
@@ -318,6 +350,20 @@ int main(int argCount, const char* argv[])
                     if (!strcmp(argument, "--rm")) { FIO_setRemoveSrcFile(1); continue; }
 
                     /* long commands with arguments */
+#ifndef ZSTD_NODICT
+                    if (longCommandWArg(&argument, "--cover=")) {
+                      cover=1; if (!parseCoverParameters(argument, &coverParams)) CLEAN_RETURN(badusage(programName));
+                      continue;
+                    }
+                    if (longCommandWArg(&argument, "--optimize-cover")) {
+                      cover=2;
+                      /* Allow optional arguments following an = */
+                      if (*argument == 0) { memset(&coverParams, 0, sizeof(coverParams)); }
+                      else if (*argument++ != '=') { CLEAN_RETURN(badusage(programName)); }
+                      else if (!parseCoverParameters(argument, &coverParams)) { CLEAN_RETURN(badusage(programName)); }
+                      continue;
+                    }
+#endif
                     if (longCommandWArg(&argument, "--memlimit=")) { memLimit = readU32FromChar(&argument); continue; }
                     if (longCommandWArg(&argument, "--memory=")) { memLimit = readU32FromChar(&argument); continue; }
                     if (longCommandWArg(&argument, "--memlimit-decompress=")) { memLimit = readU32FromChar(&argument); continue; }
@@ -520,13 +566,20 @@ int main(int argCount, const char* argv[])
     /* Check if dictionary builder is selected */
     if (operation==zom_train) {
 #ifndef ZSTD_NODICT
-        ZDICT_params_t dictParams;
-        memset(&dictParams, 0, sizeof(dictParams));
-        dictParams.compressionLevel = dictCLevel;
-        dictParams.selectivityLevel = dictSelect;
-        dictParams.notificationLevel = displayLevel;
-        dictParams.dictID = dictID;
-        DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, dictParams);
+        if (cover) {
+            coverParams.compressionLevel = dictCLevel;
+            coverParams.notificationLevel = displayLevel;
+            coverParams.dictID = dictID;
+            DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, NULL, &coverParams, cover - 1);
+        } else {
+            ZDICT_params_t dictParams;
+            memset(&dictParams, 0, sizeof(dictParams));
+            dictParams.compressionLevel = dictCLevel;
+            dictParams.selectivityLevel = dictSelect;
+            dictParams.notificationLevel = displayLevel;
+            dictParams.dictID = dictID;
+            DiB_trainFromFiles(outFileName, maxDictSize, filenameTable, filenameIdx, &dictParams, NULL, 0);
+        }
 #endif
         goto _end;
     }