diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile new file mode 100644 index 000000000..72ce04f2a --- /dev/null +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/Makefile @@ -0,0 +1,44 @@ +ARG := + +CC ?= gcc +CFLAGS ?= -O3 +INCLUDES := -I ../randomDictBuilder -I ../../../programs -I ../../../lib/common -I ../../../lib -I ../../../lib/dictBuilder + +RANDOM_FILE := ../randomDictBuilder/random.c +IO_FILE := ../randomDictBuilder/io.c + +all: run clean + +.PHONY: run +run: benchmark + echo "Benchmarking with $(ARG)" + ./benchmark $(ARG) + +.PHONY: test +test: benchmarkTest clean + +.PHONY: benchmarkTest +benchmarkTest: benchmark test.sh + sh test.sh + +benchmark: benchmark.o io.o random.o libzstd.a + $(CC) $(CFLAGS) benchmark.o io.o random.o libzstd.a -o benchmark + +benchmark.o: benchmark.c + $(CC) $(CFLAGS) $(INCLUDES) -c benchmark.c + +random.o: $(RANDOM_FILE) + $(CC) $(CFLAGS) $(INCLUDES) -c $(RANDOM_FILE) + +io.o: $(IO_FILE) + $(CC) $(CFLAGS) $(INCLUDES) -c $(IO_FILE) + +libzstd.a: + $(MAKE) -C ../../../lib libzstd.a + mv ../../../lib/libzstd.a . 
+ +.PHONY: clean +clean: + rm -f *.o benchmark libzstd.a + $(MAKE) -C ../../../lib clean + echo "Cleaning is completed" diff --git a/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md new file mode 100644 index 000000000..de783a0ec --- /dev/null +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/README.md @@ -0,0 +1,47 @@ +Benchmarking Dictionary Builder + +### Permitted Argument: +Input File/Directory (in=fileName): required; file/directory used to build dictionary; if directory, will operate recursively for files inside directory; can include multiple files/directories, each following "in=" + +### Running Test: +make test + +### Usage: +Benchmark given input files: make ARG= followed by permitted arguments + +### Examples: +make ARG="in=../../../lib/dictBuilder in=../../../lib/compress" + +### Benchmarking Result: + +github: +| Algorithm | Speed(sec) | Compression Ratio | +| ------------- |:-------------:| ------------------:| +| nodict | 0.000004 | 2.999642 | +| random | 0.180238 | 8.786957 | +| cover | 33.891987 | 10.430999 | +| legacy | 1.077569 | 8.989482 | + +hg-commands +| Algorithm | Speed(sec) | Compression Ratio | +| ------------- |:-------------:| ------------------:| +| nodict | 0.000006 | 2.425291 | +| random | 0.088735 | 3.489515 | +| cover | 35.447300 | 4.030274 | +| legacy | 1.048509 | 3.911896 | + +hg-manifest +| Algorithm | Speed(sec) | Compression Ratio | +| ------------- |:-------------:| ------------------:| +| nodict | 0.000005 | 1.866385 | +| random | 1.148231 | 2.309485 | +| cover | 509.685257 | 2.575331 | +| legacy | 10.705866 | 2.506775 | + +hg-changelog +| Algorithm | Speed(sec) | Compression Ratio | +| ------------- |:-------------:| ------------------:| +| nodict | 0.000005 | 1.377613 | +| random | 0.706434 | 2.096785 | +| cover | 122.815783 | 2.175706 | +| legacy | 3.010318 | 2.058273 | diff --git 
a/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c new file mode 100644 index 000000000..640419649 --- /dev/null +++ b/contrib/experimental_dict_builders/benchmarkDictBuilder/benchmark.c @@ -0,0 +1,374 @@ +#include /* fprintf */ +#include /* malloc, free, qsort */ +#include /* strcmp, strlen */ +#include /* errno */ +#include +#include +#include "random.h" +#include "dictBuilder.h" +#include "zstd_internal.h" /* includes zstd.h */ +#include "io.h" +#include "util.h" +#include "zdict.h" + + + +/*-************************************* +* Console display (every message is written to stderr) +***************************************/ +#define DISPLAY(...) fprintf(stderr, __VA_ARGS__) +#define DISPLAYLEVEL(l, ...) if (displayLevel>=l) { DISPLAY(__VA_ARGS__); } /* uses the displayLevel visible at the call site */ + +static const U64 g_refreshRate = SEC_TO_MICRO / 6; /* DISPLAYUPDATE prints again only once this span has elapsed, or always when displayLevel>=4 */ +static UTIL_time_t g_displayClock = UTIL_TIME_INITIALIZER; /* time of the last DISPLAYUPDATE output */ + +#define DISPLAYUPDATE(l, ...) { if (displayLevel>=l) { \ + if ((UTIL_clockSpanMicro(g_displayClock) > g_refreshRate) || (displayLevel>=4)) \ + { g_displayClock = UTIL_getTime(); DISPLAY(__VA_ARGS__); \ + if (displayLevel>=4) fflush(stderr); } } } + + +/*-************************************* +* Exceptions (EXM_THROW prints the message, then calls exit() with the error code) +***************************************/ +#ifndef DEBUG +# define DEBUG 0 +#endif +#define DEBUGOUTPUT(...) if (DEBUG) DISPLAY(__VA_ARGS__); +#define EXM_THROW(error, ...) 
\ +{ \ + DEBUGOUTPUT("Error defined at %s, line %i : \n", __FILE__, __LINE__); \ + DISPLAY("Error %i : ", error); \ + DISPLAY(__VA_ARGS__); \ + DISPLAY("\n"); \ + exit(error); \ +} + + +/*-************************************* +* Constants +***************************************/ +static const unsigned g_defaultMaxDictSize = 110 KB; +#define DEFAULT_CLEVEL 3 +#define DEFAULT_DISPLAYLEVEL 2 + + +/*-************************************* +* Struct +***************************************/ +typedef struct { + const void* dictBuffer; /* dictionary content; released by freeDictInfo() */ + size_t dictSize; /* size of that content; 0 means no dictionary was trained */ +} dictInfo; + + +/*-************************************* +* Dictionary related operations +***************************************/ +/** createDictFromFiles() : + * Train a dictionary with the algorithm selected by the first non-NULL params pointer (random, then cover, then legacy); when all three are NULL nothing is trained and dictSize is 0 + * @return newly allocated dictInfo holding the dictionary buffer and its size, or NULL if training fails; release it with freeDictInfo(). NOTE(review): the dictInfo allocation itself is not NULL-checked before it is written + */ +dictInfo* createDictFromFiles(sampleInfo *info, unsigned maxDictSize, + ZDICT_random_params_t *randomParams, ZDICT_cover_params_t *coverParams, + ZDICT_legacy_params_t *legacyParams) { + unsigned const displayLevel = randomParams ? randomParams->zParams.notificationLevel : + coverParams ? coverParams->zParams.notificationLevel : + legacyParams ? 
legacyParams->zParams.notificationLevel : + DEFAULT_DISPLAYLEVEL; /* no dict */ + void* const dictBuffer = malloc(maxDictSize); + + dictInfo* dInfo = NULL; + + /* Checks */ + if (!dictBuffer) + EXM_THROW(12, "not enough memory for trainFromFiles"); /* should not happen */ + + { size_t dictSize; + if(randomParams) { + dictSize = ZDICT_trainFromBuffer_random(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *randomParams); + }else if(coverParams) { + dictSize = ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, coverParams); + } else if(legacyParams) { + dictSize = ZDICT_trainFromBuffer_legacy(dictBuffer, maxDictSize, info->srcBuffer, + info->samplesSizes, info->nbSamples, *legacyParams); + } else { + dictSize = 0; /* no params given : benchmark without a dictionary */ + } + if (ZDICT_isError(dictSize)) { + DISPLAYLEVEL(1, "dictionary training failed : %s \n", ZDICT_getErrorName(dictSize)); /* should not happen */ + free(dictBuffer); + return dInfo; /* still NULL at this point */ + } + dInfo = (dictInfo *)malloc(sizeof(dictInfo)); + dInfo->dictBuffer = dictBuffer; + dInfo->dictSize = dictSize; + } + return dInfo; +} + + +/** compressWithDict() : + * Compress every sample of the sample buffer one by one, using the dictionary stored in dInfo (when its size is non-zero) and the given compression level + * @return compression ratio (total original size / total compressed size), or -1 on failure + */ +double compressWithDict(sampleInfo *srcInfo, dictInfo* dInfo, int compressionLevel, int displayLevel) { + /* Local variables */ + size_t totalCompressedSize = 0; + size_t totalOriginalSize = 0; + const unsigned hasDict = dInfo->dictSize > 0 ? 
1 : 0; + double cRatio; + size_t dstCapacity; + int i; + + /* Pointers */ + ZSTD_CDict *cdict = NULL; + ZSTD_CCtx* cctx = NULL; + size_t *offsets = NULL; + void* dst = NULL; + + /* Allocate dst with enough space to compress the maximum sized sample. NOTE(review): the loop index below shadows the outer `i` */ + { + size_t maxSampleSize = 0; + for (int i = 0; i < srcInfo->nbSamples; i++) { + maxSampleSize = MAX(srcInfo->samplesSizes[i], maxSampleSize); + } + dstCapacity = ZSTD_compressBound(maxSampleSize); + dst = malloc(dstCapacity); + } + + /* Calculate the start offset of each sample inside srcBuffer (running sum of the sample sizes). NOTE(review): offsets is written before its allocation is checked */ + offsets = (size_t *)malloc((srcInfo->nbSamples + 1) * sizeof(size_t)); + offsets[0] = 0; + for (i = 1; i <= srcInfo->nbSamples; i++) { + offsets[i] = offsets[i - 1] + srcInfo->samplesSizes[i - 1]; + } + + /* Create the cctx; a failed dst allocation is also caught here */ + cctx = ZSTD_createCCtx(); + if(!cctx || !dst) { + cRatio = -1; + goto _cleanup; + } + + /* Create CDict if there's a dictionary stored on buffer */ + if (hasDict) { + cdict = ZSTD_createCDict(dInfo->dictBuffer, dInfo->dictSize, compressionLevel); + if(!cdict) { + cRatio = -1; + goto _cleanup; + } + } + + /* Compress each sample and sum their sizes*/ + const BYTE *const samples = (const BYTE *)srcInfo->srcBuffer; + for (i = 0; i < srcInfo->nbSamples; i++) { + size_t compressedSize; + if(hasDict) { + compressedSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity, samples + offsets[i], srcInfo->samplesSizes[i], cdict); + } else { + compressedSize = ZSTD_compressCCtx(cctx, dst, dstCapacity,samples + offsets[i], srcInfo->samplesSizes[i], compressionLevel); + } + if (ZSTD_isError(compressedSize)) { + cRatio = -1; + goto _cleanup; + } + totalCompressedSize += compressedSize; + } + + /* Sum original sizes */ + for (i = 0; inbSamples; i++) { + totalOriginalSize += srcInfo->samplesSizes[i]; + } + + /* Calculate compression ratio. NOTE(review): the size_t totals are printed with %lu */ + DISPLAYLEVEL(2, "original size is %lu\n", totalOriginalSize); + DISPLAYLEVEL(2, "compressed size is %lu\n", totalCompressedSize); + cRatio = (double)totalOriginalSize/(double)totalCompressedSize; + 
+_cleanup: + free(dst); + free(offsets); + ZSTD_freeCCtx(cctx); + ZSTD_freeCDict(cdict); + return cRatio; +} + + +/** freeDictInfo() : + * Free the dictionary buffer and the dictInfo itself; a NULL info is ignored + */ +void freeDictInfo(dictInfo* info) { + if (!info) return; + if (info->dictBuffer) free((void*)(info->dictBuffer)); + free(info); +} + + + +/*-******************************************************** + * Benchmarking functions +**********************************************************/ +/** benchmarkDictBuilder() : + * Measure how long a dictionary builder takes and the compression ratio obtained with the dictionary built; the builder is chosen by the first non-NULL param pointer, none meaning no dictionary + * @return 0 if the benchmark succeeds, 1 if training or compression fails + */ +int benchmarkDictBuilder(sampleInfo *srcInfo, unsigned maxDictSize, ZDICT_random_params_t *randomParam, + ZDICT_cover_params_t *coverParam, ZDICT_legacy_params_t *legacyParam) { + /* Local variables */ + const unsigned displayLevel = randomParam ? randomParam->zParams.notificationLevel : + coverParam ? coverParam->zParams.notificationLevel : + legacyParam ? legacyParam->zParams.notificationLevel : + DEFAULT_DISPLAYLEVEL; /* no dict */ + const char* name = randomParam ? "RANDOM" : + coverParam ? "COVER" : + legacyParam ? "LEGACY" : + "NODICT"; /* no dict */ + const unsigned cLevel = randomParam ? randomParam->zParams.compressionLevel : + coverParam ? coverParam->zParams.compressionLevel : + legacyParam ? 
legacyParam->zParams.compressionLevel : + DEFAULT_CLEVEL; /* no dict */ + int result = 0; + + /* Time the call to createDictFromFiles */ + const UTIL_time_t begin = UTIL_getTime(); + dictInfo* dInfo = createDictFromFiles(srcInfo, maxDictSize, randomParam, coverParam, legacyParam); + const U64 timeMicro = UTIL_clockSpanMicro(begin); + const double timeSec = timeMicro / (double)SEC_TO_MICRO; + if (!dInfo) { + DISPLAYLEVEL(1, "%s does not train successfully\n", name); + result = 1; + goto _cleanup; + } + DISPLAYLEVEL(2, "%s took %f seconds to execute \n", name, timeSec); + + /* Calculate compression ratio; a negative value signals a compression failure */ + const double cRatio = compressWithDict(srcInfo, dInfo, cLevel, displayLevel); + if (cRatio < 0) { + DISPLAYLEVEL(1, "Compressing with %s dictionary does not work\n", name); + result = 1; + goto _cleanup; + + } + DISPLAYLEVEL(2, "Compression ratio with %s dictionary is %f\n", name, cRatio); + +_cleanup: + freeDictInfo(dInfo); + return result; +} + + + +int main(int argCount, const char* argv[]) +{ + const int displayLevel = DEFAULT_DISPLAYLEVEL; + const char* programName = argv[0]; + int result = 0; + + /* Initialize arguments to default values */ + const unsigned k = 200; + const unsigned d = 6; + const unsigned cLevel = DEFAULT_CLEVEL; + const unsigned dictID = 0; + const unsigned maxDictSize = g_defaultMaxDictSize; + + /* Initialize table to store input files : one slot per command-line argument. NOTE(review): the allocation is not checked */ + const char** filenameTable = (const char**)malloc(argCount * sizeof(const char*)); + unsigned filenameIdx = 0; + + char* fileNamesBuf = NULL; + unsigned fileNamesNb = filenameIdx; + const int followLinks = 0; + const char** extendedFileList = NULL; + + /* Parse arguments : only "in=" arguments are accepted, anything else ends the program with status 1. NOTE(review): filenameTable is not freed on that path */ + for (int i = 1; i < argCount; i++) { + const char* argument = argv[i]; + if (longCommandWArg(&argument, "in=")) { + filenameTable[filenameIdx] = argument; + filenameIdx++; + continue; + } + DISPLAYLEVEL(1, "benchmark: Incorrect parameters\n"); + return 1; + } + + /* Get the list of all files recursively (because followLinks==0)*/ + extendedFileList 
= UTIL_createFileList(filenameTable, filenameIdx, &fileNamesBuf, + &fileNamesNb, followLinks); + if (extendedFileList) { + unsigned u; + for (u=0; u='0') && (**stringPtr <='9')) { + unsigned const max = (((unsigned)(-1)) / 10) - 1; + if (result > max) exit(1); + result *= 10, result += **stringPtr - '0', (*stringPtr)++ ; + } + if ((**stringPtr=='K') || (**stringPtr=='M')) { + unsigned const maxK = ((unsigned)(-1)) >> 10; + if (result > maxK) exit(1); + result <<= 10; + if (**stringPtr=='M') { + if (result > maxK) exit(1); + result <<= 10; + } + (*stringPtr)++; /* skip `K` or `M` */ + if (**stringPtr=='i') (*stringPtr)++; /* skip an optional `i` */ + if (**stringPtr=='B') (*stringPtr)++; /* skip an optional `B` */ + } + return result; +} + +unsigned longCommandWArg(const char** stringPtr, const char* longCommand){ + size_t const comSize = strlen(longCommand); + int const result = !strncmp(*stringPtr, longCommand, comSize); /* 1 when *stringPtr starts with longCommand */ + if (result) *stringPtr += comSize; + return result; +} + /* ******************************************************** * File related operations @@ -139,7 +172,7 @@ static void shuffle(const char** fileNamesTable, unsigned nbFiles) { /*-******************************************************** * Dictionary training functions **********************************************************/ -static size_t findMaxMem(unsigned long long requiredMem) { +size_t findMaxMem(unsigned long long requiredMem) { size_t const step = 8 MB; void* testmem = NULL; diff --git a/contrib/randomDictBuilder/io.h b/contrib/experimental_dict_builders/randomDictBuilder/io.h similarity index 78% rename from contrib/randomDictBuilder/io.h rename to contrib/experimental_dict_builders/randomDictBuilder/io.h index 55967f76e..0ee24604e 100644 --- a/contrib/randomDictBuilder/io.h +++ b/contrib/experimental_dict_builders/randomDictBuilder/io.h @@ -48,3 +48,13 @@ void freeSampleInfo(sampleInfo *info); * Save data stored on buff to dictFileName */ void saveDict(const char* dictFileName, const void* buff, size_t buffSize); + + +unsigned 
readU32FromChar(const char** stringPtr); + +/** longCommandWArg() : + * check if *stringPtr is the same as longCommand. + * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. + * @return 0 and doesn't modify *stringPtr otherwise. + */ +unsigned longCommandWArg(const char** stringPtr, const char* longCommand); diff --git a/contrib/randomDictBuilder/main.c b/contrib/experimental_dict_builders/randomDictBuilder/main.c similarity index 79% rename from contrib/randomDictBuilder/main.c rename to contrib/experimental_dict_builders/randomDictBuilder/main.c index 4751a9e1c..3f3a6ca70 100644 --- a/contrib/randomDictBuilder/main.c +++ b/contrib/experimental_dict_builders/randomDictBuilder/main.c @@ -52,46 +52,6 @@ static const unsigned g_defaultMaxDictSize = 110 KB; -/*-************************************* -* Commandline related functions -***************************************/ -static unsigned readU32FromChar(const char** stringPtr){ - const char errorMsg[] = "error: numeric value too large"; - unsigned result = 0; - while ((**stringPtr >='0') && (**stringPtr <='9')) { - unsigned const max = (((unsigned)(-1)) / 10) - 1; - if (result > max) exit(1); - result *= 10, result += **stringPtr - '0', (*stringPtr)++ ; - } - if ((**stringPtr=='K') || (**stringPtr=='M')) { - unsigned const maxK = ((unsigned)(-1)) >> 10; - if (result > maxK) exit(1); - result <<= 10; - if (**stringPtr=='M') { - if (result > maxK) exit(1); - result <<= 10; - } - (*stringPtr)++; /* skip `K` or `M` */ - if (**stringPtr=='i') (*stringPtr)++; - if (**stringPtr=='B') (*stringPtr)++; - } - return result; -} - -/** longCommandWArg() : - * check if *stringPtr is the same as longCommand. - * If yes, @return 1 and advances *stringPtr to the position which immediately follows longCommand. - * @return 0 and doesn't modify *stringPtr otherwise. 
- */ -static unsigned longCommandWArg(const char** stringPtr, const char* longCommand){ - size_t const comSize = strlen(longCommand); - int const result = !strncmp(*stringPtr, longCommand, comSize); - if (result) *stringPtr += comSize; - return result; -} - - - /*-************************************* * RANDOM ***************************************/ diff --git a/contrib/randomDictBuilder/random.c b/contrib/experimental_dict_builders/randomDictBuilder/random.c similarity index 100% rename from contrib/randomDictBuilder/random.c rename to contrib/experimental_dict_builders/randomDictBuilder/random.c diff --git a/contrib/randomDictBuilder/random.h b/contrib/experimental_dict_builders/randomDictBuilder/random.h similarity index 100% rename from contrib/randomDictBuilder/random.h rename to contrib/experimental_dict_builders/randomDictBuilder/random.h diff --git a/contrib/randomDictBuilder/test.sh b/contrib/experimental_dict_builders/randomDictBuilder/test.sh similarity index 52% rename from contrib/randomDictBuilder/test.sh rename to contrib/experimental_dict_builders/randomDictBuilder/test.sh index 497820f88..1eb732e52 100644 --- a/contrib/randomDictBuilder/test.sh +++ b/contrib/experimental_dict_builders/randomDictBuilder/test.sh @@ -1,12 +1,12 @@ echo "Building random dictionary with in=../../lib/common k=200 out=dict1" -./main in=../../lib/common k=200 out=dict1 -zstd -be3 -D dict1 -r ../../lib/common -q +./main in=../../../lib/common k=200 out=dict1 +zstd -be3 -D dict1 -r ../../../lib/common -q echo "Building random dictionary with in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000" -./main in=../../lib/common k=500 out=dict2 dictID=100 maxdict=140000 -zstd -be3 -D dict2 -r ../../lib/common -q +./main in=../../../lib/common k=500 out=dict2 dictID=100 maxdict=140000 +zstd -be3 -D dict2 -r ../../../lib/common -q echo "Building random dictionary with 2 sample sources" -./main in=../../lib/common in=../../lib/compress out=dict3 -zstd -be3 -D dict3 -r 
../../lib/common -q +./main in=../../../lib/common in=../../../lib/compress out=dict3 +zstd -be3 -D dict3 -r ../../../lib/common -q echo "Removing dict1 dict2 dict3" rm -f dict1 dict2 dict3