diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c index b4ae4e877..8308bf5d1 100644 --- a/lib/compress/zstd_compress.c +++ b/lib/compress/zstd_compress.c @@ -392,6 +392,11 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) bounds.upperBound = ZSTD_TARGETCBLOCKSIZE_MAX; return bounds; + case ZSTD_c_srcSizeHint: + bounds.lowerBound = ZSTD_SRCSIZEHINT_MIN; + bounds.upperBound = ZSTD_SRCSIZEHINT_MAX; + return bounds; + default: { ZSTD_bounds const boundError = { ERROR(parameter_unsupported), 0, 0 }; return boundError; @@ -448,6 +453,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) case ZSTD_c_forceAttachDict: case ZSTD_c_literalCompressionMode: case ZSTD_c_targetCBlockSize: + case ZSTD_c_srcSizeHint: default: return 0; } @@ -494,6 +500,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) case ZSTD_c_ldmMinMatch: case ZSTD_c_ldmBucketSizeLog: case ZSTD_c_targetCBlockSize: + case ZSTD_c_srcSizeHint: break; default: RETURN_ERROR(parameter_unsupported); @@ -674,6 +681,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, CCtxParams->targetCBlockSize = value; return CCtxParams->targetCBlockSize; + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; + return CCtxParams->srcSizeHint; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } } @@ -779,6 +792,9 @@ size_t ZSTD_CCtxParams_getParameter( case ZSTD_c_targetCBlockSize : *value = (int)CCtxParams->targetCBlockSize; break; + case ZSTD_c_srcSizeHint : + *value = (int)CCtxParams->srcSizeHint; + break; default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); } return 0; @@ -1029,7 +1045,11 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize) { - ZSTD_compressionParameters cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize); + ZSTD_compressionParameters cParams; + if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) { + srcSizeHint = CCtxParams->srcSizeHint; + } + cParams = ZSTD_getCParams(CCtxParams->compressionLevel, srcSizeHint, dictSize); if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog; if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog; diff --git a/lib/compress/zstd_compress_internal.h b/lib/compress/zstd_compress_internal.h index 6d623cc6b..3e590ec37 100644 --- a/lib/compress/zstd_compress_internal.h +++ b/lib/compress/zstd_compress_internal.h @@ -203,6 +203,9 @@ struct ZSTD_CCtx_params_s { size_t targetCBlockSize; /* Tries to fit compressed block size to be around targetCBlockSize. * No target when targetCBlockSize == 0. * There is no guarantee on compressed block size */ + int srcSizeHint; /* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. + * There is no guarantee that hint is close to actual source size */ ZSTD_dictAttachPref_e attachDictPref; ZSTD_literalCompressionMode_e literalCompressionMode; diff --git a/lib/zstd.h b/lib/zstd.h index f8e95f228..38c99e016 100644 --- a/lib/zstd.h +++ b/lib/zstd.h @@ -15,6 +15,7 @@ extern "C" { #define ZSTD_H_235446 /* ====== Dependency ======*/ +#include /* INT_MAX */ #include /* size_t */ @@ -386,6 +387,7 @@ typedef enum { * ZSTD_c_forceAttachDict * ZSTD_c_literalCompressionMode * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. * note : never ever use experimentalParam? names directly; * also, the enums values themselves are unstable and can still change. @@ -396,6 +398,7 @@ typedef enum { ZSTD_c_experimentalParam4=1001, ZSTD_c_experimentalParam5=1002, ZSTD_c_experimentalParam6=1003, + ZSTD_c_experimentalParam7=1004, } ZSTD_cParameter; typedef struct { @@ -1063,6 +1066,8 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); /* Advanced parameter bounds */ #define ZSTD_TARGETCBLOCKSIZE_MIN 64 #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX +#define ZSTD_SRCSIZEHINT_MIN 0 +#define ZSTD_SRCSIZEHINT_MAX INT_MAX /* internal */ #define ZSTD_HASHLOG3_MAX 17 @@ -1441,6 +1446,12 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre * There is no guarantee on compressed block size (default:0) */ #define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 +/* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. + * There is no guarantee that hint is close to actual source size, + * but compression ratio may regress significantly if guess considerably underestimates */ +#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7 + /*! ZSTD_CCtx_getParameter() : * Get the requested compression parameter value, selected by enum ZSTD_cParameter, * and store it into int* value. diff --git a/programs/fileio.c b/programs/fileio.c index 873013a51..20e2ee2a1 100644 --- a/programs/fileio.c +++ b/programs/fileio.c @@ -30,6 +30,7 @@ #include /* strcmp, strlen */ #include #include /* errno */ +#include /* INT_MAX */ #include #include "timefn.h" /* UTIL_getTime, UTIL_clockSpanMicro */ @@ -306,6 +307,7 @@ struct FIO_prefs_s { int ldmHashRateLog; size_t streamSrcSize; size_t targetCBlockSize; + int srcSizeHint; ZSTD_literalCompressionMode_e literalCompressionMode; /* IO preferences */ @@ -352,6 +354,7 @@ FIO_prefs_t* FIO_createPreferences(void) ret->ldmHashRateLog = FIO_LDM_PARAM_NOTSET; ret->streamSrcSize = 0; ret->targetCBlockSize = 0; + ret->srcSizeHint = 0; ret->literalCompressionMode = ZSTD_lcm_auto; return ret; } @@ -428,6 +431,10 @@ void FIO_setTargetCBlockSize(FIO_prefs_t* const prefs, size_t targetCBlockSize) prefs->targetCBlockSize = targetCBlockSize; } +void FIO_setSrcSizeHint(FIO_prefs_t* const prefs, size_t srcSizeHint) { + prefs->srcSizeHint = (int)MIN((size_t)INT_MAX, srcSizeHint); +} + void FIO_setLiteralCompressionMode( FIO_prefs_t* const prefs, ZSTD_literalCompressionMode_e mode) { @@ -672,6 +679,8 @@ static cRess_t FIO_createCResources(FIO_prefs_t* const prefs, CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_compressionLevel, cLevel) ); /* max compressed block size */ CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_targetCBlockSize, (int)prefs->targetCBlockSize) ); + /* source size hint */ + CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_srcSizeHint, (int)prefs->srcSizeHint) ); /* long distance matching */ CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_enableLongDistanceMatching, prefs->ldmFlag) ); CHECK( ZSTD_CCtx_setParameter(ress.cctx, ZSTD_c_ldmHashLog, prefs->ldmHashLog) ); diff --git a/programs/fileio.h b/programs/fileio.h index 13f6f1d05..096d90b5c 100644 --- a/programs/fileio.h +++ b/programs/fileio.h @@ -73,6 +73,7 @@ void FIO_setSparseWrite(FIO_prefs_t* const prefs, unsigned sparse); /**< 0: no void FIO_setRsyncable(FIO_prefs_t* const prefs, int rsyncable); void FIO_setStreamSrcSize(FIO_prefs_t* const prefs, size_t streamSrcSize); void FIO_setTargetCBlockSize(FIO_prefs_t* const prefs, size_t targetCBlockSize); +void FIO_setSrcSizeHint(FIO_prefs_t* const prefs, size_t srcSizeHint); void FIO_setLiteralCompressionMode( FIO_prefs_t* const prefs, ZSTD_literalCompressionMode_e mode); diff --git a/programs/zstd.1.md b/programs/zstd.1.md index 1bdc42654..dff4d9eac 100644 --- a/programs/zstd.1.md +++ b/programs/zstd.1.md @@ -149,6 +149,13 @@ the last one takes effect. will be included in the produced frame header. Incorrect stream sizes will cause an error. This information will be used to better optimize compression parameters, resulting in better and potentially faster compression, especially for smaller source sizes. +* `--size-hint=#`: + When handling input from a stream, `zstd` must guess how large the source size + will be when optimizing compression parameters. If the stream size is relatively + small, this guess may be a poor one, resulting in a higher compression ratio than + expected. This feature allows for controlling the guess when needed. + Exact guesses result in better compression ratios. Overestimates result in slightly + degraded compression ratios, while underestimates may result in significant degradation. * `--rsyncable` : `zstd` will periodically synchronize the compression state to make the compressed file more rsync-friendly. There is a negligible impact to diff --git a/programs/zstdcli.c b/programs/zstdcli.c index 401e1ee2c..98df728a9 100644 --- a/programs/zstdcli.c +++ b/programs/zstdcli.c @@ -142,6 +142,7 @@ static int usage_advanced(const char* programName) DISPLAY( "--fast[=#]: switch to ultra fast compression level (default: %u)\n", 1); DISPLAY( "--adapt : dynamically adapt compression level to I/O conditions \n"); DISPLAY( "--stream-size=# : optimize compression parameters for streaming input of given number of bytes \n"); + DISPLAY( "--size-hint=# optimize compression parameters for streaming input of approximately this size\n"); DISPLAY( "--target-compressed-block-size=# : make compressed block near targeted size \n"); #ifdef ZSTD_MULTITHREAD DISPLAY( " -T# : spawns # compression threads (default: 1, 0==# cores) \n"); @@ -591,6 +592,7 @@ int main(int argCount, const char* argv[]) unsigned dictID = 0; size_t streamSrcSize = 0; size_t targetCBlockSize = 0; + size_t srcSizeHint = 0; int dictCLevel = g_defaultDictCLevel; unsigned dictSelect = g_defaultSelectivityLevel; #ifdef UTIL_HAS_CREATEFILELIST @@ -749,6 +751,7 @@ int main(int argCount, const char* argv[]) if (longCommandWArg(&argument, "--zstd=")) { if (!parseCompressionParameters(argument, &compressionParams)) CLEAN_RETURN(badusage(programName)); continue; } if (longCommandWArg(&argument, "--stream-size=")) { streamSrcSize = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "--target-compressed-block-size=")) { targetCBlockSize = readU32FromChar(&argument); continue; } + if (longCommandWArg(&argument, "--size-hint=")) { srcSizeHint = readU32FromChar(&argument); continue; } if (longCommandWArg(&argument, "--long")) { unsigned ldmWindowLog = 0; ldmFlag = 1; @@ -1155,6 +1158,7 @@ int main(int argCount, const char* argv[]) FIO_setRsyncable(prefs, rsyncable); FIO_setStreamSrcSize(prefs, streamSrcSize); FIO_setTargetCBlockSize(prefs, targetCBlockSize); + FIO_setSrcSizeHint(prefs, srcSizeHint); FIO_setLiteralCompressionMode(prefs, literalCompressionMode); if (adaptMin > cLevel) cLevel = adaptMin; if (adaptMax < cLevel) cLevel = adaptMax; @@ -1164,7 +1168,7 @@ int main(int argCount, const char* argv[]) else operationResult = FIO_compressMultipleFilenames(prefs, filenameTable, filenameIdx, outFileName, suffix, dictFileName, cLevel, compressionParams); #else - (void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)streamSrcSize; (void)targetCBlockSize; /* not used when ZSTD_NOCOMPRESS set */ + (void)suffix; (void)adapt; (void)rsyncable; (void)ultra; (void)cLevel; (void)ldmFlag; (void)literalCompressionMode; (void)targetCBlockSize; (void)streamSrcSize; (void)srcSizeHint; /* not used when ZSTD_NOCOMPRESS set */ DISPLAY("Compression not supported \n"); #endif } else { /* decompression or test */ diff --git a/tests/fuzz/zstd_helpers.c b/tests/fuzz/zstd_helpers.c index 9dff2895a..5ff057b8c 100644 --- a/tests/fuzz/zstd_helpers.c +++ b/tests/fuzz/zstd_helpers.c @@ -90,6 +90,9 @@ void FUZZ_setRandomParameters(ZSTD_CCtx *cctx, size_t srcSize, uint32_t *state) setRand(cctx, ZSTD_c_forceMaxWindow, 0, 1, state); setRand(cctx, ZSTD_c_literalCompressionMode, 0, 2, state); setRand(cctx, ZSTD_c_forceAttachDict, 0, 2, state); + if (FUZZ_rand32(state, 0, 1) == 0) { + setRand(cctx, ZSTD_c_srcSizeHint, ZSTD_SRCSIZEHINT_MIN, 2 * srcSize, state); + } } FUZZ_dict_t FUZZ_train(void const* src, size_t srcSize, uint32_t *state) diff --git a/tests/playTests.sh b/tests/playTests.sh index b74076763..ad096fddd 100755 --- a/tests/playTests.sh +++ b/tests/playTests.sh @@ -425,6 +425,36 @@ println "test : incorrect stream size" cat tmp | $ZSTD -14 -f -o tmp.zst --stream-size=11001 && die "should fail with incorrect stream size" +println "\n===> size-hint mode" + +./datagen -g11000 > tmp +./datagen -g11000 > tmp2 +./datagen > tmpDict +println "test : basic file compression vs hinted streaming compression" +file_size=$($ZSTD -14 -f tmp -o tmp.zst && wc -c < tmp.zst) +stream_size=$(cat tmp | $ZSTD -14 --size-hint=11000 | wc -c) +if [ "$stream_size" -ge "$file_size" ]; then + die "hinted compression larger than expected" +fi +println "test : hinted streaming compression and decompression" +cat tmp | $ZSTD -14 -f -o tmp.zst --size-hint=11000 +$ZSTD -df tmp.zst -o tmp_decompress +cmp tmp tmp_decompress || die "difference between original and decompressed file" +println "test : hinted streaming compression with dictionary" +cat tmp | $ZSTD -14 -f -D tmpDict --size-hint=11000 | $ZSTD -t -D tmpDict +println "test : multiple file compression with hints and dictionary" +$ZSTD -14 -f -D tmpDict --size-hint=11000 tmp tmp2 +$ZSTD -14 -f -o tmp1_.zst -D tmpDict --size-hint=11000 tmp +$ZSTD -14 -f -o tmp2_.zst -D tmpDict --size-hint=11000 tmp2 +cmp tmp.zst tmp1_.zst || die "first file's output differs" +cmp tmp2.zst tmp2_.zst || die "second file's output differs" +println "test : incorrect hinted stream sizes" +cat tmp | $ZSTD -14 -f --size-hint=11050 | $ZSTD -t # slightly too high +cat tmp | $ZSTD -14 -f --size-hint=10950 | $ZSTD -t # slightly too low +cat tmp | $ZSTD -14 -f --size-hint=22000 | $ZSTD -t # considerably too high +cat tmp | $ZSTD -14 -f --size-hint=5500 | $ZSTD -t # considerably too low + + println "\n===> dictionary tests " println "- test with raw dict (content only) " diff --git a/tests/zstreamtest.c b/tests/zstreamtest.c index 56f16766e..9af08ebe4 100644 --- a/tests/zstreamtest.c +++ b/tests/zstreamtest.c @@ -2106,6 +2106,7 @@ static int fuzzerTests_newAPI(U32 seed, int nbTests, int startTest, if (FUZ_rand(&lseed) & 3) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_ldmMinMatch, FUZ_randomClampedLength(&lseed, ZSTD_LDM_MINMATCH_MIN, ZSTD_LDM_MINMATCH_MAX), opaqueAPI) ); if (FUZ_rand(&lseed) & 3) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_ldmBucketSizeLog, FUZ_randomClampedLength(&lseed, ZSTD_LDM_BUCKETSIZELOG_MIN, ZSTD_LDM_BUCKETSIZELOG_MAX), opaqueAPI) ); if (FUZ_rand(&lseed) & 3) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_ldmHashRateLog, FUZ_randomClampedLength(&lseed, ZSTD_LDM_HASHRATELOG_MIN, ZSTD_LDM_HASHRATELOG_MAX), opaqueAPI) ); + if (FUZ_rand(&lseed) & 3) CHECK_Z( setCCtxParameter(zc, cctxParams, ZSTD_c_srcSizeHint, FUZ_randomClampedLength(&lseed, ZSTD_SRCSIZEHINT_MIN, ZSTD_SRCSIZEHINT_MAX), opaqueAPI) ); } /* mess with frame parameters */