[zdict] Stabilize ZDICT_finalizeDictionary()

This commit is contained in:
Nick Terrell 2020-05-06 18:51:14 -07:00 committed by Nick Terrell
parent 625924774e
commit 45c66dd298

View File

@ -61,6 +61,53 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCap
const void* samplesBuffer,
const size_t* samplesSizes, unsigned nbSamples);
typedef struct {
int compressionLevel; /*< optimize for a specific zstd compression level; 0 means default */
unsigned notificationLevel; /*< Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
unsigned dictID; /*< force dictID value; 0 means auto mode (32-bits random value) */
} ZDICT_params_t;
/*! ZDICT_finalizeDictionary():
* Given a custom content as a basis for dictionary, and a set of samples,
* finalize dictionary by adding headers and statistics according to the zstd
* dictionary format.
*
* Samples must be stored concatenated in a flat buffer `samplesBuffer`,
* supplied with an array of sizes `samplesSizes`, providing the size of each
* sample in order. The samples are used to construct the statistics, so they
* should be representative of what you will compress with this dictionary.
*
* The compression level can be set in `parameters`. You should pass the
* compression level you expect to use in production. The statistics for each
* compression level differ, so tuning the dictionary for the compression level
* can help quite a bit.
*
* You can set an explicit dictionary ID in `parameters`, or allow us to pick
* a random dictionary ID for you, but we can't guarantee no collisions.
*
* The dstDictBuffer and the dictContent may overlap, and the content will be
* appended to the end of the header. If the header + the content doesn't fit in
* maxDictSize the beginning of the content is truncated to make room, since it
* is presumed that the most profitable content is at the end of the dictionary,
* since that is the cheapest to reference.
*
* `dictContentSize` must be >= ZDICT_CONTENTSIZE_MIN bytes.
* `maxDictSize` must be >= max(dictContentSize, ZSTD_DICTSIZE_MIN).
*
* @return: size of dictionary stored into `dstDictBuffer` (<= `maxDictSize`),
* or an error code, which can be tested by ZDICT_isError().
* Note: ZDICT_finalizeDictionary() will push notifications into stderr if
* instructed to, using notificationLevel>0.
* NOTE: This function currently may fail in several edge cases including:
* * Not enough samples
* * Samples are uncompressible
* * Samples are all exactly the same
*/
ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dstDictBuffer, size_t maxDictSize,
const void* dictContent, size_t dictContentSize,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_params_t parameters);
/*====== Helper functions ======*/
ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize); /**< extracts dictID; @return zero if error (not a valid dictionary) */
@ -79,11 +126,8 @@ ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode);
* Use them only in association with static linking.
* ==================================================================================== */
typedef struct {
int compressionLevel; /* optimize for a specific zstd compression level; 0 means default */
unsigned notificationLevel; /* Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */
unsigned dictID; /* force dictID value; 0 means auto mode (32-bits random value) */
} ZDICT_params_t;
#define ZDICT_CONTENTSIZE_MIN 128
#define ZDICT_DICTSIZE_MIN 256
/*! ZDICT_cover_params_t:
* k and d are the only required parameters.
@ -199,28 +243,6 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_fastCover(void* dictBuffer,
const size_t* samplesSizes, unsigned nbSamples,
ZDICT_fastCover_params_t* parameters);
/*! ZDICT_finalizeDictionary():
* Given a custom content as a basis for dictionary, and a set of samples,
* finalize dictionary by adding headers and statistics.
*
* Samples must be stored concatenated in a flat buffer `samplesBuffer`,
* supplied with an array of sizes `samplesSizes`, providing the size of each sample in order.
*
* dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes.
* maxDictSize must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes.
*
* @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`),
* or an error code, which can be tested by ZDICT_isError().
* Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0.
* Note 2: dictBuffer and dictContent can overlap
*/
#define ZDICT_CONTENTSIZE_MIN 128
#define ZDICT_DICTSIZE_MIN 256
ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
const void* dictContent, size_t dictContentSize,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
ZDICT_params_t parameters);
typedef struct {
unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */
ZDICT_params_t zParams;