mirror of
https://github.com/facebook/zstd.git
synced 2025-12-09 00:03:18 -05:00
Merge pull request #1556 from terrelln/dictbuilder
[cover] Improvements for small or homogeneous data
This commit is contained in:
commit
dcc6c7e9ae
@ -627,6 +627,38 @@ static int COVER_ctx_init(COVER_ctx_t *ctx, const void *samplesBuffer,
|
||||
return 1;
|
||||
}
|
||||
|
||||
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers)
|
||||
{
|
||||
const double ratio = (double)nbDmers / maxDictSize;
|
||||
if (ratio >= 10) {
|
||||
return;
|
||||
}
|
||||
DISPLAYLEVEL(1, "WARNING: The maximum dictionary size %u is too large "
|
||||
"compared to the source size %u! "
|
||||
"size(source)/size(dictionary) = %f, but it should be >= "
|
||||
"10! This may lead to a subpar dictionary! We recommend "
|
||||
"training on sources at least 10x, and up to 100x the "
|
||||
"size of the dictionary!\n", (U32)maxDictSize,
|
||||
(U32)nbDmers, ratio);
|
||||
}
|
||||
|
||||
COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize,
|
||||
U32 nbDmers, U32 k, U32 passes)
|
||||
{
|
||||
const U32 minEpochSize = k * 10;
|
||||
COVER_epoch_info_t epochs;
|
||||
epochs.num = MAX(1, maxDictSize / k / passes);
|
||||
epochs.size = nbDmers / epochs.num;
|
||||
if (epochs.size >= minEpochSize) {
|
||||
assert(epochs.size * epochs.num <= nbDmers);
|
||||
return epochs;
|
||||
}
|
||||
epochs.size = MIN(minEpochSize, nbDmers);
|
||||
epochs.num = nbDmers / epochs.size;
|
||||
assert(epochs.size * epochs.num <= nbDmers);
|
||||
return epochs;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given the prepared context build the dictionary.
|
||||
*/
|
||||
@ -636,28 +668,34 @@ static size_t COVER_buildDictionary(const COVER_ctx_t *ctx, U32 *freqs,
|
||||
ZDICT_cover_params_t parameters) {
|
||||
BYTE *const dict = (BYTE *)dictBuffer;
|
||||
size_t tail = dictBufferCapacity;
|
||||
/* Divide the data up into epochs of equal size.
|
||||
* We will select at least one segment from each epoch.
|
||||
*/
|
||||
const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k / 4));
|
||||
const unsigned epochSize = (U32)(ctx->suffixSize / epochs);
|
||||
/* Divide the data into epochs. We will select one segment from each epoch. */
|
||||
const COVER_epoch_info_t epochs = COVER_computeEpochs(
|
||||
(U32)dictBufferCapacity, (U32)ctx->suffixSize, parameters.k, 4);
|
||||
const size_t maxZeroScoreRun = MAX(10, MIN(100, epochs.num >> 3));
|
||||
size_t zeroScoreRun = 0;
|
||||
size_t epoch;
|
||||
DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
|
||||
epochs, epochSize);
|
||||
(U32)epochs.num, (U32)epochs.size);
|
||||
/* Loop through the epochs until there are no more segments or the dictionary
|
||||
* is full.
|
||||
*/
|
||||
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
|
||||
const U32 epochBegin = (U32)(epoch * epochSize);
|
||||
const U32 epochEnd = epochBegin + epochSize;
|
||||
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
|
||||
const U32 epochBegin = (U32)(epoch * epochs.size);
|
||||
const U32 epochEnd = epochBegin + epochs.size;
|
||||
size_t segmentSize;
|
||||
/* Select a segment */
|
||||
COVER_segment_t segment = COVER_selectSegment(
|
||||
ctx, freqs, activeDmers, epochBegin, epochEnd, parameters);
|
||||
/* If the segment covers no dmers, then we are out of content */
|
||||
/* If the segment covers no dmers, then we are out of content.
|
||||
* There may be new content in other epochs, for continue for some time.
|
||||
*/
|
||||
if (segment.score == 0) {
|
||||
break;
|
||||
if (++zeroScoreRun >= maxZeroScoreRun) {
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
zeroScoreRun = 0;
|
||||
/* Trim the segment if necessary and if it is too small then we are done */
|
||||
segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
|
||||
if (segmentSize < parameters.d) {
|
||||
@ -706,6 +744,7 @@ ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover(
|
||||
parameters.d, parameters.splitPoint)) {
|
||||
return ERROR(GENERIC);
|
||||
}
|
||||
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize);
|
||||
if (!COVER_map_init(&activeDmers, parameters.k - parameters.d + 1)) {
|
||||
DISPLAYLEVEL(1, "Failed to allocate dmer map: out of memory\n");
|
||||
COVER_ctx_destroy(&ctx);
|
||||
@ -977,6 +1016,7 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
||||
unsigned k;
|
||||
COVER_best_t best;
|
||||
POOL_ctx *pool = NULL;
|
||||
int warned = 0;
|
||||
|
||||
/* Checks */
|
||||
if (splitPoint <= 0 || splitPoint > 1) {
|
||||
@ -1019,6 +1059,10 @@ ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover(
|
||||
POOL_free(pool);
|
||||
return ERROR(GENERIC);
|
||||
}
|
||||
if (!warned) {
|
||||
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.suffixSize);
|
||||
warned = 1;
|
||||
}
|
||||
/* Loop through k reusing the same context */
|
||||
for (k = kMinK; k <= kMaxK; k += kStepSize) {
|
||||
/* Prepare the arguments */
|
||||
|
||||
@ -38,6 +38,35 @@ typedef struct {
|
||||
U32 score;
|
||||
} COVER_segment_t;
|
||||
|
||||
/**
|
||||
*Number of epochs and size of each epoch.
|
||||
*/
|
||||
typedef struct {
|
||||
U32 num;
|
||||
U32 size;
|
||||
} COVER_epoch_info_t;
|
||||
|
||||
/**
|
||||
* Computes the number of epochs and the size of each epoch.
|
||||
* We will make sure that each epoch gets at least 10 * k bytes.
|
||||
*
|
||||
* The COVER algorithms divide the data up into epochs of equal size and
|
||||
* select one segemnt from each epoch.
|
||||
*
|
||||
* @param maxDictSize The maximum allowed dictioary size.
|
||||
* @param nbDmers The number of dmers we are training on.
|
||||
* @param k The parameter k (segment size).
|
||||
* @param passes The target number of passes over the dmer corpus.
|
||||
* More passes means a better dictionary.
|
||||
*/
|
||||
COVER_epoch_info_t COVER_computeEpochs(U32 maxDictSize, U32 nbDmers,
|
||||
U32 k, U32 passes);
|
||||
|
||||
/**
|
||||
* Warns the user when their corpus is too small.
|
||||
*/
|
||||
void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers);
|
||||
|
||||
/**
|
||||
* Checks total compressed size of a dictionary
|
||||
*/
|
||||
|
||||
@ -386,29 +386,35 @@ FASTCOVER_buildDictionary(const FASTCOVER_ctx_t* ctx,
|
||||
{
|
||||
BYTE *const dict = (BYTE *)dictBuffer;
|
||||
size_t tail = dictBufferCapacity;
|
||||
/* Divide the data up into epochs of equal size.
|
||||
* We will select at least one segment from each epoch.
|
||||
*/
|
||||
const unsigned epochs = MAX(1, (U32)(dictBufferCapacity / parameters.k));
|
||||
const unsigned epochSize = (U32)(ctx->nbDmers / epochs);
|
||||
/* Divide the data into epochs. We will select one segment from each epoch. */
|
||||
const COVER_epoch_info_t epochs = COVER_computeEpochs(
|
||||
(U32)dictBufferCapacity, (U32)ctx->nbDmers, parameters.k, 1);
|
||||
const size_t maxZeroScoreRun = 10;
|
||||
size_t zeroScoreRun = 0;
|
||||
size_t epoch;
|
||||
DISPLAYLEVEL(2, "Breaking content into %u epochs of size %u\n",
|
||||
epochs, epochSize);
|
||||
(U32)epochs.num, (U32)epochs.size);
|
||||
/* Loop through the epochs until there are no more segments or the dictionary
|
||||
* is full.
|
||||
*/
|
||||
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs) {
|
||||
const U32 epochBegin = (U32)(epoch * epochSize);
|
||||
const U32 epochEnd = epochBegin + epochSize;
|
||||
for (epoch = 0; tail > 0; epoch = (epoch + 1) % epochs.num) {
|
||||
const U32 epochBegin = (U32)(epoch * epochs.size);
|
||||
const U32 epochEnd = epochBegin + epochs.size;
|
||||
size_t segmentSize;
|
||||
/* Select a segment */
|
||||
COVER_segment_t segment = FASTCOVER_selectSegment(
|
||||
ctx, freqs, epochBegin, epochEnd, parameters, segmentFreqs);
|
||||
|
||||
/* If the segment covers no dmers, then we are out of content */
|
||||
/* If the segment covers no dmers, then we are out of content.
|
||||
* There may be new content in other epochs, for continue for some time.
|
||||
*/
|
||||
if (segment.score == 0) {
|
||||
break;
|
||||
if (++zeroScoreRun >= maxZeroScoreRun) {
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
zeroScoreRun = 0;
|
||||
|
||||
/* Trim the segment if necessary and if it is too small then we are done */
|
||||
segmentSize = MIN(segment.end - segment.begin + parameters.d - 1, tail);
|
||||
@ -564,6 +570,7 @@ ZDICT_trainFromBuffer_fastCover(void* dictBuffer, size_t dictBufferCapacity,
|
||||
DISPLAYLEVEL(1, "Failed to initialize context\n");
|
||||
return ERROR(GENERIC);
|
||||
}
|
||||
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers);
|
||||
/* Build the dictionary */
|
||||
DISPLAYLEVEL(2, "Building dictionary\n");
|
||||
{
|
||||
@ -616,6 +623,7 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
||||
unsigned k;
|
||||
COVER_best_t best;
|
||||
POOL_ctx *pool = NULL;
|
||||
int warned = 0;
|
||||
/* Checks */
|
||||
if (splitPoint <= 0 || splitPoint > 1) {
|
||||
LOCALDISPLAYLEVEL(displayLevel, 1, "Incorrect splitPoint\n");
|
||||
@ -664,6 +672,10 @@ ZDICT_optimizeTrainFromBuffer_fastCover(
|
||||
POOL_free(pool);
|
||||
return ERROR(GENERIC);
|
||||
}
|
||||
if (!warned) {
|
||||
COVER_warnOnSmallCorpus(dictBufferCapacity, ctx.nbDmers);
|
||||
warned = 1;
|
||||
}
|
||||
/* Loop through k reusing the same context */
|
||||
for (k = kMinK; k <= kMaxK; k += kStepSize) {
|
||||
/* Prepare the arguments */
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user