mirror of
https://github.com/facebook/zstd.git
synced 2025-11-09 00:11:14 -05:00
added buffer interface to dictBuilder
This commit is contained in:
parent
35f7de52c8
commit
7682e49d0a
@ -184,7 +184,7 @@ int main(int argCount, const char** argv)
|
|||||||
if (!strcmp(argument, "--verbose")) { g_displayLevel++; if (g_displayLevel<3) g_displayLevel=3; continue; }
|
if (!strcmp(argument, "--verbose")) { g_displayLevel++; if (g_displayLevel<3) g_displayLevel=3; continue; }
|
||||||
if (!strcmp(argument, "--quiet")) { g_displayLevel--; continue; }
|
if (!strcmp(argument, "--quiet")) { g_displayLevel--; continue; }
|
||||||
if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; }
|
if (!strcmp(argument, "--maxdict")) { nextArgumentIsMaxDict=1; continue; }
|
||||||
if (!strcmp(argument, "--fast")) { selectionLevel=0; cLevel=1; continue; }
|
if (!strcmp(argument, "--fast")) { selectionLevel=1; cLevel=1; continue; }
|
||||||
|
|
||||||
/* Decode commands (note : aggregated commands are allowed) */
|
/* Decode commands (note : aggregated commands are allowed) */
|
||||||
if (argument[0]=='-') {
|
if (argument[0]=='-') {
|
||||||
@ -247,8 +247,15 @@ int main(int argCount, const char** argv)
|
|||||||
}
|
}
|
||||||
|
|
||||||
/* building ... */
|
/* building ... */
|
||||||
|
{
|
||||||
|
DiB_params_t param;
|
||||||
|
param.selectivityLevel = selectionLevel;
|
||||||
|
param.compressionLevel = cLevel;
|
||||||
DiB_setNotificationLevel(g_displayLevel);
|
DiB_setNotificationLevel(g_displayLevel);
|
||||||
operationResult = DiB_trainDictionary(dictFileName, maxDictSize, selectionLevel, cLevel, filenameTable, filenameIdx);
|
operationResult = DiB_trainFromFiles(dictFileName, maxDictSize,
|
||||||
|
filenameTable, filenameIdx,
|
||||||
|
param);
|
||||||
|
}
|
||||||
|
|
||||||
if (main_pause) waitEnter();
|
if (main_pause) waitEnter();
|
||||||
free((void*)filenameTable);
|
free((void*)filenameTable);
|
||||||
|
|||||||
@ -58,17 +58,13 @@
|
|||||||
#include "huff0_static.h"
|
#include "huff0_static.h"
|
||||||
|
|
||||||
|
|
||||||
/* *************************************
|
/*-*************************************
|
||||||
* Compiler specifics
|
* Compiler specifics
|
||||||
***************************************/
|
***************************************/
|
||||||
#if !defined(S_ISREG)
|
#if !defined(S_ISREG)
|
||||||
# define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
|
# define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef _MSC_VER
|
|
||||||
#define snprintf sprintf_s
|
|
||||||
#endif
|
|
||||||
|
|
||||||
|
|
||||||
/*-*************************************
|
/*-*************************************
|
||||||
* Constants
|
* Constants
|
||||||
@ -87,6 +83,9 @@ static const size_t maxMemory = (sizeof(size_t) == 4) ? (2 GB - 64 MB) : ((size_
|
|||||||
|
|
||||||
#define MINRATIO 4
|
#define MINRATIO 4
|
||||||
static const U32 g_compressionLevel_default = 5;
|
static const U32 g_compressionLevel_default = 5;
|
||||||
|
static const U32 g_selectivity_default = 9;
|
||||||
|
static const size_t g_provision_entropySize = 200;
|
||||||
|
static const size_t g_min_fast_dictContent = 192;
|
||||||
|
|
||||||
|
|
||||||
/*-*************************************
|
/*-*************************************
|
||||||
@ -146,6 +145,10 @@ static unsigned DiB_GetMilliSpan(clock_t nPrevious)
|
|||||||
return nSpan;
|
return nSpan;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
unsigned DiB_isError(size_t errorCode) { return ERR_isError(errorCode); }
|
||||||
|
|
||||||
|
const char* DiB_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
|
||||||
|
|
||||||
|
|
||||||
/* ********************************************************
|
/* ********************************************************
|
||||||
* File related operations
|
* File related operations
|
||||||
@ -563,9 +566,8 @@ static U32 DiB_dictSize(const dictItem* dictList)
|
|||||||
|
|
||||||
static void DiB_trainBuffer(dictItem* dictList, U32 dictListSize,
|
static void DiB_trainBuffer(dictItem* dictList, U32 dictListSize,
|
||||||
const void* const buffer, const size_t bufferSize, /* buffer must end with noisy guard band */
|
const void* const buffer, const size_t bufferSize, /* buffer must end with noisy guard band */
|
||||||
const char* displayName,
|
const size_t* fileSizes, unsigned nbFiles,
|
||||||
const size_t* fileSizes, unsigned nbFiles, unsigned maxDictSize,
|
U32 shiftRatio, unsigned maxDictSize)
|
||||||
U32 shiftRatio)
|
|
||||||
{
|
{
|
||||||
saidx_t* const suffix0 = (saidx_t*)malloc((bufferSize+2)*sizeof(*suffix0));
|
saidx_t* const suffix0 = (saidx_t*)malloc((bufferSize+2)*sizeof(*suffix0));
|
||||||
saidx_t* const suffix = suffix0+1;
|
saidx_t* const suffix = suffix0+1;
|
||||||
@ -583,7 +585,7 @@ static void DiB_trainBuffer(dictItem* dictList, U32 dictListSize,
|
|||||||
memset(doneMarks, 0, bufferSize+16);
|
memset(doneMarks, 0, bufferSize+16);
|
||||||
|
|
||||||
/* sort */
|
/* sort */
|
||||||
DISPLAYLEVEL(2, "sorting %s ...\n", displayName);
|
DISPLAYLEVEL(2, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>20));
|
||||||
errorCode = divsufsort((const sauchar_t*)buffer, suffix, (saidx_t)bufferSize);
|
errorCode = divsufsort((const sauchar_t*)buffer, suffix, (saidx_t)bufferSize);
|
||||||
if (errorCode != 0) EXM_THROW(2, "sort failed");
|
if (errorCode != 0) EXM_THROW(2, "sort failed");
|
||||||
suffix[bufferSize] = (saidx_t)bufferSize; /* leads into noise */
|
suffix[bufferSize] = (saidx_t)bufferSize; /* leads into noise */
|
||||||
@ -699,7 +701,7 @@ static void DiB_countEStats(EStats_ress_t esr,
|
|||||||
#define OFFCODE_MAX 18
|
#define OFFCODE_MAX 18
|
||||||
static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
||||||
unsigned compressionLevel,
|
unsigned compressionLevel,
|
||||||
const void* srcBuffer, size_t* fileSizes, unsigned nbFiles,
|
const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
|
||||||
const void* dictBuffer, size_t dictBufferSize)
|
const void* dictBuffer, size_t dictBufferSize)
|
||||||
{
|
{
|
||||||
U32 countLit[256];
|
U32 countLit[256];
|
||||||
@ -793,8 +795,7 @@ static size_t DiB_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
|
|||||||
|
|
||||||
|
|
||||||
static void DiB_saveDict(const char* dictFileName,
|
static void DiB_saveDict(const char* dictFileName,
|
||||||
const void* buff1, size_t buff1Size,
|
const void* buff, size_t buffSize)
|
||||||
const void* buff2, size_t buff2Size)
|
|
||||||
{
|
{
|
||||||
FILE* f;
|
FILE* f;
|
||||||
size_t n;
|
size_t n;
|
||||||
@ -802,11 +803,8 @@ static void DiB_saveDict(const char* dictFileName,
|
|||||||
f = fopen(dictFileName, "wb");
|
f = fopen(dictFileName, "wb");
|
||||||
if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
|
if (f==NULL) EXM_THROW(3, "cannot open %s ", dictFileName);
|
||||||
|
|
||||||
n = fwrite(buff1, 1, buff1Size, f);
|
n = fwrite(buff, 1, buffSize, f);
|
||||||
if (n!=buff1Size) EXM_THROW(4, "%s : write error", dictFileName)
|
if (n!=buffSize) EXM_THROW(4, "%s : write error", dictFileName)
|
||||||
|
|
||||||
n = fwrite(buff2, 1, buff2Size, f);
|
|
||||||
if (n!=buff2Size) EXM_THROW(4, "%s : write error", dictFileName)
|
|
||||||
|
|
||||||
n = (size_t)fclose(f);
|
n = (size_t)fclose(f);
|
||||||
if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName)
|
if (n!=0) EXM_THROW(5, "%s : flush error", dictFileName)
|
||||||
@ -853,46 +851,35 @@ static size_t DiB_fastSampling(void* dictBuffer, size_t dictSize,
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize,
|
static size_t DiB_trainFromBuffer_internal(
|
||||||
unsigned shiftRatio, unsigned compressionLevel,
|
void* dictBuffer, size_t maxDictSize,
|
||||||
const char** fileNamesTable, unsigned nbFiles)
|
const void* samplesBuffer, const size_t* sampleSizes, unsigned nbSamples,
|
||||||
|
DiB_params_t params)
|
||||||
{
|
{
|
||||||
void* srcBuffer;
|
const U32 dictListSize = MAX( MAX(DICTLISTSIZE, nbSamples), maxDictSize/16);
|
||||||
size_t benchedSize;
|
|
||||||
size_t* fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
|
|
||||||
unsigned long long totalSizeToLoad = DiB_getTotalFileSize(fileNamesTable, nbFiles);
|
|
||||||
const U32 dictListSize = MAX( MAX(DICTLISTSIZE, nbFiles), maxDictSize/16);
|
|
||||||
dictItem* dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
|
dictItem* dictList = (dictItem*)malloc(dictListSize * sizeof(*dictList));
|
||||||
char mfName[20] = {0};
|
unsigned selectivity = params.selectivityLevel;
|
||||||
const char* displayName = NULL;
|
unsigned compressionLevel = params.compressionLevel;
|
||||||
|
size_t targetDictSize = maxDictSize - g_provision_entropySize;
|
||||||
|
size_t sBuffSize;
|
||||||
|
size_t dictSize = 0;
|
||||||
|
|
||||||
|
/* checks */
|
||||||
|
if (maxDictSize <= g_provision_entropySize + g_min_fast_dictContent) return ERROR(dstSize_tooSmall);
|
||||||
|
|
||||||
/* init */
|
/* init */
|
||||||
benchedSize = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT;
|
{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += sampleSizes[u]; }
|
||||||
if ((unsigned long long)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
|
if (!dictList) { DISPLAYLEVEL(1, "not enough memory for DiB_trainFromBuffer"); return ERROR(memory_allocation); }
|
||||||
if (benchedSize < totalSizeToLoad)
|
|
||||||
DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
|
|
||||||
|
|
||||||
/* Memory allocation & restrictions */
|
|
||||||
srcBuffer = malloc(benchedSize+NOISELENGTH); /* + noise */
|
|
||||||
if ((!fileSizes) || (!srcBuffer) || (!dictList)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
|
|
||||||
DiB_initDictItem(dictList);
|
DiB_initDictItem(dictList);
|
||||||
|
if (selectivity==0) selectivity = g_selectivity_default;
|
||||||
|
if (compressionLevel==0) compressionLevel = g_compressionLevel_default;
|
||||||
|
|
||||||
/* Load input buffer */
|
/* select stripes */
|
||||||
DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
|
if (selectivity>1) {
|
||||||
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
|
||||||
|
|
||||||
/* analyze sequences (non-fast mode) */
|
|
||||||
if (shiftRatio>0)
|
|
||||||
{
|
|
||||||
snprintf (mfName, sizeof(mfName), " %u files", nbFiles);
|
|
||||||
if (nbFiles > 1) displayName = mfName;
|
|
||||||
else displayName = fileNamesTable[0];
|
|
||||||
|
|
||||||
DiB_trainBuffer(dictList, dictListSize,
|
DiB_trainBuffer(dictList, dictListSize,
|
||||||
srcBuffer, benchedSize,
|
samplesBuffer, sBuffSize,
|
||||||
displayName,
|
sampleSizes, nbSamples,
|
||||||
fileSizes, nbFiles, maxDictSize,
|
selectivity, targetDictSize);
|
||||||
shiftRatio);
|
|
||||||
|
|
||||||
/* display best matches */
|
/* display best matches */
|
||||||
if (g_displayLevel>= 3) {
|
if (g_displayLevel>= 3) {
|
||||||
@ -907,72 +894,127 @@ int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize,
|
|||||||
U32 d = MIN(40, l);
|
U32 d = MIN(40, l);
|
||||||
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
|
DISPLAYLEVEL(3, "%3u:%3u bytes at pos %8u, savings %7u bytes |",
|
||||||
u, l, p, dictList[u].savings);
|
u, l, p, dictList[u].savings);
|
||||||
DiB_printHex(3, (char*)srcBuffer+p, d);
|
DiB_printHex(3, (const char*)samplesBuffer+p, d);
|
||||||
DISPLAYLEVEL(3, "| \n");
|
DISPLAYLEVEL(3, "| \n");
|
||||||
} } }
|
} } }
|
||||||
|
|
||||||
/* create dictionary */
|
/* create dictionary */
|
||||||
{
|
{
|
||||||
void* dictContent;
|
|
||||||
U32 dictContentSize = DiB_dictSize(dictList);
|
U32 dictContentSize = DiB_dictSize(dictList);
|
||||||
void* dictHeader;
|
size_t hSize;
|
||||||
size_t dictHeaderSize, hSize, addedContentLength;
|
|
||||||
BYTE* ptr;
|
BYTE* ptr;
|
||||||
U32 u;
|
U32 u;
|
||||||
|
|
||||||
/* build dict */
|
|
||||||
#define EBSIZE (2 KB)
|
|
||||||
dictHeaderSize = EBSIZE;
|
|
||||||
dictHeader = malloc(dictHeaderSize);
|
|
||||||
dictContent = malloc(maxDictSize);
|
|
||||||
if (!dictHeader || !dictContent) EXM_THROW(2, "not enough memory");
|
|
||||||
|
|
||||||
/* build dict content */
|
/* build dict content */
|
||||||
ptr = (BYTE*)dictContent + maxDictSize;
|
ptr = (BYTE*)dictBuffer + maxDictSize;
|
||||||
for (u=1; u<dictList->pos; u++) {
|
for (u=1; u<dictList->pos; u++) {
|
||||||
U32 l = dictList[u].length;
|
U32 l = dictList[u].length;
|
||||||
ptr -= l;
|
ptr -= l;
|
||||||
memcpy(ptr, (char*)srcBuffer+dictList[u].pos, l);
|
if (ptr<(BYTE*)dictBuffer) return ERROR(GENERIC); /* should not happen */
|
||||||
|
memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* fast mode dict content */
|
/* fast mode dict content */
|
||||||
if (shiftRatio==0) { /* note could also be used to complete a dictionary, but not necessarily better */
|
if (selectivity==1) { /* note could also be used to complete a dictionary, but not necessarily better */
|
||||||
addedContentLength = ptr-(BYTE*)dictContent;
|
DISPLAYLEVEL(3, "\r%70s\r", ""); /* clean display line */
|
||||||
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
DISPLAYLEVEL(3, "Adding %u KB with fast sampling \n", (U32)(targetDictSize>>10));
|
||||||
DISPLAYLEVEL(2, "Adding %u KB from fast sampling \n", (U32)(addedContentLength>>10));
|
dictContentSize = DiB_fastSampling((char*)dictBuffer + g_provision_entropySize,
|
||||||
addedContentLength = DiB_fastSampling(dictContent, addedContentLength, srcBuffer, benchedSize);
|
targetDictSize, samplesBuffer, sBuffSize);
|
||||||
if (!ERR_isError(addedContentLength))
|
|
||||||
ptr -= addedContentLength, dictContentSize += addedContentLength;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* dictionary header */
|
/* dictionary header */
|
||||||
MEM_writeLE32(dictHeader, ZSTD_DICT_MAGIC);
|
MEM_writeLE32(dictBuffer, ZSTD_DICT_MAGIC);
|
||||||
hSize = 4;
|
hSize = 4;
|
||||||
dictHeaderSize -= 4;
|
|
||||||
|
|
||||||
/* entropic tables */
|
/* entropic tables */
|
||||||
|
DISPLAYLEVEL(2, "\r%70s\r", ""); /* clean display line */
|
||||||
DISPLAYLEVEL(2, "statistics ... \n");
|
DISPLAYLEVEL(2, "statistics ... \n");
|
||||||
hSize += DiB_analyzeEntropy((char*)dictHeader+4, dictHeaderSize,
|
hSize += DiB_analyzeEntropy((char*)dictBuffer+4, maxDictSize-4,
|
||||||
compressionLevel,
|
compressionLevel,
|
||||||
srcBuffer, fileSizes, nbFiles,
|
samplesBuffer, sampleSizes, nbSamples,
|
||||||
ptr, dictContentSize);
|
(char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize);
|
||||||
|
|
||||||
/* save dict */
|
if (hSize + dictContentSize < maxDictSize)
|
||||||
{
|
memmove((char*)dictBuffer + hSize, (char*)dictBuffer + maxDictSize - dictContentSize, dictContentSize);
|
||||||
size_t dictSize = hSize + dictContentSize;
|
dictSize = MIN(maxDictSize, hSize+dictContentSize);
|
||||||
DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
|
|
||||||
DiB_saveDict(dictFileName, dictHeader, hSize, ptr, dictContentSize);
|
|
||||||
//DiB_saveDict(dictFileName, NULL, 0, dictContent, dictContentSize); // content only
|
|
||||||
}
|
|
||||||
/* clean */
|
|
||||||
free(dictHeader);
|
|
||||||
free(dictContent);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/* clean up */
|
/* clean up */
|
||||||
free(srcBuffer);
|
|
||||||
free(fileSizes);
|
|
||||||
free(dictList);
|
free(dictList);
|
||||||
return 0;
|
return dictSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Note : the internal trainer needs samplesBuffer to be followed by a noisy guard band.
|
||||||
|
* Workaround : duplicate the buffer and append the noise to the copy. */
|
||||||
|
size_t DiB_trainFromBuffer(void* dictBuffer, size_t maxDictSize,
|
||||||
|
const void* samplesBuffer, const size_t* sampleSizes, unsigned nbSamples,
|
||||||
|
DiB_params_t params)
|
||||||
|
{
|
||||||
|
size_t sBuffSize;
|
||||||
|
void* newBuff;
|
||||||
|
size_t result;
|
||||||
|
|
||||||
|
{ unsigned u; for (u=0, sBuffSize=0; u<nbSamples; u++) sBuffSize += sampleSizes[u]; }
|
||||||
|
newBuff = malloc(sBuffSize + NOISELENGTH);
|
||||||
|
if (!newBuff) return ERROR(memory_allocation);
|
||||||
|
|
||||||
|
memcpy(newBuff, samplesBuffer, sBuffSize);
|
||||||
|
DiB_fillNoise((char*)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
||||||
|
|
||||||
|
result = DiB_trainFromBuffer_internal(dictBuffer, maxDictSize,
|
||||||
|
newBuff, sampleSizes, nbSamples,
|
||||||
|
params);
|
||||||
|
free(newBuff);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
|
||||||
|
const char** fileNamesTable, unsigned nbFiles,
|
||||||
|
DiB_params_t params)
|
||||||
|
{
|
||||||
|
void* srcBuffer;
|
||||||
|
size_t benchedSize;
|
||||||
|
size_t* fileSizes = (size_t*)malloc(nbFiles * sizeof(size_t));
|
||||||
|
unsigned long long totalSizeToLoad = DiB_getTotalFileSize(fileNamesTable, nbFiles);
|
||||||
|
void* dictBuffer = malloc(maxDictSize);
|
||||||
|
size_t dictSize;
|
||||||
|
int result = 0;
|
||||||
|
|
||||||
|
/* init */
|
||||||
|
benchedSize = DiB_findMaxMem(totalSizeToLoad * MEMMULT) / MEMMULT;
|
||||||
|
if ((unsigned long long)benchedSize > totalSizeToLoad) benchedSize = (size_t)totalSizeToLoad;
|
||||||
|
if (benchedSize < totalSizeToLoad)
|
||||||
|
DISPLAYLEVEL(1, "Not enough memory; training on %u MB only...\n", (unsigned)(benchedSize >> 20));
|
||||||
|
|
||||||
|
/* Memory allocation & restrictions */
|
||||||
|
srcBuffer = malloc(benchedSize+NOISELENGTH); /* + noise */
|
||||||
|
if ((!fileSizes) || (!srcBuffer) || (!dictBuffer)) EXM_THROW(12, "not enough memory for DiB_trainFiles"); /* should not happen */
|
||||||
|
|
||||||
|
/* Load input buffer */
|
||||||
|
DiB_loadFiles(srcBuffer, benchedSize, fileSizes, fileNamesTable, nbFiles);
|
||||||
|
DiB_fillNoise((char*)srcBuffer + benchedSize, NOISELENGTH); /* guard band, for end of buffer condition */
|
||||||
|
|
||||||
|
/* call buffer version */
|
||||||
|
dictSize = DiB_trainFromBuffer_internal(dictBuffer, maxDictSize,
|
||||||
|
srcBuffer, fileSizes, nbFiles,
|
||||||
|
params);
|
||||||
|
if (DiB_isError(dictSize))
|
||||||
|
{
|
||||||
|
DISPLAYLEVEL(1, "dictionary training failed : %s", DiB_getErrorName(dictSize)); /* should not happen */
|
||||||
|
result = 1;
|
||||||
|
goto _cleanup;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* save dict */
|
||||||
|
DISPLAYLEVEL(2, "Save dictionary of size %u into file %s \n", (U32)dictSize, dictFileName);
|
||||||
|
DiB_saveDict(dictFileName, dictBuffer, dictSize);
|
||||||
|
|
||||||
|
/* clean up */
|
||||||
|
_cleanup:
|
||||||
|
free(srcBuffer);
|
||||||
|
free(dictBuffer);
|
||||||
|
free(fileSizes);
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|||||||
@ -26,6 +26,9 @@
|
|||||||
/* This library is designed for a single-threaded console application.
|
/* This library is designed for a single-threaded console application.
|
||||||
* It calls exit() and prints to stderr when it encounters an error condition. */
|
* It calls exit() and prints to stderr when it encounters an error condition. */
|
||||||
|
|
||||||
|
#ifndef DICTBUILDER_H_001
|
||||||
|
#define DICTBUILDER_H_001
|
||||||
|
|
||||||
/*-*************************************
|
/*-*************************************
|
||||||
* Version
|
* Version
|
||||||
***************************************/
|
***************************************/
|
||||||
@ -36,24 +39,56 @@
|
|||||||
unsigned DiB_versionNumber (void);
|
unsigned DiB_versionNumber (void);
|
||||||
|
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Public type
|
||||||
|
***************************************/
|
||||||
|
typedef struct {
|
||||||
|
unsigned selectivityLevel; /* 0 means default; larger => bigger selection => larger dictionary */
|
||||||
|
unsigned compressionLevel; /* 0 means default; target a specific zstd compression level */
|
||||||
|
} DiB_params_t;
|
||||||
|
|
||||||
|
|
||||||
/*-*************************************
|
/*-*************************************
|
||||||
* Public functions
|
* Public functions
|
||||||
***************************************/
|
***************************************/
|
||||||
/*! DiB_trainDictionary
|
/*! DiB_trainFromBuffer
|
||||||
Train a dictionary from a set of files provided by @fileNamesTable
|
Train a dictionary from a memory buffer @samplesBuffer
|
||||||
Resulting dictionary is written in file @dictFileName.
|
where @nbSamples samples have been stored concatenated.
|
||||||
@selectivityLevel change criteria for insertion into the dictionary (more => bigger selection => larger dictionary)
|
The size of each sample is provided, in order, in the table @sampleSizes.
|
||||||
@compressionLevel can be used to target a specific compression level of zstd. 0 means "default".
|
Resulting dictionary will be saved into @dictBuffer.
|
||||||
@result : 0 == ok
|
@parameters is optional and can be provided with 0 values to mean "default".
|
||||||
|
@result : size of dictionary stored into @dictBuffer (<= @dictBufferSize)
|
||||||
|
or an error code, which can be tested by DiB_isError().
|
||||||
|
note : DiB_trainFromBuffer() will send notifications into stderr if instructed to, using DiB_setNotificationLevel()
|
||||||
*/
|
*/
|
||||||
int DiB_trainDictionary(const char* dictFileName, unsigned maxDictSize,
|
size_t DiB_trainFromBuffer(void* dictBuffer, size_t dictBufferSize,
|
||||||
unsigned selectivityLevel, unsigned compressionLevel,
|
const void* samplesBuffer, const size_t* sampleSizes, unsigned nbSamples,
|
||||||
const char** fileNamesTable, unsigned nbFiles);
|
DiB_params_t parameters);
|
||||||
|
|
||||||
|
|
||||||
|
/*! DiB_trainFromFiles
|
||||||
|
Train a dictionary from a set of files provided by @fileNamesTable
|
||||||
|
Resulting dictionary is written into file @dictFileName.
|
||||||
|
@parameters is optional and can be provided with 0 values.
|
||||||
|
@result : 0 == ok. Any other : error.
|
||||||
|
*/
|
||||||
|
int DiB_trainFromFiles(const char* dictFileName, unsigned maxDictSize,
|
||||||
|
const char** fileNamesTable, unsigned nbFiles,
|
||||||
|
DiB_params_t parameters);
|
||||||
|
|
||||||
|
|
||||||
|
/*-*************************************
|
||||||
|
* Helper functions
|
||||||
|
***************************************/
|
||||||
|
unsigned DiB_isError(size_t errorCode);
|
||||||
|
const char* DiB_getErrorName(size_t errorCode);
|
||||||
|
|
||||||
/*! DiB_setNotificationLevel
|
/*! DiB_setNotificationLevel
|
||||||
Set amount of notification to be displayed on the console.
|
Set amount of notification to be displayed on the console.
|
||||||
0 = no console notification (default).
|
default initial value : 0 = no console notification.
|
||||||
Note : not thread-safe (use a global constant)
|
Note : not thread-safe (use a global constant)
|
||||||
*/
|
*/
|
||||||
void DiB_setNotificationLevel(unsigned l);
|
void DiB_setNotificationLevel(unsigned l);
|
||||||
|
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user