diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index 0d4dea069..3159df756 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,13 +25,17 @@ LDFLAGS += -lzstd default: all -all: main-ldm +all: main-basic main-chaining -main-ldm : basic_table.c ldm.c main-ldm.c +main-basic : basic_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ +main-chaining : chaining_table.c ldm.c main-ldm.c + $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ + + clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main main-ldm + main-basic main-chaining @echo Cleaning completed diff --git a/contrib/long_distance_matching/basic_table.c b/contrib/long_distance_matching/basic_table.c index 007086fee..c6a5040ee 100644 --- a/contrib/long_distance_matching/basic_table.c +++ b/contrib/long_distance_matching/basic_table.c @@ -2,16 +2,19 @@ #include #include "ldm_hashtable.h" +#include "mem.h" struct LDM_hashTable { U32 size; LDM_hashEntry *entries; + const BYTE *offsetBase; }; -LDM_hashTable *HASH_createTable(U32 size) { +LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase) { LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); table->size = size; table->entries = calloc(size, sizeof(LDM_hashEntry)); + table->offsetBase = offsetBase; return table; } @@ -20,15 +23,19 @@ void HASH_initializeTable(LDM_hashTable *table, U32 size) { table->entries = calloc(size, sizeof(LDM_hashEntry)); } +LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { + return table->entries + hash; +} LDM_hashEntry *HASH_getEntryFromHash( - const LDM_hashTable *table, const hash_t hash) { - return &(table->entries[hash]); + const LDM_hashTable *table, const hash_t hash, const U32 checksum) { + (void)checksum; + return getBucket(table, hash); } void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry) { - *HASH_getEntryFromHash(table, hash) = entry; + *getBucket(table, hash) = entry; } U32 HASH_getSize(const LDM_hashTable *table) { @@ -44,7 +51,7 @@ void HASH_outputTableOccupancy(const LDM_hashTable *hashTable) { U32 i = 0; U32 ctr = 0; for (; i < HASH_getSize(hashTable); i++) { - if (HASH_getEntryFromHash(hashTable, i)->offset == 0) { + if (getBucket(hashTable, i)->offset == 0) { ctr++; } } @@ -52,5 +59,3 @@ void HASH_outputTableOccupancy(const LDM_hashTable *hashTable) { HASH_getSize(hashTable), ctr, 100.0 * (double)(ctr) / (double)HASH_getSize(hashTable)); } - - diff --git a/contrib/long_distance_matching/chaining_table.c b/contrib/long_distance_matching/chaining_table.c new file mode 100644 index 000000000..226f78225 --- /dev/null +++ b/contrib/long_distance_matching/chaining_table.c @@ -0,0 +1,92 @@ +#include +#include + +#include "ldm_hashtable.h" +#include "mem.h" + +//TODO: move def somewhere else. +//TODO: memory usage is currently no longer LDM_MEMORY_USAGE. +// refactor code to scale the number of elements appropriately. + +// Number of elements per hash bucket. +#define HASH_BUCKET_SIZE_LOG 2 // MAX is 4 for now +#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) + +struct LDM_hashTable { + U32 size; + LDM_hashEntry *entries; // 1-D array for now. + + // Position corresponding to offset=0 in LDM_hashEntry. + const BYTE *offsetBase; + BYTE *bucketOffsets; // Pointer to current insert position. + // Last insert was at bucketOffsets - 1? +}; + +LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase) { + LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); + table->size = size; + table->entries = calloc(size * HASH_BUCKET_SIZE, sizeof(LDM_hashEntry)); + table->bucketOffsets = calloc(size, sizeof(BYTE)); + table->offsetBase = offsetBase; + return table; +} + +static LDM_hashEntry *getBucket(const LDM_hashTable *table, const hash_t hash) { + return table->entries + (hash << HASH_BUCKET_SIZE_LOG); +} + +/* +static LDM_hashEntry *getLastInsertFromHash(const LDM_hashTable *table, + const hash_t hash) { + LDM_hashEntry *bucket = getBucket(table, hash); + BYTE offset = (table->bucketOffsets[hash] - 1) & (HASH_BUCKET_SIZE - 1); + return bucket + offset; +} +*/ + +LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, + const hash_t hash, + const U32 checksum) { + // Loop through bucket. + // TODO: in order of recency??? + LDM_hashEntry *bucket = getBucket(table, hash); + LDM_hashEntry *cur = bucket; + for(; cur < bucket + HASH_BUCKET_SIZE; ++cur) { + if (cur->checksum == checksum) { + return cur; + } + } + return NULL; +} + +void HASH_insert(LDM_hashTable *table, + const hash_t hash, const LDM_hashEntry entry) { + *(getBucket(table, hash) + table->bucketOffsets[hash]) = entry; + table->bucketOffsets[hash]++; + table->bucketOffsets[hash] &= HASH_BUCKET_SIZE - 1; +} + +U32 HASH_getSize(const LDM_hashTable *table) { + return table->size * HASH_BUCKET_SIZE; +} + +void HASH_destroyTable(LDM_hashTable *table) { + free(table->entries); + free(table->bucketOffsets); + free(table); +} + +void HASH_outputTableOccupancy(const LDM_hashTable *table) { + U32 ctr = 0; + LDM_hashEntry *cur = table->entries; + LDM_hashEntry *end = table->entries + (table->size * HASH_BUCKET_SIZE); + for (; cur < end; ++cur) { + if (cur->offset == 0) { + ctr++; + } + } + + printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", + HASH_getSize(table), ctr, + 100.0 * (double)(ctr) / (double)HASH_getSize(table)); +} diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index 32da40f82..3cb82ea68 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -20,9 +20,8 @@ #define CHECKSUM_CHAR_OFFSET 10 //#define RUN_CHECKS //#define LDM_DEBUG -// -#include "ldm.h" +#include "ldm.h" #include "ldm_hashtable.h" // TODO: Scanning speed @@ -98,6 +97,7 @@ static int intLog2(U32 x) { // TODO: Maybe we would eventually prefer to have linear rather than // exponential buckets. +/** void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) { U32 i = 0; int buckets[32] = { 0 }; @@ -119,6 +119,7 @@ void HASH_outputTableOffsetHistogram(const LDM_CCtx *cctx) { } printf("\n"); } +*/ void LDM_printCompressStats(const LDM_compressStats *stats) { int i = 0; @@ -127,9 +128,11 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { //TODO: compute percentage matched? printf("Window size, hash table size (bytes): 2^%u, 2^%u\n", stats->windowSizeLog, stats->hashTableSizeLog); - printf("num matches, total match length: %u, %llu\n", + printf("num matches, total match length, %% matched: %u, %llu, %.3f\n", stats->numMatches, - stats->totalMatchLength); + stats->totalMatchLength, + 100.0 * (double)stats->totalMatchLength / + (double)(stats->totalMatchLength + stats->totalLiteralLength)); printf("avg match length: %.1f\n", ((double)stats->totalMatchLength) / (double)stats->numMatches); printf("avg literal length, total literalLength: %.1f, %llu\n", @@ -155,11 +158,13 @@ void LDM_printCompressStats(const LDM_compressStats *stats) { printf("Num invalid hashes, num valid hashes, %llu %llu\n", stats->numInvalidHashes, stats->numValidHashes); */ + /* printf("num collisions, num hash inserts, %% collisions: %u, %u, %.3f\n", stats->numCollisions, stats->numHashInserts, stats->numHashInserts == 0 ? 1.0 : (100.0 * (double)stats->numCollisions) / (double)stats->numHashInserts); + */ printf("=====================\n"); } @@ -173,6 +178,7 @@ int LDM_isValidMatch(const BYTE *pIn, const BYTE *pMatch) { */ //TODO: This seems to be faster for some reason? + U32 lengthLeft = LDM_MIN_MATCH_LENGTH; const BYTE *curIn = pIn; const BYTE *curMatch = pMatch; @@ -286,8 +292,9 @@ static void setNextHash(LDM_CCtx *cctx) { static void putHashOfCurrentPositionFromHash( LDM_CCtx *cctx, hash_t hash, U32 sum) { + /* #ifdef COMPUTE_STATS - if (cctx->stats.numHashInserts < LDM_HASHTABLESIZE_U32) { + if (cctx->stats.numHashInserts < HASH_getSize(cctx->hashTable)) { U32 offset = HASH_getEntryFromHash(cctx->hashTable, hash)->offset; cctx->stats.numHashInserts++; if (offset != 0 && !LDM_isValidMatch(cctx->ip, offset + cctx->ibase)) { @@ -295,11 +302,13 @@ static void putHashOfCurrentPositionFromHash( } } #endif +*/ // Hash only every HASH_ONLY_EVERY times, based on cctx->ip. // Note: this works only when cctx->step is 1. if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { - const LDM_hashEntry entry = { cctx->ip - cctx->ibase }; + const LDM_hashEntry entry = { cctx->ip - cctx->ibase , + MEM_read32(cctx->ip) }; HASH_insert(cctx->hashTable, hash, entry); } @@ -393,7 +402,7 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->anchor = cctx->ibase; memset(&(cctx->stats), 0, sizeof(cctx->stats)); - cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U32); + cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U32, cctx->ibase); //HASH_initializeTable(cctx->hashTable, LDM_HASHTABLESIZE_U32); @@ -425,12 +434,13 @@ void LDM_destroyCCtx(LDM_CCtx *cctx) { * */ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { + + LDM_hashEntry *entry = NULL; cctx->nextIp = cctx->ip + cctx->step; do { hash_t h; U32 sum; - LDM_hashEntry *entry; setNextHash(cctx); h = cctx->nextHash; sum = cctx->nextSum; @@ -441,13 +451,17 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match) { return 1; } - entry = HASH_getEntryFromHash(cctx->hashTable, h); - *match = entry->offset + cctx->ibase; + entry = HASH_getEntryFromHash(cctx->hashTable, h, MEM_read32(cctx->ip)); + + if (entry != NULL) { + *match = entry->offset + cctx->ibase; + } putHashOfCurrentPositionFromHash(cctx, h, sum); - } while (cctx->ip - *match > LDM_WINDOW_SIZE || - !LDM_isValidMatch(cctx->ip, *match)); + } while (entry == NULL || + (cctx->ip - *match > LDM_WINDOW_SIZE || + !LDM_isValidMatch(cctx->ip, *match))); setNextHash(cctx); return 0; } @@ -510,7 +524,7 @@ void LDM_outputBlock(LDM_CCtx *cctx, size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; - const BYTE *match; + const BYTE *match = NULL; LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); /* Hash the first position and put it into the hash table. */ diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 18b64e378..6325d1b19 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -17,8 +17,8 @@ #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) //These should be multiples of four. -#define LDM_MIN_MATCH_LENGTH 4 -#define LDM_HASH_LENGTH 4 +#define LDM_MIN_MATCH_LENGTH 1024 +#define LDM_HASH_LENGTH 1024 typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index 690c47a15..92add96f9 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -7,6 +7,7 @@ typedef U32 hash_t; typedef struct LDM_hashEntry { U32 offset; + U32 checksum; // Not needed? } LDM_hashEntry; typedef struct LDM_hashTable LDM_hashTable; @@ -14,10 +15,11 @@ typedef struct LDM_hashTable LDM_hashTable; // TODO: rename functions // TODO: comments -LDM_hashTable *HASH_createTable(U32 size); +LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase); LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, - const hash_t hash); + const hash_t hash, + const U32 checksum); void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry);