diff --git a/contrib/long_distance_matching/Makefile b/contrib/long_distance_matching/Makefile index df4390157..131638fdb 100644 --- a/contrib/long_distance_matching/Makefile +++ b/contrib/long_distance_matching/Makefile @@ -25,7 +25,7 @@ LDFLAGS += -lzstd default: all -all: main-basic main-circular-buffer main-lag +all: main-basic main-circular-buffer main-basic : basic_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ @@ -33,11 +33,8 @@ main-basic : basic_table.c ldm.c main-ldm.c main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ -main-lag: lag_table.c ldm.c main-ldm.c - $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ - clean: @rm -f core *.o tmp* result* *.ldm *.ldm.dec \ - main-basic main-circular-buffer main-lag + main-basic main-circular-buffer @echo Cleaning completed diff --git a/contrib/long_distance_matching/basic_table.c b/contrib/long_distance_matching/basic_table.c index 893a4caf9..8b3588e81 100644 --- a/contrib/long_distance_matching/basic_table.c +++ b/contrib/long_distance_matching/basic_table.c @@ -1,9 +1,12 @@ #include #include +#include "ldm.h" #include "ldm_hashtable.h" #include "mem.h" +#define LDM_HASHLOG ((LDM_MEMORY_USAGE) - 4) + struct LDM_hashTable { U32 size; LDM_hashEntry *entries; @@ -46,6 +49,10 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, return NULL; } +hash_t HASH_hashU32(U32 value) { + return ((value * 2654435761U) >> (32 - LDM_HASHLOG)); +} + void HASH_insert(LDM_hashTable *table, const hash_t hash, const LDM_hashEntry entry) { *getBucket(table, hash) = entry; diff --git a/contrib/long_distance_matching/circular_buffer_table.c b/contrib/long_distance_matching/circular_buffer_table.c index b578d2bf1..bc7503f17 100644 --- a/contrib/long_distance_matching/circular_buffer_table.c +++ b/contrib/long_distance_matching/circular_buffer_table.c @@ -1,33 +1,36 @@ #include #include +#include "ldm.h" #include "ldm_hashtable.h" #include "mem.h" //TODO: move def somewhere else. -//TODO: memory usage is currently no longer LDM_MEMORY_USAGE. -// refactor code to scale the number of elements appropriately. // Number of elements per hash bucket. +// HASH_BUCKET_SIZE_LOG defined in ldm.h #define HASH_BUCKET_SIZE_LOG 0 // MAX is 4 for now #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) +#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-4-HASH_BUCKET_SIZE_LOG) + struct LDM_hashTable { - U32 size; + U32 size; // Number of buckets + U32 maxEntries; // Rename... LDM_hashEntry *entries; // 1-D array for now. // Position corresponding to offset=0 in LDM_hashEntry. const BYTE *offsetBase; BYTE *bucketOffsets; // Pointer to current insert position. - // Last insert was at bucketOffsets - 1? }; LDM_hashTable *HASH_createTable(U32 size, const BYTE *offsetBase) { LDM_hashTable *table = malloc(sizeof(LDM_hashTable)); - table->size = size; - table->entries = calloc(size * HASH_BUCKET_SIZE, sizeof(LDM_hashEntry)); - table->bucketOffsets = calloc(size, sizeof(BYTE)); + table->size = size >> HASH_BUCKET_SIZE_LOG; + table->maxEntries = size; + table->entries = calloc(size, sizeof(LDM_hashEntry)); + table->bucketOffsets = calloc(size >> HASH_BUCKET_SIZE_LOG, sizeof(BYTE)); table->offsetBase = offsetBase; return table; } @@ -45,11 +48,6 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, LDM_hashEntry *cur = bucket; // TODO: in order of recency? for (; cur < bucket + HASH_BUCKET_SIZE; ++cur) { - /* - if (cur->checksum == 0 && cur->offset == 0) { - return NULL; - } - */ // Check checksum for faster check. if (cur->checksum == checksum && (*isValid)(pIn, cur->offset + table->offsetBase)) { @@ -59,6 +57,11 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, return NULL; } +hash_t HASH_hashU32(U32 value) { + return ((value * 2654435761U) >> (32 - LDM_HASHLOG)); +} + + LDM_hashEntry *HASH_getEntryFromHash(const LDM_hashTable *table, const hash_t hash, const U32 checksum) { @@ -82,7 +85,7 @@ void HASH_insert(LDM_hashTable *table, } U32 HASH_getSize(const LDM_hashTable *table) { - return table->size * HASH_BUCKET_SIZE; + return table->size; } void HASH_destroyTable(LDM_hashTable *table) { @@ -101,7 +104,8 @@ void HASH_outputTableOccupancy(const LDM_hashTable *table) { } } + printf("Num buckets, bucket size: %d, %d\n", table->size, HASH_BUCKET_SIZE); printf("Hash table size, empty slots, %% empty: %u, %u, %.3f\n", - HASH_getSize(table), ctr, - 100.0 * (double)(ctr) / (double)HASH_getSize(table)); + table->maxEntries, ctr, + 100.0 * (double)(ctr) / table->maxEntries); } diff --git a/contrib/long_distance_matching/ldm.c b/contrib/long_distance_matching/ldm.c index dedbf79a9..4d8ca40bc 100644 --- a/contrib/long_distance_matching/ldm.c +++ b/contrib/long_distance_matching/ldm.c @@ -4,12 +4,16 @@ #include #include -// Insert every (HASH_ONLY_EVERY + 1) into the hash table. -#define HASH_ONLY_EVERY 15 -#define LDM_HASHLOG (LDM_MEMORY_USAGE-2) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) +//#define LDM_HASH_ENTRY_SIZE 4 #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) +#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 4) + +// Insert every (HASH_ONLY_EVERY + 1) into the hash table. +#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - 4)) +#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) + #define ML_BITS 4 #define ML_MASK ((1U<> (32 - LDM_HASHLOG)); + return HASH_hashU32(sum); +// return ((sum * 2654435761U) >> (32 - LDM_HASHLOG)); } /** @@ -261,9 +266,9 @@ static void setNextHash(LDM_CCtx *cctx) { cctx->nextPosHashed = cctx->nextIp; cctx->nextHash = checksumToHash(cctx->nextSum); -#if LAG - if (cctx->ip - cctx->ibase > LAG) { -// printf("LAG %zu\n", cctx->ip - cctx->lagIp); +#if LDM_LAG +// printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp); + if (cctx->ip - cctx->ibase > LDM_LAG) { cctx->lagSum = updateChecksum( cctx->lagSum, LDM_HASH_LENGTH, cctx->lagIp[0], cctx->lagIp[LDM_HASH_LENGTH]); @@ -296,7 +301,7 @@ static void putHashOfCurrentPositionFromHash( const LDM_hashEntry entry = { cctx->ip - cctx->ibase , MEM_read32(cctx->ip) }; */ -#if LAG +#if LDM_LAG // TODO: off by 1, but whatever if (cctx->lagIp - cctx->ibase > 0) { const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum }; @@ -364,6 +369,18 @@ U32 LDM_countMatchLength(const BYTE *pIn, const BYTE *pMatch, return (U32)(pIn - pStart); } +void LDM_outputConfiguration(void) { + printf("=====================\n"); + printf("Configuration\n"); + printf("Window size log: %d\n", LDM_WINDOW_SIZE_LOG); + printf("Min match, hash length: %d, %d\n", + LDM_MIN_MATCH_LENGTH, LDM_HASH_LENGTH); + printf("LDM_MEMORY_USAGE: %d\n", LDM_MEMORY_USAGE); + printf("HASH_ONLY_EVERY: %d\n", HASH_ONLY_EVERY); + printf("LDM_LAG %d\n", LDM_LAG); + printf("=====================\n"); +} + void LDM_readHeader(const void *src, U64 *compressedSize, U64 *decompressedSize) { const BYTE *ip = (const BYTE *)src; @@ -392,12 +409,8 @@ void LDM_initializeCCtx(LDM_CCtx *cctx, cctx->anchor = cctx->ibase; memset(&(cctx->stats), 0, sizeof(cctx->stats)); - cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U32, cctx->ibase); + cctx->hashTable = HASH_createTable(LDM_HASHTABLESIZE_U64, cctx->ibase); - //HASH_initializeTable(cctx->hashTable, LDM_HASHTABLESIZE_U32); - -// calloc(LDM_HASHTABLESIZE_U32, sizeof(LDM_hashEntry)); -// memset(cctx->hashTable, 0, sizeof(cctx->hashTable)); cctx->stats.minOffset = UINT_MAX; cctx->stats.windowSizeLog = LDM_WINDOW_SIZE_LOG; cctx->stats.hashTableSizeLog = LDM_MEMORY_USAGE; @@ -520,17 +533,19 @@ size_t LDM_compress(const void *src, size_t srcSize, void *dst, size_t maxDstSize) { LDM_CCtx cctx; const BYTE *match = NULL; +// printf("TST: %d\n", LDM_WINDOW_SIZE / LDM_HASHTABLESIZE_U64); + printf("HASH LOG: %d\n", HASH_ONLY_EVERY_LOG); + LDM_initializeCCtx(&cctx, src, srcSize, dst, maxDstSize); /* Hash the first position and put it into the hash table. */ LDM_putHashOfCurrentPosition(&cctx); -#if LAG +#if LDM_LAG cctx.lagIp = cctx.ip; cctx.lagHash = cctx.lastHash; cctx.lagSum = cctx.lastSum; #endif - /** * Find a match. * If no more matches can be found (i.e. the length of the remaining input @@ -542,6 +557,7 @@ size_t LDM_compress(const void *src, size_t srcSize, cctx.stats.numMatches++; #endif +// printf("HERE %zu\n", cctx.ip - cctx.ibase); /** * Catch up: look back to extend the match backwards from the found match. */ diff --git a/contrib/long_distance_matching/ldm.h b/contrib/long_distance_matching/ldm.h index 6d7c4af27..2d4ff9cf2 100644 --- a/contrib/long_distance_matching/ldm.h +++ b/contrib/long_distance_matching/ldm.h @@ -10,15 +10,20 @@ #define LDM_HEADER_SIZE ((LDM_COMPRESS_SIZE)+(LDM_DECOMPRESS_SIZE)) #define LDM_OFFSET_SIZE 4 -// Defines the size of the hash table (currently the number of elements). -#define LDM_MEMORY_USAGE 12 +// Defines the size of the hash table. +// Currently this should be less than WINDOW_SIZE_LOG + 4? +#define LDM_MEMORY_USAGE 24 -#define LDM_WINDOW_SIZE_LOG 30 +//#define LDM_LAG (1 << 23) +//#define LDM_LAG (1 << 20) +#define LDM_LAG 0 + +#define LDM_WINDOW_SIZE_LOG 28 #define LDM_WINDOW_SIZE (1 << (LDM_WINDOW_SIZE_LOG)) -//These should be multiples of four. -#define LDM_MIN_MATCH_LENGTH 64 -#define LDM_HASH_LENGTH 64 +//These should be multiples of four (and perhaps set to the same values?). +#define LDM_MIN_MATCH_LENGTH 512 +#define LDM_HASH_LENGTH 512 typedef struct LDM_compressStats LDM_compressStats; typedef struct LDM_CCtx LDM_CCtx; @@ -48,7 +53,7 @@ typedef struct LDM_DCtx LDM_DCtx; * The lower four bits of the token encode the match length. With additional * bytes added similarly to the additional literal length bytes after the offset. * - * The last sequence is incomplete and stops right after the lieterals. + * The last sequence is incomplete and stops right after the literals. * */ size_t LDM_compress(const void *src, size_t srcSize, @@ -142,6 +147,8 @@ void LDM_initializeDCtx(LDM_DCtx *dctx, void LDM_readHeader(const void *src, U64 *compressedSize, U64 *decompressedSize); +void LDM_outputConfiguration(void); + void LDM_test(void); #endif /* LDM_H */ diff --git a/contrib/long_distance_matching/ldm_hashtable.h b/contrib/long_distance_matching/ldm_hashtable.h index 83a9ed27a..4fef66214 100644 --- a/contrib/long_distance_matching/ldm_hashtable.h +++ b/contrib/long_distance_matching/ldm_hashtable.h @@ -42,6 +42,8 @@ LDM_hashEntry *HASH_getValidEntry(const LDM_hashTable *table, const BYTE *pIn, int (*isValid)(const BYTE *pIn, const BYTE *pMatch)); +hash_t HASH_hashU32(U32 value); + /** * Insert an LDM_hashEntry into the bucket corresponding to hash. */ @@ -61,5 +63,4 @@ void HASH_destroyTable(LDM_hashTable *table); */ void HASH_outputTableOccupancy(const LDM_hashTable *hashTable); - #endif /* LDM_HASHTABLE_H */ diff --git a/contrib/long_distance_matching/main-ldm.c b/contrib/long_distance_matching/main-ldm.c index a379d3a6d..a43ec0002 100644 --- a/contrib/long_distance_matching/main-ldm.c +++ b/contrib/long_distance_matching/main-ldm.c @@ -18,7 +18,7 @@ /* Compress file given by fname and output to oname. * Returns 0 if successful, error code otherwise. * - * TODO: This currently seg faults if the compressed size is > the decompress + * TODO: This might seg fault if the compressed size is > the decompress * size due to the mmapping and output file size allocated to be the input size. * The compress function should check before writing or buffer writes. */ @@ -28,6 +28,8 @@ static int compress(const char *fname, const char *oname) { char *src, *dst; size_t maxCompressedSize, compressedSize; + struct timeval tv1, tv2; + /* Open the input file. */ if ((fdin = open(fname, O_RDONLY)) < 0) { perror("Error in file opening"); @@ -46,7 +48,10 @@ static int compress(const char *fname, const char *oname) { return 1; } - maxCompressedSize = statbuf.st_size + LDM_HEADER_SIZE; + maxCompressedSize = (statbuf.st_size + LDM_HEADER_SIZE); + // Handle case where compressed size is > decompressed size. + // The compress function should check before writing or buffer writes. + maxCompressedSize += statbuf.st_size / 255; /* Go to the location corresponding to the last byte. */ /* TODO: fallocate? */ @@ -74,10 +79,12 @@ static int compress(const char *fname, const char *oname) { perror("mmap error for output"); return 1; } + gettimeofday(&tv1, NULL); compressedSize = LDM_HEADER_SIZE + LDM_compress(src, statbuf.st_size, dst + LDM_HEADER_SIZE, maxCompressedSize); + gettimeofday(&tv2, NULL); // Write compress and decompress size to header // TODO: should depend on LDM_DECOMPRESS_SIZE write32 @@ -96,6 +103,14 @@ static int compress(const char *fname, const char *oname) { (unsigned)statbuf.st_size, (unsigned)compressedSize, oname, (double)compressedSize / (statbuf.st_size) * 100); + printf("Total compress time = %.3f seconds, Average compression speed: %.3f MB/s\n", + (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec), + ((double)statbuf.st_size / (double) (1 << 20)) / + ((double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + + (double) (tv2.tv_sec - tv1.tv_sec))); + + // Close files. close(fdin); close(fdout); @@ -234,16 +249,10 @@ int main(int argc, const char *argv[]) { /* Compress */ { - struct timeval tv1, tv2; - gettimeofday(&tv1, NULL); if (compress(inpFilename, ldmFilename)) { printf("Compress error"); return 1; } - gettimeofday(&tv2, NULL); - printf("Total compress time = %f seconds\n", - (double) (tv2.tv_usec - tv1.tv_usec) / 1000000 + - (double) (tv2.tv_sec - tv1.tv_sec)); } /* Decompress */