Experiment with 64-bit hash and checksum

This commit is contained in:
Stella Lau 2017-07-20 16:50:06 -07:00
parent 13a01ffb27
commit 273c17b350
8 changed files with 1096 additions and 75 deletions

View File

@ -25,7 +25,7 @@ LDFLAGS += -lzstd
default: all default: all
all: main-circular-buffer main-integrated all: main-circular-buffer main-integrated main-hf
#main-basic : basic_table.c ldm.c main-ldm.c #main-basic : basic_table.c ldm.c main-ldm.c
# $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ # $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
@ -33,12 +33,14 @@ all: main-circular-buffer main-integrated
main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c main-circular-buffer: circular_buffer_table.c ldm.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
main-hf: ldm_hf_test.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
main-integrated: ldm_with_table.c main-ldm.c main-integrated: ldm_with_table.c main-ldm.c
$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@ $(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
clean: clean:
@rm -f core *.o tmp* result* *.ldm *.ldm.dec \ @rm -f core *.o tmp* result* *.ldm *.ldm.dec \
main-basic main-circular-buffer main-integrated main-basic main-circular-buffer main-integrated main-hf
@echo Cleaning completed @echo Cleaning completed

View File

@ -5,14 +5,16 @@
#include "ldm_hashtable.h" #include "ldm_hashtable.h"
#include "mem.h" #include "mem.h"
// Number of elements per hash bucket. // Number of elements per hash bucket.
// HASH_BUCKET_SIZE_LOG defined in ldm.h // HASH_BUCKET_SIZE_LOG defined in ldm.h
#define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG)) #define HASH_BUCKET_SIZE (1 << (HASH_BUCKET_SIZE_LOG))
#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)-(HASH_BUCKET_SIZE_LOG))
// TODO: rename. Number of hash buckets. // TODO: rename. Number of hash buckets.
// TODO: Link to HASH_ENTRY_SIZE_LOG // TODO: Link to HASH_ENTRY_SIZE_LOG
#define LDM_HASHLOG ((LDM_MEMORY_USAGE)-3-(HASH_BUCKET_SIZE_LOG))
//#define ZSTD_SKIP //#define ZSTD_SKIP
struct LDM_hashTable { struct LDM_hashTable {
@ -175,6 +177,7 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_hashTable *table,
if (cur->checksum == checksum && pIn - pMatch <= table->maxWindowSize) { if (cur->checksum == checksum && pIn - pMatch <= table->maxWindowSize) {
U32 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd); U32 forwardMatchLength = ZSTD_count(pIn, pMatch, pEnd);
U32 backwardMatchLength, totalMatchLength; U32 backwardMatchLength, totalMatchLength;
if (forwardMatchLength < table->minMatchLength) { if (forwardMatchLength < table->minMatchLength) {
continue; continue;
} }

View File

@ -4,14 +4,15 @@
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include "ldm.h"
#include "ldm_hashtable.h"
#define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE)) #define LDM_HASHTABLESIZE (1 << (LDM_MEMORY_USAGE))
#define LDM_HASH_ENTRY_SIZE_LOG 3
#define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2) #define LDM_HASHTABLESIZE_U32 ((LDM_HASHTABLESIZE) >> 2)
#define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3) #define LDM_HASHTABLESIZE_U64 ((LDM_HASHTABLESIZE) >> 3)
// Insert one out of every (HASH_ONLY_EVERY + 1) positions into the hash table. // Insert one out of every (HASH_ONLY_EVERY + 1) positions into the hash table.
#define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE) - (LDM_HASH_ENTRY_SIZE_LOG))) #define HASH_ONLY_EVERY_LOG (LDM_WINDOW_SIZE_LOG-((LDM_MEMORY_USAGE)-(LDM_HASH_ENTRY_SIZE_LOG)))
#define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1) #define HASH_ONLY_EVERY ((1 << HASH_ONLY_EVERY_LOG) - 1)
#define ML_BITS 4 #define ML_BITS 4
@ -26,8 +27,7 @@
//#define RUN_CHECKS //#define RUN_CHECKS
//#define TMP_RECOMPUTE_LENGTHS //#define TMP_RECOMPUTE_LENGTHS
#include "ldm.h" typedef U32 checksum_t;
#include "ldm_hashtable.h"
// TODO: Scanning speed // TODO: Scanning speed
// TODO: Memory usage // TODO: Memory usage
@ -71,22 +71,22 @@ struct LDM_CCtx {
LDM_hashTable *hashTable; LDM_hashTable *hashTable;
// LDM_hashEntry hashTable[LDM_HASHTABLESIZE_U32];
const BYTE *lastPosHashed; /* Last position hashed */ const BYTE *lastPosHashed; /* Last position hashed */
hash_t lastHash; /* Hash corresponding to lastPosHashed */ hash_t lastHash; /* Hash corresponding to lastPosHashed */
U32 lastSum; checksum_t lastSum;
const BYTE *nextIp; // TODO: this is redundant (ip + step) const BYTE *nextIp; // TODO: this is redundant (ip + step)
const BYTE *nextPosHashed; const BYTE *nextPosHashed;
hash_t nextHash; /* Hash corresponding to nextPosHashed */ hash_t nextHash; /* Hash corresponding to nextPosHashed */
U32 nextSum; checksum_t nextSum;
unsigned step; // ip step, should be 1. unsigned step; // ip step, should be 1.
const BYTE *lagIp; const BYTE *lagIp;
hash_t lagHash; hash_t lagHash;
U32 lagSum; checksum_t lagSum;
U64 numHashInserts; U64 numHashInserts;
// DEBUG // DEBUG
@ -191,15 +191,15 @@ static hash_t checksumToHash(U32 sum) {
} }
/** /**
* Computes a checksum based on rsync's checksum. * Computes a 32-bit checksum based on rsync's checksum.
* *
* a(k,l) = \sum_{i = k}^l x_i (mod M) * a(k,l) = \sum_{i = k}^l x_i (mod M)
* b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M) * b(k,l) = \sum_{i = k}^l ((l - i + 1) * x_i) (mod M)
* checksum(k,l) = a(k,l) + 2^{16} * b(k,l) * checksum(k,l) = a(k,l) + 2^{16} * b(k,l)
*/ */
static U32 getChecksum(const BYTE *buf, U32 len) { static checksum_t getChecksum(const BYTE *buf, U32 len) {
U32 i; U32 i;
U32 s1, s2; checksum_t s1, s2;
s1 = s2 = 0; s1 = s2 = 0;
for (i = 0; i < (len - 4); i += 4) { for (i = 0; i < (len - 4); i += 4) {
@ -226,8 +226,8 @@ static U32 getChecksum(const BYTE *buf, U32 len) {
* *
* Thus toRemove should correspond to data[0]. * Thus toRemove should correspond to data[0].
*/ */
static U32 updateChecksum(U32 sum, U32 len, static checksum_t updateChecksum(checksum_t sum, U32 len,
BYTE toRemove, BYTE toAdd) { BYTE toRemove, BYTE toAdd) {
U32 s1 = (sum & 0xffff) - toRemove + toAdd; U32 s1 = (sum & 0xffff) - toRemove + toAdd;
U32 s2 = (sum >> 16) - ((toRemove + CHECKSUM_CHAR_OFFSET) * len) + s1; U32 s2 = (sum >> 16) - ((toRemove + CHECKSUM_CHAR_OFFSET) * len) + s1;
@ -262,7 +262,6 @@ static void setNextHash(LDM_CCtx *cctx) {
cctx->nextHash = checksumToHash(cctx->nextSum); cctx->nextHash = checksumToHash(cctx->nextSum);
#if LDM_LAG #if LDM_LAG
// printf("LDM_LAG %zu\n", cctx->ip - cctx->lagIp);
if (cctx->ip - cctx->ibase > LDM_LAG) { if (cctx->ip - cctx->ibase > LDM_LAG) {
cctx->lagSum = updateChecksum( cctx->lagSum = updateChecksum(
cctx->lagSum, LDM_HASH_LENGTH, cctx->lagSum, LDM_HASH_LENGTH,
@ -288,32 +287,28 @@ static void setNextHash(LDM_CCtx *cctx) {
} }
static void putHashOfCurrentPositionFromHash( static void putHashOfCurrentPositionFromHash(
LDM_CCtx *cctx, hash_t hash, U32 sum) { LDM_CCtx *cctx, hash_t hash, U32 checksum) {
// Insert only one out of every (HASH_ONLY_EVERY + 1) positions, based on cctx->ip. // Insert only one out of every (HASH_ONLY_EVERY + 1) positions, based on cctx->ip.
// Note: this works only when cctx->step is 1. // Note: this works only when cctx->step is 1.
if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) { if (((cctx->ip - cctx->ibase) & HASH_ONLY_EVERY) == HASH_ONLY_EVERY) {
/**
const LDM_hashEntry entry = { cctx->ip - cctx->ibase ,
MEM_read32(cctx->ip) };
*/
#if LDM_LAG #if LDM_LAG
// TODO: off by 1, but whatever // TODO: off by 1, but whatever
if (cctx->lagIp - cctx->ibase > 0) { if (cctx->lagIp - cctx->ibase > 0) {
const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum }; const LDM_hashEntry entry = { cctx->lagIp - cctx->ibase, cctx->lagSum };
HASH_insert(cctx->hashTable, cctx->lagHash, entry); HASH_insert(cctx->hashTable, cctx->lagHash, entry);
} else { } else {
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum };
HASH_insert(cctx->hashTable, hash, entry); HASH_insert(cctx->hashTable, hash, entry);
} }
#else #else
const LDM_hashEntry entry = { cctx->ip - cctx->ibase, sum }; const LDM_hashEntry entry = { cctx->ip - cctx->ibase, checksum };
HASH_insert(cctx->hashTable, hash, entry); HASH_insert(cctx->hashTable, hash, entry);
#endif #endif
} }
cctx->lastPosHashed = cctx->ip; cctx->lastPosHashed = cctx->ip;
cctx->lastHash = hash; cctx->lastHash = hash;
cctx->lastSum = sum; cctx->lastSum = checksum;
} }
/** /**
@ -336,7 +331,7 @@ static void LDM_updateLastHashFromNextHash(LDM_CCtx *cctx) {
* Insert hash of the current position into the hash table. * Insert hash of the current position into the hash table.
*/ */
static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) { static void LDM_putHashOfCurrentPosition(LDM_CCtx *cctx) {
U32 sum = getChecksum(cctx->ip, LDM_HASH_LENGTH); checksum_t sum = getChecksum(cctx->ip, LDM_HASH_LENGTH);
hash_t hash = checksumToHash(sum); hash_t hash = checksumToHash(sum);
#ifdef RUN_CHECKS #ifdef RUN_CHECKS
@ -441,7 +436,7 @@ static int LDM_findBestMatch(LDM_CCtx *cctx, const BYTE **match,
while (entry == NULL) { while (entry == NULL) {
hash_t h; hash_t h;
U32 sum; checksum_t sum;
setNextHash(cctx); setNextHash(cctx);
h = cctx->nextHash; h = cctx->nextHash;
sum = cctx->nextSum; sum = cctx->nextSum;
@ -698,23 +693,7 @@ size_t LDM_decompress(const void *src, size_t compressedSize,
} }
// TODO: implement and test hash function // TODO: implement and test hash function
void LDM_test(void) { void LDM_test(const BYTE *src) {
(void)src;
} }
/*
void LDM_test(const void *src, size_t srcSize,
void *dst, size_t maxDstSize) {
const BYTE *ip = (const BYTE *)src + 1125;
U32 sum = getChecksum((const char *)ip, LDM_HASH_LENGTH);
U32 sum2;
++ip;
for (; ip < (const BYTE *)src + 1125 + 100; ip++) {
sum2 = updateChecksum(sum, LDM_HASH_LENGTH,
ip[-1], ip[LDM_HASH_LENGTH - 1]);
sum = getChecksum((const char *)ip, LDM_HASH_LENGTH);
printf("TEST HASH: %zu %u %u\n", ip - (const BYTE *)src, sum, sum2);
}
}
*/

View File

@ -31,6 +31,7 @@ typedef struct LDM_compressStats LDM_compressStats;
typedef struct LDM_CCtx LDM_CCtx; typedef struct LDM_CCtx LDM_CCtx;
typedef struct LDM_DCtx LDM_DCtx; typedef struct LDM_DCtx LDM_DCtx;
/** /**
* Compresses src into dst. * Compresses src into dst.
* *
@ -151,6 +152,6 @@ void LDM_readHeader(const void *src, U64 *compressedSize,
void LDM_outputConfiguration(void); void LDM_outputConfiguration(void);
void LDM_test(void); void LDM_test(const BYTE *src);
#endif /* LDM_H */ #endif /* LDM_H */

View File

@ -3,6 +3,8 @@
#include "mem.h" #include "mem.h"
#define LDM_HASH_ENTRY_SIZE_LOG 3
// TODO: clean up comments // TODO: clean up comments
typedef U32 hash_t; typedef U32 hash_t;

File diff suppressed because it is too large Load Diff

View File

@ -29,7 +29,7 @@
#define CHECKSUM_CHAR_OFFSET 10 #define CHECKSUM_CHAR_OFFSET 10
// Take first match only. // Take first match only.
#define ZSTD_SKIP //#define ZSTD_SKIP
//#define RUN_CHECKS //#define RUN_CHECKS
@ -292,8 +292,7 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx,
totalMatchLength = forwardMatchLength + backwardMatchLength; totalMatchLength = forwardMatchLength + backwardMatchLength;
if (totalMatchLength >= bestMatchLength && if (totalMatchLength >= bestMatchLength) {
totalMatchLength >= LDM_MIN_MATCH_LENGTH) {
bestMatchLength = totalMatchLength; bestMatchLength = totalMatchLength;
*pForwardMatchLength = forwardMatchLength; *pForwardMatchLength = forwardMatchLength;
*pBackwardMatchLength = backwardMatchLength; *pBackwardMatchLength = backwardMatchLength;
@ -305,7 +304,7 @@ LDM_hashEntry *HASH_getBestEntry(const LDM_CCtx *cctx,
} }
} }
} }
if (bestEntry != NULL && bestMatchLength > LDM_MIN_MATCH_LENGTH) { if (bestEntry != NULL) {
return bestEntry; return bestEntry;
} }
return NULL; return NULL;
@ -951,23 +950,8 @@ size_t LDM_decompress(const void *src, size_t compressedSize,
} }
// TODO: implement and test hash function // TODO: implement and test hash function
void LDM_test(void) { void LDM_test(const BYTE *src) {
(void)src;
} }
/*
void LDM_test(const void *src, size_t srcSize,
void *dst, size_t maxDstSize) {
const BYTE *ip = (const BYTE *)src + 1125;
U32 sum = getChecksum((const char *)ip, LDM_HASH_LENGTH);
U32 sum2;
++ip;
for (; ip < (const BYTE *)src + 1125 + 100; ip++) {
sum2 = updateChecksum(sum, LDM_HASH_LENGTH,
ip[-1], ip[LDM_HASH_LENGTH - 1]);
sum = getChecksum((const char *)ip, LDM_HASH_LENGTH);
printf("TEST HASH: %zu %u %u\n", ip - (const BYTE *)src, sum, sum2);
}
}
*/

View File

@ -13,13 +13,13 @@
#include "zstd.h" #include "zstd.h"
#define DEBUG #define DEBUG
#define TEST //#define TEST
/* Compress file given by fname and output to oname. /* Compress file given by fname and output to oname.
* Returns 0 if successful, error code otherwise. * Returns 0 if successful, error code otherwise.
* *
* TODO: This might seg fault if the compressed size is > the decompressed * TODO: This might seg fault if the compressed size is > the decompressed
* size due to the mmapping and output file size allocated to be the input size. * size due to the mmapping and output file size allocated to be the input size
* The compress function should check before writing or buffer writes. * The compress function should check before writing or buffer writes.
*/ */
static int compress(const char *fname, const char *oname) { static int compress(const char *fname, const char *oname) {
@ -69,6 +69,11 @@ static int compress(const char *fname, const char *oname) {
perror("mmap error for output"); perror("mmap error for output");
return 1; return 1;
} }
#ifdef TEST
LDM_test((const BYTE *)src);
#endif
gettimeofday(&tv1, NULL); gettimeofday(&tv1, NULL);
compressedSize = LDM_HEADER_SIZE + compressedSize = LDM_HEADER_SIZE +
@ -251,8 +256,5 @@ int main(int argc, const char *argv[]) {
/* verify */ /* verify */
verify(inpFilename, decFilename); verify(inpFilename, decFilename);
#ifdef TEST
LDM_test();
#endif
return 0; return 0;
} }