Software pipeline for ZSTD_compressBlock_fast_dictMatchState (#3086)

* prefetch dict content inside loop

* ip0/ip1 pipeline

* add L2_4 prefetch to dms pipeline

* Remove L1 prefetch

* Remove L2 prefetching

* Reduce # of gotos

* Cosmetic fixes

* Check final position sometimes

* Track step size as in bc768bc

* Fix nits
This commit is contained in:
Elliot Gorokhovsky 2022-03-17 09:35:11 -07:00 committed by GitHub
parent eadb6c874f
commit 64efba4c5e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -380,7 +380,8 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
U32 const stepSize = cParams->targetLength + !(cParams->targetLength); U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
const BYTE* const base = ms->window.base; const BYTE* const base = ms->window.base;
const BYTE* const istart = (const BYTE*)src; const BYTE* const istart = (const BYTE*)src;
const BYTE* ip = istart; const BYTE* ip0 = istart;
const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */
const BYTE* anchor = istart; const BYTE* anchor = istart;
const U32 prefixStartIndex = ms->window.dictLimit; const U32 prefixStartIndex = ms->window.dictLimit;
const BYTE* const prefixStart = base + prefixStartIndex; const BYTE* const prefixStart = base + prefixStartIndex;
@ -397,13 +398,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
const BYTE* const dictStart = dictBase + dictStartIndex; const BYTE* const dictStart = dictBase + dictStartIndex;
const BYTE* const dictEnd = dms->window.nextSrc; const BYTE* const dictEnd = dms->window.nextSrc;
const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase);
const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart);
const U32 dictHLog = dictCParams->hashLog; const U32 dictHLog = dictCParams->hashLog;
/* if a dictionary is still attached, it necessarily means that /* if a dictionary is still attached, it necessarily means that
* it is within window size. So we just check it. */ * it is within window size. So we just check it. */
const U32 maxDistance = 1U << cParams->windowLog; const U32 maxDistance = 1U << cParams->windowLog;
const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
assert(endIndex - prefixStartIndex <= maxDistance); assert(endIndex - prefixStartIndex <= maxDistance);
(void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */
@ -415,101 +416,134 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic(
/* init */ /* init */
DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
ip += (dictAndPrefixLength == 0); ip0 += (dictAndPrefixLength == 0);
/* dictMatchState repCode checks don't currently handle repCode == 0 /* dictMatchState repCode checks don't currently handle repCode == 0
* disabling. */ * disabling. */
assert(offset_1 <= dictAndPrefixLength); assert(offset_1 <= dictAndPrefixLength);
assert(offset_2 <= dictAndPrefixLength); assert(offset_2 <= dictAndPrefixLength);
/* Main Search Loop */ /* Outer search loop */
while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ assert(stepSize >= 1);
while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */
size_t mLength; size_t mLength;
size_t const h = ZSTD_hashPtr(ip, hlog, mls); size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);
U32 const curr = (U32)(ip-base); const size_t dictHash0 = ZSTD_hashPtr(ip0, dictHLog, mls);
U32 const matchIndex = hashTable[h]; U32 dictMatchIndex = dictHashTable[dictHash0];
U32 matchIndex = hashTable[hash0];
U32 curr = (U32)(ip0 - base);
size_t step = stepSize;
const size_t kStepIncr = 1 << kSearchStrength;
const BYTE* nextStep = ip0 + kStepIncr;
/* Inner search loop */
while (1) {
const BYTE* match = base + matchIndex; const BYTE* match = base + matchIndex;
const U32 repIndex = curr + 1 - offset_1; const U32 repIndex = curr + 1 - offset_1;
const BYTE* repMatch = (repIndex < prefixStartIndex) ? const BYTE* repMatch = (repIndex < prefixStartIndex) ?
dictBase + (repIndex - dictIndexDelta) : dictBase + (repIndex - dictIndexDelta) :
base + repIndex; base + repIndex;
hashTable[h] = curr; /* update hash table */ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
const size_t dictHash1 = ZSTD_hashPtr(ip1, dictHLog, mls);
hashTable[hash0] = curr; /* update hash table */
if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ if (((U32) ((prefixStartIndex - 1) - repIndex) >=
&& (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */
&& (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) {
const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4;
ip++; ip0++;
ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
} else if ( (matchIndex <= prefixStartIndex) ) { break;
size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); } else if (matchIndex <= prefixStartIndex) {
U32 const dictMatchIndex = dictHashTable[dictHash]; /* We only look for a dict match if the normal matchIndex is invalid */
const BYTE* dictMatch = dictBase + dictMatchIndex; const BYTE* dictMatch = dictBase + dictMatchIndex;
if (dictMatchIndex <= dictStartIndex || if (dictMatchIndex > dictStartIndex &&
MEM_read32(dictMatch) != MEM_read32(ip)) { MEM_read32(dictMatch) == MEM_read32(ip0)) {
assert(stepSize >= 1);
ip += ((ip-anchor) >> kSearchStrength) + stepSize;
continue;
} else {
/* found a dict match */ /* found a dict match */
U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4;
while (((ip>anchor) & (dictMatch>dictStart)) while (((ip0 > anchor) & (dictMatch > dictStart))
&& (ip[-1] == dictMatch[-1])) { && (ip0[-1] == dictMatch[-1])) {
ip--; dictMatch--; mLength++; ip0--;
dictMatch--;
mLength++;
} /* catch up */ } /* catch up */
offset_2 = offset_1; offset_2 = offset_1;
offset_1 = offset; offset_1 = offset;
ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
break;
} }
} else if (MEM_read32(match) != MEM_read32(ip)) { } else if (MEM_read32(match) == MEM_read32(ip0)) {
/* it's not a match, and we're not going to check the dictionary */
assert(stepSize >= 1);
ip += ((ip-anchor) >> kSearchStrength) + stepSize;
continue;
} else {
/* found a regular match */ /* found a regular match */
U32 const offset = (U32)(ip-match); U32 const offset = (U32) (ip0 - match);
mLength = ZSTD_count(ip+4, match+4, iend) + 4; mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4;
while (((ip>anchor) & (match>prefixStart)) while (((ip0 > anchor) & (match > prefixStart))
&& (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ && (ip0[-1] == match[-1])) {
ip0--;
match--;
mLength++;
} /* catch up */
offset_2 = offset_1; offset_2 = offset_1;
offset_1 = offset; offset_1 = offset;
ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
break;
} }
/* match found */ /* Prepare for next iteration */
ip += mLength; dictMatchIndex = dictHashTable[dictHash1];
anchor = ip; matchIndex = hashTable[hash1];
if (ip <= ilimit) { if (ip1 >= nextStep) {
step++;
nextStep += kStepIncr;
}
ip0 = ip1;
ip1 = ip1 + step;
if (ip1 > ilimit) goto _cleanup;
curr = (U32)(ip0 - base);
hash0 = hash1;
} /* end inner search loop */
/* match found */
assert(mLength);
ip0 += mLength;
anchor = ip0;
if (ip0 <= ilimit) {
/* Fill Table */ /* Fill Table */
assert(base+curr+2 > istart); /* check base overflow */ assert(base+curr+2 > istart); /* check base overflow */
hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */
hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base);
/* check immediate repcode */ /* check immediate repcode */
while (ip <= ilimit) { while (ip0 <= ilimit) {
U32 const current2 = (U32)(ip-base); U32 const current2 = (U32)(ip0-base);
U32 const repIndex2 = current2 - offset_2; U32 const repIndex2 = current2 - offset_2;
const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
dictBase - dictIndexDelta + repIndex2 : dictBase - dictIndexDelta + repIndex2 :
base + repIndex2; base + repIndex2;
if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */)
&& (MEM_read32(repMatch2) == MEM_read32(ip)) ) { && (MEM_read32(repMatch2) == MEM_read32(ip0))) {
const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4;
U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */
ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2;
ip += repLength2; ip0 += repLength2;
anchor = ip; anchor = ip0;
continue; continue;
} }
break; break;
} }
} }
/* Prepare for next iteration */
assert(ip0 == anchor);
ip1 = ip0 + stepSize;
} }
_cleanup:
/* save reps for next block */ /* save reps for next block */
rep[0] = offset_1 ? offset_1 : offsetSaved; rep[0] = offset_1 ? offset_1 : offsetSaved;
rep[1] = offset_2 ? offset_2 : offsetSaved; rep[1] = offset_2 ? offset_2 : offsetSaved;