diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
index 3e253006a..eedd9b67f 100644
--- a/lib/compress/zstd_compress.c
+++ b/lib/compress/zstd_compress.c
@@ -1647,13 +1647,13 @@ static int ZSTD_shouldAttachDict(const ZSTD_CDict* cdict,
 {
     size_t cutoff = attachDictSizeCutoffs[cdict->matchState.cParams.strategy];
     int const dedicatedDictSearch = cdict->matchState.dedicatedDictSearch;
-    return ( dedicatedDictSearch
-          || pledgedSrcSize <= cutoff
-          || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
-          || params->attachDictPref == ZSTD_dictForceAttach )
-        && params->attachDictPref != ZSTD_dictForceCopy
-        && !params->forceWindow; /* dictMatchState isn't correctly
-                                  * handled in _enforceMaxDist */
+    return dedicatedDictSearch
+        || ( ( pledgedSrcSize <= cutoff
+            || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
+            || params->attachDictPref == ZSTD_dictForceAttach )
+          && params->attachDictPref != ZSTD_dictForceCopy
+          && !params->forceWindow ); /* dictMatchState isn't correctly
+                                      * handled in _enforceMaxDist */
 }
 
 static size_t
@@ -2914,10 +2914,12 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms,
     case ZSTD_greedy:
     case ZSTD_lazy:
     case ZSTD_lazy2:
-        if (chunk >= HASH_READ_SIZE && ms->dedicatedDictSearch)
+        if (chunk >= HASH_READ_SIZE && ms->dedicatedDictSearch) {
+            assert(chunk == remaining); /* must load everything in one go */
             ZSTD_dedicatedDictSearch_lazy_loadDictionary(ms, ichunk-HASH_READ_SIZE);
-        else if (chunk >= HASH_READ_SIZE)
+        } else if (chunk >= HASH_READ_SIZE) {
             ZSTD_insertAndFindFirstIndex(ms, ichunk-HASH_READ_SIZE);
+        }
         break;
 
     case ZSTD_btlazy2:   /* we want the dictionary table fully sorted */
@@ -3416,6 +3418,9 @@ static size_t ZSTD_initCDict_internal(
     assert(!ZSTD_checkCParams(cParams));
     cdict->matchState.cParams = cParams;
     cdict->matchState.dedicatedDictSearch = params.enableDedicatedDictSearch;
+    if (cdict->matchState.dedicatedDictSearch && dictSize > ZSTD_CHUNKSIZE_MAX) {
+        cdict->matchState.dedicatedDictSearch = 0;
+    }
     if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) {
        cdict->dictContent = dictBuffer;
    } else {
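A note for readers tracing the regrouped condition in ZSTD_shouldAttachDict()
above: a dedicated-dict-search CDict now attaches unconditionally, since its
tables cannot be copied into the CCtx, so neither ZSTD_dictForceCopy nor the
forceWindow guard applies to it. A condensed restatement of the decision, as
a sketch with a hypothetical function name rather than zstd API:

    /* Sketch only: mirrors the logic of ZSTD_shouldAttachDict() after this
     * change; parameter names stand in for the real struct fields. */
    static int shouldAttach_sketch(int dedicatedDictSearch,
                                   unsigned long long pledgedSrcSize,
                                   size_t cutoff,
                                   ZSTD_dictAttachPref_e pref,
                                   int forceWindow)
    {
        if (dedicatedDictSearch) return 1; /* DDS tables are attach-only */
        if (pref == ZSTD_dictForceCopy || forceWindow) return 0;
        return pledgedSrcSize <= cutoff
            || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN
            || pref == ZSTD_dictForceAttach;
    }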
diff --git a/lib/compress/zstd_lazy.c b/lib/compress/zstd_lazy.c
index 5ce805326..49ec1b09e 100644
--- a/lib/compress/zstd_lazy.c
+++ b/lib/compress/zstd_lazy.c
@@ -478,23 +478,114 @@ U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) {
 
 void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip)
 {
-    U32 const target = (U32)(ip - ms->window.base);
+    const BYTE* const base = ms->window.base;
+    U32 const target = (U32)(ip - base);
+    U32* const hashTable = ms->hashTable;
     U32* const chainTable = ms->chainTable;
-    U32 const chainMask = (1 << ms->cParams.chainLog) - 1;
+    U32 const chainSize = 1 << ms->cParams.chainLog;
     U32 idx = ms->nextToUpdate;
-    U32 bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
+    U32 const minChain = chainSize < target ? target - chainSize : idx;
+    U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG;
+    U32 const cacheSize = bucketSize - 1;
+    U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize;
+    U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts;
+
+    /* We know the hashtable is oversized by a factor of `bucketSize`.
+     * We are going to temporarily pretend `bucketSize == 1`, keeping only a
+     * single entry. We will use the rest of the space to construct a temporary
+     * chaintable.
+     */
+    U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG;
+    U32* const tmpHashTable = hashTable;
+    U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog);
+    U32 const tmpChainSize = ((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog;
+    U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx;
+
+    U32 hashIdx;
+
+    assert(ms->cParams.chainLog <= 24);
+    assert(ms->cParams.hashLog >= ms->cParams.chainLog);
+    assert(idx != 0);
+    assert(tmpMinChain <= minChain);
+
+    /* fill conventional hash table and conventional chain table */
     for ( ; idx < target; idx++) {
+        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch);
+        if (idx >= tmpMinChain) {
+            tmpChainTable[idx - tmpMinChain] = hashTable[h];
+        }
+        tmpHashTable[h] = idx;
+    }
+
+    /* sort chains into ddss chain table */
+    {
+        U32 chainPos = 0;
+        for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) {
+            U32 count;
+            U32 countBeyondMinChain = 0;
+            U32 i = tmpHashTable[hashIdx];
+            for (count = 0; i >= tmpMinChain && count < cacheSize; count++) {
+                /* skip through the chain to the first position that won't be
+                 * in the hash cache bucket */
+                if (i < minChain) {
+                    countBeyondMinChain++;
+                }
+                i = tmpChainTable[i - tmpMinChain];
+            }
+            if (count == cacheSize) {
+                for (count = 0; count < chainLimit;) {
+                    if (i < minChain) {
+                        if (!i || countBeyondMinChain++ > cacheSize) {
+                            /* only allow pulling `cacheSize` number of entries
+                             * into the cache or chainTable beyond `minChain`,
+                             * to replace the entries pulled out of the
+                             * chainTable into the cache. This lets us reach
+                             * back further without increasing the total number
+                             * of entries in the chainTable, guaranteeing the
+                             * DDSS chain table will fit into the space
+                             * allocated for the regular one. */
+                            break;
+                        }
+                    }
+                    chainTable[chainPos++] = i;
+                    count++;
+                    if (i < tmpMinChain) {
+                        break;
+                    }
+                    i = tmpChainTable[i - tmpMinChain];
+                }
+            } else {
+                count = 0;
+            }
+            if (count) {
+                tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count;
+            } else {
+                tmpHashTable[hashIdx] = 0;
+            }
+        }
+        assert(chainPos <= chainSize); /* I believe this is guaranteed... */
+    }
+
+    /* move chain pointers into the last entry of each hash bucket */
+    for (hashIdx = (1 << hashLog); hashIdx; ) {
+        U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG;
+        U32 const chainPackedPointer = tmpHashTable[hashIdx];
+        U32 i;
+        for (i = 0; i < cacheSize; i++) {
+            hashTable[bucketIdx + i] = 0;
+        }
+        hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer;
+    }
+
+    /* fill the buckets of the hash table */
+    for (idx = ms->nextToUpdate; idx < target; idx++) {
+        U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch)
+                   << ZSTD_LAZY_DDSS_BUCKET_LOG;
         U32 i;
-        size_t const h = ZSTD_hashPtr(
-            ms->window.base + idx,
-            ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG,
-            ms->cParams.minMatch) << ZSTD_LAZY_DDSS_BUCKET_LOG;
         /* Shift hash cache down 1. */
-        for (i = bucketSize - 1; i; i--)
-            ms->hashTable[h + i] = ms->hashTable[h + i - 1];
-        /* Insert new position. */
-        chainTable[idx & chainMask] = ms->hashTable[h];
-        ms->hashTable[h] = idx;
+        for (i = cacheSize - 1; i; i--)
+            hashTable[h + i] = hashTable[h + i - 1];
+        hashTable[h] = idx;
     }
 
     ms->nextToUpdate = target;
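The layout produced above is easiest to see from the reader's side: each hash
bucket holds bucketSize entries, the first bucketSize - 1 caching the most
recent positions for that hash, while the last slot packs the bucket's
overflow chain as a chain-table offset (high 24 bits) plus a chain length
(low 8 bits). A minimal decode sketch, with illustrative names that are not
zstd API; `h` is assumed to already be shifted by ZSTD_LAZY_DDSS_BUCKET_LOG:

    /* Sketch: unpacking one DDSS bucket built by the loader above. */
    typedef struct { const U32* cache; U32 cacheLen;
                     U32 chainStart; U32 chainLen; } DDSSBucketView;
    static DDSSBucketView ddss_viewBucket_sketch(const U32* hashTable, U32 h,
                                                 U32 bucketLog)
    {
        U32 const bucketSize = (U32)1 << bucketLog;
        U32 const packed = hashTable[h + bucketSize - 1]; /* last slot */
        DDSSBucketView v;
        v.cache      = hashTable + h;  /* most recent positions first */
        v.cacheLen   = bucketSize - 1;
        v.chainStart = packed >> 8;    /* offset into chainTable */
        v.chainLen   = packed & 0xFF;  /* further candidates, at most 255 */
        return v;
    }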
@@ -570,32 +661,39 @@ size_t ZSTD_HcFindBestMatch_generic (
     }
 
     if (dictMode == ZSTD_dedicatedDictSearch) {
-        const U32 ddsChainSize  = (1 << dms->cParams.chainLog);
-        const U32 ddsChainMask  = ddsChainSize - 1;
         const U32 ddsLowestIndex  = dms->window.dictLimit;
         const BYTE* const ddsBase = dms->window.base;
         const BYTE* const ddsEnd  = dms->window.nextSrc;
         const U32 ddsSize         = (U32)(ddsEnd - ddsBase);
         const U32 ddsIndexDelta   = dictLimit - ddsSize;
-        const U32 ddsMinChain = ddsSize > ddsChainSize ? ddsSize - ddsChainSize : 0;
         const U32 bucketSize      = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG);
-        const U32 bucketLimit     = nbAttempts < bucketSize ? nbAttempts : bucketSize;
+        const U32 bucketLimit     = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1;
         U32 ddsAttempt;
 
-        for (ddsAttempt = 0; ddsAttempt < bucketSize; ddsAttempt++) {
+        for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) {
             PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]);
         }
 
+        {
+            U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+            U32 const chainIndex = chainPackedPointer >> 8;
+
+            PREFETCH_L1(&dms->chainTable[chainIndex]);
+        }
+
         for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) {
             size_t currentMl=0;
             const BYTE* match;
             matchIndex = dms->hashTable[ddsIdx + ddsAttempt];
             match = ddsBase + matchIndex;
 
-            if (matchIndex < ddsLowestIndex) {
+            if (!matchIndex) {
                 return ml;
             }
 
+            /* guaranteed by table construction */
+            (void)ddsLowestIndex;
+            assert(matchIndex >= ddsLowestIndex);
             assert(match+4 <= ddsEnd);
             if (MEM_read32(match) == MEM_read32(ip)) {
                 /* assumption : matchIndex <= dictLimit-4 (by table construction) */
@@ -613,27 +711,38 @@ size_t ZSTD_HcFindBestMatch_generic (
             }
         }
 
-        for ( ; (ddsAttempt < nbAttempts) & (matchIndex >= ddsMinChain); ddsAttempt++) {
-            size_t currentMl=0;
-            const BYTE* match;
-            matchIndex = dms->chainTable[matchIndex & ddsChainMask];
-            match = ddsBase + matchIndex;
+        {
+            U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1];
+            U32 chainIndex = chainPackedPointer >> 8;
+            U32 const chainLength = chainPackedPointer & 0xFF;
+            U32 const chainAttempts = nbAttempts - ddsAttempt;
+            U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts;
+            U32 chainAttempt;
 
-            if (matchIndex < ddsLowestIndex) {
-                break;
+            for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) {
+                PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]);
             }
 
-            assert(match+4 <= ddsEnd);
-            if (MEM_read32(match) == MEM_read32(ip)) {
-                /* assumption : matchIndex <= dictLimit-4 (by table construction) */
-                currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
-            }
+            for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) {
+                size_t currentMl=0;
+                const BYTE* match;
+                matchIndex = dms->chainTable[chainIndex];
+                match = ddsBase + matchIndex;
 
-            /* save best solution */
-            if (currentMl > ml) {
-                ml = currentMl;
-                *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
-                if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+                /* guaranteed by table construction */
+                assert(matchIndex >= ddsLowestIndex);
+                assert(match+4 <= ddsEnd);
+                if (MEM_read32(match) == MEM_read32(ip)) {
+                    /* assumption : matchIndex <= dictLimit-4 (by table construction) */
+                    currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4;
+                }
+
+                /* save best solution */
+                if (currentMl > ml) {
+                    ml = currentMl;
+                    *offsetPtr = curr - (matchIndex + ddsIndexDelta) + ZSTD_REP_MOVE;
+                    if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */
+                }
             }
         }
     } else if (dictMode == ZSTD_dictMatchState) {
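The lookup above spends its budget in two phases: at most bucketSize - 1
probes come straight from the bucket's cache slots, and whatever remains of
nbAttempts walks the pre-sorted chain segment, capped by the stored 8-bit
length. A worked sketch of the arithmetic, assuming
ZSTD_LAZY_DDSS_BUCKET_LOG == 2 (its definition lives in zstd_lazy.h, outside
this diff) and a hypothetical nbAttempts of 16:

    U32 const bucketSize    = 1u << 2;                      /* 4 slots per bucket */
    U32 const bucketLimit   = 16 < bucketSize - 1 ? 16 : bucketSize - 1;
                                                            /* = 3 cached probes  */
    U32 const chainAttempts = 16 - bucketLimit;             /* = 13 budget left   */
    /* chainLimit = min(chainAttempts, chainLength), with chainLength <= 255
     * read from the low byte of the packed pointer. */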
@@ -763,6 +872,12 @@ ZSTD_compressBlock_lazy_generic(
                         ZSTD_matchState_t* ms,
                         const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
 
+    /**
+     * This table is indexed first by the four ZSTD_dictMode_e values, and then
+     * by the two searchMethod_e values. NULLs are placed for configurations
+     * that should never occur (extDict modes go to the other implementation
+     * below and there is no DDSS for binary tree search yet).
+     */
     const searchMax_f searchFuncs[4][2] = {
         {
             ZSTD_HcFindBestMatch_selectMLS,
@@ -787,16 +902,13 @@ ZSTD_compressBlock_lazy_generic(
 
     const int isDMS = dictMode == ZSTD_dictMatchState;
     const int isDDS = dictMode == ZSTD_dedicatedDictSearch;
+    const int isDxS = isDMS || isDDS;
     const ZSTD_matchState_t* const dms = ms->dictMatchState;
-    const U32 dictLowestIndex      = isDMS || isDDS ?
-                                     dms->window.dictLimit : 0;
-    const BYTE* const dictBase     = isDMS || isDDS ?
-                                     dms->window.base : NULL;
-    const BYTE* const dictLowest   = isDMS || isDDS ?
-                                     dictBase + dictLowestIndex : NULL;
-    const BYTE* const dictEnd      = isDMS || isDDS ?
-                                     dms->window.nextSrc : NULL;
-    const U32 dictIndexDelta       = isDMS || isDDS ?
+    const U32 dictLowestIndex      = isDxS ? dms->window.dictLimit : 0;
+    const BYTE* const dictBase     = isDxS ? dms->window.base : NULL;
+    const BYTE* const dictLowest   = isDxS ? dictBase + dictLowestIndex : NULL;
+    const BYTE* const dictEnd      = isDxS ? dms->window.nextSrc : NULL;
+    const U32 dictIndexDelta       = isDxS ?
                                      prefixLowestIndex - (U32)(dictEnd - dictBase) :
                                      0;
     const U32 dictAndPrefixLength  = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
@@ -814,7 +926,7 @@ ZSTD_compressBlock_lazy_generic(
         if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
         if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
     }
-    if (isDMS || isDDS) {
+    if (isDxS) {
         /* dictMatchState repCode checks don't currently handle repCode == 0
          * disabling. */
         assert(offset_1 <= dictAndPrefixLength);
@@ -834,7 +946,7 @@ ZSTD_compressBlock_lazy_generic(
         const BYTE* start=ip+1;
 
         /* check repCode */
-        if (isDMS || isDDS) {
+        if (isDxS) {
             const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
             const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch)
                                 && repIndex < prefixLowestIndex) ?
@@ -877,7 +989,7 @@ ZSTD_compressBlock_lazy_generic(
                 if ((mlRep >= 4) && (gain2 > gain1))
                     matchLength = mlRep, offset = 0, start = ip;
             }
-            if (isDMS || isDDS) {
+            if (isDxS) {
                 const U32 repIndex = (U32)(ip - base) - offset_1;
                 const BYTE* repMatch = repIndex < prefixLowestIndex ?
                                dictBase + (repIndex - dictIndexDelta) :
@@ -912,7 +1024,7 @@ ZSTD_compressBlock_lazy_generic(
                     if ((mlRep >= 4) && (gain2 > gain1))
                         matchLength = mlRep, offset = 0, start = ip;
                 }
-                if (isDMS || isDDS) {
+                if (isDxS) {
                     const U32 repIndex = (U32)(ip - base) - offset_1;
                     const BYTE* repMatch = repIndex < prefixLowestIndex ?
                                dictBase + (repIndex - dictIndexDelta) :
@@ -950,7 +1062,7 @@ ZSTD_compressBlock_lazy_generic(
                  && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) )  /* only search for offset within prefix */
                 { start--; matchLength++; }
         }
-        if (isDMS || isDDS) {
+        if (isDxS) {
             U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE));
             const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex;
             const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest;
@@ -966,7 +1078,7 @@ _storeSequence:
         }
 
         /* check immediate repcode */
-        if (isDMS || isDDS) {
+        if (isDxS) {
             while (ip <= ilimit) {
                 U32 const current2 = (U32)(ip-base);
                 U32 const repIndex = current2 - offset_2;
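The table comment added in the hunk above describes a [4][2] lookup keyed by
dict mode and search method. The selection line itself falls outside this
diff's context; presumably it reads roughly as follows (a hypothetical
reconstruction, not shown in the patch):

    /* Sketch: row = dictMode, column = search method. */
    searchMax_f const searchMax =
        searchFuncs[(int)dictMode][searchMethod == search_binaryTree];
    assert(searchMax != NULL); /* NULL marks combinations that cannot occur */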
diff --git a/lib/zstd.h b/lib/zstd.h
index e42a4bd49..f8d5e84da 100644
--- a/lib/zstd.h
+++ b/lib/zstd.h
@@ -1548,15 +1548,16 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre
 #define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7
 
 /* Controls whether the new and experimental "dedicated dictionary search
- * structure" can be used.
+ * structure" can be used. This feature is still rough around the edges, be
+ * prepared for surprising behavior!
  *
  * How to use it:
  *
  * When using a CDict, whether to use this feature or not is controlled at
  * CDict creation, and it must be set in a CCtxParams set passed into that
- * construction. A compression will then use the feature or not based on how
- * the CDict was constructed; the value of this param, set in the CCtx, will
- * have no effect.
+ * construction (via ZSTD_createCDict_advanced2()). A compression will then
+ * use the feature or not based on how the CDict was constructed; the value of
+ * this param, set in the CCtx, will have no effect.
  *
  * However, when a dictionary buffer is passed into a CCtx, such as via
  * ZSTD_CCtx_loadDictionary(), this param can be set on the CCtx to control
@@ -1578,10 +1579,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre
 * written as the compression goes along. This means we can choose a search
 * structure for the dictionary that is read-optimized.
 *
- * This feature enables the use of that different structure. Note that this
- * means that the CDict tables can no longer be copied into the CCtx, so
- * the dict attachment mode ZSTD_dictForceCopy will no longer be useable. The
- * dictionary can only be attached or reloaded.
+ * This feature enables the use of that different structure.
+ *
+ * Note that some of the members of the ZSTD_compressionParameters struct have
+ * different semantics and constraints in the dedicated search structure. It is
+ * highly recommended that you simply set a compression level in the CCtxParams
+ * you pass into the CDict creation call, and avoid messing with the cParams
+ * directly.
 *
 * Effects:
 *
@@ -1589,9 +1593,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* pre
 * implementation supports this feature. Currently, that's limited to
 * ZSTD_greedy, ZSTD_lazy, and ZSTD_lazy2.
 *
- * In general, you should expect compression to be faster, and CDict creation
- * to be slightly slower. Eventually, we will probably make this mode the
- * default.
+ * Note that this means that the CDict tables can no longer be copied into the
+ * CCtx, so the dict attachment mode ZSTD_dictForceCopy will no longer be
+ * useable. The dictionary can only be attached or reloaded.
+ *
+ * In general, you should expect compression to be faster--sometimes very much
+ * so--and CDict creation to be slightly slower. Eventually, we will probably
+ * make this mode the default.
  */
 #define ZSTD_c_enableDedicatedDictSearch ZSTD_c_experimentalParam8
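Since the header text describes the setup only in prose, a compact usage
sketch may help; error handling is elided, and the dictionary buffer, its
size, and the level are placeholders:

    /* Sketch: building and using a DDSS-enabled CDict, following the
     * recommendation above to set only a compression level. */
    ZSTD_CCtx_params* const params = ZSTD_createCCtxParams();
    ZSTD_CCtxParams_init(params, 3 /* placeholder level */);
    ZSTD_CCtxParams_setParameter(params, ZSTD_c_enableDedicatedDictSearch, 1);
    {   ZSTD_CDict* const cdict = ZSTD_createCDict_advanced2(
                dictBuffer, dictSize, /* placeholder dictionary */
                ZSTD_dlm_byCopy, ZSTD_dct_auto, params, ZSTD_defaultCMem);
        ZSTD_CCtx* const cctx = ZSTD_createCCtx();
        ZSTD_CCtx_refCDict(cctx, cdict); /* compressions now use the DDSS */
        /* ... ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize) ... */
        ZSTD_freeCCtx(cctx);
        ZSTD_freeCDict(cdict);
    }
    ZSTD_freeCCtxParams(params);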
diff --git a/tests/fuzzer.c b/tests/fuzzer.c
index c354c4212..52086c7f3 100644
--- a/tests/fuzzer.c
+++ b/tests/fuzzer.c
@@ -571,7 +571,7 @@ static int basicUnitTests(U32 const seed, double compressibility)
         r = ZSTD_decompress(decodedBuffer, CNBuffSize, compressedBuffer, cSize);
         if (!ZSTD_isError(r)) goto _output_error;
         if (ZSTD_getErrorCode(r) != ZSTD_error_checksum_wrong) goto _output_error;
-        
+
         CHECK_Z(ZSTD_DCtx_setParameter(dctx, ZSTD_d_forceIgnoreChecksum, ZSTD_d_ignoreChecksum));
         r = ZSTD_decompressDCtx(dctx, decodedBuffer, CNBuffSize, compressedBuffer, cSize-1);
         if (!ZSTD_isError(r)) goto _output_error; /* wrong checksum size should still throw error */
@@ -2926,7 +2926,7 @@ static int basicUnitTests(U32 const seed, double compressibility)
     {   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
         ZSTD_DCtx* const dctx = ZSTD_createDCtx();
-        size_t dictSize = CNBuffSize > 110 KB ? 110 KB : CNBuffSize;
+        size_t dictSize = CNBuffSize;
         void* dict = (void*)malloc(dictSize);
         ZSTD_CCtx_params* cctx_params = ZSTD_createCCtxParams();
         ZSTD_dictAttachPref_e const attachPrefs[] = {
@@ -2934,10 +2934,13 @@ static int basicUnitTests(U32 const seed, double compressibility)
             ZSTD_dictForceAttach,
             ZSTD_dictForceCopy,
             ZSTD_dictForceLoad,
-            ZSTD_dictForceAttach
+            ZSTD_dictDefaultAttach,
+            ZSTD_dictForceAttach,
+            ZSTD_dictForceCopy,
+            ZSTD_dictForceLoad
         };
-        int const enableDedicatedDictSearch[] = {0, 0, 0, 0, 1};
-        int const cLevel = 6;
+        int const enableDedicatedDictSearch[] = {0, 0, 0, 0, 1, 1, 1, 1};
+        int cLevel;
         int i;
 
         RDG_genBuffer(dict, dictSize, 0.5, 0.5, seed);
@@ -2945,28 +2948,35 @@ static int basicUnitTests(U32 const seed, double compressibility)
 
         CHECK(cctx_params != NULL);
 
-        for (i = 0; i < 5; ++i) {
-            ZSTD_dictAttachPref_e const attachPref = attachPrefs[i];
-            int const enableDDS = enableDedicatedDictSearch[i];
-            ZSTD_CDict* cdict;
+        for (dictSize = CNBuffSize; dictSize; dictSize = dictSize >> 3) {
+            DISPLAYLEVEL(3, "\n  Testing with dictSize %u ", (U32)dictSize);
+            for (cLevel = 4; cLevel < 13; cLevel++) {
+                for (i = 0; i < 8; ++i) {
+                    ZSTD_dictAttachPref_e const attachPref = attachPrefs[i];
+                    int const enableDDS = enableDedicatedDictSearch[i];
+                    ZSTD_CDict* cdict;
 
-            DISPLAYLEVEL(5, "\n    iter %d ", i);
+                    DISPLAYLEVEL(5, "\n    dictSize %u cLevel %d iter %d ", (U32)dictSize, cLevel, i);
 
-            ZSTD_CCtxParams_init(cctx_params, cLevel);
-            CHECK_Z(ZSTD_CCtxParams_setParameter(cctx_params, ZSTD_c_enableDedicatedDictSearch, enableDDS));
+                    ZSTD_CCtxParams_init(cctx_params, cLevel);
+                    CHECK_Z(ZSTD_CCtxParams_setParameter(cctx_params, ZSTD_c_enableDedicatedDictSearch, enableDDS));
 
-            cdict = ZSTD_createCDict_advanced2(dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, cctx_params, ZSTD_defaultCMem);
-            CHECK(cdict != NULL);
+                    cdict = ZSTD_createCDict_advanced2(dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, cctx_params, ZSTD_defaultCMem);
+                    CHECK(cdict != NULL);
 
-            CHECK_Z(ZSTD_CCtx_refCDict(cctx, cdict));
-            CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_forceAttachDict, attachPref));
+                    CHECK_Z(ZSTD_CCtx_refCDict(cctx, cdict));
+                    CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_forceAttachDict, attachPref));
 
-            cSize = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, CNBuffSize);
-            CHECK_Z(cSize);
-            CHECK_Z(ZSTD_decompress_usingDict(dctx, decodedBuffer, CNBuffSize, compressedBuffer, cSize, dict, dictSize));
+                    cSize = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, CNBuffer, CNBuffSize);
+                    CHECK_Z(cSize);
+                    CHECK_Z(ZSTD_decompress_usingDict(dctx, decodedBuffer, CNBuffSize, compressedBuffer, cSize, dict, dictSize));
 
-            CHECK_Z(ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters));
-            ZSTD_freeCDict(cdict);
+                    DISPLAYLEVEL(5, "compressed to %u bytes ", (U32)cSize);
+
+                    CHECK_Z(ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters));
+                    ZSTD_freeCDict(cdict);
+                }
+            }
         }
 
         ZSTD_freeCCtx(cctx);