Add debug logging and simple repro

This commit is contained in:
Neil Johari 2025-09-15 23:58:45 -07:00
parent 22b4483163
commit 96fdb9bd16
No known key found for this signature in database
GPG Key ID: 4087615AB84D394E
2 changed files with 64 additions and 0 deletions

View File

@ -279,10 +279,16 @@ static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t
for (n=0; n<nbFiles; n++) {
S64 const fileSize = DiB_getFileSize(fileNamesTable[n]);
DISPLAYLEVEL(1, "[DEBUG] File '%s': size=%lld\n", fileNamesTable[n], (long long)fileSize);
/* TODO: is there a minimum sample size? What if the file is 1-byte? */
if (fileSize == 0) {
DISPLAYLEVEL(3, "Sample file '%s' has zero size, skipping...\n", fileNamesTable[n]);
continue;
} else if (fileSize < 0) {
/* BUG: This path is NOT skipped but should be! */
DISPLAYLEVEL(1, "[BUG] File '%s' has NEGATIVE size %lld but is NOT skipped!\n",
fileNamesTable[n], (long long)fileSize);
}
/* the case where we are breaking up files in sample chunks */
@ -290,6 +296,8 @@ static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t
/* TODO: is there a minimum sample size? Can we have a 1-byte sample? */
fs.nbSamples += (int)((fileSize + chunkSize-1) / chunkSize);
fs.totalSizeToLoad += fileSize;
DISPLAYLEVEL(1, "[DEBUG] After chunked file: nbSamples=%d, totalSizeToLoad=%lld\n",
fs.nbSamples, (long long)fs.totalSizeToLoad);
}
else {
/* the case where one file is one sample */
@ -303,9 +311,14 @@ static fileStats DiB_fileStats(const char** fileNamesTable, int nbFiles, size_t
}
fs.nbSamples += 1;
fs.totalSizeToLoad += MIN(fileSize, SAMPLESIZE_MAX);
DISPLAYLEVEL(1, "[DEBUG] After single file: nbSamples=%d, totalSizeToLoad=%lld\n",
fs.nbSamples, (long long)fs.totalSizeToLoad);
}
}
DISPLAYLEVEL(4, "Found training data %d files, %d KB, %d samples\n", nbFiles, (int)(fs.totalSizeToLoad / (1 KB)), fs.nbSamples);
DISPLAYLEVEL(1, "[DEBUG FINAL] fileStats: nbSamples=%d, totalSizeToLoad=%lld (%s)\n",
fs.nbSamples, (long long)fs.totalSizeToLoad,
fs.totalSizeToLoad < 0 ? "NEGATIVE!" : "ok");
return fs;
}
@ -344,11 +357,18 @@ int DiB_trainFromFiles(const char* dictFileName, size_t maxDictSize,
/* Limit the size of the training data to 2GB */
/* TODO: there is opportunity to stop DiB_fileStats() early when the data limit is reached */
loadedSize = (size_t)MIN( MIN((S64)maxMem, fs.totalSizeToLoad), MAX_SAMPLES_SIZE );
DISPLAYLEVEL(1, "[DEBUG] Memory calc: totalSizeToLoad=%lld, maxMem=%zu, loadedSize=%zu (0x%zx)\n",
(long long)fs.totalSizeToLoad, maxMem, loadedSize, loadedSize);
if (fs.totalSizeToLoad < 0) {
DISPLAYLEVEL(1, "[BUG] totalSizeToLoad is NEGATIVE! This will cause allocation issues!\n");
}
if (memLimit != 0) {
DISPLAYLEVEL(2, "! Warning : setting manual memory limit for dictionary training data at %u MB \n",
(unsigned)(memLimit / (1 MB)));
loadedSize = (size_t)MIN(loadedSize, memLimit);
}
DISPLAYLEVEL(1, "[DEBUG] About to malloc: srcBuffer size=%zu, sampleSizes array size=%zu\n",
loadedSize+NOISELENGTH, (size_t)(fs.nbSamples * sizeof(size_t)));
srcBuffer = malloc(loadedSize+NOISELENGTH);
sampleSizes = (size_t*)malloc(fs.nbSamples * sizeof(size_t));
}

44
test_allocation_bug.sh Executable file
View File

@ -0,0 +1,44 @@
#!/bin/bash
#
# Repro script for the allocation problem seen when dictionary training is
# given file names that do not exist.
#
# What it does:
#   1. Recreates ./alloc_test with 5 small, valid sample files.
#   2. Invokes `zstd --train` on those files plus 1000 paths that do not exist.
#   3. Filters the combined stdout/stderr down to the debug/bug/error lines.
#
# Expectation encoded in this script: every missing file contributes -1 to
# totalSizeToLoad, so the total ends up negative (30 - 1000 = -970).
#
# Usage: run from the directory that contains the zstd binary, or point
# ZSTD_BIN at it:  ZSTD_BIN=/path/to/zstd ./test_allocation_bug.sh
# NOTE: ./alloc_test is deleted and recreated on every run, and is left in
# place afterwards.

set -u

ZSTD_BIN="${ZSTD_BIN:-./zstd}"
WORK_DIR="alloc_test"

echo "=== Test to show allocation bug with negative totalSizeToLoad ==="
echo ""
echo "We need at least 5 samples to pass the minimum check"
echo ""

# Fail loudly when the binary is missing: the grep filter below would
# otherwise swallow the shell's "No such file" message and print nothing.
if [ ! -x "$ZSTD_BIN" ]; then
    echo "Error: zstd binary '$ZSTD_BIN' not found or not executable" >&2
    exit 1
fi

# Clean up
rm -rf "$WORK_DIR"
if ! mkdir "$WORK_DIR"; then
    echo "Error: cannot create working directory '$WORK_DIR'" >&2
    exit 1
fi

# Create exactly 5 valid files (minimum to not exit early)
echo "Creating 5 valid files (minimum required)..."
for i in {1..5}; do
    echo "data$i" > "$WORK_DIR/good_$i.txt"
done
echo "Valid files created (about 6 bytes each = 30 bytes total)"
echo ""

# We need enough bad files to make totalSizeToLoad negative
# 30 bytes positive, so we need at least 31 bad files
echo "Adding 1000 non-existent files to make totalSizeToLoad very negative..."
echo "Expected: totalSizeToLoad = 30 + (1000 * -1) = -970 bytes"
echo ""

# Build the command as an array so each path stays a single argument and no
# eval/re-parsing of a command string is needed.
cmd=("$ZSTD_BIN" --train "$WORK_DIR"/good_*.txt)
for i in {1..1000}; do
    cmd+=("$WORK_DIR/BAD_$i")
done
cmd+=(-o "$WORK_DIR/dict.zst" --maxdict=65536)

echo "Running command..."
echo "================="
# Run and capture ALL debug output related to our issue
"${cmd[@]}" 2>&1 | grep -E "\[DEBUG FINAL\]|\[DEBUG\] Memory calc|\[BUG\]|About to malloc|Error|not enough memory"

echo ""
echo "Output should show something like the following:"
echo "1. [DEBUG FINAL] fileStats: totalSizeToLoad=-970 (NEGATIVE!)"
echo "2. [BUG] totalSizeToLoad is NEGATIVE!"
echo "3. [DEBUG] Memory calc: showing huge loadedSize value"
echo "4. Error about memory allocation"