From 2b0a271ed216169d96dafed32713b9a6cb5f0697 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Fri, 6 Sep 2019 14:30:13 -0700
Subject: [PATCH 1/3] fix eductional decoder

fix #1774
also :
- fix minor compilation warnings
- make sure the `test` is run during CI tests
---
 Makefile                                  |  1 +
 doc/educational_decoder/Makefile          |  5 +++-
 doc/educational_decoder/zstd_decompress.c | 28 +++++++++++------------
 doc/educational_decoder/zstd_decompress.h |  4 +++-
 lib/dictBuilder/cover.c                   |  4 ++--
 5 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/Makefile b/Makefile
index acf50cfa3..da3112e30 100644
--- a/Makefile
+++ b/Makefile
@@ -69,6 +69,7 @@ test: MOREFLAGS += -g -DDEBUGLEVEL=$(DEBUGLEVEL) -Werror
 test:
 	MOREFLAGS="$(MOREFLAGS)" $(MAKE) -j -C $(PRGDIR) allVariants
 	$(MAKE) -C $(TESTDIR) $@
+	$(MAKE) -C doc/educational_decoder test
 
 ## shortest: same as `make check`
 .PHONY: shortest
diff --git a/doc/educational_decoder/Makefile b/doc/educational_decoder/Makefile
index c1d2c4cc4..ed3f3fef7 100644
--- a/doc/educational_decoder/Makefile
+++ b/doc/educational_decoder/Makefile
@@ -26,7 +26,10 @@ test: harness
 	@./harness tmp.zst tmp
 	@diff -s tmp README.md
 	@$(RM) -f tmp*
-	@zstd --train harness.c zstd_decompress.c zstd_decompress.h README.md
+	# present files for training multiple times, to reach minimum threshold
+	@zstd --train harness.c zstd_decompress.c zstd_decompress.h README.md \
+                  harness.c zstd_decompress.c zstd_decompress.h README.md \
+                  harness.c zstd_decompress.c zstd_decompress.h README.md
 	@zstd -D dictionary README.md -o tmp.zst
 	@./harness tmp.zst tmp dictionary
 	@diff -s tmp README.md
diff --git a/doc/educational_decoder/zstd_decompress.c b/doc/educational_decoder/zstd_decompress.c
index 8e231bbb5..26143cd1a 100644
--- a/doc/educational_decoder/zstd_decompress.c
+++ b/doc/educational_decoder/zstd_decompress.c
@@ -395,7 +395,7 @@ size_t ZSTD_decompress_with_dict(void *const dst, const size_t dst_len,
     /* this decoder assumes decompression of a single frame */
     decode_frame(&out, &in, parsed_dict);
 
-    return out.ptr - (u8 *)dst;
+    return (size_t)(out.ptr - (u8 *)dst);
 }
 
 /******* FRAME DECODING ******************************************************/
@@ -416,7 +416,7 @@ static void decompress_data(frame_context_t *const ctx, ostream_t *const out,
 
 static void decode_frame(ostream_t *const out, istream_t *const in,
                          const dictionary_t *const dict) {
-    const u32 magic_number = IO_read_bits(in, 32);
+    const u32 magic_number = (u32)IO_read_bits(in, 32);
     // Zstandard frame
     //
     // "Magic_Number
@@ -497,7 +497,7 @@ static void parse_frame_header(frame_header_t *const header,
     // 3    Reserved_bit
     // 2    Content_Checksum_flag
     // 1-0  Dictionary_ID_flag"
-    const u8 descriptor = IO_read_bits(in, 8);
+    const u8 descriptor = (u8)IO_read_bits(in, 8);
 
     // decode frame header descriptor into flags
     const u8 frame_content_size_flag = descriptor >> 6;
@@ -521,7 +521,7 @@ static void parse_frame_header(frame_header_t *const header,
         //
         // Bit numbers  7-3         2-0
         // Field name   Exponent    Mantissa"
-        u8 window_descriptor = IO_read_bits(in, 8);
+        u8 window_descriptor = (u8)IO_read_bits(in, 8);
         u8 exponent = window_descriptor >> 3;
         u8 mantissa = window_descriptor & 7;
 
@@ -541,7 +541,7 @@ static void parse_frame_header(frame_header_t *const header,
         const int bytes_array[] = {0, 1, 2, 4};
         const int bytes = bytes_array[dictionary_id_flag];
 
-        header->dictionary_id = IO_read_bits(in, bytes * 8);
+        header->dictionary_id = (u32)IO_read_bits(in, bytes * 8);
     } else {
         header->dictionary_id = 0;
     }
@@ -633,8 +633,8 @@ static void decompress_data(frame_context_t *const ctx, ostream_t *const out,
         //
         // The next 2 bits represent the Block_Type, while the remaining 21 bits
         // represent the Block_Size. Format is little-endian."
-        last_block = IO_read_bits(in, 1);
-        const int block_type = IO_read_bits(in, 2);
+        last_block = (int)IO_read_bits(in, 1);
+        const int block_type = (int)IO_read_bits(in, 2);
         const size_t block_len = IO_read_bits(in, 21);
 
         switch (block_type) {
@@ -748,8 +748,8 @@ static size_t decode_literals(frame_context_t *const ctx, istream_t *const in,
     // types"
     //
     // size_format takes between 1 and 2 bits
-    int block_type = IO_read_bits(in, 2);
-    int size_format = IO_read_bits(in, 2);
+    int block_type = (int)IO_read_bits(in, 2);
+    int size_format = (int)IO_read_bits(in, 2);
 
     if (block_type <= 1) {
         // Raw or RLE literals block
@@ -1005,7 +1005,7 @@ static const i16 SEQ_MATCH_LENGTH_DEFAULT_DIST[53] = {
 static const u32 SEQ_LITERAL_LENGTH_BASELINES[36] = {
     0,  1,  2,   3,   4,   5,    6,    7,    8,    9,     10,    11,
     12, 13, 14,  15,  16,  18,   20,   22,   24,   28,    32,    40,
-    48, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65538};
+    48, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536};
 static const u8 SEQ_LITERAL_LENGTH_EXTRA_BITS[36] = {
     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  0,  0,  0,  0,  1,  1,
     1, 1, 2, 2, 3, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
@@ -1021,7 +1021,7 @@ static const u8 SEQ_MATCH_LENGTH_EXTRA_BITS[53] = {
     2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
 
 /// Offset decoding is simpler so we just need a maximum code value
-static const u8 SEQ_MAX_CODES[3] = {35, -1, 52};
+static const u8 SEQ_MAX_CODES[3] = {35, (u8)-1, 52};
 
 static void decompress_sequences(frame_context_t *const ctx,
                                  istream_t *const in,
@@ -1132,7 +1132,7 @@ static void decompress_sequences(frame_context_t *const ctx, istream_t *in,
     // a single 1-bit and then fills the byte with 0-7 0 bits of padding."
     const int padding = 8 - highest_set_bit(src[len - 1]);
     // The offset starts at the end because FSE streams are read backwards
-    i64 bit_offset = len * 8 - padding;
+    i64 bit_offset = (i64)(len * 8 - (size_t)padding);
 
     // "The bitstream starts with initial state values, each using the required
     // number of bits in their respective accuracy, decoded previously from
@@ -1409,7 +1409,7 @@ size_t ZSTD_get_decompressed_size(const void *src, const size_t src_len) {
 
     // get decompressed size from ZSTD frame header
     {
-        const u32 magic_number = IO_read_bits(&in, 32);
+        const u32 magic_number = (u32)IO_read_bits(&in, 32);
 
         if (magic_number == 0xFD2FB528U) {
             // ZSTD frame
@@ -1418,7 +1418,7 @@ size_t ZSTD_get_decompressed_size(const void *src, const size_t src_len) {
 
             if (header.frame_content_size == 0 && !header.single_segment_flag) {
                 // Content size not provided, we can't tell
-                return -1;
+                return (size_t)-1;
             }
 
             return header.frame_content_size;
diff --git a/doc/educational_decoder/zstd_decompress.h b/doc/educational_decoder/zstd_decompress.h
index a01fde331..74b185338 100644
--- a/doc/educational_decoder/zstd_decompress.h
+++ b/doc/educational_decoder/zstd_decompress.h
@@ -7,6 +7,8 @@
  * in the COPYING file in the root directory of this source tree).
  */
 
+#include <stddef.h>   /* size_t */
+
 /******* EXPOSED TYPES ********************************************************/
 /*
 * Contains the parsed contents of a dictionary
@@ -39,7 +41,7 @@ size_t ZSTD_get_decompressed_size(const void *const src, const size_t src_len);
  * Return a valid dictionary_t pointer for use with dictionary initialization
  * or decompression
  */
-dictionary_t* create_dictionary();
+dictionary_t* create_dictionary(void);
 
 /*
  * Parse a provided dictionary blob for use in decompression
diff --git a/lib/dictBuilder/cover.c b/lib/dictBuilder/cover.c
index 4721205da..2e129dd91 100644
--- a/lib/dictBuilder/cover.c
+++ b/lib/dictBuilder/cover.c
@@ -638,8 +638,8 @@ void COVER_warnOnSmallCorpus(size_t maxDictSize, size_t nbDmers, int displayLeve
                     "compared to the source size %u! "
                     "size(source)/size(dictionary) = %f, but it should be >= "
                     "10! This may lead to a subpar dictionary! We recommend "
-                    "training on sources at least 10x, and up to 100x the "
-                    "size of the dictionary!\n", (U32)maxDictSize,
+                    "training on sources at least 10x, and preferably 100x "
+                    "the size of the dictionary! \n", (U32)maxDictSize,
                     (U32)nbDmers, ratio);
 }
 

From a3815d233c0ad35d6da7e03e5d71ac71b303977f Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Fri, 6 Sep 2019 16:51:16 -0700
Subject: [PATCH 2/3] fix minor compilation condition

harness.c is not designed to pass  -Wdeclaration-after-statement
---
 doc/educational_decoder/Makefile          | 6 +++---
 doc/educational_decoder/harness.c         | 2 +-
 doc/educational_decoder/zstd_decompress.c | 1 +
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/doc/educational_decoder/Makefile b/doc/educational_decoder/Makefile
index ed3f3fef7..fc0803755 100644
--- a/doc/educational_decoder/Makefile
+++ b/doc/educational_decoder/Makefile
@@ -6,10 +6,10 @@ CPPFLAGS += -I$(ZSTDDIR) -I$(ZSTDDIR)/common -I$(ZSTDDIR)/compress \
             -I$(ZSTDDIR)/dictBuilder -I$(ZSTDDIR)/deprecated -I$(PRGDIR)
 CFLAGS   ?= -O3
 CFLAGS   += -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow                 \
-            -Wstrict-aliasing=1 -Wswitch-enum -Wdeclaration-after-statement \
-            -Wstrict-prototypes -Wundef                                     \
+            -Wstrict-aliasing=1 -Wswitch-enum                               \
+            -Wredundant-decls -Wstrict-prototypes -Wundef                   \
             -Wvla -Wformat=2 -Winit-self -Wfloat-equal -Wwrite-strings      \
-            -Wredundant-decls
+            -std=c99
 CFLAGS   += $(DEBUGFLAGS)
 CFLAGS   += $(MOREFLAGS)
 FLAGS     = $(CPPFLAGS) $(CFLAGS) $(LDFLAGS) $(MULTITHREAD_LDFLAGS)
diff --git a/doc/educational_decoder/harness.c b/doc/educational_decoder/harness.c
index 47882b168..36f3967a9 100644
--- a/doc/educational_decoder/harness.c
+++ b/doc/educational_decoder/harness.c
@@ -33,7 +33,7 @@ size_t read_file(const char *path, u8 **ptr) {
     }
 
     fseek(f, 0L, SEEK_END);
-    size_t size = ftell(f);
+    size_t size = (size_t)ftell(f);
     rewind(f);
 
     *ptr = malloc(size);
diff --git a/doc/educational_decoder/zstd_decompress.c b/doc/educational_decoder/zstd_decompress.c
index 26143cd1a..f3e1b848f 100644
--- a/doc/educational_decoder/zstd_decompress.c
+++ b/doc/educational_decoder/zstd_decompress.c
@@ -833,6 +833,7 @@ static size_t decode_literals_compressed(frame_context_t *const ctx,
         // bits (0-1023)."
         num_streams = 1;
     // Fall through as it has the same size format
+        /* fallthrough */
     case 1:
         // "4 streams. Both Compressed_Size and Regenerated_Size use 10 bits
         // (0-1023)."

From b9b9a1c8e9b2a0e27c485c691a46d695e8c90c95 Mon Sep 17 00:00:00 2001
From: Yann Collet <cyan@fb.com>
Date: Tue, 10 Sep 2019 09:36:02 -0700
Subject: [PATCH 3/3] fix education decoder test

when `zstd` in not installed on local system
by allowing `ZSTD` variable to hold a custom location for the binary
---
 Makefile                         |  2 +-
 doc/educational_decoder/Makefile | 33 ++++++++++++++++++++++++--------
 2 files changed, 26 insertions(+), 9 deletions(-)

diff --git a/Makefile b/Makefile
index da3112e30..efb555c35 100644
--- a/Makefile
+++ b/Makefile
@@ -69,7 +69,7 @@ test: MOREFLAGS += -g -DDEBUGLEVEL=$(DEBUGLEVEL) -Werror
 test:
 	MOREFLAGS="$(MOREFLAGS)" $(MAKE) -j -C $(PRGDIR) allVariants
 	$(MAKE) -C $(TESTDIR) $@
-	$(MAKE) -C doc/educational_decoder test
+	ZSTD=../../programs/zstd $(MAKE) -C doc/educational_decoder test
 
 ## shortest: same as `make check`
 .PHONY: shortest
diff --git a/doc/educational_decoder/Makefile b/doc/educational_decoder/Makefile
index fc0803755..b2ed9f33d 100644
--- a/doc/educational_decoder/Makefile
+++ b/doc/educational_decoder/Makefile
@@ -1,10 +1,21 @@
+# ################################################################
+# Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
+# All rights reserved.
+#
+# This source code is licensed under both the BSD-style license (found in the
+# LICENSE file in the root directory of this source tree) and the GPLv2 (found
+# in the COPYING file in the root directory of this source tree).
+# ################################################################
+
+ZSTD ?= zstd   # requires zstd installation on local system
+DIFF ?= diff
 HARNESS_FILES=*.c
 
 MULTITHREAD_LDFLAGS = -pthread
 DEBUGFLAGS= -g -DZSTD_DEBUG=1
 CPPFLAGS += -I$(ZSTDDIR) -I$(ZSTDDIR)/common -I$(ZSTDDIR)/compress \
             -I$(ZSTDDIR)/dictBuilder -I$(ZSTDDIR)/deprecated -I$(PRGDIR)
-CFLAGS   ?= -O3
+CFLAGS   ?= -O2
 CFLAGS   += -Wall -Wextra -Wcast-qual -Wcast-align -Wshadow                 \
             -Wstrict-aliasing=1 -Wswitch-enum                               \
             -Wredundant-decls -Wstrict-prototypes -Wundef                   \
@@ -22,16 +33,22 @@ clean:
 	@$(RM) -rf harness.dSYM
 
 test: harness
-	@zstd README.md -o tmp.zst
+	#
+	# Testing single-file decompression with educational decoder
+	#
+	@$(ZSTD) README.md -o tmp.zst
 	@./harness tmp.zst tmp
-	@diff -s tmp README.md
+	@$(DIFF) -s tmp README.md
 	@$(RM) -f tmp*
-	# present files for training multiple times, to reach minimum threshold
-	@zstd --train harness.c zstd_decompress.c zstd_decompress.h README.md \
+	#
+	# Testing dictionary decompression with education decoder
+	#
+	# note : files are presented multiple for training, to reach minimum threshold
+	@$(ZSTD) --train harness.c zstd_decompress.c zstd_decompress.h README.md \
                   harness.c zstd_decompress.c zstd_decompress.h README.md \
                   harness.c zstd_decompress.c zstd_decompress.h README.md
-	@zstd -D dictionary README.md -o tmp.zst
+	@$(ZSTD) -D dictionary README.md -o tmp.zst
 	@./harness tmp.zst tmp dictionary
-	@diff -s tmp README.md
+	@$(DIFF) -s tmp README.md
 	@$(RM) -f tmp* dictionary
-	@make clean
+	@$(MAKE) clean