#pragma once
// NOTE(review): in this copy of the header the names after each #include and
// the template argument lists (on std::pair, std::array, std::vector,
// std::unordered_map, std::unordered_set, std::unique_ptr) are absent. They
// are left exactly as found; restore them from the original file before
// compiling.
#include
#include
#include
#include
#include
#include
#include
#include

namespace bpecpp {

// A pair of strings treated as one key (see bigram_hash). Both elements must
// provide hashCode(); presumably icu::UnicodeString - confirm once the
// template arguments are restored.
typedef std::pair UnicodeBigram;

// Two-way lookup table between raw byte values and the codepoints used to
// represent them.
// NOTE(review): the ranges below look like the GPT-2 byte-level BPE
// "bytes to unicode" table - confirm.
class bpe_char_byte_table {
   public:
    // Fills both directions of the table for every byte value 0x00..0xff.
    bpe_char_byte_table() {
        // Number of bytes remapped so far; also the offset of the next
        // remapped codepoint above 255.
        int n = 0;
        // `byte` is a uint8_t and can never compare >= 256, so the loop is
        // bounded by the map size instead: every iteration inserts a distinct
        // codepoint, so the size reaches 256 right after byte 0xff has been
        // processed and the loop stops before the wrapped-around 0 is used.
        for (uint8_t byte = 0; m_codepoint_to_byte.size() < 256; byte++) {
            // Bytes in '!'..'~', 0xa1..0xac and 0xae..0xff keep their own
            // value as codepoint. (`byte <= 0xff` is always true for uint8_t.)
            bool keep = (byte >= '!' && byte <= '~') ||
                        (byte >= 0xa1 && byte <= 0xac) ||
                        (byte >= 0xae && byte <= 0xff);
            uint32_t codepoint = byte;
            if (!keep) {
                // Every other byte gets the next unused codepoint from 256 up.
                codepoint = 256 + n;
                n++;
            }
            m_byte_to_codepoint[byte] = codepoint;
            m_codepoint_to_byte[codepoint] = byte;
        };  // stray ';' after the loop body is an empty statement
    }

    // Returns the codepoint assigned to `byte`.
    uint32_t byte_to_codepoint(uint8_t byte) {
        return m_byte_to_codepoint[byte];
    }

    // Returns the byte assigned to `codepoint`. Uses .at(), so a codepoint
    // that is not in the table throws std::out_of_range.
    uint8_t codepoint_to_byte(uint32_t codepoint) {
        return m_codepoint_to_byte.at(codepoint);
    }

   private:
    // Indexed by byte value; presumably sized 256 - confirm once the template
    // arguments are restored.
    std::array m_byte_to_codepoint;
    // Inverse mapping, keyed by codepoint.
    std::unordered_map m_codepoint_to_byte;
};

// Hash functor for UnicodeBigram keys.
// NOTE(review): the two element hashes are simply added, so (a, b) and (b, a)
// always produce the same hash value.
struct bigram_hash {
    std::size_t operator()(const UnicodeBigram& pair) const {
        return pair.first.hashCode() + pair.second.hashCode();
    }
};

// Hash functor for icu::UnicodeString keys; forwards to hashCode().
struct icu_hash {
    std::size_t operator()(const icu::UnicodeString& us) const {
        return us.hashCode();
    }
};

// BPE tokenizer model. Only declarations are visible here; the member
// functions are defined elsewhere.
class BPE {
   public:
    // Builds the model from a vocabulary map and a list of merges.
    // NOTE(review): element types are not visible in this copy.
    BPE(const std::unordered_map &vocab, const std::vector> &merges);

    // Encodes `input` into a vector of tokens.
    std::vector encode(const std::string& input);

    // Decodes `tokens` back into a string.
    // NOTE(review): the exact effect of `valid_utf8` (default true) is not
    // visible from this header - check the implementation.
    std::string decode(const std::vector& tokens, bool valid_utf8 = true);

   private:
    std::unordered_map m_vocab;
    std::unordered_map m_reverse_vocab;
    std::unordered_map m_merges;
    // Byte <-> codepoint table, see bpe_char_byte_table.
    bpe_char_byte_table m_bs_table;

    // Processes one pre-tokenized piece and writes its results to `output`.
    void bpe(icu::UnicodeString token_pretoked, std::vector& output);
    // presumably the compiled pre-tokenization pattern - confirm
    std::unique_ptr m_pretok_re;
    // presumably NFC-normalizes `input` - confirm against the implementation
    std::string normalize_nfc(const std::string& input);
    // presumably splits `input` into pieces that are then passed to bpe()
    std::vector pretokenize(const std::string& input);
};

// For embedding tokenizer configs in the library. The generated headers
// initially constructed `string_view`s directly, *but* generating thousands of
// actual references into the buffer generates thousands of *relocations* and
// makes compilation rather slow. Delaying resolution of the real address into
// a string_view until runtime fixes that.
struct buf_ref {
    // Packing these into a single u32 reduces the size of the embedded
    // configs significantly (5.0MB->1.6MB).
    // 20 bits: offsets up to 2^20 - 1 into the buffer.
    uint32_t offset : 20;
    // 12 bits: lengths up to 4095.
    uint32_t length : 12;

    // Resolves this reference against `buf`: a view of `length` chars starting
    // at buf[offset]. The view does not own the data, so `buf` must outlive it.
    std::string_view into(const char* buf) const {
        return std::string_view(&buf[offset], length);
    }
};

// Additional-vocabulary entry in embedded form: `content` is a buf_ref that
// still has to be resolved against a buffer.
struct additional_vocab_item_embedded {
    uint32_t id;
    buf_ref content;
    bool special;
};

// Additional-vocabulary entry with `content` as a string_view;
// `special` defaults to false.
struct additional_vocab_item {
    uint32_t id;
    std::string_view content;
    bool special = false;
};

// Adapter for additional vocabulary items used together with a BPE model.
// Only declarations are visible here; the member functions are defined
// elsewhere.
class AdditionalVocabAdapter {
   public:
    // Builds the adapter from a list of vocabulary items.
    // NOTE(review): the element type is not visible in this copy.
    AdditionalVocabAdapter(const std::vector &vocab);

    // Encodes `input` using `bpemodel`.
    // NOTE(review): presumably `encode_special_tokens` controls whether
    // special added tokens in the input are matched - confirm.
    std::vector encode(const std::string& input, BPE& bpemodel, bool encode_special_tokens = true);

    // Decodes `tokens` using `bpemodel`.
    // NOTE(review): presumably `decode_special_tokens` controls whether
    // special tokens appear in the output and `valid_utf8` is passed on to
    // BPE::decode - confirm.
    std::string decode(const std::vector& tokens, BPE& bpemodel, bool decode_special_tokens = true, bool valid_utf8 = true);

   private:
    std::unordered_map m_token_to_id;
    std::unordered_map m_id_to_token;
    std::unordered_set m_special_ids;
    // presumably matches the added tokens inside input text - confirm
    std::regex m_addedtoken_re;
};

}  // namespace bpecpp