Compare commits: v3.1.1-web...main (305 commits)
(Per-commit details were not captured in this export; only the SHA1 column survived, running from b666d16db5 through 52e076e9a1.)
@@ -1,13 +1,17 @@
version: 2.1
setup: true
orbs:
-    path-filtering: circleci/path-filtering@0.0.1
+    path-filtering: circleci/path-filtering@1.3.0

workflows:
    version: 2.1
    generate-config:
        jobs:
            - path-filtering/filter:
+                filters:
+                    tags:
+                        only:
+                            - /.*/
                base-revision: main
                config-path: .circleci/continue_config.yml
                mapping: |
@@ -16,4 +20,3 @@ workflows:
                    gpt4all-bindings/python/.* run-python-workflow true
                    gpt4all-bindings/typescript/.* run-ts-workflow true
                    gpt4all-chat/.* run-chat-workflow true
                    .* run-default-workflow true
@@ -1,3 +1,3 @@
[codespell]
-ignore-words-list = blong, afterall, som, assistent, crasher
-skip = .git,*.pdf,*.svg,*.lock,*.ts
+ignore-words-list = blong, afterall, assistent, crasher, requestor
+skip = ./.git,./gpt4all-chat/translations,*.pdf,*.svg,*.lock
.gitignore (2 changes)
@@ -181,6 +181,8 @@ CMakeLists.txt.user
gpt4all-chat/models/*
build_*
build-*
cmake-build-*
/gpt4all-chat/tests/python/config.py

# IntelliJ
.idea/
.gitmodules (22 changes)
@@ -1,7 +1,25 @@
[submodule "llama.cpp-mainline"]
-    path = gpt4all-backend/llama.cpp-mainline
+    path = gpt4all-backend/deps/llama.cpp-mainline
    url = https://github.com/nomic-ai/llama.cpp.git
    branch = master
[submodule "gpt4all-chat/usearch"]
-    path = gpt4all-chat/usearch
+    path = gpt4all-chat/deps/usearch
    url = https://github.com/nomic-ai/usearch.git
+[submodule "gpt4all-chat/deps/SingleApplication"]
+    path = gpt4all-chat/deps/SingleApplication
+    url = https://github.com/nomic-ai/SingleApplication.git
+[submodule "gpt4all-chat/deps/fmt"]
+    path = gpt4all-chat/deps/fmt
+    url = https://github.com/fmtlib/fmt.git
+[submodule "gpt4all-chat/deps/DuckX"]
+    path = gpt4all-chat/deps/DuckX
+    url = https://github.com/nomic-ai/DuckX.git
+[submodule "gpt4all-chat/deps/QXlsx"]
+    path = gpt4all-chat/deps/QXlsx
+    url = https://github.com/nomic-ai/QXlsx.git
+[submodule "gpt4all-chat/deps/minja"]
+    path = gpt4all-chat/deps/minja
+    url = https://github.com/nomic-ai/minja.git
+[submodule "gpt4all-chat/deps/json"]
+    path = gpt4all-chat/deps/json
+    url = https://github.com/nlohmann/json.git
(maintainers list)
@@ -51,11 +51,6 @@ Thiago Ramos ([@thiagojramos](https://github.com/thiagojramos))<br/>
E-mail: thiagojramos@outlook.com<br/>
- pt\_BR translation

-Victor Emanuel ([@SINAPSA-IC](https://github.com/SINAPSA-IC))<br/>
-E-mail: contact@sinapsaro.ro<br/>
-Discord: `@sinapsa_ic_56124_99632`
-- ro\_RO translation
-
不知火 Shiranui ([@supersonictw](https://github.com/supersonictw))<br/>
E-mail: supersonic@livemail.tw<br/>
Discord: `@supersonictw`
@@ -77,6 +72,6 @@ Discord: `@Tim453`
- Flatpak

Jack ([@wuodoo](https://github.com/wuodoo))<br/>
-E-mail: 2296103047@qq.com><br/>
+E-mail: 2296103047@qq.com<br/>
Discord: `@mikage`
- zh\_CN translation
README.md (105 changes)
@@ -1,48 +1,77 @@
<h1 align="center">GPT4All</h1>

<p align="center">GPT4All runs large language models (LLMs) privately on everyday desktops & laptops. <br> <br> No API calls or GPUs required - you can just download the application and <a href="https://docs.gpt4all.io/gpt4all_desktop/quickstart.html#quickstart">get started</a>
<p align="center">
Now with support for DeepSeek R1 Distillations
</p>

<p align="center">
<a href="https://www.nomic.ai/gpt4all">Website</a> • <a href="https://docs.gpt4all.io">Documentation</a> • <a href="https://discord.gg/mGZE39AS3e">Discord</a> • <a href="https://www.youtube.com/watch?v=gQcZDXRVJok">YouTube Tutorial</a>
</p>

<p align="center">
GPT4All runs large language models (LLMs) privately on everyday desktops & laptops.
</p>
<p align="center">
No API calls or GPUs required - you can just download the application and <a href="https://docs.gpt4all.io/gpt4all_desktop/quickstart.html#quickstart">get started</a>.
</p>

<p align="center">
Read about what's new in <a href="https://www.nomic.ai/blog/tag/gpt4all">our blog</a>.
</p>
<p align="center">
<a href="https://nomic.ai/gpt4all/#newsletter-form">Subscribe to the newsletter</a>
</p>

https://github.com/nomic-ai/gpt4all/assets/70534565/513a0f15-4964-4109-89e4-4f9a9011f311

<p align="center">
<a href="https://gpt4all.io/installers/gpt4all-installer-win64.exe">
<img src="gpt4all-bindings/python/docs/assets/windows.png" width="80" height="80"><br>
Download for Windows
</a>
</p>

<p align="center">
<a href="https://gpt4all.io/installers/gpt4all-installer-darwin.dmg">
<img src="gpt4all-bindings/python/docs/assets/mac.png" width="85" height="100"><br>
Download for MacOS
</a>
</p>

<p align="center">
<a href="https://gpt4all.io/installers/gpt4all-installer-linux.run">
<img src="gpt4all-bindings/python/docs/assets/ubuntu.svg" width="120" height="120"><br>
Download for Ubuntu
</a>
</p>

<p align="center">
<a href='https://flathub.org/apps/io.gpt4all.gpt4all'>
<img width='240' alt='Get it on Flathub' src='https://flathub.org/api/badge?locale=en'><br>
Get it on Flathub (community maintained)
</a>
</p>

<p align="center">
<a href="https://gpt4all.io">Website</a> • <a href="https://docs.gpt4all.io">Documentation</a> • <a href="https://discord.gg/mGZE39AS3e">Discord</a>
</p>
<p align="center">
<a href="https://forms.nomic.ai/gpt4all-release-notes-signup">Subscribe to the newsletter</a>
</p>
<p align="center">
GPT4All is made possible by our compute partner <a href="https://www.paperspace.com/">Paperspace</a>.
</p>
<p align="center">
<a href="https://www.phorm.ai/query?projectId=755eecd3-24ad-49cc-abf4-0ab84caacf63"><img src="https://img.shields.io/badge/Phorm-Ask_AI-%23F2777A.svg" alt="phorm.ai"></a>

## Download Links

<p>
— <a href="https://gpt4all.io/installers/gpt4all-installer-win64.exe">
<img src="gpt4all-bindings/python/docs/assets/windows.png" style="height: 1em; width: auto" /> Windows Installer
</a> —
</p>
<p>
— <a href="https://gpt4all.io/installers/gpt4all-installer-win64-arm.exe">
<img src="gpt4all-bindings/python/docs/assets/windows.png" style="height: 1em; width: auto" /> Windows ARM Installer
</a> —
</p>
<p>
— <a href="https://gpt4all.io/installers/gpt4all-installer-darwin.dmg">
<img src="gpt4all-bindings/python/docs/assets/mac.png" style="height: 1em; width: auto" /> macOS Installer
</a> —
</p>
<p>
— <a href="https://gpt4all.io/installers/gpt4all-installer-linux.run">
<img src="gpt4all-bindings/python/docs/assets/ubuntu.svg" style="height: 1em; width: auto" /> Ubuntu Installer
</a> —
</p>
<p>
The Windows and Linux builds require Intel Core i3 2nd Gen / AMD Bulldozer, or better.
</p>
<p>
The Windows ARM build supports Qualcomm Snapdragon and Microsoft SQ1/SQ2 processors.
</p>
<p>
The Linux build is x86-64 only (no ARM).
</p>
<p>
The macOS build requires Monterey 12.6 or newer. Best results with Apple Silicon M-series processors.
</p>

See the full [System Requirements](gpt4all-chat/system_requirements.md) for more details.

<br/>
<br/>
<p>
<a href='https://flathub.org/apps/io.gpt4all.gpt4all'>
<img style="height: 2em; width: auto" alt='Get it on Flathub' src='https://flathub.org/api/badge'><br/>
Flathub (community maintained)
</a>
</p>

## Install GPT4All Python
@@ -75,7 +104,7 @@ with model.chat_session():
- Improved user workflow for LocalDocs
- Expanded access to more model architectures
- **October 19th, 2023**: GGUF Support Launches with Support for:
-  - Mistral 7b base model, an updated model gallery on [gpt4all.io](https://gpt4all.io), several new local code models including Rift Coder v1.5
+  - Mistral 7b base model, an updated model gallery on our website, several new local code models including Rift Coder v1.5
  - [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) support for Q4\_0 and Q4\_1 quantizations in GGUF.
  - Offline build support for running old versions of the GPT4All Local LLM Chat Client.
- **September 18th, 2023**: [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) launches supporting local LLM inference on NVIDIA and AMD GPUs.
common/common.cmake (new file, 41 lines)
@@ -0,0 +1,41 @@
function(gpt4all_add_warning_options target)
    if (MSVC)
        return()
    endif()
    target_compile_options("${target}" PRIVATE
        # base options
        -Wall
        -Wextra
        # extra options
        -Wcast-align
        -Wextra-semi
        -Wformat=2
        -Wmissing-include-dirs
        -Wsuggest-override
        -Wvla
        # errors
        -Werror=format-security
        -Werror=init-self
        -Werror=pointer-arith
        -Werror=undef
        # disabled warnings
        -Wno-sign-compare
        -Wno-unused-parameter
    )
    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
        target_compile_options("${target}" PRIVATE
            -Wduplicated-branches
            -Wduplicated-cond
            -Wlogical-op
            -Wno-reorder
            -Wno-null-dereference
        )
    elseif (CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$")
        target_compile_options("${target}" PRIVATE
            -Wunreachable-code-break
            -Wunreachable-code-return
            -Werror=pointer-integer-compare
            -Wno-reorder-ctor
        )
    endif()
endfunction()
@ -1,4 +1,7 @@
|
||||
cmake_minimum_required(VERSION 3.21) # for PROJECT_IS_TOP_LEVEL
|
||||
cmake_minimum_required(VERSION 3.23) # for FILE_SET
|
||||
|
||||
include(../common/common.cmake)
|
||||
|
||||
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||
|
||||
@ -33,7 +36,7 @@ set(LLMODEL_VERSION_PATCH 0)
|
||||
set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}")
|
||||
project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)
|
||||
|
||||
set(CMAKE_CXX_STANDARD 20)
|
||||
set(CMAKE_CXX_STANDARD 23)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||
set(BUILD_SHARED_LIBS ON)
|
||||
@ -47,7 +50,7 @@ else()
|
||||
message(STATUS "Interprocedural optimization support detected")
|
||||
endif()
|
||||
|
||||
set(DIRECTORY llama.cpp-mainline)
|
||||
set(DIRECTORY deps/llama.cpp-mainline)
|
||||
include(llama.cpp.cmake)
|
||||
|
||||
set(BUILD_VARIANTS)
|
||||
@ -63,9 +66,23 @@ if (LLMODEL_VULKAN)
|
||||
list(APPEND BUILD_VARIANTS vulkan vulkan-avxonly)
|
||||
endif()
|
||||
if (LLMODEL_CUDA)
|
||||
if (DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||
set(GGML_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}")
|
||||
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
|
||||
|
||||
# Defaults must be set before enable_language(CUDA).
|
||||
# Keep this in sync with the arch list in ggml/src/CMakeLists.txt (plus 5.0 for non-F16 branch).
|
||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
||||
# 52 == lowest CUDA 12 standard
|
||||
# 60 == f16 CUDA intrinsics
|
||||
# 61 == integer CUDA intrinsics
|
||||
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
|
||||
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
|
||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics
|
||||
else()
|
||||
set(CMAKE_CUDA_ARCHITECTURES "50;52;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
|
||||
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
||||
|
||||
include(CheckLanguage)
|
||||
check_language(CUDA)
|
||||
@ -80,8 +97,6 @@ if (LLMODEL_ROCM)
|
||||
list(APPEND BUILD_VARIANTS rocm rocm-avxonly)
|
||||
endif()
|
||||
|
||||
set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||
|
||||
# Go through each build variant
|
||||
foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
|
||||
# Determine flags
|
||||
@ -114,6 +129,10 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
|
||||
# Include GGML
|
||||
include_ggml(-mainline-${BUILD_VARIANT})
|
||||
|
||||
if (BUILD_VARIANT MATCHES metal)
|
||||
set(GGML_METALLIB "${GGML_METALLIB}" PARENT_SCOPE)
|
||||
endif()
|
||||
|
||||
# Function for preparing individual implementations
|
||||
function(prepare_target TARGET_NAME BASE_LIB)
|
||||
set(TARGET_NAME ${TARGET_NAME}-${BUILD_VARIANT})
|
||||
@ -132,9 +151,13 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
|
||||
|
||||
# Add each individual implementations
|
||||
add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED
|
||||
llamamodel.cpp llmodel_shared.cpp)
|
||||
src/llamamodel.cpp src/llmodel_shared.cpp)
|
||||
gpt4all_add_warning_options(llamamodel-mainline-${BUILD_VARIANT})
|
||||
target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
|
||||
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
|
||||
target_include_directories(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
|
||||
src include/gpt4all-backend
|
||||
)
|
||||
prepare_target(llamamodel-mainline llama-mainline)
|
||||
|
||||
if (NOT PROJECT_IS_TOP_LEVEL AND BUILD_VARIANT STREQUAL cuda)
|
||||
@ -143,11 +166,20 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
|
||||
endforeach()
|
||||
|
||||
add_library(llmodel
|
||||
llmodel.h llmodel.cpp llmodel_shared.cpp
|
||||
llmodel_c.h llmodel_c.cpp
|
||||
dlhandle.cpp
|
||||
src/dlhandle.cpp
|
||||
src/llmodel.cpp
|
||||
src/llmodel_c.cpp
|
||||
src/llmodel_shared.cpp
|
||||
)
|
||||
gpt4all_add_warning_options(llmodel)
|
||||
target_sources(llmodel PUBLIC
|
||||
FILE_SET public_headers TYPE HEADERS BASE_DIRS include
|
||||
FILES include/gpt4all-backend/llmodel.h
|
||||
include/gpt4all-backend/llmodel_c.h
|
||||
include/gpt4all-backend/sysinfo.h
|
||||
)
|
||||
target_compile_definitions(llmodel PRIVATE LIB_FILE_EXT="${CMAKE_SHARED_LIBRARY_SUFFIX}")
|
||||
target_include_directories(llmodel PRIVATE src include/gpt4all-backend)
|
||||
|
||||
set_target_properties(llmodel PROPERTIES
|
||||
VERSION ${PROJECT_VERSION}
|
||||
|
@@ -27,7 +27,7 @@ Unfortunately, no for three reasons:

# What is being done to make them more compatible?

-A few things. Number one, we are maintaining compatibility with our current model zoo by way of the submodule pinning. However, we are also exploring how we can update to newer versions of llama.cpp without breaking our current models. This might involve an additional magic header check or it could possibly involve keeping the currently pinned submodule and also adding a new submodule with later changes and differienting them with namespaces or some other manner. Investigations continue.
+A few things. Number one, we are maintaining compatibility with our current model zoo by way of the submodule pinning. However, we are also exploring how we can update to newer versions of llama.cpp without breaking our current models. This might involve an additional magic header check or it could possibly involve keeping the currently pinned submodule and also adding a new submodule with later changes and differentiating them with namespaces or some other manner. Investigations continue.

# What about GPU inference?
gpt4all-backend/deps/llama.cpp-mainline (new submodule)
@@ -0,0 +1 @@
+Subproject commit 11f734c3b0334dbae4823b4a7467764e447fc6d6
@ -5,8 +5,10 @@
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <expected>
|
||||
#include <functional>
|
||||
#include <optional>
|
||||
#include <span>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
@ -14,14 +16,19 @@
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
class Dlhandle;
|
||||
|
||||
using namespace std::string_literals;
|
||||
|
||||
#define LLMODEL_MAX_PROMPT_BATCH 128
|
||||
|
||||
class Dlhandle;
|
||||
class LLModel {
|
||||
public:
|
||||
using Token = int32_t;
|
||||
using PromptCallback = std::function<bool(std::span<const Token> batch, bool cached)>;
|
||||
using ResponseCallback = std::function<bool(Token token, std::string_view piece)>;
|
||||
using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend);
|
||||
using ProgressCallback = std::function<bool(float progress)>;
|
||||
|
||||
class BadArchError: public std::runtime_error {
|
||||
public:
|
||||
@ -99,6 +106,7 @@ public:
|
||||
static int32_t maxContextLength(const std::string &modelPath);
|
||||
static int32_t layerCount(const std::string &modelPath);
|
||||
static bool isEmbeddingModel(const std::string &modelPath);
|
||||
static auto chatTemplate(const char *modelPath) -> std::expected<std::string, std::string>;
|
||||
static void setImplementationsSearchPath(const std::string &path);
|
||||
static const std::string &implementationsSearchPath();
|
||||
static bool hasSupportedCPU();
|
||||
@ -122,9 +130,6 @@ public:
|
||||
};
|
||||
|
||||
struct PromptContext {
|
||||
std::vector<int32_t> tokens; // current tokens in the context window
|
||||
int32_t n_past = 0; // number of tokens in past conversation
|
||||
int32_t n_ctx = 0; // number of tokens possible in context window
|
||||
int32_t n_predict = 200;
|
||||
int32_t top_k = 40;
|
||||
float top_p = 0.9f;
|
||||
@ -133,38 +138,31 @@ public:
|
||||
int32_t n_batch = 9;
|
||||
float repeat_penalty = 1.10f;
|
||||
int32_t repeat_last_n = 64; // last n tokens to penalize
|
||||
float contextErase = 0.75f; // percent of context to erase if we exceed the context window
|
||||
int32_t n_last_batch_tokens = 0;
|
||||
float contextErase = 0.5f; // percent of context to erase if we exceed the context window
|
||||
};
|
||||
|
||||
using ProgressCallback = std::function<bool(float progress)>;
|
||||
|
||||
explicit LLModel() {}
|
||||
virtual ~LLModel() {}
|
||||
|
||||
virtual bool supportsEmbedding() const = 0;
|
||||
virtual bool supportsCompletion() const = 0;
|
||||
virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0;
|
||||
virtual bool isModelBlacklisted(const std::string &modelPath) const { (void)modelPath; return false; };
|
||||
virtual bool isModelBlacklisted(const std::string &modelPath) const { (void)modelPath; return false; }
|
||||
virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; }
|
||||
virtual bool isModelLoaded() const = 0;
|
||||
virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
|
||||
virtual size_t stateSize() const { return 0; }
|
||||
virtual size_t saveState(uint8_t *dest) const { (void)dest; return 0; }
|
||||
virtual size_t restoreState(const uint8_t *src) { (void)src; return 0; }
|
||||
virtual size_t stateSize() const = 0;
|
||||
virtual size_t saveState(std::span<uint8_t> stateOut, std::vector<Token> &inputTokensOut) const = 0;
|
||||
virtual size_t restoreState(std::span<const uint8_t> state, std::span<const Token> inputTokens) = 0;
|
||||
|
||||
// This method requires the model to return true from supportsCompletion otherwise it will throw
|
||||
// an error
|
||||
virtual void prompt(const std::string &prompt,
|
||||
const std::string &promptTemplate,
|
||||
std::function<bool(int32_t)> promptCallback,
|
||||
std::function<bool(int32_t, const std::string&)> responseCallback,
|
||||
std::function<bool(bool)> recalculateCallback,
|
||||
PromptContext &ctx,
|
||||
bool special = false,
|
||||
std::string *fakeReply = nullptr);
|
||||
virtual void prompt(std::string_view prompt,
|
||||
const PromptCallback &promptCallback,
|
||||
const ResponseCallback &responseCallback,
|
||||
const PromptContext &ctx);
|
||||
|
||||
using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend);
|
||||
virtual int32_t countPromptTokens(std::string_view prompt) const;
|
||||
|
||||
virtual size_t embeddingSize() const {
|
||||
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
|
||||
@ -209,14 +207,24 @@ public:
|
||||
|
||||
void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; }
|
||||
|
||||
virtual int32_t contextLength() const = 0;
|
||||
virtual auto specialTokens() -> std::unordered_map<std::string, std::string> const = 0;
|
||||
|
||||
protected:
|
||||
// These are pure virtual because subclasses need to implement as the default implementation of
|
||||
// 'prompt' above calls these functions
|
||||
virtual std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special = false) const = 0;
|
||||
virtual std::vector<Token> tokenize(std::string_view str) const = 0;
|
||||
virtual bool isSpecialToken(Token id) const = 0;
|
||||
virtual std::string tokenToString(Token id) const = 0;
|
||||
virtual Token sampleToken(PromptContext &ctx) const = 0;
|
||||
virtual bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const = 0;
|
||||
virtual int32_t contextLength() const = 0;
|
||||
virtual void initSampler(const PromptContext &ctx) = 0;
|
||||
virtual Token sampleToken() const = 0;
|
||||
virtual bool evalTokens(int32_t nPast, std::span<const Token> tokens) const = 0;
|
||||
virtual void shiftContext(const PromptContext &promptCtx, int32_t *nPast) = 0;
|
||||
virtual int32_t inputLength() const = 0;
|
||||
virtual int32_t computeModelInputPosition(std::span<const Token> input) const = 0;
|
||||
virtual void setModelInputPosition(int32_t pos) = 0;
|
||||
virtual void appendInputToken(Token tok) = 0;
|
||||
virtual std::span<const Token> inputTokens() const = 0;
|
||||
virtual const std::vector<Token> &endTokens() const = 0;
|
||||
virtual bool shouldAddBOS() const = 0;
|
||||
|
||||
@ -232,9 +240,11 @@ protected:
|
||||
return -1;
|
||||
}
|
||||
|
||||
// This is a helper function called from the default implementation of 'prompt' but it can be
|
||||
// shared by all base classes so it isn't virtual
|
||||
void recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate);
|
||||
virtual auto chatTemplate(const char *modelPath) const -> std::expected<std::string, std::string>
|
||||
{
|
||||
(void)modelPath;
|
||||
return std::unexpected("not implemented");
|
||||
}
|
||||
|
||||
const Implementation *m_implementation = nullptr;
|
||||
|
||||
@ -247,16 +257,16 @@ protected:
|
||||
return true;
|
||||
}
|
||||
|
||||
bool decodePrompt(std::function<bool(int32_t)> promptCallback,
|
||||
std::function<bool(int32_t, const std::string&)> responseCallback,
|
||||
std::function<bool(bool)> recalculateCallback,
|
||||
PromptContext &promptCtx,
|
||||
std::vector<Token> embd_inp);
|
||||
void generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
|
||||
std::function<bool(bool)> recalculateCallback,
|
||||
PromptContext &promptCtx);
|
||||
// prefill context with prompt
|
||||
auto decodePrompt(const PromptCallback &promptCallback,
|
||||
const PromptContext &promptCtx,
|
||||
std::vector<Token> embd_inp)
|
||||
-> std::optional<int32_t>;
|
||||
// generate a response
|
||||
void generateResponse(const ResponseCallback &responseCallback,
|
||||
const PromptContext &promptCtx,
|
||||
int32_t nPast);
|
||||
|
||||
private:
|
||||
friend class LLMImplementation;
|
||||
};
|
||||
|
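The hunk above replaces the old template-driven `prompt()` (separate `promptTemplate`, per-token `std::function` callbacks, and a recalculate callback) with batch-oriented callbacks, a read-only `PromptContext`, and a `contextErase` default of 0.5. A minimal caller-side sketch of the new interface, assuming an already-loaded model and a prompt string whose chat template has already been applied; the `runPrompt` helper, include path, and sampling values are illustrative, not part of this diff:

```cpp
// Sketch only: drives the reworked LLModel::prompt() API shown above.
// Assumes "llmodel.h" is on the include path and `model` is already loaded.
#include "llmodel.h"

#include <iostream>
#include <span>
#include <string_view>

void runPrompt(LLModel &model, std::string_view renderedPrompt)
{
    LLModel::PromptContext ctx;
    ctx.n_predict      = 256;
    ctx.top_k          = 40;
    ctx.top_p          = 0.9f;
    ctx.repeat_penalty = 1.10f;
    ctx.repeat_last_n  = 64;
    ctx.contextErase   = 0.5f;   // new default per the diff

    // Prompt callback: now receives whole batches of token ids plus a
    // "cached" flag instead of one token id at a time; return false to abort.
    LLModel::PromptCallback onPrompt =
        [](std::span<const LLModel::Token> batch, bool cached) {
            std::cerr << "processed " << batch.size()
                      << (cached ? " cached" : "") << " prompt tokens\n";
            return true;
        };

    // Response callback: receives each sampled token and its decoded piece.
    LLModel::ResponseCallback onResponse =
        [](LLModel::Token /*token*/, std::string_view piece) {
            std::cout << piece << std::flush;
            return true;   // keep generating
        };

    // The promptTemplate parameter is gone: renderedPrompt must already carry
    // the chat template. prompt() throws if the model cannot do completion.
    model.prompt(renderedPrompt, onPrompt, onResponse, ctx);
}
```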
@ -23,6 +23,11 @@ extern "C" {
|
||||
*/
|
||||
typedef void *llmodel_model;
|
||||
|
||||
/**
|
||||
* A token.
|
||||
*/
|
||||
typedef int32_t token_t;
|
||||
|
||||
/**
|
||||
* llmodel_prompt_context structure for holding the prompt context.
|
||||
* NOTE: The implementation takes care of all the memory handling of the raw logits pointer and the
|
||||
@ -30,21 +35,15 @@ typedef void *llmodel_model;
|
||||
* behavior.
|
||||
*/
|
||||
struct llmodel_prompt_context {
|
||||
float *logits; // logits of current context
|
||||
size_t logits_size; // the size of the raw logits vector
|
||||
int32_t *tokens; // current tokens in the context window
|
||||
size_t tokens_size; // the size of the raw tokens vector
|
||||
int32_t n_past; // number of tokens in past conversation
|
||||
int32_t n_ctx; // number of tokens possible in context window
|
||||
int32_t n_predict; // number of tokens to predict
|
||||
int32_t top_k; // top k logits to sample from
|
||||
float top_p; // nucleus sampling probability threshold
|
||||
float min_p; // Min P sampling
|
||||
float temp; // temperature to adjust model's output distribution
|
||||
float top_p; // nucleus sampling probability threshold
|
||||
float min_p; // Min P sampling
|
||||
float temp; // temperature to adjust model's output distribution
|
||||
int32_t n_batch; // number of predictions to generate in parallel
|
||||
float repeat_penalty; // penalty factor for repeated tokens
|
||||
float repeat_penalty; // penalty factor for repeated tokens
|
||||
int32_t repeat_last_n; // last n tokens to penalize
|
||||
float context_erase; // percent of context to erase if we exceed the context window
|
||||
float context_erase; // percent of context to erase if we exceed the context window
|
||||
};
|
||||
|
||||
struct llmodel_gpu_device {
|
||||
@ -63,10 +62,12 @@ typedef struct llmodel_gpu_device llmodel_gpu_device;
|
||||
|
||||
/**
|
||||
* Callback type for prompt processing.
|
||||
* @param token_id The token id of the prompt.
|
||||
* @param token_ids An array of token ids of the prompt.
|
||||
* @param n_token_ids The number of tokens in the array.
|
||||
* @param cached Whether the tokens were already in cache.
|
||||
* @return a bool indicating whether the model should keep processing.
|
||||
*/
|
||||
typedef bool (*llmodel_prompt_callback)(int32_t token_id);
|
||||
typedef bool (*llmodel_prompt_callback)(const token_t *token_ids, size_t n_token_ids, bool cached);
|
||||
|
||||
/**
|
||||
* Callback type for response.
|
||||
@ -74,14 +75,7 @@ typedef bool (*llmodel_prompt_callback)(int32_t token_id);
|
||||
* @param response The response string. NOTE: a token_id of -1 indicates the string is an error string.
|
||||
* @return a bool indicating whether the model should keep generating.
|
||||
*/
|
||||
typedef bool (*llmodel_response_callback)(int32_t token_id, const char *response);
|
||||
|
||||
/**
|
||||
* Callback type for recalculation of context.
|
||||
* @param whether the model is recalculating the context.
|
||||
* @return a bool indicating whether the model should keep generating.
|
||||
*/
|
||||
typedef bool (*llmodel_recalculate_callback)(bool is_recalculating);
|
||||
typedef bool (*llmodel_response_callback)(token_t token_id, const char *response);
|
||||
|
||||
/**
|
||||
* Embedding cancellation callback for use with llmodel_embed.
|
||||
@ -92,6 +86,8 @@ typedef bool (*llmodel_recalculate_callback)(bool is_recalculating);
|
||||
*/
|
||||
typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend);
|
||||
|
||||
typedef void (*llmodel_special_token_callback)(const char *name, const char *token);
|
||||
|
||||
/**
|
||||
* Create a llmodel instance.
|
||||
* Recognises correct model type from file at model_path
|
||||
@ -150,46 +146,57 @@ bool llmodel_isModelLoaded(llmodel_model model);
|
||||
* @param model A pointer to the llmodel_model instance.
|
||||
* @return the size in bytes of the internal state of the model
|
||||
*/
|
||||
uint64_t llmodel_get_state_size(llmodel_model model);
|
||||
uint64_t llmodel_state_get_size(llmodel_model model);
|
||||
|
||||
/**
|
||||
* Saves the internal state of the model to the specified destination address.
|
||||
* Saves the internal state of the model.
|
||||
* NOTE: This state data is specific to the type of model you have created.
|
||||
* @param model A pointer to the llmodel_model instance.
|
||||
* @param dest A pointer to the destination.
|
||||
* @return the number of bytes copied
|
||||
* @param state Where to store the state. This must be a buffer of at least llmodel_state_get_size() bytes.
|
||||
* @param state_size The size of the destination for the state.
|
||||
* @param input_tokens_out Where to store the address of the token cache state. This is dynamically allocated and must
|
||||
* be freed with llmodel_state_free_input_tokens.
|
||||
* @param n_input_tokens Where to store the size of the token cache state.
|
||||
* @return The number of bytes copied. On error, zero is returned, the token cache is set to NULL, and the token cache
|
||||
* size is set to zero.
|
||||
*/
|
||||
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest);
|
||||
uint64_t llmodel_state_get_data(llmodel_model model, uint8_t *state_out, uint64_t state_size,
|
||||
token_t **input_tokens_out, uint64_t *n_input_tokens);
|
||||
|
||||
/**
|
||||
* Frees the temporary token cache buffer created by a call to llmodel_state_get_data().
|
||||
* @param input_tokens The token cache buffer.
|
||||
*/
|
||||
void llmodel_state_free_input_tokens(token_t *input_tokens);
|
||||
|
||||
/**
|
||||
* Restores the internal state of the model using data from the specified address.
|
||||
* NOTE: This state data is specific to the type of model you have created.
|
||||
* @param model A pointer to the llmodel_model instance.
|
||||
* @param src A pointer to the src.
|
||||
* @return the number of bytes read
|
||||
* @param state A pointer to the state data.
|
||||
* @param state_size The size of the state data.
|
||||
* @param input_tokens The token cache associated with the saved state.
|
||||
* @param n_input_tokens The number of tokens in input_tokens.
|
||||
* @return The number of bytes read, or zero on error.
|
||||
*/
|
||||
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src);
|
||||
uint64_t llmodel_state_set_data(llmodel_model model, const uint8_t *state, uint64_t state_size,
|
||||
const token_t *input_tokens, uint64_t n_input_tokens);
|
||||
|
||||
/**
|
||||
* Generate a response using the model.
|
||||
* @param model A pointer to the llmodel_model instance.
|
||||
* @param prompt A string representing the input prompt.
|
||||
* @param prompt_template A string representing the input prompt template.
|
||||
* @param prompt_callback A callback function for handling the processing of prompt.
|
||||
* @param response_callback A callback function for handling the generated response.
|
||||
* @param recalculate_callback A callback function for handling recalculation requests.
|
||||
* @param special True if special tokens in the prompt should be processed, false otherwise.
|
||||
* @param fake_reply A string to insert into context as the model's reply, or NULL to generate one.
|
||||
* @param ctx A pointer to the llmodel_prompt_context structure.
|
||||
* @param error A pointer to a string; will only be set on error.
|
||||
*/
|
||||
void llmodel_prompt(llmodel_model model, const char *prompt,
|
||||
const char *prompt_template,
|
||||
llmodel_prompt_callback prompt_callback,
|
||||
llmodel_response_callback response_callback,
|
||||
llmodel_recalculate_callback recalculate_callback,
|
||||
llmodel_prompt_context *ctx,
|
||||
bool special,
|
||||
const char *fake_reply);
|
||||
bool llmodel_prompt(llmodel_model model,
|
||||
const char *prompt,
|
||||
llmodel_prompt_callback prompt_callback,
|
||||
llmodel_response_callback response_callback,
|
||||
llmodel_prompt_context *ctx,
|
||||
const char **error);
|
||||
|
||||
/**
|
||||
* Generate an embedding using the model.
|
||||
@ -301,6 +308,10 @@ const char *llmodel_model_backend_name(llmodel_model model);
|
||||
*/
|
||||
const char *llmodel_model_gpu_device_name(llmodel_model model);
|
||||
|
||||
+int32_t llmodel_count_prompt_tokens(llmodel_model model, const char *prompt, const char **error);
+
+void llmodel_model_foreach_special_token(llmodel_model model, llmodel_special_token_callback callback);

#ifdef __cplusplus
}
#endif
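The renamed state functions above split the opaque model state from the token cache and make the caller responsible for freeing the returned token buffer with `llmodel_state_free_input_tokens()`. A minimal round-trip sketch against this C API, assuming `model` is an already-loaded `llmodel_model` handle; the `snapshotAndRestore` helper and include path are illustrative:

```cpp
// Sketch only: save and restore model state with the renamed C state API above.
#include "llmodel_c.h"

#include <cstdint>
#include <vector>

bool snapshotAndRestore(llmodel_model model)
{
    // 1. Size and allocate a buffer for the opaque state blob.
    uint64_t size = llmodel_state_get_size(model);
    std::vector<uint8_t> state(size);

    // 2. Copy the state out; the token cache comes back as a separate,
    //    library-allocated buffer that we must free.
    token_t *inputTokens = nullptr;
    uint64_t nInputTokens = 0;
    uint64_t written = llmodel_state_get_data(model, state.data(), state.size(),
                                              &inputTokens, &nInputTokens);
    if (written == 0)
        return false;   // on error the token cache is NULL and its size is zero

    std::vector<token_t> tokens(inputTokens, inputTokens + nInputTokens);
    llmodel_state_free_input_tokens(inputTokens);

    // 3. Later, restore the snapshot: the state blob and the token cache that
    //    was saved alongside it must be passed back together.
    uint64_t read = llmodel_state_set_data(model, state.data(), state.size(),
                                           tokens.data(), tokens.size());
    return read != 0;
}
```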
(old llama.cpp-mainline submodule entry removed)
@@ -1 +0,0 @@
-Subproject commit c6546b0544ad2c01e8a1630b101e92336a68b036
@ -378,19 +378,7 @@ function(include_ggml SUFFIX)
|
||||
find_package(CUDAToolkit REQUIRED)
|
||||
set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
|
||||
|
||||
if (NOT DEFINED GGML_CUDA_ARCHITECTURES)
|
||||
# 52 == lowest CUDA 12 standard
|
||||
# 60 == f16 CUDA intrinsics
|
||||
# 61 == integer CUDA intrinsics
|
||||
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
|
||||
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
|
||||
set(GGML_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics
|
||||
else()
|
||||
set(GGML_CUDA_ARCHITECTURES "52;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
|
||||
#set(GGML_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
|
||||
endif()
|
||||
endif()
|
||||
message(STATUS "Using CUDA architectures: ${GGML_CUDA_ARCHITECTURES}")
|
||||
# architectures are set in gpt4all-backend/CMakeLists.txt
|
||||
|
||||
set(GGML_HEADERS_CUDA ${DIRECTORY}/ggml/include/ggml-cuda.h)
|
||||
file(GLOB GGML_HEADERS_CUDA "${DIRECTORY}/ggml/src/ggml-cuda/*.cuh")
|
||||
@ -823,7 +811,8 @@ function(include_ggml SUFFIX)
|
||||
list(APPEND XC_FLAGS -std=${GGML_METAL_STD})
|
||||
endif()
|
||||
|
||||
set(GGML_METALLIB ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib)
|
||||
set(GGML_METALLIB "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib")
|
||||
set(GGML_METALLIB "${GGML_METALLIB}" PARENT_SCOPE)
|
||||
add_custom_command(
|
||||
OUTPUT ${GGML_METALLIB}
|
||||
COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
|
||||
@ -834,7 +823,6 @@ function(include_ggml SUFFIX)
|
||||
DEPENDS ${DIRECTORY}/ggml/src/ggml-metal.metal ${DIRECTORY}/ggml/src/ggml-common.h
|
||||
COMMENT "Compiling Metal kernels"
|
||||
)
|
||||
set_source_files_properties(${GGML_METALLIB} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES GENERATED ON)
|
||||
|
||||
add_custom_target(
|
||||
ggml-metal ALL
|
||||
@ -990,10 +978,13 @@ function(include_ggml SUFFIX)
|
||||
|
||||
add_library(llama${SUFFIX} STATIC
|
||||
${DIRECTORY}/include/llama.h
|
||||
${DIRECTORY}/src/llama-grammar.cpp
|
||||
${DIRECTORY}/src/llama-sampling.cpp
|
||||
${DIRECTORY}/src/llama-vocab.cpp
|
||||
${DIRECTORY}/src/llama.cpp
|
||||
${DIRECTORY}/src/unicode.h
|
||||
${DIRECTORY}/src/unicode.cpp
|
||||
${DIRECTORY}/src/unicode-data.cpp
|
||||
${DIRECTORY}/src/unicode.cpp
|
||||
${DIRECTORY}/src/unicode.h
|
||||
)
|
||||
|
||||
target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY}/include ${DIRECTORY}/ggml/include)
|
||||
@ -1018,9 +1009,6 @@ function(include_ggml SUFFIX)
|
||||
C_STANDARD 11
|
||||
C_STANDARD_REQUIRED true
|
||||
)
|
||||
if (GGML_CUDA_ARCHITECTURES)
|
||||
set_property(TARGET ggml${SUFFIX} llama${SUFFIX} PROPERTY CUDA_ARCHITECTURES "${GGML_CUDA_ARCHITECTURES}")
|
||||
endif()
|
||||
|
||||
target_compile_options(ggml${SUFFIX} PRIVATE "${GGML_COMPILE_OPTS}")
|
||||
target_compile_options(llama${SUFFIX} PRIVATE "${GGML_COMPILE_OPTS}")
|
||||
|
@ -1,312 +0,0 @@
|
||||
#include "llmodel.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <optional>
|
||||
#include <regex>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <unordered_set>
|
||||
#include <vector>
|
||||
|
||||
// TODO(cebtenzzre): replace this with llama_kv_cache_seq_shift for llamamodel (GPT-J needs this as-is)
|
||||
void LLModel::recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate)
|
||||
{
|
||||
int n_keep = shouldAddBOS();
|
||||
const int32_t n_discard = (promptCtx.n_ctx - n_keep) * promptCtx.contextErase;
|
||||
|
||||
// Erase the first percentage of context from the tokens
|
||||
std::cerr << implementation().modelType() << ": reached the end of the context window so resizing\n";
|
||||
promptCtx.tokens.erase(promptCtx.tokens.begin() + n_keep, promptCtx.tokens.begin() + n_keep + n_discard);
|
||||
|
||||
size_t i = n_keep;
|
||||
promptCtx.n_past = n_keep;
|
||||
while (i < promptCtx.tokens.size()) {
|
||||
size_t batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size());
|
||||
std::vector<int32_t> batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end);
|
||||
assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx);
|
||||
if (!evalTokens(promptCtx, batch)) {
|
||||
std::cerr << "LLModel ERROR: Failed to process prompt\n";
|
||||
goto stop_generating;
|
||||
}
|
||||
promptCtx.n_past += batch.size();
|
||||
if (!recalculate(true))
|
||||
goto stop_generating;
|
||||
i = batch_end;
|
||||
}
|
||||
assert(promptCtx.n_past == int32_t(promptCtx.tokens.size()));
|
||||
|
||||
stop_generating:
|
||||
recalculate(false);
|
||||
}
|
||||
|
||||
static bool parsePromptTemplate(const std::string &tmpl, std::vector<std::smatch> &placeholders, std::string &err)
|
||||
{
|
||||
static const std::regex placeholderRegex(R"(%[1-2](?![0-9]))");
|
||||
|
||||
auto it = std::sregex_iterator(tmpl.begin(), tmpl.end(), placeholderRegex);
|
||||
placeholders.clear();
|
||||
placeholders.insert(placeholders.end(), it, std::sregex_iterator());
|
||||
|
||||
if (placeholders.size() > 2) {
|
||||
err = "ERROR: expected at most two placeholders, got " + std::to_string(placeholders.size());
|
||||
return false;
|
||||
}
|
||||
if (placeholders.size() >= 1 && placeholders[0].str() != "%1") {
|
||||
err = "ERROR: first placeholder must be %1, got " + placeholders[0].str();
|
||||
return false;
|
||||
}
|
||||
if (placeholders.size() >= 2 && placeholders[1].str() != "%2") {
|
||||
err = "ERROR: second placeholder must be %2, got " + placeholders[1].str();
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
void LLModel::prompt(const std::string &prompt,
|
||||
const std::string &promptTemplate,
|
||||
std::function<bool(int32_t)> promptCallback,
|
||||
std::function<bool(int32_t, const std::string&)> responseCallback,
|
||||
std::function<bool(bool)> recalculateCallback,
|
||||
PromptContext &promptCtx,
|
||||
bool special,
|
||||
std::string *fakeReply)
|
||||
{
|
||||
if (!isModelLoaded()) {
|
||||
std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
|
||||
return;
|
||||
}
|
||||
|
||||
if (!supportsCompletion()) {
|
||||
std::string errorMessage = "ERROR: this model does not support text completion or chat!";
|
||||
responseCallback(-1, errorMessage);
|
||||
std::cerr << implementation().modelType() << " " << errorMessage << "\n";
|
||||
return;
|
||||
}
|
||||
|
||||
// parse the prompt template
|
||||
std::vector<std::smatch> placeholders;
|
||||
{
|
||||
std::string err;
|
||||
if (!parsePromptTemplate(promptTemplate, placeholders, err)) {
|
||||
responseCallback(-1, err);
|
||||
std::cerr << err << "\n";
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
auto old_n_past = promptCtx.n_past; // prepare to fake n_past for tokenize
|
||||
|
||||
// tokenize the user prompt
|
||||
std::vector<Token> embd_inp;
|
||||
if (placeholders.empty()) {
|
||||
// this is unusual, but well-defined
|
||||
std::cerr << __func__ << ": prompt template has no placeholder\n";
|
||||
embd_inp = tokenize(promptCtx, promptTemplate, true);
|
||||
} else {
|
||||
// template: beginning of user prompt
|
||||
const auto &phUser = placeholders[0];
|
||||
std::string userPrefix(phUser.prefix());
|
||||
if (!userPrefix.empty()) {
|
||||
embd_inp = tokenize(promptCtx, userPrefix, true);
|
||||
promptCtx.n_past += embd_inp.size();
|
||||
}
|
||||
|
||||
// user input (shouldn't have special token processing)
|
||||
auto tokens = tokenize(promptCtx, prompt, special);
|
||||
embd_inp.insert(embd_inp.end(), tokens.begin(), tokens.end());
|
||||
promptCtx.n_past += tokens.size();
|
||||
|
||||
// template: end of user prompt + start of assistant prompt
|
||||
size_t start = phUser.position() + phUser.length();
|
||||
size_t end = placeholders.size() >= 2 ? placeholders[1].position() : promptTemplate.length();
|
||||
auto userToAsst = promptTemplate.substr(start, end - start);
|
||||
if (!userToAsst.empty()) {
|
||||
tokens = tokenize(promptCtx, userToAsst, true);
|
||||
embd_inp.insert(embd_inp.end(), tokens.begin(), tokens.end());
|
||||
promptCtx.n_past += tokens.size();
|
||||
}
|
||||
}
|
||||
|
||||
promptCtx.n_past = old_n_past; // restore n_past so decodePrompt can increment it
|
||||
|
||||
// decode the user prompt
|
||||
if (!decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp))
|
||||
return; // error
|
||||
|
||||
// decode the assistant's reply, either generated or spoofed
|
||||
if (fakeReply == nullptr) {
|
||||
generateResponse(responseCallback, recalculateCallback, promptCtx);
|
||||
} else {
|
||||
embd_inp = tokenize(promptCtx, *fakeReply, false);
|
||||
if (!decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp))
|
||||
return; // error
|
||||
}
|
||||
|
||||
// decode the rest of the prompt template
|
||||
// template: end of assistant prompt
|
||||
std::string asstSuffix;
|
||||
if (placeholders.size() >= 2) {
|
||||
size_t start = placeholders[1].position() + placeholders[1].length();
|
||||
asstSuffix = promptTemplate.substr(start);
|
||||
} else {
|
||||
asstSuffix = "\n\n"; // default to a blank link, good for e.g. Alpaca
|
||||
}
|
||||
if (!asstSuffix.empty()) {
|
||||
embd_inp = tokenize(promptCtx, asstSuffix, true);
|
||||
decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp);
|
||||
}
|
||||
}
|
||||
|
||||
// returns false on error
|
||||
bool LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
|
||||
std::function<bool(int32_t, const std::string&)> responseCallback,
|
||||
std::function<bool(bool)> recalculateCallback,
|
||||
PromptContext &promptCtx,
|
||||
std::vector<Token> embd_inp) {
|
||||
// save the context size
|
||||
promptCtx.n_ctx = contextLength();
|
||||
|
||||
if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
|
||||
responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
|
||||
std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
|
||||
" tokens and the context window is " << promptCtx.n_ctx << "!\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
promptCtx.n_predict = std::min(promptCtx.n_predict, promptCtx.n_ctx - (int) embd_inp.size());
|
||||
promptCtx.n_past = std::min(promptCtx.n_past, promptCtx.n_ctx);
|
||||
promptCtx.n_batch = std::min(promptCtx.n_batch, LLMODEL_MAX_PROMPT_BATCH);
|
||||
|
||||
// process the prompt in batches
|
||||
size_t i = 0;
|
||||
while (i < embd_inp.size()) {
|
||||
size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size());
|
||||
std::vector<Token> batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);
|
||||
|
||||
// Check if the context has run out...
|
||||
if (promptCtx.n_past + int32_t(batch.size()) > promptCtx.n_ctx) {
|
||||
recalculateContext(promptCtx, recalculateCallback);
|
||||
assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx);
|
||||
}
|
||||
|
||||
if (!evalTokens(promptCtx, batch)) {
|
||||
std::cerr << implementation().modelType() << " ERROR: Failed to process prompt\n";
|
||||
return false;
|
||||
}
|
||||
|
||||
size_t tokens = batch_end - i;
|
||||
for (size_t t = 0; t < tokens; ++t) {
|
||||
if (int32_t(promptCtx.tokens.size()) == promptCtx.n_ctx)
|
||||
promptCtx.tokens.erase(promptCtx.tokens.begin());
|
||||
promptCtx.tokens.push_back(batch.at(t));
|
||||
promptCtx.n_past += 1;
|
||||
if (!promptCallback(batch.at(t)))
|
||||
return false;
|
||||
}
|
||||
i = batch_end;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
|
||||
std::function<bool(bool)> recalculateCallback,
|
||||
PromptContext &promptCtx) {
|
||||
std::string cachedResponse;
|
||||
std::vector<Token> cachedTokens;
|
||||
std::unordered_set<std::string> reversePrompts
|
||||
= { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context" };
|
||||
|
||||
// predict next tokens
|
||||
for (int i = 0; i < promptCtx.n_predict; i++) {
|
||||
|
||||
// sample next token
|
||||
auto id = sampleToken(promptCtx);
|
||||
|
||||
// Check if the context has run out...
|
||||
if (promptCtx.n_past + 1 > promptCtx.n_ctx) {
|
||||
recalculateContext(promptCtx, recalculateCallback);
|
||||
assert(promptCtx.n_past + 1 <= promptCtx.n_ctx);
|
||||
}
|
||||
|
||||
if (!evalTokens(promptCtx, { id })) {
|
||||
std::cerr << implementation().modelType() << " ERROR: Failed to predict next token\n";
|
||||
return;
|
||||
}
|
||||
|
||||
// display text
|
||||
for (const auto token : endTokens()) {
|
||||
if (id == token) return;
|
||||
}
|
||||
|
||||
const std::string str = tokenToString(id);
|
||||
|
||||
// Check if the provided str is part of our reverse prompts
|
||||
bool foundPartialReversePrompt = false;
|
||||
const std::string completed = cachedResponse + std::string(str);
|
||||
if (reversePrompts.find(completed) != reversePrompts.end())
|
||||
return;
|
||||
|
||||
// Check if it partially matches our reverse prompts and if so, cache
|
||||
for (const auto& s : reversePrompts) {
|
||||
if (s.compare(0, completed.size(), completed) == 0) {
|
||||
foundPartialReversePrompt = true;
|
||||
cachedResponse = completed;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// Regardless the token gets added to our cache
|
||||
cachedTokens.push_back(id);
|
||||
|
||||
// Continue if we have found a partial match
|
||||
if (foundPartialReversePrompt)
|
||||
continue;
|
||||
|
||||
// Empty the cache
|
||||
for (auto t : cachedTokens) {
|
||||
if (int32_t(promptCtx.tokens.size()) == promptCtx.n_ctx)
|
||||
promptCtx.tokens.erase(promptCtx.tokens.begin());
|
||||
promptCtx.tokens.push_back(t);
|
||||
promptCtx.n_past += 1;
|
||||
//TODO: Conversion to std::string can be avoided here...
|
||||
if (!responseCallback(t, std::string(tokenToString(t))))
|
||||
return;
|
||||
}
|
||||
cachedTokens.clear();
|
||||
}
|
||||
}
|
||||
|
||||
void LLModel::embed(
|
||||
const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
|
||||
size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb
|
||||
) {
|
||||
(void)texts;
|
||||
(void)embeddings;
|
||||
(void)prefix;
|
||||
(void)dimensionality;
|
||||
(void)tokenCount;
|
||||
(void)doMean;
|
||||
(void)atlas;
|
||||
(void)cancelCb;
|
||||
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
|
||||
}
|
||||
|
||||
void LLModel::embed(
|
||||
const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
|
||||
bool doMean, bool atlas
|
||||
) {
|
||||
(void)texts;
|
||||
(void)embeddings;
|
||||
(void)isRetrieval;
|
||||
(void)dimensionality;
|
||||
(void)tokenCount;
|
||||
(void)doMean;
|
||||
(void)atlas;
|
||||
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
|
||||
}
|
@ -1,49 +0,0 @@
|
||||
#pragma once
|
||||
|
||||
#include <ggml.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <vector>
|
||||
|
||||
struct llm_buffer {
|
||||
uint8_t * addr = NULL;
|
||||
size_t size = 0;
|
||||
|
||||
void resize(size_t size) {
|
||||
delete[] addr;
|
||||
addr = new uint8_t[size];
|
||||
this->size = size;
|
||||
}
|
||||
|
||||
~llm_buffer() {
|
||||
delete[] addr;
|
||||
}
|
||||
};
|
||||
|
||||
struct llm_kv_cache {
|
||||
struct ggml_tensor * k;
|
||||
struct ggml_tensor * v;
|
||||
|
||||
struct ggml_context * ctx = NULL;
|
||||
|
||||
llm_buffer buf;
|
||||
|
||||
int n; // number of tokens currently in the cache
|
||||
|
||||
~llm_kv_cache() {
|
||||
if (ctx) {
|
||||
ggml_free(ctx);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads)
|
||||
{
|
||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||
if (plan.work_size > 0) {
|
||||
buf.resize(plan.work_size);
|
||||
plan.work_data = buf.addr;
|
||||
}
|
||||
ggml_graph_compute(graph, &plan);
|
||||
}
|
@ -2,6 +2,7 @@
|
||||
#include "llamamodel_impl.h"
|
||||
|
||||
#include "llmodel.h"
|
||||
#include "utils.h"
|
||||
|
||||
#include <ggml.h>
|
||||
#include <llama.h>
|
||||
@ -52,6 +53,8 @@ static const std::vector<const char *> KNOWN_ARCHES {
|
||||
"gpt2",
|
||||
// "gptj", -- no inference code
|
||||
"gptneox",
|
||||
"granite",
|
||||
"granitemoe",
|
||||
"mpt",
|
||||
"baichuan",
|
||||
"starcoder",
|
||||
@ -79,6 +82,7 @@ static const std::vector<const char *> KNOWN_ARCHES {
|
||||
"command-r",
|
||||
// "dbrx", -- 16x12B parameters
|
||||
"olmo",
|
||||
"olmoe",
|
||||
"openelm",
|
||||
// "arctic", -- 10B+128x3.66B parameters
|
||||
"deepseek2",
|
||||
@ -103,26 +107,34 @@ static bool llama_verbose()
|
||||
return var && *var;
|
||||
}
|
||||
|
||||
static void llama_log_callback(enum ggml_log_level level, const char *text, void *userdata)
|
||||
static void llama_log_callback(ggml_log_level level, const char *text, void *userdata, bool warn)
|
||||
{
|
||||
(void)userdata;
|
||||
if (llama_verbose() || level <= GGML_LOG_LEVEL_ERROR) {
|
||||
fputs(text, stderr);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef GGML_USE_CUDA
|
||||
static void cuda_log_callback(enum ggml_log_level level, const char *text, void *userdata)
|
||||
{
|
||||
(void)userdata;
|
||||
if (llama_verbose() || level <= GGML_LOG_LEVEL_WARN) {
|
||||
fputs(text, stderr);
|
||||
static ggml_log_level lastlevel = GGML_LOG_LEVEL_NONE;
|
||||
if (!llama_verbose()) {
|
||||
auto efflevel = level == GGML_LOG_LEVEL_CONT ? lastlevel : level;
|
||||
lastlevel = efflevel;
|
||||
switch (efflevel) {
|
||||
case GGML_LOG_LEVEL_CONT:
|
||||
UNREACHABLE();
|
||||
break;
|
||||
case GGML_LOG_LEVEL_WARN:
|
||||
if (warn) break;
|
||||
[[fallthrough]];
|
||||
case GGML_LOG_LEVEL_NONE: // not used?
|
||||
case GGML_LOG_LEVEL_INFO:
|
||||
case GGML_LOG_LEVEL_DEBUG:
|
||||
return; // suppress
|
||||
case GGML_LOG_LEVEL_ERROR:
|
||||
;
|
||||
}
|
||||
}
|
||||
|
||||
fputs(text, stderr);
|
||||
}
|
||||
#endif
|
||||
|
||||
struct gpt_params {
|
||||
int32_t seed = -1; // RNG seed
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
|
||||
// sampling parameters
|
||||
@ -137,37 +149,6 @@ struct gpt_params {
|
||||
bool use_mlock = false; // use mlock to keep model in memory
|
||||
};
|
||||
|
||||
static int llama_sample_top_p_top_k(
|
||||
llama_context *ctx,
|
||||
const llama_token *last_n_tokens_data,
|
||||
int last_n_tokens_size,
|
||||
int top_k,
|
||||
float top_p,
|
||||
float min_p,
|
||||
float temp,
|
||||
float repeat_penalty,
|
||||
int32_t pos) {
|
||||
auto logits = llama_get_logits_ith(ctx, pos);
|
||||
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
// Populate initial list of all candidates
|
||||
std::vector<llama_token_data> candidates;
|
||||
candidates.reserve(n_vocab);
|
||||
for (int token_id = 0; token_id < n_vocab; token_id++) {
|
||||
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||
}
|
||||
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
|
||||
// Sample repeat penalty
|
||||
llama_sample_repetition_penalties(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty, 0.0f, 0.0f);
|
||||
// Temperature sampling
|
||||
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
|
||||
llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);
|
||||
llama_sample_typical(ctx, &candidates_p, 1.0f, 1);
|
||||
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
|
||||
llama_sample_min_p(ctx, &candidates_p, min_p, 1);
|
||||
llama_sample_temp(ctx, &candidates_p, temp);
|
||||
return llama_sample_token(ctx, &candidates_p);
|
||||
}
|
||||
|
||||
const char *get_arch_name(gguf_context *ctx_gguf)
|
||||
{
|
||||
const int kid = gguf_find_key(ctx_gguf, "general.architecture");
|
||||
@ -224,7 +205,7 @@ static int32_t get_arch_key_u32(std::string const &modelPath, std::string const
|
||||
if (keyidx != -1) {
|
||||
value = gguf_get_val_u32(ctx, keyidx);
|
||||
} else {
|
||||
std::cerr << __func__ << ": " << key << "not found in " << modelPath << "\n";
|
||||
std::cerr << __func__ << ": " << key << " not found in " << modelPath << "\n";
|
||||
}
|
||||
}
|
||||
|
||||
@@ -234,21 +215,27 @@ cleanup:
}

struct LLamaPrivate {
    const std::string modelPath;
    bool modelLoaded = false;
    int device = -1;
    std::string deviceName;
    llama_model *model = nullptr;
    llama_context *ctx = nullptr;
    llama_model_params model_params;
    llama_context_params ctx_params;
    int64_t n_threads = 0;
    std::vector<LLModel::Token> end_tokens;
    const char *backend_name = nullptr;
    bool modelLoaded = false;
    int device = -1;
    std::string deviceName;
    int64_t n_threads = 0;
    std::vector<LLModel::Token> end_tokens;
    const char *backend_name = nullptr;
    std::vector<LLModel::Token> inputTokens;

    llama_model *model = nullptr;
    llama_context *ctx = nullptr;
    llama_model_params model_params;
    llama_context_params ctx_params;
    llama_sampler *sampler_chain;
};

LLamaModel::LLamaModel()
    : d_ptr(new LLamaPrivate) {}
    : d_ptr(std::make_unique<LLamaPrivate>())
{
    auto sparams = llama_sampler_chain_default_params();
    d_ptr->sampler_chain = llama_sampler_chain_init(sparams);
}
||||
|
||||
// default hparams (LLaMA 7B)
|
||||
struct llama_file_hparams {
|
||||
@ -437,10 +424,9 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
|
||||
}
|
||||
}
|
||||
|
||||
d_ptr->ctx_params.n_ctx = n_ctx;
|
||||
d_ptr->ctx_params.seed = params.seed;
|
||||
d_ptr->ctx_params.type_k = params.kv_type;
|
||||
d_ptr->ctx_params.type_v = params.kv_type;
|
||||
d_ptr->ctx_params.n_ctx = n_ctx;
|
||||
d_ptr->ctx_params.type_k = params.kv_type;
|
||||
d_ptr->ctx_params.type_v = params.kv_type;
|
||||
|
||||
// The new batch API provides space for n_vocab*n_tokens logits. Tell llama.cpp early
|
||||
// that we want this many logits so the state serializes consistently.
|
||||
@ -506,6 +492,7 @@ LLamaModel::~LLamaModel()
|
||||
llama_free(d_ptr->ctx);
|
||||
}
|
||||
llama_free_model(d_ptr->model);
|
||||
llama_sampler_free(d_ptr->sampler_chain);
|
||||
}
|
||||
|
||||
bool LLamaModel::isModelLoaded() const
|
||||
@@ -515,30 +502,41 @@ bool LLamaModel::isModelLoaded() const

size_t LLamaModel::stateSize() const
{
    return llama_get_state_size(d_ptr->ctx);
    return llama_state_get_size(d_ptr->ctx);
}

size_t LLamaModel::saveState(uint8_t *dest) const
size_t LLamaModel::saveState(std::span<uint8_t> stateOut, std::vector<Token> &inputTokensOut) const
{
    return llama_copy_state_data(d_ptr->ctx, dest);
    size_t bytesWritten = llama_state_get_data(d_ptr->ctx, stateOut.data(), stateOut.size());
    if (bytesWritten)
        inputTokensOut.assign(d_ptr->inputTokens.begin(), d_ptr->inputTokens.end());
    return bytesWritten;
}

size_t LLamaModel::restoreState(const uint8_t *src)
size_t LLamaModel::restoreState(std::span<const uint8_t> state, std::span<const Token> inputTokens)
{
    // const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
    return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
    size_t bytesRead = llama_state_set_data(d_ptr->ctx, state.data(), state.size());
    if (bytesRead)
        d_ptr->inputTokens.assign(inputTokens.begin(), inputTokens.end());
    return bytesRead;
}
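The state API now travels as a pair: the opaque llama.cpp blob plus the input-token cache that produced it. Below is a minimal sketch (not part of the diff) of how a caller could round-trip that pair through the overrides above; it assumes the base-class methods are accessible with the signatures shown.

```cpp
// Sketch only: round-trip the decoder state using the span-based API above.
#include "llmodel.h"

#include <cstddef>
#include <cstdint>
#include <vector>

struct Snapshot {
    std::vector<uint8_t>        state;  // opaque llama.cpp state blob
    std::vector<LLModel::Token> tokens; // input-token cache that produced it
};

static Snapshot takeSnapshot(const LLModel &model)
{
    Snapshot snap;
    snap.state.resize(model.stateSize());
    // saveState() fills the blob and, on success, copies the current token cache
    if (model.saveState(snap.state, snap.tokens) == 0)
        snap.state.clear(); // zero bytes written -> treat as failure
    return snap;
}

static size_t restoreSnapshot(LLModel &model, const Snapshot &snap)
{
    // restoreState() only adopts the token cache if the blob was accepted
    return model.restoreState(snap.state, snap.tokens);
}
```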

std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str, bool special) const
std::vector<LLModel::Token> LLamaModel::tokenize(std::string_view str) const
{
    const bool wantBOS = ctx.n_past == 0 && ctx.tokens.empty();
    const bool useBOS = wantBOS && shouldAddBOS();
    std::vector<LLModel::Token> fres(str.length() + 4);
    auto fres_len = llama_tokenize(d_ptr->model, str.c_str(), str.length(), fres.data(), fres.size(), useBOS, special);
    int32_t fres_len = llama_tokenize(
        d_ptr->model, str.data(), str.length(), fres.data(), fres.size(), /*add_special*/ true, /*parse_special*/ true
    );
    fres.resize(fres_len);
    return fres;
}

bool LLamaModel::isSpecialToken(Token id) const
{
    return llama_token_get_attr(d_ptr->model, id)
        & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN);
}

std::string LLamaModel::tokenToString(Token id) const
{
    std::vector<char> result(8, 0);
@@ -555,27 +553,66 @@ std::string LLamaModel::tokenToString(Token id) const
    return std::string(result.data(), result.size());
}

LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
void LLamaModel::initSampler(const PromptContext &promptCtx)
{
    const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
    return llama_sample_top_p_top_k(d_ptr->ctx,
        promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
        n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.min_p, promptCtx.temp,
        promptCtx.repeat_penalty, promptCtx.n_last_batch_tokens - 1);
    auto *model = d_ptr->model;
    auto *chain = d_ptr->sampler_chain;

    // clear sampler chain
    for (int i = llama_sampler_chain_n(chain) - 1; i >= 0; i--) {
        auto *smpl = llama_sampler_chain_remove(chain, i);
        llama_sampler_free(smpl);
    }

    // build new chain
    llama_sampler_chain_add(chain,
        llama_sampler_init_penalties(
            llama_n_vocab(model),
            llama_token_eos(model),
            llama_token_nl(model),
            promptCtx.repeat_last_n,
            promptCtx.repeat_penalty,
            // TODO(jared): consider making the below configurable
            /*penalty_freq*/ 0.0f,
            /*penalty_present*/ 0.0f,
            /*penalize_nl*/ true,
            /*ignore_eos*/ false
        )
    );
    if (promptCtx.temp == 0.0f) {
        llama_sampler_chain_add(chain, llama_sampler_init_greedy());
    } else {
        struct llama_sampler *samplers[] = {
            llama_sampler_init_top_k(promptCtx.top_k),
            llama_sampler_init_top_p(promptCtx.top_p, 1),
            llama_sampler_init_min_p(promptCtx.min_p, 1),
            llama_sampler_init_temp(promptCtx.temp),
            llama_sampler_init_softmax(),
            llama_sampler_init_dist(LLAMA_DEFAULT_SEED),
        };
        for (auto *smpl : samplers)
            llama_sampler_chain_add(chain, smpl);
    }
}
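initSampler() rebuilds the llama.cpp sampler chain from scratch for each generation: penalties first, then either a single greedy sampler (temperature 0) or the truncation/temperature/dist pipeline. The following is a condensed, hedged sketch of the same pattern, using only calls that appear in this diff and omitting the penalty stage; it assumes the chain owns and frees the samplers added to it, as the constructor/destructor pairing above suggests.

```cpp
// Sketch only: build a chain, sample one token, free the chain.
#include <llama.h>

static llama_token sample_once(llama_context *ctx, float temp, int32_t top_k, float top_p, float min_p)
{
    llama_sampler *chain = llama_sampler_chain_init(llama_sampler_chain_default_params());

    if (temp == 0.0f) {
        // temperature zero collapses to a deterministic argmax
        llama_sampler_chain_add(chain, llama_sampler_init_greedy());
    } else {
        // truncation filters first, then temperature, then the final random draw
        llama_sampler_chain_add(chain, llama_sampler_init_top_k(top_k));
        llama_sampler_chain_add(chain, llama_sampler_init_top_p(top_p, 1));
        llama_sampler_chain_add(chain, llama_sampler_init_min_p(min_p, 1));
        llama_sampler_chain_add(chain, llama_sampler_init_temp(temp));
        llama_sampler_chain_add(chain, llama_sampler_init_dist(LLAMA_DEFAULT_SEED));
    }

    // -1 samples from the logits of the last decoded token
    llama_token tok = llama_sampler_sample(chain, ctx, -1);
    llama_sampler_free(chain); // assumed to also free the samplers the chain owns
    return tok;
}
```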

bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
LLModel::Token LLamaModel::sampleToken() const
{
    llama_kv_cache_seq_rm(d_ptr->ctx, 0, ctx.n_past, -1);
    return llama_sampler_sample(d_ptr->sampler_chain, d_ptr->ctx, -1);
}

bool LLamaModel::evalTokens(int32_t nPast, std::span<const Token> tokens) const
{
    assert(!tokens.empty());

    llama_kv_cache_seq_rm(d_ptr->ctx, 0, nPast, -1);

    llama_batch batch = llama_batch_init(tokens.size(), 0, 1);

    batch.n_tokens = tokens.size();
    ctx.n_last_batch_tokens = tokens.size();

    for (int32_t i = 0; i < batch.n_tokens; i++) {
        batch.token   [i] = tokens[i];
        batch.pos     [i] = ctx.n_past + i;
        batch.pos     [i] = nPast + i;
        batch.n_seq_id[i] = 1;
        batch.seq_id  [i][0] = 0;
        batch.logits  [i] = false;
@@ -589,11 +626,86 @@ bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &toke
    return res == 0;
}

void LLamaModel::shiftContext(const PromptContext &promptCtx, int32_t *nPast)
{
    // infinite text generation via context shifting

    // erase up to n_ctx*contextErase tokens
    int n_keep = shouldAddBOS();
    int n_past = *nPast;
    int n_discard = std::min(n_past - n_keep, int(contextLength() * promptCtx.contextErase));

    assert(n_discard > 0);
    if (n_discard <= 0)
        return;

    std::cerr << "Llama: context full, swapping: n_past = " << n_past << ", n_keep = " << n_keep
              << ", n_discard = " << n_discard << "\n";

    // erase the first n_discard tokens from the context
    llama_kv_cache_seq_rm (d_ptr->ctx, 0, n_keep, n_keep + n_discard);
    llama_kv_cache_seq_add(d_ptr->ctx, 0, n_keep + n_discard, n_past, -n_discard);

    auto &inp = d_ptr->inputTokens;
    inp.erase(inp.begin() + n_keep, inp.begin() + n_keep + n_discard);
    *nPast = inp.size();
}
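For concreteness, here is the shift arithmetic above with assumed numbers (a 4096-token context, contextErase of 0.75, a full cache, and one BOS token kept); roughly three quarters of the window is dropped in a single shift.

```cpp
// Toy numbers only; mirrors the n_discard computation in shiftContext().
#include <algorithm>
#include <cstdio>

int main()
{
    int   n_ctx = 4096, n_past = 4096, n_keep = 1; // n_keep = shouldAddBOS()
    float contextErase = 0.75f;
    int   n_discard = std::min(n_past - n_keep, int(n_ctx * contextErase));
    std::printf("n_discard = %d, tokens kept = %d\n", n_discard, n_past - n_discard); // 3072, 1024
}
```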

int32_t LLamaModel::contextLength() const
{
    return llama_n_ctx(d_ptr->ctx);
}

auto LLamaModel::specialTokens() -> std::unordered_map<std::string, std::string> const
{
    if (!d_ptr->model)
        throw std::logic_error("model not loaded");

    std::unordered_map<std::string, std::string> tokens;
    if (auto id = llama_token_bos(d_ptr->model); id != LLAMA_TOKEN_NULL)
        tokens.emplace("bos_token", tokenToString(id));
    if (auto id = llama_token_eos(d_ptr->model); id != LLAMA_TOKEN_NULL)
        tokens.emplace("eos_token", tokenToString(id));
    return tokens;
}

int32_t LLamaModel::inputLength() const
{
    return d_ptr->inputTokens.size();
}

int32_t LLamaModel::computeModelInputPosition(std::span<const Token> input) const
{
    // find common prefix
    auto cacheIt = d_ptr->inputTokens.begin();
    auto inputIt = input.begin();
    while (cacheIt < d_ptr->inputTokens.end() && inputIt < input.end() && *cacheIt == *inputIt) {
        ++cacheIt; ++inputIt;
    }
    // tell the caller to ignore the tokens between [begin, inputIt)
    return inputIt - input.begin();
}
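A standalone illustration of the prefix rule above, with invented token IDs: only the tokens past the shared prefix have to be re-decoded, which is what enables the prompt-prefix cache mentioned in the Python changelog later in this comparison.

```cpp
// Illustration with invented token IDs; the helper mirrors the loop above.
#include <cassert>
#include <cstdint>
#include <span>
#include <vector>

static int32_t commonPrefixLength(std::span<const int32_t> cache, std::span<const int32_t> input)
{
    size_t i = 0;
    while (i < cache.size() && i < input.size() && cache[i] == input[i])
        ++i;
    return int32_t(i);
}

int main()
{
    std::vector<int32_t> cache {1, 15043, 29892, 920};      // tokens from the previous turn
    std::vector<int32_t> input {1, 15043, 29892, 526, 366}; // new model input
    assert(commonPrefixLength(cache, input) == 3);          // first 3 tokens need no re-decode
}
```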
|
||||
void LLamaModel::setModelInputPosition(int32_t pos)
|
||||
{
|
||||
auto &inp = d_ptr->inputTokens;
|
||||
assert(pos >= 0);
|
||||
assert(pos <= inp.size());
|
||||
// truncate token cache to end at the new n_past
|
||||
if (pos < inp.size())
|
||||
inp.resize(pos);
|
||||
}
|
||||
|
||||
void LLamaModel::appendInputToken(Token tok)
|
||||
{
|
||||
d_ptr->inputTokens.push_back(tok);
|
||||
}
|
||||
|
||||
auto LLamaModel::inputTokens() const -> std::span<const Token>
|
||||
{
|
||||
return d_ptr->inputTokens;
|
||||
}
|
||||
|
||||
const std::vector<LLModel::Token> &LLamaModel::endTokens() const
|
||||
{
|
||||
return d_ptr->end_tokens;
|
||||
@@ -601,10 +713,7 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const

bool LLamaModel::shouldAddBOS() const
{
    int add_bos = llama_add_bos_token(d_ptr->model);
    if (add_bos != -1) { return add_bos; }
    auto vocab_type = llama_vocab_type(d_ptr->model);
    return vocab_type == LLAMA_VOCAB_TYPE_SPM || vocab_type == LLAMA_VOCAB_TYPE_WPM;
    return llama_add_bos_token(d_ptr->model);
}
||||
|
||||
int32_t LLamaModel::maxContextLength(std::string const &modelPath) const
|
||||
@ -617,6 +726,37 @@ int32_t LLamaModel::layerCount(std::string const &modelPath) const
|
||||
return get_arch_key_u32(modelPath, "block_count");
|
||||
}
|
||||

// TODO(jared): reduce redundant code and operations by combining all metadata getters for unloaded
// models into a class that keeps the model file open
auto LLamaModel::chatTemplate(const char *modelPath) const -> std::expected<std::string, std::string>
{
    auto *ctx = load_gguf(modelPath);
    if (!ctx)
        return std::unexpected("failed to open model file");

    std::expected<std::string, std::string> result;
    enum gguf_type ktype;
    const int kid = gguf_find_key(ctx, "tokenizer.chat_template");
    if (kid == -1) {
        result = std::unexpected("key not found");
        goto cleanup;
    }

    ktype = gguf_get_kv_type(ctx, kid);
    if (ktype != GGUF_TYPE_STRING) {
        result = std::unexpected(
            "expected key type STRING (" + std::to_string(GGUF_TYPE_STRING) + "), got " + std::to_string(ktype)
        );
        goto cleanup;
    }

    result = gguf_get_val_str(ctx, kid);

cleanup:
    gguf_free(ctx);
    return result;
}
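chatTemplate() reports failure in-band via std::expected rather than by throwing. A hedged sketch of consuming such a result (C++23 &lt;expected&gt;); how the value is obtained — directly from the model or via the Implementation-level wrapper that appears later in this diff — is left open.

```cpp
// Sketch only: consuming a std::expected<std::string, std::string> result.
#include <expected>
#include <iostream>
#include <string>

void reportChatTemplate(const std::expected<std::string, std::string> &tmpl)
{
    if (tmpl)
        std::cout << "chat template is " << tmpl->size() << " bytes\n";
    else
        std::cerr << "no usable chat template: " << tmpl.error() << '\n';
}
```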
|
||||
#ifdef GGML_USE_VULKAN
|
||||
static const char *getVulkanVendorName(uint32_t vendorID)
|
||||
{
|
||||
@ -946,7 +1086,7 @@ void LLamaModel::embedInternal(
|
||||
const llama_token bos_token = llama_token_bos(d_ptr->model);
|
||||
const llama_token eos_token = llama_token_eos(d_ptr->model);
|
||||
|
||||
bool useBOS = shouldAddBOS();
|
||||
bool useBOS = llama_add_bos_token(d_ptr->model);
|
||||
bool useEOS = llama_vocab_type(d_ptr->model) == LLAMA_VOCAB_TYPE_WPM;
|
||||
|
||||
// no EOS, optional BOS
|
||||
@ -954,13 +1094,16 @@ void LLamaModel::embedInternal(
|
||||
if (!text.empty() && text[0] != ' ') {
|
||||
text = ' ' + text; // normalize for SPM - our fork of llama.cpp doesn't add a space prefix
|
||||
}
|
||||
wantBOS &= useBOS;
|
||||
|
||||
tokens.resize(text.length()+4);
|
||||
int32_t n_tokens = llama_tokenize(d_ptr->model, text.c_str(), text.length(), tokens.data(), tokens.size(), wantBOS, false);
|
||||
int32_t n_tokens = llama_tokenize_gpt4all(
|
||||
d_ptr->model, text.c_str(), text.length(), tokens.data(), tokens.size(), /*add_special*/ wantBOS,
|
||||
/*parse_special*/ false, /*insert_space*/ false
|
||||
);
|
||||
if (n_tokens) {
|
||||
(void)eos_token;
|
||||
assert((useEOS && wantBOS) == (eos_token != -1 && tokens[n_tokens - 1] == eos_token));
|
||||
(void)useBOS;
|
||||
assert((useEOS && wantBOS && useBOS) == (eos_token != -1 && tokens[n_tokens - 1] == eos_token));
|
||||
if (useEOS && wantBOS)
|
||||
n_tokens--; // erase EOS/SEP
|
||||
}
|
||||
@@ -1186,9 +1329,9 @@ DLL_EXPORT bool is_arch_supported(const char *arch)

DLL_EXPORT LLModel *construct()
{
    llama_log_set(llama_log_callback, nullptr);
    llama_log_set([](auto l, auto t, auto u) { llama_log_callback(l, t, u, false); }, nullptr);
#ifdef GGML_USE_CUDA
    ggml_backend_cuda_log_set_callback(cuda_log_callback, nullptr);
    ggml_backend_cuda_log_set_callback([](auto l, auto t, auto u) { llama_log_callback(l, t, u, true); }, nullptr);
#endif
    return new LLamaModel;
}
@ -6,10 +6,12 @@
|
||||
|
||||
#include "llmodel.h"
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <span>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <unordered_map>
|
||||
|
||||
struct LLamaPrivate;
|
||||
struct EmbModelSpec;
|
||||
@ -27,8 +29,8 @@ public:
|
||||
bool isModelLoaded() const override;
|
||||
size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
|
||||
size_t stateSize() const override;
|
||||
size_t saveState(uint8_t *dest) const override;
|
||||
size_t restoreState(const uint8_t *src) override;
|
||||
size_t saveState(std::span<uint8_t> stateOut, std::vector<Token> &inputTokensOut) const override;
|
||||
size_t restoreState(std::span<const uint8_t> state, std::span<const Token> inputTokens) override;
|
||||
void setThreadCount(int32_t n_threads) override;
|
||||
int32_t threadCount() const override;
|
||||
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
|
||||
@ -47,25 +49,36 @@ public:
|
||||
void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality = -1,
|
||||
size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
|
||||
|
||||
private:
|
||||
std::unique_ptr<LLamaPrivate> d_ptr;
|
||||
bool m_supportsEmbedding = false;
|
||||
bool m_supportsCompletion = false;
|
||||
int32_t contextLength() const override;
|
||||
auto specialTokens() -> std::unordered_map<std::string, std::string> const override;
|
||||
|
||||
protected:
|
||||
std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) const override;
|
||||
std::vector<Token> tokenize(std::string_view str) const override;
|
||||
bool isSpecialToken(Token id) const override;
|
||||
std::string tokenToString(Token id) const override;
|
||||
Token sampleToken(PromptContext &ctx) const override;
|
||||
bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
|
||||
int32_t contextLength() const override;
|
||||
void initSampler(const PromptContext &ctx) override;
|
||||
Token sampleToken() const override;
|
||||
bool evalTokens(int32_t nPast, std::span<const Token> tokens) const override;
|
||||
void shiftContext(const PromptContext &promptCtx, int32_t *nPast) override;
|
||||
int32_t inputLength() const override;
|
||||
int32_t computeModelInputPosition(std::span<const Token> input) const override;
|
||||
void setModelInputPosition(int32_t pos) override;
|
||||
void appendInputToken(Token tok) override;
|
||||
std::span<const Token> inputTokens() const override;
|
||||
const std::vector<Token> &endTokens() const override;
|
||||
bool shouldAddBOS() const override;
|
||||
int32_t maxContextLength(std::string const &modelPath) const override;
|
||||
int32_t layerCount(std::string const &modelPath) const override;
|
||||
auto chatTemplate(const char *modelPath) const -> std::expected<std::string, std::string> override;
|
||||
|
||||
void embedInternal(const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
|
||||
size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb,
|
||||
const EmbModelSpec *spec);
|
||||
|
||||
private:
|
||||
std::unique_ptr<LLamaPrivate> d_ptr;
|
||||
bool m_supportsEmbedding = false;
|
||||
bool m_supportsCompletion = false;
|
||||
};
|
||||
|
||||
#endif // LLAMAMODEL_H
|
@ -140,9 +140,14 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
|
||||
std::string path;
|
||||
// Split the paths string by the delimiter and process each path.
|
||||
while (std::getline(ss, path, ';')) {
|
||||
std::u8string u8_path(path.begin(), path.end());
|
||||
fs::directory_iterator iter;
|
||||
try {
|
||||
iter = fs::directory_iterator(std::u8string(path.begin(), path.end()));
|
||||
} catch (const fs::filesystem_error &) {
|
||||
continue; // skip nonexistent path
|
||||
}
|
||||
// Iterate over all libraries
|
||||
for (const auto &f : fs::directory_iterator(u8_path)) {
|
||||
for (const auto &f : iter) {
|
||||
const fs::path &p = f.path();
|
||||
|
||||
if (p.extension() != LIB_FILE_EXT) continue;
|
||||
@ -326,6 +331,12 @@ bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath)
|
||||
return llama && llama->isEmbeddingModel(modelPath);
|
||||
}
|
||||
|
||||
auto LLModel::Implementation::chatTemplate(const char *modelPath) -> std::expected<std::string, std::string>
|
||||
{
|
||||
auto *llama = constructGlobalLlama();
|
||||
return llama ? llama->chatTemplate(modelPath) : std::unexpected("backend not available");
|
||||
}
|
||||
|
||||
void LLModel::Implementation::setImplementationsSearchPath(const std::string& path)
|
||||
{
|
||||
s_implementations_search_path = path;
|
@ -7,16 +7,20 @@
|
||||
#include <cstdlib>
|
||||
#include <cstring>
|
||||
#include <exception>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
#include <span>
|
||||
|
||||
namespace ranges = std::ranges;
|
||||
|
||||
static_assert(sizeof(token_t) == sizeof(LLModel::Token));
|
||||
|
||||
struct LLModelWrapper {
|
||||
LLModel *llModel = nullptr;
|
||||
LLModel::PromptContext promptContext;
|
||||
~LLModelWrapper() { delete llModel; }
|
||||
};
|
||||
|
||||
@ -84,80 +88,80 @@ bool llmodel_isModelLoaded(llmodel_model model)
|
||||
return wrapper->llModel->isModelLoaded();
|
||||
}
|
||||
|
||||
uint64_t llmodel_get_state_size(llmodel_model model)
uint64_t llmodel_state_get_size(llmodel_model model)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->stateSize();
}

uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest)
uint64_t llmodel_state_get_data(llmodel_model model, uint8_t *state_out, uint64_t state_size,
                                token_t **input_tokens_out, uint64_t *n_input_tokens)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->saveState(dest);
    std::vector<LLModel::Token> inputTokens;
    auto bytesWritten = wrapper->llModel->saveState({state_out, size_t(state_size)}, inputTokens);
    if (bytesWritten) {
        auto *buf = new LLModel::Token[inputTokens.size()];
        ranges::copy(inputTokens, buf);
        *input_tokens_out = buf;
        *n_input_tokens = uint64_t(inputTokens.size());
    } else {
        *input_tokens_out = nullptr;
        *n_input_tokens = 0;
    }
    return bytesWritten;
}

uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src)
void llmodel_state_free_input_tokens(LLModel::Token *input_tokens)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->restoreState(src);
    delete[] input_tokens;
}

void llmodel_prompt(llmodel_model model, const char *prompt,
                    const char *prompt_template,
                    llmodel_prompt_callback prompt_callback,
                    llmodel_response_callback response_callback,
                    llmodel_recalculate_callback recalculate_callback,
                    llmodel_prompt_context *ctx,
                    bool special,
                    const char *fake_reply)
uint64_t llmodel_state_set_data(llmodel_model model, const uint8_t *state, uint64_t state_size,
                                const token_t *input_tokens, uint64_t n_input_tokens)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->restoreState({state, size_t(state_size)}, {input_tokens, size_t(n_input_tokens)});
}
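Taken together, the four C entry points above replace llmodel_get_state_size/llmodel_save_state_data/llmodel_restore_state_data. Below is a sketch of a save-and-roll-back cycle through them; the header name llmodel_c.h is an assumption, and error handling is minimal.

```cpp
// Sketch only: snapshot the state through the C API, then roll back to it.
#include "llmodel_c.h" // assumed header for the C declarations above

#include <cstdint>
#include <vector>

bool snapshot_and_restore(llmodel_model model)
{
    std::vector<uint8_t> blob(llmodel_state_get_size(model));

    token_t *toks  = nullptr;
    uint64_t ntoks = 0;
    if (!llmodel_state_get_data(model, blob.data(), blob.size(), &toks, &ntoks))
        return false; // zero bytes written

    // ... prompt the model some more here, then roll back ...

    uint64_t restored = llmodel_state_set_data(model, blob.data(), blob.size(), toks, ntoks);
    llmodel_state_free_input_tokens(toks); // the token array was allocated by the library
    return restored != 0;
}
```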
|
||||
auto response_func = [response_callback](int32_t token_id, const std::string &response) {
|
||||
return response_callback(token_id, response.c_str());
|
||||
};
|
||||
|
||||
if (size_t(ctx->n_past) < wrapper->promptContext.tokens.size())
|
||||
wrapper->promptContext.tokens.resize(ctx->n_past);
|
||||
bool llmodel_prompt(llmodel_model model,
|
||||
const char *prompt,
|
||||
llmodel_prompt_callback prompt_callback,
|
||||
llmodel_response_callback response_callback,
|
||||
llmodel_prompt_context *ctx,
|
||||
const char **error)
|
||||
{
|
||||
auto *wrapper = static_cast<LLModelWrapper *>(model);
|
||||
|
||||
// Copy the C prompt context
|
||||
wrapper->promptContext.n_past = ctx->n_past;
|
||||
wrapper->promptContext.n_ctx = ctx->n_ctx;
|
||||
wrapper->promptContext.n_predict = ctx->n_predict;
|
||||
wrapper->promptContext.top_k = ctx->top_k;
|
||||
wrapper->promptContext.top_p = ctx->top_p;
|
||||
wrapper->promptContext.min_p = ctx->min_p;
|
||||
wrapper->promptContext.temp = ctx->temp;
|
||||
wrapper->promptContext.n_batch = ctx->n_batch;
|
||||
wrapper->promptContext.repeat_penalty = ctx->repeat_penalty;
|
||||
wrapper->promptContext.repeat_last_n = ctx->repeat_last_n;
|
||||
wrapper->promptContext.contextErase = ctx->context_erase;
|
||||
LLModel::PromptContext promptContext {
|
||||
.n_predict = ctx->n_predict,
|
||||
.top_k = ctx->top_k,
|
||||
.top_p = ctx->top_p,
|
||||
.min_p = ctx->min_p,
|
||||
.temp = ctx->temp,
|
||||
.n_batch = ctx->n_batch,
|
||||
.repeat_penalty = ctx->repeat_penalty,
|
||||
.repeat_last_n = ctx->repeat_last_n,
|
||||
.contextErase = ctx->context_erase,
|
||||
};
|
||||
|
||||
std::string fake_reply_str;
|
||||
if (fake_reply) { fake_reply_str = fake_reply; }
|
||||
auto *fake_reply_p = fake_reply ? &fake_reply_str : nullptr;
|
||||
auto prompt_func = [prompt_callback](std::span<const LLModel::Token> token_ids, bool cached) {
|
||||
return prompt_callback(token_ids.data(), token_ids.size(), cached);
|
||||
};
|
||||
auto response_func = [response_callback](LLModel::Token token_id, std::string_view piece) {
|
||||
return response_callback(token_id, piece.data());
|
||||
};
|
||||
|
||||
// Call the C++ prompt method
|
||||
wrapper->llModel->prompt(prompt, prompt_template, prompt_callback, response_func, recalculate_callback,
|
||||
wrapper->promptContext, special, fake_reply_p);
|
||||
try {
|
||||
wrapper->llModel->prompt(prompt, prompt_func, response_func, promptContext);
|
||||
} catch (std::exception const &e) {
|
||||
llmodel_set_error(error, e.what());
|
||||
return false;
|
||||
}
|
||||
|
||||
// Update the C context by giving access to the wrappers raw pointers to std::vector data
|
||||
// which involves no copies
|
||||
ctx->tokens = wrapper->promptContext.tokens.data();
|
||||
ctx->tokens_size = wrapper->promptContext.tokens.size();
|
||||
|
||||
// Update the rest of the C prompt context
|
||||
ctx->n_past = wrapper->promptContext.n_past;
|
||||
ctx->n_ctx = wrapper->promptContext.n_ctx;
|
||||
ctx->n_predict = wrapper->promptContext.n_predict;
|
||||
ctx->top_k = wrapper->promptContext.top_k;
|
||||
ctx->top_p = wrapper->promptContext.top_p;
|
||||
ctx->min_p = wrapper->promptContext.min_p;
|
||||
ctx->temp = wrapper->promptContext.temp;
|
||||
ctx->n_batch = wrapper->promptContext.n_batch;
|
||||
ctx->repeat_penalty = wrapper->promptContext.repeat_penalty;
|
||||
ctx->repeat_last_n = wrapper->promptContext.repeat_last_n;
|
||||
ctx->context_erase = wrapper->promptContext.contextErase;
|
||||
return true;
|
||||
}
|
||||
|
||||
float *llmodel_embed(
|
||||
@ -296,3 +300,21 @@ const char *llmodel_model_gpu_device_name(llmodel_model model)
|
||||
const auto *wrapper = static_cast<LLModelWrapper *>(model);
|
||||
return wrapper->llModel->gpuDeviceName();
|
||||
}
|
||||
|
||||
int32_t llmodel_count_prompt_tokens(llmodel_model model, const char *prompt, const char **error)
|
||||
{
|
||||
auto *wrapper = static_cast<const LLModelWrapper *>(model);
|
||||
try {
|
||||
return wrapper->llModel->countPromptTokens(prompt);
|
||||
} catch (const std::exception& e) {
|
||||
llmodel_set_error(error, e.what());
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
void llmodel_model_foreach_special_token(llmodel_model model, llmodel_special_token_callback callback)
|
||||
{
|
||||
auto *wrapper = static_cast<const LLModelWrapper *>(model);
|
||||
for (auto &[name, token] : wrapper->llModel->specialTokens())
|
||||
callback(name.c_str(), token.c_str());
|
||||
}
|
298
gpt4all-backend/src/llmodel_shared.cpp
Normal file
@ -0,0 +1,298 @@
|
||||
#include "llmodel.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cassert>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
#include <iterator>
|
||||
#include <optional>
|
||||
#include <ranges>
|
||||
#include <stdexcept>
|
||||
#include <string>
|
||||
#include <string_view>
|
||||
#include <vector>
|
||||
|
||||
namespace ranges = std::ranges;
|
||||
namespace views = std::ranges::views;
|
||||
|
||||
void LLModel::prompt(
|
||||
std::string_view prompt,
|
||||
const PromptCallback &promptCallback,
|
||||
const ResponseCallback &responseCallback,
|
||||
const PromptContext &promptCtx
|
||||
) {
|
||||
if (!isModelLoaded())
|
||||
throw std::invalid_argument("Attempted to prompt an unloaded model.");
|
||||
if (!supportsCompletion())
|
||||
throw std::invalid_argument("Not a text completion model.");
|
||||
if (!promptCtx.n_batch)
|
||||
throw std::invalid_argument("Batch size cannot be zero.");
|
||||
if (!promptCtx.n_predict)
|
||||
return; // nothing requested
|
||||
|
||||
auto embd_inp = tokenize(prompt);
|
||||
if (embd_inp.empty())
|
||||
throw std::invalid_argument("Prompt tokenized to zero tokens.");
|
||||
|
||||
if (auto res = decodePrompt(promptCallback, promptCtx, std::move(embd_inp)))
|
||||
generateResponse(responseCallback, promptCtx, /*n_past*/ *res);
|
||||
}
|
||||
|
||||
int32_t LLModel::countPromptTokens(std::string_view prompt) const
|
||||
{
|
||||
if (!isModelLoaded())
|
||||
throw std::invalid_argument("Attempted to tokenize with an unloaded model.");
|
||||
return int32_t(tokenize(prompt).size());
|
||||
}
|
||||
|
||||
auto LLModel::decodePrompt(
|
||||
const PromptCallback &promptCallback,
|
||||
const PromptContext &promptCtx,
|
||||
std::vector<Token> embd_inp
|
||||
) -> std::optional<int32_t>
|
||||
{
|
||||
assert(!embd_inp.empty());
|
||||
|
||||
int32_t nCtx = contextLength();
|
||||
int32_t n_batch = std::min(promptCtx.n_batch, LLMODEL_MAX_PROMPT_BATCH);
|
||||
|
||||
// Find the greatest n_past where the beginning of embd_inp matches the end of the token cache, starting at the
|
||||
// requested n_past.
|
||||
// This is used to skip unnecessary work when the prompt shares a common prefix with the previous result.
|
||||
int32_t nPast = computeModelInputPosition(embd_inp);
|
||||
|
||||
// always decode up to a full batch before generating, even if cached
|
||||
nPast -= std::min(n_batch, nPast);
|
||||
|
||||
// TODO(jared): generalize this to find the smallest new_embd_inp.size() - nPast given the cache
|
||||
if (!nPast && int32_t(embd_inp.size()) > nCtx) {
|
||||
// no cache hit -> shift the input before even processing
|
||||
|
||||
int32_t nKeep = shouldAddBOS();
|
||||
auto newLength = int32_t(nCtx * (1.f - promptCtx.contextErase));
|
||||
int32_t nDiscard = int32_t(embd_inp.size()) - std::max(1, std::min(nCtx, newLength));
|
||||
|
||||
// execute the callback even for skipped tokens. this misrepresents the position of BOS but we don't care
|
||||
auto discardedTokens = embd_inp | views::drop(nKeep) | views::take(nDiscard);
|
||||
if (!promptCallback(discardedTokens, true))
|
||||
return std::nullopt;
|
||||
|
||||
// erase nDiscard tokens
|
||||
embd_inp.erase(discardedTokens.begin(), discardedTokens.end());
|
||||
assert(int32_t(embd_inp.size()) <= nCtx);
|
||||
|
||||
// check the cache again, just in case
|
||||
nPast = computeModelInputPosition(embd_inp);
|
||||
nPast -= std::min(n_batch, nPast);
|
||||
}
|
||||
|
||||
setModelInputPosition(nPast);
|
||||
|
||||
// execute the callback even for skipped tokens
|
||||
if (!promptCallback(embd_inp | views::take(nPast), true))
|
||||
return std::nullopt;
|
||||
|
||||
// process the prompt in batches
|
||||
for (int32_t i = nPast; i < embd_inp.size();) {
|
||||
auto batch_end = std::min(i + n_batch, int32_t(embd_inp.size()));
|
||||
std::span batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);
|
||||
|
||||
// Check if the context has run out...
|
||||
if (nPast + int32_t(batch.size()) > nCtx) {
|
||||
shiftContext(promptCtx, &nPast);
|
||||
assert(nPast + int32_t(batch.size()) <= nCtx);
|
||||
}
|
||||
|
||||
// FIXME(Adam): We should find a way to bubble these strings to the UI level to allow for translation
|
||||
if (!evalTokens(nPast, batch))
|
||||
throw std::runtime_error("An internal error was encountered during prompt processing.");
|
||||
|
||||
for (auto &tok : batch) {
|
||||
appendInputToken(tok);
|
||||
nPast++;
|
||||
if (!promptCallback({ &tok, 1 }, false))
|
||||
return std::nullopt;
|
||||
}
|
||||
i = batch_end;
|
||||
}
|
||||
|
||||
return nPast;
|
||||
}
|
||||
|
||||
/*
 * If string s overlaps with the string key such that some prefix of the key is at the end
 * of the string, return the position in s where the first match starts. Otherwise, return
 * std::string::npos. Examples:
 * s = "bfo", key = "foo" -> 1
 * s = "fooa", key = "foo" -> npos
 */
static std::string::size_type stringsOverlap(const std::string &s, const std::string &key)
{
    if (s.empty() || key.empty())
        throw std::invalid_argument("arguments to stringsOverlap must not be empty");

    for (int start = std::max(0, int(s.size()) - int(key.size())); start < s.size(); start++) {
        if (s.compare(start, s.size(), key, 0, s.size() - start) == 0)
            return start;
    }
    return std::string::npos;
}
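A quick sanity check of the overlap rule, reusing the examples from the comment plus one partial stop sequence; since stringsOverlap is file-static, a check like this would have to sit in the same translation unit, below the definition.

```cpp
// Sanity check for stringsOverlap(); assumes it is visible in this file.
#include <cassert>
#include <string>

static void testStringsOverlap()
{
    assert(stringsOverlap("bfo",  "foo") == 1);                 // "fo" at the tail is a prefix of "foo"
    assert(stringsOverlap("fooa", "foo") == std::string::npos); // key ended before the end of s
    assert(stringsOverlap("abc<|im", "<|im_end|>") == 3);       // partial stop sequence at the tail
}
```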
|
||||
void LLModel::generateResponse(
|
||||
const ResponseCallback &responseCallback,
|
||||
const PromptContext &promptCtx,
|
||||
int32_t nPast
|
||||
) {
|
||||
static const char *stopSequences[] {
|
||||
"### System", "### Instruction", "### Human", "### User", "### Response", "### Assistant", "### Context",
|
||||
"<|im_start|>", "<|im_end|>", "<|endoftext|>",
|
||||
};
|
||||
|
||||
initSampler(promptCtx);
|
||||
|
||||
std::string cachedResponse;
|
||||
std::vector<Token> cachedTokens;
|
||||
int n_predicted = 0;
|
||||
|
||||
// Predict next tokens
|
||||
for (bool stop = false; !stop;) {
|
||||
// Sample next token
|
||||
std::optional<Token> new_tok = sampleToken();
|
||||
std::string new_piece = tokenToString(new_tok.value());
|
||||
cachedTokens.push_back(new_tok.value());
|
||||
cachedResponse += new_piece;
|
||||
|
||||
auto accept = [this, &promptCtx, &new_tok, &nPast] {
|
||||
// Shift context if out of space
|
||||
if (nPast >= contextLength()) {
|
||||
shiftContext(promptCtx, &nPast);
|
||||
assert(nPast < contextLength());
|
||||
}
|
||||
|
||||
// Accept the token
|
||||
Token tok = std::exchange(new_tok, std::nullopt).value();
|
||||
if (!evalTokens(nPast, { &tok, 1 }))
|
||||
throw std::runtime_error("An internal error was encountered during response generation.");
|
||||
|
||||
appendInputToken(tok);
|
||||
nPast++;
|
||||
};
|
||||
|
||||
// Check for EOS
|
||||
auto lengthLimit = std::string::npos;
|
||||
for (const auto token : endTokens()) {
|
||||
if (new_tok == token) {
|
||||
stop = true;
|
||||
lengthLimit = cachedResponse.size() - new_piece.size();
|
||||
}
|
||||
}
|
||||
|
||||
if (lengthLimit != std::string::npos) {
|
||||
// EOS matched
|
||||
} else if (!isSpecialToken(new_tok.value())) {
|
||||
// Check if the response contains a stop sequence
|
||||
for (const auto &p : stopSequences) {
|
||||
auto match = cachedResponse.find(p);
|
||||
if (match != std::string::npos) stop = true;
|
||||
lengthLimit = std::min(lengthLimit, match);
|
||||
if (match == 0) break;
|
||||
}
|
||||
|
||||
// Check if the response matches the start of a stop sequence
|
||||
if (lengthLimit == std::string::npos) {
|
||||
for (const auto &p : stopSequences) {
|
||||
auto match = stringsOverlap(cachedResponse, p);
|
||||
lengthLimit = std::min(lengthLimit, match);
|
||||
if (match == 0) break;
|
||||
}
|
||||
}
|
||||
} else if (ranges::find(stopSequences, new_piece) < std::end(stopSequences)) {
|
||||
// Special tokens must exactly match a stop sequence
|
||||
stop = true;
|
||||
lengthLimit = cachedResponse.size() - new_piece.size();
|
||||
}
|
||||
|
||||
// Empty the cache, up to the length limit
|
||||
std::string::size_type responseLength = 0;
|
||||
while (!cachedTokens.empty()) {
|
||||
Token tok = cachedTokens.front();
|
||||
std::string piece = tokenToString(tok);
|
||||
|
||||
// Stop if the piece (or part of it) does not fit within the length limit
|
||||
if (responseLength + (stop ? 1 : piece.size()) > lengthLimit)
|
||||
break;
|
||||
|
||||
// Remove token from cache
|
||||
assert(cachedResponse.starts_with(piece));
|
||||
cachedTokens.erase(cachedTokens.begin(), cachedTokens.begin() + 1);
|
||||
cachedResponse.erase(cachedResponse.begin(), cachedResponse.begin() + piece.size());
|
||||
|
||||
// Accept the token, if needed (not cached)
|
||||
if (cachedTokens.empty() && new_tok)
|
||||
accept();
|
||||
|
||||
// Send the token
|
||||
if (!responseCallback(tok, piece) || ++n_predicted >= promptCtx.n_predict) {
|
||||
stop = true;
|
||||
break;
|
||||
}
|
||||
|
||||
// FIXME(jared): we could avoid printing partial stop sequences if we didn't have to
|
||||
// output token IDs and could cache a partial token for the next prompt call
|
||||
responseLength += piece.size();
|
||||
}
|
||||
assert(cachedTokens.empty() == cachedResponse.empty());
|
||||
|
||||
// Accept the token, if needed (in cache)
|
||||
if (new_tok) {
|
||||
assert(!cachedTokens.empty() && cachedTokens.back() == new_tok);
|
||||
if (stop) {
|
||||
cachedTokens.pop_back();
|
||||
} else {
|
||||
accept();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (inputLength() < cachedTokens.size()) {
|
||||
/* This is theoretically possible if the longest stop sequence is greater than
|
||||
* n_ctx * contextErase tokens. */
|
||||
throw std::runtime_error("shifted too much context, can't go back");
|
||||
}
|
||||
|
||||
#ifndef NDEBUG
|
||||
auto inp = inputTokens();
|
||||
auto discard_start = inp.end() - cachedTokens.size();
|
||||
assert(std::equal(discard_start, inp.end(), cachedTokens.begin()));
|
||||
#endif
|
||||
}
|
||||
|
||||
void LLModel::embed(
|
||||
const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
|
||||
size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb
|
||||
) {
|
||||
(void)texts;
|
||||
(void)embeddings;
|
||||
(void)prefix;
|
||||
(void)dimensionality;
|
||||
(void)tokenCount;
|
||||
(void)doMean;
|
||||
(void)atlas;
|
||||
(void)cancelCb;
|
||||
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
|
||||
}
|
||||
|
||||
void LLModel::embed(
|
||||
const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
|
||||
bool doMean, bool atlas
|
||||
) {
|
||||
(void)texts;
|
||||
(void)embeddings;
|
||||
(void)isRetrieval;
|
||||
(void)dimensionality;
|
||||
(void)tokenCount;
|
||||
(void)doMean;
|
||||
(void)atlas;
|
||||
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
|
||||
}
|
17
gpt4all-backend/src/utils.h
Normal file
@@ -0,0 +1,17 @@
#pragma once

#include <cassert>

#ifdef NDEBUG
#  ifdef __has_builtin
#    if __has_builtin(__builtin_unreachable)
#      define UNREACHABLE() __builtin_unreachable()
#    else
#      define UNREACHABLE() do {} while (0)
#    endif
#  else
#    define UNREACHABLE() do {} while (0)
#  endif
#else
#  define UNREACHABLE() assert(!"Unreachable statement was reached")
#endif
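A minimal illustration of how the macro is meant to be used (the reworked CUDA log callback earlier in this diff uses it the same way): debug builds assert on the impossible path, while release builds either inform the optimizer or compile to a no-op.

```cpp
// Sketch only: marking a switch that covers every enumerator.
#include "utils.h"

enum class Kind { A, B };

static int handle(Kind k)
{
    switch (k) {
    case Kind::A: return 1;
    case Kind::B: return 2;
    }
    UNREACHABLE(); // debug: assert; release: __builtin_unreachable() or no-op
    return 0;      // defensive return for the no-op expansion
}
```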
@ -1,339 +0,0 @@
|
||||
#include "utils.h"
|
||||
|
||||
#include <cmath>
|
||||
#include <cstdio>
|
||||
#include <cstdlib>
|
||||
#include <fstream>
|
||||
#include <iterator>
|
||||
#include <regex>
|
||||
#include <utility>
|
||||
|
||||
void replace(std::string & str, const std::string & needle, const std::string & replacement)
|
||||
{
|
||||
size_t pos = 0;
|
||||
while ((pos = str.find(needle, pos)) != std::string::npos) {
|
||||
str.replace(pos, needle.length(), replacement);
|
||||
pos += replacement.length();
|
||||
}
|
||||
}
|
||||
|
||||
std::map<std::string, int32_t> json_parse(const std::string & fname)
|
||||
{
|
||||
std::map<std::string, int32_t> result;
|
||||
|
||||
// read file into string
|
||||
std::string json;
|
||||
{
|
||||
std::ifstream ifs(fname);
|
||||
if (!ifs) {
|
||||
fprintf(stderr, "Failed to open %s\n", fname.c_str());
|
||||
exit(1);
|
||||
}
|
||||
|
||||
json = std::string((std::istreambuf_iterator<char>(ifs)),
|
||||
(std::istreambuf_iterator<char>()));
|
||||
}
|
||||
|
||||
if (json[0] != '{') {
|
||||
return result;
|
||||
}
|
||||
|
||||
// parse json
|
||||
{
|
||||
bool has_key = false;
|
||||
bool in_token = false;
|
||||
|
||||
std::string str_key = "";
|
||||
std::string str_val = "";
|
||||
|
||||
int n = json.size();
|
||||
for (int i = 1; i < n; ++i) {
|
||||
if (!in_token) {
|
||||
if (json[i] == ' ') continue;
|
||||
if (json[i] == '"') {
|
||||
in_token = true;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if (json[i] == '\\' && i+1 < n) {
|
||||
if (has_key == false) {
|
||||
str_key += json[i];
|
||||
} else {
|
||||
str_val += json[i];
|
||||
}
|
||||
++i;
|
||||
} else if (json[i] == '"') {
|
||||
if (has_key == false) {
|
||||
has_key = true;
|
||||
++i;
|
||||
while (json[i] == ' ') ++i;
|
||||
++i; // :
|
||||
while (json[i] == ' ') ++i;
|
||||
if (json[i] != '\"') {
|
||||
while (json[i] != ',' && json[i] != '}') {
|
||||
str_val += json[i++];
|
||||
}
|
||||
has_key = false;
|
||||
} else {
|
||||
in_token = true;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
has_key = false;
|
||||
}
|
||||
|
||||
::replace(str_key, "\\u0120", " " ); // \u0120 -> space
|
||||
::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
|
||||
::replace(str_key, "\\\"", "\""); // \\\" -> "
|
||||
|
||||
try {
|
||||
result[str_key] = std::stoi(str_val);
|
||||
} catch (...) {
|
||||
//fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
|
||||
|
||||
}
|
||||
str_key = "";
|
||||
str_val = "";
|
||||
in_token = false;
|
||||
continue;
|
||||
}
|
||||
if (has_key == false) {
|
||||
str_key += json[i];
|
||||
} else {
|
||||
str_val += json[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
std::vector<gpt_vocab::id> gpt_tokenize_inner(const gpt_vocab & vocab, const std::string & text)
|
||||
{
|
||||
std::vector<std::string> words;
|
||||
|
||||
// first split the text into words
|
||||
{
|
||||
std::string str = text;
|
||||
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
|
||||
|
||||
std::regex re(pat);
|
||||
std::smatch m;
|
||||
|
||||
while (std::regex_search(str, m, re)) {
|
||||
for (auto x : m) {
|
||||
words.push_back(x);
|
||||
}
|
||||
str = m.suffix();
|
||||
}
|
||||
}
|
||||
|
||||
// find the longest tokens that form the words:
|
||||
std::vector<gpt_vocab::id> tokens;
|
||||
for (const auto & word : words) {
|
||||
if (word.size() == 0) continue;
|
||||
|
||||
int i = 0;
|
||||
int n = word.size();
|
||||
while (i < n) {
|
||||
int j = n;
|
||||
while (j > i) {
|
||||
auto it = vocab.token_to_id.find(word.substr(i, j-i));
|
||||
if (it != vocab.token_to_id.end()) {
|
||||
tokens.push_back(it->second);
|
||||
i = j;
|
||||
break;
|
||||
}
|
||||
--j;
|
||||
}
|
||||
if (i == n) {
|
||||
break;
|
||||
}
|
||||
if (j == i) {
|
||||
auto sub = word.substr(i, 1);
|
||||
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
|
||||
tokens.push_back(vocab.token_to_id.at(sub));
|
||||
} else {
|
||||
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
|
||||
}
|
||||
++i;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return tokens;
|
||||
}
|
||||
|
||||
std::string regex_escape(const std::string &s)
|
||||
{
|
||||
static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
|
||||
return std::regex_replace(s, metacharacters, "\\$&");
|
||||
}
|
||||
|
||||
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text)
|
||||
{
|
||||
// Generate the subpattern from the special_tokens vector if it's not empty
|
||||
if (!vocab.special_tokens.empty()) {
|
||||
std::vector<gpt_vocab::id> out;
|
||||
std::vector<std::string> chunks;
|
||||
std::string str = text;
|
||||
std::string special_tokens_subpattern;
|
||||
for (const auto &token : vocab.special_tokens) {
|
||||
if (!special_tokens_subpattern.empty()) {
|
||||
special_tokens_subpattern += "|";
|
||||
}
|
||||
special_tokens_subpattern += regex_escape(token);
|
||||
}
|
||||
std::regex re(special_tokens_subpattern);
|
||||
std::smatch m;
|
||||
while (std::regex_search(str, m, re)) {
|
||||
auto tok = vocab.token_to_id.find(m.str());
|
||||
if (tok != vocab.token_to_id.end()) {
|
||||
auto tokid = tok->second;
|
||||
auto pfxtoks = gpt_tokenize_inner(vocab, m.prefix());
|
||||
out.insert(out.end(), pfxtoks.begin(), pfxtoks.end());
|
||||
out.push_back(tokid);
|
||||
str = m.suffix();
|
||||
}
|
||||
}
|
||||
if (!str.empty()) {
|
||||
auto tokrest = gpt_tokenize_inner(vocab, str);
|
||||
out.insert(out.end(), tokrest.begin(), tokrest.end());
|
||||
}
|
||||
return out;
|
||||
} else {
|
||||
return gpt_tokenize_inner(vocab, text);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab)
|
||||
{
|
||||
printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
|
||||
|
||||
vocab.token_to_id = ::json_parse(fname);
|
||||
|
||||
for (const auto & kv : vocab.token_to_id) {
|
||||
vocab.id_to_token[kv.second] = kv.first;
|
||||
}
|
||||
|
||||
printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
|
||||
|
||||
// print the vocabulary
|
||||
//for (auto kv : vocab.token_to_id) {
|
||||
// printf("'%s' -> %d\n", kv.first.data(), kv.second);
|
||||
//}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
gpt_vocab::id gpt_sample_top_k_top_p(
|
||||
const size_t actualVocabSize,
|
||||
const int32_t * last_n_tokens_data,
|
||||
int last_n_tokens_size,
|
||||
const std::vector<float> logits,
|
||||
int top_k,
|
||||
double top_p,
|
||||
double temp,
|
||||
float repeat_penalty,
|
||||
std::mt19937 & rng) {
|
||||
int n_logits = actualVocabSize;
|
||||
|
||||
const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
|
||||
const auto * plogits = logits.data();
|
||||
|
||||
if (temp <= 0) {
|
||||
// select the token with the highest logit directly
|
||||
float max_logit = plogits[0];
|
||||
gpt_vocab::id max_id = 0;
|
||||
|
||||
for (int i = 1; i < n_logits; ++i) {
|
||||
if (plogits[i] > max_logit) {
|
||||
max_logit = plogits[i];
|
||||
max_id = i;
|
||||
}
|
||||
}
|
||||
return max_id;
|
||||
}
|
||||
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
|
||||
logits_id.reserve(n_logits);
|
||||
|
||||
{
|
||||
const float scale = 1.0f/temp;
|
||||
for (int i = 0; i < n_logits; ++i) {
|
||||
// repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
|
||||
// credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
|
||||
if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
|
||||
// if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
|
||||
if (plogits[i] < 0.0f) {
|
||||
logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
|
||||
} else {
|
||||
logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
|
||||
}
|
||||
} else {
|
||||
logits_id.push_back(std::make_pair(plogits[i]*scale, i));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// find the top K tokens
|
||||
std::partial_sort(
|
||||
logits_id.begin(),
|
||||
logits_id.begin() + top_k, logits_id.end(),
|
||||
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
|
||||
return a.first > b.first;
|
||||
});
|
||||
|
||||
logits_id.resize(top_k);
|
||||
|
||||
double maxl = -INFINITY;
|
||||
for (const auto & kv : logits_id) {
|
||||
maxl = std::max(maxl, kv.first);
|
||||
}
|
||||
|
||||
// compute probs for the top K tokens
|
||||
std::vector<double> probs;
|
||||
probs.reserve(logits_id.size());
|
||||
|
||||
double sum = 0.0;
|
||||
for (const auto & kv : logits_id) {
|
||||
double p = exp(kv.first - maxl);
|
||||
probs.push_back(p);
|
||||
sum += p;
|
||||
}
|
||||
|
||||
// normalize the probs
|
||||
for (auto & p : probs) {
|
||||
p /= sum;
|
||||
}
|
||||
|
||||
if (top_p < 1.0f) {
|
||||
double cumsum = 0.0f;
|
||||
for (int i = 0; i < top_k; i++) {
|
||||
cumsum += probs[i];
|
||||
if (cumsum >= top_p) {
|
||||
top_k = i + 1;
|
||||
probs.resize(top_k);
|
||||
logits_id.resize(top_k);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
cumsum = 1.0/cumsum;
|
||||
for (int i = 0; i < (int) probs.size(); i++) {
|
||||
probs[i] *= cumsum;
|
||||
}
|
||||
}
|
||||
|
||||
//printf("\n");
|
||||
//for (int i = 0; i < (int) probs.size(); i++) {
|
||||
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
|
||||
//}
|
||||
//exit(0);
|
||||
|
||||
std::discrete_distribution<> dist(probs.begin(), probs.end());
|
||||
int idx = dist(rng);
|
||||
|
||||
return logits_id[idx].second;
|
||||
}
|
@ -1,101 +0,0 @@
|
||||
// Various helper functions and utilities
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <map>
|
||||
#include <random>
|
||||
#include <string>
|
||||
#include <thread>
|
||||
#include <vector>
|
||||
|
||||
//
|
||||
// General purpose inline functions
|
||||
//
|
||||
constexpr inline unsigned long long operator ""_MiB(unsigned long long bytes)
|
||||
{
|
||||
return bytes*1024*1024;
|
||||
}
|
||||
|
||||
//
|
||||
// CLI argument parsing
|
||||
//
|
||||
|
||||
struct gpt_params {
|
||||
int32_t seed = -1; // RNG seed
|
||||
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||
int32_t n_predict = 200; // new tokens to predict
|
||||
|
||||
// sampling parameters
|
||||
int32_t top_k = 40;
|
||||
float top_p = 0.9f;
|
||||
float temp = 0.9f;
|
||||
|
||||
int32_t n_batch = 8; // batch size for prompt processing
|
||||
|
||||
std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
|
||||
std::string prompt;
|
||||
};
|
||||
|
||||
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
|
||||
|
||||
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
|
||||
|
||||
std::string gpt_random_prompt(std::mt19937 & rng);
|
||||
|
||||
//
|
||||
// Vocab utils
|
||||
//
|
||||
|
||||
struct gpt_vocab {
|
||||
using id = int32_t;
|
||||
using token = std::string;
|
||||
|
||||
std::map<token, id> token_to_id;
|
||||
std::map<id, token> id_to_token;
|
||||
std::vector<std::string> special_tokens;
|
||||
|
||||
void add_special_token(const std::string &token) {
|
||||
special_tokens.push_back(token);
|
||||
}
|
||||
};
|
||||
|
||||
void replace(std::string & str, const std::string & needle, const std::string & replacement);
|
||||
|
||||
// poor-man's JSON parsing
|
||||
std::map<std::string, int32_t> json_parse(const std::string & fname);
|
||||
|
||||
// split text into tokens
|
||||
//
|
||||
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
|
||||
//
|
||||
// Regex (Python):
|
||||
// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
|
||||
//
|
||||
// Regex (C++):
|
||||
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
|
||||
//
|
||||
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
|
||||
|
||||
// load the tokens from encoder.json
|
||||
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
|
||||
|
||||
// sample next token given probabilities for each embedding
|
||||
//
|
||||
// - consider only the top K tokens
|
||||
// - from them, consider only the top tokens with cumulative probability > P
|
||||
//
|
||||
// TODO: not sure if this implementation is correct
|
||||
//
|
||||
gpt_vocab::id gpt_sample_top_k_top_p(
|
||||
const size_t actualVocabSize,
|
||||
const int32_t * last_n_tokens_data,
|
||||
int last_n_tokens_size,
|
||||
const std::vector<float> logits,
|
||||
int top_k,
|
||||
double top_p,
|
||||
double temp,
|
||||
float repeat_penalty,
|
||||
std::mt19937 & rng);
|
@@ -2,8 +2,7 @@

GPT4All on the command-line.

## Documentation
<https://docs.gpt4all.io/gpt4all_cli.html>
More details on the [wiki](https://github.com/nomic-ai/gpt4all/wiki/Python-CLI).

## Quickstart

@@ -34,11 +33,11 @@ python -m pip install --user --upgrade gpt4all typer
# run the CLI
python app.py repl
```
By default, it will automatically download the `groovy` model to `.cache/gpt4all/` in your user
directory, if necessary.
By default, it will automatically download the `Mistral Instruct` model to `.cache/gpt4all/` in your
user directory, if necessary.

If you have already saved a model beforehand, specify its path with the `-m`/`--model` argument,
for example:
```shell
python app.py repl --model /home/user/my-gpt4all-models/gpt4all-13b-snoozy-q4_0.gguf
python app.py repl --model /home/user/my-gpt4all-models/mistral-7b-instruct-v0.1.Q4_0.gguf
```
|
@ -113,10 +113,7 @@ def _old_loop(gpt4all_instance):
|
||||
full_response = gpt4all_instance.chat_completion(
|
||||
MESSAGES,
|
||||
# preferential kwargs for chat ux
|
||||
logits_size=0,
|
||||
tokens_size=0,
|
||||
n_past=0,
|
||||
n_ctx=0,
|
||||
n_predict=200,
|
||||
top_k=40,
|
||||
top_p=0.9,
|
||||
|
75
gpt4all-bindings/python/CHANGELOG.md
Normal file
@ -0,0 +1,75 @@
|
||||
# Changelog
|
||||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
||||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- Warn on Windows if the Microsoft Visual C++ runtime libraries are not found ([#2920](https://github.com/nomic-ai/gpt4all/pull/2920))
|
||||
- Basic cache for faster prefill when the input shares a prefix with previous context ([#3073](https://github.com/nomic-ai/gpt4all/pull/3073))
|
||||
- Add ability to modify or replace the history of an active chat session ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
|
||||
|
||||
### Changed
|
||||
- Rebase llama.cpp on latest upstream as of September 26th ([#2998](https://github.com/nomic-ai/gpt4all/pull/2998))
|
||||
- Change the error message when a message is too long ([#3004](https://github.com/nomic-ai/gpt4all/pull/3004))
|
||||
- Fix CalledProcessError on Intel Macs since v2.8.0 ([#3045](https://github.com/nomic-ai/gpt4all/pull/3045))
|
||||
- Use Jinja for chat templates instead of per-message QString.arg-style templates ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
|
||||
|
||||
## [2.8.2] - 2024-08-14
|
||||
|
||||
### Fixed
|
||||
- Fixed incompatibility with Python 3.8 since v2.7.0 and Python <=3.11 since v2.8.1 ([#2871](https://github.com/nomic-ai/gpt4all/pull/2871))
|
||||
|
||||
## [2.8.1] - 2024-08-13
|
||||
|
||||
### Added
|
||||
- Use greedy sampling when temperature is set to zero ([#2854](https://github.com/nomic-ai/gpt4all/pull/2854))
|
||||
|
||||
### Changed
|
||||
- Search for pip-installed CUDA 11 as well as CUDA 12 ([#2802](https://github.com/nomic-ai/gpt4all/pull/2802))
|
||||
- Stop shipping CUBINs to reduce wheel size ([#2802](https://github.com/nomic-ai/gpt4all/pull/2802))
|
||||
- Use llama\_kv\_cache ops to shift context faster ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
|
||||
- Don't stop generating at end of context ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
|
||||
|
||||
### Fixed
|
||||
- Make reverse prompt detection work more reliably and prevent it from breaking output ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
|
||||
- Explicitly target macOS 12.6 in CI to fix Metal compatibility on older macOS ([#2849](https://github.com/nomic-ai/gpt4all/pull/2849))
|
||||
- Do not initialize Vulkan driver when only using CPU ([#2843](https://github.com/nomic-ai/gpt4all/pull/2843))
|
||||
- Fix a segfault on exit when using CPU mode on Linux with NVIDIA and EGL ([#2843](https://github.com/nomic-ai/gpt4all/pull/2843))
|
||||
|
||||
## [2.8.0] - 2024-08-05
|
||||
|
||||
### Added
|
||||
- Support GPT-NeoX, Gemma 2, OpenELM, ChatGLM, and Jais architectures (all with Vulkan support) ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
|
||||
- Enable Vulkan support for StarCoder2, XVERSE, Command R, and OLMo ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
|
||||
- Support DeepSeek-V2 architecture (no Vulkan support) ([#2702](https://github.com/nomic-ai/gpt4all/pull/2702))
|
||||
- Add Llama 3.1 8B Instruct to models3.json (by [@3Simplex](https://github.com/3Simplex) in [#2731](https://github.com/nomic-ai/gpt4all/pull/2731) and [#2732](https://github.com/nomic-ai/gpt4all/pull/2732))
|
||||
- Support Llama 3.1 RoPE scaling ([#2758](https://github.com/nomic-ai/gpt4all/pull/2758))
|
||||
- Add Qwen2-1.5B-Instruct to models3.json (by [@ThiloteE](https://github.com/ThiloteE) in [#2759](https://github.com/nomic-ai/gpt4all/pull/2759))
|
||||
- Detect use of a Python interpreter under Rosetta for a clearer error message ([#2793](https://github.com/nomic-ai/gpt4all/pull/2793))
|
||||
|
||||
### Changed
|
||||
- Build against CUDA 11.8 instead of CUDA 12 for better compatibility with older drivers ([#2639](https://github.com/nomic-ai/gpt4all/pull/2639))
|
||||
- Update llama.cpp to commit 87e397d00 from July 19th ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
|
||||
|
||||
### Removed
|
||||
- Remove unused internal llmodel\_has\_gpu\_device ([#2409](https://github.com/nomic-ai/gpt4all/pull/2409))
|
||||
- Remove support for GPT-J models ([#2676](https://github.com/nomic-ai/gpt4all/pull/2676), [#2693](https://github.com/nomic-ai/gpt4all/pull/2693))
|
||||
|
||||
### Fixed
|
||||
- Fix debug mode crash on Windows and undefined behavior in LLamaModel::embedInternal ([#2467](https://github.com/nomic-ai/gpt4all/pull/2467))
|
||||
- Fix CUDA PTX errors with some GPT4All builds ([#2421](https://github.com/nomic-ai/gpt4all/pull/2421))
|
||||
- Fix mishandling of inputs greater than n\_ctx tokens after [#1970](https://github.com/nomic-ai/gpt4all/pull/1970) ([#2498](https://github.com/nomic-ai/gpt4all/pull/2498))
|
||||
- Fix crash when Kompute falls back to CPU ([#2640](https://github.com/nomic-ai/gpt4all/pull/2640))
|
||||
- Fix several Kompute resource management issues ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
|
||||
- Fix crash/hang when some models stop generating, by showing special tokens ([#2701](https://github.com/nomic-ai/gpt4all/pull/2701))
|
||||
- Fix several backend issues ([#2778](https://github.com/nomic-ai/gpt4all/pull/2778))
|
||||
- Restore leading space removal logic that was incorrectly removed in [#2694](https://github.com/nomic-ai/gpt4all/pull/2694)
|
||||
- CUDA: Cherry-pick llama.cpp DMMV cols requirement fix that caused a crash with long conversations since [#2694](https://github.com/nomic-ai/gpt4all/pull/2694)
|
||||
|
||||
[Unreleased]: https://github.com/nomic-ai/gpt4all/compare/python-v2.8.2...HEAD
|
||||
[2.8.2]: https://github.com/nomic-ai/gpt4all/compare/python-v2.8.1...python-v2.8.2
|
||||
[2.8.1]: https://github.com/nomic-ai/gpt4all/compare/python-v2.8.0...python-v2.8.1
|
||||
[2.8.0]: https://github.com/nomic-ai/gpt4all/compare/python-v2.7.0...python-v2.8.0
|
BIN
gpt4all-bindings/python/docs/assets/attach_spreadsheet.png
Normal file
After Width: | Height: | Size: 30 KiB |
BIN
gpt4all-bindings/python/docs/assets/chat_window.png
Normal file
After Width: | Height: | Size: 66 KiB |
BIN
gpt4all-bindings/python/docs/assets/disney_spreadsheet.png
Normal file
After Width: | Height: | Size: 272 KiB |
BIN
gpt4all-bindings/python/docs/assets/gpt4all_xlsx_attachment.mp4
Normal file
BIN
gpt4all-bindings/python/docs/assets/spreadsheet_chat.png
Normal file
After Width: | Height: | Size: 448 KiB |
86
gpt4all-bindings/python/docs/gpt4all_api_server/home.md
Normal file
@ -0,0 +1,86 @@
|
||||
# GPT4All API Server
|
||||
|
||||
GPT4All provides a local API server that allows you to run LLMs over an HTTP API.
|
||||
|
||||
## Key Features
|
||||
|
||||
- **Local Execution**: Run models on your own hardware for privacy and offline use.
|
||||
- **LocalDocs Integration**: Run the API with relevant text snippets provided to your LLM from a [LocalDocs collection](../gpt4all_desktop/localdocs.md).
|
||||
- **OpenAI API Compatibility**: Use existing OpenAI-compatible clients and tools with your local models.
|
||||
|
||||
## Activating the API Server
|
||||
|
||||
1. Open the GPT4All Chat Desktop Application.
|
||||
2. Go to `Settings` > `Application` and scroll down to `Advanced`.
|
||||
3. Check the box for the `"Enable Local API Server"` setting.
|
||||
4. The server listens on port 4891 by default. You can choose another port number in the `"API Server Port"` setting.
|
||||
|
||||
## Connecting to the API Server
|
||||
|
||||
The base URL used for the API server is `http://localhost:4891/v1` (or `http://localhost:<PORT_NUM>/v1` if you are using a different port number).
|
||||
|
||||
The server only accepts HTTP connections (not HTTPS) and listens only on localhost (127.0.0.1); it does not listen on the IPv6 loopback address `::1`.
|
||||
|
||||
## Examples
|
||||
|
||||
!!! note "Example GPT4All API calls"
|
||||
|
||||
=== "cURL"
|
||||
|
||||
```bash
|
||||
curl -X POST http://localhost:4891/v1/chat/completions -d '{
|
||||
"model": "Phi-3 Mini Instruct",
|
||||
"messages": [{"role":"user","content":"Who is Lionel Messi?"}],
|
||||
"max_tokens": 50,
|
||||
"temperature": 0.28
|
||||
}'
|
||||
```
|
||||
|
||||
=== "PowerShell"
|
||||
|
||||
```powershell
|
||||
Invoke-WebRequest -URI http://localhost:4891/v1/chat/completions -Method POST -ContentType application/json -Body '{
|
||||
"model": "Phi-3 Mini Instruct",
|
||||
"messages": [{"role":"user","content":"Who is Lionel Messi?"}],
|
||||
"max_tokens": 50,
|
||||
"temperature": 0.28
|
||||
}'
|
||||
```
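=== "Python"

A minimal sketch using the `openai` Python client package (any OpenAI-compatible client should work). It assumes the server is enabled on the default port and the model named below is installed; the API key can be any placeholder string, since the local server does not check it:

```python
# Sketch: call the local GPT4All server with the OpenAI-compatible Python client.
# Assumes: `pip install openai` and the API server enabled on the default port 4891.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4891/v1", api_key="not-needed")  # key is unused locally

response = client.chat.completions.create(
    model="Phi-3 Mini Instruct",
    messages=[{"role": "user", "content": "Who is Lionel Messi?"}],
    max_tokens=50,
    temperature=0.28,
)
print(response.choices[0].message.content)
```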
|
||||
|
||||
## API Endpoints
|
||||
|
||||
| Method | Path | Description |
|
||||
|--------|------|-------------|
|
||||
| GET | `/v1/models` | List available models |
|
||||
| GET | `/v1/models/<name>` | Get details of a specific model |
|
||||
| POST | `/v1/completions` | Generate text completions |
|
||||
| POST | `/v1/chat/completions` | Generate chat completions |
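As a quick check that the server is reachable, the models endpoint can be queried from Python. This is a hedged sketch using the `requests` package; the `data`/`id` response fields are assumed to follow the usual OpenAI API convention:

```python
# Sketch: list the models exposed by the local server (default port 4891 assumed).
import requests

resp = requests.get("http://localhost:4891/v1/models", timeout=10)
resp.raise_for_status()
for model in resp.json().get("data", []):  # response shape assumed to match the OpenAI API
    print(model["id"])
```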
|
||||
|
||||
## LocalDocs Integration
|
||||
|
||||
You can use LocalDocs with the API server:
|
||||
|
||||
1. Open the Chats view in the GPT4All application.
|
||||
2. Scroll to the bottom of the chat history sidebar.
|
||||
3. Select the server chat (it has a different background color).
|
||||
4. Activate LocalDocs collections in the right sidebar.
|
||||
|
||||
(Note: LocalDocs can currently only be activated through the GPT4All UI, not via the API itself).
|
||||
|
||||
Now, your API calls to your local LLM will have relevant references from your LocalDocs collection retrieved and placed in the input message for the LLM to respond to.
|
||||
|
||||
The references retrieved for your API call can be accessed in the API response object at
|
||||
|
||||
`response["choices"][0]["references"]`
|
||||
|
||||
Each entry in `references` includes the following fields (see the sketch after this list):
|
||||
|
||||
- `text`: the actual text content from the snippet that was extracted from the reference document
|
||||
|
||||
- `author`: the author of the reference document (if available)
|
||||
|
||||
- `date`: the date of creation of the reference document (if available)
|
||||
|
||||
- `page`: the page number the snippet is from (only available for PDF documents for now)
|
||||
|
||||
- `title`: the title of the reference document (if available)
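As an illustration, here is a hedged sketch that reads those fields from a chat completion response using the `requests` package. The question text is made up, the `message`/`content` shape is assumed to follow the OpenAI convention, and the `references` key is only present when LocalDocs is active for the server chat:

```python
# Sketch: query the local server with LocalDocs enabled and print any references.
import requests

resp = requests.post(
    "http://localhost:4891/v1/chat/completions",
    json={
        "model": "Phi-3 Mini Instruct",
        "messages": [{"role": "user", "content": "What do my documents say about revenue?"}],
        "max_tokens": 200,
    },
    timeout=120,
)
resp.raise_for_status()
choice = resp.json()["choices"][0]
print(choice["message"]["content"])
for ref in choice.get("references", []):
    # Each reference may include text, author, date, page, and title (see the list above).
    print(ref.get("title"), ref.get("page"))
    print(ref["text"])
```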
|
206
gpt4all-bindings/python/docs/gpt4all_desktop/chat_templates.md
Normal file
@ -0,0 +1,206 @@
|
||||
## What are chat templates?
|
||||
Natively, large language models only know how to complete plain text and do not know the difference between their input and their output. In order to support a chat with a person, LLMs are designed to use a template to convert the conversation to plain text using a specific format.
|
||||
|
||||
For a given model, it is important to use an appropriate chat template, as each model is designed to work best with a specific format. The chat templates included with the built-in models should be sufficient for most purposes.
|
||||
|
||||
There are two reasons you would want to alter the chat template:
|
||||
|
||||
- You are sideloading a model and there is no chat template available,
|
||||
- You would like to have greater control over the input to the LLM than a system message provides.
|
||||
|
||||
|
||||
## What is a system message?
|
||||
A system message is a message that controls the responses from the LLM in a way that affects the entire conversation. System messages can be short, such as "Speak like a pirate.", or they can be long and contain a lot of context for the LLM to keep in mind.
|
||||
|
||||
Not all models are designed to use a system message, so system messages work better with some models than with others.
|
||||
|
||||
|
||||
## How do I customize the chat template or system message?
|
||||
To customize the chat template or system message, go to Settings > Model. Make sure to select the correct model at the top. If you clone a model, you can use a different chat template or system message from the base model, enabling you to use different settings for each conversation.
|
||||
|
||||
These settings take effect immediately. After changing them, you can click "Redo last response" in the chat view, and the response will take the new settings into account.
|
||||
|
||||
|
||||
## Do I need to write a chat template?
|
||||
You typically do not need to write your own chat template. The exception is models that are not in the official model list and do not come with a chat template built-in. These will show a "Clear" option above the chat template field in the Model Settings page instead of a "Reset" option. See the section on [finding] or [creating] a chat template.
|
||||
|
||||
[finding]: #how-do-i-find-a-chat-template
|
||||
[creating]: #advanced-how-do-chat-templates-work
|
||||
|
||||
|
||||
## What changed in GPT4All v3.5?
|
||||
GPT4All v3.5 overhauled the chat template system. There are three crucial differences:
|
||||
|
||||
- The chat template now formats an entire conversation instead of a single pair of messages,
|
||||
- The chat template now uses Jinja syntax instead of `%1` and `%2` placeholders,
|
||||
- And the system message should no longer contain control tokens or trailing whitespace.
|
||||
|
||||
If you added or customized any chat templates or system messages before upgrading to GPT4All v3.5 or newer, they will no longer work. See below for how to solve common errors you may see after upgrading.
|
||||
|
||||
|
||||
## Error/Warning: System message is not plain text.
|
||||
This is easy to fix. Go to the model's settings and look at the system prompt. There are three things to look for:
|
||||
|
||||
- Control tokens such as `<|im_start|>`, `<|start_header_id|>`, or `<|system|>`
|
||||
- A prefix such as `### System` or `SYSTEM:`
|
||||
- Trailing whitespace, such as a space character or blank line.
|
||||
|
||||
If you see any of these things, remove them. For example, this legacy system prompt:
|
||||
```
|
||||
<|start_header_id|>system<|end_header_id|>
|
||||
You are a helpful assistant.<|eot_id|>
|
||||
```
|
||||
|
||||
Should become this:
|
||||
```
|
||||
You are a helpful assistant.
|
||||
```
|
||||
|
||||
If you do not see anything that needs to be changed, you can dismiss the error by making a minor modification to the message and then changing it back.
|
||||
|
||||
If you see a warning rather than an error, GPT4All merely suspects that the system message is not plain text. If you believe this warning is incorrect, it can be safely ignored. If in doubt, ask on the [Discord].
|
||||
|
||||
[Discord]: https://discord.gg/mGZE39AS3e
|
||||
|
||||
|
||||
## Error: Legacy system prompt needs to be updated in Settings.
|
||||
This is the same as [above][above-1], but appears on the chat page.
|
||||
|
||||
[above-1]: #errorwarning-system-message-is-not-plain-text
|
||||
|
||||
|
||||
## Error/Warning: Chat template is not in Jinja format.
|
||||
This is the result of attempting to use an old-style template (possibly from a previous version) in GPT4All 3.5+.
|
||||
|
||||
Go to the Model Settings page and select the affected model. If you see a "Reset" button, and you have not intentionally modified the prompt template, you can click "Reset". Otherwise, this is what you can do:
|
||||
|
||||
1. Back up your chat template by copying it to a text file and saving it; in the next step, it will be removed from GPT4All.
|
||||
2. Click "Reset" or "Clear".
|
||||
3. If you clicked "Clear", the chat template is now gone. Follow the steps to [find][finding] or [create][creating] a basic chat template for your model.
|
||||
4. Customize the chat template to suit your needs. For help, read the section about [creating] a chat template.
|
||||
|
||||
|
||||
## Error: Legacy prompt template needs to be updated in Settings.
|
||||
This is the same as [above][above-2], but appears on the chat page.
|
||||
|
||||
[above-2]: #errorwarning-chat-template-is-not-in-jinja-format
|
||||
|
||||
|
||||
## The chat template has a syntax error.
|
||||
If there is a syntax error while editing the chat template, the details will be displayed in an error message above the input box. This could be because the chat template is not actually in Jinja format (see [above][above-2]).
|
||||
|
||||
Otherwise, you have either typed something incorrectly, or the model comes with a template that is incompatible with GPT4All. See [the below section][creating] on creating chat templates and make sure that everything is correct. When in doubt, ask on the [Discord].
|
||||
|
||||
|
||||
## Error: No chat template configured.
|
||||
This may appear for models that are not from the official model list and do not include a chat template. Older versions of GPT4All picked a poor default in this case. You will get much better results if you follow the steps to [find][finding] or [create][creating] a chat template for your model.
|
||||
|
||||
|
||||
## Error: The chat template cannot be blank.
|
||||
If the button above the chat template on the Model Settings page says "Clear", see [above][above-3]. If you see "Reset", click that button to restore a reasonable default. Also see the section on [syntax errors][chat-syntax-error].
|
||||
|
||||
[above-3]: #error-no-chat-template-configured
|
||||
[chat-syntax-error]: #the-chat-template-has-a-syntax-error
|
||||
|
||||
|
||||
## How do I find a chat template?
|
||||
When in doubt, you can always ask the [Discord] community for help. Below are the instructions to find one on your own.
|
||||
|
||||
The authoritative source for a model's chat template is the HuggingFace repo that the original (non-GGUF) model came from. First, find this page. If you only have a model file, you can try a Google search for the model's name. If you know the page you downloaded the GGUF model from, its README usually links to the original non-GGUF model.
|
||||
|
||||
Once you have located the original model, there are two methods you can use to extract its chat template. Pick whichever one you are most comfortable with.
|
||||
|
||||
### Using the CLI (all models)
|
||||
1. Install `jq` using your preferred package manager - e.g. Chocolatey (Windows), Homebrew (macOS), or apt (Ubuntu).
|
||||
2. Download `tokenizer_config.json` from the model's "Files and versions" tab.
|
||||
3. Open a command prompt in the directory where you downloaded `tokenizer_config.json`.
|
||||
4. Run `jq -r ".chat_template" tokenizer_config.json`. This shows the chat template in a human-readable form. You can copy this and paste it into the settings page.
|
||||
5. (Optional) You can save the output to a text file like this: `jq -r ".chat_template" tokenizer_config.json >chat_template.txt`
|
||||
|
||||
If the output is "null", the model does not provide a chat template. See the [below instructions][creating] on creating a chat template.
|
||||
|
||||
### Python (open models)
|
||||
1. Install `transformers` using your preferred Python package manager, e.g. `pip install transformers`. Make sure it is at least version 4.43.0.
|
||||
2. Copy the ID of the HuggingFace model, using the clipboard icon next to the name. For example, if the URL is `https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B`, the ID is `NousResearch/Hermes-2-Pro-Llama-3-8B`.
|
||||
3. Open a python interpreter (`python`) and run the following commands. Change the model ID in the example to the one you copied.
|
||||
```
|
||||
>>> from transformers import AutoTokenizer
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained('NousResearch/Hermes-2-Pro-Llama-3-8B')
|
||||
>>> print(tokenizer.get_chat_template())
|
||||
```
|
||||
You can copy the output and paste it into the settings page.
|
||||
4. (Optional) You can save the output to a text file like this:
|
||||
```
|
||||
>>> open('chat_template.txt', 'w').write(tokenizer.get_chat_template())
|
||||
```
|
||||
|
||||
If you get a ValueError exception, this model does not provide a chat template. See the [below instructions][creating] on creating a chat template.
|
||||
|
||||
|
||||
### Python (gated models)
|
||||
Some models, such as Llama and Mistral, do not allow public access to their chat template. You must either use the CLI method above or follow these instructions to use Python:
|
||||
|
||||
1. For these steps, you must have git and git-lfs installed.
|
||||
2. You must have a HuggingFace account and be logged in.
|
||||
3. You must already have access to the gated model. Otherwise, request access.
|
||||
4. You must have an SSH key configured for git access to HuggingFace.
|
||||
5. `git clone` the model's HuggingFace repo using the SSH clone URL. There is no need to download the entire model, which is very large. A good way to do this on Linux is:
|
||||
```console
|
||||
$ GIT_LFS_SKIP_SMUDGE=1 git clone git@hf.co:meta-llama/Llama-3.1-8B-Instruct.git
|
||||
$ cd Llama-3.1-8B-Instruct
|
||||
$ git lfs pull -I "tokenizer.*"
|
||||
```
|
||||
6. Follow the above instructions for open models, but replace the model ID with the path to the directory containing `tokenizer_config.json`:
|
||||
```
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained('.')
|
||||
```
|
||||
|
||||
|
||||
## Advanced: How do chat templates work?
|
||||
The chat template is applied to the entire conversation you see in the chat window. The template loops over the list of messages, each containing `role` and `content` fields. `role` is either `user`, `assistant`, or `system`.
|
||||
|
||||
GPT4All also supports the special variables `bos_token`, `eos_token`, and `add_generation_prompt`. See the [HuggingFace docs] for what those do.
|
||||
|
||||
[HuggingFace docs]: https://huggingface.co/docs/transformers/v4.46.3/en/chat_templating#special-variables
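To see the mechanics in isolation, here is a small illustrative sketch that renders a message list through a Jinja template in Python, using the same kind of sandboxed Jinja environment as the Python bindings. The template text itself is made up and not tied to any particular model:

```python
# Sketch: render a conversation with a Jinja chat template, outside of GPT4All.
# The template format below is illustrative only; each model defines its own.
from jinja2.sandbox import ImmutableSandboxedEnvironment

template_text = """\
{%- for message in messages %}
<|{{ message['role'] }}|>
{{ message['content'] }}
{% endfor %}
{%- if add_generation_prompt %}
<|assistant|>
{%- endif %}
"""

env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
template = env.from_string(template_text)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]
print(template.render(messages=messages, add_generation_prompt=True))
```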
|
||||
|
||||
|
||||
## Advanced: How do I make a chat template?
|
||||
The best way to create a chat template is to start from an existing one as a reference, then modify it to use the format documented for the given model. The model's README page may explicitly give an example of its template, or it may mention the name of a well-known standard template such as ChatML, Alpaca, or Vicuna. GPT4All does not yet include presets for these templates, so they will have to be found in other models or taken from the community.
|
||||
|
||||
For more information, see the very helpful [HuggingFace guide]. Some of it does not apply to GPT4All, such as the information about tool calling and RAG, which GPT4All implements differently.
|
||||
|
||||
Some models use a prompt template that does not intuitively map to a multi-turn chat, because it is more intended for single instructions. The [FastChat] implementation of these templates is a useful reference for the correct way to extend them to multiple messages.
|
||||
|
||||
[HuggingFace guide]: https://huggingface.co/docs/transformers/v4.46.3/en/chat_templating#advanced-template-writing-tips
|
||||
[FastChat]: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
|
||||
|
||||
|
||||
# Advanced: What are GPT4All v1 templates?
|
||||
GPT4All supports its own template syntax, which is nonstandard but provides complete control over the way LocalDocs sources and file attachments are inserted into the conversation. These templates begin with `{# gpt4all v1 #}` and look similar to the example below.
|
||||
|
||||
For standard templates, GPT4All combines the user message, sources, and attachments into the `content` field. For GPT4All v1 templates, this is not done, so they must be used directly in the template for those features to work correctly.
|
||||
|
||||
```jinja
|
||||
{# gpt4all v1 #}
|
||||
{%- for message in messages %}
|
||||
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}
|
||||
{%- if message['role'] == 'user' %}
|
||||
{%- for source in message['sources'] %}
|
||||
{%- if loop.first %}
|
||||
{{- '### Context:\n' }}
|
||||
{%- endif %}
|
||||
{{- 'Collection: ' + source['collection'] + '\n' +
|
||||
'Path: ' + source['path'] + '\n' +
|
||||
'Excerpt: ' + source['text'] + '\n\n' }}
|
||||
{%- endfor %}
|
||||
{%- endif %}
|
||||
{%- for attachment in message['prompt_attachments'] %}
|
||||
{{- attachment['processed_content'] + '\n\n' }}
|
||||
{%- endfor %}
|
||||
{{- message['content'] | trim }}
|
||||
{{- '<|eot_id|>' }}
|
||||
{%- endfor %}
|
||||
{%- if add_generation_prompt %}
|
||||
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
|
||||
{%- endif %}
|
||||
```
|
@ -0,0 +1,85 @@
|
||||
# Using GPT4All to Privately Chat with your Microsoft Excel Spreadsheets
|
||||
Local and Private AI Chat with your Microsoft Excel Spreadsheets
|
||||
|
||||
Microsoft Excel allows you to create, manage, and analyze data in spreadsheet format. By attaching your spreadsheets directly to GPT4All, you can privately chat with the AI to query and explore the data, enabling you to summarize, generate reports, and glean insights from your files—all within your conversation.
|
||||
|
||||
<div style="position: relative; padding-bottom: 56.25%; height: 0; overflow: hidden;">
|
||||
<iframe src="../../assets/gpt4all_xlsx_attachment.mp4" style="position: absolute; top: 0; left: 0; width: 100%; height: 100%; border:0;" allowfullscreen title="YouTube Video"></iframe>
|
||||
</div>
|
||||
|
||||
|
||||
## Attach Microsoft Excel to your GPT4All Conversation
|
||||
|
||||
!!! note "Attach Microsoft Excel to your GPT4All Conversation"
|
||||
|
||||
1. **Install and Open GPT4All**:
|
||||
|
||||
- Go to [nomic.ai/gpt4all](https://nomic.ai/gpt4all) to install GPT4All for your operating system.
|
||||
|
||||
- Navigate to the Chats view within GPT4All.
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td>
|
||||
<!-- Screenshot of Chat view -->
|
||||
<img width="1348" alt="Chat view" src="../../assets/chat_window.png">
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
2. **Example Spreadsheet**:
|
||||
|
||||
<table>
|
||||
<tr>
|
||||
<td>
|
||||
<!-- Screenshot of Spreadsheet view -->
|
||||
<img width="1348" alt="Spreadsheet view" src="../../assets/disney_spreadsheet.png">
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
3. **Attach to a GPT4All Conversation**
|
||||
<table>
|
||||
<tr>
|
||||
<td>
|
||||
<!-- Screenshot of Attach view -->
|
||||
<img width="1348" alt="Attach view" src="../../assets/attach_spreadsheet.png">
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
4. **Have GPT4All Summarize and Generate a Report**
|
||||
<table>
|
||||
<tr>
|
||||
<td>
|
||||
<!-- Screenshot of Attach view -->
|
||||
<img width="1348" alt="Attach view" src="../../assets/spreadsheet_chat.png">
|
||||
</td>
|
||||
</tr>
|
||||
</table>
|
||||
|
||||
|
||||
## How It Works
|
||||
|
||||
GPT4All parses your attached Excel spreadsheet into Markdown, a format understandable to LLMs, and adds the Markdown text to the context for your LLM chat. You can view the code that converts `.xlsx` to Markdown [here](https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/src/xlsxtomd.cpp) in the GPT4All GitHub repo.
|
||||
|
||||
For example, the above spreadsheet titled `disney_income_stmt.xlsx` would be formatted the following way:
|
||||
|
||||
```markdown
|
||||
## disney_income_stmt
|
||||
|
||||
|Walt Disney Co.|||||||
|
||||
|---|---|---|---|---|---|---|
|
||||
|Consolidated Income Statement|||||||
|
||||
|||||||||
|
||||
|US$ in millions|||||||
|
||||
|12 months ended:|2023-09-30 00:00:00|2022-10-01 00:00:00|2021-10-02 00:00:00|2020-10-03 00:00:00|2019-09-28 00:00:00|2018-09-29 00:00:00|
|
||||
|Services|79562|74200|61768|59265|60542|50869|
|
||||
...
|
||||
...
|
||||
...
|
||||
```
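The conversion itself happens in the C++ code linked above. Purely as an illustration of the idea, a similar Markdown table could be produced in Python with `pandas`; this is not how GPT4All does it, and it assumes the spreadsheet file is available locally:

```python
# Illustration only: GPT4All's real converter is the C++ code linked above.
# Requires: pip install pandas openpyxl tabulate
import pandas as pd

sheets = pd.read_excel("disney_income_stmt.xlsx", sheet_name=None, header=None)
for name, frame in sheets.items():
    print(f"## {name}\n")
    # to_markdown renders the sheet as a pipe-delimited Markdown table
    print(frame.to_markdown(index=False))
    print()
```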
|
||||
|
||||
## Limitations
|
||||
|
||||
It is important to double-check the claims LLMs make about the spreadsheets you provide. LLMs can make mistakes about the data they are presented with, particularly smaller models (around 8B parameters) that fit within the memory of consumer hardware.
|
@ -4,6 +4,8 @@ The GPT4All Desktop Application allows you to download and run large language mo
|
||||
|
||||
With GPT4All, you can chat with models, turn your local files into information sources for models [(LocalDocs)](localdocs.md), or browse models available online to download onto your device.
|
||||
|
||||
[Official Video Tutorial](https://www.youtube.com/watch?v=gQcZDXRVJok)
|
||||
|
||||
## Quickstart
|
||||
|
||||
!!! note "Quickstart"
|
||||
|
@ -8,10 +8,11 @@
|
||||
| --- | --- | --- |
|
||||
| **Theme** | Color theme for the application. Options are `Light`, `Dark`, and `LegacyDark` | `Light` |
|
||||
| **Font Size** | Font size setting for text throughout the application. Options are Small, Medium, and Large | Small |
|
||||
| **Language and Locale** | The language and locale you wish to use | System Locale |
|
||||
| **Device** | Device that will run your models. Options are `Auto` (GPT4All chooses), `Metal` (Apple Silicon M1+), `CPU`, and `GPU` | `Auto` |
|
||||
| **Default Model** | Choose your preferred LLM to load by default on startup| Auto |
|
||||
| **Suggestion Mode** | Generate suggested follow up questions at the end of responses | When chatting with LocalDocs |
|
||||
| **Download Path** | Select a destination on your device to save downloaded models | Windows: `C:\Users\{username}\AppData\Local\nomic.ai\GPT4All`<br><br>Mac: `/Users/{username}/Library/Application Support/nomic.ai/GPT4All/`<br><br>Linux: `/home/{username}/.local/share/nomic.ai/GPT4All` |
|
||||
|
||||
| **Enable Datalake** | Opt-in to sharing interactions with GPT4All community (**anonymous** and **optional**) | Off |
|
||||
|
||||
!!! note "Advanced Application Settings"
|
||||
@ -19,7 +20,7 @@
|
||||
| Setting | Description | Default Value |
|
||||
| --- | --- | --- |
|
||||
| **CPU Threads** | Number of concurrently running CPU threads (more can speed up responses) | 4 |
|
||||
| **Save Chat Context** | Save chat context to disk to pick up exactly where a model left off. | Off |
|
||||
| **Enable System Tray** | The application will minimize to the system tray / taskbar when the window is closed | Off |
|
||||
| **Enable Local Server** | Allow any application on your device to use GPT4All via an OpenAI-compatible GPT4All API | Off |
|
||||
| **API Server Port** | Local HTTP port for the local API server | 4891 |
|
||||
|
||||
@ -30,8 +31,11 @@
|
||||
| Setting | Description | Default Value |
|
||||
| --- | --- | --- |
|
||||
| **Name** | Unique name of this model / character | set by model uploader |
|
||||
| **System Prompt** | General instructions for the chats this model will be used for | set by model uploader |
|
||||
| **Prompt Template** | Format of user <-> assistant interactions for the chats this model will be used for | set by model uploader |
|
||||
| **Model File** | Filename (.gguf) of the model | set by model uploader |
|
||||
| **System Message** | General instructions for the chats this model will be used for | set by model uploader |
|
||||
| **Chat Template** | Format of user <-> assistant interactions for the chats this model will be used for | set by model uploader |
|
||||
| **Chat Name Prompt** | Prompt used to automatically generate chat names | Describe the above conversation in seven words or less. |
|
||||
| **Suggested FollowUp Prompt** | Prompt used to automatically generate follow up questions after a chat response | Suggest three very short factual follow-up questions that have not been answered yet or cannot be found inspired by the previous conversation and excerpts. |
|
||||
|
||||
### Clone
|
||||
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
It is possible you are trying to load a model from HuggingFace whose weights are not compatible with our [backend](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings).
|
||||
|
||||
Try downloading one of the officially supported models mentioned our [website](https://gpt4all.io/). If the problem persists, please share your experience on our [Discord](https://discord.com/channels/1076964370942267462).
|
||||
Try downloading one of the officially supported models listed on the main models page in the application. If the problem persists, please share your experience on our [Discord](https://discord.com/channels/1076964370942267462).
|
||||
|
||||
## Bad Responses
|
||||
|
||||
@ -24,4 +24,4 @@ Including information in a prompt is not a guarantee that it will be used correc
|
||||
|
||||
### LocalDocs Issues
|
||||
|
||||
Occasionally a model - particularly a smaller or overall weaker LLM - may not use the relevant text snippets from the files that were referenced via LocalDocs. If you are seeing this, it can help to use phrases like "in the docs" or "from the provided files" when prompting your model.
|
||||
Occasionally a model - particularly a smaller or overall weaker LLM - may not use the relevant text snippets from the files that were referenced via LocalDocs. If you are seeing this, it can help to use phrases like "in the docs" or "from the provided files" when prompting your model.
|
||||
|
@ -3,13 +3,13 @@ from __future__ import annotations
|
||||
import ctypes
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
import textwrap
|
||||
import threading
|
||||
from enum import Enum
|
||||
from queue import Queue
|
||||
from typing import TYPE_CHECKING, Any, Callable, Generic, Iterable, Literal, NoReturn, TypeVar, overload
|
||||
from typing import TYPE_CHECKING, Any, Callable, Generic, Iterable, Iterator, Literal, NoReturn, TypeVar, overload
|
||||
|
||||
if sys.version_info >= (3, 9):
|
||||
import importlib.resources as importlib_resources
|
||||
@ -23,30 +23,75 @@ else:
|
||||
from typing import TypedDict
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing_extensions import TypeAlias
|
||||
from typing_extensions import ParamSpec, TypeAlias
|
||||
T = TypeVar("T")
|
||||
P = ParamSpec("P")
|
||||
|
||||
EmbeddingsType = TypeVar('EmbeddingsType', bound='list[Any]')
|
||||
|
||||
cuda_found: bool = False
|
||||
|
||||
# Find CUDA libraries from the official packages
|
||||
cuda_found = False
|
||||
if platform.system() in ('Linux', 'Windows'):
|
||||
|
||||
# TODO(jared): use operator.call after we drop python 3.10 support
|
||||
def _operator_call(obj: Callable[P, T], /, *args: P.args, **kwargs: P.kwargs) -> T:
|
||||
return obj(*args, **kwargs)
|
||||
|
||||
|
||||
# Detect Rosetta 2
|
||||
@_operator_call
|
||||
def check_rosetta() -> None:
|
||||
if platform.system() == "Darwin" and platform.processor() == "i386":
|
||||
p = subprocess.run("sysctl -n sysctl.proc_translated".split(), capture_output=True, text=True)
|
||||
if p.returncode == 0 and p.stdout.strip() == "1":
|
||||
raise RuntimeError(textwrap.dedent("""\
|
||||
Running GPT4All under Rosetta is not supported due to CPU feature requirements.
|
||||
Please install GPT4All in an environment that uses a native ARM64 Python interpreter.
|
||||
""").strip())
|
||||
|
||||
|
||||
# Check for C++ runtime libraries
|
||||
if platform.system() == "Windows":
|
||||
try:
|
||||
from nvidia import cuda_runtime, cublas
|
||||
except ImportError:
|
||||
pass # CUDA is optional
|
||||
else:
|
||||
if platform.system() == 'Linux':
|
||||
cudalib = 'lib/libcudart.so.12'
|
||||
cublaslib = 'lib/libcublas.so.12'
|
||||
ctypes.CDLL("msvcp140.dll")
|
||||
ctypes.CDLL("vcruntime140.dll")
|
||||
ctypes.CDLL("vcruntime140_1.dll")
|
||||
except OSError as e:
|
||||
print(textwrap.dedent(f"""\
|
||||
{e!r}
|
||||
The Microsoft Visual C++ runtime libraries were not found. Please install them from
|
||||
https://aka.ms/vs/17/release/vc_redist.x64.exe
|
||||
"""), file=sys.stderr)
|
||||
|
||||
|
||||
@_operator_call
|
||||
def find_cuda() -> None:
|
||||
global cuda_found
|
||||
|
||||
def _load_cuda(rtver: str, blasver: str) -> None:
|
||||
if platform.system() == "Linux":
|
||||
cudalib = f"lib/libcudart.so.{rtver}"
|
||||
cublaslib = f"lib/libcublas.so.{blasver}"
|
||||
else: # Windows
|
||||
cudalib = r'bin\cudart64_12.dll'
|
||||
cublaslib = r'bin\cublas64_12.dll'
|
||||
cudalib = fr"bin\cudart64_{rtver.replace('.', '')}.dll"
|
||||
cublaslib = fr"bin\cublas64_{blasver}.dll"
|
||||
|
||||
# preload the CUDA libs so the backend can find them
|
||||
ctypes.CDLL(os.path.join(cuda_runtime.__path__[0], cudalib), mode=ctypes.RTLD_GLOBAL)
|
||||
ctypes.CDLL(os.path.join(cublas.__path__[0], cublaslib), mode=ctypes.RTLD_GLOBAL)
|
||||
cuda_found = True
|
||||
|
||||
# Find CUDA libraries from the official packages
|
||||
if platform.system() in ("Linux", "Windows"):
|
||||
try:
|
||||
from nvidia import cuda_runtime, cublas
|
||||
except ImportError:
|
||||
pass # CUDA is optional
|
||||
else:
|
||||
for rtver, blasver in [("12", "12"), ("11.0", "11")]:
|
||||
try:
|
||||
_load_cuda(rtver, blasver)
|
||||
cuda_found = True
|
||||
except OSError: # dlopen() does not give specific error codes
|
||||
pass # try the next one
|
||||
|
||||
|
||||
# TODO: provide a config file to make this more robust
|
||||
@ -73,23 +118,18 @@ llmodel = load_llmodel_library()
|
||||
|
||||
class LLModelPromptContext(ctypes.Structure):
|
||||
_fields_ = [
|
||||
("logits", ctypes.POINTER(ctypes.c_float)),
|
||||
("logits_size", ctypes.c_size_t),
|
||||
("tokens", ctypes.POINTER(ctypes.c_int32)),
|
||||
("tokens_size", ctypes.c_size_t),
|
||||
("n_past", ctypes.c_int32),
|
||||
("n_ctx", ctypes.c_int32),
|
||||
("n_predict", ctypes.c_int32),
|
||||
("top_k", ctypes.c_int32),
|
||||
("top_p", ctypes.c_float),
|
||||
("min_p", ctypes.c_float),
|
||||
("temp", ctypes.c_float),
|
||||
("n_batch", ctypes.c_int32),
|
||||
("n_predict", ctypes.c_int32),
|
||||
("top_k", ctypes.c_int32),
|
||||
("top_p", ctypes.c_float),
|
||||
("min_p", ctypes.c_float),
|
||||
("temp", ctypes.c_float),
|
||||
("n_batch", ctypes.c_int32),
|
||||
("repeat_penalty", ctypes.c_float),
|
||||
("repeat_last_n", ctypes.c_int32),
|
||||
("context_erase", ctypes.c_float),
|
||||
("repeat_last_n", ctypes.c_int32),
|
||||
("context_erase", ctypes.c_float),
|
||||
]
|
||||
|
||||
|
||||
class LLModelGPUDevice(ctypes.Structure):
|
||||
_fields_ = [
|
||||
("backend", ctypes.c_char_p),
|
||||
@ -100,6 +140,7 @@ class LLModelGPUDevice(ctypes.Structure):
|
||||
("vendor", ctypes.c_char_p),
|
||||
]
|
||||
|
||||
|
||||
# Define C function signatures using ctypes
|
||||
llmodel.llmodel_model_create.argtypes = [ctypes.c_char_p]
|
||||
llmodel.llmodel_model_create.restype = ctypes.c_void_p
|
||||
@ -117,24 +158,21 @@ llmodel.llmodel_required_mem.restype = ctypes.c_size_t
|
||||
llmodel.llmodel_isModelLoaded.argtypes = [ctypes.c_void_p]
|
||||
llmodel.llmodel_isModelLoaded.restype = ctypes.c_bool
|
||||
|
||||
PromptCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32)
|
||||
ResponseCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32, ctypes.c_char_p)
|
||||
RecalculateCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_bool)
|
||||
EmbCancelCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_uint), ctypes.c_uint, ctypes.c_char_p)
|
||||
PromptCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_int32), ctypes.c_size_t, ctypes.c_bool)
|
||||
ResponseCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32, ctypes.c_char_p)
|
||||
EmbCancelCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_uint), ctypes.c_uint, ctypes.c_char_p)
|
||||
SpecialTokenCallback = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_char_p)
|
||||
|
||||
llmodel.llmodel_prompt.argtypes = [
|
||||
ctypes.c_void_p,
|
||||
ctypes.c_char_p,
|
||||
ctypes.c_char_p,
|
||||
PromptCallback,
|
||||
ResponseCallback,
|
||||
RecalculateCallback,
|
||||
ctypes.POINTER(LLModelPromptContext),
|
||||
ctypes.c_bool,
|
||||
ctypes.c_char_p,
|
||||
ctypes.POINTER(ctypes.c_char_p),
|
||||
]
|
||||
|
||||
llmodel.llmodel_prompt.restype = None
|
||||
llmodel.llmodel_prompt.restype = ctypes.c_bool
|
||||
|
||||
llmodel.llmodel_embed.argtypes = [
|
||||
ctypes.c_void_p,
|
||||
@ -183,6 +221,12 @@ llmodel.llmodel_model_backend_name.restype = ctypes.c_char_p
|
||||
llmodel.llmodel_model_gpu_device_name.argtypes = [ctypes.c_void_p]
|
||||
llmodel.llmodel_model_gpu_device_name.restype = ctypes.c_char_p
|
||||
|
||||
llmodel.llmodel_count_prompt_tokens.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char_p)]
|
||||
llmodel.llmodel_count_prompt_tokens.restype = ctypes.c_int32
|
||||
|
||||
llmodel.llmodel_model_foreach_special_token.argtypes = [ctypes.c_void_p, SpecialTokenCallback]
|
||||
llmodel.llmodel_model_foreach_special_token.restype = None
|
||||
|
||||
ResponseCallbackType = Callable[[int, str], bool]
|
||||
RawResponseCallbackType = Callable[[int, bytes], bool]
|
||||
EmbCancelCallbackType: TypeAlias = 'Callable[[list[int], str], bool]'
|
||||
@ -227,7 +271,6 @@ class LLModel:
|
||||
self.model_path = model_path.encode()
|
||||
self.n_ctx = n_ctx
|
||||
self.ngl = ngl
|
||||
self.context: LLModelPromptContext | None = None
|
||||
self.buffer = bytearray()
|
||||
self.buff_expecting_cont_bytes: int = 0
|
||||
|
||||
@ -247,6 +290,10 @@ class LLModel:
|
||||
|
||||
raise RuntimeError(f"Unable to instantiate model: {errmsg}")
|
||||
self.model: ctypes.c_void_p | None = model
|
||||
self.special_tokens_map: dict[str, str] = {}
|
||||
llmodel.llmodel_model_foreach_special_token(
|
||||
self.model, lambda n, t: self.special_tokens_map.__setitem__(n.decode(), t.decode()),
|
||||
)
|
||||
|
||||
def __del__(self, llmodel=llmodel):
|
||||
if hasattr(self, 'model'):
|
||||
@ -273,6 +320,19 @@ class LLModel:
|
||||
dev = llmodel.llmodel_model_gpu_device_name(self.model)
|
||||
return None if dev is None else dev.decode()
|
||||
|
||||
def count_prompt_tokens(self, prompt: str) -> int:
|
||||
if self.model is None:
|
||||
self._raise_closed()
|
||||
err = ctypes.c_char_p()
|
||||
n_tok = llmodel.llmodel_count_prompt_tokens(self.model, prompt, ctypes.byref(err))
|
||||
if n_tok < 0:
|
||||
s = err.value
|
||||
errmsg = 'null' if s is None else s.decode()
|
||||
raise RuntimeError(f'Unable to count prompt tokens: {errmsg}')
|
||||
return n_tok
|
||||
|
||||
llmodel.llmodel_count_prompt_tokens.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
|
||||
|
||||
@staticmethod
|
||||
def list_gpus(mem_required: int = 0) -> list[str]:
|
||||
"""
|
||||
@ -336,51 +396,6 @@ class LLModel:
|
||||
raise Exception("Model not loaded")
|
||||
return llmodel.llmodel_threadCount(self.model)
|
||||
|
||||
def _set_context(
|
||||
self,
|
||||
n_predict: int = 4096,
|
||||
top_k: int = 40,
|
||||
top_p: float = 0.9,
|
||||
min_p: float = 0.0,
|
||||
temp: float = 0.1,
|
||||
n_batch: int = 8,
|
||||
repeat_penalty: float = 1.2,
|
||||
repeat_last_n: int = 10,
|
||||
context_erase: float = 0.75,
|
||||
reset_context: bool = False,
|
||||
):
|
||||
if self.context is None:
|
||||
context = LLModelPromptContext(
|
||||
logits_size=0,
|
||||
tokens_size=0,
|
||||
n_past=0,
|
||||
n_ctx=0,
|
||||
n_predict=n_predict,
|
||||
top_k=top_k,
|
||||
top_p=top_p,
|
||||
min_p=min_p,
|
||||
temp=temp,
|
||||
n_batch=n_batch,
|
||||
repeat_penalty=repeat_penalty,
|
||||
repeat_last_n=repeat_last_n,
|
||||
context_erase=context_erase,
|
||||
)
|
||||
self.context = context
|
||||
else:
|
||||
context = self.context
|
||||
if reset_context:
|
||||
self.context.n_past = 0
|
||||
|
||||
self.context.n_predict = n_predict
|
||||
self.context.top_k = top_k
|
||||
self.context.top_p = top_p
|
||||
self.context.min_p = min_p
|
||||
self.context.temp = temp
|
||||
self.context.n_batch = n_batch
|
||||
self.context.repeat_penalty = repeat_penalty
|
||||
self.context.repeat_last_n = repeat_last_n
|
||||
self.context.context_erase = context_erase
|
||||
|
||||
@overload
|
||||
def generate_embeddings(
|
||||
self, text: str, prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
|
||||
@ -450,20 +465,18 @@ class LLModel:
|
||||
|
||||
def prompt_model(
|
||||
self,
|
||||
prompt: str,
|
||||
prompt_template: str,
|
||||
callback: ResponseCallbackType,
|
||||
n_predict: int = 4096,
|
||||
top_k: int = 40,
|
||||
top_p: float = 0.9,
|
||||
min_p: float = 0.0,
|
||||
temp: float = 0.1,
|
||||
n_batch: int = 8,
|
||||
repeat_penalty: float = 1.2,
|
||||
repeat_last_n: int = 10,
|
||||
context_erase: float = 0.75,
|
||||
reset_context: bool = False,
|
||||
special: bool = False,
|
||||
prompt : str,
|
||||
callback : ResponseCallbackType,
|
||||
n_predict : int = 4096,
|
||||
top_k : int = 40,
|
||||
top_p : float = 0.9,
|
||||
min_p : float = 0.0,
|
||||
temp : float = 0.1,
|
||||
n_batch : int = 8,
|
||||
repeat_penalty : float = 1.2,
|
||||
repeat_last_n : int = 10,
|
||||
context_erase : float = 0.75,
|
||||
reset_context : bool = False,
|
||||
):
|
||||
"""
|
||||
Generate response from model from a prompt.
|
||||
@ -486,35 +499,38 @@ class LLModel:
|
||||
self.buffer.clear()
|
||||
self.buff_expecting_cont_bytes = 0
|
||||
|
||||
self._set_context(
|
||||
n_predict=n_predict,
|
||||
top_k=top_k,
|
||||
top_p=top_p,
|
||||
min_p=min_p,
|
||||
temp=temp,
|
||||
n_batch=n_batch,
|
||||
repeat_penalty=repeat_penalty,
|
||||
repeat_last_n=repeat_last_n,
|
||||
context_erase=context_erase,
|
||||
reset_context=reset_context,
|
||||
context = LLModelPromptContext(
|
||||
n_predict = n_predict,
|
||||
top_k = top_k,
|
||||
top_p = top_p,
|
||||
min_p = min_p,
|
||||
temp = temp,
|
||||
n_batch = n_batch,
|
||||
repeat_penalty = repeat_penalty,
|
||||
repeat_last_n = repeat_last_n,
|
||||
context_erase = context_erase,
|
||||
)
|
||||
|
||||
llmodel.llmodel_prompt(
|
||||
error_msg: bytes | None = None
|
||||
def error_callback(msg: bytes) -> None:
|
||||
nonlocal error_msg
|
||||
error_msg = msg
|
||||
|
||||
err = ctypes.c_char_p()
|
||||
if not llmodel.llmodel_prompt(
|
||||
self.model,
|
||||
ctypes.c_char_p(prompt.encode()),
|
||||
ctypes.c_char_p(prompt_template.encode()),
|
||||
PromptCallback(self._prompt_callback),
|
||||
ResponseCallback(self._callback_decoder(callback)),
|
||||
RecalculateCallback(self._recalculate_callback),
|
||||
self.context,
|
||||
special,
|
||||
ctypes.c_char_p(),
|
||||
)
|
||||
|
||||
context,
|
||||
ctypes.byref(err),
|
||||
):
|
||||
s = err.value
|
||||
raise RuntimeError(f"prompt error: {'null' if s is None else s.decode()}")
|
||||
|
||||
def prompt_model_streaming(
|
||||
self, prompt: str, prompt_template: str, callback: ResponseCallbackType = empty_response_callback, **kwargs
|
||||
) -> Iterable[str]:
|
||||
self, prompt: str, callback: ResponseCallbackType = empty_response_callback, **kwargs: Any,
|
||||
) -> Iterator[str]:
|
||||
if self.model is None:
|
||||
self._raise_closed()
|
||||
|
||||
@ -533,15 +549,15 @@ class LLModel:
|
||||
|
||||
return _generator_callback
|
||||
|
||||
def run_llmodel_prompt(prompt: str, prompt_template: str, callback: ResponseCallbackType, **kwargs):
|
||||
self.prompt_model(prompt, prompt_template, callback, **kwargs)
|
||||
def run_llmodel_prompt(prompt: str, callback: ResponseCallbackType, **kwargs):
|
||||
self.prompt_model(prompt, callback, **kwargs)
|
||||
output_queue.put(Sentinel.TERMINATING_SYMBOL)
|
||||
|
||||
# Kick off llmodel_prompt in separate thread so we can return generator
|
||||
# immediately
|
||||
thread = threading.Thread(
|
||||
target=run_llmodel_prompt,
|
||||
args=(prompt, prompt_template, _generator_callback_wrapper(callback)),
|
||||
args=(prompt, _generator_callback_wrapper(callback)),
|
||||
kwargs=kwargs,
|
||||
)
|
||||
thread.start()
|
||||
@ -560,16 +576,16 @@ class LLModel:
|
||||
decoded = []
|
||||
|
||||
for byte in response:
|
||||
|
||||
|
||||
bits = "{:08b}".format(byte)
|
||||
(high_ones, _, _) = bits.partition('0')
|
||||
|
||||
if len(high_ones) == 1:
|
||||
if len(high_ones) == 1:
|
||||
# continuation byte
|
||||
self.buffer.append(byte)
|
||||
self.buff_expecting_cont_bytes -= 1
|
||||
|
||||
else:
|
||||
else:
|
||||
# beginning of a byte sequence
|
||||
if len(self.buffer) > 0:
|
||||
decoded.append(self.buffer.decode(errors='replace'))
|
||||
@ -579,27 +595,22 @@ class LLModel:
|
||||
self.buffer.append(byte)
|
||||
self.buff_expecting_cont_bytes = max(0, len(high_ones) - 1)
|
||||
|
||||
if self.buff_expecting_cont_bytes <= 0:
|
||||
if self.buff_expecting_cont_bytes <= 0:
|
||||
# received the whole sequence or an out of place continuation byte
|
||||
decoded.append(self.buffer.decode(errors='replace'))
|
||||
|
||||
self.buffer.clear()
|
||||
self.buff_expecting_cont_bytes = 0
|
||||
|
||||
|
||||
if len(decoded) == 0 and self.buff_expecting_cont_bytes > 0:
|
||||
# wait for more continuation bytes
|
||||
return True
|
||||
|
||||
return callback(token_id, ''.join(decoded))
|
||||
|
||||
return callback(token_id, ''.join(decoded))
|
||||
|
||||
return _raw_callback
|
||||
|
||||
# Empty prompt callback
|
||||
@staticmethod
|
||||
def _prompt_callback(token_id: int) -> bool:
|
||||
def _prompt_callback(token_ids: ctypes._Pointer[ctypes.c_int32], n_token_ids: int, cached: bool) -> bool:
|
||||
return True
|
||||
|
||||
# Empty recalculate callback
|
||||
@staticmethod
|
||||
def _recalculate_callback(is_recalculating: bool) -> bool:
|
||||
return is_recalculating
|
||||
|
@ -4,38 +4,66 @@ Python only API for running all GPT4All models.
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import platform
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import warnings
|
||||
from contextlib import contextmanager
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from types import TracebackType
|
||||
from typing import TYPE_CHECKING, Any, Iterable, Literal, Protocol, overload
|
||||
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, NamedTuple, NoReturn, Protocol, TypedDict, overload
|
||||
|
||||
import jinja2
|
||||
import requests
|
||||
from jinja2.sandbox import ImmutableSandboxedEnvironment
|
||||
from requests.exceptions import ChunkedEncodingError
|
||||
from tqdm import tqdm
|
||||
from urllib3.exceptions import IncompleteRead, ProtocolError
|
||||
|
||||
from ._pyllmodel import (CancellationError as CancellationError, EmbCancelCallbackType, EmbedResult as EmbedResult,
|
||||
LLModel, ResponseCallbackType, empty_response_callback)
|
||||
LLModel, ResponseCallbackType, _operator_call, empty_response_callback)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from typing_extensions import Self, TypeAlias
|
||||
|
||||
if sys.platform == 'darwin':
|
||||
if sys.platform == "darwin":
|
||||
import fcntl
|
||||
|
||||
# TODO: move to config
|
||||
DEFAULT_MODEL_DIRECTORY = Path.home() / ".cache" / "gpt4all"
|
||||
|
||||
DEFAULT_PROMPT_TEMPLATE = "### Human:\n{0}\n\n### Assistant:\n"
|
||||
ConfigType: TypeAlias = "dict[str, Any]"
|
||||
|
||||
ConfigType: TypeAlias = 'dict[str, Any]'
|
||||
MessageType: TypeAlias = 'dict[str, str]'
|
||||
# Environment setup adapted from HF transformers
|
||||
@_operator_call
|
||||
def _jinja_env() -> ImmutableSandboxedEnvironment:
|
||||
def raise_exception(message: str) -> NoReturn:
|
||||
raise jinja2.exceptions.TemplateError(message)
|
||||
|
||||
def tojson(obj: Any, indent: int | None = None) -> str:
|
||||
return json.dumps(obj, ensure_ascii=False, indent=indent)
|
||||
|
||||
def strftime_now(fmt: str) -> str:
|
||||
return datetime.now().strftime(fmt)
|
||||
|
||||
env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
|
||||
env.filters["tojson" ] = tojson
|
||||
env.globals["raise_exception"] = raise_exception
|
||||
env.globals["strftime_now" ] = strftime_now
|
||||
return env
|
||||
|
||||
|
||||
class MessageType(TypedDict):
|
||||
role: str
|
||||
content: str
|
||||
|
||||
|
||||
class ChatSession(NamedTuple):
|
||||
template: jinja2.Template
|
||||
history: list[MessageType]
|
||||
|
||||
|
||||
class Embed4All:
|
||||
@ -55,7 +83,7 @@ class Embed4All:
|
||||
kwargs: Remaining keyword arguments are passed to the `GPT4All` constructor.
|
||||
"""
|
||||
if model_name is None:
|
||||
model_name = 'all-MiniLM-L6-v2.gguf2.f16.gguf'
|
||||
model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"
|
||||
self.gpt4all = GPT4All(model_name, n_threads=n_threads, device=device, **kwargs)
|
||||
|
||||
def __enter__(self) -> Self:
|
||||
@ -146,18 +174,18 @@ class Embed4All:
|
||||
dimensionality = -1
|
||||
else:
|
||||
if dimensionality <= 0:
|
||||
raise ValueError(f'Dimensionality must be None or a positive integer, got {dimensionality}')
|
||||
raise ValueError(f"Dimensionality must be None or a positive integer, got {dimensionality}")
|
||||
if dimensionality < self.MIN_DIMENSIONALITY:
|
||||
warnings.warn(
|
||||
f'Dimensionality {dimensionality} is less than the suggested minimum of {self.MIN_DIMENSIONALITY}.'
|
||||
' Performance may be degraded.'
|
||||
f"Dimensionality {dimensionality} is less than the suggested minimum of {self.MIN_DIMENSIONALITY}."
|
||||
" Performance may be degraded."
|
||||
)
|
||||
try:
|
||||
do_mean = {"mean": True, "truncate": False}[long_text_mode]
|
||||
except KeyError:
|
||||
raise ValueError(f"Long text mode must be one of 'mean' or 'truncate', got {long_text_mode!r}")
|
||||
result = self.gpt4all.model.generate_embeddings(text, prefix, dimensionality, do_mean, atlas, cancel_cb)
|
||||
return result if return_dict else result['embeddings']
|
||||
return result if return_dict else result["embeddings"]
|
||||
|
||||
|
||||
class GPT4All:
|
||||
@ -205,31 +233,30 @@ class GPT4All:
|
||||
"""
|
||||
|
||||
self.model_type = model_type
|
||||
self._history: list[MessageType] | None = None
|
||||
self._current_prompt_template: str = "{0}"
|
||||
self._chat_session: ChatSession | None = None
|
||||
|
||||
device_init = None
|
||||
if sys.platform == 'darwin':
|
||||
if sys.platform == "darwin":
|
||||
if device is None:
|
||||
backend = 'auto' # 'auto' is effectively 'metal' due to currently non-functional fallback
|
||||
elif device == 'cpu':
|
||||
backend = 'cpu'
|
||||
backend = "auto" # "auto" is effectively "metal" due to currently non-functional fallback
|
||||
elif device == "cpu":
|
||||
backend = "cpu"
|
||||
else:
|
||||
if platform.machine() != 'arm64' or device != 'gpu':
|
||||
raise ValueError(f'Unknown device for this platform: {device}')
|
||||
backend = 'metal'
|
||||
if platform.machine() != "arm64" or device != "gpu":
|
||||
raise ValueError(f"Unknown device for this platform: {device}")
|
||||
backend = "metal"
|
||||
else:
|
||||
backend = 'kompute'
|
||||
if device is None or device == 'cpu':
|
||||
backend = "kompute"
|
||||
if device is None or device == "cpu":
|
||||
pass # use kompute with no device
|
||||
elif device in ('cuda', 'kompute'):
|
||||
elif device in ("cuda", "kompute"):
|
||||
backend = device
|
||||
device_init = 'gpu'
|
||||
elif device.startswith('cuda:'):
|
||||
backend = 'cuda'
|
||||
device_init = device.removeprefix('cuda:')
|
||||
device_init = "gpu"
|
||||
elif device.startswith("cuda:"):
|
||||
backend = "cuda"
|
||||
device_init = _remove_prefix(device, "cuda:")
|
||||
else:
|
||||
device_init = device.removeprefix('kompute:')
|
||||
device_init = _remove_prefix(device, "kompute:")
|
||||
|
||||
# Retrieve model and download if allowed
|
||||
self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
|
||||
@ -265,7 +292,13 @@ class GPT4All:
|
||||
|
||||
@property
|
||||
def current_chat_session(self) -> list[MessageType] | None:
|
||||
return None if self._history is None else list(self._history)
|
||||
return None if self._chat_session is None else self._chat_session.history
|
||||
|
||||
@current_chat_session.setter
|
||||
def current_chat_session(self, history: list[MessageType]) -> None:
|
||||
if self._chat_session is None:
|
||||
raise ValueError("current_chat_session may only be set when there is an active chat session")
|
||||
self._chat_session.history[:] = history
|
||||
|
||||
@staticmethod
|
||||
def list_models() -> list[ConfigType]:
|
||||
@ -277,7 +310,7 @@ class GPT4All:
|
||||
"""
|
||||
resp = requests.get("https://gpt4all.io/models/models3.json")
|
||||
if resp.status_code != 200:
|
||||
raise ValueError(f'Request failed: HTTP {resp.status_code} {resp.reason}')
|
||||
raise ValueError(f"Request failed: HTTP {resp.status_code} {resp.reason}")
|
||||
return resp.json()
|
||||
|
||||
@classmethod
|
||||
@ -307,15 +340,9 @@ class GPT4All:
|
||||
# get the config for the model
|
||||
config: ConfigType = {}
|
||||
if allow_download:
|
||||
available_models = cls.list_models()
|
||||
|
||||
for m in available_models:
|
||||
if model_filename == m["filename"]:
|
||||
tmpl = m.get("promptTemplate", DEFAULT_PROMPT_TEMPLATE)
|
||||
# change to Python-style formatting
|
||||
m["promptTemplate"] = tmpl.replace("%1", "{0}", 1).replace("%2", "{1}", 1)
|
||||
config.update(m)
|
||||
break
|
||||
models = cls.list_models()
|
||||
if (model := next((m for m in models if m["filename"] == model_filename), None)) is not None:
|
||||
config.update(model)
|
||||
|
||||
# Validate download directory
|
||||
if model_path is None:
|
||||
@ -357,7 +384,7 @@ class GPT4All:
|
||||
expected_md5: str | None = None,
|
||||
) -> str | os.PathLike[str]:
|
||||
"""
|
||||
Download model from https://gpt4all.io.
|
||||
Download model from gpt4all.io.
|
||||
|
||||
Args:
|
||||
model_filename: Filename of model (with .gguf extension).
|
||||
@ -379,13 +406,13 @@ class GPT4All:
|
||||
headers = {}
|
||||
if offset:
|
||||
print(f"\nDownload interrupted, resuming from byte position {offset}", file=sys.stderr)
|
||||
headers['Range'] = f'bytes={offset}-' # resume incomplete response
|
||||
headers["Range"] = f"bytes={offset}-" # resume incomplete response
|
||||
headers["Accept-Encoding"] = "identity" # Content-Encoding changes meaning of ranges
|
||||
response = requests.get(url, stream=True, headers=headers)
|
||||
if response.status_code not in (200, 206):
|
||||
raise ValueError(f'Request failed: HTTP {response.status_code} {response.reason}')
|
||||
if offset and (response.status_code != 206 or str(offset) not in response.headers.get('Content-Range', '')):
|
||||
raise ValueError('Connection was interrupted and server does not support range requests')
|
||||
raise ValueError(f"Request failed: HTTP {response.status_code} {response.reason}")
|
||||
if offset and (response.status_code != 206 or str(offset) not in response.headers.get("Content-Range", "")):
|
||||
raise ValueError("Connection was interrupted and server does not support range requests")
|
||||
if (enc := response.headers.get("Content-Encoding")) is not None:
|
||||
raise ValueError(f"Expected identity Content-Encoding, got {enc}")
|
||||
return response
|
||||
@ -484,19 +511,19 @@ class GPT4All:
|
||||
|
||||
def generate(
|
||||
self,
|
||||
prompt: str,
|
||||
prompt : str,
|
||||
*,
|
||||
max_tokens: int = 200,
|
||||
temp: float = 0.7,
|
||||
top_k: int = 40,
|
||||
top_p: float = 0.4,
|
||||
min_p: float = 0.0,
|
||||
repeat_penalty: float = 1.18,
|
||||
repeat_last_n: int = 64,
|
||||
n_batch: int = 8,
|
||||
n_predict: int | None = None,
|
||||
streaming: bool = False,
|
||||
callback: ResponseCallbackType = empty_response_callback,
|
||||
max_tokens : int = 200,
|
||||
temp : float = 0.7,
|
||||
top_k : int = 40,
|
||||
top_p : float = 0.4,
|
||||
min_p : float = 0.0,
|
||||
repeat_penalty : float = 1.18,
|
||||
repeat_last_n : int = 64,
|
||||
n_batch : int = 8,
|
||||
n_predict : int | None = None,
|
||||
streaming : bool = False,
|
||||
callback : ResponseCallbackType = empty_response_callback,
|
||||
) -> Any:
|
||||
"""
|
||||
Generate outputs from any GPT4All model.
|
||||
@@ -521,122 +548,94 @@ class GPT4All:
|
||||
|
||||
# Preparing the model request
|
||||
generate_kwargs: dict[str, Any] = dict(
|
||||
temp=temp,
|
||||
top_k=top_k,
|
||||
top_p=top_p,
|
||||
min_p=min_p,
|
||||
repeat_penalty=repeat_penalty,
|
||||
repeat_last_n=repeat_last_n,
|
||||
n_batch=n_batch,
|
||||
n_predict=n_predict if n_predict is not None else max_tokens,
|
||||
temp = temp,
|
||||
top_k = top_k,
|
||||
top_p = top_p,
|
||||
min_p = min_p,
|
||||
repeat_penalty = repeat_penalty,
|
||||
repeat_last_n = repeat_last_n,
|
||||
n_batch = n_batch,
|
||||
n_predict = n_predict if n_predict is not None else max_tokens,
|
||||
)
|
||||
|
||||
if self._history is not None:
|
||||
# check if there is only one message, i.e. system prompt:
|
||||
reset = len(self._history) == 1
|
||||
self._history.append({"role": "user", "content": prompt})
|
||||
|
||||
fct_func = self._format_chat_prompt_template.__func__ # type: ignore[attr-defined]
|
||||
if fct_func is GPT4All._format_chat_prompt_template:
|
||||
if reset:
|
||||
# ingest system prompt
|
||||
# use "%1%2" and not "%1" to avoid implicit whitespace
|
||||
self.model.prompt_model(self._history[0]["content"], "%1%2",
|
||||
empty_response_callback,
|
||||
n_batch=n_batch, n_predict=0, reset_context=True, special=True)
|
||||
prompt_template = self._current_prompt_template.format("%1", "%2")
|
||||
else:
|
||||
warnings.warn(
|
||||
"_format_chat_prompt_template is deprecated. Please use a chat session with a prompt template.",
|
||||
DeprecationWarning,
|
||||
)
|
||||
# special tokens won't be processed
|
||||
prompt = self._format_chat_prompt_template(
|
||||
self._history[-1:],
|
||||
self._history[0]["content"] if reset else "",
|
||||
)
|
||||
prompt_template = "%1"
|
||||
generate_kwargs["reset_context"] = reset
|
||||
else:
|
||||
prompt_template = "%1"
|
||||
generate_kwargs["reset_context"] = True
|
||||
|
||||
# Prepare the callback, process the model response
|
||||
output_collector: list[MessageType]
|
||||
output_collector = [
|
||||
{"content": ""}
|
||||
] # placeholder for the self._history if chat session is not activated
|
||||
full_response = ""
|
||||
|
||||
if self._history is not None:
|
||||
self._history.append({"role": "assistant", "content": ""})
|
||||
output_collector = self._history
|
||||
def _callback_wrapper(token_id: int, response: str) -> bool:
|
||||
nonlocal full_response
|
||||
full_response += response
|
||||
return callback(token_id, response)
|
||||
|
||||
def _callback_wrapper(
|
||||
callback: ResponseCallbackType,
|
||||
output_collector: list[MessageType],
|
||||
) -> ResponseCallbackType:
|
||||
def _callback(token_id: int, response: str) -> bool:
|
||||
nonlocal callback, output_collector
|
||||
last_msg_rendered = prompt
|
||||
if self._chat_session is not None:
|
||||
session = self._chat_session
|
||||
def render(messages: list[MessageType]) -> str:
|
||||
return session.template.render(
|
||||
messages=messages,
|
||||
add_generation_prompt=True,
|
||||
**self.model.special_tokens_map,
|
||||
)
|
||||
session.history.append(MessageType(role="user", content=prompt))
|
||||
prompt = render(session.history)
|
||||
if len(session.history) > 1:
|
||||
last_msg_rendered = render(session.history[-1:])
|
||||
|
||||
output_collector[-1]["content"] += response
|
||||
|
||||
return callback(token_id, response)
|
||||
|
||||
return _callback
|
||||
# Check request length
|
||||
last_msg_len = self.model.count_prompt_tokens(last_msg_rendered)
|
||||
if last_msg_len > (limit := self.model.n_ctx - 4):
|
||||
raise ValueError(f"Your message was too long and could not be processed ({last_msg_len} > {limit}).")
|
||||
|
||||
# Send the request to the model
|
||||
if streaming:
|
||||
return self.model.prompt_model_streaming(
|
||||
prompt,
|
||||
prompt_template,
|
||||
_callback_wrapper(callback, output_collector),
|
||||
**generate_kwargs,
|
||||
)
|
||||
def stream() -> Iterator[str]:
|
||||
yield from self.model.prompt_model_streaming(prompt, _callback_wrapper, **generate_kwargs)
|
||||
if self._chat_session is not None:
|
||||
self._chat_session.history.append(MessageType(role="assistant", content=full_response))
|
||||
return stream()
|
||||
|
||||
self.model.prompt_model(
|
||||
prompt,
|
||||
prompt_template,
|
||||
_callback_wrapper(callback, output_collector),
|
||||
**generate_kwargs,
|
||||
)
|
||||
|
||||
return output_collector[-1]["content"]
|
||||
self.model.prompt_model(prompt, _callback_wrapper, **generate_kwargs)
|
||||
if self._chat_session is not None:
|
||||
self._chat_session.history.append(MessageType(role="assistant", content=full_response))
|
||||
return full_response
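A hedged usage sketch for `generate()`: it assumes the usual `GPT4All(model_name)` constructor from the published package and an example model file, and it only exercises parameters from the signature above.

from gpt4all import GPT4All  # assumes the published package layout

model = GPT4All("Llama-3.2-1B-Instruct-Q4_0.gguf")  # example model file

# Non-streaming call: returns the full completion as a string.
text = model.generate("Name three uses of a local LLM.", max_tokens=200, temp=0.7, top_k=40, top_p=0.4)
print(text)

# Streaming call: returns an iterator of response fragments instead.
for token in model.generate("Write a haiku about GPUs.", max_tokens=60, streaming=True):
    print(token, end="", flush=True)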
|
||||
|
||||
@contextmanager
|
||||
def chat_session(
|
||||
self,
|
||||
system_prompt: str | None = None,
|
||||
prompt_template: str | None = None,
|
||||
system_message: str | Literal[False] | None = None,
|
||||
chat_template: str | None = None,
|
||||
):
|
||||
"""
|
||||
Context manager to hold an inference optimized chat session with a GPT4All model.
|
||||
|
||||
Args:
|
||||
system_prompt: An initial instruction for the model.
|
||||
prompt_template: Template for the prompts with {0} being replaced by the user message.
|
||||
system_message: An initial instruction for the model, None to use the model default, or False to disable. Defaults to None.
|
||||
chat_template: Jinja template for the conversation, or None to use the model default. Defaults to None.
|
||||
"""
|
||||
|
||||
if system_prompt is None:
|
||||
system_prompt = self.config.get("systemPrompt", "")
|
||||
if system_message is None:
|
||||
system_message = self.config.get("systemMessage", False)
|
||||
|
||||
if prompt_template is None:
|
||||
if (tmpl := self.config.get("promptTemplate")) is None:
|
||||
warnings.warn("Use of a sideloaded model or allow_download=False without specifying a prompt template "
|
||||
"is deprecated. Defaulting to Alpaca.", DeprecationWarning)
|
||||
tmpl = DEFAULT_PROMPT_TEMPLATE
|
||||
prompt_template = tmpl
|
||||
if chat_template is None:
|
||||
if "name" not in self.config:
|
||||
raise ValueError("For sideloaded models or with allow_download=False, you must specify a chat template.")
|
||||
if "chatTemplate" not in self.config:
|
||||
raise NotImplementedError("This model appears to have a built-in chat template, but loading it is not "
|
||||
"currently implemented. Please pass a template to chat_session() directly.")
|
||||
if (tmpl := self.config["chatTemplate"]) is None:
|
||||
raise ValueError(f"The model {self.config['name']!r} does not support chat.")
|
||||
chat_template = tmpl
|
||||
|
||||
if re.search(r"%1(?![0-9])", prompt_template):
|
||||
raise ValueError("Prompt template containing a literal '%1' is not supported. For a prompt "
|
||||
"placeholder, please use '{0}' instead.")
|
||||
|
||||
self._history = [{"role": "system", "content": system_prompt}]
|
||||
self._current_prompt_template = prompt_template
|
||||
history = []
|
||||
if system_message is not False:
|
||||
history.append(MessageType(role="system", content=system_message))
|
||||
self._chat_session = ChatSession(
|
||||
template=_jinja_env.from_string(chat_template),
|
||||
history=history,
|
||||
)
|
||||
try:
|
||||
yield self
|
||||
finally:
|
||||
self._history = None
|
||||
self._current_prompt_template = "{0}"
|
||||
self._chat_session = None
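A usage sketch for the new-style session, assuming a model whose models3.json entry ships a chat template so `chat_template` can stay `None`; the session keeps its own history and discards it when the context manager exits.

from gpt4all import GPT4All  # assumes the published package layout

model = GPT4All("Llama-3.2-1B-Instruct-Q4_0.gguf")  # example model file
with model.chat_session(system_message="You are a concise assistant.") as chat:
    print(chat.generate("What is GPT4All?", max_tokens=128))
    print(chat.generate("And what is LocalDocs?", max_tokens=128))  # follow-up sees prior turns
# Outside the with-block, _chat_session is reset to None and the history is gone.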
|
||||
|
||||
@staticmethod
|
||||
def list_gpus() -> list[str]:
|
||||
@@ -648,43 +647,6 @@ class GPT4All:
|
||||
"""
|
||||
return LLModel.list_gpus()
|
||||
|
||||
def _format_chat_prompt_template(
|
||||
self,
|
||||
messages: list[MessageType],
|
||||
default_prompt_header: str = "",
|
||||
default_prompt_footer: str = "",
|
||||
) -> str:
|
||||
"""
|
||||
Helper method for building a prompt from a list of messages, using self._current_prompt_template as the template for each message.
|
||||
|
||||
Warning:
|
||||
This function was deprecated in version 2.3.0, and will be removed in a future release.
|
||||
|
||||
Args:
|
||||
messages: List of dictionaries. Each dictionary should have a "role" key
|
||||
with value of "system", "assistant", or "user" and a "content" key with a
|
||||
string value. Messages are organized such that "system" messages are at top of prompt,
|
||||
and "user" and "assistant" messages are displayed in order. Assistant messages get formatted as
|
||||
"Response: {content}".
|
||||
|
||||
Returns:
|
||||
Formatted prompt.
|
||||
"""
|
||||
|
||||
full_prompt = default_prompt_header + "\n\n" if default_prompt_header != "" else ""
|
||||
|
||||
for message in messages:
|
||||
if message["role"] == "user":
|
||||
user_message = self._current_prompt_template.format(message["content"])
|
||||
full_prompt += user_message
|
||||
if message["role"] == "assistant":
|
||||
assistant_message = message["content"] + "\n"
|
||||
full_prompt += assistant_message
|
||||
|
||||
full_prompt += "\n\n" + default_prompt_footer if default_prompt_footer != "" else ""
|
||||
|
||||
return full_prompt
|
||||
|
||||
|
||||
def append_extension_if_missing(model_name):
|
||||
if not model_name.endswith((".bin", ".gguf")):
|
||||
@@ -697,7 +659,7 @@ class _HasFileno(Protocol):
|
||||
|
||||
|
||||
def _fsync(fd: int | _HasFileno) -> None:
|
||||
if sys.platform == 'darwin':
|
||||
if sys.platform == "darwin":
|
||||
# Apple's fsync does not flush the drive write cache
|
||||
try:
|
||||
fcntl.fcntl(fd, fcntl.F_FULLFSYNC)
|
||||
@@ -706,3 +668,7 @@ def _fsync(fd: int | _HasFileno) -> None:
|
||||
else:
|
||||
return
|
||||
os.fsync(fd)
|
||||
|
||||
|
||||
def _remove_prefix(s: str, prefix: str) -> str:
|
||||
return s[len(prefix):] if s.startswith(prefix) else s
|
||||
|
@@ -14,10 +14,14 @@ nav:
|
||||
- 'Models' : 'gpt4all_desktop/models.md'
|
||||
- 'LocalDocs' : 'gpt4all_desktop/localdocs.md'
|
||||
- 'Settings' : 'gpt4all_desktop/settings.md'
|
||||
- 'Chat Templates' : 'gpt4all_desktop/chat_templates.md'
|
||||
- 'Cookbook':
|
||||
- 'Local AI Chat with Microsoft Excel': 'gpt4all_desktop/cookbook/use-local-ai-models-to-privately-chat-with-microsoft-excel.md'
|
||||
- 'Local AI Chat with your Google Drive': 'gpt4all_desktop/cookbook/use-local-ai-models-to-privately-chat-with-google-drive.md'
|
||||
- 'Local AI Chat with your Obsidian Vault': 'gpt4all_desktop/cookbook/use-local-ai-models-to-privately-chat-with-Obsidian.md'
|
||||
- 'Local AI Chat with your OneDrive': 'gpt4all_desktop/cookbook/use-local-ai-models-to-privately-chat-with-One-Drive.md'
|
||||
- 'API Server':
|
||||
- 'gpt4all_api_server/home.md'
|
||||
- 'Python SDK':
|
||||
- 'gpt4all_python/home.md'
|
||||
- 'Monitoring': 'gpt4all_python/monitoring.md'
|
||||
|
@@ -68,16 +68,17 @@ def get_long_description():
|
||||
|
||||
setup(
|
||||
name=package_name,
|
||||
version="3.0.0",
|
||||
version="2.8.3.dev0",
|
||||
description="Python bindings for GPT4All",
|
||||
long_description=get_long_description(),
|
||||
long_description_content_type="text/markdown",
|
||||
author="Nomic and the Open Source Community",
|
||||
author_email="support@nomic.ai",
|
||||
url="https://gpt4all.io/",
|
||||
url="https://www.nomic.ai/gpt4all",
|
||||
project_urls={
|
||||
"Documentation": "https://docs.gpt4all.io/gpt4all_python.html",
|
||||
"Source code": "https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python",
|
||||
"Changelog": "https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-bindings/python/CHANGELOG.md",
|
||||
},
|
||||
classifiers = [
|
||||
"Programming Language :: Python :: 3",
|
||||
@@ -87,15 +88,16 @@ setup(
|
||||
python_requires='>=3.8',
|
||||
packages=find_packages(),
|
||||
install_requires=[
|
||||
'importlib_resources; python_version < "3.9"',
|
||||
'jinja2~=3.1',
|
||||
'requests',
|
||||
'tqdm',
|
||||
'importlib_resources; python_version < "3.9"',
|
||||
'typing-extensions>=4.3.0; python_version >= "3.9" and python_version < "3.11"',
|
||||
],
|
||||
extras_require={
|
||||
'cuda': [
|
||||
'nvidia-cuda-runtime-cu12',
|
||||
'nvidia-cublas-cu12',
|
||||
'nvidia-cuda-runtime-cu11',
|
||||
'nvidia-cublas-cu11',
|
||||
],
|
||||
'all': [
|
||||
'gpt4all[cuda]; platform_system == "Windows" or platform_system == "Linux"',
|
||||
gpt4all-chat/.flake8 (new file, 5 lines)
@@ -0,0 +1,5 @@
|
||||
# vim: set syntax=dosini:
|
||||
[flake8]
|
||||
exclude = .*,__pycache__
|
||||
max-line-length = 120
|
||||
extend-ignore = B001,C408,D,DAR,E221,E303,E722,E741,E800,N801,N806,P101,S101,S324,S404,S406,S410,S603,WPS100,WPS110,WPS111,WPS113,WPS114,WPS115,WPS120,WPS2,WPS300,WPS301,WPS304,WPS305,WPS306,WPS309,WPS316,WPS317,WPS318,WPS319,WPS322,WPS323,WPS326,WPS329,WPS330,WPS332,WPS336,WPS337,WPS347,WPS360,WPS361,WPS407,WPS414,WPS420,WPS421,WPS429,WPS430,WPS431,WPS432,WPS433,WPS437,WPS440,WPS440,WPS441,WPS442,WPS457,WPS458,WPS460,WPS462,WPS463,WPS473,WPS501,WPS504,WPS505,WPS508,WPS509,WPS510,WPS515,WPS516,WPS519,WPS520,WPS529,WPS531,WPS602,WPS604,WPS605,WPS608,WPS609,WPS613,WPS615
gpt4all-chat/CHANGELOG.md (new file, 335 lines)
@@ -0,0 +1,335 @@
|
||||
# Changelog
|
||||
|
||||
All notable changes to this project will be documented in this file.
|
||||
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
||||
|
||||
## [3.10.0] - 2025-02-24
|
||||
|
||||
### Added
|
||||
- Whitelist Granite (non-MoE) model architecture (by [@ThiloteE](https://github.com/ThiloteE) in [#3487](https://github.com/nomic-ai/gpt4all/pull/3487))
|
||||
- Add support for CUDA compute 5.0 GPUs such as the GTX 750 ([#3499](https://github.com/nomic-ai/gpt4all/pull/3499))
|
||||
- Add a Remote Providers tab to the Add Model page ([#3506](https://github.com/nomic-ai/gpt4all/pull/3506))
|
||||
|
||||
### Changed
|
||||
- Substitute prettier default templates for OLMoE 7B 0924/0125 and Granite 3.1 3B/8B (by [@ThiloteE](https://github.com/ThiloteE) in [#3471](https://github.com/nomic-ai/gpt4all/pull/3471))
|
||||
- Build with LLVM Clang 19 on macOS and Ubuntu ([#3500](https://github.com/nomic-ai/gpt4all/pull/3500))
|
||||
|
||||
### Fixed
|
||||
- Fix several potential crashes ([#3465](https://github.com/nomic-ai/gpt4all/pull/3465))
|
||||
- Fix visual spacing issues with deepseek models ([#3470](https://github.com/nomic-ai/gpt4all/pull/3470))
|
||||
- Add missing strings to Italian translation (by [@Harvester62](https://github.com/Harvester62) in [#3496](https://github.com/nomic-ai/gpt4all/pull/3496))
|
||||
- Update Simplified Chinese translation (by [@Junior2Ran](https://github.com/Junior2Ran) in [#3467](https://github.com/nomic-ai/gpt4all/pull/3467))
|
||||
|
||||
## [3.9.0] - 2025-02-04
|
||||
|
||||
### Added
|
||||
- Whitelist OLMoE and Granite MoE model architectures (no Vulkan) (by [@ThiloteE](https://github.com/ThiloteE) in [#3449](https://github.com/nomic-ai/gpt4all/pull/3449))
|
||||
|
||||
### Fixed
|
||||
- Fix "index N is not a prompt" when using LocalDocs with reasoning ([#3451](https://github.com/nomic-ai/gpt4all/pull/3451))
|
||||
- Work around rendering artifacts on Snapdragon SoCs with Windows ([#3450](https://github.com/nomic-ai/gpt4all/pull/3450))
|
||||
- Prevent DeepSeek-R1 reasoning from appearing in chat names and follow-up questions ([#3458](https://github.com/nomic-ai/gpt4all/pull/3458))
|
||||
- Fix LocalDocs crash on Windows ARM when reading PDFs ([#3460](https://github.com/nomic-ai/gpt4all/pull/3460))
|
||||
- Fix UI freeze when chat template is `{#` ([#3446](https://github.com/nomic-ai/gpt4all/pull/3446))
|
||||
|
||||
## [3.8.0] - 2025-01-30
|
||||
|
||||
### Added
|
||||
- Support DeepSeek-R1 Qwen models ([#3431](https://github.com/nomic-ai/gpt4all/pull/3431))
|
||||
- Support for think tags in the GUI ([#3440](https://github.com/nomic-ai/gpt4all/pull/3440))
|
||||
- Support specifying SHA256 hash in models3.json instead of MD5 ([#3437](https://github.com/nomic-ai/gpt4all/pull/3437))
|
||||
|
||||
### Changed
|
||||
- Use minja instead of Jinja2Cpp for significantly improved template compatibility ([#3433](https://github.com/nomic-ai/gpt4all/pull/3433))
|
||||
|
||||
### Fixed
|
||||
- Fix regression while using localdocs with server API ([#3410](https://github.com/nomic-ai/gpt4all/pull/3410))
|
||||
- Don't show system messages in server chat view ([#3411](https://github.com/nomic-ai/gpt4all/pull/3411))
|
||||
- Fix `codesign --verify` failure on macOS ([#3413](https://github.com/nomic-ai/gpt4all/pull/3413))
|
||||
- Code Interpreter: Fix console.log not accepting a single string after v3.7.0 ([#3426](https://github.com/nomic-ai/gpt4all/pull/3426))
|
||||
- Fix Phi 3.1 Mini 128K Instruct template (by [@ThiloteE](https://github.com/ThiloteE) in [#3412](https://github.com/nomic-ai/gpt4all/pull/3412))
|
||||
- Don't block the gui thread for reasoning ([#3435](https://github.com/nomic-ai/gpt4all/pull/3435))
|
||||
- Fix corruption of unicode in output of reasoning models ([#3443](https://github.com/nomic-ai/gpt4all/pull/3443))
|
||||
|
||||
## [3.7.0] - 2025-01-21
|
||||
|
||||
### Added
|
||||
- Add support for the Windows ARM64 target platform (CPU-only) ([#3385](https://github.com/nomic-ai/gpt4all/pull/3385))
|
||||
|
||||
### Changed
|
||||
- Update from Qt 6.5.1 to 6.8.1 ([#3386](https://github.com/nomic-ai/gpt4all/pull/3386))
|
||||
|
||||
### Fixed
|
||||
- Fix the timeout error in code interpreter ([#3369](https://github.com/nomic-ai/gpt4all/pull/3369))
|
||||
- Fix code interpreter console.log not accepting multiple arguments ([#3371](https://github.com/nomic-ai/gpt4all/pull/3371))
|
||||
- Remove 'X is defined' checks from templates for better compatibility ([#3372](https://github.com/nomic-ai/gpt4all/pull/3372))
|
||||
- Jinja2Cpp: Add 'if' requirement for 'else' parsing to fix crash ([#3373](https://github.com/nomic-ai/gpt4all/pull/3373))
|
||||
- Save chats on quit, even if the window isn't closed first ([#3387](https://github.com/nomic-ai/gpt4all/pull/3387))
|
||||
- Add chat template replacements for five new models and fix EM German Mistral ([#3393](https://github.com/nomic-ai/gpt4all/pull/3393))
|
||||
- Fix crash when entering `{{ a["foo"(` as chat template ([#3394](https://github.com/nomic-ai/gpt4all/pull/3394))
|
||||
- Sign the maintenance tool on macOS to prevent crash on Sequoia ([#3391](https://github.com/nomic-ai/gpt4all/pull/3391))
|
||||
- Jinja2Cpp: Fix operator precedence in 'not X is defined' ([#3402](https://github.com/nomic-ai/gpt4all/pull/3402))
|
||||
|
||||
## [3.6.1] - 2024-12-20
|
||||
|
||||
### Fixed
|
||||
- Fix the stop generation button no longer working in v3.6.0 ([#3336](https://github.com/nomic-ai/gpt4all/pull/3336))
|
||||
- Fix the copy entire conversation button no longer working in v3.6.0 ([#3336](https://github.com/nomic-ai/gpt4all/pull/3336))
|
||||
|
||||
## [3.6.0] - 2024-12-19
|
||||
|
||||
### Added
|
||||
- Automatically substitute chat templates that are not compatible with Jinja2Cpp in GGUFs ([#3327](https://github.com/nomic-ai/gpt4all/pull/3327))
|
||||
- Built-in javascript code interpreter tool plus model ([#3173](https://github.com/nomic-ai/gpt4all/pull/3173))
|
||||
|
||||
### Fixed
|
||||
- Fix remote model template to allow for XML in messages ([#3318](https://github.com/nomic-ai/gpt4all/pull/3318))
|
||||
- Fix Jinja2Cpp bug that broke system message detection in chat templates ([#3325](https://github.com/nomic-ai/gpt4all/pull/3325))
|
||||
- Fix LocalDocs sources displaying in unconsolidated form after v3.5.0 ([#3328](https://github.com/nomic-ai/gpt4all/pull/3328))
|
||||
|
||||
## [3.5.3] - 2024-12-16
|
||||
|
||||
### Fixed
|
||||
- Fix LocalDocs not using information from sources in v3.5.2 ([#3302](https://github.com/nomic-ai/gpt4all/pull/3302))
|
||||
|
||||
## [3.5.2] - 2024-12-13
|
||||
|
||||
### Added
|
||||
- Create separate download pages for built-in and HuggingFace models ([#3269](https://github.com/nomic-ai/gpt4all/pull/3269))
|
||||
|
||||
### Fixed
|
||||
- Fix API server ignoring assistant messages in history after v3.5.0 ([#3256](https://github.com/nomic-ai/gpt4all/pull/3256))
|
||||
- Fix API server replying with incorrect token counts and stop reason after v3.5.0 ([#3256](https://github.com/nomic-ai/gpt4all/pull/3256))
|
||||
- Fix API server remembering previous, unrelated conversations after v3.5.0 ([#3256](https://github.com/nomic-ai/gpt4all/pull/3256))
|
||||
- Fix mishandling of default chat template and system message of cloned models in v3.5.0 ([#3262](https://github.com/nomic-ai/gpt4all/pull/3262))
|
||||
- Fix untranslated text on the startup dialog ([#3293](https://github.com/nomic-ai/gpt4all/pull/3293))
|
||||
|
||||
## [3.5.1] - 2024-12-10
|
||||
|
||||
### Fixed
|
||||
- Fix an incorrect value for currentResponse ([#3245](https://github.com/nomic-ai/gpt4all/pull/3245))
|
||||
- Fix the default model button so it works again after 3.5.0 ([#3246](https://github.com/nomic-ai/gpt4all/pull/3246))
|
||||
- Fix chat templates for Nous Hermes 2 Mistral, Mistral OpenOrca, Qwen 2, and remote models ([#3250](https://github.com/nomic-ai/gpt4all/pull/3250))
|
||||
- Fix chat templates for Llama 3.2 models ([#3251](https://github.com/nomic-ai/gpt4all/pull/3251))
|
||||
|
||||
## [3.5.0] - 2024-12-09
|
||||
|
||||
### Changed
|
||||
- Update Italian translation (by [@Harvester62](https://github.com/Harvester62) in [#3236](https://github.com/nomic-ai/gpt4all/pull/3236))
|
||||
- Update Romanian translation (by [@SINAPSA-IC](https://github.com/SINAPSA-IC) in [#3232](https://github.com/nomic-ai/gpt4all/pull/3232))
|
||||
|
||||
### Fixed
|
||||
- Fix a few more problems with the Jinja changes ([#3239](https://github.com/nomic-ai/gpt4all/pull/3239))
|
||||
|
||||
## [3.5.0-rc2] - 2024-12-06
|
||||
|
||||
### Changed
|
||||
- Fade messages out with an animation when they are removed from the chat view ([#3227](https://github.com/nomic-ai/gpt4all/pull/3227))
|
||||
- Tweak wording of edit/redo confirmation dialogs ([#3228](https://github.com/nomic-ai/gpt4all/pull/3228))
|
||||
- Make edit/redo buttons disabled instead of invisible when they are temporarily unavailable ([#3228](https://github.com/nomic-ai/gpt4all/pull/3228))
|
||||
|
||||
## [3.5.0-rc1] - 2024-12-04
|
||||
|
||||
### Added
|
||||
- Add ability to attach text, markdown, and rst files to chat ([#3135](https://github.com/nomic-ai/gpt4all/pull/3135))
|
||||
- Add feature to minimize to system tray (by [@bgallois](https://github.com/bgallois) in [#3109](https://github.com/nomic-ai/gpt4all/pull/3109))
|
||||
- Basic cache for faster prefill when the input shares a prefix with previous context ([#3073](https://github.com/nomic-ai/gpt4all/pull/3073))
|
||||
- Add ability to edit prompts and regenerate any response ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
|
||||
|
||||
### Changed
|
||||
- Implement Qt 6.8 compatibility ([#3121](https://github.com/nomic-ai/gpt4all/pull/3121))
|
||||
- Use Jinja for chat templates instead of per-message QString.arg-style templates ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
|
||||
- API server: Use system message(s) from client instead of settings ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
|
||||
- API server: Accept messages in any order supported by the model instead of requiring user/assistant pairs ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
|
||||
- Remote models: Pass system message with "system" role instead of joining with user message ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
|
||||
|
||||
### Removed
|
||||
- Remove option to save binary model state to disk ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
|
||||
|
||||
### Fixed
|
||||
- Fix bug in GUI when localdocs encounters binary data ([#3137](https://github.com/nomic-ai/gpt4all/pull/3137))
|
||||
- Fix LocalDocs bugs that prevented some docx files from fully chunking ([#3140](https://github.com/nomic-ai/gpt4all/pull/3140))
|
||||
- Fix missing softmax that was causing crashes and effectively infinite temperature since 3.4.0 ([#3202](https://github.com/nomic-ai/gpt4all/pull/3202))
|
||||
|
||||
## [3.4.2] - 2024-10-16
|
||||
|
||||
### Fixed
|
||||
- Limit bm25 retrieval to only specified collections ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
|
||||
- Fix bug removing documents because of a wrong case sensitive file suffix check ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
|
||||
- Fix bug with hybrid localdocs search where database would get out of sync ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
|
||||
- Fix GUI bug where the localdocs embedding device appears blank ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
|
||||
- Prevent LocalDocs from not making progress in certain cases ([#3094](https://github.com/nomic-ai/gpt4all/pull/3094))
|
||||
|
||||
## [3.4.1] - 2024-10-11
|
||||
|
||||
### Fixed
|
||||
- Improve the Italian translation ([#3048](https://github.com/nomic-ai/gpt4all/pull/3048))
|
||||
- Fix models.json cache location ([#3052](https://github.com/nomic-ai/gpt4all/pull/3052))
|
||||
- Fix LocalDocs regressions caused by docx change ([#3079](https://github.com/nomic-ai/gpt4all/pull/3079))
|
||||
- Fix Go code being highlighted as Java ([#3080](https://github.com/nomic-ai/gpt4all/pull/3080))
|
||||
|
||||
## [3.4.0] - 2024-10-08
|
||||
|
||||
### Added
|
||||
- Add bm25 hybrid search to localdocs ([#2969](https://github.com/nomic-ai/gpt4all/pull/2969))
|
||||
- LocalDocs support for .docx files ([#2986](https://github.com/nomic-ai/gpt4all/pull/2986))
|
||||
- Add support for attaching Excel spreadsheet to chat ([#3007](https://github.com/nomic-ai/gpt4all/pull/3007), [#3028](https://github.com/nomic-ai/gpt4all/pull/3028))
|
||||
|
||||
### Changed
|
||||
- Rebase llama.cpp on latest upstream as of September 26th ([#2998](https://github.com/nomic-ai/gpt4all/pull/2998))
|
||||
- Change the error message when a message is too long ([#3004](https://github.com/nomic-ai/gpt4all/pull/3004))
|
||||
- Simplify chatmodel to get rid of unnecessary field and bump chat version ([#3016](https://github.com/nomic-ai/gpt4all/pull/3016))
|
||||
- Allow ChatLLM to have direct access to ChatModel for restoring state from text ([#3018](https://github.com/nomic-ai/gpt4all/pull/3018))
|
||||
- Improvements to XLSX conversion and UI fix ([#3022](https://github.com/nomic-ai/gpt4all/pull/3022))
|
||||
|
||||
### Fixed
|
||||
- Fix a crash when attempting to continue a chat loaded from disk ([#2995](https://github.com/nomic-ai/gpt4all/pull/2995))
|
||||
- Fix the local server rejecting min\_p/top\_p less than 1 ([#2996](https://github.com/nomic-ai/gpt4all/pull/2996))
|
||||
- Fix "regenerate" always forgetting the most recent message ([#3011](https://github.com/nomic-ai/gpt4all/pull/3011))
|
||||
- Fix loaded chats forgetting context when there is a system prompt ([#3015](https://github.com/nomic-ai/gpt4all/pull/3015))
|
||||
- Make it possible to downgrade and keep some chats, and avoid crash for some model types ([#3030](https://github.com/nomic-ai/gpt4all/pull/3030))
|
||||
- Fix scroll position being reset in model view, and attempt a better fix for the clone issue ([#3042](https://github.com/nomic-ai/gpt4all/pull/3042))
|
||||
|
||||
## [3.3.1] - 2024-09-27 ([v3.3.y](https://github.com/nomic-ai/gpt4all/tree/v3.3.y))
|
||||
|
||||
### Fixed
|
||||
- Fix a crash when attempting to continue a chat loaded from disk ([#2995](https://github.com/nomic-ai/gpt4all/pull/2995))
|
||||
- Fix the local server rejecting min\_p/top\_p less than 1 ([#2996](https://github.com/nomic-ai/gpt4all/pull/2996))
|
||||
|
||||
## [3.3.0] - 2024-09-20
|
||||
|
||||
### Added
|
||||
- Use greedy sampling when temperature is set to zero ([#2854](https://github.com/nomic-ai/gpt4all/pull/2854))
|
||||
- Use configured system prompt in server mode and ignore system messages ([#2921](https://github.com/nomic-ai/gpt4all/pull/2921), [#2924](https://github.com/nomic-ai/gpt4all/pull/2924))
|
||||
- Add more system information to anonymous usage stats ([#2939](https://github.com/nomic-ai/gpt4all/pull/2939))
|
||||
- Check for unsupported Ubuntu and macOS versions at install time ([#2940](https://github.com/nomic-ai/gpt4all/pull/2940))
|
||||
|
||||
### Changed
|
||||
- The offline update button now directs users to the offline installer releases page. (by [@3Simplex](https://github.com/3Simplex) in [#2888](https://github.com/nomic-ai/gpt4all/pull/2888))
|
||||
- Change the website link on the home page to point to the new URL ([#2915](https://github.com/nomic-ai/gpt4all/pull/2915))
|
||||
- Smaller default window size, dynamic minimum size, and scaling tweaks ([#2904](https://github.com/nomic-ai/gpt4all/pull/2904))
|
||||
- Only allow a single instance of the program to be run at a time ([#2923](https://github.com/nomic-ai/gpt4all/pull/2923))
|
||||
|
||||
### Fixed
|
||||
- Bring back "Auto" option for Embeddings Device as "Application default," which went missing in v3.1.0 ([#2873](https://github.com/nomic-ai/gpt4all/pull/2873))
|
||||
- Correct a few strings in the Italian translation (by [@Harvester62](https://github.com/Harvester62) in [#2872](https://github.com/nomic-ai/gpt4all/pull/2872) and [#2909](https://github.com/nomic-ai/gpt4all/pull/2909))
|
||||
- Correct typos in Traditional Chinese translation (by [@supersonictw](https://github.com/supersonictw) in [#2852](https://github.com/nomic-ai/gpt4all/pull/2852))
|
||||
- Set the window icon on Linux ([#2880](https://github.com/nomic-ai/gpt4all/pull/2880))
|
||||
- Corrections to the Romanian translation (by [@SINAPSA-IC](https://github.com/SINAPSA-IC) in [#2890](https://github.com/nomic-ai/gpt4all/pull/2890))
|
||||
- Fix singular/plural forms of LocalDocs "x Sources" (by [@cosmic-snow](https://github.com/cosmic-snow) in [#2885](https://github.com/nomic-ai/gpt4all/pull/2885))
|
||||
- Fix a typo in Model Settings (by [@3Simplex](https://github.com/3Simplex) in [#2916](https://github.com/nomic-ai/gpt4all/pull/2916))
|
||||
- Fix the antenna icon tooltip when using the local server ([#2922](https://github.com/nomic-ai/gpt4all/pull/2922))
|
||||
- Fix a few issues with locating files and handling errors when loading remote models on startup ([#2875](https://github.com/nomic-ai/gpt4all/pull/2875))
|
||||
- Significantly improve API server request parsing and response correctness ([#2929](https://github.com/nomic-ai/gpt4all/pull/2929))
|
||||
- Remove unnecessary dependency on Qt WaylandCompositor module ([#2949](https://github.com/nomic-ai/gpt4all/pull/2949))
|
||||
- Update translations ([#2970](https://github.com/nomic-ai/gpt4all/pull/2970))
|
||||
- Fix macOS installer and remove extra installed copy of Nomic Embed ([#2973](https://github.com/nomic-ai/gpt4all/pull/2973))
|
||||
|
||||
## [3.2.1] - 2024-08-13
|
||||
|
||||
### Fixed
|
||||
- Do not initialize Vulkan driver when only using CPU ([#2843](https://github.com/nomic-ai/gpt4all/pull/2843))
|
||||
- Fix a potential crash on exit when using only CPU on Linux with NVIDIA (does not affect X11) ([#2843](https://github.com/nomic-ai/gpt4all/pull/2843))
|
||||
- Fix default CUDA architecture list after [#2802](https://github.com/nomic-ai/gpt4all/pull/2802) ([#2855](https://github.com/nomic-ai/gpt4all/pull/2855))
|
||||
|
||||
## [3.2.0] - 2024-08-12
|
||||
|
||||
### Added
|
||||
- Add Qwen2-1.5B-Instruct to models3.json (by [@ThiloteE](https://github.com/ThiloteE) in [#2759](https://github.com/nomic-ai/gpt4all/pull/2759))
|
||||
- Enable translation feature for seven languages: English, Spanish, Italian, Portuguese, Chinese Simplified, Chinese Traditional, Romanian ([#2830](https://github.com/nomic-ai/gpt4all/pull/2830))
|
||||
|
||||
### Changed
|
||||
- Add missing entries to Italian translation (by [@Harvester62](https://github.com/Harvester62) in [#2783](https://github.com/nomic-ai/gpt4all/pull/2783))
|
||||
- Use llama\_kv\_cache ops to shift context faster ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
|
||||
- Don't stop generating at end of context ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
|
||||
|
||||
### Fixed
|
||||
- Case-insensitive LocalDocs source icon detection (by [@cosmic-snow](https://github.com/cosmic-snow) in [#2761](https://github.com/nomic-ai/gpt4all/pull/2761))
|
||||
- Fix comparison of pre- and post-release versions for update check and models3.json ([#2762](https://github.com/nomic-ai/gpt4all/pull/2762), [#2772](https://github.com/nomic-ai/gpt4all/pull/2772))
|
||||
- Fix several backend issues ([#2778](https://github.com/nomic-ai/gpt4all/pull/2778))
|
||||
- Restore leading space removal logic that was incorrectly removed in [#2694](https://github.com/nomic-ai/gpt4all/pull/2694)
|
||||
- CUDA: Cherry-pick llama.cpp DMMV cols requirement fix that caused a crash with long conversations since [#2694](https://github.com/nomic-ai/gpt4all/pull/2694)
|
||||
- Make reverse prompt detection work more reliably and prevent it from breaking output ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
|
||||
- Disallow context shift for chat name and follow-up generation to prevent bugs ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
|
||||
- Explicitly target macOS 12.6 in CI to fix Metal compatibility on older macOS ([#2846](https://github.com/nomic-ai/gpt4all/pull/2846))
|
||||
|
||||
## [3.1.1] - 2024-07-27
|
||||
|
||||
### Added
|
||||
- Add Llama 3.1 8B Instruct to models3.json (by [@3Simplex](https://github.com/3Simplex) in [#2731](https://github.com/nomic-ai/gpt4all/pull/2731) and [#2732](https://github.com/nomic-ai/gpt4all/pull/2732))
|
||||
- Portuguese (BR) translation (by [thiagojramos](https://github.com/thiagojramos) in [#2733](https://github.com/nomic-ai/gpt4all/pull/2733))
|
||||
- Support adding arbitrary OpenAI-compatible models by URL (by [@supersonictw](https://github.com/supersonictw) in [#2683](https://github.com/nomic-ai/gpt4all/pull/2683))
|
||||
- Support Llama 3.1 RoPE scaling ([#2758](https://github.com/nomic-ai/gpt4all/pull/2758))
|
||||
|
||||
### Changed
|
||||
- Add missing entries to Chinese (Simplified) translation (by [wuodoo](https://github.com/wuodoo) in [#2716](https://github.com/nomic-ai/gpt4all/pull/2716) and [#2749](https://github.com/nomic-ai/gpt4all/pull/2749))
|
||||
- Update translation files and add missing paths to CMakeLists.txt ([#2735](https://github.com/nomic-ai/gpt4all/pull/2735))
|
||||
|
||||
## [3.1.0] - 2024-07-24
|
||||
|
||||
### Added
|
||||
- Generate suggested follow-up questions ([#2634](https://github.com/nomic-ai/gpt4all/pull/2634), [#2723](https://github.com/nomic-ai/gpt4all/pull/2723))
|
||||
- Also add options for the chat name and follow-up question prompt templates
|
||||
- Scaffolding for translations ([#2612](https://github.com/nomic-ai/gpt4all/pull/2612))
|
||||
- Spanish (MX) translation (by [@jstayco](https://github.com/jstayco) in [#2654](https://github.com/nomic-ai/gpt4all/pull/2654))
|
||||
- Chinese (Simplified) translation by mikage ([#2657](https://github.com/nomic-ai/gpt4all/pull/2657))
|
||||
- Dynamic changes of language and locale at runtime ([#2659](https://github.com/nomic-ai/gpt4all/pull/2659), [#2677](https://github.com/nomic-ai/gpt4all/pull/2677))
|
||||
- Romanian translation by [@SINAPSA\_IC](https://github.com/SINAPSA_IC) ([#2662](https://github.com/nomic-ai/gpt4all/pull/2662))
|
||||
- Chinese (Traditional) translation (by [@supersonictw](https://github.com/supersonictw) in [#2661](https://github.com/nomic-ai/gpt4all/pull/2661))
|
||||
- Italian translation (by [@Harvester62](https://github.com/Harvester62) in [#2700](https://github.com/nomic-ai/gpt4all/pull/2700))
|
||||
|
||||
### Changed
|
||||
- Customize combo boxes and context menus to fit the new style ([#2535](https://github.com/nomic-ai/gpt4all/pull/2535))
|
||||
- Improve view bar scaling and Model Settings layout ([#2520](https://github.com/nomic-ai/gpt4all/pull/2520))
|
||||
- Make the logo spin while the model is generating ([#2557](https://github.com/nomic-ai/gpt4all/pull/2557))
|
||||
- Server: Reply to wrong GET/POST method with HTTP 405 instead of 404 (by [@cosmic-snow](https://github.com/cosmic-snow) in [#2615](https://github.com/nomic-ai/gpt4all/pull/2615))
|
||||
- Update theme for menus (by [@3Simplex](https://github.com/3Simplex) in [#2578](https://github.com/nomic-ai/gpt4all/pull/2578))
|
||||
- Move the "stop" button to the message box ([#2561](https://github.com/nomic-ai/gpt4all/pull/2561))
|
||||
- Build with CUDA 11.8 for better compatibility ([#2639](https://github.com/nomic-ai/gpt4all/pull/2639))
|
||||
- Make links in latest news section clickable ([#2643](https://github.com/nomic-ai/gpt4all/pull/2643))
|
||||
- Support translation of settings choices ([#2667](https://github.com/nomic-ai/gpt4all/pull/2667), [#2690](https://github.com/nomic-ai/gpt4all/pull/2690))
|
||||
- Improve LocalDocs view's error message (by @cosmic-snow in [#2679](https://github.com/nomic-ai/gpt4all/pull/2679))
|
||||
- Ignore case of LocalDocs file extensions ([#2642](https://github.com/nomic-ai/gpt4all/pull/2642), [#2684](https://github.com/nomic-ai/gpt4all/pull/2684))
|
||||
- Update llama.cpp to commit 87e397d00 from July 19th ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694), [#2702](https://github.com/nomic-ai/gpt4all/pull/2702))
|
||||
- Add support for GPT-NeoX, Gemma 2, OpenELM, ChatGLM, and Jais architectures (all with Vulkan support)
|
||||
- Add support for DeepSeek-V2 architecture (no Vulkan support)
|
||||
- Enable Vulkan support for StarCoder2, XVERSE, Command R, and OLMo
|
||||
- Show scrollbar in chat collections list as needed (by [@cosmic-snow](https://github.com/cosmic-snow) in [#2691](https://github.com/nomic-ai/gpt4all/pull/2691))
|
||||
|
||||
### Removed
|
||||
- Remove support for GPT-J models ([#2676](https://github.com/nomic-ai/gpt4all/pull/2676), [#2693](https://github.com/nomic-ai/gpt4all/pull/2693))
|
||||
|
||||
### Fixed
|
||||
- Fix placement of thumbs-down and datalake opt-in dialogs ([#2540](https://github.com/nomic-ai/gpt4all/pull/2540))
|
||||
- Select the correct folder with the Linux fallback folder dialog ([#2541](https://github.com/nomic-ai/gpt4all/pull/2541))
|
||||
- Fix clone button sometimes producing blank model info ([#2545](https://github.com/nomic-ai/gpt4all/pull/2545))
|
||||
- Fix jerky chat view scrolling ([#2555](https://github.com/nomic-ai/gpt4all/pull/2555))
|
||||
- Fix "reload" showing for chats with missing models ([#2520](https://github.com/nomic-ai/gpt4all/pull/2520)
|
||||
- Fix property binding loop warning ([#2601](https://github.com/nomic-ai/gpt4all/pull/2601))
|
||||
- Fix UI hang with certain chat view content ([#2543](https://github.com/nomic-ai/gpt4all/pull/2543))
|
||||
- Fix crash when Kompute falls back to CPU ([#2640](https://github.com/nomic-ai/gpt4all/pull/2640))
|
||||
- Fix several Vulkan resource management issues ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
|
||||
- Fix crash/hang when some models stop generating, by showing special tokens ([#2701](https://github.com/nomic-ai/gpt4all/pull/2701))
|
||||
|
||||
[3.10.0]: https://github.com/nomic-ai/gpt4all/compare/v3.9.0...v3.10.0
|
||||
[3.9.0]: https://github.com/nomic-ai/gpt4all/compare/v3.8.0...v3.9.0
|
||||
[3.8.0]: https://github.com/nomic-ai/gpt4all/compare/v3.7.0...v3.8.0
|
||||
[3.7.0]: https://github.com/nomic-ai/gpt4all/compare/v3.6.1...v3.7.0
|
||||
[3.6.1]: https://github.com/nomic-ai/gpt4all/compare/v3.6.0...v3.6.1
|
||||
[3.6.0]: https://github.com/nomic-ai/gpt4all/compare/v3.5.3...v3.6.0
|
||||
[3.5.3]: https://github.com/nomic-ai/gpt4all/compare/v3.5.2...v3.5.3
|
||||
[3.5.2]: https://github.com/nomic-ai/gpt4all/compare/v3.5.1...v3.5.2
|
||||
[3.5.1]: https://github.com/nomic-ai/gpt4all/compare/v3.5.0...v3.5.1
|
||||
[3.5.0]: https://github.com/nomic-ai/gpt4all/compare/v3.5.0-rc2...v3.5.0
|
||||
[3.5.0-rc2]: https://github.com/nomic-ai/gpt4all/compare/v3.5.0-rc1...v3.5.0-rc2
|
||||
[3.5.0-rc1]: https://github.com/nomic-ai/gpt4all/compare/v3.4.2...v3.5.0-rc1
|
||||
[3.4.2]: https://github.com/nomic-ai/gpt4all/compare/v3.4.1...v3.4.2
|
||||
[3.4.1]: https://github.com/nomic-ai/gpt4all/compare/v3.4.0...v3.4.1
|
||||
[3.4.0]: https://github.com/nomic-ai/gpt4all/compare/v3.3.0...v3.4.0
|
||||
[3.3.1]: https://github.com/nomic-ai/gpt4all/compare/v3.3.0...v3.3.1
|
||||
[3.3.0]: https://github.com/nomic-ai/gpt4all/compare/v3.2.1...v3.3.0
|
||||
[3.2.1]: https://github.com/nomic-ai/gpt4all/compare/v3.2.0...v3.2.1
|
||||
[3.2.0]: https://github.com/nomic-ai/gpt4all/compare/v3.1.1...v3.2.0
|
||||
[3.1.1]: https://github.com/nomic-ai/gpt4all/compare/v3.1.0...v3.1.1
|
||||
[3.1.0]: https://github.com/nomic-ai/gpt4all/compare/v3.0.0...v3.1.0
|
@@ -1,8 +1,18 @@
|
||||
cmake_minimum_required(VERSION 3.16)
|
||||
cmake_minimum_required(VERSION 3.25) # for try_compile SOURCE_FROM_VAR
|
||||
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||
set(CMAKE_CXX_STANDARD 20)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
include(../common/common.cmake)
|
||||
|
||||
set(APP_VERSION_MAJOR 3)
|
||||
set(APP_VERSION_MINOR 10)
|
||||
set(APP_VERSION_PATCH 1)
|
||||
set(APP_VERSION_BASE "${APP_VERSION_MAJOR}.${APP_VERSION_MINOR}.${APP_VERSION_PATCH}")
|
||||
set(APP_VERSION "${APP_VERSION_BASE}-dev0")
|
||||
|
||||
project(gpt4all VERSION ${APP_VERSION_BASE} LANGUAGES CXX C)
|
||||
|
||||
if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
|
||||
set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/install CACHE PATH "..." FORCE)
|
||||
endif()
|
||||
|
||||
if(APPLE)
|
||||
option(BUILD_UNIVERSAL "Build a Universal binary on macOS" OFF)
|
||||
@@ -16,38 +26,88 @@ if(APPLE)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set(APP_VERSION_MAJOR 3)
|
||||
set(APP_VERSION_MINOR 1)
|
||||
set(APP_VERSION_PATCH 1)
|
||||
set(APP_VERSION_BASE "${APP_VERSION_MAJOR}.${APP_VERSION_MINOR}.${APP_VERSION_PATCH}")
|
||||
set(APP_VERSION "${APP_VERSION_BASE}")
|
||||
find_package(Python3 3.12 QUIET COMPONENTS Interpreter)
|
||||
|
||||
option(GPT4ALL_TEST "Build the tests" ${Python3_FOUND})
|
||||
option(GPT4ALL_LOCALHOST "Build installer for localhost repo" OFF)
|
||||
option(GPT4ALL_OFFLINE_INSTALLER "Build an offline installer" OFF)
|
||||
option(GPT4ALL_SIGN_INSTALL "Sign installed binaries and installers (requires signing identities)" OFF)
|
||||
option(GPT4ALL_GEN_CPACK_CONFIG "Generate the CPack config.xml in the package step and nothing else." OFF)
|
||||
set(GPT4ALL_USE_QTPDF "AUTO" CACHE STRING "Whether to Use QtPDF for LocalDocs. If OFF or not available on this platform, PDFium is used.")
|
||||
set_property(CACHE GPT4ALL_USE_QTPDF PROPERTY STRINGS AUTO ON OFF)
|
||||
set(GPT4ALL_FORCE_D3D12 "AUTO" CACHE STRING "Whether to use Direct3D 12 as the Qt scene graph backend. Defaults to ON on Windows ARM.")
|
||||
set_property(CACHE GPT4ALL_FORCE_D3D12 PROPERTY STRINGS AUTO ON OFF)
|
||||
|
||||
include(cmake/cpack_config.cmake)
|
||||
|
||||
if (GPT4ALL_GEN_CPACK_CONFIG)
|
||||
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/cpack-steal-config.cmake.in"
|
||||
"${CMAKE_BINARY_DIR}/cmake/cpack-steal-config.cmake" @ONLY)
|
||||
set(CPACK_POST_BUILD_SCRIPTS ${CMAKE_BINARY_DIR}/cmake/cpack-steal-config.cmake)
|
||||
include(CPack)
|
||||
include(CPackIFW)
|
||||
return()
|
||||
endif()
|
||||
|
||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||
set(CMAKE_CXX_STANDARD 23)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
if (MSVC)
|
||||
# Enable accurate __cplusplus macro
|
||||
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/Zc:__cplusplus>)
|
||||
endif()
|
||||
|
||||
|
||||
# conftests
|
||||
function(check_cpp_feature FEATURE_NAME MIN_VALUE)
|
||||
message(CHECK_START "Checking for ${FEATURE_NAME} >= ${MIN_VALUE}")
|
||||
string(CONCAT SRC
|
||||
"#include <version>\n"
|
||||
"#if !defined(${FEATURE_NAME}) || ${FEATURE_NAME} < ${MIN_VALUE}\n"
|
||||
"# error \"${FEATURE_NAME} is not defined or less than ${MIN_VALUE}\"\n"
|
||||
"#endif\n"
|
||||
"int main() { return 0; }\n"
|
||||
)
|
||||
try_compile(HAS_FEATURE SOURCE_FROM_VAR "test_${FEATURE_NAME}.cpp" SRC)
|
||||
if (NOT HAS_FEATURE)
|
||||
message(CHECK_FAIL "fail")
|
||||
message(FATAL_ERROR
|
||||
"The C++ compiler\n \"${CMAKE_CXX_COMPILER}\"\n"
|
||||
"is too old to support ${FEATURE_NAME} >= ${MIN_VALUE}.\n"
|
||||
"Please specify a newer compiler via -DCMAKE_C_COMPILER/-DCMAKE_CXX_COMPILER."
|
||||
)
|
||||
endif()
|
||||
message(CHECK_PASS "pass")
|
||||
endfunction()
|
||||
|
||||
# check for monadic operations in std::optional (e.g. transform)
|
||||
check_cpp_feature("__cpp_lib_optional" "202110L")
|
||||
|
||||
|
||||
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/Modules")
|
||||
|
||||
# Include the binary directory for the generated header file
|
||||
include_directories("${CMAKE_CURRENT_BINARY_DIR}")
|
||||
|
||||
project(gpt4all VERSION ${APP_VERSION_BASE} LANGUAGES CXX C)
|
||||
|
||||
set(CMAKE_AUTOMOC ON)
|
||||
set(CMAKE_AUTORCC ON)
|
||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||
|
||||
option(GPT4ALL_TRANSLATIONS OFF "Build with translations")
|
||||
option(GPT4ALL_LOCALHOST OFF "Build installer for localhost repo")
|
||||
option(GPT4ALL_OFFLINE_INSTALLER "Build an offline installer" OFF)
|
||||
option(GPT4ALL_SIGN_INSTALL "Sign installed binaries and installers (requires signing identities)" OFF)
|
||||
set(CMAKE_FIND_PACKAGE_TARGETS_GLOBAL ON)
|
||||
set(GPT4ALL_QT_COMPONENTS Core HttpServer LinguistTools Quick QuickDialogs2 Sql Svg)
|
||||
set(GPT4ALL_USING_QTPDF OFF)
|
||||
if (CMAKE_SYSTEM_NAME MATCHES Windows AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|AARCH64|arm64|ARM64)$")
|
||||
# QtPDF is not available.
|
||||
if (GPT4ALL_USE_QTPDF STREQUAL "ON")
|
||||
message(FATAL_ERROR "QtPDF is not available on Windows ARM64.")
|
||||
endif()
|
||||
elseif (GPT4ALL_USE_QTPDF MATCHES "^(ON|AUTO)$")
|
||||
set(GPT4ALL_USING_QTPDF ON)
|
||||
list(APPEND GPT4ALL_QT_COMPONENTS Pdf)
|
||||
endif()
|
||||
find_package(Qt6 6.8 COMPONENTS ${GPT4ALL_QT_COMPONENTS} REQUIRED)
|
||||
|
||||
# Generate a header file with the version number
|
||||
configure_file(
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/cmake/config.h.in"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/config.h"
|
||||
)
|
||||
|
||||
if(LINUX)
|
||||
find_package(Qt6 6.4 COMPONENTS Core Quick WaylandCompositor QuickDialogs2 Svg HttpServer Sql Pdf LinguistTools REQUIRED)
|
||||
else()
|
||||
find_package(Qt6 6.4 COMPONENTS Core Quick QuickDialogs2 Svg HttpServer Sql Pdf LinguistTools REQUIRED)
|
||||
if (QT_KNOWN_POLICY_QTP0004)
|
||||
qt_policy(SET QTP0004 NEW) # generate extra qmldir files on Qt 6.8+
|
||||
endif()
|
||||
|
||||
# Get the Qt6Core target properties
|
||||
@@ -64,15 +124,62 @@ get_filename_component(Qt6_ROOT_DIR "${Qt6_ROOT_DIR}/.." ABSOLUTE)
|
||||
message(STATUS "qmake binary: ${QMAKE_EXECUTABLE}")
|
||||
message(STATUS "Qt 6 root directory: ${Qt6_ROOT_DIR}")
|
||||
|
||||
set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
|
||||
|
||||
set(GPT4ALL_CONFIG_FORCE_D3D12 -1)
|
||||
if (NOT CMAKE_SYSTEM_NAME MATCHES Windows OR Qt6_VERSION VERSION_LESS "6.6")
|
||||
# Direct3D 12 is not available.
|
||||
if (GPT4ALL_FORCE_D3D12 STREQUAL "ON")
|
||||
message(FATAL_ERROR "Cannot use Direct3D 12 on this platform.")
|
||||
endif()
|
||||
elseif (GPT4ALL_FORCE_D3D12 MATCHES "^(ON|AUTO)$")
|
||||
if (GPT4ALL_FORCE_D3D12 STREQUAL "ON" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|AARCH64|arm64|ARM64)$")
|
||||
set(GPT4ALL_CONFIG_FORCE_D3D12 1)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
# Generate a header file for configuration
|
||||
configure_file(
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/src/config.h.in"
|
||||
"${CMAKE_CURRENT_BINARY_DIR}/config.h"
|
||||
)
|
||||
|
||||
add_subdirectory(deps)
|
||||
add_subdirectory(../gpt4all-backend llmodel)
|
||||
|
||||
if (GPT4ALL_TEST)
|
||||
enable_testing()
|
||||
|
||||
# Llama-3.2-1B model
|
||||
set(TEST_MODEL "Llama-3.2-1B-Instruct-Q4_0.gguf")
|
||||
set(TEST_MODEL_MD5 "48ff0243978606fdba19d899b77802fc")
|
||||
set(TEST_MODEL_PATH "${CMAKE_BINARY_DIR}/resources/${TEST_MODEL}")
|
||||
set(TEST_MODEL_URL "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/${TEST_MODEL}")
|
||||
|
||||
# Create a custom command to download the file if it does not exist or if the checksum does not match
|
||||
add_custom_command(
|
||||
OUTPUT "${TEST_MODEL_PATH}"
|
||||
COMMAND ${CMAKE_COMMAND} -E echo "Downloading test model from ${TEST_MODEL_URL} ..."
|
||||
COMMAND ${CMAKE_COMMAND} -DURL="${TEST_MODEL_URL}" -DOUTPUT_PATH="${TEST_MODEL_PATH}" -DEXPECTED_MD5="${TEST_MODEL_MD5}" -P "${CMAKE_SOURCE_DIR}/cmake/download_model.cmake"
|
||||
DEPENDS "${CMAKE_SOURCE_DIR}/cmake/download_model.cmake"
|
||||
)
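The command above delegates the real work to `cmake/download_model.cmake`. As a rough illustration only (written in Python like the rest of this changeset's examples, and not part of the build), the intended behaviour is: reuse the file if it already exists with the expected MD5, otherwise download it and verify the checksum.

import hashlib
import os
import requests

def ensure_test_model(url: str, output_path: str, expected_md5: str) -> None:
    # Hypothetical sketch of what the download_model.cmake step is expected to do.
    def md5_of(path: str) -> str:
        digest = hashlib.md5()
        with open(path, "rb") as file:
            for chunk in iter(lambda: file.read(1 << 20), b""):
                digest.update(chunk)
        return digest.hexdigest()

    if os.path.exists(output_path) and md5_of(output_path) == expected_md5:
        return  # already present and intact
    with requests.get(url, stream=True, timeout=60) as resp:
        resp.raise_for_status()
        with open(output_path, "wb") as file:
            for chunk in resp.iter_content(chunk_size=1 << 20):
                file.write(chunk)
    if md5_of(output_path) != expected_md5:
        raise RuntimeError(f"MD5 mismatch for {output_path}")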
|
||||
|
||||
# Define a custom target that depends on the downloaded model
|
||||
add_custom_target(download_test_model
|
||||
DEPENDS "${TEST_MODEL_PATH}"
|
||||
)
|
||||
|
||||
add_subdirectory(tests)
|
||||
|
||||
# The 'check' target makes sure the tests and their dependencies are up-to-date before running them
|
||||
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure DEPENDS download_test_model chat gpt4all_tests)
|
||||
endif()
|
||||
|
||||
set(CHAT_EXE_RESOURCES)
|
||||
|
||||
# Metal shader library
|
||||
if (APPLE)
|
||||
list(APPEND CHAT_EXE_RESOURCES "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib")
|
||||
list(APPEND CHAT_EXE_RESOURCES "${GGML_METALLIB}")
|
||||
endif()
|
||||
|
||||
# App icon
|
||||
@@ -86,8 +193,6 @@ elseif (APPLE)
|
||||
|
||||
# And the following tells CMake where to find and install the file itself.
|
||||
set(APP_ICON_RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.icns")
|
||||
set_source_files_properties(${APP_ICON_RESOURCE} PROPERTIES
|
||||
MACOSX_PACKAGE_LOCATION "Resources")
|
||||
list(APPEND CHAT_EXE_RESOURCES "${APP_ICON_RESOURCE}")
|
||||
endif()
|
||||
|
||||
@@ -107,26 +212,49 @@ if (APPLE)
|
||||
list(APPEND CHAT_EXE_RESOURCES "${LOCAL_EMBEDDING_MODEL_PATH}")
|
||||
endif()
|
||||
|
||||
if (DEFINED GGML_METALLIB)
|
||||
set_source_files_properties("${GGML_METALLIB}" PROPERTIES GENERATED ON)
|
||||
endif()
|
||||
if (APPLE)
|
||||
set_source_files_properties(${CHAT_EXE_RESOURCES} PROPERTIES MACOSX_PACKAGE_LOCATION Resources)
|
||||
endif()
|
||||
|
||||
set(MACOS_SOURCES)
|
||||
if (APPLE)
|
||||
find_library(COCOA_LIBRARY Cocoa)
|
||||
list(APPEND MACOS_SOURCES src/macosdock.mm src/macosdock.h)
|
||||
endif()
|
||||
|
||||
qt_add_executable(chat
|
||||
main.cpp
|
||||
chat.h chat.cpp
|
||||
chatllm.h chatllm.cpp
|
||||
chatmodel.h chatlistmodel.h chatlistmodel.cpp
|
||||
chatapi.h chatapi.cpp
|
||||
chatviewtextprocessor.h chatviewtextprocessor.cpp
|
||||
database.h database.cpp
|
||||
download.h download.cpp
|
||||
embllm.cpp embllm.h
|
||||
localdocs.h localdocs.cpp localdocsmodel.h localdocsmodel.cpp
|
||||
llm.h llm.cpp
|
||||
modellist.h modellist.cpp
|
||||
mysettings.h mysettings.cpp
|
||||
network.h network.cpp
|
||||
server.h server.cpp
|
||||
logger.h logger.cpp
|
||||
${APP_ICON_RESOURCE}
|
||||
src/main.cpp
|
||||
src/chat.cpp src/chat.h
|
||||
src/chatapi.cpp src/chatapi.h
|
||||
src/chatlistmodel.cpp src/chatlistmodel.h
|
||||
src/chatllm.cpp src/chatllm.h
|
||||
src/chatmodel.h src/chatmodel.cpp
|
||||
src/chatviewtextprocessor.cpp src/chatviewtextprocessor.h
|
||||
src/codeinterpreter.cpp src/codeinterpreter.h
|
||||
src/database.cpp src/database.h
|
||||
src/download.cpp src/download.h
|
||||
src/embllm.cpp src/embllm.h
|
||||
src/jinja_helpers.cpp src/jinja_helpers.h
|
||||
src/jinja_replacements.cpp src/jinja_replacements.h
|
||||
src/llm.cpp src/llm.h
|
||||
src/localdocs.cpp src/localdocs.h
|
||||
src/localdocsmodel.cpp src/localdocsmodel.h
|
||||
src/logger.cpp src/logger.h
|
||||
src/modellist.cpp src/modellist.h
|
||||
src/mysettings.cpp src/mysettings.h
|
||||
src/network.cpp src/network.h
|
||||
src/server.cpp src/server.h
|
||||
src/tool.cpp src/tool.h
|
||||
src/toolcallparser.cpp src/toolcallparser.h
|
||||
src/toolmodel.cpp src/toolmodel.h
|
||||
src/xlsxtomd.cpp src/xlsxtomd.h
|
||||
${CHAT_EXE_RESOURCES}
|
||||
${MACOS_SOURCES}
|
||||
)
|
||||
gpt4all_add_warning_options(chat)
|
||||
|
||||
qt_add_qml_module(chat
|
||||
URI gpt4all
|
||||
@@ -136,8 +264,15 @@ qt_add_qml_module(chat
|
||||
main.qml
|
||||
qml/AddCollectionView.qml
|
||||
qml/AddModelView.qml
|
||||
qml/AddGPT4AllModelView.qml
|
||||
qml/AddHFModelView.qml
|
||||
qml/AddRemoteModelView.qml
|
||||
qml/ApplicationSettings.qml
|
||||
qml/ChatDrawer.qml
|
||||
qml/ChatCollapsibleItem.qml
|
||||
qml/ChatItemView.qml
|
||||
qml/ChatMessageButton.qml
|
||||
qml/ChatTextItem.qml
|
||||
qml/ChatView.qml
|
||||
qml/CollectionsDrawer.qml
|
||||
qml/HomeView.qml
|
||||
@@ -150,17 +285,21 @@ qt_add_qml_module(chat
|
||||
qml/PopupDialog.qml
|
||||
qml/SettingsView.qml
|
||||
qml/StartupDialog.qml
|
||||
qml/SwitchModelDialog.qml
|
||||
qml/ConfirmationDialog.qml
|
||||
qml/Theme.qml
|
||||
qml/ThumbsDownDialog.qml
|
||||
qml/Toast.qml
|
||||
qml/ToastManager.qml
|
||||
qml/MyBusyIndicator.qml
|
||||
qml/MyButton.qml
|
||||
qml/MyTabButton.qml
|
||||
qml/MyCheckBox.qml
|
||||
qml/MyComboBox.qml
|
||||
qml/MyDialog.qml
|
||||
qml/MyDirectoryField.qml
|
||||
qml/MyFileDialog.qml
|
||||
qml/MyFileIcon.qml
|
||||
qml/MyFolderDialog.qml
|
||||
qml/MyFancyLink.qml
|
||||
qml/MyMenu.qml
|
||||
qml/MyMenuItem.qml
|
||||
@@ -176,6 +315,7 @@ qt_add_qml_module(chat
|
||||
qml/MyTextField.qml
|
||||
qml/MyToolButton.qml
|
||||
qml/MyWelcomeButton.qml
|
||||
qml/RemoteModelCard.qml
|
||||
RESOURCES
|
||||
icons/antenna_1.svg
|
||||
icons/antenna_2.svg
|
||||
@@ -193,9 +333,12 @@ qt_add_qml_module(chat
|
||||
icons/edit.svg
|
||||
icons/eject.svg
|
||||
icons/email.svg
|
||||
icons/file-doc.svg
|
||||
icons/file-docx.svg
|
||||
icons/file-md.svg
|
||||
icons/file-pdf.svg
|
||||
icons/file-txt.svg
|
||||
icons/file-xls.svg
|
||||
icons/file.svg
|
||||
icons/github.svg
|
||||
icons/globe.svg
|
||||
@@ -203,6 +346,7 @@ qt_add_qml_module(chat
|
||||
icons/gpt4all-48.png
|
||||
icons/gpt4all.svg
|
||||
icons/gpt4all_transparent.svg
|
||||
icons/groq.svg
|
||||
icons/home.svg
|
||||
icons/image.svg
|
||||
icons/info.svg
|
||||
@@ -210,10 +354,14 @@ qt_add_qml_module(chat
|
||||
icons/left_panel_open.svg
|
||||
icons/local-docs.svg
|
||||
icons/models.svg
|
||||
icons/mistral.svg
|
||||
icons/network.svg
|
||||
icons/nomic_logo.svg
|
||||
icons/notes.svg
|
||||
icons/paperclip.svg
|
||||
icons/plus.svg
|
||||
icons/plus_circle.svg
|
||||
icons/openai.svg
|
||||
icons/recycle.svg
|
||||
icons/regenerate.svg
|
||||
icons/search.svg
|
||||
@@ -226,21 +374,20 @@ qt_add_qml_module(chat
|
||||
icons/trash.svg
|
||||
icons/twitter.svg
|
||||
icons/up_down.svg
|
||||
icons/webpage.svg
|
||||
icons/you.svg
|
||||
)
|
||||
|
||||
if (GPT4ALL_TRANSLATIONS)
|
||||
qt_add_translations(chat
|
||||
TS_FILES
|
||||
${CMAKE_SOURCE_DIR}/translations/gpt4all_en.ts
|
||||
${CMAKE_SOURCE_DIR}/translations/gpt4all_es_MX.ts
|
||||
${CMAKE_SOURCE_DIR}/translations/gpt4all_zh_CN.ts
|
||||
${CMAKE_SOURCE_DIR}/translations/gpt4all_zh_TW.ts
|
||||
${CMAKE_SOURCE_DIR}/translations/gpt4all_ro_RO.ts
|
||||
${CMAKE_SOURCE_DIR}/translations/gpt4all_it_IT.ts
|
||||
${CMAKE_SOURCE_DIR}/translations/gpt4all_pt_BR.ts
|
||||
)
|
||||
endif()
|
||||
qt_add_translations(chat
|
||||
TS_FILES
|
||||
${CMAKE_SOURCE_DIR}/translations/gpt4all_en_US.ts
|
||||
${CMAKE_SOURCE_DIR}/translations/gpt4all_es_MX.ts
|
||||
${CMAKE_SOURCE_DIR}/translations/gpt4all_zh_CN.ts
|
||||
${CMAKE_SOURCE_DIR}/translations/gpt4all_zh_TW.ts
|
||||
${CMAKE_SOURCE_DIR}/translations/gpt4all_ro_RO.ts
|
||||
${CMAKE_SOURCE_DIR}/translations/gpt4all_it_IT.ts
|
||||
${CMAKE_SOURCE_DIR}/translations/gpt4all_pt_BR.ts
|
||||
)
|
||||
|
||||
set_target_properties(chat PROPERTIES
|
||||
WIN32_EXECUTABLE TRUE
|
||||
@@ -259,19 +406,20 @@ if (APPLE)
|
||||
MACOSX_BUNDLE_GUI_IDENTIFIER gpt4all
|
||||
MACOSX_BUNDLE_BUNDLE_VERSION ${PROJECT_VERSION}
|
||||
MACOSX_BUNDLE_SHORT_VERSION_STRING ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}
|
||||
RESOURCE "${CHAT_EXE_RESOURCES}"
|
||||
OUTPUT_NAME gpt4all
|
||||
)
|
||||
add_dependencies(chat ggml-metal)
|
||||
endif()
|
||||
|
||||
if(NOT MAC_SIGNING_IDENTITY)
|
||||
if(NOT DEFINED ENV{MAC_SIGNING_CERT_NAME} AND GPT4ALL_SIGN_INSTALL)
|
||||
if (APPLE AND GPT4ALL_SIGN_INSTALL)
|
||||
if (NOT MAC_SIGNING_IDENTITY)
|
||||
if (NOT DEFINED ENV{MAC_SIGNING_CERT_NAME})
|
||||
REPORT_MISSING_SIGNING_CONTEXT()
|
||||
endif()
|
||||
set(MAC_SIGNING_IDENTITY $ENV{MAC_SIGNING_CERT_NAME})
|
||||
endif()
|
||||
if(NOT MAC_SIGNING_TID)
|
||||
if(NOT DEFINED ENV{MAC_NOTARIZATION_TID} AND GPT4ALL_SIGN_INSTALL)
|
||||
if (NOT MAC_SIGNING_TID)
|
||||
if (NOT DEFINED ENV{MAC_NOTARIZATION_TID})
|
||||
REPORT_MISSING_SIGNING_CONTEXT()
|
||||
endif()
|
||||
set(MAC_SIGNING_TID $ENV{MAC_NOTARIZATION_TID})
|
||||
@ -290,37 +438,47 @@ endif()
|
||||
target_compile_definitions(chat
|
||||
PRIVATE $<$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>:QT_QML_DEBUG>)
|
||||
|
||||
target_include_directories(chat PRIVATE src)
|
||||
|
||||
# usearch uses the identifier 'slots' which conflicts with Qt's 'slots' keyword
|
||||
target_compile_definitions(chat PRIVATE QT_NO_SIGNALS_SLOTS_KEYWORDS)
|
||||
|
||||
target_include_directories(chat PRIVATE usearch/include
|
||||
usearch/fp16/include)
|
||||
target_include_directories(chat PRIVATE deps/usearch/include
|
||||
deps/usearch/fp16/include)
|
||||
|
||||
if(LINUX)
|
||||
target_link_libraries(chat
|
||||
PRIVATE Qt6::Quick Qt6::Svg Qt6::HttpServer Qt6::Sql Qt6::Pdf Qt6::WaylandCompositor)
|
||||
target_link_libraries(chat
|
||||
PRIVATE Qt6::Core Qt6::HttpServer Qt6::Quick Qt6::Sql Qt6::Svg)
|
||||
if (GPT4ALL_USING_QTPDF)
|
||||
target_compile_definitions(chat PRIVATE GPT4ALL_USE_QTPDF)
|
||||
target_link_libraries(chat PRIVATE Qt6::Pdf)
|
||||
else()
|
||||
target_link_libraries(chat
|
||||
PRIVATE Qt6::Quick Qt6::Svg Qt6::HttpServer Qt6::Sql Qt6::Pdf)
|
||||
# Link PDFium
|
||||
target_link_libraries(chat PRIVATE pdfium)
|
||||
endif()
|
||||
target_link_libraries(chat
|
||||
PRIVATE llmodel)
|
||||
PRIVATE llmodel SingleApplication fmt::fmt duckx::duckx QXlsx)
|
||||
target_include_directories(chat PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/deps/json/include)
|
||||
target_include_directories(chat PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/deps/json/include/nlohmann)
|
||||
target_include_directories(chat PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/deps/minja/include)
|
||||
|
||||
if (APPLE)
|
||||
target_link_libraries(chat PRIVATE ${COCOA_LIBRARY})
|
||||
endif()
|
||||
|
||||
# -- install --
|
||||
|
||||
set(COMPONENT_NAME_MAIN ${PROJECT_NAME})
|
||||
|
||||
if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
|
||||
set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/install CACHE PATH "..." FORCE)
|
||||
if (APPLE)
|
||||
set(GPT4ALL_LIB_DEST bin/gpt4all.app/Contents/Frameworks)
|
||||
else()
|
||||
set(GPT4ALL_LIB_DEST lib)
|
||||
endif()
|
||||
|
||||
install(TARGETS chat DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN})
|
||||
|
||||
install(
|
||||
TARGETS llmodel
|
||||
LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
|
||||
RUNTIME DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN} # .dll
|
||||
LIBRARY DESTINATION ${GPT4ALL_LIB_DEST} COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
|
||||
RUNTIME DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN} # .dll
|
||||
)
|
||||
|
||||
# We should probably iterate through the backend's CMake target list, but these need to be installed
|
||||
@ -343,8 +501,8 @@ endif()
|
||||
|
||||
install(
|
||||
TARGETS ${MODEL_IMPL_TARGETS}
|
||||
LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
|
||||
RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll
|
||||
LIBRARY DESTINATION ${GPT4ALL_LIB_DEST} COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
|
||||
RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll
|
||||
)
|
||||
|
||||
if(APPLE AND GPT4ALL_SIGN_INSTALL)
|
||||
@ -373,7 +531,7 @@ if (LLMODEL_CUDA)
|
||||
TARGETS llamamodel-mainline-cuda
|
||||
llamamodel-mainline-cuda-avxonly
|
||||
RUNTIME_DEPENDENCY_SET llama-cuda-deps
|
||||
LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
|
||||
LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so
|
||||
RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll
|
||||
)
|
||||
if (WIN32)
|
||||
@ -387,65 +545,38 @@ if (LLMODEL_CUDA)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (NOT GPT4ALL_USING_QTPDF)
|
||||
# Install PDFium
|
||||
if (WIN32)
|
||||
install(FILES ${PDFium_LIBRARY} DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN}) # .dll
|
||||
else()
|
||||
install(FILES ${PDFium_LIBRARY} DESTINATION ${GPT4ALL_LIB_DEST} COMPONENT ${COMPONENT_NAME_MAIN}) # .so/.dylib
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (NOT APPLE)
|
||||
install(FILES "${CMAKE_BINARY_DIR}/resources/${LOCAL_EMBEDDING_MODEL}"
|
||||
install(FILES "${LOCAL_EMBEDDING_MODEL_PATH}"
|
||||
DESTINATION resources
|
||||
COMPONENT ${COMPONENT_NAME_MAIN})
|
||||
endif()
|
||||
|
||||
set(CPACK_GENERATOR "IFW")
|
||||
set(CPACK_VERBATIM_VARIABLES YES)
|
||||
set(CPACK_IFW_VERBOSE ON)
|
||||
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES Linux)
|
||||
if (CMAKE_SYSTEM_NAME MATCHES Linux)
|
||||
find_program(LINUXDEPLOYQT linuxdeployqt HINTS "$ENV{HOME}/dev/linuxdeployqt/build/tools/linuxdeployqt" "$ENV{HOME}/project/linuxdeployqt/bin")
|
||||
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/deploy-qt-linux.cmake.in"
|
||||
"${CMAKE_BINARY_DIR}/cmake/deploy-qt-linux.cmake" @ONLY)
|
||||
set(CPACK_PRE_BUILD_SCRIPTS ${CMAKE_BINARY_DIR}/cmake/deploy-qt-linux.cmake)
|
||||
set(CPACK_IFW_ROOT "~/Qt/Tools/QtInstallerFramework/4.6")
|
||||
set(CPACK_PACKAGE_FILE_NAME "${COMPONENT_NAME_MAIN}-installer-linux")
|
||||
set(CPACK_IFW_TARGET_DIRECTORY "@HomeDir@/${COMPONENT_NAME_MAIN}")
|
||||
elseif(${CMAKE_SYSTEM_NAME} MATCHES Windows)
|
||||
find_program(WINDEPLOYQT windeployqt HINTS ${_qt_bin_dir})
|
||||
elseif (CMAKE_SYSTEM_NAME MATCHES Windows)
|
||||
find_program(WINDEPLOYQT windeployqt)
|
||||
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/deploy-qt-windows.cmake.in"
|
||||
"${CMAKE_BINARY_DIR}/cmake/deploy-qt-windows.cmake" @ONLY)
|
||||
set(CPACK_PRE_BUILD_SCRIPTS ${CMAKE_BINARY_DIR}/cmake/deploy-qt-windows.cmake)
|
||||
set(CPACK_IFW_ROOT "C:/Qt/Tools/QtInstallerFramework/4.6")
|
||||
set(CPACK_IFW_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.ico")
|
||||
set(CPACK_PACKAGE_FILE_NAME "${COMPONENT_NAME_MAIN}-installer-win64")
|
||||
set(CPACK_IFW_TARGET_DIRECTORY "@HomeDir@\\${COMPONENT_NAME_MAIN}")
|
||||
elseif(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
|
||||
find_program(MACDEPLOYQT macdeployqt HINTS ${_qt_bin_dir})
|
||||
elseif (CMAKE_SYSTEM_NAME MATCHES Darwin)
|
||||
find_program(MACDEPLOYQT macdeployqt)
|
||||
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/deploy-qt-mac.cmake.in"
|
||||
"${CMAKE_BINARY_DIR}/cmake/deploy-qt-mac.cmake" @ONLY)
|
||||
set(CPACK_PRE_BUILD_SCRIPTS ${CMAKE_BINARY_DIR}/cmake/deploy-qt-mac.cmake)
|
||||
set(CPACK_IFW_ROOT "~/Qt/Tools/QtInstallerFramework/4.6")
|
||||
set(CPACK_IFW_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.icns")
|
||||
set(CPACK_PACKAGE_FILE_NAME "${COMPONENT_NAME_MAIN}-installer-darwin")
|
||||
set(CPACK_IFW_TARGET_DIRECTORY "@ApplicationsDir@/${COMPONENT_NAME_MAIN}")
|
||||
set(CPACK_BUNDLE_NAME ${COMPONENT_NAME_MAIN})
|
||||
set(CPACK_BUNDLE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.icns")
|
||||
endif()
|
||||
|
||||
set(CPACK_PACKAGE_INSTALL_DIRECTORY ${COMPONENT_NAME_MAIN})
|
||||
set(CPACK_PACKAGE_VERSION_MAJOR ${PROJECT_VERSION_MAJOR})
|
||||
set(CPACK_PACKAGE_VERSION_MINOR ${PROJECT_VERSION_MINOR})
|
||||
SET(CPACK_PACKAGE_VERSION_PATCH ${PROJECT_VERSION_PATCH})
|
||||
set(CPACK_PACKAGE_HOMEPAGE_URL "https://gpt4all.io")
|
||||
set(CPACK_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-48.png")
|
||||
set(CPACK_RESOURCE_FILE_LICENSE ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE)
|
||||
set(CPACK_RESOURCE_FILE_README ${CMAKE_CURRENT_SOURCE_DIR}/README.md)
|
||||
set(CPACK_PACKAGE_EXECUTABLES "GPT4All")
|
||||
set(CPACK_CREATE_DESKTOP_LINKS "GPT4All")
|
||||
set(CPACK_IFW_PACKAGE_NAME "GPT4All")
|
||||
set(CPACK_IFW_PACKAGE_TITLE "GPT4All Installer")
|
||||
set(CPACK_IFW_PACKAGE_PUBLISHER "Nomic, Inc.")
|
||||
set(CPACK_IFW_PRODUCT_URL "https://gpt4all.io")
|
||||
set(CPACK_IFW_PACKAGE_WIZARD_STYLE "Aero")
|
||||
set(CPACK_IFW_PACKAGE_LOGO "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-48.png")
|
||||
set(CPACK_IFW_PACKAGE_WINDOW_ICON "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-32.png")
|
||||
set(CPACK_IFW_PACKAGE_WIZARD_SHOW_PAGE_LIST OFF)
|
||||
|
||||
include(InstallRequiredSystemLibraries)
|
||||
include(CPack)
|
||||
include(CPackIFW)
|
||||
@ -457,20 +588,35 @@ endif()
|
||||
cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} ESSENTIAL FORCED_INSTALLATION)
|
||||
cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} VERSION ${APP_VERSION})
|
||||
cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} LICENSES "MIT LICENSE" ${CPACK_RESOURCE_FILE_LICENSE})
|
||||
cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/cmake/installerscript.qs")
|
||||
cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/cmake/installer_gpt4all_component.qs")
|
||||
cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} REPLACES "gpt4all-chat") # Was used in the very earliest prototypes
|
||||
|
||||
if (APPLE AND GPT4ALL_SIGN_INSTALL)
|
||||
if (GPT4ALL_OFFLINE_INSTALLER)
|
||||
cpack_add_component(maintenancetool HIDDEN)
|
||||
else()
|
||||
cpack_add_component(maintenancetool HIDDEN DOWNLOADED)
|
||||
endif()
|
||||
cpack_ifw_configure_component(maintenancetool ESSENTIAL FORCED_INSTALLATION)
|
||||
cpack_ifw_configure_component(maintenancetool VERSION ${APP_VERSION})
|
||||
cpack_ifw_configure_component(maintenancetool SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/cmake/installer_maintenancetool_component.qs")
|
||||
endif()
|
||||
|
||||
if (GPT4ALL_LOCALHOST)
|
||||
cpack_ifw_add_repository("GPT4AllRepository" URL "http://localhost/repository")
|
||||
elseif(GPT4ALL_OFFLINE_INSTALLER)
|
||||
add_compile_definitions(GPT4ALL_OFFLINE_INSTALLER)
|
||||
elseif (GPT4ALL_OFFLINE_INSTALLER)
|
||||
add_compile_definitions(GPT4ALL_OFFLINE_INSTALLER)
|
||||
else()
|
||||
if(${CMAKE_SYSTEM_NAME} MATCHES Linux)
|
||||
cpack_ifw_add_repository("GPT4AllRepository" URL "https://gpt4all.io/installer_repos/linux/repository")
|
||||
elseif(${CMAKE_SYSTEM_NAME} MATCHES Windows)
|
||||
# To sign the target on Windows, we have to create a batch script, use it as a custom target, and then use CPACK_IFW_EXTRA_TARGETS to set this extra target
|
||||
cpack_ifw_add_repository("GPT4AllRepository" URL "https://gpt4all.io/installer_repos/windows/repository")
|
||||
elseif(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
|
||||
cpack_ifw_add_repository("GPT4AllRepository" URL "https://gpt4all.io/installer_repos/mac/repository")
|
||||
endif()
|
||||
if (CMAKE_SYSTEM_NAME MATCHES Linux)
|
||||
cpack_ifw_add_repository("GPT4AllRepository" URL "https://gpt4all.io/installer_repos/linux/repository")
|
||||
elseif (CMAKE_SYSTEM_NAME MATCHES Windows)
|
||||
# To sign the target on Windows, we have to create a batch script, use it as a custom target, and then use CPACK_IFW_EXTRA_TARGETS to set this extra target
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$")
|
||||
cpack_ifw_add_repository("GPT4AllRepository" URL "https://gpt4all.io/installer_repos/windows/repository")
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|AARCH64|arm64|ARM64)$")
|
||||
cpack_ifw_add_repository("GPT4AllRepository" URL "https://gpt4all.io/installer_repos/windows_arm/repository")
|
||||
endif()
|
||||
elseif (CMAKE_SYSTEM_NAME MATCHES Darwin)
|
||||
cpack_ifw_add_repository("GPT4AllRepository" URL "https://gpt4all.io/installer_repos/mac/repository")
|
||||
endif()
|
||||
endif()
|
||||
|
@ -1,45 +0,0 @@
|
||||
# gpt4all-chat
|
||||
|
||||
Cross-platform, Qt-based GUI for GPT4All versions with GPT-J as the base
|
||||
model. NOTE: The model seen in the screenshot is actually a preview of a
|
||||
new training run for GPT4All based on GPT-J. The GPT4All project is busy
|
||||
at work getting ready to release this model, including installers for all
|
||||
three major OSes. In the meantime, you can try this UI out with the original
|
||||
GPT-J model by following build instructions below.
|
||||
|
||||

|
||||
|
||||
## Install
|
||||
|
||||
One-click installers for macOS, Linux, and Windows at https://gpt4all.io
|
||||
|
||||
## Features
|
||||
|
||||
* Cross-platform (Linux, Windows, macOS)
|
||||
* The UI is made to look and feel like the chat interfaces you've come to expect
|
||||
* Check for updates so you can always stay fresh with the latest models
|
||||
* Easy to install with precompiled binaries available for all three major desktop platforms
|
||||
* Multi-model - ability to load more than one model and switch between them
|
||||
* Multi-chat - a list of current and past chats and the ability to save/delete/export and switch between them
|
||||
* Supports any model that llama.cpp supports
|
||||
* Model downloader in GUI featuring many popular open source models
|
||||
* Settings dialog to change temp, top_p, min_p, top_k, threads, etc.
|
||||
* Copy your conversation to clipboard
|
||||
* RAG via LocalDocs feature
|
||||
* Check for updates to get the very latest GUI
|
||||
|
||||
## Building and running
|
||||
|
||||
* Follow the visual instructions on the [build_and_run](build_and_run.md) page
|
||||
|
||||
## Getting the latest
|
||||
|
||||
If you've already checked out the source code and/or built the program, make sure to run `git fetch` to get the latest changes, and also run `git submodule update --init --recursive` to update the submodules. (If you ever run into trouble, deinitializing via `git submodule deinit -f .` and then initializing again via `git submodule update --init --recursive` fixes most issues.)
|
||||
|
||||
## Contributing
|
||||
|
||||
* Pull requests welcome. See the feature wish list for ideas :)
|
||||
|
||||
|
||||
## License
|
||||
The source code of this chat interface is currently under an MIT license.
|
@ -1,109 +1,106 @@
|
||||
# Building gpt4all-chat from source
|
||||
|
||||
Depending upon your operating system, there are many ways that Qt is distributed.
|
||||
Here is the recommended method for getting the Qt dependency installed to set up and build
|
||||
gpt4all-chat from source.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
You will need a compiler. On Windows, you should install Visual Studio with the C++ Development components. On macOS, you will need the full version of Xcode—Xcode Command Line Tools lacks certain required tools. On Linux, you will need a GCC or Clang toolchain with C++ support.
|
||||
|
||||
On Windows and Linux, building GPT4All with full GPU support requires the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) and the latest [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
|
||||
|
||||
## Note for Linux users
|
||||
|
||||
Linux users may install Qt via their distro's official packages instead of using the Qt installer. You need at least Qt 6.5, with support for QPdf and the Qt HTTP Server. It should be straightforward to build with just cmake and make, but you may continue to follow these instructions to build with Qt Creator.
|
||||
|
||||
On Arch Linux, this looks like:
|
||||
```
|
||||
sudo pacman -S --needed base-devel qt6-base qt6-declarative qt6-wayland qt6-svg qt6-httpserver qt6-webengine qt6-5compat qt6-shadertools qtcreator cmake ninja
|
||||
```
|
||||
|
||||
On Ubuntu 23.04, this looks like:
|
||||
```
|
||||
sudo apt install build-essential qt6-base-dev qt6-declarative-dev qt6-wayland-dev qt6-svg-dev qt6-httpserver-dev qt6-webengine-dev libqt6core5compat6 qml6-module-qt5compat-graphicaleffects libqt6shadertools6 qtcreator cmake ninja-build
|
||||
```
|
||||
|
||||
On Fedora 39, this looks like:
|
||||
```
|
||||
sudo dnf install make gcc gcc-c++ qt6-qtbase-devel qt6-qtdeclarative-devel qt6-qtwayland-devel qt6-qtsvg-devel qt6-qthttpserver-devel qt6-qtwebengine-devel qt6-qt5compat qt5-qtgraphicaleffects qt6-qtshadertools qt-creator cmake ninja-build
|
||||
```
|
||||
|
||||
## Download Qt
|
||||
|
||||
- Go to https://login.qt.io/register to create a free Qt account.
|
||||
- Download the Qt Online Installer for your OS from here: https://www.qt.io/download-qt-installer-oss
|
||||
- Sign into the installer.
|
||||
- Agree to the terms of the (L)GPL 3 license.
|
||||
- Select whether you would like to send anonymous usage statistics to Qt.
|
||||
- On the Installation Folder page, leave the default installation path, and select "Custom Installation".
|
||||
|
||||
## Customize the installation
|
||||
|
||||

|
||||
|
||||
Under "Qt", find the latest Qt 6.x release.
|
||||
|
||||
Under this release (e.g. Qt 6.5.0), select the target platform:
|
||||
- On macOS, it is just called "macOS".
|
||||
- On Windows, it is called "MSVC 2019 64-bit" (for 64-bit x86 CPUs). MinGW has not been tested.
|
||||
|
||||
Under this release, select the following additional components:
|
||||
- Qt Quick 3D
|
||||
- Qt Wayland Compositor (for Linux only)
|
||||
- Qt 5 Compatibility Module
|
||||
- Qt Shader Tools
|
||||
- Additional Libraries:
|
||||
- Qt HTTP Server
|
||||
- Qt PDF
|
||||
- Qt Debug information Files
|
||||
|
||||
Under Developer and Designer Tools, select the following components:
|
||||
- Qt Creator
|
||||
- Qt Creator CDB Debugger Support (for Windows only)
|
||||
- Debugging Tools for Windows (for Windows only)
|
||||
- CMake
|
||||
- Ninja
|
||||
|
||||
Agree to the license and complete the installation.
|
||||
|
||||
## Download the source code
|
||||
|
||||
You must use git to download the source code for gpt4all:
|
||||
```
|
||||
git clone --recurse-submodules https://github.com/nomic-ai/gpt4all
|
||||
```
|
||||
|
||||
Note the use of --recurse-submodules, which makes sure the necessary dependencies are downloaded inside the repo. This is why you cannot simply download a zip archive.
|
||||
|
||||
Windows users: To install git for Windows, see https://git-scm.com/downloads. Once it is installed, you should be able to shift-right click in any folder, "Open PowerShell window here" (or similar, depending on the version of Windows), and run the above command.
|
||||
|
||||
## Open gpt4all-chat in Qt Creator
|
||||
|
||||
Open Qt Creator. Navigate to File > Open File or Project, find the "gpt4all-chat" folder inside the freshly cloned repository, and select CMakeLists.txt.
|
||||
|
||||

|
||||
|
||||
## Configure project
|
||||
|
||||
You can now expand the "Details" section next to the build kit. It is best to uncheck all but one build configuration, e.g. "Release", which will produce optimized binaries that are not useful for debugging.
|
||||
|
||||
Click "Configure Project", and wait for it to complete.
|
||||
|
||||

|
||||
|
||||
## Build project
|
||||
|
||||
Now that the project has been configured, click the hammer button on the left sidebar to build the project.
|
||||
|
||||

|
||||
|
||||
## Run project
|
||||
|
||||
Click the play button on the left sidebar to run the Chat UI.
|
||||
|
||||

|
||||
|
||||
## Updating the downloaded source code
|
||||
|
||||
You do not need to make a fresh clone of the source code every time. To update it, you may open a terminal/command prompt in the repository, run `git pull`, and then `git submodule update --init --recursive`.
|
||||
# Building gpt4all-chat from source
|
||||
|
||||
Depending upon your operating system, there are many ways that Qt is distributed.
|
||||
Here is the recommended method for getting the Qt dependency installed to set up and build
|
||||
gpt4all-chat from source.
|
||||
|
||||
## Prerequisites
|
||||
|
||||
You will need a compiler. On Windows, you should install Visual Studio with the C++ Development components. On macOS, you will need the full version of Xcode—Xcode Command Line Tools lacks certain required tools. On Linux, you will need a GCC or Clang toolchain with C++ support.
|
||||
|
||||
On Windows and Linux, building GPT4All with full GPU support requires the [Vulkan SDK](https://vulkan.lunarg.com/sdk/home) and the latest [CUDA Toolkit](https://developer.nvidia.com/cuda-downloads).
|
||||
|
||||
## Note for Linux users
|
||||
|
||||
Linux users may install Qt via their distro's official packages instead of using the Qt installer. You need at least Qt 6.5, with support for QPdf and the Qt HTTP Server. You may build from the CLI using CMake and Ninja, or with Qt Creator as described later in this document.
|
||||
|
||||
On Arch Linux, this looks like:
|
||||
```
|
||||
sudo pacman -S --needed cmake gcc ninja qt6-5compat qt6-base qt6-declarative qt6-httpserver qt6-svg qtcreator
|
||||
```
|
||||
|
||||
On Ubuntu 23.04, this looks like:
|
||||
```
|
||||
sudo apt install cmake g++ libgl-dev libqt6core5compat6 ninja-build qml6-module-qt5compat-graphicaleffects qt6-base-private-dev qt6-declarative-dev qt6-httpserver-dev qt6-svg-dev qtcreator
|
||||
```
|
||||
|
||||
On Fedora 39, this looks like:
|
||||
```
|
||||
sudo dnf install cmake gcc-c++ ninja-build qt-creator qt5-qtgraphicaleffects qt6-qt5compat qt6-qtbase-private-devel qt6-qtdeclarative-devel qt6-qthttpserver-devel qt6-qtsvg-devel
|
||||
```
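If you go the CLI route mentioned above instead of Qt Creator, a minimal configure-and-build might look like the following sketch (the source and build directory names are illustrative, not prescriptive):
```
cmake -S gpt4all/gpt4all-chat -B build -G Ninja -DCMAKE_BUILD_TYPE=Release
cmake --build build
```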
|
||||
|
||||
## Download Qt
|
||||
|
||||
- Go to https://login.qt.io/register to create a free Qt account.
|
||||
- Download the Qt Online Installer for your OS from here: https://www.qt.io/download-qt-installer-oss
|
||||
- Sign into the installer.
|
||||
- Agree to the terms of the (L)GPL 3 license.
|
||||
- Select whether you would like to send anonymous usage statistics to Qt.
|
||||
- On the Installation Folder page, leave the default installation path, and select "Custom Installation".
|
||||
|
||||
## Customize the installation
|
||||
|
||||

|
||||
|
||||
Under "Qt", find the latest Qt 6.x release.
|
||||
|
||||
Under this release (e.g. Qt 6.5.0), select the target platform:
|
||||
- On macOS, it is just called "macOS".
|
||||
- On Windows, it is called "MSVC 2019 64-bit" (for 64-bit x86 CPUs). MinGW has not been tested.
|
||||
|
||||
Under this release, select the following additional components:
|
||||
- Qt 5 Compatibility Module
|
||||
- Additional Libraries:
|
||||
- Qt HTTP Server
|
||||
- Qt PDF
|
||||
- Qt Debug information Files
|
||||
|
||||
Under Developer and Designer Tools, select the following components:
|
||||
- Qt Creator
|
||||
- Qt Creator CDB Debugger Support (for Windows only)
|
||||
- Debugging Tools for Windows (for Windows only)
|
||||
- CMake
|
||||
- Ninja
|
||||
|
||||
Agree to the license and complete the installation.
|
||||
|
||||
## Download the source code
|
||||
|
||||
You must use git to download the source code for gpt4all:
|
||||
```
|
||||
git clone --recurse-submodules https://github.com/nomic-ai/gpt4all
|
||||
```
|
||||
|
||||
Note the use of --recurse-submodules, which makes sure the necessary dependencies are downloaded inside the repo. This is why you cannot simply download a zip archive.
|
||||
|
||||
Windows users: To install git for Windows, see https://git-scm.com/downloads. Once it is installed, you should be able to shift-right click in any folder, "Open PowerShell window here" (or similar, depending on the version of Windows), and run the above command.
|
||||
|
||||
## Open gpt4all-chat in Qt Creator
|
||||
|
||||
Open Qt Creator. Navigate to File > Open File or Project, find the "gpt4all-chat" folder inside the freshly cloned repository, and select CMakeLists.txt.
|
||||
|
||||

|
||||
|
||||
## Configure project
|
||||
|
||||
You can now expand the "Details" section next to the build kit. It is best to uncheck all but one build configuration, e.g. "Release", which will produce optimized binaries that are not useful for debugging.
|
||||
|
||||
Click "Configure Project", and wait for it to complete.
|
||||
|
||||

|
||||
|
||||
## Build project
|
||||
|
||||
Now that the project has been configured, click the hammer button on the left sidebar to build the project.
|
||||
|
||||

|
||||
|
||||
## Run project
|
||||
|
||||
Click the play button on the left sidebar to run the Chat UI.
|
||||
|
||||

|
||||
|
||||
## Updating the downloaded source code
|
||||
|
||||
You do not need to make a fresh clone of the source code every time. To update it, you may open a terminal/command prompt in the repository, run `git pull`, and then `git submodule update --init --recursive`.
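In other words, from the root of your existing checkout:
```
git pull
git submodule update --init --recursive
```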
|
||||
|
@ -1,144 +0,0 @@
|
||||
#ifndef CHATAPI_H
|
||||
#define CHATAPI_H
|
||||
|
||||
#include "../gpt4all-backend/llmodel.h"
|
||||
|
||||
#include <QByteArray>
|
||||
#include <QNetworkReply>
|
||||
#include <QObject>
|
||||
#include <QString>
|
||||
#include <QStringList>
|
||||
#include <QList>
|
||||
|
||||
#include <cstddef>
|
||||
#include <cstdint>
|
||||
#include <stdexcept>
|
||||
#include <functional>
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
class QNetworkAccessManager;
|
||||
|
||||
class ChatAPI;
|
||||
class ChatAPIWorker : public QObject {
|
||||
Q_OBJECT
|
||||
public:
|
||||
ChatAPIWorker(ChatAPI *chatAPI)
|
||||
: QObject(nullptr)
|
||||
, m_ctx(nullptr)
|
||||
, m_networkManager(nullptr)
|
||||
, m_chat(chatAPI) {}
|
||||
virtual ~ChatAPIWorker() {}
|
||||
|
||||
QString currentResponse() const { return m_currentResponse; }
|
||||
|
||||
void request(const QString &apiKey,
|
||||
LLModel::PromptContext *promptCtx,
|
||||
const QByteArray &array);
|
||||
|
||||
Q_SIGNALS:
|
||||
void finished();
|
||||
|
||||
private Q_SLOTS:
|
||||
void handleFinished();
|
||||
void handleReadyRead();
|
||||
void handleErrorOccurred(QNetworkReply::NetworkError code);
|
||||
|
||||
private:
|
||||
ChatAPI *m_chat;
|
||||
LLModel::PromptContext *m_ctx;
|
||||
QNetworkAccessManager *m_networkManager;
|
||||
QString m_currentResponse;
|
||||
};
|
||||
|
||||
class ChatAPI : public QObject, public LLModel {
|
||||
Q_OBJECT
|
||||
public:
|
||||
ChatAPI();
|
||||
virtual ~ChatAPI();
|
||||
|
||||
bool supportsEmbedding() const override { return false; }
|
||||
bool supportsCompletion() const override { return true; }
|
||||
bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override;
|
||||
bool isModelLoaded() const override;
|
||||
size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
|
||||
size_t stateSize() const override;
|
||||
size_t saveState(uint8_t *dest) const override;
|
||||
size_t restoreState(const uint8_t *src) override;
|
||||
void prompt(const std::string &prompt,
|
||||
const std::string &promptTemplate,
|
||||
std::function<bool(int32_t)> promptCallback,
|
||||
std::function<bool(int32_t, const std::string&)> responseCallback,
|
||||
std::function<bool(bool)> recalculateCallback,
|
||||
PromptContext &ctx,
|
||||
bool special,
|
||||
std::string *fakeReply) override;
|
||||
|
||||
void setThreadCount(int32_t n_threads) override;
|
||||
int32_t threadCount() const override;
|
||||
|
||||
void setModelName(const QString &modelName) { m_modelName = modelName; }
|
||||
void setAPIKey(const QString &apiKey) { m_apiKey = apiKey; }
|
||||
void setRequestURL(const QString &requestURL) { m_requestURL = requestURL; }
|
||||
QString url() const { return m_requestURL; }
|
||||
|
||||
QList<QString> context() const { return m_context; }
|
||||
void setContext(const QList<QString> &context) { m_context = context; }
|
||||
|
||||
bool callResponse(int32_t token, const std::string &string);
|
||||
|
||||
Q_SIGNALS:
|
||||
void request(const QString &apiKey,
|
||||
LLModel::PromptContext *ctx,
|
||||
const QByteArray &array);
|
||||
|
||||
protected:
|
||||
// We have to implement these because they are pure virtual in the base class, but we don't actually use
|
||||
// them, as they are only called from the default implementation of 'prompt', which we override and
|
||||
// completely replace
|
||||
|
||||
std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) const override {
|
||||
(void)ctx;
|
||||
(void)str;
|
||||
(void)special;
|
||||
throw std::logic_error("not implemented");
|
||||
}
|
||||
|
||||
std::string tokenToString(Token id) const override {
|
||||
(void)id;
|
||||
throw std::logic_error("not implemented");
|
||||
}
|
||||
|
||||
Token sampleToken(PromptContext &ctx) const override {
|
||||
(void)ctx;
|
||||
throw std::logic_error("not implemented");
|
||||
}
|
||||
|
||||
bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override {
|
||||
(void)ctx;
|
||||
(void)tokens;
|
||||
throw std::logic_error("not implemented");
|
||||
}
|
||||
|
||||
int32_t contextLength() const override {
|
||||
throw std::logic_error("not implemented");
|
||||
}
|
||||
|
||||
const std::vector<Token> &endTokens() const override {
|
||||
throw std::logic_error("not implemented");
|
||||
}
|
||||
|
||||
bool shouldAddBOS() const override {
|
||||
throw std::logic_error("not implemented");
|
||||
}
|
||||
|
||||
private:
|
||||
std::function<bool(int32_t, const std::string&)> m_responseCallback;
|
||||
QString m_modelName;
|
||||
QString m_apiKey;
|
||||
QString m_requestURL;
|
||||
QList<QString> m_context;
|
||||
QStringList m_queuedPrompts;
|
||||
};
|
||||
|
||||
#endif // CHATAPI_H
|
@ -1,474 +0,0 @@
|
||||
#ifndef CHATMODEL_H
|
||||
#define CHATMODEL_H
|
||||
|
||||
#include "database.h"
|
||||
|
||||
#include <QAbstractListModel>
|
||||
#include <QByteArray>
|
||||
#include <QDataStream>
|
||||
#include <QHash>
|
||||
#include <QList>
|
||||
#include <QObject>
|
||||
#include <QPair>
|
||||
#include <QString>
|
||||
#include <QVariant>
|
||||
#include <QVector>
|
||||
#include <Qt>
|
||||
#include <QtGlobal>
|
||||
|
||||
struct ChatItem
|
||||
{
|
||||
Q_GADGET
|
||||
Q_PROPERTY(int id MEMBER id)
|
||||
Q_PROPERTY(QString name MEMBER name)
|
||||
Q_PROPERTY(QString value MEMBER value)
|
||||
Q_PROPERTY(QString prompt MEMBER prompt)
|
||||
Q_PROPERTY(QString newResponse MEMBER newResponse)
|
||||
Q_PROPERTY(bool currentResponse MEMBER currentResponse)
|
||||
Q_PROPERTY(bool stopped MEMBER stopped)
|
||||
Q_PROPERTY(bool thumbsUpState MEMBER thumbsUpState)
|
||||
Q_PROPERTY(bool thumbsDownState MEMBER thumbsDownState)
|
||||
Q_PROPERTY(QList<ResultInfo> sources MEMBER sources)
|
||||
Q_PROPERTY(QList<ResultInfo> consolidatedSources MEMBER consolidatedSources)
|
||||
|
||||
public:
|
||||
// TODO: Maybe we should include the model name here as well as timestamp?
|
||||
int id = 0;
|
||||
QString name;
|
||||
QString value;
|
||||
QString prompt;
|
||||
QString newResponse;
|
||||
QList<ResultInfo> sources;
|
||||
QList<ResultInfo> consolidatedSources;
|
||||
bool currentResponse = false;
|
||||
bool stopped = false;
|
||||
bool thumbsUpState = false;
|
||||
bool thumbsDownState = false;
|
||||
};
|
||||
Q_DECLARE_METATYPE(ChatItem)
|
||||
|
||||
class ChatModel : public QAbstractListModel
|
||||
{
|
||||
Q_OBJECT
|
||||
Q_PROPERTY(int count READ count NOTIFY countChanged)
|
||||
|
||||
public:
|
||||
explicit ChatModel(QObject *parent = nullptr) : QAbstractListModel(parent) {}
|
||||
|
||||
enum Roles {
|
||||
IdRole = Qt::UserRole + 1,
|
||||
NameRole,
|
||||
ValueRole,
|
||||
PromptRole,
|
||||
NewResponseRole,
|
||||
CurrentResponseRole,
|
||||
StoppedRole,
|
||||
ThumbsUpStateRole,
|
||||
ThumbsDownStateRole,
|
||||
SourcesRole,
|
||||
ConsolidatedSourcesRole
|
||||
};
|
||||
|
||||
int rowCount(const QModelIndex &parent = QModelIndex()) const override
|
||||
{
|
||||
Q_UNUSED(parent)
|
||||
return m_chatItems.size();
|
||||
}
|
||||
|
||||
QVariant data(const QModelIndex &index, int role = Qt::DisplayRole) const override
|
||||
{
|
||||
if (!index.isValid() || index.row() < 0 || index.row() >= m_chatItems.size())
|
||||
return QVariant();
|
||||
|
||||
const ChatItem &item = m_chatItems.at(index.row());
|
||||
switch (role) {
|
||||
case IdRole:
|
||||
return item.id;
|
||||
case NameRole:
|
||||
return item.name;
|
||||
case ValueRole:
|
||||
return item.value;
|
||||
case PromptRole:
|
||||
return item.prompt;
|
||||
case NewResponseRole:
|
||||
return item.newResponse;
|
||||
case CurrentResponseRole:
|
||||
return item.currentResponse;
|
||||
case StoppedRole:
|
||||
return item.stopped;
|
||||
case ThumbsUpStateRole:
|
||||
return item.thumbsUpState;
|
||||
case ThumbsDownStateRole:
|
||||
return item.thumbsDownState;
|
||||
case SourcesRole:
|
||||
return QVariant::fromValue(item.sources);
|
||||
case ConsolidatedSourcesRole:
|
||||
return QVariant::fromValue(item.consolidatedSources);
|
||||
}
|
||||
|
||||
return QVariant();
|
||||
}
|
||||
|
||||
QHash<int, QByteArray> roleNames() const override
|
||||
{
|
||||
QHash<int, QByteArray> roles;
|
||||
roles[IdRole] = "id";
|
||||
roles[NameRole] = "name";
|
||||
roles[ValueRole] = "value";
|
||||
roles[PromptRole] = "prompt";
|
||||
roles[NewResponseRole] = "newResponse";
|
||||
roles[CurrentResponseRole] = "currentResponse";
|
||||
roles[StoppedRole] = "stopped";
|
||||
roles[ThumbsUpStateRole] = "thumbsUpState";
|
||||
roles[ThumbsDownStateRole] = "thumbsDownState";
|
||||
roles[SourcesRole] = "sources";
|
||||
roles[ConsolidatedSourcesRole] = "consolidatedSources";
|
||||
return roles;
|
||||
}
|
||||
|
||||
void appendPrompt(const QString &name, const QString &value)
|
||||
{
|
||||
ChatItem item;
|
||||
item.name = name;
|
||||
item.value = value;
|
||||
beginInsertRows(QModelIndex(), m_chatItems.size(), m_chatItems.size());
|
||||
m_chatItems.append(item);
|
||||
endInsertRows();
|
||||
emit countChanged();
|
||||
}
|
||||
|
||||
void appendResponse(const QString &name, const QString &prompt)
|
||||
{
|
||||
ChatItem item;
|
||||
item.id = m_chatItems.count(); // This is only relevant for responses
|
||||
item.name = name;
|
||||
item.prompt = prompt;
|
||||
item.currentResponse = true;
|
||||
beginInsertRows(QModelIndex(), m_chatItems.size(), m_chatItems.size());
|
||||
m_chatItems.append(item);
|
||||
endInsertRows();
|
||||
emit countChanged();
|
||||
}
|
||||
|
||||
Q_INVOKABLE void clear()
|
||||
{
|
||||
if (m_chatItems.isEmpty()) return;
|
||||
|
||||
beginResetModel();
|
||||
m_chatItems.clear();
|
||||
endResetModel();
|
||||
emit countChanged();
|
||||
}
|
||||
|
||||
Q_INVOKABLE ChatItem get(int index)
|
||||
{
|
||||
if (index < 0 || index >= m_chatItems.size()) return ChatItem();
|
||||
return m_chatItems.at(index);
|
||||
}
|
||||
|
||||
Q_INVOKABLE void updateCurrentResponse(int index, bool b)
|
||||
{
|
||||
if (index < 0 || index >= m_chatItems.size()) return;
|
||||
|
||||
ChatItem &item = m_chatItems[index];
|
||||
if (item.currentResponse != b) {
|
||||
item.currentResponse = b;
|
||||
emit dataChanged(createIndex(index, 0), createIndex(index, 0), {CurrentResponseRole});
|
||||
}
|
||||
}
|
||||
|
||||
Q_INVOKABLE void updateStopped(int index, bool b)
|
||||
{
|
||||
if (index < 0 || index >= m_chatItems.size()) return;
|
||||
|
||||
ChatItem &item = m_chatItems[index];
|
||||
if (item.stopped != b) {
|
||||
item.stopped = b;
|
||||
emit dataChanged(createIndex(index, 0), createIndex(index, 0), {StoppedRole});
|
||||
}
|
||||
}
|
||||
|
||||
Q_INVOKABLE void updateValue(int index, const QString &value)
|
||||
{
|
||||
if (index < 0 || index >= m_chatItems.size()) return;
|
||||
|
||||
ChatItem &item = m_chatItems[index];
|
||||
if (item.value != value) {
|
||||
item.value = value;
|
||||
emit dataChanged(createIndex(index, 0), createIndex(index, 0), {ValueRole});
|
||||
emit valueChanged(index, value);
|
||||
}
|
||||
}
|
||||
|
||||
QList<ResultInfo> consolidateSources(const QList<ResultInfo> &sources) {
|
||||
QMap<QString, ResultInfo> groupedData;
|
||||
for (const ResultInfo &info : sources) {
|
||||
if (groupedData.contains(info.file)) {
|
||||
groupedData[info.file].text += "\n---\n" + info.text;
|
||||
} else {
|
||||
groupedData[info.file] = info;
|
||||
}
|
||||
}
|
||||
QList<ResultInfo> consolidatedSources = groupedData.values();
|
||||
return consolidatedSources;
|
||||
}
|
||||
|
||||
Q_INVOKABLE void updateSources(int index, const QList<ResultInfo> &sources)
|
||||
{
|
||||
if (index < 0 || index >= m_chatItems.size()) return;
|
||||
|
||||
ChatItem &item = m_chatItems[index];
|
||||
item.sources = sources;
|
||||
item.consolidatedSources = consolidateSources(sources);
|
||||
emit dataChanged(createIndex(index, 0), createIndex(index, 0), {SourcesRole});
|
||||
emit dataChanged(createIndex(index, 0), createIndex(index, 0), {ConsolidatedSourcesRole});
|
||||
}
|
||||
|
||||
Q_INVOKABLE void updateThumbsUpState(int index, bool b)
|
||||
{
|
||||
if (index < 0 || index >= m_chatItems.size()) return;
|
||||
|
||||
ChatItem &item = m_chatItems[index];
|
||||
if (item.thumbsUpState != b) {
|
||||
item.thumbsUpState = b;
|
||||
emit dataChanged(createIndex(index, 0), createIndex(index, 0), {ThumbsUpStateRole});
|
||||
}
|
||||
}
|
||||
|
||||
Q_INVOKABLE void updateThumbsDownState(int index, bool b)
|
||||
{
|
||||
if (index < 0 || index >= m_chatItems.size()) return;
|
||||
|
||||
ChatItem &item = m_chatItems[index];
|
||||
if (item.thumbsDownState != b) {
|
||||
item.thumbsDownState = b;
|
||||
emit dataChanged(createIndex(index, 0), createIndex(index, 0), {ThumbsDownStateRole});
|
||||
}
|
||||
}
|
||||
|
||||
Q_INVOKABLE void updateNewResponse(int index, const QString &newResponse)
|
||||
{
|
||||
if (index < 0 || index >= m_chatItems.size()) return;
|
||||
|
||||
ChatItem &item = m_chatItems[index];
|
||||
if (item.newResponse != newResponse) {
|
||||
item.newResponse = newResponse;
|
||||
emit dataChanged(createIndex(index, 0), createIndex(index, 0), {NewResponseRole});
|
||||
}
|
||||
}
|
||||
|
||||
int count() const { return m_chatItems.size(); }
|
||||
|
||||
bool serialize(QDataStream &stream, int version) const
|
||||
{
|
||||
stream << count();
|
||||
for (const auto &c : m_chatItems) {
|
||||
stream << c.id;
|
||||
stream << c.name;
|
||||
stream << c.value;
|
||||
stream << c.prompt;
|
||||
stream << c.newResponse;
|
||||
stream << c.currentResponse;
|
||||
stream << c.stopped;
|
||||
stream << c.thumbsUpState;
|
||||
stream << c.thumbsDownState;
|
||||
if (version > 7) {
|
||||
stream << c.sources.size();
|
||||
for (const ResultInfo &info : c.sources) {
|
||||
Q_ASSERT(!info.file.isEmpty());
|
||||
stream << info.collection;
|
||||
stream << info.path;
|
||||
stream << info.file;
|
||||
stream << info.title;
|
||||
stream << info.author;
|
||||
stream << info.date;
|
||||
stream << info.text;
|
||||
stream << info.page;
|
||||
stream << info.from;
|
||||
stream << info.to;
|
||||
}
|
||||
} else if (version > 2) {
|
||||
QList<QString> references;
|
||||
QList<QString> referencesContext;
|
||||
int validReferenceNumber = 1;
|
||||
for (const ResultInfo &info : c.sources) {
|
||||
if (info.file.isEmpty())
|
||||
continue;
|
||||
|
||||
QString reference;
|
||||
{
|
||||
QTextStream stream(&reference);
|
||||
stream << (validReferenceNumber++) << ". ";
|
||||
if (!info.title.isEmpty())
|
||||
stream << "\"" << info.title << "\". ";
|
||||
if (!info.author.isEmpty())
|
||||
stream << "By " << info.author << ". ";
|
||||
if (!info.date.isEmpty())
|
||||
stream << "Date: " << info.date << ". ";
|
||||
stream << "In " << info.file << ". ";
|
||||
if (info.page != -1)
|
||||
stream << "Page " << info.page << ". ";
|
||||
if (info.from != -1) {
|
||||
stream << "Lines " << info.from;
|
||||
if (info.to != -1)
|
||||
stream << "-" << info.to;
|
||||
stream << ". ";
|
||||
}
|
||||
stream << "[Context](context://" << validReferenceNumber - 1 << ")";
|
||||
}
|
||||
references.append(reference);
|
||||
referencesContext.append(info.text);
|
||||
}
|
||||
|
||||
stream << references.join("\n");
|
||||
stream << referencesContext;
|
||||
}
|
||||
}
|
||||
return stream.status() == QDataStream::Ok;
|
||||
}
|
||||
|
||||
bool deserialize(QDataStream &stream, int version)
|
||||
{
|
||||
int size;
|
||||
stream >> size;
|
||||
for (int i = 0; i < size; ++i) {
|
||||
ChatItem c;
|
||||
stream >> c.id;
|
||||
stream >> c.name;
|
||||
stream >> c.value;
|
||||
stream >> c.prompt;
|
||||
stream >> c.newResponse;
|
||||
stream >> c.currentResponse;
|
||||
stream >> c.stopped;
|
||||
stream >> c.thumbsUpState;
|
||||
stream >> c.thumbsDownState;
|
||||
if (version > 7) {
|
||||
qsizetype count;
|
||||
stream >> count;
|
||||
QList<ResultInfo> sources;
|
||||
for (int i = 0; i < count; ++i) {
|
||||
ResultInfo info;
|
||||
stream >> info.collection;
|
||||
stream >> info.path;
|
||||
stream >> info.file;
|
||||
stream >> info.title;
|
||||
stream >> info.author;
|
||||
stream >> info.date;
|
||||
stream >> info.text;
|
||||
stream >> info.page;
|
||||
stream >> info.from;
|
||||
stream >> info.to;
|
||||
sources.append(info);
|
||||
}
|
||||
c.sources = sources;
|
||||
c.consolidatedSources = consolidateSources(sources);
|
||||
} else if (version > 2) {
|
||||
QString references;
|
||||
QList<QString> referencesContext;
|
||||
stream >> references;
|
||||
stream >> referencesContext;
|
||||
|
||||
if (!references.isEmpty()) {
|
||||
QList<ResultInfo> sources;
|
||||
QList<QString> referenceList = references.split("\n");
|
||||
|
||||
// Ignore empty lines and those that begin with "---", a marker which is no longer used
|
||||
for (auto it = referenceList.begin(); it != referenceList.end();) {
|
||||
if (it->trimmed().isEmpty() || it->trimmed().startsWith("---"))
|
||||
it = referenceList.erase(it);
|
||||
else
|
||||
++it;
|
||||
}
|
||||
|
||||
Q_ASSERT(referenceList.size() == referencesContext.size());
|
||||
for (int j = 0; j < referenceList.size(); ++j) {
|
||||
QString reference = referenceList[j];
|
||||
QString context = referencesContext[j];
|
||||
ResultInfo info;
|
||||
QTextStream refStream(&reference);
|
||||
QString dummy;
|
||||
int validReferenceNumber;
|
||||
refStream >> validReferenceNumber >> dummy;
|
||||
// Extract title (between quotes)
|
||||
if (reference.contains("\"")) {
|
||||
int startIndex = reference.indexOf('"') + 1;
|
||||
int endIndex = reference.indexOf('"', startIndex);
|
||||
info.title = reference.mid(startIndex, endIndex - startIndex);
|
||||
}
|
||||
|
||||
// Extract author (after "By " and before the next period)
|
||||
if (reference.contains("By ")) {
|
||||
int startIndex = reference.indexOf("By ") + 3;
|
||||
int endIndex = reference.indexOf('.', startIndex);
|
||||
info.author = reference.mid(startIndex, endIndex - startIndex).trimmed();
|
||||
}
|
||||
|
||||
// Extract date (after "Date: " and before the next period)
|
||||
if (reference.contains("Date: ")) {
|
||||
int startIndex = reference.indexOf("Date: ") + 6;
|
||||
int endIndex = reference.indexOf('.', startIndex);
|
||||
info.date = reference.mid(startIndex, endIndex - startIndex).trimmed();
|
||||
}
|
||||
|
||||
// Extract file name (after "In " and before the "[Context]")
|
||||
if (reference.contains("In ") && reference.contains(". [Context]")) {
|
||||
int startIndex = reference.indexOf("In ") + 3;
|
||||
int endIndex = reference.indexOf(". [Context]", startIndex);
|
||||
info.file = reference.mid(startIndex, endIndex - startIndex).trimmed();
|
||||
}
|
||||
|
||||
// Extract page number (after "Page " and before the next space)
|
||||
if (reference.contains("Page ")) {
|
||||
int startIndex = reference.indexOf("Page ") + 5;
|
||||
int endIndex = reference.indexOf(' ', startIndex);
|
||||
if (endIndex == -1) endIndex = reference.length();
|
||||
info.page = reference.mid(startIndex, endIndex - startIndex).toInt();
|
||||
}
|
||||
|
||||
// Extract lines (after "Lines " and before the next space or hyphen)
|
||||
if (reference.contains("Lines ")) {
|
||||
int startIndex = reference.indexOf("Lines ") + 6;
|
||||
int endIndex = reference.indexOf(' ', startIndex);
|
||||
if (endIndex == -1) endIndex = reference.length();
|
||||
int hyphenIndex = reference.indexOf('-', startIndex);
|
||||
if (hyphenIndex != -1 && hyphenIndex < endIndex) {
|
||||
info.from = reference.mid(startIndex, hyphenIndex - startIndex).toInt();
|
||||
info.to = reference.mid(hyphenIndex + 1, endIndex - hyphenIndex - 1).toInt();
|
||||
} else {
|
||||
info.from = reference.mid(startIndex, endIndex - startIndex).toInt();
|
||||
}
|
||||
}
|
||||
info.text = context;
|
||||
sources.append(info);
|
||||
}
|
||||
|
||||
c.sources = sources;
|
||||
c.consolidatedSources = consolidateSources(sources);
|
||||
}
|
||||
}
|
||||
beginInsertRows(QModelIndex(), m_chatItems.size(), m_chatItems.size());
|
||||
m_chatItems.append(c);
|
||||
endInsertRows();
|
||||
}
|
||||
emit countChanged();
|
||||
return stream.status() == QDataStream::Ok;
|
||||
}
|
||||
|
||||
QVector<QPair<QString, QString>> text() const
|
||||
{
|
||||
QVector<QPair<QString, QString>> result;
|
||||
for (const auto &c : m_chatItems)
|
||||
result << qMakePair(c.name, c.value);
|
||||
return result;
|
||||
}
|
||||
|
||||
Q_SIGNALS:
|
||||
void countChanged();
|
||||
void valueChanged(int index, const QString &value);
|
||||
|
||||
private:
|
||||
|
||||
QList<ChatItem> m_chatItems;
|
||||
};
|
||||
|
||||
#endif // CHATMODEL_H
|
@ -3,7 +3,7 @@ function(sign_target_windows tgt)
|
||||
add_custom_command(TARGET ${tgt}
|
||||
POST_BUILD
|
||||
COMMAND AzureSignTool.exe sign
|
||||
-du "https://gpt4all.io/index.html"
|
||||
-du "https://www.nomic.ai/gpt4all"
|
||||
-kvu https://gpt4all.vault.azure.net
|
||||
-kvi "$Env{AZSignGUID}"
|
||||
-kvs "$Env{AZSignPWD}"
|
||||
@ -14,4 +14,4 @@ function(sign_target_windows tgt)
|
||||
$<TARGET_FILE:${tgt}>
|
||||
)
|
||||
endif()
|
||||
endfunction()
|
||||
endfunction()
|
||||
|
@ -1,6 +0,0 @@
|
||||
#ifndef CONFIG_H
|
||||
#define CONFIG_H
|
||||
|
||||
#define APP_VERSION "@APP_VERSION@"
|
||||
|
||||
#endif // CONFIG_H
|
2
gpt4all-chat/cmake/cpack-steal-config.cmake.in
Normal file
@ -0,0 +1,2 @@
|
||||
set(OUTPUT_DIR "@CMAKE_BINARY_DIR@")
|
||||
file(COPY ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/config DESTINATION ${OUTPUT_DIR}/cpack-config)
|
50
gpt4all-chat/cmake/cpack_config.cmake
Normal file
@ -0,0 +1,50 @@
|
||||
set(COMPONENT_NAME_MAIN "gpt4all")
|
||||
|
||||
set(CPACK_GENERATOR "IFW")
|
||||
set(CPACK_VERBATIM_VARIABLES YES)
|
||||
set(CPACK_IFW_VERBOSE ON)
|
||||
|
||||
if (CMAKE_SYSTEM_NAME MATCHES Linux)
|
||||
set(CPACK_IFW_ROOT "~/Qt/Tools/QtInstallerFramework/4.6")
|
||||
set(CPACK_PACKAGE_FILE_NAME "${COMPONENT_NAME_MAIN}-installer-linux")
|
||||
set(CPACK_IFW_TARGET_DIRECTORY "@HomeDir@/${COMPONENT_NAME_MAIN}")
|
||||
elseif (CMAKE_SYSTEM_NAME MATCHES Windows)
|
||||
set(CPACK_IFW_ROOT "C:/Qt/Tools/QtInstallerFramework/4.6")
|
||||
set(CPACK_IFW_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.ico")
|
||||
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$")
|
||||
set(CPACK_PACKAGE_FILE_NAME "${COMPONENT_NAME_MAIN}-installer-win64")
|
||||
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|AARCH64|arm64|ARM64)$")
|
||||
set(CPACK_PACKAGE_FILE_NAME "${COMPONENT_NAME_MAIN}-installer-win64-arm")
|
||||
else()
|
||||
message(FATAL_ERROR "Unrecognized processor: ${CMAKE_SYSTEM_PROCESSOR}")
|
||||
endif()
|
||||
set(CPACK_IFW_TARGET_DIRECTORY "@HomeDir@\\${COMPONENT_NAME_MAIN}")
|
||||
elseif (CMAKE_SYSTEM_NAME MATCHES Darwin)
|
||||
set(CPACK_IFW_ROOT "~/Qt/Tools/QtInstallerFramework/4.6")
|
||||
set(CPACK_IFW_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.icns")
|
||||
set(CPACK_PACKAGE_FILE_NAME "${COMPONENT_NAME_MAIN}-installer-darwin")
|
||||
set(CPACK_IFW_TARGET_DIRECTORY "@ApplicationsDir@/${COMPONENT_NAME_MAIN}")
|
||||
endif()
|
||||
|
||||
set(CPACK_COMPONENTS_ALL ${COMPONENT_NAME_MAIN}) # exclude development components
|
||||
if (APPLE AND GPT4ALL_SIGN_INSTALL)
|
||||
list(APPEND CPACK_COMPONENTS_ALL maintenancetool)
|
||||
endif()
|
||||
set(CPACK_PACKAGE_INSTALL_DIRECTORY ${COMPONENT_NAME_MAIN})
|
||||
set(CPACK_PACKAGE_VERSION_MAJOR ${PROJECT_VERSION_MAJOR})
|
||||
set(CPACK_PACKAGE_VERSION_MINOR ${PROJECT_VERSION_MINOR})
|
||||
set(CPACK_PACKAGE_VERSION_PATCH ${PROJECT_VERSION_PATCH})
|
||||
set(CPACK_PACKAGE_HOMEPAGE_URL "https://www.nomic.ai/gpt4all")
|
||||
set(CPACK_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-48.png")
|
||||
set(CPACK_RESOURCE_FILE_LICENSE ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE)
|
||||
set(CPACK_PACKAGE_EXECUTABLES "GPT4All")
|
||||
set(CPACK_CREATE_DESKTOP_LINKS "GPT4All")
|
||||
set(CPACK_IFW_PACKAGE_NAME "GPT4All")
|
||||
set(CPACK_IFW_PACKAGE_TITLE "GPT4All Installer")
|
||||
set(CPACK_IFW_PACKAGE_PUBLISHER "Nomic, Inc.")
|
||||
set(CPACK_IFW_PRODUCT_URL "https://www.nomic.ai/gpt4all")
|
||||
set(CPACK_IFW_PACKAGE_WIZARD_STYLE "Aero")
|
||||
set(CPACK_IFW_PACKAGE_LOGO "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-48.png")
|
||||
set(CPACK_IFW_PACKAGE_WINDOW_ICON "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-32.png")
|
||||
set(CPACK_IFW_PACKAGE_WIZARD_SHOW_PAGE_LIST OFF)
|
||||
set(CPACK_IFW_PACKAGE_CONTROL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/cmake/installer_control.qs")
|
@ -1,17 +1,26 @@
|
||||
set(MACDEPLOYQT "@MACDEPLOYQT@")
|
||||
set(COMPONENT_NAME_MAIN "@COMPONENT_NAME_MAIN@")
|
||||
set(CMAKE_CURRENT_SOURCE_DIR "@CMAKE_CURRENT_SOURCE_DIR@")
|
||||
set(GPT4ALL_SIGN_INSTALL "@GPT4ALL_SIGN_INSTALL@")
|
||||
set(GPT4ALL_SIGNING_ID "@MAC_SIGNING_IDENTITY@")
|
||||
execute_process(COMMAND ${MACDEPLOYQT} ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app -qmldir=${CMAKE_CURRENT_SOURCE_DIR} -verbose=2 -sign-for-notarization=${GPT4ALL_SIGNING_ID})
|
||||
file(GLOB MYLLAMALIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libllama*)
|
||||
file(GLOB MYLLMODELLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libllmodel.*)
|
||||
file(COPY ${MYLLAMALIBS}
|
||||
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
|
||||
file(COPY ${MYLLMODELLIBS}
|
||||
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
|
||||
set(CPACK_CONFIG_DIR "@CMAKE_BINARY_DIR@")
|
||||
if (GPT4ALL_SIGN_INSTALL)
|
||||
set(MAC_NOTARIZE -sign-for-notarization=${GPT4ALL_SIGNING_ID})
|
||||
endif()
|
||||
execute_process(COMMAND ${MACDEPLOYQT} ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app -qmldir=${CMAKE_CURRENT_SOURCE_DIR} -verbose=2 ${MAC_NOTARIZE})
|
||||
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-32.png"
|
||||
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data)
|
||||
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-48.png"
|
||||
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data)
|
||||
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.icns"
|
||||
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data)
|
||||
|
||||
if (GPT4ALL_SIGN_INSTALL)
|
||||
# Create signed MaintenanceTool
|
||||
set(MT_DATA_DIR ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/maintenancetool/data)
|
||||
file(MAKE_DIRECTORY ${MT_DATA_DIR})
|
||||
execute_process(
|
||||
COMMAND binarycreator --config ${CPACK_CONFIG_DIR}/cpack-config/config/config.xml --create-maintenancetool --sign ${GPT4ALL_SIGNING_ID}
|
||||
WORKING_DIRECTORY ${MT_DATA_DIR}
|
||||
)
|
||||
endif()
|
||||
|
12
gpt4all-chat/cmake/download_model.cmake
Normal file
@ -0,0 +1,12 @@
|
||||
if(NOT DEFINED URL OR NOT DEFINED OUTPUT_PATH OR NOT DEFINED EXPECTED_MD5)
|
||||
message(FATAL_ERROR "Usage: cmake -DURL=<url> -DOUTPUT_PATH=<path> -DEXPECTED_MD5=<md5> -P download_model.cmake")
|
||||
endif()
|
||||
|
||||
message(STATUS "Downloading model from ${URL} to ${OUTPUT_PATH} ...")
|
||||
|
||||
file(DOWNLOAD "${URL}" "${OUTPUT_PATH}" EXPECTED_MD5 "${EXPECTED_MD5}" STATUS status)
|
||||
|
||||
list(GET status 0 status_code)
|
||||
if(NOT status_code EQUAL 0)
|
||||
message(FATAL_ERROR "Failed to download model: ${status}")
|
||||
endif()
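As the usage message above indicates, this script is run in CMake script mode. A hypothetical invocation (the URL, output path, and MD5 below are placeholders, not real values) looks like:
```
cmake -DURL=https://example.com/model.gguf \
      -DOUTPUT_PATH=resources/model.gguf \
      -DEXPECTED_MD5=00000000000000000000000000000000 \
      -P download_model.cmake
```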
|
44
gpt4all-chat/cmake/installer_control.qs
Normal file
@ -0,0 +1,44 @@
|
||||
var finishedText = null;
|
||||
|
||||
function cancelInstaller(message) {
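// Hide the remaining installer pages, store the message so FinishedPageCallback can display it, and cancel the install.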
|
||||
installer.setDefaultPageVisible(QInstaller.Introduction, false);
|
||||
installer.setDefaultPageVisible(QInstaller.TargetDirectory, false);
|
||||
installer.setDefaultPageVisible(QInstaller.ComponentSelection, false);
|
||||
installer.setDefaultPageVisible(QInstaller.ReadyForInstallation, false);
|
||||
installer.setDefaultPageVisible(QInstaller.StartMenuSelection, false);
|
||||
installer.setDefaultPageVisible(QInstaller.PerformInstallation, false);
|
||||
installer.setDefaultPageVisible(QInstaller.LicenseCheck, false);
|
||||
finishedText = message;
|
||||
installer.setCanceled();
|
||||
}
|
||||
|
||||
function vercmp(a, b) {
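// Numeric-aware, case-insensitive comparison of version strings (e.g. "12.6" < "12.10"); returns <0, 0, or >0 like strcmp.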
|
||||
return a.localeCompare(b, undefined, { numeric: true, sensitivity: "base" });
|
||||
}
|
||||
|
||||
function Controller() {
|
||||
}
|
||||
|
||||
Controller.prototype.TargetDirectoryPageCallback = function() {
|
||||
var failedReq = null;
|
||||
if (systemInfo.productType === "ubuntu" && vercmp(systemInfo.productVersion, "22.04") < 0) {
|
||||
failedReq = "Ubuntu 22.04 LTS";
|
||||
} else if (systemInfo.productType === "macos" && vercmp(systemInfo.productVersion, "12.6") < 0) {
|
||||
failedReq = "macOS Monterey 12.6";
|
||||
}
|
||||
|
||||
if (failedReq !== null) {
|
||||
cancelInstaller(
|
||||
"Installation cannot continue because GPT4All does not support your operating system: " +
|
||||
`${systemInfo.prettyProductName}<br/><br/>` +
|
||||
`GPT4All requires ${failedReq} or newer.`
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
Controller.prototype.FinishedPageCallback = function() {
|
||||
const widget = gui.currentPageWidget();
|
||||
if (widget != null && finishedText != null) {
|
||||
widget.MessageLabel.setText(finishedText);
|
||||
}
|
||||
}
|
@ -6,8 +6,7 @@ Component.prototype.beginInstallation = function() {
|
||||
targetDirectory = installer.value("TargetDir");
|
||||
};
|
||||
|
||||
Component.prototype.createOperations = function()
|
||||
{
|
||||
Component.prototype.createOperations = function() {
|
||||
try {
|
||||
// call the base create operations function
|
||||
component.createOperations();
|
||||
@ -30,7 +29,7 @@ Component.prototype.createOperations = function()
|
||||
"workingDirectory=" + targetDirectory + "/bin",
|
||||
"iconPath=" + targetDirectory + "/gpt4all.ico",
|
||||
"iconId=0", "description=Open GPT4All");
|
||||
} else if (systemInfo.productType === "macos" || systemInfo.productType === "osx") {
|
||||
} else if (systemInfo.productType === "macos") {
|
||||
var gpt4allAppPath = targetDirectory + "/bin/gpt4all.app";
|
||||
var symlinkPath = targetDirectory + "/../GPT4All.app";
|
||||
// Remove the symlink if it already exists
|
||||
@ -56,7 +55,7 @@ Component.prototype.createOperationsForArchive = function(archive)
|
||||
{
|
||||
component.createOperationsForArchive(archive);
|
||||
|
||||
if (systemInfo.productType === "macos" || systemInfo.productType === "osx") {
|
||||
if (systemInfo.productType === "macos") {
|
||||
var uninstallTargetDirectory = installer.value("TargetDir");
|
||||
var symlinkPath = uninstallTargetDirectory + "/../GPT4All.app";
|
||||
|
19
gpt4all-chat/cmake/installer_maintenancetool_component.qs
Normal file
@ -0,0 +1,19 @@
|
||||
function Component()
|
||||
{
|
||||
component.ifwVersion = installer.value("FrameworkVersion");
|
||||
installer.installationStarted.connect(this, Component.prototype.onInstallationStarted);
|
||||
}
|
||||
|
||||
Component.prototype.onInstallationStarted = function()
|
||||
{
|
||||
if (component.updateRequested() || component.installationRequested()) {
|
||||
if (installer.value("os") == "win") {
|
||||
component.installerbaseBinaryPath = "@TargetDir@/installerbase.exe";
|
||||
} else if (installer.value("os") == "x11") {
|
||||
component.installerbaseBinaryPath = "@TargetDir@/installerbase";
|
||||
} else if (installer.value("os") == "mac") {
|
||||
component.installerbaseBinaryPath = "@TargetDir@/MaintenanceTool.app";
|
||||
}
|
||||
installer.setInstallerBaseBinary(component.installerbaseBinaryPath);
|
||||
}
|
||||
}
|
51  gpt4all-chat/deps/CMakeLists.txt  Normal file
@ -0,0 +1,51 @@
include(FetchContent)


set(BUILD_SHARED_LIBS OFF)

set(FMT_INSTALL OFF)
add_subdirectory(fmt)

set(QAPPLICATION_CLASS QApplication)
add_subdirectory(SingleApplication)

set(DUCKX_INSTALL OFF)
add_subdirectory(DuckX)

set(QT_VERSION_MAJOR 6)
add_subdirectory(QXlsx/QXlsx)

if (NOT GPT4ALL_USING_QTPDF)
    # If we do not use QtPDF, we need to get PDFium.
    set(GPT4ALL_PDFIUM_TAG "chromium/6996")
    if (CMAKE_SYSTEM_NAME MATCHES Linux)
        FetchContent_Declare(
            pdfium
            URL "https://github.com/bblanchon/pdfium-binaries/releases/download/${GPT4ALL_PDFIUM_TAG}/pdfium-linux-x64.tgz"
            URL_HASH "SHA256=68b381b87efed539f2e33ae1e280304c9a42643a878cc296c1d66a93b0cb4335"
        )
    elseif (CMAKE_SYSTEM_NAME MATCHES Windows)
        if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$")
            FetchContent_Declare(
                pdfium
                URL "https://github.com/bblanchon/pdfium-binaries/releases/download/${GPT4ALL_PDFIUM_TAG}/pdfium-win-x64.tgz"
                URL_HASH "SHA256=83e714c302ceacccf403826d5cb57ea39b77f393d83b8d5781283012774a9378"
            )
        elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|AARCH64|arm64|ARM64)$")
            FetchContent_Declare(
                pdfium
                URL "https://github.com/bblanchon/pdfium-binaries/releases/download/${GPT4ALL_PDFIUM_TAG}/pdfium-win-arm64.tgz"
                URL_HASH "SHA256=78e77e871453a4915cbf66fb381b951c9932f88a747c6b2b33c9f27ec2371445"
            )
        endif()
    elseif (CMAKE_SYSTEM_NAME MATCHES Darwin)
        FetchContent_Declare(
            pdfium
            URL "https://github.com/bblanchon/pdfium-binaries/releases/download/${GPT4ALL_PDFIUM_TAG}/pdfium-mac-univ.tgz"
            URL_HASH "SHA256=e7577f3242ff9c1df50025f9615673a43601a201bc51ee4792975f98920793a2"
        )
    endif()

    FetchContent_MakeAvailable(pdfium)
    find_package(PDFium REQUIRED PATHS "${pdfium_SOURCE_DIR}" NO_DEFAULT_PATH)
endif()
1  gpt4all-chat/deps/DuckX  Submodule
@ -0,0 +1 @@
Subproject commit 6e31dfb280e2107fbf4f6a15098c38b014f1bbcc

1  gpt4all-chat/deps/QXlsx  Submodule
@ -0,0 +1 @@
Subproject commit 29e81b369128525749dcb6516195b6b062eda955

1  gpt4all-chat/deps/SingleApplication  Submodule
@ -0,0 +1 @@
Subproject commit 21bdef01eddcbd78044eea1d50b9dee08d218ff2

1  gpt4all-chat/deps/fmt  Submodule
@ -0,0 +1 @@
Subproject commit 0c9fce2ffefecfdce794e1859584e25877b7b592

1  gpt4all-chat/deps/json  Submodule
@ -0,0 +1 @@
Subproject commit 606b6347edf0758c531abb6c36743e09a4c48a84

1  gpt4all-chat/deps/minja  Submodule
@ -0,0 +1 @@
Subproject commit e97bb2442cd6ab3d5bb5f5a3e8a1f7d6081d613b

1  gpt4all-chat/deps/usearch  Submodule
@ -0,0 +1 @@
Subproject commit 9e59f1036657303b29eaf709945f339e403e5f2f
11  gpt4all-chat/dev-requirements.txt  Normal file
@ -0,0 +1,11 @@
-r test-requirements.txt

# dev tools
flake8~=7.1
mypy~=1.12
pytype>=2024.10.11
wemake-python-styleguide~=0.19.2

# type stubs and other optional modules
types-requests~=2.32
urllib3[socks]
@ -32,7 +32,7 @@
        <image>https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-chat/flatpak-manifest/screenshots/model.png</image>
      </screenshot>
    </screenshots>
    <url type="homepage">https://gpt4all.io</url>
    <url type="homepage">https://www.nomic.ai/gpt4all</url>
    <url type="bugtracker">https://github.com/nomic-ai/gpt4all/issues</url>
    <url type="vcs-browser">https://github.com/nomic-ai/gpt4all</url>
    <releases>
@ -46,4 +46,4 @@
    <content_attribute id="language-humor">moderate</content_attribute>
    <content_attribute id="language-discrimination">mild</content_attribute>
  </content_rating>
</component>
</component>
@ -1,3 +1 @@
|
||||
<svg width="32" height="32" viewBox="0 0 32 32" fill="none" xmlns="http://www.w3.org/2000/svg">
|
||||
<path d="M28.4138 9.17125L22.8288 3.585C22.643 3.39924 22.4225 3.25188 22.1799 3.15134C21.9372 3.0508 21.6771 2.99905 21.4144 2.99905C21.1517 2.99905 20.8916 3.0508 20.6489 3.15134C20.4062 3.25188 20.1857 3.39924 20 3.585L4.58626 19C4.39973 19.185 4.25185 19.4053 4.15121 19.648C4.05057 19.8907 3.99917 20.151 4.00001 20.4138V26C4.00001 26.5304 4.21072 27.0391 4.5858 27.4142C4.96087 27.7893 5.46958 28 6.00001 28H11.5863C11.849 28.0008 12.1093 27.9494 12.352 27.8488C12.5947 27.7482 12.815 27.6003 13 27.4138L28.4138 12C28.5995 11.8143 28.7469 11.5938 28.8474 11.3511C28.948 11.1084 28.9997 10.8483 28.9997 10.5856C28.9997 10.3229 28.948 10.0628 28.8474 9.82015C28.7469 9.57747 28.5995 9.35698 28.4138 9.17125ZM6.41376 20L17 9.41375L19.0863 11.5L8.50001 22.085L6.41376 20ZM6.00001 22.4138L9.58626 26H6.00001V22.4138ZM12 25.5863L9.91376 23.5L20.5 12.9138L22.5863 15L12 25.5863ZM24 13.5863L18.4138 8L21.4138 5L27 10.585L24 13.5863Z" fill="black"/>
|
||||
</svg>
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="32" height="32" fill="#000000" viewBox="0 0 256 256"><path d="M227.31,73.37,182.63,28.68a16,16,0,0,0-22.63,0L36.69,152A15.86,15.86,0,0,0,32,163.31V208a16,16,0,0,0,16,16H92.69A15.86,15.86,0,0,0,104,219.31L227.31,96a16,16,0,0,0,0-22.63ZM92.69,208H48V163.31l88-88L180.69,120ZM192,108.68,147.31,64l24-24L216,84.68Z"></path></svg>
|
Before Width: | Height: | Size: 1.0 KiB After Width: | Height: | Size: 372 B |
1
gpt4all-chat/icons/file-doc.svg
Normal file
@ -0,0 +1 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 256"><rect width="256" height="256" fill="none"/><path d="M36,152v56H52a28,28,0,0,0,0-56Z" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><path d="M216,200.87A22.12,22.12,0,0,1,200,208c-13.26,0-24-12.54-24-28s10.74-28,24-28a22.12,22.12,0,0,1,16,7.13" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><path d="M48,112V40a8,8,0,0,1,8-8h96l56,56v24" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><polyline points="152 32 152 88 208 88" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><ellipse cx="128" cy="180" rx="24" ry="28" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/></svg>
|
After Width: | Height: | Size: 897 B |
1
gpt4all-chat/icons/file-docx.svg
Normal file
@ -0,0 +1 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 256"><rect width="256" height="256" fill="none"/><line x1="152" y1="96" x2="208" y2="96" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><line x1="152" y1="160" x2="208" y2="160" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><path d="M64,72V40a8,8,0,0,1,8-8H200a8,8,0,0,1,8,8V216a8,8,0,0,1-8,8H72a8,8,0,0,1-8-8V184" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><polyline points="64 104 76 152 92 120 108 152 120 104" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><rect x="32" y="72" width="120" height="112" rx="8" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/></svg>
|
After Width: | Height: | Size: 893 B |
1
gpt4all-chat/icons/file-xls.svg
Normal file
@ -0,0 +1 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 256"><rect width="256" height="256" fill="none"/><polyline points="148 208 120 208 120 152" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><path d="M48,112V40a8,8,0,0,1,8-8h96l56,56v24" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><polyline points="152 32 152 88 208 88" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><line x1="48" y1="152" x2="88" y2="208" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><line x1="88" y1="152" x2="48" y2="208" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><path d="M203.9,153.6s-29.43-7.78-31.8,11,38.43,10.12,35.78,30.72c-2.47,19.16-31.78,11-31.78,11" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/></svg>
|
After Width: | Height: | Size: 1019 B |
3
gpt4all-chat/icons/groq.svg
Normal file
@ -0,0 +1,3 @@
|
||||
<?xml version="1.0" encoding="utf-8" ?>
|
||||
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
|
||||
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 26.3 26.3"><defs><style>.cls-1{fill:#f05237;}.cls-2{fill:#fff;}</style></defs><g id="Layer_2" data-name="Layer 2"><g id="Content"><circle class="cls-1" cx="13.15" cy="13.15" r="13.15"/><path class="cls-2" d="M13.17,6.88a4.43,4.43,0,0,0,0,8.85h1.45V14.07H13.17a2.77,2.77,0,1,1,2.77-2.76v4.07a2.74,2.74,0,0,1-4.67,2L10.1,18.51a4.37,4.37,0,0,0,3.07,1.29h.06a4.42,4.42,0,0,0,4.36-4.4V11.2a4.43,4.43,0,0,0-4.42-4.32"/></g></g></svg>
|
After Width: | Height: | Size: 620 B |
1
gpt4all-chat/icons/mistral.svg
Normal file
@ -0,0 +1 @@
|
||||
<svg viewBox="0 0 512 512" xmlns="http://www.w3.org/2000/svg" fill-rule="evenodd" clip-rule="evenodd" stroke-linejoin="round" stroke-miterlimit="2"><path d="M189.08 303.228H94.587l.044-94.446h94.497l-.048 94.446z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M283.528 397.674h-94.493l.044-94.446h94.496l-.047 94.446z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M283.575 303.228H189.08l.046-94.446h94.496l-.047 94.446z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M378.07 303.228h-94.495l.044-94.446h94.498l-.047 94.446zM189.128 208.779H94.633l.044-94.448h94.498l-.047 94.448zM378.115 208.779h-94.494l.045-94.448h94.496l-.047 94.448zM94.587 303.227H.093l.044-96.017h94.496l-.046 96.017z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M94.633 208.779H.138l.046-94.448H94.68l-.047 94.448z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M94.68 115.902H.185L.23 19.885h94.498l-.047 96.017zM472.657 114.331h-94.495l.044-94.446h94.497l-.046 94.446zM94.54 399.244H.046l.044-97.588h94.497l-.047 97.588z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M94.495 492.123H0l.044-94.446H94.54l-.045 94.446zM472.563 303.228H378.07l.044-94.446h94.496l-.047 94.446zM472.61 208.779h-94.495l.044-94.448h94.498l-.047 94.448z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M472.517 397.674h-94.494l.044-94.446h94.497l-.047 94.446z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M472.47 492.121h-94.493l.044-96.017h94.496l-.047 96.017z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M228.375 303.22h-96.061l.046-94.446h96.067l-.052 94.446z" fill="#ff7000" fill-rule="nonzero"/><path d="M322.827 397.666h-94.495l.044-96.018h94.498l-.047 96.018z" fill="#ff4900" fill-rule="nonzero"/><path d="M324.444 303.22h-97.636l.046-94.446h97.638l-.048 94.446z" fill="#ff7000" fill-rule="nonzero"/><path d="M418.938 303.22h-96.064l.045-94.446h96.066l-.047 94.446z" fill="#ff7000" fill-rule="nonzero"/><path d="M228.423 208.77H132.36l.045-94.445h96.066l-.05 94.446zM418.985 208.77H322.92l.044-94.445h96.069l-.048 94.446z" fill="#ffa300" fill-rule="nonzero"/><path d="M133.883 304.79H39.392l.044-96.017h94.496l-.049 96.017z" fill="#ff7000" fill-rule="nonzero"/><path d="M133.929 208.77H39.437l.044-95.445h94.496l-.048 95.445z" fill="#ffa300" fill-rule="nonzero"/><path d="M133.976 114.325H39.484l.044-94.448h94.497l-.05 94.448zM511.954 115.325h-94.493l.044-95.448h94.497l-.048 95.448z" fill="#ffce00" fill-rule="nonzero"/><path d="M133.836 399.667H39.345l.044-96.447h94.496l-.049 96.447z" fill="#ff4900" fill-rule="nonzero"/><path d="M133.79 492.117H39.3l.044-94.448h94.496l-.049 94.448z" fill="#ff0107" fill-rule="nonzero"/><path d="M511.862 303.22h-94.495l.046-94.446h94.496l-.047 94.446z" fill="#ff7000" fill-rule="nonzero"/><path d="M511.907 208.77h-94.493l.044-94.445h94.496l-.047 94.446z" fill="#ffa300" fill-rule="nonzero"/><path d="M511.815 398.666h-94.493l.044-95.447h94.496l-.047 95.447z" fill="#ff4900" fill-rule="nonzero"/><path d="M511.77 492.117h-94.496l.046-94.448h94.496l-.047 94.448z" fill="#ff0107" fill-rule="nonzero"/></svg>
|
After Width: | Height: | Size: 2.9 KiB |
2
gpt4all-chat/icons/openai.svg
Normal file
@ -0,0 +1,2 @@
|
||||
<?xml version="1.0" encoding="utf-8"?><!-- Uploaded to: SVG Repo, www.svgrepo.com, Generator: SVG Repo Mixer Tools -->
|
||||
<svg fill="#000000" width="800px" height="800px" viewBox="0 0 24 24" role="img" xmlns="http://www.w3.org/2000/svg"><title>OpenAI icon</title><path d="M22.2819 9.8211a5.9847 5.9847 0 0 0-.5157-4.9108 6.0462 6.0462 0 0 0-6.5098-2.9A6.0651 6.0651 0 0 0 4.9807 4.1818a5.9847 5.9847 0 0 0-3.9977 2.9 6.0462 6.0462 0 0 0 .7427 7.0966 5.98 5.98 0 0 0 .511 4.9107 6.051 6.051 0 0 0 6.5146 2.9001A5.9847 5.9847 0 0 0 13.2599 24a6.0557 6.0557 0 0 0 5.7718-4.2058 5.9894 5.9894 0 0 0 3.9977-2.9001 6.0557 6.0557 0 0 0-.7475-7.0729zm-9.022 12.6081a4.4755 4.4755 0 0 1-2.8764-1.0408l.1419-.0804 4.7783-2.7582a.7948.7948 0 0 0 .3927-.6813v-6.7369l2.02 1.1686a.071.071 0 0 1 .038.052v5.5826a4.504 4.504 0 0 1-4.4945 4.4944zm-9.6607-4.1254a4.4708 4.4708 0 0 1-.5346-3.0137l.142.0852 4.783 2.7582a.7712.7712 0 0 0 .7806 0l5.8428-3.3685v2.3324a.0804.0804 0 0 1-.0332.0615L9.74 19.9502a4.4992 4.4992 0 0 1-6.1408-1.6464zM2.3408 7.8956a4.485 4.485 0 0 1 2.3655-1.9728V11.6a.7664.7664 0 0 0 .3879.6765l5.8144 3.3543-2.0201 1.1685a.0757.0757 0 0 1-.071 0l-4.8303-2.7865A4.504 4.504 0 0 1 2.3408 7.872zm16.5963 3.8558L13.1038 8.364 15.1192 7.2a.0757.0757 0 0 1 .071 0l4.8303 2.7913a4.4944 4.4944 0 0 1-.6765 8.1042v-5.6772a.79.79 0 0 0-.407-.667zm2.0107-3.0231l-.142-.0852-4.7735-2.7818a.7759.7759 0 0 0-.7854 0L9.409 9.2297V6.8974a.0662.0662 0 0 1 .0284-.0615l4.8303-2.7866a4.4992 4.4992 0 0 1 6.6802 4.66zM8.3065 12.863l-2.02-1.1638a.0804.0804 0 0 1-.038-.0567V6.0742a4.4992 4.4992 0 0 1 7.3757-3.4537l-.142.0805L8.704 5.459a.7948.7948 0 0 0-.3927.6813zm1.0976-2.3654l2.602-1.4998 2.6069 1.4998v2.9994l-2.5974 1.4997-2.6067-1.4997Z"/></svg>
|
After Width: | Height: | Size: 1.7 KiB |
45
gpt4all-chat/icons/paperclip.svg
Normal file
@ -0,0 +1,45 @@
|
||||
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
|
||||
<svg
|
||||
viewBox="0 0 256 256"
|
||||
version="1.1"
|
||||
id="svg6"
|
||||
sodipodi:docname="paperclip-horizontal.svg"
|
||||
inkscape:version="1.1.2 (0a00cf5339, 2022-02-04)"
|
||||
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
|
||||
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
|
||||
xmlns="http://www.w3.org/2000/svg"
|
||||
xmlns:svg="http://www.w3.org/2000/svg">
|
||||
<defs
|
||||
id="defs10" />
|
||||
<sodipodi:namedview
|
||||
id="namedview8"
|
||||
pagecolor="#ffffff"
|
||||
bordercolor="#666666"
|
||||
borderopacity="1.0"
|
||||
inkscape:pageshadow="2"
|
||||
inkscape:pageopacity="0.0"
|
||||
inkscape:pagecheckerboard="0"
|
||||
showgrid="false"
|
||||
inkscape:zoom="4.421875"
|
||||
inkscape:cx="127.88693"
|
||||
inkscape:cy="127.88693"
|
||||
inkscape:window-width="2560"
|
||||
inkscape:window-height="1495"
|
||||
inkscape:window-x="0"
|
||||
inkscape:window-y="0"
|
||||
inkscape:window-maximized="1"
|
||||
inkscape:current-layer="svg6" />
|
||||
<rect
|
||||
width="256"
|
||||
height="256"
|
||||
fill="none"
|
||||
id="rect2" />
|
||||
<path
|
||||
d="m 144,80 v 112 a -16,16 0 0 1 -32,0 V 48 a -32,32 0 0 1 64,0 v 144 a -48,48 0 0 1 -96,0 V 80"
|
||||
fill="none"
|
||||
stroke="currentColor"
|
||||
stroke-linecap="round"
|
||||
stroke-linejoin="round"
|
||||
stroke-width="16"
|
||||
id="path4" />
|
||||
</svg>
|
After Width: | Height: | Size: 1.3 KiB |
1
gpt4all-chat/icons/plus_circle.svg
Normal file
@ -0,0 +1 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="32" height="32" fill="#000000" viewBox="0 0 256 256"><path d="M128,24A104,104,0,1,0,232,128,104.11,104.11,0,0,0,128,24Zm0,192a88,88,0,1,1,88-88A88.1,88.1,0,0,1,128,216Zm48-88a8,8,0,0,1-8,8H136v32a8,8,0,0,1-16,0V136H88a8,8,0,0,1,0-16h32V88a8,8,0,0,1,16,0v32h32A8,8,0,0,1,176,128Z"></path></svg>
|
After Width: | Height: | Size: 340 B |
1
gpt4all-chat/icons/webpage.svg
Normal file
@ -0,0 +1 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="32" height="32" fill="#000000" viewBox="0 0 256 256"><path d="M216,40H40A16,16,0,0,0,24,56V200a16,16,0,0,0,16,16H216a16,16,0,0,0,16-16V56A16,16,0,0,0,216,40Zm0,16V88H40V56Zm0,144H40V104H216v96Z"></path></svg>
|
After Width: | Height: | Size: 255 B |
@ -1,95 +0,0 @@
|
||||
#include "chatlistmodel.h"
|
||||
#include "config.h"
|
||||
#include "download.h"
|
||||
#include "llm.h"
|
||||
#include "localdocs.h"
|
||||
#include "logger.h"
|
||||
#include "modellist.h"
|
||||
#include "mysettings.h"
|
||||
#include "network.h"
|
||||
|
||||
#include "../gpt4all-backend/llmodel.h"
|
||||
|
||||
#include <QCoreApplication>
|
||||
#include <QGuiApplication>
|
||||
#include <QObject>
|
||||
#include <QQmlApplicationEngine>
|
||||
#include <QQmlEngine>
|
||||
#include <QSettings>
|
||||
#include <QString>
|
||||
#include <QTranslator>
|
||||
#include <QUrl>
|
||||
#include <Qt>
|
||||
|
||||
int main(int argc, char *argv[])
|
||||
{
|
||||
QCoreApplication::setOrganizationName("nomic.ai");
|
||||
QCoreApplication::setOrganizationDomain("gpt4all.io");
|
||||
QCoreApplication::setApplicationName("GPT4All");
|
||||
QCoreApplication::setApplicationVersion(APP_VERSION);
|
||||
QSettings::setDefaultFormat(QSettings::IniFormat);
|
||||
|
||||
Logger::globalInstance();
|
||||
|
||||
QGuiApplication app(argc, argv);
|
||||
|
||||
// set search path before constructing the MySettings instance, which relies on this
|
||||
QString llmodelSearchPaths = QCoreApplication::applicationDirPath();
|
||||
const QString libDir = QCoreApplication::applicationDirPath() + "/../lib/";
|
||||
if (LLM::directoryExists(libDir))
|
||||
llmodelSearchPaths += ";" + libDir;
|
||||
#if defined(Q_OS_MAC)
|
||||
const QString binDir = QCoreApplication::applicationDirPath() + "/../../../";
|
||||
if (LLM::directoryExists(binDir))
|
||||
llmodelSearchPaths += ";" + binDir;
|
||||
const QString frameworksDir = QCoreApplication::applicationDirPath() + "/../Frameworks/";
|
||||
if (LLM::directoryExists(frameworksDir))
|
||||
llmodelSearchPaths += ";" + frameworksDir;
|
||||
#endif
|
||||
LLModel::Implementation::setImplementationsSearchPath(llmodelSearchPaths.toStdString());
|
||||
|
||||
// Set the local and language translation before the qml engine has even been started. This will
|
||||
// use the default system locale unless the user has explicitly set it to use a different one.
|
||||
MySettings::globalInstance()->setLanguageAndLocale();
|
||||
|
||||
QQmlApplicationEngine engine;
|
||||
|
||||
// Add a connection here from MySettings::languageAndLocaleChanged signal to a lambda slot where I can call
|
||||
// engine.uiLanguage property
|
||||
QObject::connect(MySettings::globalInstance(), &MySettings::languageAndLocaleChanged, [&engine]() {
|
||||
engine.setUiLanguage(MySettings::globalInstance()->languageAndLocale());
|
||||
});
|
||||
|
||||
qmlRegisterSingletonInstance("mysettings", 1, 0, "MySettings", MySettings::globalInstance());
|
||||
qmlRegisterSingletonInstance("modellist", 1, 0, "ModelList", ModelList::globalInstance());
|
||||
qmlRegisterSingletonInstance("chatlistmodel", 1, 0, "ChatListModel", ChatListModel::globalInstance());
|
||||
qmlRegisterSingletonInstance("llm", 1, 0, "LLM", LLM::globalInstance());
|
||||
qmlRegisterSingletonInstance("download", 1, 0, "Download", Download::globalInstance());
|
||||
qmlRegisterSingletonInstance("network", 1, 0, "Network", Network::globalInstance());
|
||||
qmlRegisterSingletonInstance("localdocs", 1, 0, "LocalDocs", LocalDocs::globalInstance());
|
||||
qmlRegisterUncreatableMetaObject(MySettingsEnums::staticMetaObject, "mysettingsenums", 1, 0, "MySettingsEnums", "Error: only enums");
|
||||
|
||||
const QUrl url(u"qrc:/gpt4all/main.qml"_qs);
|
||||
|
||||
QObject::connect(&engine, &QQmlApplicationEngine::objectCreated,
|
||||
&app, [url](QObject *obj, const QUrl &objUrl) {
|
||||
if (!obj && url == objUrl)
|
||||
QCoreApplication::exit(-1);
|
||||
}, Qt::QueuedConnection);
|
||||
engine.load(url);
|
||||
|
||||
#if 0
|
||||
QDirIterator it("qrc:", QDirIterator::Subdirectories);
|
||||
while (it.hasNext()) {
|
||||
qDebug() << it.next();
|
||||
}
|
||||
#endif
|
||||
|
||||
int res = app.exec();
|
||||
|
||||
// Make sure ChatLLM threads are joined before global destructors run.
|
||||
// Otherwise, we can get a heap-use-after-free inside of llama.cpp.
|
||||
ChatListModel::globalInstance()->destroyChats();
|
||||
|
||||
return res;
|
||||
}
|
@ -12,16 +12,54 @@ import network
|
||||
import gpt4all
|
||||
import localdocs
|
||||
import mysettings
|
||||
import Qt.labs.platform
|
||||
|
||||
Window {
|
||||
id: window
|
||||
width: 1920
|
||||
height: 1080
|
||||
minimumWidth: 1280
|
||||
minimumHeight: 720
|
||||
width: 1440
|
||||
height: 810
|
||||
minimumWidth: 658 + 470 * theme.fontScale
|
||||
minimumHeight: 384 + 160 * theme.fontScale
|
||||
visible: true
|
||||
title: qsTr("GPT4All v%1").arg(Qt.application.version)
|
||||
|
||||
SystemTrayIcon {
|
||||
id: systemTrayIcon
|
||||
property bool shouldClose: false
|
||||
visible: MySettings.systemTray && !shouldClose
|
||||
icon.source: "qrc:/gpt4all/icons/gpt4all.svg"
|
||||
|
||||
function restore() {
|
||||
LLM.showDockIcon();
|
||||
window.show();
|
||||
window.raise();
|
||||
window.requestActivate();
|
||||
}
|
||||
onActivated: function(reason) {
|
||||
if (reason === SystemTrayIcon.Context && Qt.platform.os !== "osx")
|
||||
menu.open();
|
||||
else if (reason === SystemTrayIcon.Trigger)
|
||||
restore();
|
||||
}
|
||||
|
||||
menu: Menu {
|
||||
MenuItem {
|
||||
text: qsTr("Restore")
|
||||
onTriggered: systemTrayIcon.restore()
|
||||
}
|
||||
MenuItem {
|
||||
text: qsTr("Quit")
|
||||
onTriggered: {
|
||||
systemTrayIcon.restore();
|
||||
systemTrayIcon.shouldClose = true;
|
||||
window.shouldClose = true;
|
||||
savingPopup.open();
|
||||
ChatListModel.saveChatsForQuit();
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Settings {
|
||||
property alias x: window.x
|
||||
property alias y: window.y
|
||||
@ -156,7 +194,7 @@ Window {
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
}
|
||||
|
||||
property bool hasSaved: false
|
||||
property bool shouldClose: false
|
||||
|
||||
PopupDialog {
|
||||
id: savingPopup
|
||||
@ -180,20 +218,29 @@ Window {
|
||||
}
|
||||
|
||||
onClosing: function(close) {
|
||||
if (window.hasSaved)
|
||||
if (systemTrayIcon.visible) {
|
||||
LLM.hideDockIcon();
|
||||
window.visible = false;
|
||||
ChatListModel.saveChats();
|
||||
close.accepted = false;
|
||||
return;
|
||||
}
|
||||
|
||||
if (window.shouldClose)
|
||||
return;
|
||||
|
||||
window.shouldClose = true;
|
||||
savingPopup.open();
|
||||
ChatListModel.saveChats();
|
||||
close.accepted = false
|
||||
ChatListModel.saveChatsForQuit();
|
||||
close.accepted = false;
|
||||
}
|
||||
|
||||
Connections {
|
||||
target: ChatListModel
|
||||
function onSaveChatsFinished() {
|
||||
window.hasSaved = true;
|
||||
savingPopup.close();
|
||||
window.close()
|
||||
if (window.shouldClose)
|
||||
window.close()
|
||||
}
|
||||
}
|
||||
|
||||
@ -422,7 +469,7 @@ Window {
|
||||
return qsTr("The datalake is enabled")
|
||||
else if (currentChat.modelInfo.isOnline)
|
||||
return qsTr("Using a network model")
|
||||
else if (currentChat.modelInfo.isOnline)
|
||||
else if (currentChat.isServer)
|
||||
return qsTr("Server mode is enabled")
|
||||
return ""
|
||||
}
|
||||
@ -627,9 +674,6 @@ Window {
|
||||
|
||||
function show() {
|
||||
stackLayout.currentIndex = 2;
|
||||
// FIXME This expanded code should be removed and we should be changing the names of
|
||||
// the classes here in ModelList for the proxy/filter models
|
||||
ModelList.downloadableModels.expanded = true
|
||||
}
|
||||
|
||||
function isShown() {
|
||||
|
@ -1,6 +1,15 @@
## Latest News

* **New Model Support**: LLaMa 3.1 8b, Gemma, Mixtral, GPT-NeoX, Gemma 2, OpenELM, ChatGLM, Jais architectures, StarCoder2, XVERSE, Command R, and OLMo (all with Vulkan support)
* **Suggested Follow Up Questions**: Get follow up questions on your LocalDocs or chats automatically suggested

Roadmap: we're planning support for tools in GPT4All that models like LLaMa 3.1 can use. Share suggestions on Discord!
GPT4All v3.10.0 was released on February 24th. Changes include:

* **Remote Models:**
  * The Add Model page now has a dedicated tab for remote model providers.
  * Groq, OpenAI, and Mistral remote models are now easier to configure.
* **CUDA Compatibility:** GPUs with CUDA compute capability 5.0 such as the GTX 750 are now supported by the CUDA backend.
* **New Model:** The non-MoE Granite model is now supported.
* **Translation Updates:**
  * The Italian translation has been updated.
  * The Simplified Chinese translation has been significantly improved.
* **Better Chat Templates:** The default chat templates for OLMoE 7B 0924/0125 and Granite 3.1 3B/8B have been improved.
* **Whitespace Fixes:** DeepSeek-R1-based models now have better whitespace behavior in their output.
* **Crash Fixes:** Several issues that could potentially cause GPT4All to crash have been fixed.
@ -1,22 +1,22 @@
|
||||
[
|
||||
{
|
||||
"order": "a",
|
||||
"md5sum": "3a265fbb343693d283f8a4ec5e7f1529",
|
||||
"name": "Llama 3.1 8B Instruct",
|
||||
"filename": "Meta-Llama-3.1-8B-Instruct.Q4_0.gguf",
|
||||
"filesize": "4661211808",
|
||||
"requires": "3.1.0",
|
||||
"md5sum": "a54c08a7b90e4029a8c2ab5b5dc936aa",
|
||||
"name": "Reasoner v1",
|
||||
"filename": "qwen2.5-coder-7b-instruct-q4_0.gguf",
|
||||
"filesize": "4431390720",
|
||||
"requires": "3.6.0",
|
||||
"ramrequired": "8",
|
||||
"parameters": "8 billion",
|
||||
"quant": "q4_0",
|
||||
"type": "LLaMA3",
|
||||
"description": "<ul><li>Fast responses</li><li>Chat based model</li><li>Accepts agentic system prompts in Llama 3.1 format</li><li>Trained by Meta</li><li>License: <a href=\"https://llama.meta.com/llama3_1/license/\">Meta Llama 3.1 Community License</a></li></ul>",
|
||||
"url": "https://huggingface.co/3Simplex/Meta-Llama-3.1-8B-Instruct-gguf/resolve/main/Meta-Llama-3.1-8B-Instruct-Q4_0.gguf",
|
||||
"promptTemplate": "<|start_header_id|>user<|end_header_id|>\n\n%1<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n%2",
|
||||
"systemPrompt": "<|start_header_id|>system<|end_header_id|>\nCutting Knowledge Date: December 2023\n\nYou are a helpful assistant.<|eot_id|>"
|
||||
"type": "qwen2",
|
||||
"description": "<ul><li>Based on <a href=\"https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct\">Qwen2.5-Coder 7B</a></li><li>Uses built-in javascript code interpreter</li><li>Use for complex reasoning tasks that can be aided by computation analysis</li><li>License: <a href=\"https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct/blob/main/LICENSE\">Apache License Version 2.0</a></li><li>#reasoning</li></ul>",
|
||||
"url": "https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct-GGUF/resolve/main/qwen2.5-coder-7b-instruct-q4_0.gguf",
|
||||
"chatTemplate": "{{- '<|im_start|>system\\n' }}\n{% if toolList|length > 0 %}You have access to the following functions:\n{% for tool in toolList %}\nUse the function '{{tool.function}}' to: '{{tool.description}}'\n{% if tool.parameters|length > 0 %}\nparameters:\n{% for info in tool.parameters %}\n {{info.name}}:\n type: {{info.type}}\n description: {{info.description}}\n required: {{info.required}}\n{% endfor %}\n{% endif %}\n# Tool Instructions\nIf you CHOOSE to call this function ONLY reply with the following format:\n'{{tool.symbolicFormat}}'\nHere is an example. If the user says, '{{tool.examplePrompt}}', then you reply\n'{{tool.exampleCall}}'\nAfter the result you might reply with, '{{tool.exampleReply}}'\n{% endfor %}\nYou MUST include both the start and end tags when you use a function.\n\nYou are a helpful AI assistant who uses the functions to break down, analyze, perform, and verify complex reasoning tasks. You SHOULD try to verify your answers using the functions where possible.\n{% endif %}\n{{- '<|im_end|>\\n' }}\n{% for message in messages %}\n{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>\\n' }}\n{% endfor %}\n{% if add_generation_prompt %}\n{{ '<|im_start|>assistant\\n' }}\n{% endif %}\n",
|
||||
"systemPrompt": ""
|
||||
},
|
||||
{
|
||||
"order": "b",
|
||||
"order": "aa",
|
||||
"md5sum": "c87ad09e1e4c8f9c35a5fcef52b6f1c9",
|
||||
"name": "Llama 3 8B Instruct",
|
||||
"filename": "Meta-Llama-3-8B-Instruct.Q4_0.gguf",
|
||||
@ -29,10 +29,105 @@
|
||||
"description": "<ul><li>Fast responses</li><li>Chat based model</li><li>Accepts system prompts in Llama 3 format</li><li>Trained by Meta</li><li>License: <a href=\"https://llama.meta.com/llama3/license/\">Meta Llama 3 Community License</a></li></ul>",
|
||||
"url": "https://gpt4all.io/models/gguf/Meta-Llama-3-8B-Instruct.Q4_0.gguf",
|
||||
"promptTemplate": "<|start_header_id|>user<|end_header_id|>\n\n%1<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n%2<|eot_id|>",
|
||||
"systemPrompt": ""
|
||||
"systemPrompt": "",
|
||||
"chatTemplate": "{%- set loop_messages = messages %}\n{%- for message in loop_messages %}\n {%- set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' %}\n {{- content }}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}"
|
||||
},
|
||||
{
|
||||
"order": "aa1",
|
||||
"sha256sum": "5cd4ee65211770f1d99b4f6f4951780b9ef40e29314bd6542bb5bd0ad0bc29d1",
|
||||
"name": "DeepSeek-R1-Distill-Qwen-7B",
|
||||
"filename": "DeepSeek-R1-Distill-Qwen-7B-Q4_0.gguf",
|
||||
"filesize": "4444121056",
|
||||
"requires": "3.8.0",
|
||||
"ramrequired": "8",
|
||||
"parameters": "7 billion",
|
||||
"quant": "q4_0",
|
||||
"type": "deepseek",
|
||||
"description": "<p>The official Qwen2.5-Math-7B distillation of DeepSeek-R1.</p><ul><li>License: <a href=\"https://opensource.org/license/mit\">MIT</a></li><li>No restrictions on commercial use</li><li>#reasoning</li></ul>",
|
||||
"url": "https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-7B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-7B-Q4_0.gguf",
|
||||
"chatTemplate": "{%- if not add_generation_prompt is defined %}\n {%- set add_generation_prompt = false %}\n{%- endif %}\n{%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '<|User|>' + message['content'] }}\n {%- endif %}\n {%- if message['role'] == 'assistant' %}\n {%- set content = message['content'] | regex_replace('^[\\\\s\\\\S]*</think>', '') %}\n {{- '<|Assistant|>' + content + '<|end▁of▁sentence|>' }}\n {%- endif %}\n{%- endfor -%}\n{%- if add_generation_prompt %}\n {{- '<|Assistant|>' }}\n{%- endif %}"
|
||||
},
|
||||
{
|
||||
"order": "aa2",
|
||||
"sha256sum": "906b3382f2680f4ce845459b4a122e904002b075238080307586bcffcde49eef",
|
||||
"name": "DeepSeek-R1-Distill-Qwen-14B",
|
||||
"filename": "DeepSeek-R1-Distill-Qwen-14B-Q4_0.gguf",
|
||||
"filesize": "8544267680",
|
||||
"requires": "3.8.0",
|
||||
"ramrequired": "16",
|
||||
"parameters": "14 billion",
|
||||
"quant": "q4_0",
|
||||
"type": "deepseek",
|
||||
"description": "<p>The official Qwen2.5-14B distillation of DeepSeek-R1.</p><ul><li>License: <a href=\"https://opensource.org/license/mit\">MIT</a></li><li>No restrictions on commercial use</li><li>#reasoning</li></ul>",
|
||||
"url": "https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-14B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-14B-Q4_0.gguf",
|
||||
"chatTemplate": "{%- if not add_generation_prompt is defined %}\n {%- set add_generation_prompt = false %}\n{%- endif %}\n{%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '<|User|>' + message['content'] }}\n {%- endif %}\n {%- if message['role'] == 'assistant' %}\n {%- set content = message['content'] | regex_replace('^[\\\\s\\\\S]*</think>', '') %}\n {{- '<|Assistant|>' + content + '<|end▁of▁sentence|>' }}\n {%- endif %}\n{%- endfor -%}\n{%- if add_generation_prompt %}\n {{- '<|Assistant|>' }}\n{%- endif %}"
|
||||
},
|
||||
{
|
||||
"order": "aa3",
|
||||
"sha256sum": "0eb93e436ac8beec18aceb958c120d282cb2cf5451b23185e7be268fe9d375cc",
|
||||
"name": "DeepSeek-R1-Distill-Llama-8B",
|
||||
"filename": "DeepSeek-R1-Distill-Llama-8B-Q4_0.gguf",
|
||||
"filesize": "4675894112",
|
||||
"requires": "3.8.0",
|
||||
"ramrequired": "8",
|
||||
"parameters": "8 billion",
|
||||
"quant": "q4_0",
|
||||
"type": "deepseek",
|
||||
"description": "<p>The official Llama-3.1-8B distillation of DeepSeek-R1.</p><ul><li>License: <a href=\"https://opensource.org/license/mit\">MIT</a></li><li>No restrictions on commercial use</li><li>#reasoning</li></ul>",
|
||||
"url": "https://huggingface.co/bartowski/DeepSeek-R1-Distill-Llama-8B-GGUF/resolve/main/DeepSeek-R1-Distill-Llama-8B-Q4_0.gguf",
|
||||
"chatTemplate": "{%- if not add_generation_prompt is defined %}\n {%- set add_generation_prompt = false %}\n{%- endif %}\n{%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '<|User|>' + message['content'] }}\n {%- endif %}\n {%- if message['role'] == 'assistant' %}\n {%- set content = message['content'] | regex_replace('^[\\\\s\\\\S]*</think>', '') %}\n {{- '<|Assistant|>' + content + '<|end▁of▁sentence|>' }}\n {%- endif %}\n{%- endfor -%}\n{%- if add_generation_prompt %}\n {{- '<|Assistant|>' }}\n{%- endif %}"
|
||||
},
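The newer entries above carry a `sha256sum` (instead of the older `md5sum`) plus a `filesize` field for integrity checking of downloaded GGUF files. As an illustration only, and not the GPT4All client's own download code, a minimal Python sketch of checking a local file against one of these entries:

```python
import hashlib
import os

def verify_model(entry: dict, path: str) -> bool:
    """Check a downloaded GGUF file against a models3.json entry (illustrative sketch)."""
    # Size check first: cheap, and catches truncated downloads.
    if os.path.getsize(path) != int(entry["filesize"]):
        return False
    # Newer entries use "sha256sum"; older ones still use "md5sum".
    if "sha256sum" in entry:
        algo, expected = "sha256", entry["sha256sum"]
    else:
        algo, expected = "md5", entry["md5sum"]
    digest = hashlib.new(algo)
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest() == expected
```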
|
||||
{
|
||||
"order": "aa4",
|
||||
"sha256sum": "b3af887d0a015b39fab2395e4faf682c1a81a6a3fd09a43f0d4292f7d94bf4d0",
|
||||
"name": "DeepSeek-R1-Distill-Qwen-1.5B",
|
||||
"filename": "DeepSeek-R1-Distill-Qwen-1.5B-Q4_0.gguf",
|
||||
"filesize": "1068807776",
|
||||
"requires": "3.8.0",
|
||||
"ramrequired": "3",
|
||||
"parameters": "1.5 billion",
|
||||
"quant": "q4_0",
|
||||
"type": "deepseek",
|
||||
"description": "<p>The official Qwen2.5-Math-1.5B distillation of DeepSeek-R1.</p><ul><li>License: <a href=\"https://opensource.org/license/mit\">MIT</a></li><li>No restrictions on commercial use</li><li>#reasoning</li></ul>",
|
||||
"url": "https://huggingface.co/bartowski/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/resolve/main/DeepSeek-R1-Distill-Qwen-1.5B-Q4_0.gguf",
|
||||
"chatTemplate": "{%- if not add_generation_prompt is defined %}\n {%- set add_generation_prompt = false %}\n{%- endif %}\n{%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '<|User|>' + message['content'] }}\n {%- endif %}\n {%- if message['role'] == 'assistant' %}\n {%- set content = message['content'] | regex_replace('^[\\\\s\\\\S]*</think>', '') %}\n {{- '<|Assistant|>' + content + '<|end▁of▁sentence|>' }}\n {%- endif %}\n{%- endfor -%}\n{%- if add_generation_prompt %}\n {{- '<|Assistant|>' }}\n{%- endif %}"
|
||||
},
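Each DeepSeek-R1 `chatTemplate` above strips the reasoning trace from earlier assistant turns with `regex_replace('^[\\s\\S]*</think>', '')` before re-feeding them as context. `regex_replace` here is a filter provided by GPT4All's own template engine; purely as an illustration of the equivalent behavior, in Python:

```python
import re

def strip_reasoning(assistant_turn: str) -> str:
    # Drop everything up to and including the last </think> tag,
    # keeping only the visible answer (illustrative equivalent of the
    # template's regex_replace filter).
    return re.sub(r"^[\s\S]*</think>", "", assistant_turn)

print(strip_reasoning("<think>working through the problem...</think>\nThe answer is 42."))
# -> "\nThe answer is 42."
```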
|
||||
{
|
||||
"order": "b",
|
||||
"md5sum": "27b44e8ae1817525164ddf4f8dae8af4",
|
||||
"name": "Llama 3.2 3B Instruct",
|
||||
"filename": "Llama-3.2-3B-Instruct-Q4_0.gguf",
|
||||
"filesize": "1921909280",
|
||||
"requires": "3.4.0",
|
||||
"ramrequired": "4",
|
||||
"parameters": "3 billion",
|
||||
"quant": "q4_0",
|
||||
"type": "LLaMA3",
|
||||
"description": "<ul><li>Fast responses</li><li>Instruct model</li><li>Multilingual dialogue use</li><li>Agentic system capable</li><li>Trained by Meta</li><li>License: <a href=\"https://llama.meta.com/llama3_2/license/\">Meta Llama 3.2 Community License</a></li></ul>",
|
||||
"url": "https://huggingface.co/bartowski/Llama-3.2-3B-Instruct-GGUF/resolve/main/Llama-3.2-3B-Instruct-Q4_0.gguf",
|
||||
"promptTemplate": "<|start_header_id|>user<|end_header_id|>\n\n%1<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n%2",
|
||||
"systemPrompt": "<|start_header_id|>system<|end_header_id|>\nCutting Knowledge Date: December 2023\n\nYou are a helpful assistant.<|eot_id|>",
|
||||
"chatTemplate": "{{- bos_token }}\n{%- set date_string = strftime_now('%d %b %Y') %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] | trim %}\n {%- set loop_start = 1 %}\n{%- else %}\n {%- set system_message = '' %}\n {%- set loop_start = 0 %}\n{%- endif %}\n\n{#- System message #}\n{{- '<|start_header_id|>system<|end_header_id|>\\n\\n' }}\n{{- 'Cutting Knowledge Date: December 2023\\n' }}\n{{- 'Today Date: ' + date_string + '\\n\\n' }}\n{{- system_message }}\n{{- '<|eot_id|>' }}\n\n{%- for message in messages %}\n {%- if loop.index0 >= loop_start %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + message['content'] | trim + '<|eot_id|>' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}"
|
||||
},
|
||||
{
|
||||
"order": "c",
|
||||
"md5sum": "48ff0243978606fdba19d899b77802fc",
|
||||
"name": "Llama 3.2 1B Instruct",
|
||||
"filename": "Llama-3.2-1B-Instruct-Q4_0.gguf",
|
||||
"filesize": "773025920",
|
||||
"requires": "3.4.0",
|
||||
"ramrequired": "2",
|
||||
"parameters": "1 billion",
|
||||
"quant": "q4_0",
|
||||
"type": "LLaMA3",
|
||||
"description": "<ul><li>Fast responses</li><li>Instruct model</li><li>Multilingual dialogue use</li><li>Agentic system capable</li><li>Trained by Meta</li><li>License: <a href=\"https://llama.meta.com/llama3_2/license/\">Meta Llama 3.2 Community License</a></li></ul>",
|
||||
"url": "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/Llama-3.2-1B-Instruct-Q4_0.gguf",
|
||||
"promptTemplate": "<|start_header_id|>user<|end_header_id|>\n\n%1<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n%2",
|
||||
"systemPrompt": "<|start_header_id|>system<|end_header_id|>\nCutting Knowledge Date: December 2023\n\nYou are a helpful assistant.<|eot_id|>",
|
||||
"chatTemplate": "{{- bos_token }}\n{%- set date_string = strftime_now('%d %b %Y') %}\n\n{#- This block extracts the system message, so we can slot it into the right place. #}\n{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] | trim %}\n {%- set loop_start = 1 %}\n{%- else %}\n {%- set system_message = '' %}\n {%- set loop_start = 0 %}\n{%- endif %}\n\n{#- System message #}\n{{- '<|start_header_id|>system<|end_header_id|>\\n\\n' }}\n{{- 'Cutting Knowledge Date: December 2023\\n' }}\n{{- 'Today Date: ' + date_string + '\\n\\n' }}\n{{- system_message }}\n{{- '<|eot_id|>' }}\n\n{%- for message in messages %}\n {%- if loop.index0 >= loop_start %}\n {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n' + message['content'] | trim + '<|eot_id|>' }}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}\n{%- endif %}"
|
||||
},
|
||||
{
|
||||
"order": "d",
|
||||
"md5sum": "a5f6b4eabd3992da4d7fb7f020f921eb",
|
||||
"name": "Nous Hermes 2 Mistral DPO",
|
||||
"filename": "Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf",
|
||||
@ -45,10 +140,11 @@
|
||||
"description": "<strong>Good overall fast chat model</strong><br><ul><li>Fast responses</li><li>Chat based model</li><li>Accepts system prompts in ChatML format</li><li>Trained by Mistral AI<li>Finetuned by Nous Research on the OpenHermes-2.5 dataset<li>Licensed for commercial use</ul>",
|
||||
"url": "https://huggingface.co/NousResearch/Nous-Hermes-2-Mistral-7B-DPO-GGUF/resolve/main/Nous-Hermes-2-Mistral-7B-DPO.Q4_0.gguf",
|
||||
"promptTemplate": "<|im_start|>user\n%1<|im_end|>\n<|im_start|>assistant\n%2<|im_end|>\n",
|
||||
"systemPrompt": ""
|
||||
"systemPrompt": "",
|
||||
"chatTemplate": "{%- for message in messages %}\n {{- '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>\\n' }}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}"
|
||||
},
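The `chatTemplate` field being added throughout this file is a Jinja-style template over `messages`, `add_generation_prompt`, and (where needed) `bos_token`/`eos_token`; GPT4All renders it with the bundled minja submodule in C++, not Python. As a rough sketch only, the ChatML-style template from the entry directly above can be exercised with Python's jinja2 to see the flattened prompt it produces:

```python
from jinja2 import Template

# The ChatML-style chatTemplate from the Nous Hermes 2 Mistral DPO entry above.
CHATML = (
    "{%- for message in messages %}\n"
    "    {{- '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>\\n' }}\n"
    "{%- endfor %}\n"
    "{%- if add_generation_prompt %}\n"
    "    {{- '<|im_start|>assistant\\n' }}\n"
    "{%- endif %}"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

# Prints the prompt string the model would actually be fed.
print(Template(CHATML).render(messages=messages, add_generation_prompt=True))
```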
|
||||
{
|
||||
"order": "d",
|
||||
"order": "e",
|
||||
"md5sum": "97463be739b50525df56d33b26b00852",
|
||||
"name": "Mistral Instruct",
|
||||
"filename": "mistral-7b-instruct-v0.1.Q4_0.gguf",
|
||||
@ -61,10 +157,28 @@
|
||||
"systemPrompt": "",
|
||||
"description": "<strong>Strong overall fast instruction following model</strong><br><ul><li>Fast responses</li><li>Trained by Mistral AI<li>Uncensored</li><li>Licensed for commercial use</li></ul>",
|
||||
"url": "https://gpt4all.io/models/gguf/mistral-7b-instruct-v0.1.Q4_0.gguf",
|
||||
"promptTemplate": "[INST] %1 [/INST]"
|
||||
"promptTemplate": "[INST] %1 [/INST]",
|
||||
"chatTemplate": "{%- if messages[0]['role'] == 'system' %}\n {%- set system_message = messages[0]['content'] %}\n {%- set loop_start = 1 %}\n{%- else %}\n {%- set loop_start = 0 %}\n{%- endif %}\n{%- for message in messages %}\n {%- if loop.index0 >= loop_start %}\n {%- if (message['role'] == 'user') != ((loop.index0 - loop_start) % 2 == 0) %}\n {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {%- if loop.index0 == loop_start and loop_start == 1 %}\n {{- ' [INST] ' + system_message + '\\n\\n' + message['content'] + ' [/INST]' }}\n {%- else %}\n {{- ' [INST] ' + message['content'] + ' [/INST]' }}\n {%- endif %}\n {%- elif message['role'] == 'assistant' %}\n {{- ' ' + message['content'] + eos_token }}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}"
|
||||
},
|
||||
{
|
||||
"order": "e",
|
||||
"order": "f",
|
||||
"md5sum": "8a9c75bcd8a66b7693f158ec96924eeb",
|
||||
"name": "Llama 3.1 8B Instruct 128k",
|
||||
"filename": "Meta-Llama-3.1-8B-Instruct-128k-Q4_0.gguf",
|
||||
"filesize": "4661212096",
|
||||
"requires": "3.1.1",
|
||||
"ramrequired": "8",
|
||||
"parameters": "8 billion",
|
||||
"quant": "q4_0",
|
||||
"type": "LLaMA3",
|
||||
"description": "<ul><li><strong>For advanced users only. Not recommended for use on Windows or Linux without selecting CUDA due to speed issues.</strong></li><li>Fast responses</li><li>Chat based model</li><li>Large context size of 128k</li><li>Accepts agentic system prompts in Llama 3.1 format</li><li>Trained by Meta</li><li>License: <a href=\"https://llama.meta.com/llama3_1/license/\">Meta Llama 3.1 Community License</a></li></ul>",
|
||||
"url": "https://huggingface.co/GPT4All-Community/Meta-Llama-3.1-8B-Instruct-128k/resolve/main/Meta-Llama-3.1-8B-Instruct-128k-Q4_0.gguf",
|
||||
"promptTemplate": "<|start_header_id|>user<|end_header_id|>\n\n%1<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n%2",
|
||||
"systemPrompt": "<|start_header_id|>system<|end_header_id|>\nCutting Knowledge Date: December 2023\n\nYou are a helpful assistant.<|eot_id|>",
|
||||
"chatTemplate": "{%- set loop_messages = messages %}\n{%- for message in loop_messages %}\n {%- set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\\n\\n'+ message['content'] | trim + '<|eot_id|>' %}\n {%- if loop.index0 == 0 %}\n {%- set content = bos_token + content %}\n {%- endif %}\n {{- content }}\n{%- endfor %}\n{{- '<|start_header_id|>assistant<|end_header_id|>\\n\\n' }}"
|
||||
},
|
||||
{
|
||||
"order": "g",
|
||||
"md5sum": "f692417a22405d80573ac10cb0cd6c6a",
|
||||
"name": "Mistral OpenOrca",
|
||||
"filename": "mistral-7b-openorca.gguf2.Q4_0.gguf",
|
||||
@ -77,10 +191,11 @@
|
||||
"description": "<strong>Strong overall fast chat model</strong><br><ul><li>Fast responses</li><li>Chat based model</li><li>Trained by Mistral AI<li>Finetuned on OpenOrca dataset curated via <a href=\"https://atlas.nomic.ai/\">Nomic Atlas</a><li>Licensed for commercial use</ul>",
|
||||
"url": "https://gpt4all.io/models/gguf/mistral-7b-openorca.gguf2.Q4_0.gguf",
|
||||
"promptTemplate": "<|im_start|>user\n%1<|im_end|>\n<|im_start|>assistant\n%2<|im_end|>\n",
|
||||
"systemPrompt": "<|im_start|>system\nYou are MistralOrca, a large language model trained by Alignment Lab AI.\n<|im_end|>\n"
|
||||
"systemPrompt": "<|im_start|>system\nYou are MistralOrca, a large language model trained by Alignment Lab AI.\n<|im_end|>\n",
|
||||
"chatTemplate": "{%- for message in messages %}\n {{- '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>\\n' }}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}"
|
||||
},
|
||||
{
|
||||
"order": "f",
|
||||
"order": "h",
|
||||
"md5sum": "c4c78adf744d6a20f05c8751e3961b84",
|
||||
"name": "GPT4All Falcon",
|
||||
"filename": "gpt4all-falcon-newbpe-q4_0.gguf",
|
||||
@ -93,10 +208,11 @@
|
||||
"systemPrompt": "",
|
||||
"description": "<strong>Very fast model with good quality</strong><br><ul><li>Fastest responses</li><li>Instruction based</li><li>Trained by TII<li>Finetuned by Nomic AI<li>Licensed for commercial use</ul>",
|
||||
"url": "https://gpt4all.io/models/gguf/gpt4all-falcon-newbpe-q4_0.gguf",
|
||||
"promptTemplate": "### Instruction:\n%1\n\n### Response:\n"
|
||||
"promptTemplate": "### Instruction:\n%1\n\n### Response:\n",
|
||||
"chatTemplate": "{%- if messages[0]['role'] == 'system' %}\n {%- set loop_start = 1 %}\n {{- messages[0]['content'] + '\\n\\n' }}\n{%- else %}\n {%- set loop_start = 0 %}\n{%- endif %}\n{%- for message in messages %}\n {%- if loop.index0 >= loop_start %}\n {%- if message['role'] == 'user' %}\n {{- '### User: ' + message['content'] + '\\n\\n' }}\n {%- elif message['role'] == 'assistant' %}\n {{- '### Assistant: ' + message['content'] + '\\n\\n' }}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '### Assistant:' }}\n{%- endif %}"
|
||||
},
|
||||
{
|
||||
"order": "g",
|
||||
"order": "i",
|
||||
"md5sum": "00c8593ba57f5240f59662367b3ed4a5",
|
||||
"name": "Orca 2 (Medium)",
|
||||
"filename": "orca-2-7b.Q4_0.gguf",
|
||||
@ -108,10 +224,11 @@
|
||||
"type": "LLaMA2",
|
||||
"systemPrompt": "",
|
||||
"description": "<ul><li>Instruction based<li>Trained by Microsoft<li>Cannot be used commercially</ul>",
|
||||
"url": "https://gpt4all.io/models/gguf/orca-2-7b.Q4_0.gguf"
|
||||
"url": "https://gpt4all.io/models/gguf/orca-2-7b.Q4_0.gguf",
|
||||
"chatTemplate": "{%- for message in messages %}\n {{- '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>\\n' }}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}"
|
||||
},
|
||||
{
|
||||
"order": "h",
|
||||
"order": "j",
|
||||
"md5sum": "3c0d63c4689b9af7baa82469a6f51a19",
|
||||
"name": "Orca 2 (Full)",
|
||||
"filename": "orca-2-13b.Q4_0.gguf",
|
||||
@ -123,10 +240,11 @@
|
||||
"type": "LLaMA2",
|
||||
"systemPrompt": "",
|
||||
"description": "<ul><li>Instruction based<li>Trained by Microsoft<li>Cannot be used commercially</ul>",
|
||||
"url": "https://gpt4all.io/models/gguf/orca-2-13b.Q4_0.gguf"
|
||||
"url": "https://gpt4all.io/models/gguf/orca-2-13b.Q4_0.gguf",
|
||||
"chatTemplate": "{%- for message in messages %}\n {{- '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>\\n' }}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}"
|
||||
},
|
||||
{
|
||||
"order": "i",
|
||||
"order": "k",
|
||||
"md5sum": "5aff90007499bce5c64b1c0760c0b186",
|
||||
"name": "Wizard v1.2",
|
||||
"filename": "wizardlm-13b-v1.2.Q4_0.gguf",
|
||||
@ -138,10 +256,12 @@
|
||||
"type": "LLaMA2",
|
||||
"systemPrompt": "",
|
||||
"description": "<strong>Strong overall larger model</strong><br><ul><li>Instruction based<li>Gives very long responses<li>Finetuned with only 1k of high-quality data<li>Trained by Microsoft and Peking University<li>Cannot be used commercially</ul>",
|
||||
"url": "https://gpt4all.io/models/gguf/wizardlm-13b-v1.2.Q4_0.gguf"
|
||||
"url": "https://gpt4all.io/models/gguf/wizardlm-13b-v1.2.Q4_0.gguf",
|
||||
"chatTemplate": "{%- if messages[0]['role'] == 'system' %}\n {%- set loop_start = 1 %}\n {{- messages[0]['content'] + ' ' }}\n{%- else %}\n {%- set loop_start = 0 %}\n{%- endif %}\n{%- for message in loop_messages %}\n {%- if loop.index0 >= loop_start %}\n {%- if message['role'] == 'user' %}\n {{- 'USER: ' + message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- 'ASSISTANT: ' + message['content'] }}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n {%- if (loop.index0 - loop_start) % 2 == 0 %}\n {{- ' ' }}\n {%- else %}\n {{- eos_token }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- 'ASSISTANT:' }}\n{%- endif %}",
|
||||
"systemMessage": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions."
|
||||
},
|
||||
{
|
||||
"order": "j",
|
||||
"order": "l",
|
||||
"md5sum": "31b47b4e8c1816b62684ac3ca373f9e1",
|
||||
"name": "Ghost 7B v0.9.1",
|
||||
"filename": "ghost-7b-v0.9.1-Q4_0.gguf",
|
||||
@ -154,10 +274,12 @@
|
||||
"description": "<strong>Ghost 7B v0.9.1</strong> fast, powerful and smooth for Vietnamese and English languages.",
|
||||
"url": "https://huggingface.co/lamhieu/ghost-7b-v0.9.1-gguf/resolve/main/ghost-7b-v0.9.1-Q4_0.gguf",
|
||||
"promptTemplate": "<|user|>\n%1</s>\n<|assistant|>\n%2</s>\n",
|
||||
"systemPrompt": "<|system|>\nYou are Ghost created by Lam Hieu. You are a helpful and knowledgeable assistant. You like to help and always give honest information, in its original language. In communication, you are always respectful, equal and promote positive behavior.\n</s>"
|
||||
"systemPrompt": "<|system|>\nYou are Ghost created by Lam Hieu. You are a helpful and knowledgeable assistant. You like to help and always give honest information, in its original language. In communication, you are always respectful, equal and promote positive behavior.\n</s>",
|
||||
"chatTemplate": "{%- for message in messages %}\n {%- if message['role'] == 'user' %}\n {{- '<|user|>\\n' + message['content'] + eos_token }}\n {%- elif message['role'] == 'system' %}\n {{- '<|system|>\\n' + message['content'] + eos_token }}\n {%- elif message['role'] == 'assistant' %}\n {{- '<|assistant|>\\n' + message['content'] + eos_token }}\n {%- endif %}\n {%- if loop.last and add_generation_prompt %}\n {{- '<|assistant|>' }}\n {%- endif %}\n{%- endfor %}",
|
||||
"systemMessage": "You are Ghost created by Lam Hieu. You are a helpful and knowledgeable assistant. You like to help and always give honest information, in its original language. In communication, you are always respectful, equal and promote positive behavior."
|
||||
},
|
||||
{
|
||||
"order": "k",
|
||||
"order": "m",
|
||||
"md5sum": "3d12810391d04d1153b692626c0c6e16",
|
||||
"name": "Hermes",
|
||||
"filename": "nous-hermes-llama2-13b.Q4_0.gguf",
|
||||
@ -170,10 +292,11 @@
|
||||
"systemPrompt": "",
|
||||
"description": "<strong>Extremely good model</strong><br><ul><li>Instruction based<li>Gives long responses<li>Curated with 300,000 uncensored instructions<li>Trained by Nous Research<li>Cannot be used commercially</ul>",
|
||||
"url": "https://gpt4all.io/models/gguf/nous-hermes-llama2-13b.Q4_0.gguf",
|
||||
"promptTemplate": "### Instruction:\n%1\n\n### Response:\n"
|
||||
"promptTemplate": "### Instruction:\n%1\n\n### Response:\n",
|
||||
"chatTemplate": "{%- if messages[0]['role'] == 'system' %}\n {%- set loop_start = 1 %}\n {{- messages[0]['content'] + '\\n\\n' }}\n{%- else %}\n {%- set loop_start = 0 %}\n{%- endif %}\n{%- for message in messages %}\n {%- if loop.index0 >= loop_start %}\n {%- if message['role'] == 'user' %}\n {{- '### Instruction:\\n' + message['content'] + '\\n\\n' }}\n {%- elif message['role'] == 'assistant' %}\n {{- '### Response:\\n' + message['content'] + '\\n\\n' }}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '### Instruction:\\n' }}\n{%- endif %}"
|
||||
},
|
||||
{
|
||||
"order": "l",
|
||||
"order": "n",
|
||||
"md5sum": "40388eb2f8d16bb5d08c96fdfaac6b2c",
|
||||
"name": "Snoozy",
|
||||
"filename": "gpt4all-13b-snoozy-q4_0.gguf",
|
||||
@ -185,10 +308,12 @@
|
||||
"type": "LLaMA",
|
||||
"systemPrompt": "",
|
||||
"description": "<strong>Very good overall model</strong><br><ul><li>Instruction based<li>Based on the same dataset as Groovy<li>Slower than Groovy, with higher quality responses<li>Trained by Nomic AI<li>Cannot be used commercially</ul>",
|
||||
"url": "https://gpt4all.io/models/gguf/gpt4all-13b-snoozy-q4_0.gguf"
|
||||
"url": "https://gpt4all.io/models/gguf/gpt4all-13b-snoozy-q4_0.gguf",
|
||||
"chatTemplate": "{%- if messages[0]['role'] == 'system' %}\n {%- set loop_start = 1 %}\n {{- messages[0]['content'] + '\\n\\n' }}\n{%- else %}\n {%- set loop_start = 0 %}\n{%- endif %}\n{%- for message in messages %}\n {%- if loop.index0 >= loop_start %}\n {%- if message['role'] == 'user' %}\n {{- '### Instruction:\\n' + message['content'] + '\\n\\n' }}\n {%- elif message['role'] == 'assistant' %}\n {{- '### Response:\\n' + message['content'] + '\\n\\n' }}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '### Response:\\n' }}\n{%- endif %}",
|
||||
"systemMessage": "Below is an instruction that describes a task. Write a response that appropriately completes the request."
|
||||
},
|
||||
{
|
||||
"order": "m",
|
||||
"order": "o",
|
||||
"md5sum": "15dcb4d7ea6de322756449c11a0b7545",
|
||||
"name": "MPT Chat",
|
||||
"filename": "mpt-7b-chat-newbpe-q4_0.gguf",
|
||||
@ -202,10 +327,11 @@
|
||||
"description": "<strong>Good model with novel architecture</strong><br><ul><li>Fast responses<li>Chat based<li>Trained by Mosaic ML<li>Cannot be used commercially</ul>",
|
||||
"url": "https://gpt4all.io/models/gguf/mpt-7b-chat-newbpe-q4_0.gguf",
|
||||
"promptTemplate": "<|im_start|>user\n%1<|im_end|>\n<|im_start|>assistant\n%2<|im_end|>\n",
|
||||
"systemPrompt": "<|im_start|>system\n- You are a helpful assistant chatbot trained by MosaicML.\n- You answer questions.\n- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.\n- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.<|im_end|>\n"
|
||||
"systemPrompt": "<|im_start|>system\n- You are a helpful assistant chatbot trained by MosaicML.\n- You answer questions.\n- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.\n- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.<|im_end|>\n",
|
||||
"chatTemplate": "{%- for message in messages %}\n {{- '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>\\n' }}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}"
|
||||
},
|
||||
{
|
||||
"order": "n",
|
||||
"order": "p",
|
||||
"md5sum": "ab5d8e8a2f79365ea803c1f1d0aa749d",
|
||||
"name": "MPT Chat",
|
||||
"filename": "mpt-7b-chat.gguf4.Q4_0.gguf",
|
||||
@ -218,10 +344,11 @@
|
||||
"description": "<strong>Good model with novel architecture</strong><br><ul><li>Fast responses<li>Chat based<li>Trained by Mosaic ML<li>Cannot be used commercially</ul>",
|
||||
"url": "https://gpt4all.io/models/gguf/mpt-7b-chat.gguf4.Q4_0.gguf",
|
||||
"promptTemplate": "<|im_start|>user\n%1<|im_end|>\n<|im_start|>assistant\n%2<|im_end|>\n",
|
||||
"systemPrompt": "<|im_start|>system\n- You are a helpful assistant chatbot trained by MosaicML.\n- You answer questions.\n- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.\n- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.<|im_end|>\n"
|
||||
"systemPrompt": "<|im_start|>system\n- You are a helpful assistant chatbot trained by MosaicML.\n- You answer questions.\n- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.\n- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.<|im_end|>\n",
|
||||
"chatTemplate": "{%- for message in messages %}\n {{- '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>\\n' }}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}"
|
||||
},
|
||||
{
|
||||
"order": "o",
|
||||
"order": "q",
|
||||
"md5sum": "f8347badde9bfc2efbe89124d78ddaf5",
|
||||
"name": "Phi-3 Mini Instruct",
|
||||
"filename": "Phi-3-mini-4k-instruct.Q4_0.gguf",
|
||||
@ -234,10 +361,11 @@
|
||||
"description": "<ul><li>Very fast responses</li><li>Chat based model</li><li>Accepts system prompts in Phi-3 format</li><li>Trained by Microsoft</li><li>License: <a href=\"https://opensource.org/license/mit\">MIT</a></li><li>No restrictions on commercial use</li></ul>",
|
||||
"url": "https://gpt4all.io/models/gguf/Phi-3-mini-4k-instruct.Q4_0.gguf",
|
||||
"promptTemplate": "<|user|>\n%1<|end|>\n<|assistant|>\n%2<|end|>\n",
|
||||
"systemPrompt": ""
|
||||
"systemPrompt": "",
|
||||
"chatTemplate": "{{- bos_token }}\n{%- for message in messages %}\n {{- '<|' + message['role'] + '|>\\n' + message['content'] + '<|end|>\\n' }}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|assistant|>\\n' }}\n{%- else %}\n {{- eos_token }}\n{%- endif %}"
|
||||
},
|
||||
{
|
||||
"order": "p",
|
||||
"order": "r",
|
||||
"md5sum": "0e769317b90ac30d6e09486d61fefa26",
|
||||
"name": "Mini Orca (Small)",
|
||||
"filename": "orca-mini-3b-gguf2-q4_0.gguf",
|
||||
@ -250,10 +378,11 @@
|
||||
"description": "<strong>Small version of new model with novel dataset</strong><br><ul><li>Very fast responses</li><li>Instruction based</li><li>Explain tuned datasets</li><li>Orca Research Paper dataset construction approaches</li><li>Cannot be used commercially</li></ul>",
|
||||
"url": "https://gpt4all.io/models/gguf/orca-mini-3b-gguf2-q4_0.gguf",
|
||||
"promptTemplate": "### User:\n%1\n\n### Response:\n",
|
||||
"systemPrompt": "### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n"
|
||||
"systemPrompt": "### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n",
|
||||
"chatTemplate": "{%- if messages[0]['role'] == 'system' %}\n {%- set loop_start = 1 %}\n {{- '### System:\\n' + messages[0]['content'] + '\\n\\n' }}\n{%- else %}\n {%- set loop_start = 0 %}\n{%- endif %}\n{%- for message in messages %}\n {%- if loop.index0 >= loop_start %}\n {%- if message['role'] == 'user' %}\n {{- '### User:\\n' + message['content'] + '\\n\\n' }}\n {%- elif message['role'] == 'assistant' %}\n {{- '### Response:\\n' + message['content'] + '\\n\\n' }}\n {%- else %}\n {{- raise_exception('Only user and assistant roles are supported, with the exception of an initial optional system message!') }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '### Response:\\n' }}\n{%- endif %}"
|
||||
},
{
"order": "q",
"order": "s",
"md5sum": "c232f17e09bca4b7ee0b5b1f4107c01e",
"disableGUI": "true",
"name": "Replit",
@ -267,10 +396,11 @@
"systemPrompt": "",
"promptTemplate": "%1",
"description": "<strong>Trained on subset of the Stack</strong><br><ul><li>Code completion based<li>Licensed for commercial use<li>WARNING: Not available for chat GUI</ul>",
"url": "https://gpt4all.io/models/gguf/replit-code-v1_5-3b-newbpe-q4_0.gguf"
"url": "https://gpt4all.io/models/gguf/replit-code-v1_5-3b-newbpe-q4_0.gguf",
"chatTemplate": null
},
{
"order": "r",
"order": "t",
"md5sum": "70841751ccd95526d3dcfa829e11cd4c",
"disableGUI": "true",
"name": "Starcoder",
@ -284,10 +414,11 @@
"systemPrompt": "",
"promptTemplate": "%1",
"description": "<strong>Trained on subset of the Stack</strong><br><ul><li>Code completion based<li>WARNING: Not available for chat GUI</ul>",
"url": "https://gpt4all.io/models/gguf/starcoder-newbpe-q4_0.gguf"
"url": "https://gpt4all.io/models/gguf/starcoder-newbpe-q4_0.gguf",
"chatTemplate": null
},
{
"order": "s",
"order": "u",
"md5sum": "e973dd26f0ffa6e46783feaea8f08c83",
"disableGUI": "true",
"name": "Rift coder",
@ -301,10 +432,11 @@
"systemPrompt": "",
"promptTemplate": "%1",
"description": "<strong>Trained on collection of Python and TypeScript</strong><br><ul><li>Code completion based<li>WARNING: Not available for chat GUI</li>",
"url": "https://gpt4all.io/models/gguf/rift-coder-v0-7b-q4_0.gguf"
"url": "https://gpt4all.io/models/gguf/rift-coder-v0-7b-q4_0.gguf",
"chatTemplate": null
},
{
"order": "t",
"order": "v",
"md5sum": "e479e6f38b59afc51a470d1953a6bfc7",
"disableGUI": "true",
"name": "SBert",
@ -319,10 +451,11 @@
"embeddingModel": true,
"systemPrompt": "",
"description": "<strong>LocalDocs text embeddings model</strong><br><ul><li>For use with LocalDocs feature<li>Used for retrieval augmented generation (RAG)",
"url": "https://gpt4all.io/models/gguf/all-MiniLM-L6-v2-f16.gguf"
"url": "https://gpt4all.io/models/gguf/all-MiniLM-L6-v2-f16.gguf",
"chatTemplate": null
},
{
"order": "u",
"order": "w",
"md5sum": "dd90e2cb7f8e9316ac3796cece9883b5",
"name": "SBert",
"filename": "all-MiniLM-L6-v2.gguf2.f16.gguf",
@ -335,10 +468,11 @@
"type": "Bert",
"embeddingModel": true,
"description": "<strong>LocalDocs text embeddings model</strong><br><ul><li>For use with LocalDocs feature<li>Used for retrieval augmented generation (RAG)",
"url": "https://gpt4all.io/models/gguf/all-MiniLM-L6-v2.gguf2.f16.gguf"
"url": "https://gpt4all.io/models/gguf/all-MiniLM-L6-v2.gguf2.f16.gguf",
"chatTemplate": null
},
{
"order": "v",
"order": "x",
"md5sum": "919de4dd6f25351bcb0223790db1932d",
"name": "EM German Mistral",
"filename": "em_german_mistral_v01.Q4_0.gguf",
@ -351,10 +485,12 @@
"description": "<strong>Mistral-based model for German-language applications</strong><br><ul><li>Fast responses</li><li>Chat based model</li><li>Trained by ellamind<li>Finetuned on German instruction and chat data</a><li>Licensed for commercial use</ul>",
"url": "https://huggingface.co/TheBloke/em_german_mistral_v01-GGUF/resolve/main/em_german_mistral_v01.Q4_0.gguf",
"promptTemplate": "USER: %1 ASSISTANT: ",
"systemPrompt": "Du bist ein hilfreicher Assistent. "
"systemPrompt": "Du bist ein hilfreicher Assistent. ",
"chatTemplate": "{%- if messages[0]['role'] == 'system' %}\n {%- set loop_start = 1 %}\n {{- messages[0]['content'] }}\n{%- else %}\n {%- set loop_start = 0 %}\n{%- endif %}\n{%- for message in messages %}\n {%- if loop.index0 >= loop_start %}\n {%- if not loop.first %}\n {{- ' ' }}\n {%- endif %}\n {%- if message['role'] == 'user' %}\n {{- 'USER: ' + message['content'] }}\n {%- elif message['role'] == 'assistant' %}\n {{- 'ASSISTANT: ' + message['content'] }}\n {%- else %}\n {{- raise_exception('After the optional system message, conversation roles must be either user or assistant.') }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {%- if messages %}\n {{- ' ' }}\n {%- endif %}\n {{- 'ASSISTANT:' }}\n{%- endif %}",
"systemMessage": "Du bist ein hilfreicher Assistent."
},
{
"order": "w",
"order": "y",
"md5sum": "60ea031126f82db8ddbbfecc668315d2",
"disableGUI": "true",
"name": "Nomic Embed Text v1",
@ -368,10 +504,11 @@
"embeddingModel": true,
"systemPrompt": "",
"description": "nomic-embed-text-v1",
"url": "https://gpt4all.io/models/gguf/nomic-embed-text-v1.f16.gguf"
"url": "https://gpt4all.io/models/gguf/nomic-embed-text-v1.f16.gguf",
"chatTemplate": null
},
{
"order": "x",
"order": "z",
"md5sum": "a5401e7f7e46ed9fcaed5b60a281d547",
"disableGUI": "true",
"name": "Nomic Embed Text v1.5",
@ -385,6 +522,24 @@
"embeddingModel": true,
"systemPrompt": "",
"description": "nomic-embed-text-v1.5",
"url": "https://gpt4all.io/models/gguf/nomic-embed-text-v1.5.f16.gguf"
"url": "https://gpt4all.io/models/gguf/nomic-embed-text-v1.5.f16.gguf",
"chatTemplate": null
},
{
"order": "zzz",
"md5sum": "a8c5a783105f87a481543d4ed7d7586d",
"name": "Qwen2-1.5B-Instruct",
"filename": "qwen2-1_5b-instruct-q4_0.gguf",
"filesize": "937532800",
"requires": "3.0",
"ramrequired": "3",
"parameters": "1.5 billion",
"quant": "q4_0",
"type": "qwen2",
"description": "<ul><li>Very fast responses</li><li>Instruction based model</li><li>Usage of LocalDocs (RAG): Highly recommended</li><li>Supports context length of up to 32768</li><li>Trained and finetuned by Qwen (Alibaba Cloud)</li><li>License: <a href=\"https://www.apache.org/licenses/LICENSE-2.0.html/\">Apache 2.0</a></li></ul>",
"url": "https://huggingface.co/Qwen/Qwen2-1.5B-Instruct-GGUF/resolve/main/qwen2-1_5b-instruct-q4_0.gguf",
"promptTemplate": "<|im_start|>user\n%1<|im_end|>\n<|im_start|>assistant\n%2<|im_end|>",
"systemPrompt": "<|im_start|>system\nBelow is an instruction that describes a task. Write a response that appropriately completes the request.<|im_end|>\n",
"chatTemplate": "{%- for message in messages %}\n {%- if loop.first and messages[0]['role'] != 'system' %}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' }}\n {%- endif %}\n {{- '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>\\n' }}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}"
}
]
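Note: the chatTemplate values added in this diff are Jinja-style chat templates, while the older promptTemplate/systemPrompt fields keep the %1/%2 placeholder form. A minimal sketch of how such a template expands, using the Python jinja2 package rather than the app's own template engine (the package choice and the example message list are assumptions made for illustration):

# Minimal sketch, assuming the Python jinja2 package (GPT4All renders these
# templates in C++). Shows how the ChatML-style chatTemplate above turns a
# message list into a prompt when add_generation_prompt is true.
from jinja2 import Template

CHATML_TEMPLATE = (
    "{%- for message in messages %}\n"
    " {{- '<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>\\n' }}\n"
    "{%- endfor %}\n"
    "{%- if add_generation_prompt %}\n"
    " {{- '<|im_start|>assistant\\n' }}\n"
    "{%- endif %}"
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

prompt = Template(CHATML_TEMPLATE).render(messages=messages, add_generation_prompt=True)
print(prompt)
# <|im_start|>system
# You are a helpful assistant.<|im_end|>
# <|im_start|>user
# Hello!<|im_end|>
# <|im_start|>assistant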
29
gpt4all-chat/pyproject.toml
Normal file
@ -0,0 +1,29 @@
[tool.pytest.ini_options]
addopts = ['--import-mode=importlib']

[tool.mypy]
files = 'tests/python'
pretty = true
strict = true
warn_unused_ignores = false

[tool.pytype]
inputs = ['tests/python']
jobs = 'auto'
bind_decorated_methods = true
none_is_not_bool = true
overriding_renamed_parameter_count_checks = true
strict_none_binding = true
precise_return = true
# protocols:
# - https://github.com/google/pytype/issues/1423
# - https://github.com/google/pytype/issues/1424
strict_import = true
strict_parameter_checks = true
strict_primitive_comparisons = true
# strict_undefined_checks: too many false positives

[tool.isort]
src_paths = ['tests/python']
line_length = 120
combine_as_imports = true
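The pyproject.toml above points pytest, mypy (strict), pytype, and isort at tests/python. As a hedged illustration only, here is a self-contained test module of the shape that configuration would collect and type-check; the chunk helper is hypothetical and is not part of the GPT4All test suite or bindings:

# Hypothetical example only: a self-contained module of the kind the config
# above targets (placed under tests/python/, fully annotated so mypy --strict
# passes). It does not touch any real GPT4All APIs.
from __future__ import annotations

import pytest


def chunk(items: list[str], size: int) -> list[list[str]]:
    """Split items into consecutive chunks of at most `size` elements."""
    if size <= 0:
        raise ValueError("size must be positive")
    return [items[i:i + size] for i in range(0, len(items), size)]


def test_chunk_splits_evenly() -> None:
    assert chunk(["a", "b", "c", "d"], 2) == [["a", "b"], ["c", "d"]]


def test_chunk_rejects_bad_size() -> None:
    with pytest.raises(ValueError):
        chunk(["a"], 0)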
@ -89,15 +89,8 @@ Rectangle {
property alias collection: collection.text
property alias folder_path: folderEdit.text

FolderDialog {
MyFolderDialog {
id: folderDialog
title: qsTr("Please choose a directory")
}

function openFolderDialog(currentFolder, onAccepted) {
folderDialog.currentFolder = currentFolder;
folderDialog.accepted.connect(function() { onAccepted(folderDialog.selectedFolder); });
folderDialog.open();
}

Label {
@ -170,7 +163,7 @@
id: browseButton
text: qsTr("Browse")
onClicked: {
root.openFolderDialog(StandardPaths.writableLocation(StandardPaths.HomeLocation), function(selectedFolder) {
folderDialog.openFolderDialog(StandardPaths.writableLocation(StandardPaths.HomeLocation), function(selectedFolder) {
root.folder_path = selectedFolder
})
}
483
gpt4all-chat/qml/AddGPT4AllModelView.qml
Normal file
@ -0,0 +1,483 @@
|
||||
import QtCore
|
||||
import QtQuick
|
||||
import QtQuick.Controls
|
||||
import QtQuick.Controls.Basic
|
||||
import QtQuick.Layouts
|
||||
import QtQuick.Dialogs
|
||||
import Qt.labs.folderlistmodel
|
||||
import Qt5Compat.GraphicalEffects
|
||||
|
||||
import llm
|
||||
import chatlistmodel
|
||||
import download
|
||||
import modellist
|
||||
import network
|
||||
import gpt4all
|
||||
import mysettings
|
||||
import localdocs
|
||||
|
||||
ColumnLayout {
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignTop
|
||||
spacing: 5
|
||||
|
||||
Label {
|
||||
Layout.topMargin: 0
|
||||
Layout.bottomMargin: 25
|
||||
Layout.rightMargin: 150 * theme.fontScale
|
||||
Layout.alignment: Qt.AlignTop
|
||||
Layout.fillWidth: true
|
||||
verticalAlignment: Text.AlignTop
|
||||
text: qsTr("These models have been specifically configured for use in GPT4All. The first few models on the " +
|
||||
"list are known to work the best, but you should only attempt to use models that will fit in your " +
|
||||
"available memory.")
|
||||
font.pixelSize: theme.fontSizeLarger
|
||||
color: theme.textColor
|
||||
wrapMode: Text.WordWrap
|
||||
}
|
||||
|
||||
Label {
|
||||
visible: !ModelList.gpt4AllDownloadableModels.count && !ModelList.asyncModelRequestOngoing
|
||||
Layout.fillWidth: true
|
||||
Layout.fillHeight: true
|
||||
horizontalAlignment: Qt.AlignHCenter
|
||||
verticalAlignment: Qt.AlignVCenter
|
||||
text: qsTr("Network error: could not retrieve %1").arg("http://gpt4all.io/models/models3.json")
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
color: theme.mutedTextColor
|
||||
}
|
||||
|
||||
MyBusyIndicator {
|
||||
visible: !ModelList.gpt4AllDownloadableModels.count && ModelList.asyncModelRequestOngoing
|
||||
running: ModelList.asyncModelRequestOngoing
|
||||
Accessible.role: Accessible.Animation
|
||||
Layout.alignment: Qt.AlignCenter
|
||||
Accessible.name: qsTr("Busy indicator")
|
||||
Accessible.description: qsTr("Displayed when the models request is ongoing")
|
||||
}
|
||||
|
||||
RowLayout {
|
||||
ButtonGroup {
|
||||
id: buttonGroup
|
||||
exclusive: true
|
||||
}
|
||||
MyButton {
|
||||
text: qsTr("All")
|
||||
checked: true
|
||||
borderWidth: 0
|
||||
backgroundColor: checked ? theme.lightButtonBackground : "transparent"
|
||||
backgroundColorHovered: theme.lighterButtonBackgroundHovered
|
||||
backgroundRadius: 5
|
||||
padding: 15
|
||||
topPadding: 8
|
||||
bottomPadding: 8
|
||||
textColor: theme.lighterButtonForeground
|
||||
fontPixelSize: theme.fontSizeLarge
|
||||
fontPixelBold: true
|
||||
checkable: true
|
||||
ButtonGroup.group: buttonGroup
|
||||
onClicked: {
|
||||
ModelList.gpt4AllDownloadableModels.filter("");
|
||||
}
|
||||
|
||||
}
|
||||
MyButton {
|
||||
text: qsTr("Reasoning")
|
||||
borderWidth: 0
|
||||
backgroundColor: checked ? theme.lightButtonBackground : "transparent"
|
||||
backgroundColorHovered: theme.lighterButtonBackgroundHovered
|
||||
backgroundRadius: 5
|
||||
padding: 15
|
||||
topPadding: 8
|
||||
bottomPadding: 8
|
||||
textColor: theme.lighterButtonForeground
|
||||
fontPixelSize: theme.fontSizeLarge
|
||||
fontPixelBold: true
|
||||
checkable: true
|
||||
ButtonGroup.group: buttonGroup
|
||||
onClicked: {
|
||||
ModelList.gpt4AllDownloadableModels.filter("#reasoning");
|
||||
}
|
||||
}
|
||||
Layout.bottomMargin: 10
|
||||
}
|
||||
|
||||
ScrollView {
|
||||
id: scrollView
|
||||
ScrollBar.vertical.policy: ScrollBar.AsNeeded
|
||||
Layout.fillWidth: true
|
||||
Layout.fillHeight: true
|
||||
clip: true
|
||||
|
||||
ListView {
|
||||
id: modelListView
|
||||
model: ModelList.gpt4AllDownloadableModels
|
||||
boundsBehavior: Flickable.StopAtBounds
|
||||
spacing: 30
|
||||
|
||||
delegate: Rectangle {
|
||||
id: delegateItem
|
||||
width: modelListView.width
|
||||
height: childrenRect.height + 60
|
||||
color: theme.conversationBackground
|
||||
radius: 10
|
||||
border.width: 1
|
||||
border.color: theme.controlBorder
|
||||
|
||||
ColumnLayout {
|
||||
anchors.top: parent.top
|
||||
anchors.left: parent.left
|
||||
anchors.right: parent.right
|
||||
anchors.margins: 30
|
||||
|
||||
Text {
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignLeft
|
||||
text: name
|
||||
elide: Text.ElideRight
|
||||
color: theme.titleTextColor
|
||||
font.pixelSize: theme.fontSizeLargest
|
||||
font.bold: true
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: qsTr("Model file")
|
||||
Accessible.description: qsTr("Model file to be downloaded")
|
||||
}
|
||||
|
||||
|
||||
Rectangle {
|
||||
Layout.fillWidth: true
|
||||
height: 1
|
||||
color: theme.dividerColor
|
||||
}
|
||||
|
||||
RowLayout {
|
||||
Layout.topMargin: 10
|
||||
Layout.fillWidth: true
|
||||
Text {
|
||||
id: descriptionText
|
||||
text: description
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
Layout.fillWidth: true
|
||||
wrapMode: Text.WordWrap
|
||||
textFormat: Text.StyledText
|
||||
color: theme.textColor
|
||||
linkColor: theme.textColor
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: qsTr("Description")
|
||||
Accessible.description: qsTr("File description")
|
||||
onLinkActivated: function(link) { Qt.openUrlExternally(link); }
|
||||
MouseArea {
|
||||
anchors.fill: parent
|
||||
acceptedButtons: Qt.NoButton // pass clicks to parent
|
||||
cursorShape: parent.hoveredLink ? Qt.PointingHandCursor : Qt.ArrowCursor
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME Need to overhaul design here which must take into account
|
||||
// features not present in current figma including:
|
||||
// * Ability to cancel a current download
|
||||
// * Ability to resume a download
|
||||
// * The presentation of an error if encountered
|
||||
// * Whether to show already installed models
|
||||
// * Install of remote models with API keys
|
||||
// * The presentation of the progress bar
|
||||
Rectangle {
|
||||
id: actionBox
|
||||
width: childrenRect.width + 20
|
||||
color: "transparent"
|
||||
border.width: 1
|
||||
border.color: theme.dividerColor
|
||||
radius: 10
|
||||
Layout.rightMargin: 20
|
||||
Layout.bottomMargin: 20
|
||||
Layout.minimumHeight: childrenRect.height + 20
|
||||
Layout.alignment: Qt.AlignRight | Qt.AlignTop
|
||||
|
||||
ColumnLayout {
|
||||
spacing: 0
|
||||
MySettingsButton {
|
||||
id: downloadButton
|
||||
text: isDownloading ? qsTr("Cancel") : isIncomplete ? qsTr("Resume") : qsTr("Download")
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
visible: !installed && !calcHash && downloadError === ""
|
||||
Accessible.description: qsTr("Stop/restart/start the download")
|
||||
onClicked: {
|
||||
if (!isDownloading) {
|
||||
Download.downloadModel(filename);
|
||||
} else {
|
||||
Download.cancelDownload(filename);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
MySettingsDestructiveButton {
|
||||
id: removeButton
|
||||
text: qsTr("Remove")
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
visible: !isDownloading && (installed || isIncomplete)
|
||||
Accessible.description: qsTr("Remove model from filesystem")
|
||||
onClicked: {
|
||||
Download.removeModel(filename);
|
||||
}
|
||||
}
|
||||
|
||||
ColumnLayout {
|
||||
spacing: 0
|
||||
Label {
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
visible: downloadError !== ""
|
||||
textFormat: Text.StyledText
|
||||
text: qsTr("<strong><font size=\"1\"><a href=\"#error\">Error</a></strong></font>")
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
linkColor: theme.textErrorColor
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: text
|
||||
Accessible.description: qsTr("Describes an error that occurred when downloading")
|
||||
onLinkActivated: {
|
||||
downloadingErrorPopup.text = downloadError;
|
||||
downloadingErrorPopup.open();
|
||||
}
|
||||
}
|
||||
|
||||
Label {
|
||||
visible: LLM.systemTotalRAMInGB() < ramrequired
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.maximumWidth: 300
|
||||
textFormat: Text.StyledText
|
||||
text: qsTr("<strong><font size=\"2\">WARNING: Not recommended for your hardware. Model requires more memory (%1 GB) than your system has available (%2).</strong></font>").arg(ramrequired).arg(LLM.systemTotalRAMInGBString())
|
||||
color: theme.textErrorColor
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
wrapMode: Text.WordWrap
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: text
|
||||
Accessible.description: qsTr("Error for incompatible hardware")
|
||||
onLinkActivated: {
|
||||
downloadingErrorPopup.text = downloadError;
|
||||
downloadingErrorPopup.open();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ColumnLayout {
|
||||
visible: isDownloading && !calcHash
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
spacing: 20
|
||||
|
||||
ProgressBar {
|
||||
id: itemProgressBar
|
||||
Layout.fillWidth: true
|
||||
width: 200
|
||||
value: bytesReceived / bytesTotal
|
||||
background: Rectangle {
|
||||
implicitHeight: 45
|
||||
color: theme.progressBackground
|
||||
radius: 3
|
||||
}
|
||||
contentItem: Item {
|
||||
implicitHeight: 40
|
||||
|
||||
Rectangle {
|
||||
width: itemProgressBar.visualPosition * parent.width
|
||||
height: parent.height
|
||||
radius: 2
|
||||
color: theme.progressForeground
|
||||
}
|
||||
}
|
||||
Accessible.role: Accessible.ProgressBar
|
||||
Accessible.name: qsTr("Download progressBar")
|
||||
Accessible.description: qsTr("Shows the progress made in the download")
|
||||
}
|
||||
|
||||
Label {
|
||||
id: speedLabel
|
||||
color: theme.textColor
|
||||
Layout.alignment: Qt.AlignRight
|
||||
text: speed
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: qsTr("Download speed")
|
||||
Accessible.description: qsTr("Download speed in bytes/kilobytes/megabytes per second")
|
||||
}
|
||||
}
|
||||
|
||||
RowLayout {
|
||||
visible: calcHash
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.maximumWidth: 200
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
clip: true
|
||||
|
||||
Label {
|
||||
id: calcHashLabel
|
||||
color: theme.textColor
|
||||
text: qsTr("Calculating...")
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: text
|
||||
Accessible.description: qsTr("Whether the file hash is being calculated")
|
||||
}
|
||||
|
||||
MyBusyIndicator {
|
||||
id: busyCalcHash
|
||||
running: calcHash
|
||||
Accessible.role: Accessible.Animation
|
||||
Accessible.name: qsTr("Busy indicator")
|
||||
Accessible.description: qsTr("Displayed when the file hash is being calculated")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Item {
|
||||
Layout.minimumWidth: childrenRect.width
|
||||
Layout.minimumHeight: childrenRect.height
|
||||
Layout.bottomMargin: 10
|
||||
RowLayout {
|
||||
id: paramRow
|
||||
anchors.centerIn: parent
|
||||
ColumnLayout {
|
||||
Layout.topMargin: 10
|
||||
Layout.bottomMargin: 10
|
||||
Layout.leftMargin: 20
|
||||
Layout.rightMargin: 20
|
||||
Text {
|
||||
text: qsTr("File size")
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
color: theme.mutedDarkTextColor
|
||||
}
|
||||
Text {
|
||||
text: filesize
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
font.bold: true
|
||||
}
|
||||
}
|
||||
Rectangle {
|
||||
width: 1
|
||||
Layout.fillHeight: true
|
||||
color: theme.dividerColor
|
||||
}
|
||||
ColumnLayout {
|
||||
Layout.topMargin: 10
|
||||
Layout.bottomMargin: 10
|
||||
Layout.leftMargin: 20
|
||||
Layout.rightMargin: 20
|
||||
Text {
|
||||
text: qsTr("RAM required")
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
color: theme.mutedDarkTextColor
|
||||
}
|
||||
Text {
|
||||
text: ramrequired >= 0 ? qsTr("%1 GB").arg(ramrequired) : qsTr("?")
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
font.bold: true
|
||||
}
|
||||
}
|
||||
Rectangle {
|
||||
width: 1
|
||||
Layout.fillHeight: true
|
||||
color: theme.dividerColor
|
||||
}
|
||||
ColumnLayout {
|
||||
Layout.topMargin: 10
|
||||
Layout.bottomMargin: 10
|
||||
Layout.leftMargin: 20
|
||||
Layout.rightMargin: 20
|
||||
Text {
|
||||
text: qsTr("Parameters")
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
color: theme.mutedDarkTextColor
|
||||
}
|
||||
Text {
|
||||
text: parameters !== "" ? parameters : qsTr("?")
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
font.bold: true
|
||||
}
|
||||
}
|
||||
Rectangle {
|
||||
width: 1
|
||||
Layout.fillHeight: true
|
||||
color: theme.dividerColor
|
||||
}
|
||||
ColumnLayout {
|
||||
Layout.topMargin: 10
|
||||
Layout.bottomMargin: 10
|
||||
Layout.leftMargin: 20
|
||||
Layout.rightMargin: 20
|
||||
Text {
|
||||
text: qsTr("Quant")
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
color: theme.mutedDarkTextColor
|
||||
}
|
||||
Text {
|
||||
text: quant
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
font.bold: true
|
||||
}
|
||||
}
|
||||
Rectangle {
|
||||
width: 1
|
||||
Layout.fillHeight: true
|
||||
color: theme.dividerColor
|
||||
}
|
||||
ColumnLayout {
|
||||
Layout.topMargin: 10
|
||||
Layout.bottomMargin: 10
|
||||
Layout.leftMargin: 20
|
||||
Layout.rightMargin: 20
|
||||
Text {
|
||||
text: qsTr("Type")
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
color: theme.mutedDarkTextColor
|
||||
}
|
||||
Text {
|
||||
text: type
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
font.bold: true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Rectangle {
|
||||
color: "transparent"
|
||||
anchors.fill: paramRow
|
||||
border.color: theme.dividerColor
|
||||
border.width: 1
|
||||
radius: 10
|
||||
}
|
||||
}
|
||||
|
||||
Rectangle {
|
||||
Layout.fillWidth: true
|
||||
height: 1
|
||||
color: theme.dividerColor
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
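AddGPT4AllModelView.qml above drives downloads through the C++ Download and ModelList objects (Download.downloadModel, the "Calculating..." hash step, and the md5sum field from models3.json). Purely as an illustrative sketch, not the app's implementation, the same download-then-verify flow could be expressed in Python like this (the helper name and the local models3.json path are assumptions):

# Illustrative sketch only; the real download/cancel/resume and hash check live
# in the C++ Download object. This mirrors the flow: fetch the file named in a
# models3.json entry, then compare its MD5 against the entry's md5sum.
import hashlib
import json
import urllib.request
from pathlib import Path
from typing import Any


def download_and_verify(entry: dict[str, Any], dest_dir: Path) -> Path:
    dest = dest_dir / entry["filename"]
    urllib.request.urlretrieve(entry["url"], dest)  # no resume support here
    md5 = hashlib.md5()
    with open(dest, "rb") as f:
        for block in iter(lambda: f.read(1 << 20), b""):
            md5.update(block)
    if md5.hexdigest() != entry["md5sum"]:
        dest.unlink()
        raise ValueError(f"MD5 mismatch for {entry['filename']}")
    return dest


if __name__ == "__main__":
    models = json.loads(Path("models3.json").read_text())
    download_and_verify(models[-1], Path("."))  # e.g. the Qwen2 entry above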
703
gpt4all-chat/qml/AddHFModelView.qml
Normal file
@ -0,0 +1,703 @@
|
||||
import QtCore
|
||||
import QtQuick
|
||||
import QtQuick.Controls
|
||||
import QtQuick.Controls.Basic
|
||||
import QtQuick.Layouts
|
||||
import QtQuick.Dialogs
|
||||
import Qt.labs.folderlistmodel
|
||||
import Qt5Compat.GraphicalEffects
|
||||
|
||||
import llm
|
||||
import chatlistmodel
|
||||
import download
|
||||
import modellist
|
||||
import network
|
||||
import gpt4all
|
||||
import mysettings
|
||||
import localdocs
|
||||
|
||||
ColumnLayout {
|
||||
Layout.fillWidth: true
|
||||
Layout.fillHeight: true
|
||||
Layout.alignment: Qt.AlignTop
|
||||
spacing: 5
|
||||
|
||||
Label {
|
||||
Layout.topMargin: 0
|
||||
Layout.bottomMargin: 25
|
||||
Layout.rightMargin: 150 * theme.fontScale
|
||||
Layout.alignment: Qt.AlignTop
|
||||
Layout.fillWidth: true
|
||||
verticalAlignment: Text.AlignTop
|
||||
text: qsTr("Use the search to find and download models from HuggingFace. There is NO GUARANTEE that these " +
|
||||
"will work. Many will require additional configuration before they can be used.")
|
||||
font.pixelSize: theme.fontSizeLarger
|
||||
color: theme.textColor
|
||||
wrapMode: Text.WordWrap
|
||||
}
|
||||
|
||||
RowLayout {
|
||||
Layout.fillWidth: true
|
||||
Layout.fillHeight: true
|
||||
Layout.alignment: Qt.AlignCenter
|
||||
Layout.margins: 0
|
||||
spacing: 10
|
||||
MyTextField {
|
||||
id: discoverField
|
||||
property string textBeingSearched: ""
|
||||
readOnly: ModelList.discoverInProgress
|
||||
Layout.alignment: Qt.AlignCenter
|
||||
Layout.fillWidth: true
|
||||
font.pixelSize: theme.fontSizeLarger
|
||||
placeholderText: qsTr("Discover and download models by keyword search...")
|
||||
Accessible.role: Accessible.EditableText
|
||||
Accessible.name: placeholderText
|
||||
Accessible.description: qsTr("Text field for discovering and filtering downloadable models")
|
||||
Connections {
|
||||
target: ModelList
|
||||
function onDiscoverInProgressChanged() {
|
||||
if (ModelList.discoverInProgress) {
|
||||
discoverField.textBeingSearched = discoverField.text;
|
||||
discoverField.text = qsTr("Searching \u00B7 %1").arg(discoverField.textBeingSearched);
|
||||
} else {
|
||||
discoverField.text = discoverField.textBeingSearched;
|
||||
discoverField.textBeingSearched = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
background: ProgressBar {
|
||||
id: discoverProgressBar
|
||||
indeterminate: ModelList.discoverInProgress && ModelList.discoverProgress === 0.0
|
||||
value: ModelList.discoverProgress
|
||||
background: Rectangle {
|
||||
color: theme.controlBackground
|
||||
border.color: theme.controlBorder
|
||||
radius: 10
|
||||
}
|
||||
contentItem: Item {
|
||||
Rectangle {
|
||||
visible: ModelList.discoverInProgress
|
||||
anchors.bottom: parent.bottom
|
||||
width: discoverProgressBar.visualPosition * parent.width
|
||||
height: 10
|
||||
radius: 2
|
||||
color: theme.progressForeground
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Keys.onReturnPressed: (event)=> {
|
||||
if (event.modifiers & Qt.ControlModifier || event.modifiers & Qt.ShiftModifier)
|
||||
event.accepted = false;
|
||||
else {
|
||||
editingFinished();
|
||||
sendDiscovery()
|
||||
}
|
||||
}
|
||||
function sendDiscovery() {
|
||||
ModelList.huggingFaceDownloadableModels.discoverAndFilter(discoverField.text);
|
||||
}
|
||||
RowLayout {
|
||||
spacing: 0
|
||||
anchors.right: discoverField.right
|
||||
anchors.verticalCenter: discoverField.verticalCenter
|
||||
anchors.rightMargin: 15
|
||||
visible: !ModelList.discoverInProgress
|
||||
MyMiniButton {
|
||||
id: clearDiscoverButton
|
||||
backgroundColor: theme.textColor
|
||||
backgroundColorHovered: theme.iconBackgroundDark
|
||||
visible: discoverField.text !== ""
|
||||
source: "qrc:/gpt4all/icons/close.svg"
|
||||
onClicked: {
|
||||
discoverField.text = ""
|
||||
discoverField.sendDiscovery() // should clear results
|
||||
}
|
||||
}
|
||||
MyMiniButton {
|
||||
backgroundColor: theme.textColor
|
||||
backgroundColorHovered: theme.iconBackgroundDark
|
||||
source: "qrc:/gpt4all/icons/settings.svg"
|
||||
onClicked: {
|
||||
discoveryTools.visible = !discoveryTools.visible
|
||||
}
|
||||
}
|
||||
MyMiniButton {
|
||||
id: sendButton
|
||||
enabled: !ModelList.discoverInProgress
|
||||
backgroundColor: theme.textColor
|
||||
backgroundColorHovered: theme.iconBackgroundDark
|
||||
source: "qrc:/gpt4all/icons/send_message.svg"
|
||||
Accessible.name: qsTr("Initiate model discovery and filtering")
|
||||
Accessible.description: qsTr("Triggers discovery and filtering of models")
|
||||
onClicked: {
|
||||
discoverField.sendDiscovery()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
RowLayout {
|
||||
id: discoveryTools
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignCenter
|
||||
Layout.margins: 0
|
||||
spacing: 20
|
||||
visible: false
|
||||
MyComboBox {
|
||||
id: comboSort
|
||||
model: ListModel {
|
||||
ListElement { name: qsTr("Default") }
|
||||
ListElement { name: qsTr("Likes") }
|
||||
ListElement { name: qsTr("Downloads") }
|
||||
ListElement { name: qsTr("Recent") }
|
||||
}
|
||||
currentIndex: ModelList.discoverSort
|
||||
contentItem: Text {
|
||||
anchors.horizontalCenter: parent.horizontalCenter
|
||||
rightPadding: 30
|
||||
color: theme.textColor
|
||||
text: {
|
||||
return qsTr("Sort by: %1").arg(comboSort.displayText)
|
||||
}
|
||||
font.pixelSize: theme.fontSizeLarger
|
||||
verticalAlignment: Text.AlignVCenter
|
||||
horizontalAlignment: Text.AlignHCenter
|
||||
elide: Text.ElideRight
|
||||
}
|
||||
onActivated: function (index) {
|
||||
ModelList.discoverSort = index;
|
||||
}
|
||||
}
|
||||
MyComboBox {
|
||||
id: comboSortDirection
|
||||
model: ListModel {
|
||||
ListElement { name: qsTr("Asc") }
|
||||
ListElement { name: qsTr("Desc") }
|
||||
}
|
||||
currentIndex: {
|
||||
if (ModelList.discoverSortDirection === 1)
|
||||
return 0
|
||||
else
|
||||
return 1;
|
||||
}
|
||||
contentItem: Text {
|
||||
anchors.horizontalCenter: parent.horizontalCenter
|
||||
rightPadding: 30
|
||||
color: theme.textColor
|
||||
text: {
|
||||
return qsTr("Sort dir: %1").arg(comboSortDirection.displayText)
|
||||
}
|
||||
font.pixelSize: theme.fontSizeLarger
|
||||
verticalAlignment: Text.AlignVCenter
|
||||
horizontalAlignment: Text.AlignHCenter
|
||||
elide: Text.ElideRight
|
||||
}
|
||||
onActivated: function (index) {
|
||||
if (index === 0)
|
||||
ModelList.discoverSortDirection = 1;
|
||||
else
|
||||
ModelList.discoverSortDirection = -1;
|
||||
}
|
||||
}
|
||||
MyComboBox {
|
||||
id: comboLimit
|
||||
model: ListModel {
|
||||
ListElement { name: "5" }
|
||||
ListElement { name: "10" }
|
||||
ListElement { name: "20" }
|
||||
ListElement { name: "50" }
|
||||
ListElement { name: "100" }
|
||||
ListElement { name: qsTr("None") }
|
||||
}
|
||||
|
||||
currentIndex: {
|
||||
if (ModelList.discoverLimit === 5)
|
||||
return 0;
|
||||
else if (ModelList.discoverLimit === 10)
|
||||
return 1;
|
||||
else if (ModelList.discoverLimit === 20)
|
||||
return 2;
|
||||
else if (ModelList.discoverLimit === 50)
|
||||
return 3;
|
||||
else if (ModelList.discoverLimit === 100)
|
||||
return 4;
|
||||
else if (ModelList.discoverLimit === -1)
|
||||
return 5;
|
||||
}
|
||||
contentItem: Text {
|
||||
anchors.horizontalCenter: parent.horizontalCenter
|
||||
rightPadding: 30
|
||||
color: theme.textColor
|
||||
text: {
|
||||
return qsTr("Limit: %1").arg(comboLimit.displayText)
|
||||
}
|
||||
font.pixelSize: theme.fontSizeLarger
|
||||
verticalAlignment: Text.AlignVCenter
|
||||
horizontalAlignment: Text.AlignHCenter
|
||||
elide: Text.ElideRight
|
||||
}
|
||||
onActivated: function (index) {
|
||||
switch (index) {
|
||||
case 0:
|
||||
ModelList.discoverLimit = 5; break;
|
||||
case 1:
|
||||
ModelList.discoverLimit = 10; break;
|
||||
case 2:
|
||||
ModelList.discoverLimit = 20; break;
|
||||
case 3:
|
||||
ModelList.discoverLimit = 50; break;
|
||||
case 4:
|
||||
ModelList.discoverLimit = 100; break;
|
||||
case 5:
|
||||
ModelList.discoverLimit = -1; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ScrollView {
|
||||
id: scrollView
|
||||
ScrollBar.vertical.policy: ScrollBar.AsNeeded
|
||||
Layout.fillWidth: true
|
||||
Layout.fillHeight: true
|
||||
clip: true
|
||||
|
||||
ListView {
|
||||
id: modelListView
|
||||
model: ModelList.huggingFaceDownloadableModels
|
||||
boundsBehavior: Flickable.StopAtBounds
|
||||
spacing: 30
|
||||
|
||||
delegate: Rectangle {
|
||||
id: delegateItem
|
||||
width: modelListView.width
|
||||
height: childrenRect.height + 60
|
||||
color: theme.conversationBackground
|
||||
radius: 10
|
||||
border.width: 1
|
||||
border.color: theme.controlBorder
|
||||
|
||||
ColumnLayout {
|
||||
anchors.top: parent.top
|
||||
anchors.left: parent.left
|
||||
anchors.right: parent.right
|
||||
anchors.margins: 30
|
||||
|
||||
Text {
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignLeft
|
||||
text: name
|
||||
elide: Text.ElideRight
|
||||
color: theme.titleTextColor
|
||||
font.pixelSize: theme.fontSizeLargest
|
||||
font.bold: true
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: qsTr("Model file")
|
||||
Accessible.description: qsTr("Model file to be downloaded")
|
||||
}
|
||||
|
||||
|
||||
Rectangle {
|
||||
Layout.fillWidth: true
|
||||
height: 1
|
||||
color: theme.dividerColor
|
||||
}
|
||||
|
||||
RowLayout {
|
||||
Layout.topMargin: 10
|
||||
Layout.fillWidth: true
|
||||
Text {
|
||||
id: descriptionText
|
||||
text: description
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
Layout.fillWidth: true
|
||||
wrapMode: Text.WordWrap
|
||||
textFormat: Text.StyledText
|
||||
color: theme.textColor
|
||||
linkColor: theme.textColor
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: qsTr("Description")
|
||||
Accessible.description: qsTr("File description")
|
||||
onLinkActivated: function(link) { Qt.openUrlExternally(link); }
|
||||
MouseArea {
|
||||
anchors.fill: parent
|
||||
acceptedButtons: Qt.NoButton // pass clicks to parent
|
||||
cursorShape: parent.hoveredLink ? Qt.PointingHandCursor : Qt.ArrowCursor
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME Need to overhaul design here which must take into account
|
||||
// features not present in current figma including:
|
||||
// * Ability to cancel a current download
|
||||
// * Ability to resume a download
|
||||
// * The presentation of an error if encountered
|
||||
// * Whether to show already installed models
|
||||
// * Install of remote models with API keys
|
||||
// * The presentation of the progress bar
|
||||
Rectangle {
|
||||
id: actionBox
|
||||
width: childrenRect.width + 20
|
||||
color: "transparent"
|
||||
border.width: 1
|
||||
border.color: theme.dividerColor
|
||||
radius: 10
|
||||
Layout.rightMargin: 20
|
||||
Layout.bottomMargin: 20
|
||||
Layout.minimumHeight: childrenRect.height + 20
|
||||
Layout.alignment: Qt.AlignRight | Qt.AlignTop
|
||||
|
||||
ColumnLayout {
|
||||
spacing: 0
|
||||
MySettingsButton {
|
||||
id: downloadButton
|
||||
text: isDownloading ? qsTr("Cancel") : isIncomplete ? qsTr("Resume") : qsTr("Download")
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
visible: !isOnline && !installed && !calcHash && downloadError === ""
|
||||
Accessible.description: qsTr("Stop/restart/start the download")
|
||||
onClicked: {
|
||||
if (!isDownloading) {
|
||||
Download.downloadModel(filename);
|
||||
} else {
|
||||
Download.cancelDownload(filename);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
MySettingsDestructiveButton {
|
||||
id: removeButton
|
||||
text: qsTr("Remove")
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
visible: !isDownloading && (installed || isIncomplete)
|
||||
Accessible.description: qsTr("Remove model from filesystem")
|
||||
onClicked: {
|
||||
Download.removeModel(filename);
|
||||
}
|
||||
}
|
||||
|
||||
MySettingsButton {
|
||||
id: installButton
|
||||
visible: !installed && isOnline
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
text: qsTr("Install")
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
onClicked: {
|
||||
var apiKeyText = apiKey.text.trim(),
|
||||
baseUrlText = baseUrl.text.trim(),
|
||||
modelNameText = modelName.text.trim();
|
||||
|
||||
var apiKeyOk = apiKeyText !== "",
|
||||
baseUrlOk = !isCompatibleApi || baseUrlText !== "",
|
||||
modelNameOk = !isCompatibleApi || modelNameText !== "";
|
||||
|
||||
if (!apiKeyOk)
|
||||
apiKey.showError();
|
||||
if (!baseUrlOk)
|
||||
baseUrl.showError();
|
||||
if (!modelNameOk)
|
||||
modelName.showError();
|
||||
|
||||
if (!apiKeyOk || !baseUrlOk || !modelNameOk)
|
||||
return;
|
||||
|
||||
if (!isCompatibleApi)
|
||||
Download.installModel(
|
||||
filename,
|
||||
apiKeyText,
|
||||
);
|
||||
else
|
||||
Download.installCompatibleModel(
|
||||
modelNameText,
|
||||
apiKeyText,
|
||||
baseUrlText,
|
||||
);
|
||||
}
|
||||
Accessible.role: Accessible.Button
|
||||
Accessible.name: qsTr("Install")
|
||||
Accessible.description: qsTr("Install online model")
|
||||
}
|
||||
|
||||
ColumnLayout {
|
||||
spacing: 0
|
||||
Label {
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
visible: downloadError !== ""
|
||||
textFormat: Text.StyledText
|
||||
text: qsTr("<strong><font size=\"1\"><a href=\"#error\">Error</a></strong></font>")
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
linkColor: theme.textErrorColor
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: text
|
||||
Accessible.description: qsTr("Describes an error that occurred when downloading")
|
||||
onLinkActivated: {
|
||||
downloadingErrorPopup.text = downloadError;
|
||||
downloadingErrorPopup.open();
|
||||
}
|
||||
}
|
||||
|
||||
Label {
|
||||
visible: LLM.systemTotalRAMInGB() < ramrequired
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.maximumWidth: 300
|
||||
textFormat: Text.StyledText
|
||||
text: qsTr("<strong><font size=\"2\">WARNING: Not recommended for your hardware. Model requires more memory (%1 GB) than your system has available (%2).</strong></font>").arg(ramrequired).arg(LLM.systemTotalRAMInGBString())
|
||||
color: theme.textErrorColor
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
wrapMode: Text.WordWrap
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: text
|
||||
Accessible.description: qsTr("Error for incompatible hardware")
|
||||
onLinkActivated: {
|
||||
downloadingErrorPopup.text = downloadError;
|
||||
downloadingErrorPopup.open();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ColumnLayout {
|
||||
visible: isDownloading && !calcHash
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
spacing: 20
|
||||
|
||||
ProgressBar {
|
||||
id: itemProgressBar
|
||||
Layout.fillWidth: true
|
||||
width: 200
|
||||
value: bytesReceived / bytesTotal
|
||||
background: Rectangle {
|
||||
implicitHeight: 45
|
||||
color: theme.progressBackground
|
||||
radius: 3
|
||||
}
|
||||
contentItem: Item {
|
||||
implicitHeight: 40
|
||||
|
||||
Rectangle {
|
||||
width: itemProgressBar.visualPosition * parent.width
|
||||
height: parent.height
|
||||
radius: 2
|
||||
color: theme.progressForeground
|
||||
}
|
||||
}
|
||||
Accessible.role: Accessible.ProgressBar
|
||||
Accessible.name: qsTr("Download progressBar")
|
||||
Accessible.description: qsTr("Shows the progress made in the download")
|
||||
}
|
||||
|
||||
Label {
|
||||
id: speedLabel
|
||||
color: theme.textColor
|
||||
Layout.alignment: Qt.AlignRight
|
||||
text: speed
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: qsTr("Download speed")
|
||||
Accessible.description: qsTr("Download speed in bytes/kilobytes/megabytes per second")
|
||||
}
|
||||
}
|
||||
|
||||
RowLayout {
|
||||
visible: calcHash
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.maximumWidth: 200
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
clip: true
|
||||
|
||||
Label {
|
||||
id: calcHashLabel
|
||||
color: theme.textColor
|
||||
text: qsTr("Calculating...")
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: text
|
||||
Accessible.description: qsTr("Whether the file hash is being calculated")
|
||||
}
|
||||
|
||||
MyBusyIndicator {
|
||||
id: busyCalcHash
|
||||
running: calcHash
|
||||
Accessible.role: Accessible.Animation
|
||||
Accessible.name: qsTr("Busy indicator")
|
||||
Accessible.description: qsTr("Displayed when the file hash is being calculated")
|
||||
}
|
||||
}
|
||||
|
||||
MyTextField {
|
||||
id: apiKey
|
||||
visible: !installed && isOnline
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
wrapMode: Text.WrapAnywhere
|
||||
function showError() {
|
||||
messageToast.show(qsTr("ERROR: $API_KEY is empty."));
|
||||
apiKey.placeholderTextColor = theme.textErrorColor;
|
||||
}
|
||||
onTextChanged: {
|
||||
apiKey.placeholderTextColor = theme.mutedTextColor;
|
||||
}
|
||||
placeholderText: qsTr("enter $API_KEY")
|
||||
Accessible.role: Accessible.EditableText
|
||||
Accessible.name: placeholderText
|
||||
Accessible.description: qsTr("Whether the file hash is being calculated")
|
||||
}
|
||||
|
||||
MyTextField {
|
||||
id: baseUrl
|
||||
visible: !installed && isOnline && isCompatibleApi
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
wrapMode: Text.WrapAnywhere
|
||||
function showError() {
|
||||
messageToast.show(qsTr("ERROR: $BASE_URL is empty."));
|
||||
baseUrl.placeholderTextColor = theme.textErrorColor;
|
||||
}
|
||||
onTextChanged: {
|
||||
baseUrl.placeholderTextColor = theme.mutedTextColor;
|
||||
}
|
||||
placeholderText: qsTr("enter $BASE_URL")
|
||||
Accessible.role: Accessible.EditableText
|
||||
Accessible.name: placeholderText
|
||||
Accessible.description: qsTr("Whether the file hash is being calculated")
|
||||
}
|
||||
|
||||
MyTextField {
|
||||
id: modelName
|
||||
visible: !installed && isOnline && isCompatibleApi
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
wrapMode: Text.WrapAnywhere
|
||||
function showError() {
|
||||
messageToast.show(qsTr("ERROR: $MODEL_NAME is empty."))
|
||||
modelName.placeholderTextColor = theme.textErrorColor;
|
||||
}
|
||||
onTextChanged: {
|
||||
modelName.placeholderTextColor = theme.mutedTextColor;
|
||||
}
|
||||
placeholderText: qsTr("enter $MODEL_NAME")
|
||||
Accessible.role: Accessible.EditableText
|
||||
Accessible.name: placeholderText
|
||||
Accessible.description: qsTr("Whether the file hash is being calculated")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Item {
|
||||
Layout.minimumWidth: childrenRect.width
|
||||
Layout.minimumHeight: childrenRect.height
|
||||
Layout.bottomMargin: 10
|
||||
RowLayout {
|
||||
id: paramRow
|
||||
anchors.centerIn: parent
|
||||
ColumnLayout {
|
||||
Layout.topMargin: 10
|
||||
Layout.bottomMargin: 10
|
||||
Layout.leftMargin: 20
|
||||
Layout.rightMargin: 20
|
||||
Text {
|
||||
text: qsTr("File size")
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
color: theme.mutedDarkTextColor
|
||||
}
|
||||
Text {
|
||||
text: filesize
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
font.bold: true
|
||||
}
|
||||
}
|
||||
Rectangle {
|
||||
width: 1
|
||||
Layout.fillHeight: true
|
||||
color: theme.dividerColor
|
||||
}
|
||||
ColumnLayout {
|
||||
Layout.topMargin: 10
|
||||
Layout.bottomMargin: 10
|
||||
Layout.leftMargin: 20
|
||||
Layout.rightMargin: 20
|
||||
Text {
|
||||
text: qsTr("Quant")
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
color: theme.mutedDarkTextColor
|
||||
}
|
||||
Text {
|
||||
text: quant
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
font.bold: true
|
||||
}
|
||||
}
|
||||
Rectangle {
|
||||
width: 1
|
||||
Layout.fillHeight: true
|
||||
color: theme.dividerColor
|
||||
}
|
||||
ColumnLayout {
|
||||
Layout.topMargin: 10
|
||||
Layout.bottomMargin: 10
|
||||
Layout.leftMargin: 20
|
||||
Layout.rightMargin: 20
|
||||
Text {
|
||||
text: qsTr("Type")
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
color: theme.mutedDarkTextColor
|
||||
}
|
||||
Text {
|
||||
text: type
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
font.bold: true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Rectangle {
|
||||
color: "transparent"
|
||||
anchors.fill: paramRow
|
||||
border.color: theme.dividerColor
|
||||
border.width: 1
|
||||
radius: 10
|
||||
}
|
||||
}
|
||||
|
||||
Rectangle {
|
||||
Layout.fillWidth: true
|
||||
height: 1
|
||||
color: theme.dividerColor
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
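AddHFModelView.qml above forwards the keyword, sort, direction, and limit controls to ModelList.huggingFaceDownloadableModels.discoverAndFilter, which is implemented in C++. As a rough illustration only, assuming the huggingface_hub Python client (which the app itself does not use), an equivalent query could look like this:

# Rough illustration only: the app's discovery runs in C++ (ModelList), not via
# this client. Shown just to make the sort/direction/limit controls concrete,
# assuming the huggingface_hub package is available.
from huggingface_hub import HfApi

api = HfApi()
results = api.list_models(
    search="gguf",      # keyword from the search field
    sort="downloads",   # "Sort by": Likes / Downloads / Recent
    direction=-1,       # "Sort dir": Desc
    limit=20,           # "Limit" combo box
)
for model in results:
    print(model.id)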
@ -42,12 +42,12 @@ Rectangle {
|
||||
anchors.top: parent.top
|
||||
anchors.bottom: parent.bottom
|
||||
anchors.margins: 30
|
||||
spacing: 30
|
||||
spacing: 10
|
||||
|
||||
ColumnLayout {
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignTop
|
||||
spacing: 30
|
||||
spacing: 10
|
||||
|
||||
MyButton {
|
||||
id: backButton
|
||||
@ -76,716 +76,80 @@ Rectangle {
|
||||
font.pixelSize: theme.fontSizeBanner
|
||||
color: theme.titleTextColor
|
||||
}
|
||||
}
|
||||
|
||||
RowLayout {
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignCenter
|
||||
Layout.margins: 0
|
||||
spacing: 10
|
||||
MyTextField {
|
||||
id: discoverField
|
||||
property string textBeingSearched: ""
|
||||
readOnly: ModelList.discoverInProgress
|
||||
Layout.alignment: Qt.AlignCenter
|
||||
Layout.fillWidth: true
|
||||
font.pixelSize: theme.fontSizeLarger
|
||||
placeholderText: qsTr("Discover and download models by keyword search...")
|
||||
Accessible.role: Accessible.EditableText
|
||||
Accessible.name: placeholderText
|
||||
Accessible.description: qsTr("Text field for discovering and filtering downloadable models")
|
||||
Connections {
|
||||
target: ModelList
|
||||
function onDiscoverInProgressChanged() {
|
||||
if (ModelList.discoverInProgress) {
|
||||
discoverField.textBeingSearched = discoverField.text;
|
||||
discoverField.text = qsTr("Searching \u00B7 %1").arg(discoverField.textBeingSearched);
|
||||
} else {
|
||||
discoverField.text = discoverField.textBeingSearched;
|
||||
discoverField.textBeingSearched = "";
|
||||
}
|
||||
}
|
||||
}
|
||||
background: ProgressBar {
|
||||
id: discoverProgressBar
|
||||
indeterminate: ModelList.discoverInProgress && ModelList.discoverProgress === 0.0
|
||||
value: ModelList.discoverProgress
|
||||
background: Rectangle {
|
||||
color: theme.controlBackground
|
||||
border.color: theme.controlBorder
|
||||
radius: 10
|
||||
}
|
||||
contentItem: Item {
|
||||
Rectangle {
|
||||
visible: ModelList.discoverInProgress
|
||||
anchors.bottom: parent.bottom
|
||||
width: discoverProgressBar.visualPosition * parent.width
|
||||
height: 10
|
||||
radius: 2
|
||||
color: theme.progressForeground
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Keys.onReturnPressed: (event)=> {
|
||||
if (event.modifiers & Qt.ControlModifier || event.modifiers & Qt.ShiftModifier)
|
||||
event.accepted = false;
|
||||
else {
|
||||
editingFinished();
|
||||
sendDiscovery()
|
||||
}
|
||||
}
|
||||
function sendDiscovery() {
|
||||
ModelList.downloadableModels.discoverAndFilter(discoverField.text);
|
||||
}
|
||||
RowLayout {
|
||||
spacing: 0
|
||||
anchors.right: discoverField.right
|
||||
anchors.verticalCenter: discoverField.verticalCenter
|
||||
anchors.rightMargin: 15
|
||||
visible: !ModelList.discoverInProgress
|
||||
MyMiniButton {
|
||||
id: clearDiscoverButton
|
||||
backgroundColor: theme.textColor
|
||||
backgroundColorHovered: theme.iconBackgroundDark
|
||||
visible: discoverField.text !== ""
|
||||
source: "qrc:/gpt4all/icons/close.svg"
|
||||
onClicked: {
|
||||
discoverField.text = ""
|
||||
discoverField.sendDiscovery() // should clear results
|
||||
}
|
||||
}
|
||||
MyMiniButton {
|
||||
backgroundColor: theme.textColor
|
||||
backgroundColorHovered: theme.iconBackgroundDark
|
||||
source: "qrc:/gpt4all/icons/settings.svg"
|
||||
onClicked: {
|
||||
discoveryTools.visible = !discoveryTools.visible
|
||||
}
|
||||
}
|
||||
MyMiniButton {
|
||||
id: sendButton
|
||||
enabled: !ModelList.discoverInProgress
|
||||
backgroundColor: theme.textColor
|
||||
backgroundColorHovered: theme.iconBackgroundDark
|
||||
source: "qrc:/gpt4all/icons/send_message.svg"
|
||||
Accessible.name: qsTr("Initiate model discovery and filtering")
|
||||
Accessible.description: qsTr("Triggers discovery and filtering of models")
|
||||
onClicked: {
|
||||
discoverField.sendDiscovery()
|
||||
}
|
||||
}
|
||||
}
|
||||
RowLayout {
|
||||
id: bar
|
||||
implicitWidth: 600
|
||||
spacing: 10
|
||||
MyTabButton {
|
||||
text: qsTr("GPT4All")
|
||||
isSelected: gpt4AllModelView.isShown()
|
||||
onPressed: {
|
||||
gpt4AllModelView.show();
|
||||
}
|
||||
}
|
||||
|
||||
RowLayout {
|
||||
id: discoveryTools
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignCenter
|
||||
Layout.margins: 0
|
||||
spacing: 20
|
||||
visible: false
|
||||
MyComboBox {
|
||||
id: comboSort
|
||||
model: [qsTr("Default"), qsTr("Likes"), qsTr("Downloads"), qsTr("Recent")]
|
||||
currentIndex: ModelList.discoverSort
|
||||
contentItem: Text {
|
||||
anchors.horizontalCenter: parent.horizontalCenter
|
||||
rightPadding: 30
|
||||
color: theme.textColor
|
||||
text: {
|
||||
return qsTr("Sort by: %1").arg(comboSort.displayText)
|
||||
}
|
||||
font.pixelSize: theme.fontSizeLarger
|
||||
verticalAlignment: Text.AlignVCenter
|
||||
horizontalAlignment: Text.AlignHCenter
|
||||
elide: Text.ElideRight
|
||||
}
|
||||
onActivated: function (index) {
|
||||
ModelList.discoverSort = index;
|
||||
}
|
||||
MyTabButton {
|
||||
text: qsTr("Remote Providers")
|
||||
isSelected: remoteModelView.isShown()
|
||||
onPressed: {
|
||||
remoteModelView.show();
|
||||
}
|
||||
MyComboBox {
|
||||
id: comboSortDirection
|
||||
model: [qsTr("Asc"), qsTr("Desc")]
|
||||
currentIndex: {
|
||||
if (ModelList.discoverSortDirection === 1)
|
||||
return 0
|
||||
else
|
||||
return 1;
|
||||
}
|
||||
contentItem: Text {
|
||||
anchors.horizontalCenter: parent.horizontalCenter
|
||||
rightPadding: 30
|
||||
color: theme.textColor
|
||||
text: {
|
||||
return qsTr("Sort dir: %1").arg(comboSortDirection.displayText)
|
||||
}
|
||||
font.pixelSize: theme.fontSizeLarger
|
||||
verticalAlignment: Text.AlignVCenter
|
||||
horizontalAlignment: Text.AlignHCenter
|
||||
elide: Text.ElideRight
|
||||
}
|
||||
onActivated: function (index) {
|
||||
if (index === 0)
|
||||
ModelList.discoverSortDirection = 1;
|
||||
else
|
||||
ModelList.discoverSortDirection = -1;
|
||||
}
|
||||
}
|
||||
MyComboBox {
|
||||
id: comboLimit
|
||||
model: ["5", "10", "20", "50", "100", qsTr("None")]
|
||||
currentIndex: {
|
||||
if (ModelList.discoverLimit === 5)
|
||||
return 0;
|
||||
else if (ModelList.discoverLimit === 10)
|
||||
return 1;
|
||||
else if (ModelList.discoverLimit === 20)
|
||||
return 2;
|
||||
else if (ModelList.discoverLimit === 50)
|
||||
return 3;
|
||||
else if (ModelList.discoverLimit === 100)
|
||||
return 4;
|
||||
else if (ModelList.discoverLimit === -1)
|
||||
return 5;
|
||||
}
|
||||
contentItem: Text {
|
||||
anchors.horizontalCenter: parent.horizontalCenter
|
||||
rightPadding: 30
|
||||
color: theme.textColor
|
||||
text: {
|
||||
return qsTr("Limit: %1").arg(comboLimit.displayText)
|
||||
}
|
||||
font.pixelSize: theme.fontSizeLarger
|
||||
verticalAlignment: Text.AlignVCenter
|
||||
horizontalAlignment: Text.AlignHCenter
|
||||
elide: Text.ElideRight
|
||||
}
|
||||
onActivated: function (index) {
|
||||
switch (index) {
|
||||
case 0:
|
||||
ModelList.discoverLimit = 5; break;
|
||||
case 1:
|
||||
ModelList.discoverLimit = 10; break;
|
||||
case 2:
|
||||
ModelList.discoverLimit = 20; break;
|
||||
case 3:
|
||||
ModelList.discoverLimit = 50; break;
|
||||
case 4:
|
||||
ModelList.discoverLimit = 100; break;
|
||||
case 5:
|
||||
ModelList.discoverLimit = -1; break;
|
||||
}
|
||||
}
|
||||
}
|
||||
MyTabButton {
|
||||
text: qsTr("HuggingFace")
|
||||
isSelected: huggingfaceModelView.isShown()
|
||||
onPressed: {
|
||||
huggingfaceModelView.show();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Label {
|
||||
visible: !ModelList.downloadableModels.count && !ModelList.asyncModelRequestOngoing
|
||||
StackLayout {
|
||||
id: stackLayout
|
||||
Layout.fillWidth: true
|
||||
Layout.fillHeight: true
|
||||
horizontalAlignment: Qt.AlignHCenter
|
||||
verticalAlignment: Qt.AlignVCenter
|
||||
text: qsTr("Network error: could not retrieve %1").arg("http://gpt4all.io/models/models3.json")
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
color: theme.mutedTextColor
|
||||
}
|
||||
|
||||
MyBusyIndicator {
|
||||
visible: !ModelList.downloadableModels.count && ModelList.asyncModelRequestOngoing
|
||||
running: ModelList.asyncModelRequestOngoing
|
||||
Accessible.role: Accessible.Animation
|
||||
Layout.alignment: Qt.AlignCenter
|
||||
Accessible.name: qsTr("Busy indicator")
|
||||
Accessible.description: qsTr("Displayed when the models request is ongoing")
|
||||
}
|
||||
AddGPT4AllModelView {
|
||||
id: gpt4AllModelView
|
||||
Layout.fillWidth: true
|
||||
Layout.fillHeight: true
|
||||
|
||||
ScrollView {
|
||||
id: scrollView
|
||||
ScrollBar.vertical.policy: ScrollBar.AsNeeded
|
||||
Layout.fillWidth: true
|
||||
Layout.fillHeight: true
|
||||
clip: true
|
||||
function show() {
|
||||
stackLayout.currentIndex = 0;
|
||||
}
|
||||
function isShown() {
|
||||
return stackLayout.currentIndex === 0;
|
||||
}
|
||||
}
|
||||
|
||||
ListView {
|
||||
id: modelListView
|
||||
model: ModelList.downloadableModels
|
||||
boundsBehavior: Flickable.StopAtBounds
|
||||
spacing: 30
|
||||
AddRemoteModelView {
|
||||
id: remoteModelView
|
||||
Layout.fillWidth: true
|
||||
Layout.fillHeight: true
|
||||
|
||||
delegate: Rectangle {
|
||||
id: delegateItem
|
||||
width: modelListView.width
|
||||
height: childrenRect.height + 60
|
||||
color: theme.conversationBackground
|
||||
radius: 10
|
||||
border.width: 1
|
||||
border.color: theme.controlBorder
|
||||
function show() {
|
||||
stackLayout.currentIndex = 1;
|
||||
}
|
||||
function isShown() {
|
||||
return stackLayout.currentIndex === 1;
|
||||
}
|
||||
}
|
||||
|
||||
ColumnLayout {
|
||||
anchors.top: parent.top
|
||||
anchors.left: parent.left
|
||||
anchors.right: parent.right
|
||||
anchors.margins: 30
|
||||
AddHFModelView {
|
||||
id: huggingfaceModelView
|
||||
Layout.fillWidth: true
|
||||
Layout.fillHeight: true
|
||||
// FIXME: This generates a warning and should not be used inside a layout, but without
|
||||
// it the text field inside this qml does not display at full width so it looks like
|
||||
// a bug in stacklayout
|
||||
anchors.fill: parent
|
||||
|
||||
Text {
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignLeft
|
||||
text: name
|
||||
elide: Text.ElideRight
|
||||
color: theme.titleTextColor
|
||||
font.pixelSize: theme.fontSizeLargest
|
||||
font.bold: true
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: qsTr("Model file")
|
||||
Accessible.description: qsTr("Model file to be downloaded")
|
||||
}
|
||||
|
||||
|
||||
Rectangle {
|
||||
Layout.fillWidth: true
|
||||
height: 1
|
||||
color: theme.dividerColor
|
||||
}
|
||||
|
||||
RowLayout {
|
||||
Layout.topMargin: 10
|
||||
Layout.fillWidth: true
|
||||
Text {
|
||||
id: descriptionText
|
||||
text: description
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
Layout.fillWidth: true
|
||||
wrapMode: Text.WordWrap
|
||||
textFormat: Text.StyledText
|
||||
color: theme.textColor
|
||||
linkColor: theme.textColor
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: qsTr("Description")
|
||||
Accessible.description: qsTr("File description")
|
||||
onLinkActivated: function(link) { Qt.openUrlExternally(link); }
|
||||
MouseArea {
|
||||
anchors.fill: parent
|
||||
acceptedButtons: Qt.NoButton // pass clicks to parent
|
||||
cursorShape: parent.hoveredLink ? Qt.PointingHandCursor : Qt.ArrowCursor
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME Need to overhaul design here which must take into account
|
||||
// features not present in current figma including:
|
||||
// * Ability to cancel a current download
|
||||
// * Ability to resume a download
|
||||
// * The presentation of an error if encountered
|
||||
// * Whether to show already installed models
|
||||
// * Install of remote models with API keys
|
||||
// * The presentation of the progress bar
|
||||
Rectangle {
|
||||
id: actionBox
|
||||
width: childrenRect.width + 20
|
||||
color: "transparent"
|
||||
border.width: 1
|
||||
border.color: theme.dividerColor
|
||||
radius: 10
|
||||
Layout.rightMargin: 20
|
||||
Layout.bottomMargin: 20
|
||||
Layout.minimumHeight: childrenRect.height + 20
|
||||
Layout.alignment: Qt.AlignRight | Qt.AlignTop
|
||||
|
||||
ColumnLayout {
|
||||
spacing: 0
|
||||
MySettingsButton {
|
||||
id: downloadButton
|
||||
text: isDownloading ? qsTr("Cancel") : isIncomplete ? qsTr("Resume") : qsTr("Download")
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
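// Shown only for non-API models that are not yet installed, are not verifying a hash, and have no download error.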
visible: !isOnline && !installed && !calcHash && downloadError === ""
|
||||
Accessible.description: qsTr("Stop/restart/start the download")
|
||||
onClicked: {
|
||||
if (!isDownloading) {
|
||||
Download.downloadModel(filename);
|
||||
} else {
|
||||
Download.cancelDownload(filename);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
MySettingsDestructiveButton {
|
||||
id: removeButton
|
||||
text: qsTr("Remove")
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
visible: !isDownloading && (installed || isIncomplete)
|
||||
Accessible.description: qsTr("Remove model from filesystem")
|
||||
onClicked: {
|
||||
Download.removeModel(filename);
|
||||
}
|
||||
}
|
||||
|
||||
MySettingsButton {
|
||||
id: installButton
|
||||
visible: !installed && isOnline
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
text: qsTr("Install")
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
onClicked: {
|
||||
var apiKeyText = apiKey.text.trim(),
|
||||
baseUrlText = baseUrl.text.trim(),
|
||||
modelNameText = modelName.text.trim();
|
||||
|
||||
var apiKeyOk = apiKeyText !== "",
|
||||
baseUrlOk = !isCompatibleApi || baseUrlText !== "",
|
||||
modelNameOk = !isCompatibleApi || modelNameText !== "";
|
||||
|
||||
if (!apiKeyOk)
|
||||
apiKey.showError();
|
||||
if (!baseUrlOk)
|
||||
baseUrl.showError();
|
||||
if (!modelNameOk)
|
||||
modelName.showError();
|
||||
|
||||
if (!apiKeyOk || !baseUrlOk || !modelNameOk)
|
||||
return;
|
||||
|
||||
if (!isCompatibleApi)
|
||||
Download.installModel(
|
||||
filename,
|
||||
apiKeyText,
|
||||
);
|
||||
else
|
||||
Download.installCompatibleModel(
|
||||
modelNameText,
|
||||
apiKeyText,
|
||||
baseUrlText,
|
||||
);
|
||||
}
|
||||
Accessible.role: Accessible.Button
|
||||
Accessible.name: qsTr("Install")
|
||||
Accessible.description: qsTr("Install online model")
|
||||
}
|
||||
|
||||
ColumnLayout {
|
||||
spacing: 0
|
||||
Label {
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
visible: downloadError !== ""
|
||||
textFormat: Text.StyledText
|
||||
text: qsTr("<strong><font size=\"1\"><a href=\"#error\">Error</a></strong></font>")
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
linkColor: theme.textErrorColor
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: text
|
||||
Accessible.description: qsTr("Describes an error that occurred when downloading")
|
||||
onLinkActivated: {
|
||||
downloadingErrorPopup.text = downloadError;
|
||||
downloadingErrorPopup.open();
|
||||
}
|
||||
}
|
||||
|
||||
Label {
|
||||
visible: LLM.systemTotalRAMInGB() < ramrequired
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.maximumWidth: 300
|
||||
textFormat: Text.StyledText
|
||||
text: qsTr("<strong><font size=\"2\">WARNING: Not recommended for your hardware. Model requires more memory (%1 GB) than your system has available (%2).</strong></font>").arg(ramrequired).arg(LLM.systemTotalRAMInGBString())
|
||||
color: theme.textErrorColor
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
wrapMode: Text.WordWrap
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: text
|
||||
Accessible.description: qsTr("Error for incompatible hardware")
|
||||
onLinkActivated: {
|
||||
downloadingErrorPopup.text = downloadError;
|
||||
downloadingErrorPopup.open();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ColumnLayout {
|
||||
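// Progress area: visible while the file is downloading, hidden once hash verification (calcHash) starts.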
visible: isDownloading && !calcHash
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
spacing: 20
|
||||
|
||||
ProgressBar {
|
||||
id: itemProgressBar
|
||||
Layout.fillWidth: true
|
||||
width: 200
|
||||
value: bytesReceived / bytesTotal
|
||||
background: Rectangle {
|
||||
implicitHeight: 45
|
||||
color: theme.progressBackground
|
||||
radius: 3
|
||||
}
|
||||
contentItem: Item {
|
||||
implicitHeight: 40
|
||||
|
||||
Rectangle {
|
||||
width: itemProgressBar.visualPosition * parent.width
|
||||
height: parent.height
|
||||
radius: 2
|
||||
color: theme.progressForeground
|
||||
}
|
||||
}
|
||||
Accessible.role: Accessible.ProgressBar
|
||||
Accessible.name: qsTr("Download progressBar")
|
||||
Accessible.description: qsTr("Shows the progress made in the download")
|
||||
}
|
||||
|
||||
Label {
|
||||
id: speedLabel
|
||||
color: theme.textColor
|
||||
Layout.alignment: Qt.AlignRight
|
||||
text: speed
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: qsTr("Download speed")
|
||||
Accessible.description: qsTr("Download speed in bytes/kilobytes/megabytes per second")
|
||||
}
|
||||
}
|
||||
|
||||
RowLayout {
|
||||
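// Hash-verification area: shown while the downloaded file's hash is being calculated.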
visible: calcHash
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.maximumWidth: 200
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
clip: true
|
||||
|
||||
Label {
|
||||
id: calcHashLabel
|
||||
color: theme.textColor
|
||||
text: qsTr("Calculating...")
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: text
|
||||
Accessible.description: qsTr("Whether the file hash is being calculated")
|
||||
}
|
||||
|
||||
MyBusyIndicator {
|
||||
id: busyCalcHash
|
||||
running: calcHash
|
||||
Accessible.role: Accessible.Animation
|
||||
Accessible.name: qsTr("Busy indicator")
|
||||
Accessible.description: qsTr("Displayed when the file hash is being calculated")
|
||||
}
|
||||
}
|
||||
|
||||
MyTextField {
|
||||
id: apiKey
|
||||
visible: !installed && isOnline
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
wrapMode: Text.WrapAnywhere
|
||||
function showError() {
|
||||
messageToast.show(qsTr("ERROR: $API_KEY is empty."));
|
||||
apiKey.placeholderTextColor = theme.textErrorColor;
|
||||
}
|
||||
onTextChanged: {
|
||||
apiKey.placeholderTextColor = theme.mutedTextColor;
|
||||
}
|
||||
placeholderText: qsTr("enter $API_KEY")
|
||||
Accessible.role: Accessible.EditableText
|
||||
Accessible.name: placeholderText
|
||||
Accessible.description: qsTr("Field for entering the $API_KEY")
|
||||
}
|
||||
|
||||
MyTextField {
|
||||
id: baseUrl
|
||||
visible: !installed && isOnline && isCompatibleApi
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
wrapMode: Text.WrapAnywhere
|
||||
function showError() {
|
||||
messageToast.show(qsTr("ERROR: $BASE_URL is empty."));
|
||||
baseUrl.placeholderTextColor = theme.textErrorColor;
|
||||
}
|
||||
onTextChanged: {
|
||||
baseUrl.placeholderTextColor = theme.mutedTextColor;
|
||||
}
|
||||
placeholderText: qsTr("enter $BASE_URL")
|
||||
Accessible.role: Accessible.EditableText
|
||||
Accessible.name: placeholderText
|
||||
Accessible.description: qsTr("Field for entering the $BASE_URL")
|
||||
}
|
||||
|
||||
MyTextField {
|
||||
id: modelName
|
||||
visible: !installed && isOnline && isCompatibleApi
|
||||
Layout.topMargin: 20
|
||||
Layout.leftMargin: 20
|
||||
Layout.minimumWidth: 200
|
||||
Layout.alignment: Qt.AlignTop | Qt.AlignHCenter
|
||||
wrapMode: Text.WrapAnywhere
|
||||
function showError() {
|
||||
messageToast.show(qsTr("ERROR: $MODEL_NAME is empty."))
|
||||
modelName.placeholderTextColor = theme.textErrorColor;
|
||||
}
|
||||
onTextChanged: {
|
||||
modelName.placeholderTextColor = theme.mutedTextColor;
|
||||
}
|
||||
placeholderText: qsTr("enter $MODEL_NAME")
|
||||
Accessible.role: Accessible.EditableText
|
||||
Accessible.name: placeholderText
|
||||
Accessible.description: qsTr("Field for entering the $MODEL_NAME")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Item {
|
||||
Layout.minimumWidth: childrenRect.width
|
||||
Layout.minimumHeight: childrenRect.height
|
||||
Layout.bottomMargin: 10
|
||||
RowLayout {
|
||||
id: paramRow
|
||||
anchors.centerIn: parent
|
||||
ColumnLayout {
|
||||
Layout.topMargin: 10
|
||||
Layout.bottomMargin: 10
|
||||
Layout.leftMargin: 20
|
||||
Layout.rightMargin: 20
|
||||
Text {
|
||||
text: qsTr("File size")
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
color: theme.mutedDarkTextColor
|
||||
}
|
||||
Text {
|
||||
text: filesize
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
font.bold: true
|
||||
}
|
||||
}
|
||||
Rectangle {
|
||||
width: 1
|
||||
Layout.fillHeight: true
|
||||
color: theme.dividerColor
|
||||
}
|
||||
ColumnLayout {
|
||||
Layout.topMargin: 10
|
||||
Layout.bottomMargin: 10
|
||||
Layout.leftMargin: 20
|
||||
Layout.rightMargin: 20
|
||||
Text {
|
||||
text: qsTr("RAM required")
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
color: theme.mutedDarkTextColor
|
||||
}
|
||||
Text {
|
||||
text: ramrequired >= 0 ? qsTr("%1 GB").arg(ramrequired) : qsTr("?")
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
font.bold: true
|
||||
}
|
||||
}
|
||||
Rectangle {
|
||||
width: 1
|
||||
Layout.fillHeight: true
|
||||
color: theme.dividerColor
|
||||
}
|
||||
ColumnLayout {
|
||||
Layout.topMargin: 10
|
||||
Layout.bottomMargin: 10
|
||||
Layout.leftMargin: 20
|
||||
Layout.rightMargin: 20
|
||||
Text {
|
||||
text: qsTr("Parameters")
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
color: theme.mutedDarkTextColor
|
||||
}
|
||||
Text {
|
||||
text: parameters !== "" ? parameters : qsTr("?")
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
font.bold: true
|
||||
}
|
||||
}
|
||||
Rectangle {
|
||||
width: 1
|
||||
Layout.fillHeight: true
|
||||
color: theme.dividerColor
|
||||
}
|
||||
ColumnLayout {
|
||||
Layout.topMargin: 10
|
||||
Layout.bottomMargin: 10
|
||||
Layout.leftMargin: 20
|
||||
Layout.rightMargin: 20
|
||||
Text {
|
||||
text: qsTr("Quant")
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
color: theme.mutedDarkTextColor
|
||||
}
|
||||
Text {
|
||||
text: quant
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
font.bold: true
|
||||
}
|
||||
}
|
||||
Rectangle {
|
||||
width: 1
|
||||
Layout.fillHeight: true
|
||||
color: theme.dividerColor
|
||||
}
|
||||
ColumnLayout {
|
||||
Layout.topMargin: 10
|
||||
Layout.bottomMargin: 10
|
||||
Layout.leftMargin: 20
|
||||
Layout.rightMargin: 20
|
||||
Text {
|
||||
text: qsTr("Type")
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
color: theme.mutedDarkTextColor
|
||||
}
|
||||
Text {
|
||||
text: type
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
font.bold: true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Rectangle {
|
||||
color: "transparent"
|
||||
anchors.fill: paramRow
|
||||
border.color: theme.dividerColor
|
||||
border.width: 1
|
||||
radius: 10
|
||||
}
|
||||
}
|
||||
|
||||
Rectangle {
|
||||
Layout.fillWidth: true
|
||||
height: 1
|
||||
color: theme.dividerColor
|
||||
}
|
||||
}
|
||||
function show() {
|
||||
stackLayout.currentIndex = 2;
|
||||
}
|
||||
function isShown() {
|
||||
return stackLayout.currentIndex === 2;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
gpt4all-chat/qml/AddRemoteModelView.qml (new file, 147 lines)
@@ -0,0 +1,147 @@
import QtCore
import QtQuick
import QtQuick.Controls
import QtQuick.Controls.Basic
import QtQuick.Layouts
import QtQuick.Dialogs
import Qt.labs.folderlistmodel
import Qt5Compat.GraphicalEffects

import llm
import chatlistmodel
import download
import modellist
import network
import gpt4all
import mysettings
import localdocs

ColumnLayout {
    Layout.fillWidth: true
    Layout.alignment: Qt.AlignTop
    spacing: 5
|
||||
Label {
|
||||
Layout.topMargin: 0
|
||||
Layout.bottomMargin: 25
|
||||
Layout.rightMargin: 150 * theme.fontScale
|
||||
Layout.alignment: Qt.AlignTop
|
||||
Layout.fillWidth: true
|
||||
verticalAlignment: Text.AlignTop
|
||||
text: qsTr("Various remote model providers that use network resources for inference.")
|
||||
font.pixelSize: theme.fontSizeLarger
|
||||
color: theme.textColor
|
||||
wrapMode: Text.WordWrap
|
||||
}
|
||||
|
||||
ScrollView {
|
||||
id: scrollView
|
||||
ScrollBar.vertical.policy: ScrollBar.AsNeeded
|
||||
Layout.fillWidth: true
|
||||
Layout.fillHeight: true
|
||||
contentWidth: availableWidth
|
||||
clip: true
|
||||
Flow {
|
||||
anchors.left: parent.left
|
||||
anchors.right: parent.right
|
||||
spacing: 20
|
||||
bottomPadding: 20
|
||||
property int childWidth: 330 * theme.fontScale
|
||||
property int childHeight: 400 + 166 * theme.fontScale
|
||||
RemoteModelCard {
|
||||
width: parent.childWidth
|
||||
height: parent.childHeight
|
||||
providerBaseUrl: "https://api.groq.com/openai/v1/"
|
||||
providerName: qsTr("Groq")
|
||||
providerImage: "qrc:/gpt4all/icons/groq.svg"
|
||||
providerDesc: qsTr('Groq offers a high-performance AI inference engine designed for low-latency and efficient processing. Optimized for real-time applications, Groq’s technology is ideal for users who need fast responses from open large language models and other AI workloads.<br><br>Get your API key: <a href="https://console.groq.com/keys">https://groq.com/</a>')
|
||||
modelWhitelist: [
|
||||
// last updated 2025-02-24
|
||||
"deepseek-r1-distill-llama-70b",
|
||||
"deepseek-r1-distill-qwen-32b",
|
||||
"gemma2-9b-it",
|
||||
"llama-3.1-8b-instant",
|
||||
"llama-3.2-1b-preview",
|
||||
"llama-3.2-3b-preview",
|
||||
"llama-3.3-70b-specdec",
|
||||
"llama-3.3-70b-versatile",
|
||||
"llama3-70b-8192",
|
||||
"llama3-8b-8192",
|
||||
"mixtral-8x7b-32768",
|
||||
"qwen-2.5-32b",
|
||||
"qwen-2.5-coder-32b",
|
||||
]
|
||||
}
|
||||
RemoteModelCard {
|
||||
width: parent.childWidth
|
||||
height: parent.childHeight
|
||||
providerBaseUrl: "https://api.openai.com/v1/"
|
||||
providerName: qsTr("OpenAI")
|
||||
providerImage: "qrc:/gpt4all/icons/openai.svg"
|
||||
providerDesc: qsTr('OpenAI provides access to advanced AI models, including GPT-4 supporting a wide range of applications, from conversational AI to content generation and code completion.<br><br>Get your API key: <a href="https://platform.openai.com/signup">https://openai.com/</a>')
|
||||
modelWhitelist: [
|
||||
// last updated 2025-02-24
|
||||
"gpt-3.5-turbo",
|
||||
"gpt-3.5-turbo-16k",
|
||||
"gpt-4",
|
||||
"gpt-4-32k",
|
||||
"gpt-4-turbo",
|
||||
"gpt-4o",
|
||||
]
|
||||
}
|
||||
RemoteModelCard {
|
||||
width: parent.childWidth
|
||||
height: parent.childHeight
|
||||
providerBaseUrl: "https://api.mistral.ai/v1/"
|
||||
providerName: qsTr("Mistral")
|
||||
providerImage: "qrc:/gpt4all/icons/mistral.svg"
|
||||
providerDesc: qsTr('Mistral AI specializes in efficient, open-weight language models optimized for various natural language processing tasks. Their models are designed for flexibility and performance, making them a solid option for applications requiring scalable AI solutions.<br><br>Get your API key: <a href="https://mistral.ai/">https://mistral.ai/</a>')
|
||||
modelWhitelist: [
|
||||
// last updated 2025-02-24
|
||||
"codestral-2405",
|
||||
"codestral-2411-rc5",
|
||||
"codestral-2412",
|
||||
"codestral-2501",
|
||||
"codestral-latest",
|
||||
"codestral-mamba-2407",
|
||||
"codestral-mamba-latest",
|
||||
"ministral-3b-2410",
|
||||
"ministral-3b-latest",
|
||||
"ministral-8b-2410",
|
||||
"ministral-8b-latest",
|
||||
"mistral-large-2402",
|
||||
"mistral-large-2407",
|
||||
"mistral-large-2411",
|
||||
"mistral-large-latest",
|
||||
"mistral-medium-2312",
|
||||
"mistral-medium-latest",
|
||||
"mistral-saba-2502",
|
||||
"mistral-saba-latest",
|
||||
"mistral-small-2312",
|
||||
"mistral-small-2402",
|
||||
"mistral-small-2409",
|
||||
"mistral-small-2501",
|
||||
"mistral-small-latest",
|
||||
"mistral-tiny-2312",
|
||||
"mistral-tiny-2407",
|
||||
"mistral-tiny-latest",
|
||||
"open-codestral-mamba",
|
||||
"open-mistral-7b",
|
||||
"open-mistral-nemo",
|
||||
"open-mistral-nemo-2407",
|
||||
"open-mixtral-8x22b",
|
||||
"open-mixtral-8x22b-2404",
|
||||
"open-mixtral-8x7b",
|
||||
]
|
||||
}
|
||||
RemoteModelCard {
|
||||
width: parent.childWidth
|
||||
height: parent.childHeight
|
||||
providerIsCustom: true
|
||||
providerName: qsTr("Custom")
|
||||
providerImage: "qrc:/gpt4all/icons/antenna_3.svg"
|
||||
providerDesc: qsTr("The custom provider option allows users to connect their own OpenAI-compatible AI models or third-party inference services. This is useful for organizations with proprietary models or those leveraging niche AI providers not listed here.")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@@ -10,7 +10,7 @@ import network
|
||||
import llm
|
||||
|
||||
MySettingsTab {
|
||||
onRestoreDefaultsClicked: {
|
||||
onRestoreDefaults: {
|
||||
MySettings.restoreApplicationDefaults();
|
||||
}
|
||||
title: qsTr("Application")
|
||||
@@ -32,15 +32,15 @@ MySettingsTab {
|
||||
anchors.centerIn: parent
|
||||
modal: false
|
||||
padding: 20
|
||||
width: 40 + 400 * theme.fontScale
|
||||
Text {
|
||||
anchors.fill: parent
|
||||
horizontalAlignment: Text.AlignJustify
|
||||
text: qsTr("ERROR: Update system could not find the MaintenanceTool used<br>
|
||||
to check for updates!<br><br>
|
||||
Did you install this application using the online installer? If so,<br>
|
||||
the MaintenanceTool executable should be located one directory<br>
|
||||
above where this application resides on your filesystem.<br><br>
|
||||
If you can't start it manually, then I'm afraid you'll have to<br>
|
||||
reinstall.")
|
||||
text: qsTr("ERROR: Update system could not find the MaintenanceTool used to check for updates!<br/><br/>"
|
||||
+ "Did you install this application using the online installer? If so, the MaintenanceTool "
|
||||
+ "executable should be located one directory above where this application resides on your "
|
||||
+ "filesystem.<br/><br/>If you can't start it manually, then I'm afraid you'll have to reinstall.")
|
||||
wrapMode: Text.WordWrap
|
||||
color: theme.textErrorColor
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
Accessible.role: Accessible.Dialog
|
||||
@@ -108,7 +108,11 @@ MySettingsTab {
|
||||
Layout.fillWidth: false
|
||||
Layout.alignment: Qt.AlignRight
|
||||
// NOTE: indices match values of ChatTheme enum, keep them in sync
|
||||
model: [qsTr("Light"), qsTr("Dark"), qsTr("LegacyDark")]
|
||||
model: ListModel {
|
||||
ListElement { name: qsTr("Light") }
|
||||
ListElement { name: qsTr("Dark") }
|
||||
ListElement { name: qsTr("LegacyDark") }
|
||||
}
|
||||
Accessible.name: themeLabel.text
|
||||
Accessible.description: themeLabel.helpText
|
||||
function updateModel() {
|
||||
@@ -143,7 +147,11 @@ MySettingsTab {
|
||||
Layout.fillWidth: false
|
||||
Layout.alignment: Qt.AlignRight
|
||||
// NOTE: indices match values of FontSize enum, keep them in sync
|
||||
model: [qsTr("Small"), qsTr("Medium"), qsTr("Large")]
|
||||
model: ListModel {
|
||||
ListElement { name: qsTr("Small") }
|
||||
ListElement { name: qsTr("Medium") }
|
||||
ListElement { name: qsTr("Large") }
|
||||
}
|
||||
Accessible.name: fontLabel.text
|
||||
Accessible.description: fontLabel.helpText
|
||||
function updateModel() {
|
||||
@@ -313,6 +321,12 @@ MySettingsTab {
|
||||
defaultModelBox.updateModel()
|
||||
}
|
||||
}
|
||||
Connections {
|
||||
target: MySettings
|
||||
function onLanguageAndLocaleChanged() {
|
||||
defaultModelBox.rebuildModel()
|
||||
}
|
||||
}
|
||||
Connections {
|
||||
target: ModelList
|
||||
function onSelectableModelListChanged() {
|
||||
@@ -335,7 +349,11 @@ MySettingsTab {
|
||||
Layout.maximumWidth: 400
|
||||
Layout.alignment: Qt.AlignRight
|
||||
// NOTE: indices match values of SuggestionMode enum, keep them in sync
|
||||
model: [ qsTr("When chatting with LocalDocs"), qsTr("Whenever possible"), qsTr("Never") ]
|
||||
model: ListModel {
|
||||
ListElement { name: qsTr("When chatting with LocalDocs") }
|
||||
ListElement { name: qsTr("Whenever possible") }
|
||||
ListElement { name: qsTr("Never") }
|
||||
}
|
||||
Accessible.name: suggestionModeLabel.text
|
||||
Accessible.description: suggestionModeLabel.helpText
|
||||
onActivated: {
|
||||
@@ -376,11 +394,14 @@ MySettingsTab {
|
||||
}
|
||||
}
|
||||
}
|
||||
MyFolderDialog {
|
||||
id: folderDialog
|
||||
}
|
||||
MySettingsButton {
|
||||
text: qsTr("Browse")
|
||||
Accessible.description: qsTr("Choose where to save model files")
|
||||
onClicked: {
|
||||
openFolderDialog("file://" + MySettings.modelPath, function(selectedFolder) {
|
||||
folderDialog.openFolderDialog("file://" + MySettings.modelPath, function(selectedFolder) {
|
||||
MySettings.modelPath = selectedFolder
|
||||
})
|
||||
}
|
||||
@@ -466,32 +487,32 @@ MySettingsTab {
|
||||
Accessible.description: ToolTip.text
|
||||
}
|
||||
MySettingsLabel {
|
||||
id: saveChatsContextLabel
|
||||
text: qsTr("Save Chat Context")
|
||||
helpText: qsTr("Save the chat model's state to disk for faster loading. WARNING: Uses ~2GB per chat.")
|
||||
Layout.row: 12
|
||||
id: trayLabel
|
||||
text: qsTr("Enable System Tray")
|
||||
helpText: qsTr("The application will minimize to the system tray when the window is closed.")
|
||||
Layout.row: 13
|
||||
Layout.column: 0
|
||||
}
|
||||
MyCheckBox {
|
||||
id: saveChatsContextBox
|
||||
Layout.row: 12
|
||||
id: trayBox
|
||||
Layout.row: 13
|
||||
Layout.column: 2
|
||||
Layout.alignment: Qt.AlignRight
|
||||
checked: MySettings.saveChatsContext
|
||||
checked: MySettings.systemTray
|
||||
onClicked: {
|
||||
MySettings.saveChatsContext = !MySettings.saveChatsContext
|
||||
MySettings.systemTray = !MySettings.systemTray
|
||||
}
|
||||
}
|
||||
MySettingsLabel {
|
||||
id: serverChatLabel
|
||||
text: qsTr("Enable Local Server")
|
||||
text: qsTr("Enable Local API Server")
|
||||
helpText: qsTr("Expose an OpenAI-Compatible server to localhost. WARNING: Results in increased resource usage.")
|
||||
Layout.row: 13
|
||||
Layout.row: 14
|
||||
Layout.column: 0
|
||||
}
|
||||
MyCheckBox {
|
||||
id: serverChatBox
|
||||
Layout.row: 13
|
||||
Layout.row: 14
|
||||
Layout.column: 2
|
||||
Layout.alignment: Qt.AlignRight
|
||||
checked: MySettings.serverChat
|
||||
@@ -503,7 +524,7 @@ MySettingsTab {
|
||||
id: serverPortLabel
|
||||
text: qsTr("API Server Port")
|
||||
helpText: qsTr("The port to use for the local server. Requires restart.")
|
||||
Layout.row: 14
|
||||
Layout.row: 15
|
||||
Layout.column: 0
|
||||
}
|
||||
MyTextField {
|
||||
@@ -511,7 +532,7 @@ MySettingsTab {
|
||||
text: MySettings.networkPort
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
Layout.row: 14
|
||||
Layout.row: 15
|
||||
Layout.column: 2
|
||||
Layout.minimumWidth: 200
|
||||
Layout.maximumWidth: 200
|
||||
@@ -556,12 +577,12 @@ MySettingsTab {
|
||||
id: updatesLabel
|
||||
text: qsTr("Check For Updates")
|
||||
helpText: qsTr("Manually check for an update to GPT4All.");
|
||||
Layout.row: 15
|
||||
Layout.row: 16
|
||||
Layout.column: 0
|
||||
}
|
||||
|
||||
MySettingsButton {
|
||||
Layout.row: 15
|
||||
Layout.row: 16
|
||||
Layout.column: 2
|
||||
Layout.alignment: Qt.AlignRight
|
||||
text: qsTr("Updates");
|
||||
@@ -572,7 +593,7 @@ MySettingsTab {
|
||||
}
|
||||
|
||||
Rectangle {
|
||||
Layout.row: 16
|
||||
Layout.row: 17
|
||||
Layout.column: 0
|
||||
Layout.columnSpan: 3
|
||||
Layout.fillWidth: true
|
||||
|
gpt4all-chat/qml/ChatCollapsibleItem.qml (new file, 166 lines)
@@ -0,0 +1,166 @@
|
||||
import Qt5Compat.GraphicalEffects
|
||||
import QtCore
|
||||
import QtQuick
|
||||
import QtQuick.Controls
|
||||
import QtQuick.Controls.Basic
|
||||
import QtQuick.Layouts
|
||||
|
||||
import gpt4all
|
||||
import mysettings
|
||||
import toolenums
|
||||
|
||||
ColumnLayout {
|
||||
property alias textContent: innerTextItem.textContent
|
||||
property bool isCurrent: false
|
||||
property bool isError: false
|
||||
property bool isThinking: false
|
||||
property int thinkingTime: 0
|
||||
|
||||
Layout.topMargin: 10
|
||||
Layout.bottomMargin: 10
|
||||
|
||||
Item {
|
||||
Layout.preferredWidth: childrenRect.width
|
||||
Layout.preferredHeight: 38
|
||||
RowLayout {
|
||||
anchors.left: parent.left
|
||||
anchors.top: parent.top
|
||||
anchors.bottom: parent.bottom
|
||||
|
||||
Item {
|
||||
Layout.preferredWidth: myTextArea.implicitWidth
|
||||
Layout.preferredHeight: myTextArea.implicitHeight
|
||||
TextArea {
|
||||
id: myTextArea
|
||||
text: {
|
||||
if (isError)
|
||||
return qsTr("Analysis encountered error");
|
||||
if (isCurrent)
|
||||
return isThinking ? qsTr("Thinking") : qsTr("Analyzing");
|
||||
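// thinkingTime is presumably in milliseconds; it is rounded up to whole seconds for display.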
return isThinking
|
||||
? qsTr("Thought for %1 %2")
|
||||
.arg(Math.ceil(thinkingTime / 1000.0))
|
||||
.arg(Math.ceil(thinkingTime / 1000.0) === 1 ? qsTr("second") : qsTr("seconds"))
|
||||
: qsTr("Analyzed");
|
||||
}
|
||||
padding: 0
|
||||
font.pixelSize: theme.fontSizeLarger
|
||||
enabled: false
|
||||
focus: false
|
||||
readOnly: true
|
||||
color: headerMA.containsMouse ? theme.mutedDarkTextColorHovered : theme.mutedTextColor
|
||||
hoverEnabled: false
|
||||
}
|
||||
|
||||
Item {
|
||||
id: textColorOverlay
|
||||
anchors.fill: parent
|
||||
clip: true
|
||||
visible: false
|
||||
Rectangle {
|
||||
id: animationRec
|
||||
width: myTextArea.width * 0.3
|
||||
anchors.top: parent.top
|
||||
anchors.bottom: parent.bottom
|
||||
color: theme.textColor
|
||||
|
||||
SequentialAnimation {
|
||||
running: isCurrent
|
||||
loops: Animation.Infinite
|
||||
NumberAnimation {
|
||||
target: animationRec;
|
||||
property: "x";
|
||||
from: -animationRec.width;
|
||||
to: myTextArea.width * 3;
|
||||
duration: 2000
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
OpacityMask {
|
||||
visible: isCurrent
|
||||
anchors.fill: parent
|
||||
maskSource: myTextArea
|
||||
source: textColorOverlay
|
||||
}
|
||||
}
|
||||
|
||||
Item {
|
||||
id: caret
|
||||
Layout.preferredWidth: contentCaret.width
|
||||
Layout.preferredHeight: contentCaret.height
|
||||
Image {
|
||||
id: contentCaret
|
||||
anchors.centerIn: parent
|
||||
visible: false
|
||||
sourceSize.width: theme.fontSizeLarge
|
||||
sourceSize.height: theme.fontSizeLarge
|
||||
mipmap: true
|
||||
source: {
|
||||
if (contentLayout.state === "collapsed")
|
||||
return "qrc:/gpt4all/icons/caret_right.svg";
|
||||
else
|
||||
return "qrc:/gpt4all/icons/caret_down.svg";
|
||||
}
|
||||
}
|
||||
|
||||
ColorOverlay {
|
||||
anchors.fill: contentCaret
|
||||
source: contentCaret
|
||||
color: headerMA.containsMouse ? theme.mutedDarkTextColorHovered : theme.mutedTextColor
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
MouseArea {
|
||||
id: headerMA
|
||||
hoverEnabled: true
|
||||
anchors.fill: parent
|
||||
onClicked: {
|
||||
if (contentLayout.state === "collapsed")
|
||||
contentLayout.state = "expanded";
|
||||
else
|
||||
contentLayout.state = "collapsed";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ColumnLayout {
|
||||
id: contentLayout
|
||||
spacing: 0
|
||||
state: "collapsed"
|
||||
clip: true
|
||||
|
||||
states: [
|
||||
State {
|
||||
name: "expanded"
|
||||
PropertyChanges { target: contentLayout; Layout.preferredHeight: innerContentLayout.height }
|
||||
},
|
||||
State {
|
||||
name: "collapsed"
|
||||
PropertyChanges { target: contentLayout; Layout.preferredHeight: 0 }
|
||||
}
|
||||
]
|
||||
|
||||
transitions: [
|
||||
Transition {
|
||||
SequentialAnimation {
|
||||
PropertyAnimation {
|
||||
target: contentLayout
|
||||
property: "Layout.preferredHeight"
|
||||
duration: 300
|
||||
easing.type: Easing.InOutQuad
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
ColumnLayout {
|
||||
id: innerContentLayout
|
||||
Layout.leftMargin: 30
|
||||
ChatTextItem {
|
||||
id: innerTextItem
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
gpt4all-chat/qml/ChatItemView.qml (new file, 832 lines)
@@ -0,0 +1,832 @@
|
||||
import Qt5Compat.GraphicalEffects
|
||||
import QtCore
|
||||
import QtQuick
|
||||
import QtQuick.Controls
|
||||
import QtQuick.Controls.Basic
|
||||
import QtQuick.Layouts
|
||||
import Qt.labs.qmlmodels
|
||||
|
||||
import gpt4all
|
||||
import mysettings
|
||||
import toolenums
|
||||
|
||||
ColumnLayout {
|
||||
|
||||
property var inputBoxText: null
|
||||
signal setInputBoxText(text: string)
|
||||
|
||||
Item {
|
||||
|
||||
Layout.fillWidth: true
|
||||
Layout.maximumWidth: parent.width
|
||||
Layout.preferredHeight: gridLayout.height
|
||||
|
||||
HoverHandler { id: hoverArea }
|
||||
|
||||
GridLayout {
|
||||
id: gridLayout
|
||||
anchors.left: parent.left
|
||||
anchors.right: parent.right
|
||||
columns: 2
|
||||
|
||||
Item {
|
||||
Layout.row: 0
|
||||
Layout.column: 0
|
||||
Layout.alignment: Qt.AlignVCenter | Qt.AlignRight
|
||||
Layout.preferredWidth: 32
|
||||
Layout.preferredHeight: 32
|
||||
Layout.topMargin: model.index > 0 ? 25 : 0
|
||||
|
||||
Image {
|
||||
id: logo
|
||||
sourceSize: Qt.size(32, 32)
|
||||
fillMode: Image.PreserveAspectFit
|
||||
mipmap: true
|
||||
visible: false
|
||||
source: name !== "Response: " ? "qrc:/gpt4all/icons/you.svg" : "qrc:/gpt4all/icons/gpt4all_transparent.svg"
|
||||
}
|
||||
|
||||
ColorOverlay {
|
||||
id: colorOver
|
||||
anchors.fill: logo
|
||||
source: logo
|
||||
color: theme.conversationHeader
|
||||
RotationAnimation {
|
||||
id: rotationAnimation
|
||||
target: colorOver
|
||||
property: "rotation"
|
||||
from: 0
|
||||
to: 360
|
||||
duration: 1000
|
||||
loops: Animation.Infinite
|
||||
running: isCurrentResponse && currentChat.responseInProgress
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Item {
|
||||
Layout.row: 0
|
||||
Layout.column: 1
|
||||
Layout.fillWidth: true
|
||||
Layout.preferredHeight: 38
|
||||
Layout.topMargin: model.index > 0 ? 25 : 0
|
||||
|
||||
RowLayout {
|
||||
spacing: 5
|
||||
anchors.left: parent.left
|
||||
anchors.top: parent.top
|
||||
anchors.bottom: parent.bottom
|
||||
|
||||
TextArea {
|
||||
text: {
|
||||
if (name === "Response: ")
|
||||
return qsTr("GPT4All");
|
||||
return qsTr("You");
|
||||
}
|
||||
padding: 0
|
||||
font.pixelSize: theme.fontSizeLarger
|
||||
font.bold: true
|
||||
color: theme.conversationHeader
|
||||
enabled: false
|
||||
focus: false
|
||||
readOnly: true
|
||||
}
|
||||
Text {
|
||||
visible: name === "Response: "
|
||||
font.pixelSize: theme.fontSizeLarger
|
||||
text: currentModelName()
|
||||
color: theme.mutedTextColor
|
||||
}
|
||||
RowLayout {
|
||||
visible: isCurrentResponse && (content === "" && currentChat.responseInProgress)
|
||||
Text {
|
||||
color: theme.mutedTextColor
|
||||
font.pixelSize: theme.fontSizeLarger
|
||||
text: {
|
||||
switch (currentChat.responseState) {
|
||||
case Chat.ResponseStopped: return qsTr("response stopped ...");
|
||||
case Chat.LocalDocsRetrieval: return qsTr("retrieving localdocs: %1 ...").arg(currentChat.collectionList.join(", "));
|
||||
case Chat.LocalDocsProcessing: return qsTr("searching localdocs: %1 ...").arg(currentChat.collectionList.join(", "));
|
||||
case Chat.PromptProcessing: return qsTr("processing ...")
|
||||
case Chat.ResponseGeneration: return qsTr("generating response ...");
|
||||
case Chat.GeneratingQuestions: return qsTr("generating questions ...");
|
||||
case Chat.ToolCallGeneration: return qsTr("generating toolcall ...");
|
||||
default: return ""; // handle unexpected values
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ColumnLayout {
|
||||
Layout.row: 1
|
||||
Layout.column: 1
|
||||
Layout.fillWidth: true
|
||||
spacing: 10
|
||||
Flow {
|
||||
id: attachedUrlsFlow
|
||||
Layout.fillWidth: true
|
||||
Layout.bottomMargin: 10
|
||||
spacing: 10
|
||||
visible: promptAttachments.length !== 0
|
||||
Repeater {
|
||||
model: promptAttachments
|
||||
|
||||
delegate: Rectangle {
|
||||
width: 350
|
||||
height: 50
|
||||
radius: 5
|
||||
color: theme.attachmentBackground
|
||||
border.color: theme.controlBorder
|
||||
|
||||
Row {
|
||||
spacing: 5
|
||||
anchors.fill: parent
|
||||
anchors.margins: 5
|
||||
|
||||
MyFileIcon {
|
||||
iconSize: 40
|
||||
fileName: modelData.file
|
||||
}
|
||||
|
||||
Text {
|
||||
width: 295
|
||||
height: 40
|
||||
text: modelData.file
|
||||
color: theme.textColor
|
||||
horizontalAlignment: Text.AlignLeft
|
||||
verticalAlignment: Text.AlignVCenter
|
||||
font.pixelSize: theme.fontSizeMedium
|
||||
font.bold: true
|
||||
wrapMode: Text.WrapAnywhere
|
||||
elide: Qt.ElideRight
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Repeater {
|
||||
model: childItems
|
||||
|
||||
DelegateChooser {
|
||||
id: chooser
|
||||
role: "name"
|
||||
DelegateChoice {
|
||||
roleValue: "Text: ";
|
||||
ChatTextItem {
|
||||
Layout.fillWidth: true
|
||||
textContent: modelData.content
|
||||
}
|
||||
}
|
||||
DelegateChoice {
|
||||
roleValue: "ToolCall: ";
|
||||
ChatCollapsibleItem {
|
||||
Layout.fillWidth: true
|
||||
textContent: modelData.content
|
||||
isCurrent: modelData.isCurrentResponse
|
||||
isError: modelData.isToolCallError
|
||||
}
|
||||
}
|
||||
DelegateChoice {
|
||||
roleValue: "Think: ";
|
||||
ChatCollapsibleItem {
|
||||
Layout.fillWidth: true
|
||||
textContent: modelData.content
|
||||
isCurrent: modelData.isCurrentResponse
|
||||
isError: false
|
||||
isThinking: true
|
||||
thinkingTime: modelData.thinkingTime
|
||||
visible: modelData.content !== ""
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
delegate: chooser
|
||||
}
|
||||
|
||||
ChatTextItem {
|
||||
Layout.fillWidth: true
|
||||
textContent: content
|
||||
}
|
||||
|
||||
ThumbsDownDialog {
|
||||
id: thumbsDownDialog
|
||||
x: Math.round((parent.width - width) / 2)
|
||||
y: Math.round((parent.height - height) / 2)
|
||||
width: 640
|
||||
height: 300
|
||||
property string text: content
|
||||
response: newResponse === undefined || newResponse === "" ? text : newResponse
|
||||
onAccepted: {
|
||||
var responseHasChanged = response !== text && response !== newResponse
|
||||
if (thumbsDownState && !thumbsUpState && !responseHasChanged)
|
||||
return
|
||||
|
||||
chatModel.updateNewResponse(model.index, response)
|
||||
chatModel.updateThumbsUpState(model.index, false)
|
||||
chatModel.updateThumbsDownState(model.index, true)
|
||||
Network.sendConversation(currentChat.id, getConversationJson());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Item {
|
||||
Layout.row: 2
|
||||
Layout.column: 1
|
||||
Layout.topMargin: 5
|
||||
Layout.alignment: Qt.AlignVCenter
|
||||
Layout.preferredWidth: childrenRect.width
|
||||
Layout.preferredHeight: childrenRect.height
|
||||
visible: {
|
||||
if (name !== "Response: ")
|
||||
return false
|
||||
if (consolidatedSources.length === 0)
|
||||
return false
|
||||
if (!MySettings.localDocsShowReferences)
|
||||
return false
|
||||
if (isCurrentResponse && currentChat.responseInProgress
|
||||
&& currentChat.responseState !== Chat.GeneratingQuestions )
|
||||
return false
|
||||
return true
|
||||
}
|
||||
|
||||
MyButton {
|
||||
backgroundColor: theme.sourcesBackground
|
||||
backgroundColorHovered: theme.sourcesBackgroundHovered
|
||||
contentItem: RowLayout {
|
||||
anchors.centerIn: parent
|
||||
|
||||
Item {
|
||||
Layout.preferredWidth: 24
|
||||
Layout.preferredHeight: 24
|
||||
|
||||
Image {
|
||||
id: sourcesIcon
|
||||
visible: false
|
||||
anchors.fill: parent
|
||||
sourceSize.width: 24
|
||||
sourceSize.height: 24
|
||||
mipmap: true
|
||||
source: "qrc:/gpt4all/icons/db.svg"
|
||||
}
|
||||
|
||||
ColorOverlay {
|
||||
anchors.fill: sourcesIcon
|
||||
source: sourcesIcon
|
||||
color: theme.textColor
|
||||
}
|
||||
}
|
||||
|
||||
Text {
|
||||
text: qsTr("%n Source(s)", "", consolidatedSources.length)
|
||||
padding: 0
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
font.bold: true
|
||||
color: theme.styledTextColor
|
||||
}
|
||||
|
||||
Item {
|
||||
Layout.preferredWidth: caret.width
|
||||
Layout.preferredHeight: caret.height
|
||||
Image {
|
||||
id: caret
|
||||
anchors.centerIn: parent
|
||||
visible: false
|
||||
sourceSize.width: theme.fontSizeLarge
|
||||
sourceSize.height: theme.fontSizeLarge
|
||||
mipmap: true
|
||||
source: {
|
||||
if (sourcesLayout.state === "collapsed")
|
||||
return "qrc:/gpt4all/icons/caret_right.svg";
|
||||
else
|
||||
return "qrc:/gpt4all/icons/caret_down.svg";
|
||||
}
|
||||
}
|
||||
|
||||
ColorOverlay {
|
||||
anchors.fill: caret
|
||||
source: caret
|
||||
color: theme.textColor
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
onClicked: {
|
||||
if (sourcesLayout.state === "collapsed")
|
||||
sourcesLayout.state = "expanded";
|
||||
else
|
||||
sourcesLayout.state = "collapsed";
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ColumnLayout {
|
||||
id: sourcesLayout
|
||||
Layout.row: 3
|
||||
Layout.column: 1
|
||||
Layout.topMargin: 5
|
||||
visible: {
|
||||
if (consolidatedSources.length === 0)
|
||||
return false
|
||||
if (!MySettings.localDocsShowReferences)
|
||||
return false
|
||||
if (isCurrentResponse && currentChat.responseInProgress
|
||||
&& currentChat.responseState !== Chat.GeneratingQuestions )
|
||||
return false
|
||||
return true
|
||||
}
|
||||
clip: true
|
||||
Layout.fillWidth: true
|
||||
Layout.preferredHeight: 0
|
||||
state: "collapsed"
|
||||
states: [
|
||||
State {
|
||||
name: "expanded"
|
||||
PropertyChanges { target: sourcesLayout; Layout.preferredHeight: sourcesFlow.height }
|
||||
},
|
||||
State {
|
||||
name: "collapsed"
|
||||
PropertyChanges { target: sourcesLayout; Layout.preferredHeight: 0 }
|
||||
}
|
||||
]
|
||||
|
||||
transitions: [
|
||||
Transition {
|
||||
SequentialAnimation {
|
||||
PropertyAnimation {
|
||||
target: sourcesLayout
|
||||
property: "Layout.preferredHeight"
|
||||
duration: 300
|
||||
easing.type: Easing.InOutQuad
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
Flow {
|
||||
id: sourcesFlow
|
||||
Layout.fillWidth: true
|
||||
spacing: 10
|
||||
visible: consolidatedSources.length !== 0
|
||||
Repeater {
|
||||
model: consolidatedSources
|
||||
|
||||
delegate: Rectangle {
|
||||
radius: 10
|
||||
color: ma.containsMouse ? theme.sourcesBackgroundHovered : theme.sourcesBackground
|
||||
width: 200
|
||||
height: 75
|
||||
|
||||
MouseArea {
|
||||
id: ma
|
||||
enabled: modelData.path !== ""
|
||||
anchors.fill: parent
|
||||
hoverEnabled: true
|
||||
onClicked: function() {
|
||||
Qt.openUrlExternally(modelData.fileUri)
|
||||
}
|
||||
}
|
||||
|
||||
Rectangle {
|
||||
id: debugTooltip
|
||||
anchors.right: parent.right
|
||||
anchors.bottom: parent.bottom
|
||||
width: 24
|
||||
height: 24
|
||||
color: "transparent"
|
||||
ToolTip {
|
||||
parent: debugTooltip
|
||||
visible: debugMouseArea.containsMouse
|
||||
text: modelData.text
|
||||
contentWidth: 900
|
||||
delay: 500
|
||||
}
|
||||
MouseArea {
|
||||
id: debugMouseArea
|
||||
anchors.fill: parent
|
||||
hoverEnabled: true
|
||||
}
|
||||
}
|
||||
|
||||
ColumnLayout {
|
||||
anchors.left: parent.left
|
||||
anchors.top: parent.top
|
||||
anchors.margins: 10
|
||||
spacing: 0
|
||||
RowLayout {
|
||||
id: title
|
||||
spacing: 5
|
||||
Layout.maximumWidth: 180
|
||||
MyFileIcon {
|
||||
iconSize: 24
|
||||
fileName: modelData.file
|
||||
Layout.preferredWidth: iconSize
|
||||
Layout.preferredHeight: iconSize
|
||||
}
|
||||
Text {
|
||||
Layout.maximumWidth: 156
|
||||
text: modelData.collection !== "" ? modelData.collection : qsTr("LocalDocs")
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
font.bold: true
|
||||
color: theme.styledTextColor
|
||||
elide: Qt.ElideRight
|
||||
}
|
||||
Rectangle {
|
||||
Layout.fillWidth: true
|
||||
color: "transparent"
|
||||
height: 1
|
||||
}
|
||||
}
|
||||
Text {
|
||||
Layout.fillHeight: true
|
||||
Layout.maximumWidth: 180
|
||||
Layout.maximumHeight: 55 - title.height
|
||||
text: modelData.file
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeSmall
|
||||
elide: Qt.ElideRight
|
||||
wrapMode: Text.WrapAnywhere
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ConfirmationDialog {
|
||||
id: editPromptDialog
|
||||
dialogTitle: qsTr("Edit this message?")
|
||||
description: qsTr("All following messages will be permanently erased.")
|
||||
onAccepted: {
|
||||
const msg = currentChat.popPrompt(index);
|
||||
if (msg !== null)
|
||||
setInputBoxText(msg);
|
||||
}
|
||||
}
|
||||
|
||||
ConfirmationDialog {
|
||||
id: redoResponseDialog
|
||||
dialogTitle: qsTr("Redo this response?")
|
||||
description: qsTr("All following messages will be permanently erased.")
|
||||
onAccepted: currentChat.regenerateResponse(index)
|
||||
}
|
||||
|
||||
RowLayout {
|
||||
id: buttonRow
|
||||
Layout.row: 4
|
||||
Layout.column: 1
|
||||
Layout.maximumWidth: parent.width
|
||||
Layout.fillWidth: false
|
||||
Layout.alignment: Qt.AlignLeft | Qt.AlignTop
|
||||
spacing: 3
|
||||
visible: !isCurrentResponse || !currentChat.responseInProgress
|
||||
enabled: opacity > 0
|
||||
opacity: hoverArea.hovered
|
||||
|
||||
Behavior on opacity {
|
||||
OpacityAnimator { duration: 30 }
|
||||
}
|
||||
|
||||
ChatMessageButton {
|
||||
readonly property var editingDisabledReason: {
|
||||
if (!currentChat.isModelLoaded)
|
||||
return qsTr("Cannot edit chat without a loaded model.");
|
||||
if (currentChat.responseInProgress)
|
||||
return qsTr("Cannot edit chat while the model is generating.");
|
||||
return null;
|
||||
}
|
||||
visible: !currentChat.isServer && model.name === "Prompt: "
|
||||
enabled: editingDisabledReason === null
|
||||
Layout.maximumWidth: 24
|
||||
Layout.maximumHeight: 24
|
||||
Layout.alignment: Qt.AlignVCenter
|
||||
Layout.fillWidth: false
|
||||
name: editingDisabledReason ?? qsTr("Edit")
|
||||
source: "qrc:/gpt4all/icons/edit.svg"
|
||||
onClicked: {
|
||||
if (inputBoxText === "")
|
||||
editPromptDialog.open();
|
||||
}
|
||||
}
|
||||
|
||||
ChatMessageButton {
|
||||
readonly property var editingDisabledReason: {
|
||||
if (!currentChat.isModelLoaded)
|
||||
return qsTr("Cannot redo response without a loaded model.");
|
||||
if (currentChat.responseInProgress)
|
||||
return qsTr("Cannot redo response while the model is generating.");
|
||||
return null;
|
||||
}
|
||||
visible: !currentChat.isServer && model.name === "Response: "
|
||||
enabled: editingDisabledReason === null
|
||||
Layout.maximumWidth: 24
|
||||
Layout.maximumHeight: 24
|
||||
Layout.alignment: Qt.AlignVCenter
|
||||
Layout.fillWidth: false
|
||||
name: editingDisabledReason ?? qsTr("Redo")
|
||||
source: "qrc:/gpt4all/icons/regenerate.svg"
|
||||
onClicked: {
|
||||
if (index == chatModel.count - 1) {
|
||||
// regenerate last message without confirmation
|
||||
currentChat.regenerateResponse(index);
|
||||
return;
|
||||
}
|
||||
redoResponseDialog.open();
|
||||
}
|
||||
}
|
||||
|
||||
ChatMessageButton {
|
||||
Layout.maximumWidth: 24
|
||||
Layout.maximumHeight: 24
|
||||
Layout.alignment: Qt.AlignVCenter
|
||||
Layout.fillWidth: false
|
||||
name: qsTr("Copy")
|
||||
source: "qrc:/gpt4all/icons/copy.svg"
|
||||
onClicked: {
|
||||
chatModel.copyToClipboard(index);
|
||||
}
|
||||
}
|
||||
|
||||
Item {
|
||||
visible: name === "Response: " && MySettings.networkIsActive
|
||||
Layout.alignment: Qt.AlignVCenter
|
||||
Layout.preferredWidth: childrenRect.width
|
||||
Layout.preferredHeight: childrenRect.height
|
||||
Layout.fillWidth: false
|
||||
|
||||
ChatMessageButton {
|
||||
id: thumbsUp
|
||||
anchors.left: parent.left
|
||||
anchors.verticalCenter: parent.verticalCenter
|
||||
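// Fully opaque when this response is liked, or when like/dislike are both unset (or both set); dimmed otherwise.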
opacity: thumbsUpState || thumbsUpState == thumbsDownState ? 1.0 : 0.2
|
||||
source: "qrc:/gpt4all/icons/thumbs_up.svg"
|
||||
name: qsTr("Like response")
|
||||
onClicked: {
|
||||
if (thumbsUpState && !thumbsDownState)
|
||||
return
|
||||
|
||||
chatModel.updateNewResponse(index, "")
|
||||
chatModel.updateThumbsUpState(index, true)
|
||||
chatModel.updateThumbsDownState(index, false)
|
||||
Network.sendConversation(currentChat.id, getConversationJson());
|
||||
}
|
||||
}
|
||||
|
||||
ChatMessageButton {
|
||||
id: thumbsDown
|
||||
anchors.top: thumbsUp.top
|
||||
anchors.topMargin: buttonRow.spacing
|
||||
anchors.left: thumbsUp.right
|
||||
anchors.leftMargin: buttonRow.spacing
|
||||
checked: thumbsDownState
|
||||
opacity: thumbsDownState || thumbsUpState == thumbsDownState ? 1.0 : 0.2
|
||||
bgTransform: [
|
||||
Matrix4x4 {
|
||||
matrix: Qt.matrix4x4(-1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1)
|
||||
},
|
||||
Translate {
|
||||
x: thumbsDown.width
|
||||
}
|
||||
]
|
||||
source: "qrc:/gpt4all/icons/thumbs_down.svg"
|
||||
name: qsTr("Dislike response")
|
||||
onClicked: {
|
||||
thumbsDownDialog.open()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // GridLayout
|
||||
|
||||
} // Item
|
||||
|
||||
GridLayout {
|
||||
Layout.fillWidth: true
|
||||
Layout.maximumWidth: parent.width
|
||||
|
||||
function shouldShowSuggestions() {
|
||||
if (!isCurrentResponse)
|
||||
return false;
|
||||
if (MySettings.suggestionMode === 2) // Off
|
||||
return false;
|
||||
if (MySettings.suggestionMode === 0 && consolidatedSources.length === 0) // LocalDocs only
|
||||
return false;
|
||||
return currentChat.responseState === Chat.GeneratingQuestions || currentChat.generatedQuestions.length !== 0;
|
||||
}
|
||||
|
||||
Item {
|
||||
visible: parent.shouldShowSuggestions()
|
||||
Layout.row: 5
|
||||
Layout.column: 0
|
||||
Layout.topMargin: 20
|
||||
Layout.alignment: Qt.AlignVCenter | Qt.AlignRight
|
||||
Layout.preferredWidth: 28
|
||||
Layout.preferredHeight: 28
|
||||
Image {
|
||||
id: stack
|
||||
sourceSize: Qt.size(28, 28)
|
||||
fillMode: Image.PreserveAspectFit
|
||||
mipmap: true
|
||||
visible: false
|
||||
source: "qrc:/gpt4all/icons/stack.svg"
|
||||
}
|
||||
|
||||
ColorOverlay {
|
||||
anchors.fill: stack
|
||||
source: stack
|
||||
color: theme.conversationHeader
|
||||
}
|
||||
}
|
||||
|
||||
Item {
|
||||
visible: parent.shouldShowSuggestions()
|
||||
Layout.row: 5
|
||||
Layout.column: 1
|
||||
Layout.topMargin: 20
|
||||
Layout.fillWidth: true
|
||||
Layout.preferredHeight: 38
|
||||
RowLayout {
|
||||
spacing: 5
|
||||
anchors.left: parent.left
|
||||
anchors.top: parent.top
|
||||
anchors.bottom: parent.bottom
|
||||
|
||||
TextArea {
|
||||
text: qsTr("Suggested follow-ups")
|
||||
padding: 0
|
||||
font.pixelSize: theme.fontSizeLarger
|
||||
font.bold: true
|
||||
color: theme.conversationHeader
|
||||
enabled: false
|
||||
focus: false
|
||||
readOnly: true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ColumnLayout {
|
||||
visible: parent.shouldShowSuggestions()
|
||||
Layout.row: 6
|
||||
Layout.column: 1
|
||||
Layout.fillWidth: true
|
||||
Layout.minimumHeight: 1
|
||||
spacing: 10
|
||||
Repeater {
|
||||
model: currentChat.generatedQuestions
|
||||
TextArea {
|
||||
id: followUpText
|
||||
Layout.fillWidth: true
|
||||
Layout.alignment: Qt.AlignLeft
|
||||
rightPadding: 40
|
||||
topPadding: 10
|
||||
leftPadding: 20
|
||||
bottomPadding: 10
|
||||
text: modelData
|
||||
focus: false
|
||||
readOnly: true
|
||||
wrapMode: Text.WordWrap
|
||||
hoverEnabled: !currentChat.responseInProgress
|
||||
color: theme.textColor
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
background: Rectangle {
|
||||
color: hovered ? theme.sourcesBackgroundHovered : theme.sourcesBackground
|
||||
radius: 10
|
||||
}
|
||||
MouseArea {
|
||||
id: maFollowUp
|
||||
anchors.fill: parent
|
||||
enabled: !currentChat.responseInProgress
|
||||
onClicked: function() {
|
||||
var chat = window.currentChat
|
||||
var followup = modelData
|
||||
chat.stopGenerating()
|
||||
chat.newPromptResponsePair(followup)
|
||||
}
|
||||
}
|
||||
Item {
|
||||
anchors.right: parent.right
|
||||
anchors.verticalCenter: parent.verticalCenter
|
||||
width: 40
|
||||
height: 40
|
||||
visible: !currentChat.responseInProgress
|
||||
Image {
|
||||
id: plusImage
|
||||
anchors.verticalCenter: parent.verticalCenter
|
||||
sourceSize.width: 20
|
||||
sourceSize.height: 20
|
||||
mipmap: true
|
||||
visible: false
|
||||
source: "qrc:/gpt4all/icons/plus.svg"
|
||||
}
|
||||
|
||||
ColorOverlay {
|
||||
anchors.fill: plusImage
|
||||
source: plusImage
|
||||
color: theme.styledTextColor
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Rectangle {
|
||||
Layout.fillWidth: true
|
||||
color: "transparent"
|
||||
radius: 10
|
||||
Layout.preferredHeight: currentChat.responseInProgress ? 40 : 0
|
||||
clip: true
|
||||
ColumnLayout {
|
||||
id: followUpLayout
|
||||
anchors.fill: parent
|
||||
Rectangle {
|
||||
id: myRect1
|
||||
Layout.preferredWidth: 0
|
||||
Layout.minimumWidth: 0
|
||||
Layout.maximumWidth: parent.width
|
||||
height: 12
|
||||
color: theme.sourcesBackgroundHovered
|
||||
}
|
||||
|
||||
Rectangle {
|
||||
id: myRect2
|
||||
Layout.preferredWidth: 0
|
||||
Layout.minimumWidth: 0
|
||||
Layout.maximumWidth: parent.width
|
||||
height: 12
|
||||
color: theme.sourcesBackgroundHovered
|
||||
}
|
||||
|
||||
SequentialAnimation {
|
||||
id: followUpProgressAnimation
|
||||
ParallelAnimation {
|
||||
PropertyAnimation {
|
||||
target: myRect1
|
||||
property: "Layout.preferredWidth"
|
||||
from: 0
|
||||
to: followUpLayout.width
|
||||
duration: 1000
|
||||
}
|
||||
PropertyAnimation {
|
||||
target: myRect2
|
||||
property: "Layout.preferredWidth"
|
||||
from: 0
|
||||
to: followUpLayout.width / 2
|
||||
duration: 1000
|
||||
}
|
||||
}
|
||||
SequentialAnimation {
|
||||
loops: Animation.Infinite
|
||||
ParallelAnimation {
|
||||
PropertyAnimation {
|
||||
target: myRect1
|
||||
property: "opacity"
|
||||
from: 1
|
||||
to: 0.2
|
||||
duration: 1500
|
||||
}
|
||||
PropertyAnimation {
|
||||
target: myRect2
|
||||
property: "opacity"
|
||||
from: 1
|
||||
to: 0.2
|
||||
duration: 1500
|
||||
}
|
||||
}
|
||||
ParallelAnimation {
|
||||
PropertyAnimation {
|
||||
target: myRect1
|
||||
property: "opacity"
|
||||
from: 0.2
|
||||
to: 1
|
||||
duration: 1500
|
||||
}
|
||||
PropertyAnimation {
|
||||
target: myRect2
|
||||
property: "opacity"
|
||||
from: 0.2
|
||||
to: 1
|
||||
duration: 1500
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
onVisibleChanged: {
|
||||
if (visible)
|
||||
followUpProgressAnimation.start();
|
||||
}
|
||||
}
|
||||
|
||||
Behavior on Layout.preferredHeight {
|
||||
NumberAnimation {
|
||||
duration: 300
|
||||
easing.type: Easing.InOutQuad
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
} // GridLayout
|
||||
|
||||
} // ColumnLayout
|
gpt4all-chat/qml/ChatMessageButton.qml (new file, 20 lines)
@@ -0,0 +1,20 @@
import QtQuick
import QtQuick.Controls

import gpt4all

MyToolButton {
    property string name

    width: 24
    height: 24
    imageWidth: width
    imageHeight: height
    ToolTip {
        visible: parent.hovered
        y: parent.height * 1.5
        text: name
        delay: Qt.styleHints.mousePressAndHoldInterval
    }
    Accessible.name: name
}
gpt4all-chat/qml/ChatTextItem.qml (new file, 139 lines)
@@ -0,0 +1,139 @@
|
||||
import Qt5Compat.GraphicalEffects
|
||||
import QtCore
|
||||
import QtQuick
|
||||
import QtQuick.Controls
|
||||
import QtQuick.Controls.Basic
|
||||
import QtQuick.Layouts
|
||||
|
||||
import gpt4all
|
||||
import mysettings
|
||||
import toolenums
|
||||
|
||||
TextArea {
|
||||
id: myTextArea
|
||||
property string textContent: ""
|
||||
visible: textContent != ""
|
||||
Layout.fillWidth: true
|
||||
padding: 0
|
||||
color: {
|
||||
if (!currentChat.isServer)
|
||||
return theme.textColor
|
||||
return theme.white
|
||||
}
|
||||
wrapMode: Text.WordWrap
|
||||
textFormat: TextEdit.PlainText
|
||||
focus: false
|
||||
readOnly: true
|
||||
font.pixelSize: theme.fontSizeLarge
|
||||
cursorVisible: isCurrentResponse ? currentChat.responseInProgress : false
|
||||
cursorPosition: text.length
|
||||
TapHandler {
|
||||
id: tapHandler
|
||||
onTapped: function(eventPoint, button) {
|
||||
var clickedPos = myTextArea.positionAt(eventPoint.position.x, eventPoint.position.y);
|
||||
var success = textProcessor.tryCopyAtPosition(clickedPos);
|
||||
if (success)
|
||||
copyCodeMessage.open();
|
||||
}
|
||||
}
|
||||
|
||||
MouseArea {
|
||||
id: conversationMouseArea
|
||||
anchors.fill: parent
|
||||
acceptedButtons: Qt.RightButton
|
||||
|
||||
onClicked: (mouse) => {
|
||||
if (mouse.button === Qt.RightButton) {
|
||||
conversationContextMenu.x = conversationMouseArea.mouseX
|
||||
conversationContextMenu.y = conversationMouseArea.mouseY
|
||||
conversationContextMenu.open()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
onLinkActivated: function(link) {
|
||||
if (!isCurrentResponse || !currentChat.responseInProgress)
|
||||
Qt.openUrlExternally(link)
|
||||
}
|
||||
|
||||
onLinkHovered: function (link) {
|
||||
if (!isCurrentResponse || !currentChat.responseInProgress)
|
||||
statusBar.externalHoveredLink = link
|
||||
}
|
||||
|
||||
MyMenu {
|
||||
id: conversationContextMenu
|
||||
MyMenuItem {
|
||||
text: qsTr("Copy")
|
||||
enabled: myTextArea.selectedText !== ""
|
||||
height: enabled ? implicitHeight : 0
|
||||
onTriggered: myTextArea.copy()
|
||||
}
|
||||
MyMenuItem {
|
||||
text: qsTr("Copy Message")
|
||||
enabled: myTextArea.selectedText === ""
|
||||
height: enabled ? implicitHeight : 0
|
||||
onTriggered: {
|
||||
myTextArea.selectAll()
|
||||
myTextArea.copy()
|
||||
myTextArea.deselect()
|
||||
}
|
||||
}
|
||||
MyMenuItem {
|
||||
text: textProcessor.shouldProcessText ? qsTr("Disable markdown") : qsTr("Enable markdown")
|
||||
height: enabled ? implicitHeight : 0
|
||||
onTriggered: {
|
||||
textProcessor.shouldProcessText = !textProcessor.shouldProcessText;
|
||||
textProcessor.setValue(textContent);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ChatViewTextProcessor {
|
||||
id: textProcessor
|
||||
}
|
||||
|
||||
function resetChatViewTextProcessor() {
|
||||
textProcessor.fontPixelSize = myTextArea.font.pixelSize
|
||||
textProcessor.codeColors.defaultColor = theme.codeDefaultColor
|
||||
textProcessor.codeColors.keywordColor = theme.codeKeywordColor
|
||||
textProcessor.codeColors.functionColor = theme.codeFunctionColor
|
||||
textProcessor.codeColors.functionCallColor = theme.codeFunctionCallColor
|
||||
textProcessor.codeColors.commentColor = theme.codeCommentColor
|
||||
textProcessor.codeColors.stringColor = theme.codeStringColor
|
||||
textProcessor.codeColors.numberColor = theme.codeNumberColor
|
||||
textProcessor.codeColors.headerColor = theme.codeHeaderColor
|
||||
textProcessor.codeColors.backgroundColor = theme.codeBackgroundColor
|
||||
textProcessor.textDocument = textDocument
|
||||
textProcessor.setValue(textContent);
|
||||
}
|
||||
|
||||
property bool textProcessorReady: false
|
||||
|
||||
Component.onCompleted: {
|
||||
resetChatViewTextProcessor();
|
||||
textProcessorReady = true;
|
||||
}
|
||||
|
||||
Connections {
|
||||
target: myTextArea
|
||||
function onTextContentChanged() {
|
||||
if (myTextArea.textProcessorReady)
|
||||
textProcessor.setValue(textContent);
|
||||
}
|
||||
}
|
||||
|
||||
Connections {
|
||||
target: MySettings
|
||||
function onFontSizeChanged() {
|
||||
myTextArea.resetChatViewTextProcessor();
|
||||
}
|
||||
function onChatThemeChanged() {
|
||||
myTextArea.resetChatViewTextProcessor();
|
||||
}
|
||||
}
|
||||
|
||||
Accessible.role: Accessible.Paragraph
|
||||
Accessible.name: text
|
||||
Accessible.description: name === "Response: " ? "The response by the model" : "The prompt by the user"
|
||||
}
|