Compare commits

main...python-v2.8.0

No commits in common. "main" and "python-v2.8.0" have entirely different histories.

192 changed files with 17030 additions and 27291 deletions


@ -1,17 +1,13 @@
version: 2.1 version: 2.1
setup: true setup: true
orbs: orbs:
path-filtering: circleci/path-filtering@1.3.0 path-filtering: circleci/path-filtering@0.0.1
workflows: workflows:
version: 2.1 version: 2.1
generate-config: generate-config:
jobs: jobs:
- path-filtering/filter: - path-filtering/filter:
filters:
tags:
only:
- /.*/
base-revision: main base-revision: main
config-path: .circleci/continue_config.yml config-path: .circleci/continue_config.yml
mapping: | mapping: |
@ -20,3 +16,4 @@ workflows:
gpt4all-bindings/python/.* run-python-workflow true gpt4all-bindings/python/.* run-python-workflow true
gpt4all-bindings/typescript/.* run-ts-workflow true gpt4all-bindings/typescript/.* run-ts-workflow true
gpt4all-chat/.* run-chat-workflow true gpt4all-chat/.* run-chat-workflow true
.* run-default-workflow true

File diff suppressed because it is too large.


@ -1,3 +1,3 @@
[codespell] [codespell]
ignore-words-list = blong, afterall, assistent, crasher, requestor ignore-words-list = blong, afterall, som, assistent, crasher
skip = ./.git,./gpt4all-chat/translations,*.pdf,*.svg,*.lock skip = .git,*.pdf,*.svg,*.lock,*.ts

.gitignore vendored (2 changed lines)

@ -181,8 +181,6 @@ CMakeLists.txt.user
gpt4all-chat/models/* gpt4all-chat/models/*
build_* build_*
build-* build-*
cmake-build-*
/gpt4all-chat/tests/python/config.py
# IntelliJ # IntelliJ
.idea/ .idea/

.gitmodules vendored (22 changed lines)

@ -1,25 +1,7 @@
[submodule "llama.cpp-mainline"] [submodule "llama.cpp-mainline"]
path = gpt4all-backend/deps/llama.cpp-mainline path = gpt4all-backend/llama.cpp-mainline
url = https://github.com/nomic-ai/llama.cpp.git url = https://github.com/nomic-ai/llama.cpp.git
branch = master branch = master
[submodule "gpt4all-chat/usearch"] [submodule "gpt4all-chat/usearch"]
path = gpt4all-chat/deps/usearch path = gpt4all-chat/usearch
url = https://github.com/nomic-ai/usearch.git url = https://github.com/nomic-ai/usearch.git
[submodule "gpt4all-chat/deps/SingleApplication"]
path = gpt4all-chat/deps/SingleApplication
url = https://github.com/nomic-ai/SingleApplication.git
[submodule "gpt4all-chat/deps/fmt"]
path = gpt4all-chat/deps/fmt
url = https://github.com/fmtlib/fmt.git
[submodule "gpt4all-chat/deps/DuckX"]
path = gpt4all-chat/deps/DuckX
url = https://github.com/nomic-ai/DuckX.git
[submodule "gpt4all-chat/deps/QXlsx"]
path = gpt4all-chat/deps/QXlsx
url = https://github.com/nomic-ai/QXlsx.git
[submodule "gpt4all-chat/deps/minja"]
path = gpt4all-chat/deps/minja
url = https://github.com/nomic-ai/minja.git
[submodule "gpt4all-chat/deps/json"]
path = gpt4all-chat/deps/json
url = https://github.com/nlohmann/json.git


@ -51,6 +51,11 @@ Thiago Ramos ([@thiagojramos](https://github.com/thiagojramos))<br/>
E-mail: thiagojramos@outlook.com<br/> E-mail: thiagojramos@outlook.com<br/>
- pt\_BR translation - pt\_BR translation
Victor Emanuel ([@SINAPSA-IC](https://github.com/SINAPSA-IC))<br/>
E-mail: contact@sinapsaro.ro<br/>
Discord: `@sinapsa_ic_56124_99632`
- ro\_RO translation
不知火 Shiranui ([@supersonictw](https://github.com/supersonictw))<br/> 不知火 Shiranui ([@supersonictw](https://github.com/supersonictw))<br/>
E-mail: supersonic@livemail.tw<br/> E-mail: supersonic@livemail.tw<br/>
Discord: `@supersonictw` Discord: `@supersonictw`
@ -72,6 +77,6 @@ Discord: `@Tim453`
- Flatpak - Flatpak
Jack ([@wuodoo](https://github.com/wuodoo))<br/> Jack ([@wuodoo](https://github.com/wuodoo))<br/>
E-mail: 2296103047@qq.com<br/> E-mail: 2296103047@qq.com><br/>
Discord: `@mikage` Discord: `@mikage`
- zh\_CN translation - zh\_CN translation

README.md (105 changed lines)

@ -1,77 +1,48 @@
<h1 align="center">GPT4All</h1> <h1 align="center">GPT4All</h1>
<p align="center"> <p align="center">GPT4All runs large language models (LLMs) privately on everyday desktops & laptops. <br> <br> No API calls or GPUs required - you can just download the application and <a href="https://docs.gpt4all.io/gpt4all_desktop/quickstart.html#quickstart">get started</a>
Now with support for DeepSeek R1 Distillations
</p>
<p align="center">
<a href="https://www.nomic.ai/gpt4all">Website</a> &bull; <a href="https://docs.gpt4all.io">Documentation</a> &bull; <a href="https://discord.gg/mGZE39AS3e">Discord</a> &bull; <a href="https://www.youtube.com/watch?v=gQcZDXRVJok">YouTube Tutorial</a>
</p>
<p align="center">
GPT4All runs large language models (LLMs) privately on everyday desktops & laptops.
</p>
<p align="center">
No API calls or GPUs required - you can just download the application and <a href="https://docs.gpt4all.io/gpt4all_desktop/quickstart.html#quickstart">get started</a>.
</p>
<p align="center">
Read about what's new in <a href="https://www.nomic.ai/blog/tag/gpt4all">our blog</a>.
</p>
<p align="center">
<a href="https://nomic.ai/gpt4all/#newsletter-form">Subscribe to the newsletter</a>
</p>
https://github.com/nomic-ai/gpt4all/assets/70534565/513a0f15-4964-4109-89e4-4f9a9011f311 https://github.com/nomic-ai/gpt4all/assets/70534565/513a0f15-4964-4109-89e4-4f9a9011f311
<p align="center">
<a href="https://gpt4all.io/installers/gpt4all-installer-win64.exe">
<img src="gpt4all-bindings/python/docs/assets/windows.png" width="80" height="80"><br>
Download for Windows
</a>
</p>
<p align="center">
<a href="https://gpt4all.io/installers/gpt4all-installer-darwin.dmg">
<img src="gpt4all-bindings/python/docs/assets/mac.png" width="85" height="100"><br>
Download for MacOS
</a>
</p>
<p align="center">
<a href="https://gpt4all.io/installers/gpt4all-installer-linux.run">
<img src="gpt4all-bindings/python/docs/assets/ubuntu.svg" width="120" height="120"><br>
Download for Ubuntu
</a>
</p>
<p align="center">
<a href='https://flathub.org/apps/io.gpt4all.gpt4all'>
<img width='240' alt='Get it on Flathub' src='https://flathub.org/api/badge?locale=en'><br>
Get it on Flathub (community maintained)
</a>
</p>
<p align="center">
<a href="https://gpt4all.io">Website</a> &bull; <a href="https://docs.gpt4all.io">Documentation</a> &bull; <a href="https://discord.gg/mGZE39AS3e">Discord</a>
</p>
<p align="center">
<a href="https://forms.nomic.ai/gpt4all-release-notes-signup">Subscribe to the newsletter</a>
</p>
<p align="center"> <p align="center">
GPT4All is made possible by our compute partner <a href="https://www.paperspace.com/">Paperspace</a>. GPT4All is made possible by our compute partner <a href="https://www.paperspace.com/">Paperspace</a>.
</p> </p>
<p align="center">
## Download Links <a href="https://www.phorm.ai/query?projectId=755eecd3-24ad-49cc-abf4-0ab84caacf63"><img src="https://img.shields.io/badge/Phorm-Ask_AI-%23F2777A.svg" alt="phorm.ai"></a>
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-win64.exe">
<img src="gpt4all-bindings/python/docs/assets/windows.png" style="height: 1em; width: auto" /> Windows Installer
</a> &mdash;
</p>
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-win64-arm.exe">
<img src="gpt4all-bindings/python/docs/assets/windows.png" style="height: 1em; width: auto" /> Windows ARM Installer
</a> &mdash;
</p>
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-darwin.dmg">
<img src="gpt4all-bindings/python/docs/assets/mac.png" style="height: 1em; width: auto" /> macOS Installer
</a> &mdash;
</p>
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-linux.run">
<img src="gpt4all-bindings/python/docs/assets/ubuntu.svg" style="height: 1em; width: auto" /> Ubuntu Installer
</a> &mdash;
</p>
<p>
The Windows and Linux builds require Intel Core i3 2nd Gen / AMD Bulldozer, or better.
</p>
<p>
The Windows ARM build supports Qualcomm Snapdragon and Microsoft SQ1/SQ2 processors.
</p>
<p>
The Linux build is x86-64 only (no ARM).
</p>
<p>
The macOS build requires Monterey 12.6 or newer. Best results with Apple Silicon M-series processors.
</p>
See the full [System Requirements](gpt4all-chat/system_requirements.md) for more details.
<br/>
<br/>
<p>
<a href='https://flathub.org/apps/io.gpt4all.gpt4all'>
<img style="height: 2em; width: auto" alt='Get it on Flathub' src='https://flathub.org/api/badge'><br/>
Flathub (community maintained)
</a>
</p> </p>
## Install GPT4All Python ## Install GPT4All Python
@ -104,7 +75,7 @@ with model.chat_session():
- Improved user workflow for LocalDocs - Improved user workflow for LocalDocs
- Expanded access to more model architectures - Expanded access to more model architectures
- **October 19th, 2023**: GGUF Support Launches with Support for: - **October 19th, 2023**: GGUF Support Launches with Support for:
- Mistral 7b base model, an updated model gallery on our website, several new local code models including Rift Coder v1.5 - Mistral 7b base model, an updated model gallery on [gpt4all.io](https://gpt4all.io), several new local code models including Rift Coder v1.5
- [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) support for Q4\_0 and Q4\_1 quantizations in GGUF. - [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) support for Q4\_0 and Q4\_1 quantizations in GGUF.
- Offline build support for running old versions of the GPT4All Local LLM Chat Client. - Offline build support for running old versions of the GPT4All Local LLM Chat Client.
- **September 18th, 2023**: [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) launches supporting local LLM inference on NVIDIA and AMD GPUs. - **September 18th, 2023**: [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) launches supporting local LLM inference on NVIDIA and AMD GPUs.
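
The "Install GPT4All Python" steps referenced above are mostly elided by this hunk (only the `with model.chat_session():` context line survives), so here is a minimal sketch of that documented flow; the model filename is only an illustrative example and is downloaded on first use:

```python
# pip install gpt4all
from gpt4all import GPT4All

# The filename below is just an example catalog entry, not something pinned by this diff.
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf")

# chat_session() keeps the conversation history in the model's context window
# for the duration of the with-block.
with model.chat_session():
    print(model.generate("How can I run an LLM locally on a laptop?", max_tokens=256))
```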


@ -1,41 +0,0 @@
function(gpt4all_add_warning_options target)
if (MSVC)
return()
endif()
target_compile_options("${target}" PRIVATE
# base options
-Wall
-Wextra
# extra options
-Wcast-align
-Wextra-semi
-Wformat=2
-Wmissing-include-dirs
-Wsuggest-override
-Wvla
# errors
-Werror=format-security
-Werror=init-self
-Werror=pointer-arith
-Werror=undef
# disabled warnings
-Wno-sign-compare
-Wno-unused-parameter
)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
target_compile_options("${target}" PRIVATE
-Wduplicated-branches
-Wduplicated-cond
-Wlogical-op
-Wno-reorder
-Wno-null-dereference
)
elseif (CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$")
target_compile_options("${target}" PRIVATE
-Wunreachable-code-break
-Wunreachable-code-return
-Werror=pointer-integer-compare
-Wno-reorder-ctor
)
endif()
endfunction()


@ -1,7 +1,4 @@
cmake_minimum_required(VERSION 3.23) # for FILE_SET cmake_minimum_required(VERSION 3.21) # for PROJECT_IS_TOP_LEVEL
include(../common/common.cmake)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@ -36,7 +33,7 @@ set(LLMODEL_VERSION_PATCH 0)
set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}") set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}")
project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C) project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)
set(CMAKE_CXX_STANDARD 23) set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
set(BUILD_SHARED_LIBS ON) set(BUILD_SHARED_LIBS ON)
@ -50,7 +47,7 @@ else()
message(STATUS "Interprocedural optimization support detected") message(STATUS "Interprocedural optimization support detected")
endif() endif()
set(DIRECTORY deps/llama.cpp-mainline) set(DIRECTORY llama.cpp-mainline)
include(llama.cpp.cmake) include(llama.cpp.cmake)
set(BUILD_VARIANTS) set(BUILD_VARIANTS)
@ -66,23 +63,9 @@ if (LLMODEL_VULKAN)
list(APPEND BUILD_VARIANTS vulkan vulkan-avxonly) list(APPEND BUILD_VARIANTS vulkan vulkan-avxonly)
endif() endif()
if (LLMODEL_CUDA) if (LLMODEL_CUDA)
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES if (DEFINED CMAKE_CUDA_ARCHITECTURES)
set(GGML_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}")
# Defaults must be set before enable_language(CUDA).
# Keep this in sync with the arch list in ggml/src/CMakeLists.txt (plus 5.0 for non-F16 branch).
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "50;52;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
endif()
endif() endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
include(CheckLanguage) include(CheckLanguage)
check_language(CUDA) check_language(CUDA)
@ -97,6 +80,8 @@ if (LLMODEL_ROCM)
list(APPEND BUILD_VARIANTS rocm rocm-avxonly) list(APPEND BUILD_VARIANTS rocm rocm-avxonly)
endif() endif()
set(CMAKE_VERBOSE_MAKEFILE ON)
# Go through each build variant # Go through each build variant
foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS) foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
# Determine flags # Determine flags
@ -129,10 +114,6 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
# Include GGML # Include GGML
include_ggml(-mainline-${BUILD_VARIANT}) include_ggml(-mainline-${BUILD_VARIANT})
if (BUILD_VARIANT MATCHES metal)
set(GGML_METALLIB "${GGML_METALLIB}" PARENT_SCOPE)
endif()
# Function for preparing individual implementations # Function for preparing individual implementations
function(prepare_target TARGET_NAME BASE_LIB) function(prepare_target TARGET_NAME BASE_LIB)
set(TARGET_NAME ${TARGET_NAME}-${BUILD_VARIANT}) set(TARGET_NAME ${TARGET_NAME}-${BUILD_VARIANT})
@ -151,13 +132,9 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
# Add each individual implementations # Add each individual implementations
add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED
src/llamamodel.cpp src/llmodel_shared.cpp) llamamodel.cpp llmodel_shared.cpp)
gpt4all_add_warning_options(llamamodel-mainline-${BUILD_VARIANT})
target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999) LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
target_include_directories(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
src include/gpt4all-backend
)
prepare_target(llamamodel-mainline llama-mainline) prepare_target(llamamodel-mainline llama-mainline)
if (NOT PROJECT_IS_TOP_LEVEL AND BUILD_VARIANT STREQUAL cuda) if (NOT PROJECT_IS_TOP_LEVEL AND BUILD_VARIANT STREQUAL cuda)
@ -166,20 +143,11 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
endforeach() endforeach()
add_library(llmodel add_library(llmodel
src/dlhandle.cpp llmodel.h llmodel.cpp llmodel_shared.cpp
src/llmodel.cpp llmodel_c.h llmodel_c.cpp
src/llmodel_c.cpp dlhandle.cpp
src/llmodel_shared.cpp
)
gpt4all_add_warning_options(llmodel)
target_sources(llmodel PUBLIC
FILE_SET public_headers TYPE HEADERS BASE_DIRS include
FILES include/gpt4all-backend/llmodel.h
include/gpt4all-backend/llmodel_c.h
include/gpt4all-backend/sysinfo.h
) )
target_compile_definitions(llmodel PRIVATE LIB_FILE_EXT="${CMAKE_SHARED_LIBRARY_SUFFIX}") target_compile_definitions(llmodel PRIVATE LIB_FILE_EXT="${CMAKE_SHARED_LIBRARY_SUFFIX}")
target_include_directories(llmodel PRIVATE src include/gpt4all-backend)
set_target_properties(llmodel PROPERTIES set_target_properties(llmodel PROPERTIES
VERSION ${PROJECT_VERSION} VERSION ${PROJECT_VERSION}


@ -27,7 +27,7 @@ Unfortunately, no for three reasons:
# What is being done to make them more compatible? # What is being done to make them more compatible?
A few things. Number one, we are maintaining compatibility with our current model zoo by way of the submodule pinning. However, we are also exploring how we can update to newer versions of llama.cpp without breaking our current models. This might involve an additional magic header check or it could possibly involve keeping the currently pinned submodule and also adding a new submodule with later changes and differentiating them with namespaces or some other manner. Investigations continue. A few things. Number one, we are maintaining compatibility with our current model zoo by way of the submodule pinning. However, we are also exploring how we can update to newer versions of llama.cpp without breaking our current models. This might involve an additional magic header check or it could possibly involve keeping the currently pinned submodule and also adding a new submodule with later changes and differienting them with namespaces or some other manner. Investigations continue.
# What about GPU inference? # What about GPU inference?

@ -1 +0,0 @@
Subproject commit 11f734c3b0334dbae4823b4a7467764e447fc6d6

@ -0,0 +1 @@
Subproject commit add387854ea73d83770a62282089dea666fa266f


@ -378,7 +378,19 @@ function(include_ggml SUFFIX)
find_package(CUDAToolkit REQUIRED) find_package(CUDAToolkit REQUIRED)
set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE) set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
# architectures are set in gpt4all-backend/CMakeLists.txt if (NOT DEFINED GGML_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
set(GGML_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics
else()
set(GGML_CUDA_ARCHITECTURES "52;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
#set(GGML_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
endif()
endif()
message(STATUS "Using CUDA architectures: ${GGML_CUDA_ARCHITECTURES}")
set(GGML_HEADERS_CUDA ${DIRECTORY}/ggml/include/ggml-cuda.h) set(GGML_HEADERS_CUDA ${DIRECTORY}/ggml/include/ggml-cuda.h)
file(GLOB GGML_HEADERS_CUDA "${DIRECTORY}/ggml/src/ggml-cuda/*.cuh") file(GLOB GGML_HEADERS_CUDA "${DIRECTORY}/ggml/src/ggml-cuda/*.cuh")
@ -811,8 +823,7 @@ function(include_ggml SUFFIX)
list(APPEND XC_FLAGS -std=${GGML_METAL_STD}) list(APPEND XC_FLAGS -std=${GGML_METAL_STD})
endif() endif()
set(GGML_METALLIB "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib") set(GGML_METALLIB ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib)
set(GGML_METALLIB "${GGML_METALLIB}" PARENT_SCOPE)
add_custom_command( add_custom_command(
OUTPUT ${GGML_METALLIB} OUTPUT ${GGML_METALLIB}
COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air COMMAND xcrun -sdk macosx metal ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
@ -823,6 +834,7 @@ function(include_ggml SUFFIX)
DEPENDS ${DIRECTORY}/ggml/src/ggml-metal.metal ${DIRECTORY}/ggml/src/ggml-common.h DEPENDS ${DIRECTORY}/ggml/src/ggml-metal.metal ${DIRECTORY}/ggml/src/ggml-common.h
COMMENT "Compiling Metal kernels" COMMENT "Compiling Metal kernels"
) )
set_source_files_properties(${GGML_METALLIB} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES GENERATED ON)
add_custom_target( add_custom_target(
ggml-metal ALL ggml-metal ALL
@ -978,13 +990,10 @@ function(include_ggml SUFFIX)
add_library(llama${SUFFIX} STATIC add_library(llama${SUFFIX} STATIC
${DIRECTORY}/include/llama.h ${DIRECTORY}/include/llama.h
${DIRECTORY}/src/llama-grammar.cpp
${DIRECTORY}/src/llama-sampling.cpp
${DIRECTORY}/src/llama-vocab.cpp
${DIRECTORY}/src/llama.cpp ${DIRECTORY}/src/llama.cpp
${DIRECTORY}/src/unicode-data.cpp
${DIRECTORY}/src/unicode.cpp
${DIRECTORY}/src/unicode.h ${DIRECTORY}/src/unicode.h
${DIRECTORY}/src/unicode.cpp
${DIRECTORY}/src/unicode-data.cpp
) )
target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY}/include ${DIRECTORY}/ggml/include) target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY}/include ${DIRECTORY}/ggml/include)
@ -1009,6 +1018,9 @@ function(include_ggml SUFFIX)
C_STANDARD 11 C_STANDARD 11
C_STANDARD_REQUIRED true C_STANDARD_REQUIRED true
) )
if (GGML_CUDA_ARCHITECTURES)
set_property(TARGET ggml${SUFFIX} llama${SUFFIX} PROPERTY CUDA_ARCHITECTURES "${GGML_CUDA_ARCHITECTURES}")
endif()
target_compile_options(ggml${SUFFIX} PRIVATE "${GGML_COMPILE_OPTS}") target_compile_options(ggml${SUFFIX} PRIVATE "${GGML_COMPILE_OPTS}")
target_compile_options(llama${SUFFIX} PRIVATE "${GGML_COMPILE_OPTS}") target_compile_options(llama${SUFFIX} PRIVATE "${GGML_COMPILE_OPTS}")


@ -2,7 +2,6 @@
#include "llamamodel_impl.h" #include "llamamodel_impl.h"
#include "llmodel.h" #include "llmodel.h"
#include "utils.h"
#include <ggml.h> #include <ggml.h>
#include <llama.h> #include <llama.h>
@ -53,8 +52,6 @@ static const std::vector<const char *> KNOWN_ARCHES {
"gpt2", "gpt2",
// "gptj", -- no inference code // "gptj", -- no inference code
"gptneox", "gptneox",
"granite",
"granitemoe",
"mpt", "mpt",
"baichuan", "baichuan",
"starcoder", "starcoder",
@ -82,7 +79,6 @@ static const std::vector<const char *> KNOWN_ARCHES {
"command-r", "command-r",
// "dbrx", -- 16x12B parameters // "dbrx", -- 16x12B parameters
"olmo", "olmo",
"olmoe",
"openelm", "openelm",
// "arctic", -- 10B+128x3.66B parameters // "arctic", -- 10B+128x3.66B parameters
"deepseek2", "deepseek2",
@ -107,34 +103,26 @@ static bool llama_verbose()
return var && *var; return var && *var;
} }
static void llama_log_callback(ggml_log_level level, const char *text, void *userdata, bool warn) static void llama_log_callback(enum ggml_log_level level, const char *text, void *userdata)
{ {
(void)userdata; (void)userdata;
if (llama_verbose() || level <= GGML_LOG_LEVEL_ERROR) {
static ggml_log_level lastlevel = GGML_LOG_LEVEL_NONE; fputs(text, stderr);
if (!llama_verbose()) {
auto efflevel = level == GGML_LOG_LEVEL_CONT ? lastlevel : level;
lastlevel = efflevel;
switch (efflevel) {
case GGML_LOG_LEVEL_CONT:
UNREACHABLE();
break;
case GGML_LOG_LEVEL_WARN:
if (warn) break;
[[fallthrough]];
case GGML_LOG_LEVEL_NONE: // not used?
case GGML_LOG_LEVEL_INFO:
case GGML_LOG_LEVEL_DEBUG:
return; // suppress
case GGML_LOG_LEVEL_ERROR:
;
}
} }
fputs(text, stderr);
} }
#ifdef GGML_USE_CUDA
static void cuda_log_callback(enum ggml_log_level level, const char *text, void *userdata)
{
(void)userdata;
if (llama_verbose() || level <= GGML_LOG_LEVEL_WARN) {
fputs(text, stderr);
}
}
#endif
struct gpt_params { struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_keep = 0; // number of tokens to keep from initial prompt
// sampling parameters // sampling parameters
@ -149,6 +137,36 @@ struct gpt_params {
bool use_mlock = false; // use mlock to keep model in memory bool use_mlock = false; // use mlock to keep model in memory
}; };
static int llama_sample_top_p_top_k(
llama_context *ctx,
const llama_token *last_n_tokens_data,
int last_n_tokens_size,
int top_k,
float top_p,
float min_p,
float temp,
float repeat_penalty) {
auto logits = llama_get_logits_ith(ctx, -1);
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
// Populate initial list of all candidates
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (int token_id = 0; token_id < n_vocab; token_id++) {
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
}
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
// Sample repeat penalty
llama_sample_repetition_penalties(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty, 0.0f, 0.0f);
// Temperature sampling
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);
llama_sample_typical(ctx, &candidates_p, 1.0f, 1);
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
llama_sample_min_p(ctx, &candidates_p, min_p, 1);
llama_sample_temp(ctx, &candidates_p, temp);
return llama_sample_token(ctx, &candidates_p);
}
const char *get_arch_name(gguf_context *ctx_gguf) const char *get_arch_name(gguf_context *ctx_gguf)
{ {
const int kid = gguf_find_key(ctx_gguf, "general.architecture"); const int kid = gguf_find_key(ctx_gguf, "general.architecture");
@ -205,7 +223,7 @@ static int32_t get_arch_key_u32(std::string const &modelPath, std::string const
if (keyidx != -1) { if (keyidx != -1) {
value = gguf_get_val_u32(ctx, keyidx); value = gguf_get_val_u32(ctx, keyidx);
} else { } else {
std::cerr << __func__ << ": " << key << " not found in " << modelPath << "\n"; std::cerr << __func__ << ": " << key << "not found in " << modelPath << "\n";
} }
} }
@ -215,27 +233,21 @@ cleanup:
} }
struct LLamaPrivate { struct LLamaPrivate {
bool modelLoaded = false; const std::string modelPath;
int device = -1; bool modelLoaded = false;
std::string deviceName; int device = -1;
int64_t n_threads = 0; std::string deviceName;
std::vector<LLModel::Token> end_tokens; llama_model *model = nullptr;
const char *backend_name = nullptr; llama_context *ctx = nullptr;
std::vector<LLModel::Token> inputTokens; llama_model_params model_params;
llama_context_params ctx_params;
llama_model *model = nullptr; int64_t n_threads = 0;
llama_context *ctx = nullptr; std::vector<LLModel::Token> end_tokens;
llama_model_params model_params; const char *backend_name = nullptr;
llama_context_params ctx_params;
llama_sampler *sampler_chain;
}; };
LLamaModel::LLamaModel() LLamaModel::LLamaModel()
: d_ptr(std::make_unique<LLamaPrivate>()) : d_ptr(new LLamaPrivate) {}
{
auto sparams = llama_sampler_chain_default_params();
d_ptr->sampler_chain = llama_sampler_chain_init(sparams);
}
// default hparams (LLaMA 7B) // default hparams (LLaMA 7B)
struct llama_file_hparams { struct llama_file_hparams {
@ -424,9 +436,10 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
} }
} }
d_ptr->ctx_params.n_ctx = n_ctx; d_ptr->ctx_params.n_ctx = n_ctx;
d_ptr->ctx_params.type_k = params.kv_type; d_ptr->ctx_params.seed = params.seed;
d_ptr->ctx_params.type_v = params.kv_type; d_ptr->ctx_params.type_k = params.kv_type;
d_ptr->ctx_params.type_v = params.kv_type;
// The new batch API provides space for n_vocab*n_tokens logits. Tell llama.cpp early // The new batch API provides space for n_vocab*n_tokens logits. Tell llama.cpp early
// that we want this many logits so the state serializes consistently. // that we want this many logits so the state serializes consistently.
@ -492,7 +505,6 @@ LLamaModel::~LLamaModel()
llama_free(d_ptr->ctx); llama_free(d_ptr->ctx);
} }
llama_free_model(d_ptr->model); llama_free_model(d_ptr->model);
llama_sampler_free(d_ptr->sampler_chain);
} }
bool LLamaModel::isModelLoaded() const bool LLamaModel::isModelLoaded() const
@ -502,41 +514,38 @@ bool LLamaModel::isModelLoaded() const
size_t LLamaModel::stateSize() const size_t LLamaModel::stateSize() const
{ {
return llama_state_get_size(d_ptr->ctx); return llama_get_state_size(d_ptr->ctx);
} }
size_t LLamaModel::saveState(std::span<uint8_t> stateOut, std::vector<Token> &inputTokensOut) const size_t LLamaModel::saveState(uint8_t *dest) const
{ {
size_t bytesWritten = llama_state_get_data(d_ptr->ctx, stateOut.data(), stateOut.size()); return llama_copy_state_data(d_ptr->ctx, dest);
if (bytesWritten)
inputTokensOut.assign(d_ptr->inputTokens.begin(), d_ptr->inputTokens.end());
return bytesWritten;
} }
size_t LLamaModel::restoreState(std::span<const uint8_t> state, std::span<const Token> inputTokens) size_t LLamaModel::restoreState(const uint8_t *src)
{ {
size_t bytesRead = llama_state_set_data(d_ptr->ctx, state.data(), state.size()); // const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
if (bytesRead) return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
d_ptr->inputTokens.assign(inputTokens.begin(), inputTokens.end());
return bytesRead;
} }
std::vector<LLModel::Token> LLamaModel::tokenize(std::string_view str) const std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str, bool special)
{ {
bool atStart = m_tokenize_last_token == -1;
bool insertSpace = atStart || (
llama_token_get_attr(d_ptr->model, m_tokenize_last_token)
& (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN)
);
std::vector<LLModel::Token> fres(str.length() + 4); std::vector<LLModel::Token> fres(str.length() + 4);
int32_t fres_len = llama_tokenize( int32_t fres_len = llama_tokenize_gpt4all(
d_ptr->model, str.data(), str.length(), fres.data(), fres.size(), /*add_special*/ true, /*parse_special*/ true d_ptr->model, str.c_str(), str.length(), fres.data(), fres.size(), /*add_special*/ atStart,
/*parse_special*/ special, /*insert_space*/ insertSpace
); );
fres.resize(fres_len); fres.resize(fres_len);
if (fres_len)
m_tokenize_last_token = fres.back();
return fres; return fres;
} }
bool LLamaModel::isSpecialToken(Token id) const
{
return llama_token_get_attr(d_ptr->model, id)
& (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN);
}
std::string LLamaModel::tokenToString(Token id) const std::string LLamaModel::tokenToString(Token id) const
{ {
std::vector<char> result(8, 0); std::vector<char> result(8, 0);
@ -553,58 +562,18 @@ std::string LLamaModel::tokenToString(Token id) const
return std::string(result.data(), result.size()); return std::string(result.data(), result.size());
} }
void LLamaModel::initSampler(const PromptContext &promptCtx) LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
{ {
auto *model = d_ptr->model; const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
auto *chain = d_ptr->sampler_chain; return llama_sample_top_p_top_k(d_ptr->ctx,
promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
// clear sampler chain n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.min_p, promptCtx.temp,
for (int i = llama_sampler_chain_n(chain) - 1; i >= 0; i--) { promptCtx.repeat_penalty);
auto *smpl = llama_sampler_chain_remove(chain, i);
llama_sampler_free(smpl);
}
// build new chain
llama_sampler_chain_add(chain,
llama_sampler_init_penalties(
llama_n_vocab(model),
llama_token_eos(model),
llama_token_nl(model),
promptCtx.repeat_last_n,
promptCtx.repeat_penalty,
// TODO(jared): consider making the below configurable
/*penalty_freq*/ 0.0f,
/*penalty_present*/ 0.0f,
/*penalize_nl*/ true,
/*ignore_eos*/ false
)
);
if (promptCtx.temp == 0.0f) {
llama_sampler_chain_add(chain, llama_sampler_init_greedy());
} else {
struct llama_sampler *samplers[] = {
llama_sampler_init_top_k(promptCtx.top_k),
llama_sampler_init_top_p(promptCtx.top_p, 1),
llama_sampler_init_min_p(promptCtx.min_p, 1),
llama_sampler_init_temp(promptCtx.temp),
llama_sampler_init_softmax(),
llama_sampler_init_dist(LLAMA_DEFAULT_SEED),
};
for (auto *smpl : samplers)
llama_sampler_chain_add(chain, smpl);
}
} }
LLModel::Token LLamaModel::sampleToken() const bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
{ {
return llama_sampler_sample(d_ptr->sampler_chain, d_ptr->ctx, -1); llama_kv_cache_seq_rm(d_ptr->ctx, 0, ctx.n_past, -1);
}
bool LLamaModel::evalTokens(int32_t nPast, std::span<const Token> tokens) const
{
assert(!tokens.empty());
llama_kv_cache_seq_rm(d_ptr->ctx, 0, nPast, -1);
llama_batch batch = llama_batch_init(tokens.size(), 0, 1); llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
@ -612,7 +581,7 @@ bool LLamaModel::evalTokens(int32_t nPast, std::span<const Token> tokens) const
for (int32_t i = 0; i < batch.n_tokens; i++) { for (int32_t i = 0; i < batch.n_tokens; i++) {
batch.token [i] = tokens[i]; batch.token [i] = tokens[i];
batch.pos [i] = nPast + i; batch.pos [i] = ctx.n_past + i;
batch.n_seq_id[i] = 1; batch.n_seq_id[i] = 1;
batch.seq_id [i][0] = 0; batch.seq_id [i][0] = 0;
batch.logits [i] = false; batch.logits [i] = false;
@ -626,86 +595,11 @@ bool LLamaModel::evalTokens(int32_t nPast, std::span<const Token> tokens) const
return res == 0; return res == 0;
} }
void LLamaModel::shiftContext(const PromptContext &promptCtx, int32_t *nPast)
{
// infinite text generation via context shifting
// erase up to n_ctx*contextErase tokens
int n_keep = shouldAddBOS();
int n_past = *nPast;
int n_discard = std::min(n_past - n_keep, int(contextLength() * promptCtx.contextErase));
assert(n_discard > 0);
if (n_discard <= 0)
return;
std::cerr << "Llama: context full, swapping: n_past = " << n_past << ", n_keep = " << n_keep
<< ", n_discard = " << n_discard << "\n";
// erase the first n_discard tokens from the context
llama_kv_cache_seq_rm (d_ptr->ctx, 0, n_keep, n_keep + n_discard);
llama_kv_cache_seq_add(d_ptr->ctx, 0, n_keep + n_discard, n_past, -n_discard);
auto &inp = d_ptr->inputTokens;
inp.erase(inp.begin() + n_keep, inp.begin() + n_keep + n_discard);
*nPast = inp.size();
}
int32_t LLamaModel::contextLength() const int32_t LLamaModel::contextLength() const
{ {
return llama_n_ctx(d_ptr->ctx); return llama_n_ctx(d_ptr->ctx);
} }
auto LLamaModel::specialTokens() -> std::unordered_map<std::string, std::string> const
{
if (!d_ptr->model)
throw std::logic_error("model not loaded");
std::unordered_map<std::string, std::string> tokens;
if (auto id = llama_token_bos(d_ptr->model); id != LLAMA_TOKEN_NULL)
tokens.emplace("bos_token", tokenToString(id));
if (auto id = llama_token_eos(d_ptr->model); id != LLAMA_TOKEN_NULL)
tokens.emplace("eos_token", tokenToString(id));
return tokens;
}
int32_t LLamaModel::inputLength() const
{
return d_ptr->inputTokens.size();
}
int32_t LLamaModel::computeModelInputPosition(std::span<const Token> input) const
{
// find common prefix
auto cacheIt = d_ptr->inputTokens.begin();
auto inputIt = input.begin();
while (cacheIt < d_ptr->inputTokens.end() && inputIt < input.end() && *cacheIt == *inputIt) {
++cacheIt; ++inputIt;
}
// tell the caller to ignore the tokens between [begin, inputIt)
return inputIt - input.begin();
}
void LLamaModel::setModelInputPosition(int32_t pos)
{
auto &inp = d_ptr->inputTokens;
assert(pos >= 0);
assert(pos <= inp.size());
// truncate token cache to end at the new n_past
if (pos < inp.size())
inp.resize(pos);
}
void LLamaModel::appendInputToken(Token tok)
{
d_ptr->inputTokens.push_back(tok);
}
auto LLamaModel::inputTokens() const -> std::span<const Token>
{
return d_ptr->inputTokens;
}
const std::vector<LLModel::Token> &LLamaModel::endTokens() const const std::vector<LLModel::Token> &LLamaModel::endTokens() const
{ {
return d_ptr->end_tokens; return d_ptr->end_tokens;
@ -726,37 +620,6 @@ int32_t LLamaModel::layerCount(std::string const &modelPath) const
return get_arch_key_u32(modelPath, "block_count"); return get_arch_key_u32(modelPath, "block_count");
} }
// TODO(jared): reduce redundant code and operations by combining all metadata getters for unloaded
// models into a class that keeps the model file open
auto LLamaModel::chatTemplate(const char *modelPath) const -> std::expected<std::string, std::string>
{
auto *ctx = load_gguf(modelPath);
if (!ctx)
return std::unexpected("failed to open model file");
std::expected<std::string, std::string> result;
enum gguf_type ktype;
const int kid = gguf_find_key(ctx, "tokenizer.chat_template");
if (kid == -1) {
result = std::unexpected("key not found");
goto cleanup;
}
ktype = gguf_get_kv_type(ctx, kid);
if (ktype != GGUF_TYPE_STRING) {
result = std::unexpected(
"expected key type STRING (" + std::to_string(GGUF_TYPE_STRING) + "), got " + std::to_string(ktype)
);
goto cleanup;
}
result = gguf_get_val_str(ctx, kid);
cleanup:
gguf_free(ctx);
return result;
}
#ifdef GGML_USE_VULKAN #ifdef GGML_USE_VULKAN
static const char *getVulkanVendorName(uint32_t vendorID) static const char *getVulkanVendorName(uint32_t vendorID)
{ {
@ -1329,9 +1192,9 @@ DLL_EXPORT bool is_arch_supported(const char *arch)
DLL_EXPORT LLModel *construct() DLL_EXPORT LLModel *construct()
{ {
llama_log_set([](auto l, auto t, auto u) { llama_log_callback(l, t, u, false); }, nullptr); llama_log_set(llama_log_callback, nullptr);
#ifdef GGML_USE_CUDA #ifdef GGML_USE_CUDA
ggml_backend_cuda_log_set_callback([](auto l, auto t, auto u) { llama_log_callback(l, t, u, true); }, nullptr); ggml_backend_cuda_log_set_callback(cuda_log_callback, nullptr);
#endif #endif
return new LLamaModel; return new LLamaModel;
} }


@ -6,12 +6,10 @@
#include "llmodel.h" #include "llmodel.h"
#include <functional>
#include <memory> #include <memory>
#include <span>
#include <string> #include <string>
#include <string_view>
#include <vector> #include <vector>
#include <unordered_map>
struct LLamaPrivate; struct LLamaPrivate;
struct EmbModelSpec; struct EmbModelSpec;
@ -29,8 +27,8 @@ public:
bool isModelLoaded() const override; bool isModelLoaded() const override;
size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override; size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
size_t stateSize() const override; size_t stateSize() const override;
size_t saveState(std::span<uint8_t> stateOut, std::vector<Token> &inputTokensOut) const override; size_t saveState(uint8_t *dest) const override;
size_t restoreState(std::span<const uint8_t> state, std::span<const Token> inputTokens) override; size_t restoreState(const uint8_t *src) override;
void setThreadCount(int32_t n_threads) override; void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override; int32_t threadCount() const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override; std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
@ -49,36 +47,25 @@ public:
void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality = -1, void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality = -1,
size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override; size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
int32_t contextLength() const override;
auto specialTokens() -> std::unordered_map<std::string, std::string> const override;
protected:
std::vector<Token> tokenize(std::string_view str) const override;
bool isSpecialToken(Token id) const override;
std::string tokenToString(Token id) const override;
void initSampler(const PromptContext &ctx) override;
Token sampleToken() const override;
bool evalTokens(int32_t nPast, std::span<const Token> tokens) const override;
void shiftContext(const PromptContext &promptCtx, int32_t *nPast) override;
int32_t inputLength() const override;
int32_t computeModelInputPosition(std::span<const Token> input) const override;
void setModelInputPosition(int32_t pos) override;
void appendInputToken(Token tok) override;
std::span<const Token> inputTokens() const override;
const std::vector<Token> &endTokens() const override;
bool shouldAddBOS() const override;
int32_t maxContextLength(std::string const &modelPath) const override;
int32_t layerCount(std::string const &modelPath) const override;
auto chatTemplate(const char *modelPath) const -> std::expected<std::string, std::string> override;
void embedInternal(const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb,
const EmbModelSpec *spec);
private: private:
std::unique_ptr<LLamaPrivate> d_ptr; std::unique_ptr<LLamaPrivate> d_ptr;
bool m_supportsEmbedding = false; bool m_supportsEmbedding = false;
bool m_supportsCompletion = false; bool m_supportsCompletion = false;
protected:
std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) override;
std::string tokenToString(Token id) const override;
Token sampleToken(PromptContext &ctx) const override;
bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
int32_t contextLength() const override;
const std::vector<Token> &endTokens() const override;
bool shouldAddBOS() const override;
int32_t maxContextLength(std::string const &modelPath) const override;
int32_t layerCount(std::string const &modelPath) const override;
void embedInternal(const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb,
const EmbModelSpec *spec);
}; };
#endif // LLAMAMODEL_H #endif // LLAMAMODEL_H


@ -140,14 +140,9 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
std::string path; std::string path;
// Split the paths string by the delimiter and process each path. // Split the paths string by the delimiter and process each path.
while (std::getline(ss, path, ';')) { while (std::getline(ss, path, ';')) {
fs::directory_iterator iter; std::u8string u8_path(path.begin(), path.end());
try {
iter = fs::directory_iterator(std::u8string(path.begin(), path.end()));
} catch (const fs::filesystem_error &) {
continue; // skip nonexistent path
}
// Iterate over all libraries // Iterate over all libraries
for (const auto &f : iter) { for (const auto &f : fs::directory_iterator(u8_path)) {
const fs::path &p = f.path(); const fs::path &p = f.path();
if (p.extension() != LIB_FILE_EXT) continue; if (p.extension() != LIB_FILE_EXT) continue;
@ -331,12 +326,6 @@ bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath)
return llama && llama->isEmbeddingModel(modelPath); return llama && llama->isEmbeddingModel(modelPath);
} }
auto LLModel::Implementation::chatTemplate(const char *modelPath) -> std::expected<std::string, std::string>
{
auto *llama = constructGlobalLlama();
return llama ? llama->chatTemplate(modelPath) : std::unexpected("backend not available");
}
void LLModel::Implementation::setImplementationsSearchPath(const std::string& path) void LLModel::Implementation::setImplementationsSearchPath(const std::string& path)
{ {
s_implementations_search_path = path; s_implementations_search_path = path;


@ -5,10 +5,8 @@
#include <cassert> #include <cassert>
#include <cstddef> #include <cstddef>
#include <cstdint> #include <cstdint>
#include <expected>
#include <functional> #include <functional>
#include <optional> #include <optional>
#include <span>
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
#include <string_view> #include <string_view>
@ -25,10 +23,6 @@ using namespace std::string_literals;
class LLModel { class LLModel {
public: public:
using Token = int32_t; using Token = int32_t;
using PromptCallback = std::function<bool(std::span<const Token> batch, bool cached)>;
using ResponseCallback = std::function<bool(Token token, std::string_view piece)>;
using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend);
using ProgressCallback = std::function<bool(float progress)>;
class BadArchError: public std::runtime_error { class BadArchError: public std::runtime_error {
public: public:
@ -106,7 +100,6 @@ public:
static int32_t maxContextLength(const std::string &modelPath); static int32_t maxContextLength(const std::string &modelPath);
static int32_t layerCount(const std::string &modelPath); static int32_t layerCount(const std::string &modelPath);
static bool isEmbeddingModel(const std::string &modelPath); static bool isEmbeddingModel(const std::string &modelPath);
static auto chatTemplate(const char *modelPath) -> std::expected<std::string, std::string>;
static void setImplementationsSearchPath(const std::string &path); static void setImplementationsSearchPath(const std::string &path);
static const std::string &implementationsSearchPath(); static const std::string &implementationsSearchPath();
static bool hasSupportedCPU(); static bool hasSupportedCPU();
@ -130,6 +123,9 @@ public:
}; };
struct PromptContext { struct PromptContext {
std::vector<int32_t> tokens; // current tokens in the context window
int32_t n_past = 0; // number of tokens in past conversation
int32_t n_ctx = 0; // number of tokens possible in context window
int32_t n_predict = 200; int32_t n_predict = 200;
int32_t top_k = 40; int32_t top_k = 40;
float top_p = 0.9f; float top_p = 0.9f;
@ -138,31 +134,37 @@ public:
int32_t n_batch = 9; int32_t n_batch = 9;
float repeat_penalty = 1.10f; float repeat_penalty = 1.10f;
int32_t repeat_last_n = 64; // last n tokens to penalize int32_t repeat_last_n = 64; // last n tokens to penalize
float contextErase = 0.5f; // percent of context to erase if we exceed the context window float contextErase = 0.75f; // percent of context to erase if we exceed the context window
}; };
using ProgressCallback = std::function<bool(float progress)>;
explicit LLModel() {} explicit LLModel() {}
virtual ~LLModel() {} virtual ~LLModel() {}
virtual bool supportsEmbedding() const = 0; virtual bool supportsEmbedding() const = 0;
virtual bool supportsCompletion() const = 0; virtual bool supportsCompletion() const = 0;
virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0; virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0;
virtual bool isModelBlacklisted(const std::string &modelPath) const { (void)modelPath; return false; } virtual bool isModelBlacklisted(const std::string &modelPath) const { (void)modelPath; return false; };
virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; } virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; }
virtual bool isModelLoaded() const = 0; virtual bool isModelLoaded() const = 0;
virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0; virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
virtual size_t stateSize() const = 0; virtual size_t stateSize() const { return 0; }
virtual size_t saveState(std::span<uint8_t> stateOut, std::vector<Token> &inputTokensOut) const = 0; virtual size_t saveState(uint8_t *dest) const { (void)dest; return 0; }
virtual size_t restoreState(std::span<const uint8_t> state, std::span<const Token> inputTokens) = 0; virtual size_t restoreState(const uint8_t *src) { (void)src; return 0; }
// This method requires the model to return true from supportsCompletion otherwise it will throw // This method requires the model to return true from supportsCompletion otherwise it will throw
// an error // an error
virtual void prompt(std::string_view prompt, virtual void prompt(const std::string &prompt,
const PromptCallback &promptCallback, const std::string &promptTemplate,
const ResponseCallback &responseCallback, std::function<bool(int32_t)> promptCallback,
const PromptContext &ctx); std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback,
PromptContext &ctx,
bool special = false,
std::string *fakeReply = nullptr);
virtual int32_t countPromptTokens(std::string_view prompt) const; using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend);
virtual size_t embeddingSize() const { virtual size_t embeddingSize() const {
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings"); throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
@ -207,24 +209,14 @@ public:
void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; } void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; }
virtual int32_t contextLength() const = 0;
virtual auto specialTokens() -> std::unordered_map<std::string, std::string> const = 0;
protected: protected:
// These are pure virtual because subclasses need to implement as the default implementation of // These are pure virtual because subclasses need to implement as the default implementation of
// 'prompt' above calls these functions // 'prompt' above calls these functions
virtual std::vector<Token> tokenize(std::string_view str) const = 0; virtual std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special = false) = 0;
virtual bool isSpecialToken(Token id) const = 0;
virtual std::string tokenToString(Token id) const = 0; virtual std::string tokenToString(Token id) const = 0;
virtual void initSampler(const PromptContext &ctx) = 0; virtual Token sampleToken(PromptContext &ctx) const = 0;
virtual Token sampleToken() const = 0; virtual bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const = 0;
virtual bool evalTokens(int32_t nPast, std::span<const Token> tokens) const = 0; virtual int32_t contextLength() const = 0;
virtual void shiftContext(const PromptContext &promptCtx, int32_t *nPast) = 0;
virtual int32_t inputLength() const = 0;
virtual int32_t computeModelInputPosition(std::span<const Token> input) const = 0;
virtual void setModelInputPosition(int32_t pos) = 0;
virtual void appendInputToken(Token tok) = 0;
virtual std::span<const Token> inputTokens() const = 0;
virtual const std::vector<Token> &endTokens() const = 0; virtual const std::vector<Token> &endTokens() const = 0;
virtual bool shouldAddBOS() const = 0; virtual bool shouldAddBOS() const = 0;
@ -240,11 +232,9 @@ protected:
return -1; return -1;
} }
virtual auto chatTemplate(const char *modelPath) const -> std::expected<std::string, std::string> // This is a helper function called from the default implementation of 'prompt' but it can be
{ // shared by all base classes so it isn't virtual
(void)modelPath; void recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate);
return std::unexpected("not implemented");
}
const Implementation *m_implementation = nullptr; const Implementation *m_implementation = nullptr;
@ -257,15 +247,16 @@ protected:
return true; return true;
} }
// prefill context with prompt bool decodePrompt(std::function<bool(int32_t)> promptCallback,
auto decodePrompt(const PromptCallback &promptCallback, std::function<bool(int32_t, const std::string&)> responseCallback,
const PromptContext &promptCtx, std::function<bool(bool)> recalculateCallback,
std::vector<Token> embd_inp) PromptContext &promptCtx,
-> std::optional<int32_t>; std::vector<Token> embd_inp);
// generate a response void generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
void generateResponse(const ResponseCallback &responseCallback, std::function<bool(bool)> recalculateCallback,
const PromptContext &promptCtx, PromptContext &promptCtx);
int32_t nPast);
Token m_tokenize_last_token = -1; // not serialized
friend class LLMImplementation; friend class LLMImplementation;
}; };


@ -7,20 +7,16 @@
#include <cstdlib> #include <cstdlib>
#include <cstring> #include <cstring>
#include <exception> #include <exception>
#include <functional>
#include <iostream> #include <iostream>
#include <memory> #include <memory>
#include <optional> #include <optional>
#include <string> #include <string>
#include <string_view>
#include <vector> #include <vector>
#include <span>
namespace ranges = std::ranges;
static_assert(sizeof(token_t) == sizeof(LLModel::Token));
struct LLModelWrapper { struct LLModelWrapper {
LLModel *llModel = nullptr; LLModel *llModel = nullptr;
LLModel::PromptContext promptContext;
~LLModelWrapper() { delete llModel; } ~LLModelWrapper() { delete llModel; }
}; };
@ -88,80 +84,77 @@ bool llmodel_isModelLoaded(llmodel_model model)
return wrapper->llModel->isModelLoaded(); return wrapper->llModel->isModelLoaded();
} }
uint64_t llmodel_state_get_size(llmodel_model model) uint64_t llmodel_get_state_size(llmodel_model model)
{ {
auto *wrapper = static_cast<LLModelWrapper *>(model); auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->stateSize(); return wrapper->llModel->stateSize();
} }
uint64_t llmodel_state_get_data(llmodel_model model, uint8_t *state_out, uint64_t state_size, uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest)
token_t **input_tokens_out, uint64_t *n_input_tokens)
{ {
auto *wrapper = static_cast<LLModelWrapper *>(model); auto *wrapper = static_cast<LLModelWrapper *>(model);
std::vector<LLModel::Token> inputTokens; return wrapper->llModel->saveState(dest);
auto bytesWritten = wrapper->llModel->saveState({state_out, size_t(state_size)}, inputTokens);
if (bytesWritten) {
auto *buf = new LLModel::Token[inputTokens.size()];
ranges::copy(inputTokens, buf);
*input_tokens_out = buf;
*n_input_tokens = uint64_t(inputTokens.size());
} else {
*input_tokens_out = nullptr;
*n_input_tokens = 0;
}
return bytesWritten;
} }
void llmodel_state_free_input_tokens(LLModel::Token *input_tokens) uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src)
{
delete[] input_tokens;
}
uint64_t llmodel_state_set_data(llmodel_model model, const uint8_t *state, uint64_t state_size,
const token_t *input_tokens, uint64_t n_input_tokens)
{ {
auto *wrapper = static_cast<LLModelWrapper *>(model); auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->restoreState({state, size_t(state_size)}, {input_tokens, size_t(n_input_tokens)}); return wrapper->llModel->restoreState(src);
} }
bool llmodel_prompt(llmodel_model model, void llmodel_prompt(llmodel_model model, const char *prompt,
const char *prompt, const char *prompt_template,
llmodel_prompt_callback prompt_callback, llmodel_prompt_callback prompt_callback,
llmodel_response_callback response_callback, llmodel_response_callback response_callback,
llmodel_prompt_context *ctx, llmodel_recalculate_callback recalculate_callback,
const char **error) llmodel_prompt_context *ctx,
bool special,
const char *fake_reply)
{ {
auto *wrapper = static_cast<LLModelWrapper *>(model); auto *wrapper = static_cast<LLModelWrapper *>(model);
auto response_func = [response_callback](int32_t token_id, const std::string &response) {
return response_callback(token_id, response.c_str());
};
// Copy the C prompt context // Copy the C prompt context
LLModel::PromptContext promptContext { wrapper->promptContext.n_past = ctx->n_past;
.n_predict = ctx->n_predict, wrapper->promptContext.n_ctx = ctx->n_ctx;
.top_k = ctx->top_k, wrapper->promptContext.n_predict = ctx->n_predict;
.top_p = ctx->top_p, wrapper->promptContext.top_k = ctx->top_k;
.min_p = ctx->min_p, wrapper->promptContext.top_p = ctx->top_p;
.temp = ctx->temp, wrapper->promptContext.min_p = ctx->min_p;
.n_batch = ctx->n_batch, wrapper->promptContext.temp = ctx->temp;
.repeat_penalty = ctx->repeat_penalty, wrapper->promptContext.n_batch = ctx->n_batch;
.repeat_last_n = ctx->repeat_last_n, wrapper->promptContext.repeat_penalty = ctx->repeat_penalty;
.contextErase = ctx->context_erase, wrapper->promptContext.repeat_last_n = ctx->repeat_last_n;
}; wrapper->promptContext.contextErase = ctx->context_erase;
auto prompt_func = [prompt_callback](std::span<const LLModel::Token> token_ids, bool cached) { std::string fake_reply_str;
return prompt_callback(token_ids.data(), token_ids.size(), cached); if (fake_reply) { fake_reply_str = fake_reply; }
}; auto *fake_reply_p = fake_reply ? &fake_reply_str : nullptr;
auto response_func = [response_callback](LLModel::Token token_id, std::string_view piece) {
return response_callback(token_id, piece.data());
};
// Call the C++ prompt method // Call the C++ prompt method
try { wrapper->llModel->prompt(prompt, prompt_template, prompt_callback, response_func, recalculate_callback,
wrapper->llModel->prompt(prompt, prompt_func, response_func, promptContext); wrapper->promptContext, special, fake_reply_p);
} catch (std::exception const &e) {
llmodel_set_error(error, e.what());
return false;
}
return true; // Update the C context by giving access to the wrappers raw pointers to std::vector data
// which involves no copies
ctx->tokens = wrapper->promptContext.tokens.data();
ctx->tokens_size = wrapper->promptContext.tokens.size();
// Update the rest of the C prompt context
ctx->n_past = wrapper->promptContext.n_past;
ctx->n_ctx = wrapper->promptContext.n_ctx;
ctx->n_predict = wrapper->promptContext.n_predict;
ctx->top_k = wrapper->promptContext.top_k;
ctx->top_p = wrapper->promptContext.top_p;
ctx->min_p = wrapper->promptContext.min_p;
ctx->temp = wrapper->promptContext.temp;
ctx->n_batch = wrapper->promptContext.n_batch;
ctx->repeat_penalty = wrapper->promptContext.repeat_penalty;
ctx->repeat_last_n = wrapper->promptContext.repeat_last_n;
ctx->context_erase = wrapper->promptContext.contextErase;
} }
float *llmodel_embed( float *llmodel_embed(
@ -300,21 +293,3 @@ const char *llmodel_model_gpu_device_name(llmodel_model model)
const auto *wrapper = static_cast<LLModelWrapper *>(model); const auto *wrapper = static_cast<LLModelWrapper *>(model);
return wrapper->llModel->gpuDeviceName(); return wrapper->llModel->gpuDeviceName();
} }
int32_t llmodel_count_prompt_tokens(llmodel_model model, const char *prompt, const char **error)
{
auto *wrapper = static_cast<const LLModelWrapper *>(model);
try {
return wrapper->llModel->countPromptTokens(prompt);
} catch (const std::exception& e) {
llmodel_set_error(error, e.what());
return -1;
}
}
void llmodel_model_foreach_special_token(llmodel_model model, llmodel_special_token_callback callback)
{
auto *wrapper = static_cast<const LLModelWrapper *>(model);
for (auto &[name, token] : wrapper->llModel->specialTokens())
callback(name.c_str(), token.c_str());
}

View File

@ -23,11 +23,6 @@ extern "C" {
*/ */
typedef void *llmodel_model; typedef void *llmodel_model;
/**
* A token.
*/
typedef int32_t token_t;
/** /**
* llmodel_prompt_context structure for holding the prompt context. * llmodel_prompt_context structure for holding the prompt context.
* NOTE: The implementation takes care of all the memory handling of the raw logits pointer and the * NOTE: The implementation takes care of all the memory handling of the raw logits pointer and the
@ -35,15 +30,19 @@ typedef int32_t token_t;
* behavior. * behavior.
*/ */
struct llmodel_prompt_context { struct llmodel_prompt_context {
int32_t *tokens; // current tokens in the context window
size_t tokens_size; // the size of the raw tokens vector
int32_t n_past; // number of tokens in past conversation
int32_t n_ctx; // number of tokens possible in context window
int32_t n_predict; // number of tokens to predict int32_t n_predict; // number of tokens to predict
int32_t top_k; // top k logits to sample from int32_t top_k; // top k logits to sample from
float top_p; // nucleus sampling probability threshold float top_p; // nucleus sampling probability threshold
float min_p; // Min P sampling float min_p; // Min P sampling
float temp; // temperature to adjust model's output distribution float temp; // temperature to adjust model's output distribution
int32_t n_batch; // number of predictions to generate in parallel int32_t n_batch; // number of predictions to generate in parallel
float repeat_penalty; // penalty factor for repeated tokens float repeat_penalty; // penalty factor for repeated tokens
int32_t repeat_last_n; // last n tokens to penalize int32_t repeat_last_n; // last n tokens to penalize
float context_erase; // percent of context to erase if we exceed the context window float context_erase; // percent of context to erase if we exceed the context window
}; };
struct llmodel_gpu_device { struct llmodel_gpu_device {
@ -62,12 +61,10 @@ typedef struct llmodel_gpu_device llmodel_gpu_device;
/** /**
* Callback type for prompt processing. * Callback type for prompt processing.
* @param token_ids An array of token ids of the prompt. * @param token_id The token id of the prompt.
* @param n_token_ids The number of tokens in the array.
* @param cached Whether the tokens were already in cache.
* @return a bool indicating whether the model should keep processing. * @return a bool indicating whether the model should keep processing.
*/ */
typedef bool (*llmodel_prompt_callback)(const token_t *token_ids, size_t n_token_ids, bool cached); typedef bool (*llmodel_prompt_callback)(int32_t token_id);
/** /**
* Callback type for response. * Callback type for response.
@ -75,7 +72,14 @@ typedef bool (*llmodel_prompt_callback)(const token_t *token_ids, size_t n_token
* @param response The response string. NOTE: a token_id of -1 indicates the string is an error string. * @param response The response string. NOTE: a token_id of -1 indicates the string is an error string.
* @return a bool indicating whether the model should keep generating. * @return a bool indicating whether the model should keep generating.
*/ */
typedef bool (*llmodel_response_callback)(token_t token_id, const char *response); typedef bool (*llmodel_response_callback)(int32_t token_id, const char *response);
/**
* Callback type for recalculation of context.
* @param whether the model is recalculating the context.
* @return a bool indicating whether the model should keep generating.
*/
typedef bool (*llmodel_recalculate_callback)(bool is_recalculating);
/** /**
* Embedding cancellation callback for use with llmodel_embed. * Embedding cancellation callback for use with llmodel_embed.
@ -86,8 +90,6 @@ typedef bool (*llmodel_response_callback)(token_t token_id, const char *response
*/ */
typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend); typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend);
typedef void (*llmodel_special_token_callback)(const char *name, const char *token);
/** /**
* Create a llmodel instance. * Create a llmodel instance.
* Recognises correct model type from file at model_path * Recognises correct model type from file at model_path
@ -146,57 +148,46 @@ bool llmodel_isModelLoaded(llmodel_model model);
* @param model A pointer to the llmodel_model instance. * @param model A pointer to the llmodel_model instance.
* @return the size in bytes of the internal state of the model * @return the size in bytes of the internal state of the model
*/ */
uint64_t llmodel_state_get_size(llmodel_model model); uint64_t llmodel_get_state_size(llmodel_model model);
/** /**
* Saves the internal state of the model. * Saves the internal state of the model to the specified destination address.
* NOTE: This state data is specific to the type of model you have created. * NOTE: This state data is specific to the type of model you have created.
* @param model A pointer to the llmodel_model instance. * @param model A pointer to the llmodel_model instance.
* @param state Where to store the state. This must be a buffer of at least llmodel_state_get_size() bytes. * @param dest A pointer to the destination.
* @param state_size The size of the destination for the state. * @return the number of bytes copied
* @param input_tokens_out Where to store the address of the token cache state. This is dynamically allocated and must
* be freed with llmodel_state_free_input_tokens.
* @param n_input_tokens Where to store the size of the token cache state.
* @return The number of bytes copied. On error, zero is returned, the token cache is set to NULL, and the token cache
* size is set to zero.
*/ */
uint64_t llmodel_state_get_data(llmodel_model model, uint8_t *state_out, uint64_t state_size, uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest);
token_t **input_tokens_out, uint64_t *n_input_tokens);
/**
* Frees the temporary token cache buffer created by a call to llmodel_state_get_data().
* @param input_tokens The token cache buffer.
*/
void llmodel_state_free_input_tokens(token_t *input_tokens);
/** /**
* Restores the internal state of the model using data from the specified address. * Restores the internal state of the model using data from the specified address.
* NOTE: This state data is specific to the type of model you have created. * NOTE: This state data is specific to the type of model you have created.
* @param model A pointer to the llmodel_model instance. * @param model A pointer to the llmodel_model instance.
* @param state A pointer to the state data. * @param src A pointer to the src.
* @param state_size The size of the state data. * @return the number of bytes read
* @param input_tokens The token cache associated with the saved state.
* @param n_input_tokens The number of tokens in input_tokens.
* @return The number of bytes read, or zero on error.
*/ */
uint64_t llmodel_state_set_data(llmodel_model model, const uint8_t *state, uint64_t state_size, uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src);
const token_t *input_tokens, uint64_t n_input_tokens);
/** /**
* Generate a response using the model. * Generate a response using the model.
* @param model A pointer to the llmodel_model instance. * @param model A pointer to the llmodel_model instance.
* @param prompt A string representing the input prompt. * @param prompt A string representing the input prompt.
* @param prompt_template A string representing the input prompt template.
* @param prompt_callback A callback function for handling the processing of prompt. * @param prompt_callback A callback function for handling the processing of prompt.
* @param response_callback A callback function for handling the generated response. * @param response_callback A callback function for handling the generated response.
* @param recalculate_callback A callback function for handling recalculation requests.
* @param special True if special tokens in the prompt should be processed, false otherwise.
* @param fake_reply A string to insert into context as the model's reply, or NULL to generate one.
* @param ctx A pointer to the llmodel_prompt_context structure. * @param ctx A pointer to the llmodel_prompt_context structure.
* @param error A pointer to a string; will only be set on error.
*/ */
bool llmodel_prompt(llmodel_model model, void llmodel_prompt(llmodel_model model, const char *prompt,
const char *prompt, const char *prompt_template,
llmodel_prompt_callback prompt_callback, llmodel_prompt_callback prompt_callback,
llmodel_response_callback response_callback, llmodel_response_callback response_callback,
llmodel_prompt_context *ctx, llmodel_recalculate_callback recalculate_callback,
const char **error); llmodel_prompt_context *ctx,
bool special,
const char *fake_reply);
/** /**
* Generate an embedding using the model. * Generate an embedding using the model.
@ -308,10 +299,6 @@ const char *llmodel_model_backend_name(llmodel_model model);
*/ */
const char *llmodel_model_gpu_device_name(llmodel_model model); const char *llmodel_model_gpu_device_name(llmodel_model model);
int32_t llmodel_count_prompt_tokens(llmodel_model model, const char *prompt, const char **error);
void llmodel_model_foreach_special_token(llmodel_model model, llmodel_special_token_callback callback);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif

View File

@ -0,0 +1,322 @@
#include "llmodel.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <iostream>
#include <optional>
#include <regex>
#include <sstream>
#include <stdexcept>
#include <string>
#include <unordered_set>
#include <vector>
// TODO(cebtenzzre): replace this with llama_kv_cache_seq_shift for llamamodel (GPT-J needs this as-is)
// FIXME(jared): if recalculate returns false, we leave n_past<tokens.size() and do not tell the caller to stop
// FIXME(jared): if we get here during chat name or follow-up generation, bad things will happen when we try to restore
// the old prompt context afterwards
void LLModel::recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate)
{
int n_keep = shouldAddBOS();
const int32_t n_discard = (promptCtx.n_ctx - n_keep) * promptCtx.contextErase;
// Erase the first percentage of context from the tokens
std::cerr << implementation().modelType() << ": reached the end of the context window so resizing\n";
promptCtx.tokens.erase(promptCtx.tokens.begin() + n_keep, promptCtx.tokens.begin() + n_keep + n_discard);
size_t i = n_keep;
promptCtx.n_past = n_keep;
while (i < promptCtx.tokens.size()) {
size_t batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size());
std::vector<int32_t> batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end);
assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx);
if (!evalTokens(promptCtx, batch)) {
std::cerr << "LLModel ERROR: Failed to process prompt\n";
goto stop_generating;
}
promptCtx.n_past += batch.size();
if (!recalculate(true))
goto stop_generating;
i = batch_end;
}
assert(promptCtx.n_past == int32_t(promptCtx.tokens.size()));
stop_generating:
recalculate(false);
}
static bool parsePromptTemplate(const std::string &tmpl, std::vector<std::smatch> &placeholders, std::string &err)
{
static const std::regex placeholderRegex(R"(%[1-2](?![0-9]))");
auto it = std::sregex_iterator(tmpl.begin(), tmpl.end(), placeholderRegex);
placeholders.clear();
placeholders.insert(placeholders.end(), it, std::sregex_iterator());
if (placeholders.size() > 2) {
err = "ERROR: expected at most two placeholders, got " + std::to_string(placeholders.size());
return false;
}
if (placeholders.size() >= 1 && placeholders[0].str() != "%1") {
err = "ERROR: first placeholder must be %1, got " + placeholders[0].str();
return false;
}
if (placeholders.size() >= 2 && placeholders[1].str() != "%2") {
err = "ERROR: second placeholder must be %2, got " + placeholders[1].str();
return false;
}
return true;
}
void LLModel::prompt(const std::string &prompt,
const std::string &promptTemplate,
std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback,
PromptContext &promptCtx,
bool special,
std::string *fakeReply)
{
if (!isModelLoaded()) {
std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
return;
}
if (!supportsCompletion()) {
std::string errorMessage = "ERROR: this model does not support text completion or chat!";
responseCallback(-1, errorMessage);
std::cerr << implementation().modelType() << " " << errorMessage << "\n";
return;
}
// make sure token cache matches decode offset
if (promptCtx.tokens.size() < promptCtx.n_past) {
std::ostringstream ss;
ss << "expected n_past to be at most " << promptCtx.tokens.size() << ", got " << promptCtx.n_past;
throw std::out_of_range(ss.str());
}
if (promptCtx.n_past < promptCtx.tokens.size())
promptCtx.tokens.resize(promptCtx.n_past);
m_tokenize_last_token = promptCtx.tokens.empty() ? -1 : promptCtx.tokens.back(); // not serialized
// parse the prompt template
std::vector<std::smatch> placeholders;
{
std::string err;
if (!parsePromptTemplate(promptTemplate, placeholders, err)) {
responseCallback(-1, err);
std::cerr << err << "\n";
return;
}
}
auto old_n_past = promptCtx.n_past; // prepare to fake n_past for tokenize
// tokenize the user prompt
std::vector<Token> embd_inp;
if (placeholders.empty()) {
// this is unusual, but well-defined
std::cerr << __func__ << ": prompt template has no placeholder\n";
embd_inp = tokenize(promptCtx, promptTemplate, true);
} else {
// template: beginning of user prompt
const auto &phUser = placeholders[0];
std::string userPrefix(phUser.prefix());
if (!userPrefix.empty()) {
embd_inp = tokenize(promptCtx, userPrefix, true);
promptCtx.n_past += embd_inp.size();
}
// user input (shouldn't have special token processing)
auto tokens = tokenize(promptCtx, prompt, special);
embd_inp.insert(embd_inp.end(), tokens.begin(), tokens.end());
promptCtx.n_past += tokens.size();
// template: end of user prompt + start of assistant prompt
size_t start = phUser.position() + phUser.length();
size_t end = placeholders.size() >= 2 ? placeholders[1].position() : promptTemplate.length();
auto userToAsst = promptTemplate.substr(start, end - start);
if (!userToAsst.empty()) {
tokens = tokenize(promptCtx, userToAsst, true);
embd_inp.insert(embd_inp.end(), tokens.begin(), tokens.end());
promptCtx.n_past += tokens.size();
}
}
promptCtx.n_past = old_n_past; // restore n_past so decodePrompt can increment it
// decode the user prompt
if (!decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp))
return; // error
// decode the assistant's reply, either generated or spoofed
if (fakeReply == nullptr) {
generateResponse(responseCallback, recalculateCallback, promptCtx);
} else {
embd_inp = tokenize(promptCtx, *fakeReply, false);
if (!decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp))
return; // error
}
// decode the rest of the prompt template
// template: end of assistant prompt
std::string asstSuffix;
if (placeholders.size() >= 2) {
size_t start = placeholders[1].position() + placeholders[1].length();
asstSuffix = promptTemplate.substr(start);
} else {
asstSuffix = "\n\n"; // default to a blank line, good for e.g. Alpaca
}
if (!asstSuffix.empty()) {
embd_inp = tokenize(promptCtx, asstSuffix, true);
decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp);
}
}
// returns false on error
bool LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback,
PromptContext &promptCtx,
std::vector<Token> embd_inp) {
// save the context size
promptCtx.n_ctx = contextLength();
if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
" tokens and the context window is " << promptCtx.n_ctx << "!\n";
return false;
}
promptCtx.n_predict = std::min(promptCtx.n_predict, promptCtx.n_ctx - (int) embd_inp.size());
promptCtx.n_past = std::min(promptCtx.n_past, promptCtx.n_ctx);
promptCtx.n_batch = std::min(promptCtx.n_batch, LLMODEL_MAX_PROMPT_BATCH);
// process the prompt in batches
size_t i = 0;
while (i < embd_inp.size()) {
size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size());
std::vector<Token> batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);
// Check if the context has run out...
if (promptCtx.n_past + int32_t(batch.size()) > promptCtx.n_ctx) {
recalculateContext(promptCtx, recalculateCallback);
assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx);
}
if (!evalTokens(promptCtx, batch)) {
std::cerr << implementation().modelType() << " ERROR: Failed to process prompt\n";
return false;
}
size_t tokens = batch_end - i;
for (size_t t = 0; t < tokens; ++t) {
promptCtx.tokens.push_back(batch.at(t));
promptCtx.n_past += 1;
if (!promptCallback(batch.at(t)))
return false;
}
i = batch_end;
}
return true;
}
void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback,
PromptContext &promptCtx) {
std::string cachedResponse;
std::vector<Token> cachedTokens;
std::unordered_set<std::string> reversePrompts
= { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context" };
// predict next tokens
for (int i = 0; i < promptCtx.n_predict; i++) {
// sample next token
auto id = sampleToken(promptCtx);
// Check if the context has run out...
if (promptCtx.n_past + 1 > promptCtx.n_ctx) {
recalculateContext(promptCtx, recalculateCallback);
assert(promptCtx.n_past + 1 <= promptCtx.n_ctx);
}
if (!evalTokens(promptCtx, { id })) {
std::cerr << implementation().modelType() << " ERROR: Failed to predict next token\n";
return;
}
// display text
for (const auto token : endTokens()) {
if (id == token) return;
}
const std::string str = tokenToString(id);
// Check if the provided str is part of our reverse prompts
bool foundPartialReversePrompt = false;
const std::string completed = cachedResponse + std::string(str);
if (reversePrompts.find(completed) != reversePrompts.end())
return;
// Check if it partially matches our reverse prompts and if so, cache
for (const auto& s : reversePrompts) {
if (s.compare(0, completed.size(), completed) == 0) {
foundPartialReversePrompt = true;
cachedResponse = completed;
break;
}
}
// Regardless the token gets added to our cache
cachedTokens.push_back(id);
// Continue if we have found a partial match
if (foundPartialReversePrompt)
continue;
// Empty the cache
for (auto t : cachedTokens) {
promptCtx.tokens.push_back(t);
promptCtx.n_past += 1;
//TODO: Conversion to std::string can be avoided here...
if (!responseCallback(t, std::string(tokenToString(t))))
return;
}
cachedTokens.clear();
}
}
void LLModel::embed(
const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb
) {
(void)texts;
(void)embeddings;
(void)prefix;
(void)dimensionality;
(void)tokenCount;
(void)doMean;
(void)atlas;
(void)cancelCb;
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}
void LLModel::embed(
const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
bool doMean, bool atlas
) {
(void)texts;
(void)embeddings;
(void)isRetrieval;
(void)dimensionality;
(void)tokenCount;
(void)doMean;
(void)atlas;
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}

View File

@ -0,0 +1,49 @@
#pragma once
#include <ggml.h>
#include <cstddef>
#include <cstdint>
#include <vector>
struct llm_buffer {
uint8_t * addr = NULL;
size_t size = 0;
void resize(size_t size) {
delete[] addr;
addr = new uint8_t[size];
this->size = size;
}
~llm_buffer() {
delete[] addr;
}
};
struct llm_kv_cache {
struct ggml_tensor * k;
struct ggml_tensor * v;
struct ggml_context * ctx = NULL;
llm_buffer buf;
int n; // number of tokens currently in the cache
~llm_kv_cache() {
if (ctx) {
ggml_free(ctx);
}
}
};
inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads)
{
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.addr;
}
ggml_graph_compute(graph, &plan);
}

View File

@ -1,298 +0,0 @@
#include "llmodel.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <optional>
#include <ranges>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>
namespace ranges = std::ranges;
namespace views = std::ranges::views;
void LLModel::prompt(
std::string_view prompt,
const PromptCallback &promptCallback,
const ResponseCallback &responseCallback,
const PromptContext &promptCtx
) {
if (!isModelLoaded())
throw std::invalid_argument("Attempted to prompt an unloaded model.");
if (!supportsCompletion())
throw std::invalid_argument("Not a text completion model.");
if (!promptCtx.n_batch)
throw std::invalid_argument("Batch size cannot be zero.");
if (!promptCtx.n_predict)
return; // nothing requested
auto embd_inp = tokenize(prompt);
if (embd_inp.empty())
throw std::invalid_argument("Prompt tokenized to zero tokens.");
if (auto res = decodePrompt(promptCallback, promptCtx, std::move(embd_inp)))
generateResponse(responseCallback, promptCtx, /*n_past*/ *res);
}
int32_t LLModel::countPromptTokens(std::string_view prompt) const
{
if (!isModelLoaded())
throw std::invalid_argument("Attempted to tokenize with an unloaded model.");
return int32_t(tokenize(prompt).size());
}
auto LLModel::decodePrompt(
const PromptCallback &promptCallback,
const PromptContext &promptCtx,
std::vector<Token> embd_inp
) -> std::optional<int32_t>
{
assert(!embd_inp.empty());
int32_t nCtx = contextLength();
int32_t n_batch = std::min(promptCtx.n_batch, LLMODEL_MAX_PROMPT_BATCH);
// Find the greatest n_past where the beginning of embd_inp matches the end of the token cache, starting at the
// requested n_past.
// This is used to skip unnecessary work when the prompt shares a common prefix with the previous result.
int32_t nPast = computeModelInputPosition(embd_inp);
// always decode up to a full batch before generating, even if cached
nPast -= std::min(n_batch, nPast);
// TODO(jared): generalize this to find the smallest new_embd_inp.size() - nPast given the cache
if (!nPast && int32_t(embd_inp.size()) > nCtx) {
// no cache hit -> shift the input before even processing
int32_t nKeep = shouldAddBOS();
auto newLength = int32_t(nCtx * (1.f - promptCtx.contextErase));
int32_t nDiscard = int32_t(embd_inp.size()) - std::max(1, std::min(nCtx, newLength));
// execute the callback even for skipped tokens. this misrepresents the position of BOS but we don't care
auto discardedTokens = embd_inp | views::drop(nKeep) | views::take(nDiscard);
if (!promptCallback(discardedTokens, true))
return std::nullopt;
// erase nDiscard tokens
embd_inp.erase(discardedTokens.begin(), discardedTokens.end());
assert(int32_t(embd_inp.size()) <= nCtx);
// check the cache again, just in case
nPast = computeModelInputPosition(embd_inp);
nPast -= std::min(n_batch, nPast);
}
setModelInputPosition(nPast);
// execute the callback even for skipped tokens
if (!promptCallback(embd_inp | views::take(nPast), true))
return std::nullopt;
// process the prompt in batches
for (int32_t i = nPast; i < embd_inp.size();) {
auto batch_end = std::min(i + n_batch, int32_t(embd_inp.size()));
std::span batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);
// Check if the context has run out...
if (nPast + int32_t(batch.size()) > nCtx) {
shiftContext(promptCtx, &nPast);
assert(nPast + int32_t(batch.size()) <= nCtx);
}
// FIXME(Adam): We should find a way to bubble these strings to the UI level to allow for translation
if (!evalTokens(nPast, batch))
throw std::runtime_error("An internal error was encountered during prompt processing.");
for (auto &tok : batch) {
appendInputToken(tok);
nPast++;
if (!promptCallback({ &tok, 1 }, false))
return std::nullopt;
}
i = batch_end;
}
return nPast;
}
/*
* If string s overlaps with the string key such that some prefix of the key is at the end
* of the string, return the position in s where the first match starts. Otherwise, return
* std::string::npos. Examples:
* s = "bfo", key = "foo" -> 1
* s = "fooa", key = "foo" -> npos
*/
static std::string::size_type stringsOverlap(const std::string &s, const std::string &key)
{
if (s.empty() || key.empty())
throw std::invalid_argument("arguments to stringsOverlap must not be empty");
for (int start = std::max(0, int(s.size()) - int(key.size())); start < s.size(); start++) {
if (s.compare(start, s.size(), key, 0, s.size() - start) == 0)
return start;
}
return std::string::npos;
}
void LLModel::generateResponse(
const ResponseCallback &responseCallback,
const PromptContext &promptCtx,
int32_t nPast
) {
static const char *stopSequences[] {
"### System", "### Instruction", "### Human", "### User", "### Response", "### Assistant", "### Context",
"<|im_start|>", "<|im_end|>", "<|endoftext|>",
};
initSampler(promptCtx);
std::string cachedResponse;
std::vector<Token> cachedTokens;
int n_predicted = 0;
// Predict next tokens
for (bool stop = false; !stop;) {
// Sample next token
std::optional<Token> new_tok = sampleToken();
std::string new_piece = tokenToString(new_tok.value());
cachedTokens.push_back(new_tok.value());
cachedResponse += new_piece;
auto accept = [this, &promptCtx, &new_tok, &nPast] {
// Shift context if out of space
if (nPast >= contextLength()) {
shiftContext(promptCtx, &nPast);
assert(nPast < contextLength());
}
// Accept the token
Token tok = std::exchange(new_tok, std::nullopt).value();
if (!evalTokens(nPast, { &tok, 1 }))
throw std::runtime_error("An internal error was encountered during response generation.");
appendInputToken(tok);
nPast++;
};
// Check for EOS
auto lengthLimit = std::string::npos;
for (const auto token : endTokens()) {
if (new_tok == token) {
stop = true;
lengthLimit = cachedResponse.size() - new_piece.size();
}
}
if (lengthLimit != std::string::npos) {
// EOS matched
} else if (!isSpecialToken(new_tok.value())) {
// Check if the response contains a stop sequence
for (const auto &p : stopSequences) {
auto match = cachedResponse.find(p);
if (match != std::string::npos) stop = true;
lengthLimit = std::min(lengthLimit, match);
if (match == 0) break;
}
// Check if the response matches the start of a stop sequence
if (lengthLimit == std::string::npos) {
for (const auto &p : stopSequences) {
auto match = stringsOverlap(cachedResponse, p);
lengthLimit = std::min(lengthLimit, match);
if (match == 0) break;
}
}
} else if (ranges::find(stopSequences, new_piece) < std::end(stopSequences)) {
// Special tokens must exactly match a stop sequence
stop = true;
lengthLimit = cachedResponse.size() - new_piece.size();
}
// Empty the cache, up to the length limit
std::string::size_type responseLength = 0;
while (!cachedTokens.empty()) {
Token tok = cachedTokens.front();
std::string piece = tokenToString(tok);
// Stop if the piece (or part of it) does not fit within the length limit
if (responseLength + (stop ? 1 : piece.size()) > lengthLimit)
break;
// Remove token from cache
assert(cachedResponse.starts_with(piece));
cachedTokens.erase(cachedTokens.begin(), cachedTokens.begin() + 1);
cachedResponse.erase(cachedResponse.begin(), cachedResponse.begin() + piece.size());
// Accept the token, if needed (not cached)
if (cachedTokens.empty() && new_tok)
accept();
// Send the token
if (!responseCallback(tok, piece) || ++n_predicted >= promptCtx.n_predict) {
stop = true;
break;
}
// FIXME(jared): we could avoid printing partial stop sequences if we didn't have to
// output token IDs and could cache a partial token for the next prompt call
responseLength += piece.size();
}
assert(cachedTokens.empty() == cachedResponse.empty());
// Accept the token, if needed (in cache)
if (new_tok) {
assert(!cachedTokens.empty() && cachedTokens.back() == new_tok);
if (stop) {
cachedTokens.pop_back();
} else {
accept();
}
}
}
if (inputLength() < cachedTokens.size()) {
/* This is theoretically possible if the longest stop sequence is greater than
* n_ctx * contextErase tokens. */
throw std::runtime_error("shifted too much context, can't go back");
}
#ifndef NDEBUG
auto inp = inputTokens();
auto discard_start = inp.end() - cachedTokens.size();
assert(std::equal(discard_start, inp.end(), cachedTokens.begin()));
#endif
}
void LLModel::embed(
const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb
) {
(void)texts;
(void)embeddings;
(void)prefix;
(void)dimensionality;
(void)tokenCount;
(void)doMean;
(void)atlas;
(void)cancelCb;
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}
void LLModel::embed(
const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
bool doMean, bool atlas
) {
(void)texts;
(void)embeddings;
(void)isRetrieval;
(void)dimensionality;
(void)tokenCount;
(void)doMean;
(void)atlas;
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}

View File

@ -1,17 +0,0 @@
#pragma once
#include <cassert>
#ifdef NDEBUG
# ifdef __has_builtin
# if __has_builtin(__builtin_unreachable)
# define UNREACHABLE() __builtin_unreachable()
# else
# define UNREACHABLE() do {} while (0)
# endif
# else
# define UNREACHABLE() do {} while (0)
# endif
#else
# define UNREACHABLE() assert(!"Unreachable statement was reached")
#endif

339
gpt4all-backend/utils.cpp Normal file
View File

@ -0,0 +1,339 @@
#include "utils.h"
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iterator>
#include <regex>
#include <utility>
void replace(std::string & str, const std::string & needle, const std::string & replacement)
{
size_t pos = 0;
while ((pos = str.find(needle, pos)) != std::string::npos) {
str.replace(pos, needle.length(), replacement);
pos += replacement.length();
}
}
std::map<std::string, int32_t> json_parse(const std::string & fname)
{
std::map<std::string, int32_t> result;
// read file into string
std::string json;
{
std::ifstream ifs(fname);
if (!ifs) {
fprintf(stderr, "Failed to open %s\n", fname.c_str());
exit(1);
}
json = std::string((std::istreambuf_iterator<char>(ifs)),
(std::istreambuf_iterator<char>()));
}
if (json[0] != '{') {
return result;
}
// parse json
{
bool has_key = false;
bool in_token = false;
std::string str_key = "";
std::string str_val = "";
int n = json.size();
for (int i = 1; i < n; ++i) {
if (!in_token) {
if (json[i] == ' ') continue;
if (json[i] == '"') {
in_token = true;
continue;
}
} else {
if (json[i] == '\\' && i+1 < n) {
if (has_key == false) {
str_key += json[i];
} else {
str_val += json[i];
}
++i;
} else if (json[i] == '"') {
if (has_key == false) {
has_key = true;
++i;
while (json[i] == ' ') ++i;
++i; // :
while (json[i] == ' ') ++i;
if (json[i] != '\"') {
while (json[i] != ',' && json[i] != '}') {
str_val += json[i++];
}
has_key = false;
} else {
in_token = true;
continue;
}
} else {
has_key = false;
}
::replace(str_key, "\\u0120", " " ); // \u0120 -> space
::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
::replace(str_key, "\\\"", "\""); // \\\" -> "
try {
result[str_key] = std::stoi(str_val);
} catch (...) {
//fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
}
str_key = "";
str_val = "";
in_token = false;
continue;
}
if (has_key == false) {
str_key += json[i];
} else {
str_val += json[i];
}
}
}
}
return result;
}
std::vector<gpt_vocab::id> gpt_tokenize_inner(const gpt_vocab & vocab, const std::string & text)
{
std::vector<std::string> words;
// first split the text into words
{
std::string str = text;
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
std::regex re(pat);
std::smatch m;
while (std::regex_search(str, m, re)) {
for (auto x : m) {
words.push_back(x);
}
str = m.suffix();
}
}
// find the longest tokens that form the words:
std::vector<gpt_vocab::id> tokens;
for (const auto & word : words) {
if (word.size() == 0) continue;
int i = 0;
int n = word.size();
while (i < n) {
int j = n;
while (j > i) {
auto it = vocab.token_to_id.find(word.substr(i, j-i));
if (it != vocab.token_to_id.end()) {
tokens.push_back(it->second);
i = j;
break;
}
--j;
}
if (i == n) {
break;
}
if (j == i) {
auto sub = word.substr(i, 1);
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
tokens.push_back(vocab.token_to_id.at(sub));
} else {
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
}
++i;
}
}
}
return tokens;
}
std::string regex_escape(const std::string &s)
{
static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
return std::regex_replace(s, metacharacters, "\\$&");
}
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text)
{
// Generate the subpattern from the special_tokens vector if it's not empty
if (!vocab.special_tokens.empty()) {
std::vector<gpt_vocab::id> out;
std::vector<std::string> chunks;
std::string str = text;
std::string special_tokens_subpattern;
for (const auto &token : vocab.special_tokens) {
if (!special_tokens_subpattern.empty()) {
special_tokens_subpattern += "|";
}
special_tokens_subpattern += regex_escape(token);
}
std::regex re(special_tokens_subpattern);
std::smatch m;
while (std::regex_search(str, m, re)) {
auto tok = vocab.token_to_id.find(m.str());
if (tok != vocab.token_to_id.end()) {
auto tokid = tok->second;
auto pfxtoks = gpt_tokenize_inner(vocab, m.prefix());
out.insert(out.end(), pfxtoks.begin(), pfxtoks.end());
out.push_back(tokid);
str = m.suffix();
}
}
if (!str.empty()) {
auto tokrest = gpt_tokenize_inner(vocab, str);
out.insert(out.end(), tokrest.begin(), tokrest.end());
}
return out;
} else {
return gpt_tokenize_inner(vocab, text);
}
}
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab)
{
printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
vocab.token_to_id = ::json_parse(fname);
for (const auto & kv : vocab.token_to_id) {
vocab.id_to_token[kv.second] = kv.first;
}
printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
// print the vocabulary
//for (auto kv : vocab.token_to_id) {
// printf("'%s' -> %d\n", kv.first.data(), kv.second);
//}
return true;
}
gpt_vocab::id gpt_sample_top_k_top_p(
const size_t actualVocabSize,
const int32_t * last_n_tokens_data,
int last_n_tokens_size,
const std::vector<float> logits,
int top_k,
double top_p,
double temp,
float repeat_penalty,
std::mt19937 & rng) {
int n_logits = actualVocabSize;
const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
const auto * plogits = logits.data();
if (temp <= 0) {
// select the token with the highest logit directly
float max_logit = plogits[0];
gpt_vocab::id max_id = 0;
for (int i = 1; i < n_logits; ++i) {
if (plogits[i] > max_logit) {
max_logit = plogits[i];
max_id = i;
}
}
return max_id;
}
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
logits_id.reserve(n_logits);
{
const float scale = 1.0f/temp;
for (int i = 0; i < n_logits; ++i) {
// repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
// credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
// if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
if (plogits[i] < 0.0f) {
logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
} else {
logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
}
} else {
logits_id.push_back(std::make_pair(plogits[i]*scale, i));
}
}
}
// find the top K tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
return a.first > b.first;
});
logits_id.resize(top_k);
double maxl = -INFINITY;
for (const auto & kv : logits_id) {
maxl = std::max(maxl, kv.first);
}
// compute probs for the top K tokens
std::vector<double> probs;
probs.reserve(logits_id.size());
double sum = 0.0;
for (const auto & kv : logits_id) {
double p = exp(kv.first - maxl);
probs.push_back(p);
sum += p;
}
// normalize the probs
for (auto & p : probs) {
p /= sum;
}
if (top_p < 1.0f) {
double cumsum = 0.0f;
for (int i = 0; i < top_k; i++) {
cumsum += probs[i];
if (cumsum >= top_p) {
top_k = i + 1;
probs.resize(top_k);
logits_id.resize(top_k);
break;
}
}
cumsum = 1.0/cumsum;
for (int i = 0; i < (int) probs.size(); i++) {
probs[i] *= cumsum;
}
}
//printf("\n");
//for (int i = 0; i < (int) probs.size(); i++) {
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
//}
//exit(0);
std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);
return logits_id[idx].second;
}

101
gpt4all-backend/utils.h Normal file
View File

@ -0,0 +1,101 @@
// Various helper functions and utilities
#pragma once
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <map>
#include <random>
#include <string>
#include <thread>
#include <vector>
//
// General purpose inline functions
//
constexpr inline unsigned long long operator ""_MiB(unsigned long long bytes)
{
return bytes*1024*1024;
}
//
// CLI argument parsing
//
struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_predict = 200; // new tokens to predict
// sampling parameters
int32_t top_k = 40;
float top_p = 0.9f;
float temp = 0.9f;
int32_t n_batch = 8; // batch size for prompt processing
std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
std::string prompt;
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
//
// Vocab utils
//
struct gpt_vocab {
using id = int32_t;
using token = std::string;
std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
std::vector<std::string> special_tokens;
void add_special_token(const std::string &token) {
special_tokens.push_back(token);
}
};
void replace(std::string & str, const std::string & needle, const std::string & replacement);
// poor-man's JSON parsing
std::map<std::string, int32_t> json_parse(const std::string & fname);
// split text into tokens
//
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
//
// Regex (Python):
// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
//
// Regex (C++):
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
// load the tokens from encoder.json
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
// sample next token given probabilities for each embedding
//
// - consider only the top K tokens
// - from them, consider only the top tokens with cumulative probability > P
//
// TODO: not sure if this implementation is correct
//
gpt_vocab::id gpt_sample_top_k_top_p(
const size_t actualVocabSize,
const int32_t * last_n_tokens_data,
int last_n_tokens_size,
const std::vector<float> logits,
int top_k,
double top_p,
double temp,
float repeat_penalty,
std::mt19937 & rng);

View File

@ -113,7 +113,10 @@ def _old_loop(gpt4all_instance):
full_response = gpt4all_instance.chat_completion( full_response = gpt4all_instance.chat_completion(
MESSAGES, MESSAGES,
# preferential kwargs for chat ux # preferential kwargs for chat ux
logits_size=0,
tokens_size=0,
n_past=0, n_past=0,
n_ctx=0,
n_predict=200, n_predict=200,
top_k=40, top_k=40,
top_p=0.9, top_p=0.9,

View File

@ -4,41 +4,6 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
## [Unreleased]
### Added
- Warn on Windows if the Microsoft Visual C++ runtime libraries are not found ([#2920](https://github.com/nomic-ai/gpt4all/pull/2920))
- Basic cache for faster prefill when the input shares a prefix with previous context ([#3073](https://github.com/nomic-ai/gpt4all/pull/3073))
- Add ability to modify or replace the history of an active chat session ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
### Changed
- Rebase llama.cpp on latest upstream as of September 26th ([#2998](https://github.com/nomic-ai/gpt4all/pull/2998))
- Change the error message when a message is too long ([#3004](https://github.com/nomic-ai/gpt4all/pull/3004))
- Fix CalledProcessError on Intel Macs since v2.8.0 ([#3045](https://github.com/nomic-ai/gpt4all/pull/3045))
- Use Jinja for chat templates instead of per-message QString.arg-style templates ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
## [2.8.2] - 2024-08-14
### Fixed
- Fixed incompatibility with Python 3.8 since v2.7.0 and Python <=3.11 since v2.8.1 ([#2871](https://github.com/nomic-ai/gpt4all/pull/2871))
## [2.8.1] - 2024-08-13
### Added
- Use greedy sampling when temperature is set to zero ([#2854](https://github.com/nomic-ai/gpt4all/pull/2854))
### Changed
- Search for pip-installed CUDA 11 as well as CUDA 12 ([#2802](https://github.com/nomic-ai/gpt4all/pull/2802))
- Stop shipping CUBINs to reduce wheel size ([#2802](https://github.com/nomic-ai/gpt4all/pull/2802))
- Use llama\_kv\_cache ops to shift context faster ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
- Don't stop generating at end of context ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
### Fixed
- Make reverse prompt detection work more reliably and prevent it from breaking output ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
- Explicitly target macOS 12.6 in CI to fix Metal compatibility on older macOS ([#2849](https://github.com/nomic-ai/gpt4all/pull/2849))
- Do not initialize Vulkan driver when only using CPU ([#2843](https://github.com/nomic-ai/gpt4all/pull/2843))
- Fix a segfault on exit when using CPU mode on Linux with NVIDIA and EGL ([#2843](https://github.com/nomic-ai/gpt4all/pull/2843))
## [2.8.0] - 2024-08-05 ## [2.8.0] - 2024-08-05
### Added ### Added
@ -51,7 +16,6 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- Detect use of a Python interpreter under Rosetta for a clearer error message ([#2793](https://github.com/nomic-ai/gpt4all/pull/2793)) - Detect use of a Python interpreter under Rosetta for a clearer error message ([#2793](https://github.com/nomic-ai/gpt4all/pull/2793))
### Changed ### Changed
- Build against CUDA 11.8 instead of CUDA 12 for better compatibility with older drivers ([#2639](https://github.com/nomic-ai/gpt4all/pull/2639))
- Update llama.cpp to commit 87e397d00 from July 19th ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694)) - Update llama.cpp to commit 87e397d00 from July 19th ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
### Removed ### Removed
@ -69,7 +33,4 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- Restore leading space removal logic that was incorrectly removed in [#2694](https://github.com/nomic-ai/gpt4all/pull/2694) - Restore leading space removal logic that was incorrectly removed in [#2694](https://github.com/nomic-ai/gpt4all/pull/2694)
- CUDA: Cherry-pick llama.cpp DMMV cols requirement fix that caused a crash with long conversations since [#2694](https://github.com/nomic-ai/gpt4all/pull/2694) - CUDA: Cherry-pick llama.cpp DMMV cols requirement fix that caused a crash with long conversations since [#2694](https://github.com/nomic-ai/gpt4all/pull/2694)
[Unreleased]: https://github.com/nomic-ai/gpt4all/compare/python-v2.8.2...HEAD
[2.8.2]: https://github.com/nomic-ai/gpt4all/compare/python-v2.8.1...python-v2.8.2
[2.8.1]: https://github.com/nomic-ai/gpt4all/compare/python-v2.8.0...python-v2.8.1
[2.8.0]: https://github.com/nomic-ai/gpt4all/compare/python-v2.7.0...python-v2.8.0 [2.8.0]: https://github.com/nomic-ai/gpt4all/compare/python-v2.7.0...python-v2.8.0

Binary file not shown. (Before: 30 KiB)

Binary file not shown. (Before: 66 KiB)

Binary file not shown. (Before: 272 KiB)

Binary file not shown. (Before: 448 KiB)

View File

@ -1,86 +0,0 @@
# GPT4All API Server
GPT4All provides a local API server that allows you to run LLMs over an HTTP API.
## Key Features
- **Local Execution**: Run models on your own hardware for privacy and offline use.
- **LocalDocs Integration**: Run the API with relevant text snippets provided to your LLM from a [LocalDocs collection](../gpt4all_desktop/localdocs.md).
- **OpenAI API Compatibility**: Use existing OpenAI-compatible clients and tools with your local models.
## Activating the API Server
1. Open the GPT4All Chat Desktop Application.
2. Go to `Settings` > `Application` and scroll down to `Advanced`.
3. Check the box for the `"Enable Local API Server"` setting.
4. The server listens on port 4891 by default. You can choose another port number in the `"API Server Port"` setting.
## Connecting to the API Server
The base URL used for the API server is `http://localhost:4891/v1` (or `http://localhost:<PORT_NUM>/v1` if you are using a different port number).
The server only accepts HTTP connections (not HTTPS) and only listens on the IPv4 localhost address (127.0.0.1), not on the IPv6 localhost address (`::1`).
## Examples
!!! note "Example GPT4All API calls"
=== "cURL"
```bash
curl -X POST http://localhost:4891/v1/chat/completions -d '{
"model": "Phi-3 Mini Instruct",
"messages": [{"role":"user","content":"Who is Lionel Messi?"}],
"max_tokens": 50,
"temperature": 0.28
}'
```
=== "PowerShell"
```powershell
Invoke-WebRequest -URI http://localhost:4891/v1/chat/completions -Method POST -ContentType application/json -Body '{
"model": "Phi-3 Mini Instruct",
"messages": [{"role":"user","content":"Who is Lionel Messi?"}],
"max_tokens": 50,
"temperature": 0.28
}'
```
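
Any OpenAI-compatible client can also be pointed at the local server. Below is a minimal sketch using the `openai` Python package (an assumption, not an officially documented example); the API key is a placeholder required by the client, and the model name assumes "Phi-3 Mini Instruct" is installed in GPT4All.

```python
# Minimal sketch: call the local GPT4All API server with the openai package.
# Assumes `pip install openai` and that "Phi-3 Mini Instruct" is downloaded in GPT4All.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:4891/v1", api_key="not-needed")  # key is a placeholder

response = client.chat.completions.create(
    model="Phi-3 Mini Instruct",
    messages=[{"role": "user", "content": "Who is Lionel Messi?"}],
    max_tokens=50,
    temperature=0.28,
)
print(response.choices[0].message.content)
```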
## API Endpoints
| Method | Path | Description |
|--------|------|-------------|
| GET | `/v1/models` | List available models |
| GET | `/v1/models/<name>` | Get details of a specific model |
| POST | `/v1/completions` | Generate text completions |
| POST | `/v1/chat/completions` | Generate chat completions |
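
For example, listing the available models is a plain GET request. The sketch below uses the `requests` package and assumes the server mirrors the OpenAI-style `data`/`id` response fields.

```python
# Sketch: list the models served by the local API server.
import requests

models = requests.get("http://localhost:4891/v1/models").json()
for m in models.get("data", []):  # `data`/`id` fields are assumed to follow the OpenAI schema
    print(m.get("id"))
```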
## LocalDocs Integration
You can use LocalDocs with the API server:
1. Open the Chats view in the GPT4All application.
2. Scroll to the bottom of the chat history sidebar.
3. Select the server chat (it has a different background color).
4. Activate LocalDocs collections in the right sidebar.
(Note: LocalDocs can currently only be activated through the GPT4All UI, not via the API itself).
Now, your API calls to your local LLM will have relevant references from your LocalDocs collection retrieved and placed in the input message for the LLM to respond to.
The references retrieved for your API call can be accessed in the API response object at
`response["choices"][0]["references"]`
The data included in the `references` are:
- `text`: the actual text content from the snippet that was extracted from the reference document
- `author`: the author of the reference document (if available)
- `date`: the date of creation of the reference document (if available)
- `page`: the page number the snippet is from (only available for PDF documents for now)
- `title`: the title of the reference document (if available)
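
As a rough illustration (a sketch, not an official example), the references can be read from Python with the `requests` package. The prompt text is made up, and it assumes a LocalDocs collection is already active on the server chat.

```python
# Sketch: read LocalDocs references returned alongside a chat completion.
import requests

resp = requests.post(
    "http://localhost:4891/v1/chat/completions",
    json={
        "model": "Phi-3 Mini Instruct",
        "messages": [{"role": "user", "content": "What do my documents say about onboarding?"}],
        "max_tokens": 100,
    },
).json()

for ref in resp["choices"][0].get("references", []):
    # author, date, page, and title may be missing depending on the source document
    print(ref.get("title"), ref.get("page"), ref["text"][:80])
```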

View File

@ -1,206 +0,0 @@
## What are chat templates?
Natively, large language models only know how to complete plain text and do not know the difference between their input and their output. In order to support a chat with a person, LLMs are designed to use a template to convert the conversation to plain text using a specific format.
For a given model, it is important to use an appropriate chat template, as each model is designed to work best with a specific format. The chat templates included with the built-in models should be sufficient for most purposes.
There are two reasons you would want to alter the chat template:
- You are sideloading a model and there is no chat template available,
- You would like to have greater control over the input to the LLM than a system message provides.
## What is a system message?
A system message is a message that controls the responses from the LLM in a way that affects the entire conversation. System messages can be short, such as "Speak like a pirate.", or they can be long and contain a lot of context for the LLM to keep in mind.
Not all models are designed to use a system message, so system messages work better with some models than with others.
## How do I customize the chat template or system message?
To customize the chat template or system message, go to Settings > Model. Make sure to select the correct model at the top. If you clone a model, you can use a different chat template or system message from the base model, enabling you to use different settings for each conversation.
These settings take effect immediately. After changing them, you can click "Redo last response" in the chat view, and the response will take the new settings into account.
## Do I need to write a chat template?
You typically do not need to write your own chat template. The exception is models that are not in the official model list and do not come with a chat template built-in. These will show a "Clear" option above the chat template field in the Model Settings page instead of a "Reset" option. See the section on [finding] or [creating] a chat template.
[finding]: #how-do-i-find-a-chat-template
[creating]: #advanced-how-do-chat-templates-work
## What changed in GPT4All v3.5?
GPT4All v3.5 overhauled the chat template system. There are three crucial differences:
- The chat template now formats an entire conversation instead of a single pair of messages,
- The chat template now uses Jinja syntax instead of `%1` and `%2` placeholders,
- And the system message should no longer contain control tokens or trailing whitespace.
If any of your chat templates or system messages were added or altered from the defaults before upgrading to GPT4All v3.5 or newer, they will no longer work. See below for how to solve common errors you may see after upgrading.
## Error/Warning: System message is not plain text.
This is easy to fix. Go to the model's settings and look at the system prompt. There are three things to look for:
- Control tokens such as `<|im_start|>`, `<|start_header_id|>`, or `<|system|>`
- A prefix such as `### System` or `SYSTEM:`
- Trailing whitespace, such as a space character or blank line.
If you see any of these things, remove them. For example, this legacy system prompt:
```
<|start_header_id|>system<|end_header_id|>
You are a helpful assistant.<|eot_id|>
```
Should become this:
```
You are a helpful assistant.
```
If you do not see anything that needs to be changed, you can dismiss the error by making a minor modification to the message and then changing it back.
If you see a warning, your system message does not appear to be plain text. If you believe this warning is incorrect, it can be safely ignored. If in doubt, ask on the [Discord].
[Discord]: https://discord.gg/mGZE39AS3e
## Error: Legacy system prompt needs to be updated in Settings.
This is the same as [above][above-1], but appears on the chat page.
[above-1]: #errorwarning-system-message-is-not-plain-text
## Error/Warning: Chat template is not in Jinja format.
This is the result of attempting to use an old-style template (possibly from a previous version) in GPT4All 3.5+.
Go to the Model Settings page and select the affected model. If you see a "Reset" button, and you have not intentionally modified the prompt template, you can click "Reset". Otherwise, this is what you can do:
1. Back up your chat template by copying it safely to a text file and saving it. In the next step, it will be removed from GPT4All.
2. Click "Reset" or "Clear".
3. If you clicked "Clear", the chat template is now gone. Follow the steps to [find][finding] or [create][creating] a basic chat template for your model.
4. Customize the chat template to suit your needs. For help, read the section about [creating] a chat template.
## Error: Legacy prompt template needs to be updated in Settings.
This is the same as [above][above-2], but appears on the chat page.
[above-2]: #errorwarning-chat-template-is-not-in-jinja-format
## The chat template has a syntax error.
If there is a syntax error while editing the chat template, the details will be displayed in an error message above the input box. This could be because the chat template is not actually in Jinja format (see [above][above-2]).
Otherwise, you have either typed something incorrectly, or the model comes with a template that is incompatible with GPT4All. See [the below section][creating] on creating chat templates and make sure that everything is correct. When in doubt, ask on the [Discord].
## Error: No chat template configured.
This may appear for models that are not from the official model list and do not include a chat template. Older versions of GPT4All picked a poor default in this case. You will get much better results if you follow the steps to [find][finding] or [create][creating] a chat template for your model.
## Error: The chat template cannot be blank.
If the button above the chat template on the Model Settings page says "Clear", see [above][above-3]. If you see "Reset", click that button to restore a reasonable default. Also see the section on [syntax errors][chat-syntax-error].
[above-3]: #error-no-chat-template-configured
[chat-syntax-error]: #the-chat-template-has-a-syntax-error
## How do I find a chat template?
When in doubt, you can always ask the [Discord] community for help. Below are the instructions to find one on your own.
The authoritative source for a model's chat template is the HuggingFace repo that the original (non-GGUF) model came from. First, you should find this page. If you just have a model file, you can try a Google search for the model's name. If you know the page you downloaded the GGUF model from, its README usually links to the original non-GGUF model.
Once you have located the original model, there are two methods you can use to extract its chat template. Pick whichever one you are most comfortable with.
### Using the CLI (all models)
1. Install `jq` using your preferred package manager - e.g. Chocolatey (Windows), Homebrew (macOS), or apt (Ubuntu).
2. Download `tokenizer_config.json` from the model's "Files and versions" tab.
3. Open a command prompt in the directory to which you downloaded `tokenizer_config.json`.
4. Run `jq -r ".chat_template" tokenizer_config.json`. This shows the chat template in a human-readable form. You can copy this and paste it into the settings page.
5. (Optional) You can save the output to a text file like this: `jq -r ".chat_template" tokenizer_config.json >chat_template.txt`
If the output is "null", the model does not provide a chat template. See the [below instructions][creating] on creating a chat template.
### Python (open models)
1. Install `transformers` using your preferred Python package manager, e.g. `pip install transformers`. Make sure it is at least version 4.43.0.
2. Copy the ID of the HuggingFace model, using the clipboard icon next to the name. For example, if the URL is `https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B`, the ID is `NousResearch/Hermes-2-Pro-Llama-3-8B`.
3. Open a Python interpreter (`python`) and run the following commands. Change the model ID in the example to the one you copied.
```
>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained('NousResearch/Hermes-2-Pro-Llama-3-8B')
>>> print(tokenizer.get_chat_template())
```
You can copy the output and paste it into the settings page.
4. (Optional) You can save the output to a text file like this:
```
>>> open('chat_template.txt', 'w').write(tokenizer.get_chat_template())
```
If you get a ValueError exception, this model does not provide a chat template. See the [below instructions][creating] on creating a chat template.
### Python (gated models)
Some models, such as Llama and Mistral, do not allow public access to their chat template. You must either use the CLI method above or follow these instructions to use Python:
1. For these steps, you must have git and git-lfs installed.
2. You must have a HuggingFace account and be logged in.
3. You must already have access to the gated model. Otherwise, request access.
4. You must have an SSH key configured for git access to HuggingFace.
5. `git clone` the model's HuggingFace repo using the SSH clone URL. There is no need to download the entire model, which is very large. A good way to do this on Linux is:
```console
$ GIT_LFS_SKIP_SMUDGE=1 git clone git@hf.co:meta-llama/Llama-3.1-8B-Instruct.git
$ cd Llama-3.1-8B-Instruct
$ git lfs pull -I "tokenizer.*"
```
6. Follow the above instructions for open models, but replace the model ID with the path to the directory containing `tokenizer_config.json`:
```
>>> tokenizer = AutoTokenizer.from_pretrained('.')
```
## Advanced: How do chat templates work?
The chat template is applied to the entire conversation you see in the chat window. The template loops over the list of messages, each containing `role` and `content` fields. `role` is either `user`, `assistant`, or `system`.
GPT4All also supports the special variables `bos_token`, `eos_token`, and `add_generation_prompt`. See the [HuggingFace docs] for what those do.
[HuggingFace docs]: https://huggingface.co/docs/transformers/v4.46.3/en/chat_templating#special-variables
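
To make this concrete, here is a minimal sketch in Python using the `jinja2` package: a hypothetical ChatML-style template is rendered over a short message list, with placeholder values for the special variables. The template and token strings are illustrative only and do not belong to any particular model.

```python
from jinja2.sandbox import ImmutableSandboxedEnvironment

# A hypothetical ChatML-style template: loop over the messages, wrap each one in
# role markers, and optionally open an assistant turn at the end.
CHATML_TEMPLATE = r"""
{%- for message in messages %}
{{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>\n' }}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- endif %}
"""

env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
template = env.from_string(CHATML_TEMPLATE)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user",   "content": "What is the capital of France?"},
]

# bos_token and eos_token are placeholders; a real model defines its own values.
print(template.render(messages=messages, add_generation_prompt=True,
                      bos_token="<s>", eos_token="</s>"))
```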
## Advanced: How do I make a chat template?
The best way to create a chat template is to start by using an existing one as a reference. Then, modify it to use the format documented for the given model. Its README page may explicitly give an example of its template, or it may mention the name of a well-known standard template such as ChatML, Alpaca, or Vicuna. GPT4All does not yet include presets for these templates, so they will have to be found in other models or taken from the community.
For more information, see the very helpful [HuggingFace guide]. Some of it is not applicable, such as the information about tool calling and RAG; GPT4All implements those features differently.
Some models use a prompt template that does not intuitively map to a multi-turn chat, because it is more intended for single instructions. The [FastChat] implementation of these templates is a useful reference for the correct way to extend them to multiple messages.
[HuggingFace guide]: https://huggingface.co/docs/transformers/v4.46.3/en/chat_templating#advanced-template-writing-tips
[FastChat]: https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
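
If the original model is openly accessible, one informal way to check a hand-written template is to render the model's own template with `transformers` for a short conversation and compare the output with what your template produces. This is only a sketch; it reuses the Hermes model ID from the earlier example as a stand-in.

```python
from transformers import AutoTokenizer

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Hello!"},
]

tokenizer = AutoTokenizer.from_pretrained("NousResearch/Hermes-2-Pro-Llama-3-8B")

# Render the model's built-in template as text, with the generation prompt appended,
# and compare it by eye with the output of your hand-written template.
reference = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
print(reference)
```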
## Advanced: What are GPT4All v1 templates?
GPT4All supports its own template syntax, which is nonstandard but provides complete control over the way LocalDocs sources and file attachments are inserted into the conversation. These templates begin with `{# gpt4all v1 #}` and look similar to the example below.
For standard templates, GPT4All combines the user message, sources, and attachments into the `content` field. For GPT4All v1 templates, this is not done, so they must be used directly in the template for those features to work correctly.
```jinja
{# gpt4all v1 #}
{%- for message in messages %}
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}
{%- if message['role'] == 'user' %}
{%- for source in message['sources'] %}
{%- if loop.first %}
{{- '### Context:\n' }}
{%- endif %}
{{- 'Collection: ' + source['collection'] + '\n' +
'Path: ' + source['path'] + '\n' +
'Excerpt: ' + source['text'] + '\n\n' }}
{%- endfor %}
{%- endif %}
{%- for attachment in message['prompt_attachments'] %}
{{- attachment['processed_content'] + '\n\n' }}
{%- endfor %}
{{- message['content'] | trim }}
{{- '<|eot_id|>' }}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}
```

View File

@ -1,85 +0,0 @@
# Using GPT4All to Privately Chat with your Microsoft Excel Spreadsheets
Local and Private AI Chat with your Microsoft Excel Spreadsheets
Microsoft Excel allows you to create, manage, and analyze data in spreadsheet format. By attaching your spreadsheets directly to GPT4All, you can privately chat with the AI to query and explore the data, enabling you to summarize, generate reports, and glean insights from your files—all within your conversation.
<div style="position: relative; padding-bottom: 56.25%; height: 0; overflow: hidden;">
<iframe src="../../assets/gpt4all_xlsx_attachment.mp4" style="position: absolute; top: 0; left: 0; width: 100%; height: 100%; border:0;" allowfullscreen title="YouTube Video"></iframe>
</div>
## Attach Microsoft Excel to your GPT4All Conversation
!!! note "Attach Microsoft Excel to your GPT4All Conversation"
1. **Install and Open GPT4All**:
- Go to [nomic.ai/gpt4all](https://nomic.ai/gpt4all) to install GPT4All for your operating system.
- Navigate to the Chats view within GPT4All.
<table>
<tr>
<td>
<!-- Screenshot of Chat view -->
<img width="1348" alt="Chat view" src="../../assets/chat_window.png">
</td>
</tr>
</table>
2. **Example Spreadsheet**:
<table>
<tr>
<td>
<!-- Screenshot of Spreadsheet view -->
<img width="1348" alt="Spreadsheet view" src="../../assets/disney_spreadsheet.png">
</td>
</tr>
</table>
3. **Attach to your GPT4All conversation**
<table>
<tr>
<td>
<!-- Screenshot of Attach view -->
<img width="1348" alt="Attach view" src="../../assets/attach_spreadsheet.png">
</td>
</tr>
</table>
4. **Have GPT4All Summarize and Generate a Report**
<table>
<tr>
<td>
<!-- Screenshot of Attach view -->
<img width="1348" alt="Attach view" src="../../assets/spreadsheet_chat.png">
</td>
</tr>
</table>
## How It Works
GPT4All parses your attached Excel spreadsheet into Markdown, a format understandable to LLMs, and adds the Markdown text to the context for your LLM chat. You can view the code that converts `.xlsx` to Markdown [here](https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-chat/src/xlsxtomd.cpp) in the GPT4All GitHub repo.
For example, the above spreadsheet titled `disney_income_stmt.xlsx` would be formatted the following way:
```markdown
## disney_income_stmt
|Walt Disney Co.|||||||
|---|---|---|---|---|---|---|
|Consolidated Income Statement|||||||
||||||||
|US$ in millions|||||||
|12 months ended:|2023-09-30 00:00:00|2022-10-01 00:00:00|2021-10-02 00:00:00|2020-10-03 00:00:00|2019-09-28 00:00:00|2018-09-29 00:00:00|
|Services|79562|74200|61768|59265|60542|50869|
...
...
...
```
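
The conversion in GPT4All itself is implemented in C++ (linked above), but the idea is straightforward to sketch in Python. The snippet below is illustrative only; it assumes the `openpyxl` package and a local `disney_income_stmt.xlsx`, and turns the first worksheet into a Markdown table in the same spirit.

```python
from openpyxl import load_workbook

def sheet_to_markdown(path: str) -> str:
    # Read cell values only (data_only=True resolves formulas to cached results).
    ws = load_workbook(path, read_only=True, data_only=True).worksheets[0]
    lines = [f"## {ws.title}", ""]
    for i, row in enumerate(ws.iter_rows(values_only=True)):
        cells = ["" if v is None else str(v) for v in row]
        lines.append("|" + "|".join(cells) + "|")
        if i == 0:
            # Markdown tables need a separator row after the header.
            lines.append("|" + "|".join("---" for _ in cells) + "|")
    return "\n".join(lines)

print(sheet_to_markdown("disney_income_stmt.xlsx"))
```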
## Limitations
It is important to double-check the claims LLMs make about the spreadsheets you provide. LLMs can make mistakes about the data they are presented with, particularly smaller models (~8B parameters) that fit within the memory of consumer hardware.

View File

@ -4,8 +4,6 @@ The GPT4All Desktop Application allows you to download and run large language mo
With GPT4All, you can chat with models, turn your local files into information sources for models [(LocalDocs)](localdocs.md), or browse models available online to download onto your device. With GPT4All, you can chat with models, turn your local files into information sources for models [(LocalDocs)](localdocs.md), or browse models available online to download onto your device.
[Official Video Tutorial](https://www.youtube.com/watch?v=gQcZDXRVJok)
## Quickstart ## Quickstart
!!! note "Quickstart" !!! note "Quickstart"

View File

@ -8,11 +8,10 @@
| --- | --- | --- | | --- | --- | --- |
| **Theme** | Color theme for the application. Options are `Light`, `Dark`, and `LegacyDark` | `Light` | | **Theme** | Color theme for the application. Options are `Light`, `Dark`, and `LegacyDark` | `Light` |
| **Font Size** | Font size setting for text throughout the application. Options are Small, Medium, and Large | Small | | **Font Size** | Font size setting for text throughout the application. Options are Small, Medium, and Large | Small |
| **Language and Locale** | The language and locale of that language you wish to use | System Locale |
| **Device** | Device that will run your models. Options are `Auto` (GPT4All chooses), `Metal` (Apple Silicon M1+), `CPU`, and `GPU` | `Auto` | | **Device** | Device that will run your models. Options are `Auto` (GPT4All chooses), `Metal` (Apple Silicon M1+), `CPU`, and `GPU` | `Auto` |
| **Default Model** | Choose your preferred LLM to load by default on startup| Auto | | **Default Model** | Choose your preferred LLM to load by default on startup| Auto |
| **Suggestion Mode** | Generate suggested follow up questions at the end of responses | When chatting with LocalDocs |
| **Download Path** | Select a destination on your device to save downloaded models | Windows: `C:\Users\{username}\AppData\Local\nomic.ai\GPT4All`<br><br>Mac: `/Users/{username}/Library/Application Support/nomic.ai/GPT4All/`<br><br>Linux: `/home/{username}/.local/share/nomic.ai/GPT4All` | | **Download Path** | Select a destination on your device to save downloaded models | Windows: `C:\Users\{username}\AppData\Local\nomic.ai\GPT4All`<br><br>Mac: `/Users/{username}/Library/Application Support/nomic.ai/GPT4All/`<br><br>Linux: `/home/{username}/.local/share/nomic.ai/GPT4All` |
| **Enable Datalake** | Opt-in to sharing interactions with GPT4All community (**anonymous** and **optional**) | Off | | **Enable Datalake** | Opt-in to sharing interactions with GPT4All community (**anonymous** and **optional**) | Off |
!!! note "Advanced Application Settings" !!! note "Advanced Application Settings"
@ -20,7 +19,7 @@
| Setting | Description | Default Value | | Setting | Description | Default Value |
| --- | --- | --- | | --- | --- | --- |
| **CPU Threads** | Number of concurrently running CPU threads (more can speed up responses) | 4 | | **CPU Threads** | Number of concurrently running CPU threads (more can speed up responses) | 4 |
| **Enable System Tray** | The application will minimize to the system tray / taskbar when the window is closed | Off | | **Save Chat Context** | Save chat context to disk to pick up exactly where a model left off. | Off |
| **Enable Local Server** | Allow any application on your device to use GPT4All via an OpenAI-compatible GPT4All API | Off | | **Enable Local Server** | Allow any application on your device to use GPT4All via an OpenAI-compatible GPT4All API | Off |
| **API Server Port** | Local HTTP port for the local API server | 4891 | | **API Server Port** | Local HTTP port for the local API server | 4891 |
@ -31,11 +30,8 @@
| Setting | Description | Default Value | | Setting | Description | Default Value |
| --- | --- | --- | | --- | --- | --- |
| **Name** | Unique name of this model / character| set by model uploader | | **Name** | Unique name of this model / character| set by model uploader |
| **Model File** | Filename (.gguf) of the model | set by model uploader | | **System Prompt** | General instructions for the chats this model will be used for | set by model uploader |
| **System Message** | General instructions for the chats this model will be used for | set by model uploader | | **Prompt Template** | Format of user <-> assistant interactions for the chats this model will be used for | set by model uploader |
| **Chat Template** | Format of user <-> assistant interactions for the chats this model will be used for | set by model uploader |
| **Chat Name Prompt** | Prompt used to automatically generate chat names | Describe the above conversation in seven words or less. |
| **Suggested FollowUp Prompt** | Prompt used to automatically generate follow up questions after a chat response | Suggest three very short factual follow-up questions that have not been answered yet or cannot be found inspired by the previous conversation and excerpts. |
### Clone ### Clone

View File

@ -4,7 +4,7 @@
It is possible you are trying to load a model from HuggingFace whose weights are not compatible with our [backend](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings). It is possible you are trying to load a model from HuggingFace whose weights are not compatible with our [backend](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings).
Try downloading one of the officially supported models listed on the main models page in the application. If the problem persists, please share your experience on our [Discord](https://discord.com/channels/1076964370942267462). Try downloading one of the officially supported models mentioned our [website](https://gpt4all.io/). If the problem persists, please share your experience on our [Discord](https://discord.com/channels/1076964370942267462).
## Bad Responses ## Bad Responses

View File

@ -3,13 +3,14 @@ from __future__ import annotations
import ctypes import ctypes
import os import os
import platform import platform
import re
import subprocess import subprocess
import sys import sys
import textwrap import textwrap
import threading import threading
from enum import Enum from enum import Enum
from queue import Queue from queue import Queue
from typing import TYPE_CHECKING, Any, Callable, Generic, Iterable, Iterator, Literal, NoReturn, TypeVar, overload from typing import TYPE_CHECKING, Any, Callable, Generic, Iterable, Literal, NoReturn, TypeVar, overload
if sys.version_info >= (3, 9): if sys.version_info >= (3, 9):
import importlib.resources as importlib_resources import importlib.resources as importlib_resources
@ -23,75 +24,40 @@ else:
from typing import TypedDict from typing import TypedDict
if TYPE_CHECKING: if TYPE_CHECKING:
from typing_extensions import ParamSpec, TypeAlias from typing_extensions import TypeAlias
T = TypeVar("T")
P = ParamSpec("P")
EmbeddingsType = TypeVar('EmbeddingsType', bound='list[Any]') EmbeddingsType = TypeVar('EmbeddingsType', bound='list[Any]')
cuda_found: bool = False
# TODO(jared): use operator.call after we drop python 3.10 support
def _operator_call(obj: Callable[P, T], /, *args: P.args, **kwargs: P.kwargs) -> T:
return obj(*args, **kwargs)
# Detect Rosetta 2 # Detect Rosetta 2
@_operator_call if platform.system() == "Darwin" and platform.processor() == "i386":
def check_rosetta() -> None: if subprocess.run(
if platform.system() == "Darwin" and platform.processor() == "i386": "sysctl -n sysctl.proc_translated".split(), check=True, capture_output=True, text=True,
p = subprocess.run("sysctl -n sysctl.proc_translated".split(), capture_output=True, text=True) ).stdout.strip() == "1":
if p.returncode == 0 and p.stdout.strip() == "1": raise RuntimeError(textwrap.dedent("""\
raise RuntimeError(textwrap.dedent("""\ Running GPT4All under Rosetta is not supported due to CPU feature requirements.
Running GPT4All under Rosetta is not supported due to CPU feature requirements. Please install GPT4All in an environment that uses a native ARM64 Python interpreter.
Please install GPT4All in an environment that uses a native ARM64 Python interpreter. """))
""").strip())
# Find CUDA libraries from the official packages
# Check for C++ runtime libraries cuda_found = False
if platform.system() == "Windows": if platform.system() in ('Linux', 'Windows'):
try: try:
ctypes.CDLL("msvcp140.dll") from nvidia import cuda_runtime, cublas
ctypes.CDLL("vcruntime140.dll") except ImportError:
ctypes.CDLL("vcruntime140_1.dll") pass # CUDA is optional
except OSError as e: else:
print(textwrap.dedent(f"""\ if platform.system() == 'Linux':
{e!r} cudalib = 'lib/libcudart.so.12'
The Microsoft Visual C++ runtime libraries were not found. Please install them from cublaslib = 'lib/libcublas.so.12'
https://aka.ms/vs/17/release/vc_redist.x64.exe
"""), file=sys.stderr)
@_operator_call
def find_cuda() -> None:
global cuda_found
def _load_cuda(rtver: str, blasver: str) -> None:
if platform.system() == "Linux":
cudalib = f"lib/libcudart.so.{rtver}"
cublaslib = f"lib/libcublas.so.{blasver}"
else: # Windows else: # Windows
cudalib = fr"bin\cudart64_{rtver.replace('.', '')}.dll" cudalib = r'bin\cudart64_12.dll'
cublaslib = fr"bin\cublas64_{blasver}.dll" cublaslib = r'bin\cublas64_12.dll'
# preload the CUDA libs so the backend can find them # preload the CUDA libs so the backend can find them
ctypes.CDLL(os.path.join(cuda_runtime.__path__[0], cudalib), mode=ctypes.RTLD_GLOBAL) ctypes.CDLL(os.path.join(cuda_runtime.__path__[0], cudalib), mode=ctypes.RTLD_GLOBAL)
ctypes.CDLL(os.path.join(cublas.__path__[0], cublaslib), mode=ctypes.RTLD_GLOBAL) ctypes.CDLL(os.path.join(cublas.__path__[0], cublaslib), mode=ctypes.RTLD_GLOBAL)
cuda_found = True
# Find CUDA libraries from the official packages
if platform.system() in ("Linux", "Windows"):
try:
from nvidia import cuda_runtime, cublas
except ImportError:
pass # CUDA is optional
else:
for rtver, blasver in [("12", "12"), ("11.0", "11")]:
try:
_load_cuda(rtver, blasver)
cuda_found = True
except OSError: # dlopen() does not give specific error codes
pass # try the next one
# TODO: provide a config file to make this more robust # TODO: provide a config file to make this more robust
@ -118,18 +84,21 @@ llmodel = load_llmodel_library()
class LLModelPromptContext(ctypes.Structure): class LLModelPromptContext(ctypes.Structure):
_fields_ = [ _fields_ = [
("n_predict", ctypes.c_int32), ("tokens", ctypes.POINTER(ctypes.c_int32)),
("top_k", ctypes.c_int32), ("tokens_size", ctypes.c_size_t),
("top_p", ctypes.c_float), ("n_past", ctypes.c_int32),
("min_p", ctypes.c_float), ("n_ctx", ctypes.c_int32),
("temp", ctypes.c_float), ("n_predict", ctypes.c_int32),
("n_batch", ctypes.c_int32), ("top_k", ctypes.c_int32),
("top_p", ctypes.c_float),
("min_p", ctypes.c_float),
("temp", ctypes.c_float),
("n_batch", ctypes.c_int32),
("repeat_penalty", ctypes.c_float), ("repeat_penalty", ctypes.c_float),
("repeat_last_n", ctypes.c_int32), ("repeat_last_n", ctypes.c_int32),
("context_erase", ctypes.c_float), ("context_erase", ctypes.c_float),
] ]
class LLModelGPUDevice(ctypes.Structure): class LLModelGPUDevice(ctypes.Structure):
_fields_ = [ _fields_ = [
("backend", ctypes.c_char_p), ("backend", ctypes.c_char_p),
@ -140,7 +109,6 @@ class LLModelGPUDevice(ctypes.Structure):
("vendor", ctypes.c_char_p), ("vendor", ctypes.c_char_p),
] ]
# Define C function signatures using ctypes # Define C function signatures using ctypes
llmodel.llmodel_model_create.argtypes = [ctypes.c_char_p] llmodel.llmodel_model_create.argtypes = [ctypes.c_char_p]
llmodel.llmodel_model_create.restype = ctypes.c_void_p llmodel.llmodel_model_create.restype = ctypes.c_void_p
@ -158,21 +126,24 @@ llmodel.llmodel_required_mem.restype = ctypes.c_size_t
llmodel.llmodel_isModelLoaded.argtypes = [ctypes.c_void_p] llmodel.llmodel_isModelLoaded.argtypes = [ctypes.c_void_p]
llmodel.llmodel_isModelLoaded.restype = ctypes.c_bool llmodel.llmodel_isModelLoaded.restype = ctypes.c_bool
PromptCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_int32), ctypes.c_size_t, ctypes.c_bool) PromptCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32)
ResponseCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32, ctypes.c_char_p) ResponseCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_int32, ctypes.c_char_p)
EmbCancelCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_uint), ctypes.c_uint, ctypes.c_char_p) RecalculateCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.c_bool)
SpecialTokenCallback = ctypes.CFUNCTYPE(None, ctypes.c_char_p, ctypes.c_char_p) EmbCancelCallback = ctypes.CFUNCTYPE(ctypes.c_bool, ctypes.POINTER(ctypes.c_uint), ctypes.c_uint, ctypes.c_char_p)
llmodel.llmodel_prompt.argtypes = [ llmodel.llmodel_prompt.argtypes = [
ctypes.c_void_p, ctypes.c_void_p,
ctypes.c_char_p, ctypes.c_char_p,
ctypes.c_char_p,
PromptCallback, PromptCallback,
ResponseCallback, ResponseCallback,
RecalculateCallback,
ctypes.POINTER(LLModelPromptContext), ctypes.POINTER(LLModelPromptContext),
ctypes.POINTER(ctypes.c_char_p), ctypes.c_bool,
ctypes.c_char_p,
] ]
llmodel.llmodel_prompt.restype = ctypes.c_bool llmodel.llmodel_prompt.restype = None
llmodel.llmodel_embed.argtypes = [ llmodel.llmodel_embed.argtypes = [
ctypes.c_void_p, ctypes.c_void_p,
@ -221,12 +192,6 @@ llmodel.llmodel_model_backend_name.restype = ctypes.c_char_p
llmodel.llmodel_model_gpu_device_name.argtypes = [ctypes.c_void_p] llmodel.llmodel_model_gpu_device_name.argtypes = [ctypes.c_void_p]
llmodel.llmodel_model_gpu_device_name.restype = ctypes.c_char_p llmodel.llmodel_model_gpu_device_name.restype = ctypes.c_char_p
llmodel.llmodel_count_prompt_tokens.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char_p)]
llmodel.llmodel_count_prompt_tokens.restype = ctypes.c_int32
llmodel.llmodel_model_foreach_special_token.argtypes = [ctypes.c_void_p, SpecialTokenCallback]
llmodel.llmodel_model_foreach_special_token.restype = None
ResponseCallbackType = Callable[[int, str], bool] ResponseCallbackType = Callable[[int, str], bool]
RawResponseCallbackType = Callable[[int, bytes], bool] RawResponseCallbackType = Callable[[int, bytes], bool]
EmbCancelCallbackType: TypeAlias = 'Callable[[list[int], str], bool]' EmbCancelCallbackType: TypeAlias = 'Callable[[list[int], str], bool]'
@ -271,6 +236,7 @@ class LLModel:
self.model_path = model_path.encode() self.model_path = model_path.encode()
self.n_ctx = n_ctx self.n_ctx = n_ctx
self.ngl = ngl self.ngl = ngl
self.context: LLModelPromptContext | None = None
self.buffer = bytearray() self.buffer = bytearray()
self.buff_expecting_cont_bytes: int = 0 self.buff_expecting_cont_bytes: int = 0
@ -290,10 +256,6 @@ class LLModel:
raise RuntimeError(f"Unable to instantiate model: {errmsg}") raise RuntimeError(f"Unable to instantiate model: {errmsg}")
self.model: ctypes.c_void_p | None = model self.model: ctypes.c_void_p | None = model
self.special_tokens_map: dict[str, str] = {}
llmodel.llmodel_model_foreach_special_token(
self.model, lambda n, t: self.special_tokens_map.__setitem__(n.decode(), t.decode()),
)
def __del__(self, llmodel=llmodel): def __del__(self, llmodel=llmodel):
if hasattr(self, 'model'): if hasattr(self, 'model'):
@ -320,19 +282,6 @@ class LLModel:
dev = llmodel.llmodel_model_gpu_device_name(self.model) dev = llmodel.llmodel_model_gpu_device_name(self.model)
return None if dev is None else dev.decode() return None if dev is None else dev.decode()
def count_prompt_tokens(self, prompt: str) -> int:
if self.model is None:
self._raise_closed()
err = ctypes.c_char_p()
n_tok = llmodel.llmodel_count_prompt_tokens(self.model, prompt, ctypes.byref(err))
if n_tok < 0:
s = err.value
errmsg = 'null' if s is None else s.decode()
raise RuntimeError(f'Unable to count prompt tokens: {errmsg}')
return n_tok
llmodel.llmodel_count_prompt_tokens.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
@staticmethod @staticmethod
def list_gpus(mem_required: int = 0) -> list[str]: def list_gpus(mem_required: int = 0) -> list[str]:
""" """
@ -396,6 +345,50 @@ class LLModel:
raise Exception("Model not loaded") raise Exception("Model not loaded")
return llmodel.llmodel_threadCount(self.model) return llmodel.llmodel_threadCount(self.model)
def _set_context(
self,
n_predict: int = 4096,
top_k: int = 40,
top_p: float = 0.9,
min_p: float = 0.0,
temp: float = 0.1,
n_batch: int = 8,
repeat_penalty: float = 1.2,
repeat_last_n: int = 10,
context_erase: float = 0.75,
reset_context: bool = False,
):
if self.context is None:
context = LLModelPromptContext(
tokens_size=0,
n_past=0,
n_ctx=0,
n_predict=n_predict,
top_k=top_k,
top_p=top_p,
min_p=min_p,
temp=temp,
n_batch=n_batch,
repeat_penalty=repeat_penalty,
repeat_last_n=repeat_last_n,
context_erase=context_erase,
)
self.context = context
else:
context = self.context
if reset_context:
self.context.n_past = 0
self.context.n_predict = n_predict
self.context.top_k = top_k
self.context.top_p = top_p
self.context.min_p = min_p
self.context.temp = temp
self.context.n_batch = n_batch
self.context.repeat_penalty = repeat_penalty
self.context.repeat_last_n = repeat_last_n
self.context.context_erase = context_erase
@overload @overload
def generate_embeddings( def generate_embeddings(
self, text: str, prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool, self, text: str, prefix: str | None, dimensionality: int, do_mean: bool, atlas: bool,
@ -465,18 +458,20 @@ class LLModel:
def prompt_model( def prompt_model(
self, self,
prompt : str, prompt: str,
callback : ResponseCallbackType, prompt_template: str,
n_predict : int = 4096, callback: ResponseCallbackType,
top_k : int = 40, n_predict: int = 4096,
top_p : float = 0.9, top_k: int = 40,
min_p : float = 0.0, top_p: float = 0.9,
temp : float = 0.1, min_p: float = 0.0,
n_batch : int = 8, temp: float = 0.1,
repeat_penalty : float = 1.2, n_batch: int = 8,
repeat_last_n : int = 10, repeat_penalty: float = 1.2,
context_erase : float = 0.75, repeat_last_n: int = 10,
reset_context : bool = False, context_erase: float = 0.75,
reset_context: bool = False,
special: bool = False,
): ):
""" """
Generate response from model from a prompt. Generate response from model from a prompt.
@ -499,38 +494,35 @@ class LLModel:
self.buffer.clear() self.buffer.clear()
self.buff_expecting_cont_bytes = 0 self.buff_expecting_cont_bytes = 0
context = LLModelPromptContext( self._set_context(
n_predict = n_predict, n_predict=n_predict,
top_k = top_k, top_k=top_k,
top_p = top_p, top_p=top_p,
min_p = min_p, min_p=min_p,
temp = temp, temp=temp,
n_batch = n_batch, n_batch=n_batch,
repeat_penalty = repeat_penalty, repeat_penalty=repeat_penalty,
repeat_last_n = repeat_last_n, repeat_last_n=repeat_last_n,
context_erase = context_erase, context_erase=context_erase,
reset_context=reset_context,
) )
error_msg: bytes | None = None llmodel.llmodel_prompt(
def error_callback(msg: bytes) -> None:
nonlocal error_msg
error_msg = msg
err = ctypes.c_char_p()
if not llmodel.llmodel_prompt(
self.model, self.model,
ctypes.c_char_p(prompt.encode()), ctypes.c_char_p(prompt.encode()),
ctypes.c_char_p(prompt_template.encode()),
PromptCallback(self._prompt_callback), PromptCallback(self._prompt_callback),
ResponseCallback(self._callback_decoder(callback)), ResponseCallback(self._callback_decoder(callback)),
context, RecalculateCallback(self._recalculate_callback),
ctypes.byref(err), self.context,
): special,
s = err.value ctypes.c_char_p(),
raise RuntimeError(f"prompt error: {'null' if s is None else s.decode()}") )
def prompt_model_streaming( def prompt_model_streaming(
self, prompt: str, callback: ResponseCallbackType = empty_response_callback, **kwargs: Any, self, prompt: str, prompt_template: str, callback: ResponseCallbackType = empty_response_callback, **kwargs
) -> Iterator[str]: ) -> Iterable[str]:
if self.model is None: if self.model is None:
self._raise_closed() self._raise_closed()
@ -549,15 +541,15 @@ class LLModel:
return _generator_callback return _generator_callback
def run_llmodel_prompt(prompt: str, callback: ResponseCallbackType, **kwargs): def run_llmodel_prompt(prompt: str, prompt_template: str, callback: ResponseCallbackType, **kwargs):
self.prompt_model(prompt, callback, **kwargs) self.prompt_model(prompt, prompt_template, callback, **kwargs)
output_queue.put(Sentinel.TERMINATING_SYMBOL) output_queue.put(Sentinel.TERMINATING_SYMBOL)
# Kick off llmodel_prompt in separate thread so we can return generator # Kick off llmodel_prompt in separate thread so we can return generator
# immediately # immediately
thread = threading.Thread( thread = threading.Thread(
target=run_llmodel_prompt, target=run_llmodel_prompt,
args=(prompt, _generator_callback_wrapper(callback)), args=(prompt, prompt_template, _generator_callback_wrapper(callback)),
kwargs=kwargs, kwargs=kwargs,
) )
thread.start() thread.start()
@ -612,5 +604,10 @@ class LLModel:
# Empty prompt callback # Empty prompt callback
@staticmethod @staticmethod
def _prompt_callback(token_ids: ctypes._Pointer[ctypes.c_int32], n_token_ids: int, cached: bool) -> bool: def _prompt_callback(token_id: int) -> bool:
return True return True
# Empty recalculate callback
@staticmethod
def _recalculate_callback(is_recalculating: bool) -> bool:
return is_recalculating

View File

@ -4,66 +4,38 @@ Python only API for running all GPT4All models.
from __future__ import annotations from __future__ import annotations
import hashlib import hashlib
import json
import os import os
import platform import platform
import re import re
import sys import sys
import time
import warnings import warnings
from contextlib import contextmanager from contextlib import contextmanager
from datetime import datetime
from pathlib import Path from pathlib import Path
from types import TracebackType from types import TracebackType
from typing import TYPE_CHECKING, Any, Iterable, Iterator, Literal, NamedTuple, NoReturn, Protocol, TypedDict, overload from typing import TYPE_CHECKING, Any, Iterable, Literal, Protocol, overload
import jinja2
import requests import requests
from jinja2.sandbox import ImmutableSandboxedEnvironment
from requests.exceptions import ChunkedEncodingError from requests.exceptions import ChunkedEncodingError
from tqdm import tqdm from tqdm import tqdm
from urllib3.exceptions import IncompleteRead, ProtocolError from urllib3.exceptions import IncompleteRead, ProtocolError
from ._pyllmodel import (CancellationError as CancellationError, EmbCancelCallbackType, EmbedResult as EmbedResult, from ._pyllmodel import (CancellationError as CancellationError, EmbCancelCallbackType, EmbedResult as EmbedResult,
LLModel, ResponseCallbackType, _operator_call, empty_response_callback) LLModel, ResponseCallbackType, empty_response_callback)
if TYPE_CHECKING: if TYPE_CHECKING:
from typing_extensions import Self, TypeAlias from typing_extensions import Self, TypeAlias
if sys.platform == "darwin": if sys.platform == 'darwin':
import fcntl import fcntl
# TODO: move to config # TODO: move to config
DEFAULT_MODEL_DIRECTORY = Path.home() / ".cache" / "gpt4all" DEFAULT_MODEL_DIRECTORY = Path.home() / ".cache" / "gpt4all"
ConfigType: TypeAlias = "dict[str, Any]" DEFAULT_PROMPT_TEMPLATE = "### Human:\n{0}\n\n### Assistant:\n"
# Environment setup adapted from HF transformers ConfigType: TypeAlias = 'dict[str, Any]'
@_operator_call MessageType: TypeAlias = 'dict[str, str]'
def _jinja_env() -> ImmutableSandboxedEnvironment:
def raise_exception(message: str) -> NoReturn:
raise jinja2.exceptions.TemplateError(message)
def tojson(obj: Any, indent: int | None = None) -> str:
return json.dumps(obj, ensure_ascii=False, indent=indent)
def strftime_now(fmt: str) -> str:
return datetime.now().strftime(fmt)
env = ImmutableSandboxedEnvironment(trim_blocks=True, lstrip_blocks=True)
env.filters["tojson" ] = tojson
env.globals["raise_exception"] = raise_exception
env.globals["strftime_now" ] = strftime_now
return env
class MessageType(TypedDict):
role: str
content: str
class ChatSession(NamedTuple):
template: jinja2.Template
history: list[MessageType]
class Embed4All: class Embed4All:
@ -83,7 +55,7 @@ class Embed4All:
kwargs: Remaining keyword arguments are passed to the `GPT4All` constructor. kwargs: Remaining keyword arguments are passed to the `GPT4All` constructor.
""" """
if model_name is None: if model_name is None:
model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf" model_name = 'all-MiniLM-L6-v2.gguf2.f16.gguf'
self.gpt4all = GPT4All(model_name, n_threads=n_threads, device=device, **kwargs) self.gpt4all = GPT4All(model_name, n_threads=n_threads, device=device, **kwargs)
def __enter__(self) -> Self: def __enter__(self) -> Self:
@ -174,18 +146,18 @@ class Embed4All:
dimensionality = -1 dimensionality = -1
else: else:
if dimensionality <= 0: if dimensionality <= 0:
raise ValueError(f"Dimensionality must be None or a positive integer, got {dimensionality}") raise ValueError(f'Dimensionality must be None or a positive integer, got {dimensionality}')
if dimensionality < self.MIN_DIMENSIONALITY: if dimensionality < self.MIN_DIMENSIONALITY:
warnings.warn( warnings.warn(
f"Dimensionality {dimensionality} is less than the suggested minimum of {self.MIN_DIMENSIONALITY}." f'Dimensionality {dimensionality} is less than the suggested minimum of {self.MIN_DIMENSIONALITY}.'
" Performance may be degraded." ' Performance may be degraded.'
) )
try: try:
do_mean = {"mean": True, "truncate": False}[long_text_mode] do_mean = {"mean": True, "truncate": False}[long_text_mode]
except KeyError: except KeyError:
raise ValueError(f"Long text mode must be one of 'mean' or 'truncate', got {long_text_mode!r}") raise ValueError(f"Long text mode must be one of 'mean' or 'truncate', got {long_text_mode!r}")
result = self.gpt4all.model.generate_embeddings(text, prefix, dimensionality, do_mean, atlas, cancel_cb) result = self.gpt4all.model.generate_embeddings(text, prefix, dimensionality, do_mean, atlas, cancel_cb)
return result if return_dict else result["embeddings"] return result if return_dict else result['embeddings']
class GPT4All: class GPT4All:
@ -233,30 +205,31 @@ class GPT4All:
""" """
self.model_type = model_type self.model_type = model_type
self._chat_session: ChatSession | None = None self._history: list[MessageType] | None = None
self._current_prompt_template: str = "{0}"
device_init = None device_init = None
if sys.platform == "darwin": if sys.platform == 'darwin':
if device is None: if device is None:
backend = "auto" # "auto" is effectively "metal" due to currently non-functional fallback backend = 'auto' # 'auto' is effectively 'metal' due to currently non-functional fallback
elif device == "cpu": elif device == 'cpu':
backend = "cpu" backend = 'cpu'
else: else:
if platform.machine() != "arm64" or device != "gpu": if platform.machine() != 'arm64' or device != 'gpu':
raise ValueError(f"Unknown device for this platform: {device}") raise ValueError(f'Unknown device for this platform: {device}')
backend = "metal" backend = 'metal'
else: else:
backend = "kompute" backend = 'kompute'
if device is None or device == "cpu": if device is None or device == 'cpu':
pass # use kompute with no device pass # use kompute with no device
elif device in ("cuda", "kompute"): elif device in ('cuda', 'kompute'):
backend = device backend = device
device_init = "gpu" device_init = 'gpu'
elif device.startswith("cuda:"): elif device.startswith('cuda:'):
backend = "cuda" backend = 'cuda'
device_init = _remove_prefix(device, "cuda:") device_init = device.removeprefix('cuda:')
else: else:
device_init = _remove_prefix(device, "kompute:") device_init = device.removeprefix('kompute:')
# Retrieve model and download if allowed # Retrieve model and download if allowed
self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose) self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
@ -292,13 +265,7 @@ class GPT4All:
@property @property
def current_chat_session(self) -> list[MessageType] | None: def current_chat_session(self) -> list[MessageType] | None:
return None if self._chat_session is None else self._chat_session.history return None if self._history is None else list(self._history)
@current_chat_session.setter
def current_chat_session(self, history: list[MessageType]) -> None:
if self._chat_session is None:
raise ValueError("current_chat_session may only be set when there is an active chat session")
self._chat_session.history[:] = history
@staticmethod @staticmethod
def list_models() -> list[ConfigType]: def list_models() -> list[ConfigType]:
@ -310,7 +277,7 @@ class GPT4All:
""" """
resp = requests.get("https://gpt4all.io/models/models3.json") resp = requests.get("https://gpt4all.io/models/models3.json")
if resp.status_code != 200: if resp.status_code != 200:
raise ValueError(f"Request failed: HTTP {resp.status_code} {resp.reason}") raise ValueError(f'Request failed: HTTP {resp.status_code} {resp.reason}')
return resp.json() return resp.json()
@classmethod @classmethod
@ -340,9 +307,15 @@ class GPT4All:
# get the config for the model # get the config for the model
config: ConfigType = {} config: ConfigType = {}
if allow_download: if allow_download:
models = cls.list_models() available_models = cls.list_models()
if (model := next((m for m in models if m["filename"] == model_filename), None)) is not None:
config.update(model) for m in available_models:
if model_filename == m["filename"]:
tmpl = m.get("promptTemplate", DEFAULT_PROMPT_TEMPLATE)
# change to Python-style formatting
m["promptTemplate"] = tmpl.replace("%1", "{0}", 1).replace("%2", "{1}", 1)
config.update(m)
break
# Validate download directory # Validate download directory
if model_path is None: if model_path is None:
@ -384,7 +357,7 @@ class GPT4All:
expected_md5: str | None = None, expected_md5: str | None = None,
) -> str | os.PathLike[str]: ) -> str | os.PathLike[str]:
""" """
Download model from gpt4all.io. Download model from https://gpt4all.io.
Args: Args:
model_filename: Filename of model (with .gguf extension). model_filename: Filename of model (with .gguf extension).
@ -406,13 +379,13 @@ class GPT4All:
headers = {} headers = {}
if offset: if offset:
print(f"\nDownload interrupted, resuming from byte position {offset}", file=sys.stderr) print(f"\nDownload interrupted, resuming from byte position {offset}", file=sys.stderr)
headers["Range"] = f"bytes={offset}-" # resume incomplete response headers['Range'] = f'bytes={offset}-' # resume incomplete response
headers["Accept-Encoding"] = "identity" # Content-Encoding changes meaning of ranges headers["Accept-Encoding"] = "identity" # Content-Encoding changes meaning of ranges
response = requests.get(url, stream=True, headers=headers) response = requests.get(url, stream=True, headers=headers)
if response.status_code not in (200, 206): if response.status_code not in (200, 206):
raise ValueError(f"Request failed: HTTP {response.status_code} {response.reason}") raise ValueError(f'Request failed: HTTP {response.status_code} {response.reason}')
if offset and (response.status_code != 206 or str(offset) not in response.headers.get("Content-Range", "")): if offset and (response.status_code != 206 or str(offset) not in response.headers.get('Content-Range', '')):
raise ValueError("Connection was interrupted and server does not support range requests") raise ValueError('Connection was interrupted and server does not support range requests')
if (enc := response.headers.get("Content-Encoding")) is not None: if (enc := response.headers.get("Content-Encoding")) is not None:
raise ValueError(f"Expected identity Content-Encoding, got {enc}") raise ValueError(f"Expected identity Content-Encoding, got {enc}")
return response return response
@ -511,19 +484,19 @@ class GPT4All:
def generate( def generate(
self, self,
prompt : str, prompt: str,
*, *,
max_tokens : int = 200, max_tokens: int = 200,
temp : float = 0.7, temp: float = 0.7,
top_k : int = 40, top_k: int = 40,
top_p : float = 0.4, top_p: float = 0.4,
min_p : float = 0.0, min_p: float = 0.0,
repeat_penalty : float = 1.18, repeat_penalty: float = 1.18,
repeat_last_n : int = 64, repeat_last_n: int = 64,
n_batch : int = 8, n_batch: int = 8,
n_predict : int | None = None, n_predict: int | None = None,
streaming : bool = False, streaming: bool = False,
callback : ResponseCallbackType = empty_response_callback, callback: ResponseCallbackType = empty_response_callback,
) -> Any: ) -> Any:
""" """
Generate outputs from any GPT4All model. Generate outputs from any GPT4All model.
@ -548,94 +521,122 @@ class GPT4All:
# Preparing the model request # Preparing the model request
generate_kwargs: dict[str, Any] = dict( generate_kwargs: dict[str, Any] = dict(
temp = temp, temp=temp,
top_k = top_k, top_k=top_k,
top_p = top_p, top_p=top_p,
min_p = min_p, min_p=min_p,
repeat_penalty = repeat_penalty, repeat_penalty=repeat_penalty,
repeat_last_n = repeat_last_n, repeat_last_n=repeat_last_n,
n_batch = n_batch, n_batch=n_batch,
n_predict = n_predict if n_predict is not None else max_tokens, n_predict=n_predict if n_predict is not None else max_tokens,
) )
# Prepare the callback, process the model response if self._history is not None:
full_response = "" # check if there is only one message, i.e. system prompt:
reset = len(self._history) == 1
self._history.append({"role": "user", "content": prompt})
def _callback_wrapper(token_id: int, response: str) -> bool: fct_func = self._format_chat_prompt_template.__func__ # type: ignore[attr-defined]
nonlocal full_response if fct_func is GPT4All._format_chat_prompt_template:
full_response += response if reset:
return callback(token_id, response) # ingest system prompt
# use "%1%2" and not "%1" to avoid implicit whitespace
last_msg_rendered = prompt self.model.prompt_model(self._history[0]["content"], "%1%2",
if self._chat_session is not None: empty_response_callback,
session = self._chat_session n_batch=n_batch, n_predict=0, reset_context=True, special=True)
def render(messages: list[MessageType]) -> str: prompt_template = self._current_prompt_template.format("%1", "%2")
return session.template.render( else:
messages=messages, warnings.warn(
add_generation_prompt=True, "_format_chat_prompt_template is deprecated. Please use a chat session with a prompt template.",
**self.model.special_tokens_map, DeprecationWarning,
) )
session.history.append(MessageType(role="user", content=prompt)) # special tokens won't be processed
prompt = render(session.history) prompt = self._format_chat_prompt_template(
if len(session.history) > 1: self._history[-1:],
last_msg_rendered = render(session.history[-1:]) self._history[0]["content"] if reset else "",
)
prompt_template = "%1"
generate_kwargs["reset_context"] = reset
else:
prompt_template = "%1"
generate_kwargs["reset_context"] = True
# Check request length # Prepare the callback, process the model response
last_msg_len = self.model.count_prompt_tokens(last_msg_rendered) output_collector: list[MessageType]
if last_msg_len > (limit := self.model.n_ctx - 4): output_collector = [
raise ValueError(f"Your message was too long and could not be processed ({last_msg_len} > {limit}).") {"content": ""}
] # placeholder for the self._history if chat session is not activated
if self._history is not None:
self._history.append({"role": "assistant", "content": ""})
output_collector = self._history
def _callback_wrapper(
callback: ResponseCallbackType,
output_collector: list[MessageType],
) -> ResponseCallbackType:
def _callback(token_id: int, response: str) -> bool:
nonlocal callback, output_collector
output_collector[-1]["content"] += response
return callback(token_id, response)
return _callback
# Send the request to the model # Send the request to the model
if streaming: if streaming:
def stream() -> Iterator[str]: return self.model.prompt_model_streaming(
yield from self.model.prompt_model_streaming(prompt, _callback_wrapper, **generate_kwargs) prompt,
if self._chat_session is not None: prompt_template,
self._chat_session.history.append(MessageType(role="assistant", content=full_response)) _callback_wrapper(callback, output_collector),
return stream() **generate_kwargs,
)
self.model.prompt_model(prompt, _callback_wrapper, **generate_kwargs) self.model.prompt_model(
if self._chat_session is not None: prompt,
self._chat_session.history.append(MessageType(role="assistant", content=full_response)) prompt_template,
return full_response _callback_wrapper(callback, output_collector),
**generate_kwargs,
)
return output_collector[-1]["content"]
@contextmanager @contextmanager
def chat_session( def chat_session(
self, self,
system_message: str | Literal[False] | None = None, system_prompt: str | None = None,
chat_template: str | None = None, prompt_template: str | None = None,
): ):
""" """
Context manager to hold an inference optimized chat session with a GPT4All model. Context manager to hold an inference optimized chat session with a GPT4All model.
Args: Args:
system_message: An initial instruction for the model, None to use the model default, or False to disable. Defaults to None. system_prompt: An initial instruction for the model.
chat_template: Jinja template for the conversation, or None to use the model default. Defaults to None. prompt_template: Template for the prompts with {0} being replaced by the user message.
""" """
if system_message is None: if system_prompt is None:
system_message = self.config.get("systemMessage", False) system_prompt = self.config.get("systemPrompt", "")
if chat_template is None: if prompt_template is None:
if "name" not in self.config: if (tmpl := self.config.get("promptTemplate")) is None:
raise ValueError("For sideloaded models or with allow_download=False, you must specify a chat template.") warnings.warn("Use of a sideloaded model or allow_download=False without specifying a prompt template "
if "chatTemplate" not in self.config: "is deprecated. Defaulting to Alpaca.", DeprecationWarning)
raise NotImplementedError("This model appears to have a built-in chat template, but loading it is not " tmpl = DEFAULT_PROMPT_TEMPLATE
"currently implemented. Please pass a template to chat_session() directly.") prompt_template = tmpl
if (tmpl := self.config["chatTemplate"]) is None:
raise ValueError(f"The model {self.config['name']!r} does not support chat.")
chat_template = tmpl
history = [] if re.search(r"%1(?![0-9])", prompt_template):
if system_message is not False: raise ValueError("Prompt template containing a literal '%1' is not supported. For a prompt "
history.append(MessageType(role="system", content=system_message)) "placeholder, please use '{0}' instead.")
self._chat_session = ChatSession(
template=_jinja_env.from_string(chat_template), self._history = [{"role": "system", "content": system_prompt}]
history=history, self._current_prompt_template = prompt_template
)
try: try:
yield self yield self
finally: finally:
self._chat_session = None self._history = None
self._current_prompt_template = "{0}"
@staticmethod @staticmethod
def list_gpus() -> list[str]: def list_gpus() -> list[str]:
@ -647,6 +648,43 @@ class GPT4All:
""" """
return LLModel.list_gpus() return LLModel.list_gpus()
def _format_chat_prompt_template(
self,
messages: list[MessageType],
default_prompt_header: str = "",
default_prompt_footer: str = "",
) -> str:
"""
Helper method for building a prompt from list of messages using the self._current_prompt_template as a template for each message.
Warning:
This function was deprecated in version 2.3.0, and will be removed in a future release.
Args:
messages: List of dictionaries. Each dictionary should have a "role" key
with value of "system", "assistant", or "user" and a "content" key with a
string value. Messages are organized such that "system" messages are at top of prompt,
and "user" and "assistant" messages are displayed in order. Assistant messages get formatted as
"Response: {content}".
Returns:
Formatted prompt.
"""
full_prompt = default_prompt_header + "\n\n" if default_prompt_header != "" else ""
for message in messages:
if message["role"] == "user":
user_message = self._current_prompt_template.format(message["content"])
full_prompt += user_message
if message["role"] == "assistant":
assistant_message = message["content"] + "\n"
full_prompt += assistant_message
full_prompt += "\n\n" + default_prompt_footer if default_prompt_footer != "" else ""
return full_prompt
def append_extension_if_missing(model_name): def append_extension_if_missing(model_name):
if not model_name.endswith((".bin", ".gguf")): if not model_name.endswith((".bin", ".gguf")):
@ -659,7 +697,7 @@ class _HasFileno(Protocol):
def _fsync(fd: int | _HasFileno) -> None: def _fsync(fd: int | _HasFileno) -> None:
if sys.platform == "darwin": if sys.platform == 'darwin':
# Apple's fsync does not flush the drive write cache # Apple's fsync does not flush the drive write cache
try: try:
fcntl.fcntl(fd, fcntl.F_FULLFSYNC) fcntl.fcntl(fd, fcntl.F_FULLFSYNC)
@ -668,7 +706,3 @@ def _fsync(fd: int | _HasFileno) -> None:
else: else:
return return
os.fsync(fd) os.fsync(fd)
def _remove_prefix(s: str, prefix: str) -> str:
return s[len(prefix):] if s.startswith(prefix) else s

View File

@ -14,14 +14,10 @@ nav:
- 'Models' : 'gpt4all_desktop/models.md' - 'Models' : 'gpt4all_desktop/models.md'
- 'LocalDocs' : 'gpt4all_desktop/localdocs.md' - 'LocalDocs' : 'gpt4all_desktop/localdocs.md'
- 'Settings' : 'gpt4all_desktop/settings.md' - 'Settings' : 'gpt4all_desktop/settings.md'
- 'Chat Templates' : 'gpt4all_desktop/chat_templates.md'
- 'Cookbook': - 'Cookbook':
- 'Local AI Chat with Microsoft Excel': 'gpt4all_desktop/cookbook/use-local-ai-models-to-privately-chat-with-microsoft-excel.md'
- 'Local AI Chat with your Google Drive': 'gpt4all_desktop/cookbook/use-local-ai-models-to-privately-chat-with-google-drive.md' - 'Local AI Chat with your Google Drive': 'gpt4all_desktop/cookbook/use-local-ai-models-to-privately-chat-with-google-drive.md'
- 'Local AI Chat with your Obsidian Vault': 'gpt4all_desktop/cookbook/use-local-ai-models-to-privately-chat-with-Obsidian.md' - 'Local AI Chat with your Obsidian Vault': 'gpt4all_desktop/cookbook/use-local-ai-models-to-privately-chat-with-Obsidian.md'
- 'Local AI Chat with your OneDrive': 'gpt4all_desktop/cookbook/use-local-ai-models-to-privately-chat-with-One-Drive.md' - 'Local AI Chat with your OneDrive': 'gpt4all_desktop/cookbook/use-local-ai-models-to-privately-chat-with-One-Drive.md'
- 'API Server':
- 'gpt4all_api_server/home.md'
- 'Python SDK': - 'Python SDK':
- 'gpt4all_python/home.md' - 'gpt4all_python/home.md'
- 'Monitoring': 'gpt4all_python/monitoring.md' - 'Monitoring': 'gpt4all_python/monitoring.md'

View File

@ -68,17 +68,16 @@ def get_long_description():
setup( setup(
name=package_name, name=package_name,
version="2.8.3.dev0", version="2.8.0",
description="Python bindings for GPT4All", description="Python bindings for GPT4All",
long_description=get_long_description(), long_description=get_long_description(),
long_description_content_type="text/markdown", long_description_content_type="text/markdown",
author="Nomic and the Open Source Community", author="Nomic and the Open Source Community",
author_email="support@nomic.ai", author_email="support@nomic.ai",
url="https://www.nomic.ai/gpt4all", url="https://gpt4all.io/",
project_urls={ project_urls={
"Documentation": "https://docs.gpt4all.io/gpt4all_python.html", "Documentation": "https://docs.gpt4all.io/gpt4all_python.html",
"Source code": "https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python", "Source code": "https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python",
"Changelog": "https://github.com/nomic-ai/gpt4all/blob/main/gpt4all-bindings/python/CHANGELOG.md",
}, },
classifiers = [ classifiers = [
"Programming Language :: Python :: 3", "Programming Language :: Python :: 3",
@ -88,16 +87,15 @@ setup(
python_requires='>=3.8', python_requires='>=3.8',
packages=find_packages(), packages=find_packages(),
install_requires=[ install_requires=[
'importlib_resources; python_version < "3.9"',
'jinja2~=3.1',
'requests', 'requests',
'tqdm', 'tqdm',
'importlib_resources; python_version < "3.9"',
'typing-extensions>=4.3.0; python_version >= "3.9" and python_version < "3.11"', 'typing-extensions>=4.3.0; python_version >= "3.9" and python_version < "3.11"',
], ],
extras_require={ extras_require={
'cuda': [ 'cuda': [
'nvidia-cuda-runtime-cu11', 'nvidia-cuda-runtime-cu12',
'nvidia-cublas-cu11', 'nvidia-cublas-cu12',
], ],
'all': [ 'all': [
'gpt4all[cuda]; platform_system == "Windows" or platform_system == "Linux"', 'gpt4all[cuda]; platform_system == "Windows" or platform_system == "Linux"',

View File

@ -1,5 +0,0 @@
# vim: set syntax=dosini:
[flake8]
exclude = .*,__pycache__
max-line-length = 120
extend-ignore = B001,C408,D,DAR,E221,E303,E722,E741,E800,N801,N806,P101,S101,S324,S404,S406,S410,S603,WPS100,WPS110,WPS111,WPS113,WPS114,WPS115,WPS120,WPS2,WPS300,WPS301,WPS304,WPS305,WPS306,WPS309,WPS316,WPS317,WPS318,WPS319,WPS322,WPS323,WPS326,WPS329,WPS330,WPS332,WPS336,WPS337,WPS347,WPS360,WPS361,WPS407,WPS414,WPS420,WPS421,WPS429,WPS430,WPS431,WPS432,WPS433,WPS437,WPS440,WPS440,WPS441,WPS442,WPS457,WPS458,WPS460,WPS462,WPS463,WPS473,WPS501,WPS504,WPS505,WPS508,WPS509,WPS510,WPS515,WPS516,WPS519,WPS520,WPS529,WPS531,WPS602,WPS604,WPS605,WPS608,WPS609,WPS613,WPS615

View File

@ -1,335 +0,0 @@
# Changelog
All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
## [3.10.0] - 2025-02-24
### Added
- Whitelist Granite (non-MoE) model architecture (by [@ThiloteE](https://github.com/ThiloteE) in [#3487](https://github.com/nomic-ai/gpt4all/pull/3487))
- Add support for CUDA compute 5.0 GPUs such as the GTX 750 ([#3499](https://github.com/nomic-ai/gpt4all/pull/3499))
- Add a Remote Providers tab to the Add Model page ([#3506](https://github.com/nomic-ai/gpt4all/pull/3506))
### Changed
- Substitute prettier default templates for OLMoE 7B 0924/0125 and Granite 3.1 3B/8B (by [@ThiloteE](https://github.com/ThiloteE) in [#3471](https://github.com/nomic-ai/gpt4all/pull/3471))
- Build with LLVM Clang 19 on macOS and Ubuntu ([#3500](https://github.com/nomic-ai/gpt4all/pull/3500))
### Fixed
- Fix several potential crashes ([#3465](https://github.com/nomic-ai/gpt4all/pull/3465))
- Fix visual spacing issues with deepseek models ([#3470](https://github.com/nomic-ai/gpt4all/pull/3470))
- Add missing strings to Italian translation (by [@Harvester62](https://github.com/Harvester62) in [#3496](https://github.com/nomic-ai/gpt4all/pull/3496))
- Update Simplified Chinese translation (by [@Junior2Ran](https://github.com/Junior2Ran) in [#3467](https://github.com/nomic-ai/pull/3467))
## [3.9.0] - 2025-02-04
### Added
- Whitelist OLMoE and Granite MoE model architectures (no Vulkan) (by [@ThiloteE](https://github.com/ThiloteE) in [#3449](https://github.com/nomic-ai/gpt4all/pull/3449))
### Fixed
- Fix "index N is not a prompt" when using LocalDocs with reasoning ([#3451](https://github.com/nomic-ai/gpt4all/pull/3451))
- Work around rendering artifacts on Snapdragon SoCs with Windows ([#3450](https://github.com/nomic-ai/gpt4all/pull/3450))
- Prevent DeepSeek-R1 reasoning from appearing in chat names and follow-up questions ([#3458](https://github.com/nomic-ai/gpt4all/pull/3458))
- Fix LocalDocs crash on Windows ARM when reading PDFs ([#3460](https://github.com/nomic-ai/gpt4all/pull/3460))
- Fix UI freeze when chat template is `{#` ([#3446](https://github.com/nomic-ai/gpt4all/pull/3446))
## [3.8.0] - 2025-01-30
### Added
- Support DeepSeek-R1 Qwen models ([#3431](https://github.com/nomic-ai/gpt4all/pull/3431))
- Support for think tags in the GUI ([#3440](https://github.com/nomic-ai/gpt4all/pull/3440))
- Support specifying SHA256 hash in models3.json instead of MD5 ([#3437](https://github.com/nomic-ai/gpt4all/pull/3437))
### Changed
- Use minja instead of Jinja2Cpp for significantly improved template compatibility ([#3433](https://github.com/nomic-ai/gpt4all/pull/3433))
### Fixed
- Fix regression while using localdocs with server API ([#3410](https://github.com/nomic-ai/gpt4all/pull/3410))
- Don't show system messages in server chat view ([#3411](https://github.com/nomic-ai/gpt4all/pull/3411))
- Fix `codesign --verify` failure on macOS ([#3413](https://github.com/nomic-ai/gpt4all/pull/3413))
- Code Interpreter: Fix console.log not accepting a single string after v3.7.0 ([#3426](https://github.com/nomic-ai/gpt4all/pull/3426))
- Fix Phi 3.1 Mini 128K Instruct template (by [@ThiloteE](https://github.com/ThiloteE) in [#3412](https://github.com/nomic-ai/gpt4all/pull/3412))
- Don't block the gui thread for reasoning ([#3435](https://github.com/nomic-ai/gpt4all/pull/3435))
- Fix corruption of unicode in output of reasoning models ([#3443](https://github.com/nomic-ai/gpt4all/pull/3443))
## [3.7.0] - 2025-01-21
### Added
- Add support for the Windows ARM64 target platform (CPU-only) ([#3385](https://github.com/nomic-ai/gpt4all/pull/3385))
### Changed
- Update from Qt 6.5.1 to 6.8.1 ([#3386](https://github.com/nomic-ai/gpt4all/pull/3386))
### Fixed
- Fix the timeout error in code interpreter ([#3369](https://github.com/nomic-ai/gpt4all/pull/3369))
- Fix code interpreter console.log not accepting multiple arguments ([#3371](https://github.com/nomic-ai/gpt4all/pull/3371))
- Remove 'X is defined' checks from templates for better compatibility ([#3372](https://github.com/nomic-ai/gpt4all/pull/3372))
- Jinja2Cpp: Add 'if' requirement for 'else' parsing to fix crash ([#3373](https://github.com/nomic-ai/gpt4all/pull/3373))
- Save chats on quit, even if the window isn't closed first ([#3387](https://github.com/nomic-ai/gpt4all/pull/3387))
- Add chat template replacements for five new models and fix EM German Mistral ([#3393](https://github.com/nomic-ai/gpt4all/pull/3393))
- Fix crash when entering `{{ a["foo"(` as chat template ([#3394](https://github.com/nomic-ai/gpt4all/pull/3394))
- Sign the maintenance tool on macOS to prevent crash on Sequoia ([#3391](https://github.com/nomic-ai/gpt4all/pull/3391))
- Jinja2Cpp: Fix operator precedence in 'not X is defined' ([#3402](https://github.com/nomic-ai/gpt4all/pull/3402))
## [3.6.1] - 2024-12-20
### Fixed
- Fix the stop generation button no longer working in v3.6.0 ([#3336](https://github.com/nomic-ai/gpt4all/pull/3336))
- Fix the copy entire conversation button no longer working in v3.6.0 ([#3336](https://github.com/nomic-ai/gpt4all/pull/3336))
## [3.6.0] - 2024-12-19
### Added
- Automatically substitute chat templates that are not compatible with Jinja2Cpp in GGUFs ([#3327](https://github.com/nomic-ai/gpt4all/pull/3327))
- Built-in javascript code interpreter tool plus model ([#3173](https://github.com/nomic-ai/gpt4all/pull/3173))
### Fixed
- Fix remote model template to allow for XML in messages ([#3318](https://github.com/nomic-ai/gpt4all/pull/3318))
- Fix Jinja2Cpp bug that broke system message detection in chat templates ([#3325](https://github.com/nomic-ai/gpt4all/pull/3325))
- Fix LocalDocs sources displaying in unconsolidated form after v3.5.0 ([#3328](https://github.com/nomic-ai/gpt4all/pull/3328))
## [3.5.3] - 2024-12-16
### Fixed
- Fix LocalDocs not using information from sources in v3.5.2 ([#3302](https://github.com/nomic-ai/gpt4all/pull/3302))
## [3.5.2] - 2024-12-13
### Added
- Create separate download pages for built-in and HuggingFace models ([#3269](https://github.com/nomic-ai/gpt4all/pull/3269))
### Fixed
- Fix API server ignoring assistant messages in history after v3.5.0 ([#3256](https://github.com/nomic-ai/gpt4all/pull/3256))
- Fix API server replying with incorrect token counts and stop reason after v3.5.0 ([#3256](https://github.com/nomic-ai/gpt4all/pull/3256))
- Fix API server remembering previous, unrelated conversations after v3.5.0 ([#3256](https://github.com/nomic-ai/gpt4all/pull/3256))
- Fix mishandling of default chat template and system message of cloned models in v3.5.0 ([#3262](https://github.com/nomic-ai/gpt4all/pull/3262))
- Fix untranslated text on the startup dialog ([#3293](https://github.com/nomic-ai/gpt4all/pull/3293))
## [3.5.1] - 2024-12-10
### Fixed
- Fix an incorrect value for currentResponse ([#3245](https://github.com/nomic-ai/gpt4all/pull/3245))
- Fix the default model button so it works again after 3.5.0 ([#3246](https://github.com/nomic-ai/gpt4all/pull/3246))
- Fix chat templates for Nous Hermes 2 Mistral, Mistral OpenOrca, Qwen 2, and remote models ([#3250](https://github.com/nomic-ai/gpt4all/pull/3250))
- Fix chat templates for Llama 3.2 models ([#3251](https://github.com/nomic-ai/gpt4all/pull/3251))
## [3.5.0] - 2024-12-09
### Changed
- Update Italian translation (by [@Harvester62](https://github.com/Harvester62) in [#3236](https://github.com/nomic-ai/gpt4all/pull/3236))
- Update Romanian translation (by [@SINAPSA-IC](https://github.com/SINAPSA-IC) in [#3232](https://github.com/nomic-ai/gpt4all/pull/3232))
### Fixed
- Fix a few more problems with the Jinja changes ([#3239](https://github.com/nomic-ai/gpt4all/pull/3239))
## [3.5.0-rc2] - 2024-12-06
### Changed
- Fade messages out with an animation when they are removed from the chat view ([#3227](https://github.com/nomic-ai/gpt4all/pull/3227))
- Tweak wording of edit/redo confirmation dialogs ([#3228](https://github.com/nomic-ai/gpt4all/pull/3228))
- Make edit/redo buttons disabled instead of invisible when they are temporarily unavailable ([#3228](https://github.com/nomic-ai/gpt4all/pull/3228))
## [3.5.0-rc1] - 2024-12-04
### Added
- Add ability to attach text, markdown, and rst files to chat ([#3135](https://github.com/nomic-ai/gpt4all/pull/3135))
- Add feature to minimize to system tray (by [@bgallois](https://github.com/bgallois) in [#3109](https://github.com/nomic-ai/gpt4all/pull/3109))
- Basic cache for faster prefill when the input shares a prefix with previous context ([#3073](https://github.com/nomic-ai/gpt4all/pull/3073))
- Add ability to edit prompts and regenerate any response ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
### Changed
- Implement Qt 6.8 compatibility ([#3121](https://github.com/nomic-ai/gpt4all/pull/3121))
- Use Jinja for chat templates instead of per-message QString.arg-style templates ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
- API server: Use system message(s) from client instead of settings ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
- API server: Accept messages in any order supported by the model instead of requiring user/assistant pairs ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
- Remote models: Pass system message with "system" role instead of joining with user message ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
### Removed
- Remove option to save binary model state to disk ([#3147](https://github.com/nomic-ai/gpt4all/pull/3147))
### Fixed
- Fix bug in GUI when localdocs encounters binary data ([#3137](https://github.com/nomic-ai/gpt4all/pull/3137))
- Fix LocalDocs bugs that prevented some docx files from fully chunking ([#3140](https://github.com/nomic-ai/gpt4all/pull/3140))
- Fix missing softmax that was causing crashes and effectively infinite temperature since 3.4.0 ([#3202](https://github.com/nomic-ai/gpt4all/pull/3202))
## [3.4.2] - 2024-10-16
### Fixed
- Limit bm25 retrieval to only specified collections ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
- Fix bug removing documents because of a wrong case sensitive file suffix check ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
- Fix bug with hybrid localdocs search where database would get out of sync ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
- Fix GUI bug where the localdocs embedding device appears blank ([#3083](https://github.com/nomic-ai/gpt4all/pull/3083))
- Prevent LocalDocs from not making progress in certain cases ([#3094](https://github.com/nomic-ai/gpt4all/pull/3094))
## [3.4.1] - 2024-10-11
### Fixed
- Improve the Italian translation ([#3048](https://github.com/nomic-ai/gpt4all/pull/3048))
- Fix models.json cache location ([#3052](https://github.com/nomic-ai/gpt4all/pull/3052))
- Fix LocalDocs regressions caused by docx change ([#3079](https://github.com/nomic-ai/gpt4all/pull/3079))
- Fix Go code being highlighted as Java ([#3080](https://github.com/nomic-ai/gpt4all/pull/3080))
## [3.4.0] - 2024-10-08
### Added
- Add bm25 hybrid search to localdocs ([#2969](https://github.com/nomic-ai/gpt4all/pull/2969))
- LocalDocs support for .docx files ([#2986](https://github.com/nomic-ai/gpt4all/pull/2986))
- Add support for attaching Excel spreadsheet to chat ([#3007](https://github.com/nomic-ai/gpt4all/pull/3007), [#3028](https://github.com/nomic-ai/gpt4all/pull/3028))
### Changed
- Rebase llama.cpp on latest upstream as of September 26th ([#2998](https://github.com/nomic-ai/gpt4all/pull/2998))
- Change the error message when a message is too long ([#3004](https://github.com/nomic-ai/gpt4all/pull/3004))
- Simplify chatmodel to get rid of unnecessary field and bump chat version ([#3016](https://github.com/nomic-ai/gpt4all/pull/3016))
- Allow ChatLLM to have direct access to ChatModel for restoring state from text ([#3018](https://github.com/nomic-ai/gpt4all/pull/3018))
- Improvements to XLSX conversion and UI fix ([#3022](https://github.com/nomic-ai/gpt4all/pull/3022))
### Fixed
- Fix a crash when attempting to continue a chat loaded from disk ([#2995](https://github.com/nomic-ai/gpt4all/pull/2995))
- Fix the local server rejecting min\_p/top\_p less than 1 ([#2996](https://github.com/nomic-ai/gpt4all/pull/2996))
- Fix "regenerate" always forgetting the most recent message ([#3011](https://github.com/nomic-ai/gpt4all/pull/3011))
- Fix loaded chats forgetting context when there is a system prompt ([#3015](https://github.com/nomic-ai/gpt4all/pull/3015))
- Make it possible to downgrade and keep some chats, and avoid crash for some model types ([#3030](https://github.com/nomic-ai/gpt4all/pull/3030))
- Fix scroll position being reset in model view, and attempt a better fix for the clone issue ([#3042](https://github.com/nomic-ai/gpt4all/pull/3042))
## [3.3.1] - 2024-09-27 ([v3.3.y](https://github.com/nomic-ai/gpt4all/tree/v3.3.y))
### Fixed
- Fix a crash when attempting to continue a chat loaded from disk ([#2995](https://github.com/nomic-ai/gpt4all/pull/2995))
- Fix the local server rejecting min\_p/top\_p less than 1 ([#2996](https://github.com/nomic-ai/gpt4all/pull/2996))
## [3.3.0] - 2024-09-20
### Added
- Use greedy sampling when temperature is set to zero ([#2854](https://github.com/nomic-ai/gpt4all/pull/2854))
- Use configured system prompt in server mode and ignore system messages ([#2921](https://github.com/nomic-ai/gpt4all/pull/2921), [#2924](https://github.com/nomic-ai/gpt4all/pull/2924))
- Add more system information to anonymous usage stats ([#2939](https://github.com/nomic-ai/gpt4all/pull/2939))
- Check for unsupported Ubuntu and macOS versions at install time ([#2940](https://github.com/nomic-ai/gpt4all/pull/2940))
### Changed
- The offline update button now directs users to the offline installer releases page. (by [@3Simplex](https://github.com/3Simplex) in [#2888](https://github.com/nomic-ai/gpt4all/pull/2888))
- Change the website link on the home page to point to the new URL ([#2915](https://github.com/nomic-ai/gpt4all/pull/2915))
- Smaller default window size, dynamic minimum size, and scaling tweaks ([#2904](https://github.com/nomic-ai/gpt4all/pull/2904))
- Only allow a single instance of the program to be run at a time ([#2923](https://github.com/nomic-ai/gpt4all/pull/2923))
### Fixed
- Bring back "Auto" option for Embeddings Device as "Application default," which went missing in v3.1.0 ([#2873](https://github.com/nomic-ai/gpt4all/pull/2873))
- Correct a few strings in the Italian translation (by [@Harvester62](https://github.com/Harvester62) in [#2872](https://github.com/nomic-ai/gpt4all/pull/2872) and [#2909](https://github.com/nomic-ai/gpt4all/pull/2909))
- Correct typos in Traditional Chinese translation (by [@supersonictw](https://github.com/supersonictw) in [#2852](https://github.com/nomic-ai/gpt4all/pull/2852))
- Set the window icon on Linux ([#2880](https://github.com/nomic-ai/gpt4all/pull/2880))
- Corrections to the Romanian translation (by [@SINAPSA-IC](https://github.com/SINAPSA-IC) in [#2890](https://github.com/nomic-ai/gpt4all/pull/2890))
- Fix singular/plural forms of LocalDocs "x Sources" (by [@cosmic-snow](https://github.com/cosmic-snow) in [#2885](https://github.com/nomic-ai/gpt4all/pull/2885))
- Fix a typo in Model Settings (by [@3Simplex](https://github.com/3Simplex) in [#2916](https://github.com/nomic-ai/gpt4all/pull/2916))
- Fix the antenna icon tooltip when using the local server ([#2922](https://github.com/nomic-ai/gpt4all/pull/2922))
- Fix a few issues with locating files and handling errors when loading remote models on startup ([#2875](https://github.com/nomic-ai/gpt4all/pull/2875))
- Significantly improve API server request parsing and response correctness ([#2929](https://github.com/nomic-ai/gpt4all/pull/2929))
- Remove unnecessary dependency on Qt WaylandCompositor module ([#2949](https://github.com/nomic-ai/gpt4all/pull/2949))
- Update translations ([#2970](https://github.com/nomic-ai/gpt4all/pull/2970))
- Fix macOS installer and remove extra installed copy of Nomic Embed ([#2973](https://github.com/nomic-ai/gpt4all/pull/2973))
## [3.2.1] - 2024-08-13
### Fixed
- Do not initialize Vulkan driver when only using CPU ([#2843](https://github.com/nomic-ai/gpt4all/pull/2843))
- Fix a potential crash on exit when using only CPU on Linux with NVIDIA (does not affect X11) ([#2843](https://github.com/nomic-ai/gpt4all/pull/2843))
- Fix default CUDA architecture list after [#2802](https://github.com/nomic-ai/gpt4all/pull/2802) ([#2855](https://github.com/nomic-ai/gpt4all/pull/2855))
## [3.2.0] - 2024-08-12
### Added
- Add Qwen2-1.5B-Instruct to models3.json (by [@ThiloteE](https://github.com/ThiloteE) in [#2759](https://github.com/nomic-ai/gpt4all/pull/2759))
- Enable translation feature for seven languages: English, Spanish, Italian, Portuguese, Chinese Simplified, Chinese Traditional, Romanian ([#2830](https://github.com/nomic-ai/gpt4all/pull/2830))
### Changed
- Add missing entries to Italian translation (by [@Harvester62](https://github.com/Harvester62) in [#2783](https://github.com/nomic-ai/gpt4all/pull/2783))
- Use llama\_kv\_cache ops to shift context faster ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
- Don't stop generating at end of context ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
### Fixed
- Case-insensitive LocalDocs source icon detection (by [@cosmic-snow](https://github.com/cosmic-snow) in [#2761](https://github.com/nomic-ai/gpt4all/pull/2761))
- Fix comparison of pre- and post-release versions for update check and models3.json ([#2762](https://github.com/nomic-ai/gpt4all/pull/2762), [#2772](https://github.com/nomic-ai/gpt4all/pull/2772))
- Fix several backend issues ([#2778](https://github.com/nomic-ai/gpt4all/pull/2778))
- Restore leading space removal logic that was incorrectly removed in [#2694](https://github.com/nomic-ai/gpt4all/pull/2694)
- CUDA: Cherry-pick llama.cpp DMMV cols requirement fix that caused a crash with long conversations since [#2694](https://github.com/nomic-ai/gpt4all/pull/2694)
- Make reverse prompt detection work more reliably and prevent it from breaking output ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
- Disallow context shift for chat name and follow-up generation to prevent bugs ([#2781](https://github.com/nomic-ai/gpt4all/pull/2781))
- Explicitly target macOS 12.6 in CI to fix Metal compatibility on older macOS ([#2846](https://github.com/nomic-ai/gpt4all/pull/2846))
## [3.1.1] - 2024-07-27
### Added
- Add Llama 3.1 8B Instruct to models3.json (by [@3Simplex](https://github.com/3Simplex) in [#2731](https://github.com/nomic-ai/gpt4all/pull/2731) and [#2732](https://github.com/nomic-ai/gpt4all/pull/2732))
- Portuguese (BR) translation (by [thiagojramos](https://github.com/thiagojramos) in [#2733](https://github.com/nomic-ai/gpt4all/pull/2733))
- Support adding arbitrary OpenAI-compatible models by URL (by [@supersonictw](https://github.com/supersonictw) in [#2683](https://github.com/nomic-ai/gpt4all/pull/2683))
- Support Llama 3.1 RoPE scaling ([#2758](https://github.com/nomic-ai/gpt4all/pull/2758))
### Changed
- Add missing entries to Chinese (Simplified) translation (by [wuodoo](https://github.com/wuodoo) in [#2716](https://github.com/nomic-ai/gpt4all/pull/2716) and [#2749](https://github.com/nomic-ai/gpt4all/pull/2749))
- Update translation files and add missing paths to CMakeLists.txt ([#2735](https://github.com/nomic-ai/gpt4all/pull/2735))
## [3.1.0] - 2024-07-24
### Added
- Generate suggested follow-up questions ([#2634](https://github.com/nomic-ai/gpt4all/pull/2634), [#2723](https://github.com/nomic-ai/gpt4all/pull/2723))
- Also add options for the chat name and follow-up question prompt templates
- Scaffolding for translations ([#2612](https://github.com/nomic-ai/gpt4all/pull/2612))
- Spanish (MX) translation (by [@jstayco](https://github.com/jstayco) in [#2654](https://github.com/nomic-ai/gpt4all/pull/2654))
- Chinese (Simplified) translation by mikage ([#2657](https://github.com/nomic-ai/gpt4all/pull/2657))
- Dynamic changes of language and locale at runtime ([#2659](https://github.com/nomic-ai/gpt4all/pull/2659), [#2677](https://github.com/nomic-ai/gpt4all/pull/2677))
- Romanian translation by [@SINAPSA\_IC](https://github.com/SINAPSA_IC) ([#2662](https://github.com/nomic-ai/gpt4all/pull/2662))
- Chinese (Traditional) translation (by [@supersonictw](https://github.com/supersonictw) in [#2661](https://github.com/nomic-ai/gpt4all/pull/2661))
- Italian translation (by [@Harvester62](https://github.com/Harvester62) in [#2700](https://github.com/nomic-ai/gpt4all/pull/2700))
### Changed
- Customize combo boxes and context menus to fit the new style ([#2535](https://github.com/nomic-ai/gpt4all/pull/2535))
- Improve view bar scaling and Model Settings layout ([#2520](https://github.com/nomic-ai/gpt4all/pull/2520))
- Make the logo spin while the model is generating ([#2557](https://github.com/nomic-ai/gpt4all/pull/2557))
- Server: Reply to wrong GET/POST method with HTTP 405 instead of 404 (by [@cosmic-snow](https://github.com/cosmic-snow) in [#2615](https://github.com/nomic-ai/gpt4all/pull/2615))
- Update theme for menus (by [@3Simplex](https://github.com/3Simplex) in [#2578](https://github.com/nomic-ai/gpt4all/pull/2578))
- Move the "stop" button to the message box ([#2561](https://github.com/nomic-ai/gpt4all/pull/2561))
- Build with CUDA 11.8 for better compatibility ([#2639](https://github.com/nomic-ai/gpt4all/pull/2639))
- Make links in latest news section clickable ([#2643](https://github.com/nomic-ai/gpt4all/pull/2643))
- Support translation of settings choices ([#2667](https://github.com/nomic-ai/gpt4all/pull/2667), [#2690](https://github.com/nomic-ai/gpt4all/pull/2690))
- Improve LocalDocs view's error message (by @cosmic-snow in [#2679](https://github.com/nomic-ai/gpt4all/pull/2679))
- Ignore case of LocalDocs file extensions ([#2642](https://github.com/nomic-ai/gpt4all/pull/2642), [#2684](https://github.com/nomic-ai/gpt4all/pull/2684))
- Update llama.cpp to commit 87e397d00 from July 19th ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694), [#2702](https://github.com/nomic-ai/gpt4all/pull/2702))
- Add support for GPT-NeoX, Gemma 2, OpenELM, ChatGLM, and Jais architectures (all with Vulkan support)
- Add support for DeepSeek-V2 architecture (no Vulkan support)
- Enable Vulkan support for StarCoder2, XVERSE, Command R, and OLMo
- Show scrollbar in chat collections list as needed (by [@cosmic-snow](https://github.com/cosmic-snow) in [#2691](https://github.com/nomic-ai/gpt4all/pull/2691))
### Removed
- Remove support for GPT-J models ([#2676](https://github.com/nomic-ai/gpt4all/pull/2676), [#2693](https://github.com/nomic-ai/gpt4all/pull/2693))
### Fixed
- Fix placement of thumbs-down and datalake opt-in dialogs ([#2540](https://github.com/nomic-ai/gpt4all/pull/2540))
- Select the correct folder with the Linux fallback folder dialog ([#2541](https://github.com/nomic-ai/gpt4all/pull/2541))
- Fix clone button sometimes producing blank model info ([#2545](https://github.com/nomic-ai/gpt4all/pull/2545))
- Fix jerky chat view scrolling ([#2555](https://github.com/nomic-ai/gpt4all/pull/2555))
- Fix "reload" showing for chats with missing models ([#2520](https://github.com/nomic-ai/gpt4all/pull/2520)
- Fix property binding loop warning ([#2601](https://github.com/nomic-ai/gpt4all/pull/2601))
- Fix UI hang with certain chat view content ([#2543](https://github.com/nomic-ai/gpt4all/pull/2543))
- Fix crash when Kompute falls back to CPU ([#2640](https://github.com/nomic-ai/gpt4all/pull/2640))
- Fix several Vulkan resource management issues ([#2694](https://github.com/nomic-ai/gpt4all/pull/2694))
- Fix crash/hang when some models stop generating, by showing special tokens ([#2701](https://github.com/nomic-ai/gpt4all/pull/2701))
[3.10.0]: https://github.com/nomic-ai/gpt4all/compare/v3.9.0...v3.10.0
[3.9.0]: https://github.com/nomic-ai/gpt4all/compare/v3.8.0...v3.9.0
[3.8.0]: https://github.com/nomic-ai/gpt4all/compare/v3.7.0...v3.8.0
[3.7.0]: https://github.com/nomic-ai/gpt4all/compare/v3.6.1...v3.7.0
[3.6.1]: https://github.com/nomic-ai/gpt4all/compare/v3.6.0...v3.6.1
[3.6.0]: https://github.com/nomic-ai/gpt4all/compare/v3.5.3...v3.6.0
[3.5.3]: https://github.com/nomic-ai/gpt4all/compare/v3.5.2...v3.5.3
[3.5.2]: https://github.com/nomic-ai/gpt4all/compare/v3.5.1...v3.5.2
[3.5.1]: https://github.com/nomic-ai/gpt4all/compare/v3.5.0...v3.5.1
[3.5.0]: https://github.com/nomic-ai/gpt4all/compare/v3.5.0-rc2...v3.5.0
[3.5.0-rc2]: https://github.com/nomic-ai/gpt4all/compare/v3.5.0-rc1...v3.5.0-rc2
[3.5.0-rc1]: https://github.com/nomic-ai/gpt4all/compare/v3.4.2...v3.5.0-rc1
[3.4.2]: https://github.com/nomic-ai/gpt4all/compare/v3.4.1...v3.4.2
[3.4.1]: https://github.com/nomic-ai/gpt4all/compare/v3.4.0...v3.4.1
[3.4.0]: https://github.com/nomic-ai/gpt4all/compare/v3.3.0...v3.4.0
[3.3.1]: https://github.com/nomic-ai/gpt4all/compare/v3.3.0...v3.3.1
[3.3.0]: https://github.com/nomic-ai/gpt4all/compare/v3.2.1...v3.3.0
[3.2.1]: https://github.com/nomic-ai/gpt4all/compare/v3.2.0...v3.2.1
[3.2.0]: https://github.com/nomic-ai/gpt4all/compare/v3.1.1...v3.2.0
[3.1.1]: https://github.com/nomic-ai/gpt4all/compare/v3.1.0...v3.1.1
[3.1.0]: https://github.com/nomic-ai/gpt4all/compare/v3.0.0...v3.1.0

View File

@ -1,18 +1,8 @@
cmake_minimum_required(VERSION 3.25) # for try_compile SOURCE_FROM_VAR cmake_minimum_required(VERSION 3.16)
include(../common/common.cmake) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_CXX_STANDARD 20)
set(APP_VERSION_MAJOR 3) set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(APP_VERSION_MINOR 10)
set(APP_VERSION_PATCH 1)
set(APP_VERSION_BASE "${APP_VERSION_MAJOR}.${APP_VERSION_MINOR}.${APP_VERSION_PATCH}")
set(APP_VERSION "${APP_VERSION_BASE}-dev0")
project(gpt4all VERSION ${APP_VERSION_BASE} LANGUAGES CXX C)
if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/install CACHE PATH "..." FORCE)
endif()
if(APPLE) if(APPLE)
option(BUILD_UNIVERSAL "Build a Universal binary on macOS" OFF) option(BUILD_UNIVERSAL "Build a Universal binary on macOS" OFF)
@ -26,88 +16,38 @@ if(APPLE)
endif() endif()
endif() endif()
find_package(Python3 3.12 QUIET COMPONENTS Interpreter) set(APP_VERSION_MAJOR 3)
set(APP_VERSION_MINOR 1)
option(GPT4ALL_TEST "Build the tests" ${Python3_FOUND}) set(APP_VERSION_PATCH 2)
option(GPT4ALL_LOCALHOST "Build installer for localhost repo" OFF) set(APP_VERSION_BASE "${APP_VERSION_MAJOR}.${APP_VERSION_MINOR}.${APP_VERSION_PATCH}")
option(GPT4ALL_OFFLINE_INSTALLER "Build an offline installer" OFF) set(APP_VERSION "${APP_VERSION_BASE}-dev0")
option(GPT4ALL_SIGN_INSTALL "Sign installed binaries and installers (requires signing identities)" OFF)
option(GPT4ALL_GEN_CPACK_CONFIG "Generate the CPack config.xml in the package step and nothing else." OFF)
set(GPT4ALL_USE_QTPDF "AUTO" CACHE STRING "Whether to Use QtPDF for LocalDocs. If OFF or not available on this platform, PDFium is used.")
set_property(CACHE GPT4ALL_USE_QTPDF PROPERTY STRINGS AUTO ON OFF)
set(GPT4ALL_FORCE_D3D12 "AUTO" CACHE STRING "Whether to use Direct3D 12 as the Qt scene graph backend. Defaults to ON on Windows ARM.")
set_property(CACHE GPT4ALL_FORCE_D3D12 PROPERTY STRINGS AUTO ON OFF)
include(cmake/cpack_config.cmake)
if (GPT4ALL_GEN_CPACK_CONFIG)
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/cpack-steal-config.cmake.in"
"${CMAKE_BINARY_DIR}/cmake/cpack-steal-config.cmake" @ONLY)
set(CPACK_POST_BUILD_SCRIPTS ${CMAKE_BINARY_DIR}/cmake/cpack-steal-config.cmake)
include(CPack)
include(CPackIFW)
return()
endif()
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
if (MSVC)
# Enable accurate __cplusplus macro
add_compile_options($<$<COMPILE_LANGUAGE:CXX>:/Zc:__cplusplus>)
endif()
# conftests
function(check_cpp_feature FEATURE_NAME MIN_VALUE)
message(CHECK_START "Checking for ${FEATURE_NAME} >= ${MIN_VALUE}")
string(CONCAT SRC
"#include <version>\n"
"#if !defined(${FEATURE_NAME}) || ${FEATURE_NAME} < ${MIN_VALUE}\n"
"# error \"${FEATURE_NAME} is not defined or less than ${MIN_VALUE}\"\n"
"#endif\n"
"int main() { return 0; }\n"
)
try_compile(HAS_FEATURE SOURCE_FROM_VAR "test_${FEATURE_NAME}.cpp" SRC)
if (NOT HAS_FEATURE)
message(CHECK_FAIL "fail")
message(FATAL_ERROR
"The C++ compiler\n \"${CMAKE_CXX_COMPILER}\"\n"
"is too old to support ${FEATURE_NAME} >= ${MIN_VALUE}.\n"
"Please specify a newer compiler via -DCMAKE_C_COMPILER/-DCMAKE_CXX_COMPILER."
)
endif()
message(CHECK_PASS "pass")
endfunction()
# check for monadic operations in std::optional (e.g. transform)
check_cpp_feature("__cpp_lib_optional" "202110L")
list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/Modules") list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake/Modules")
# Include the binary directory for the generated header file # Include the binary directory for the generated header file
include_directories("${CMAKE_CURRENT_BINARY_DIR}") include_directories("${CMAKE_CURRENT_BINARY_DIR}")
project(gpt4all VERSION ${APP_VERSION_BASE} LANGUAGES CXX C)
set(CMAKE_AUTOMOC ON) set(CMAKE_AUTOMOC ON)
set(CMAKE_AUTORCC ON) set(CMAKE_AUTORCC ON)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_FIND_PACKAGE_TARGETS_GLOBAL ON) option(GPT4ALL_TRANSLATIONS OFF "Build with translations")
set(GPT4ALL_QT_COMPONENTS Core HttpServer LinguistTools Quick QuickDialogs2 Sql Svg) option(GPT4ALL_LOCALHOST OFF "Build installer for localhost repo")
set(GPT4ALL_USING_QTPDF OFF) option(GPT4ALL_OFFLINE_INSTALLER "Build an offline installer" OFF)
if (CMAKE_SYSTEM_NAME MATCHES Windows AND CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|AARCH64|arm64|ARM64)$") option(GPT4ALL_SIGN_INSTALL "Sign installed binaries and installers (requires signing identities)" OFF)
# QtPDF is not available.
if (GPT4ALL_USE_QTPDF STREQUAL "ON")
message(FATAL_ERROR "QtPDF is not available on Windows ARM64.")
endif()
elseif (GPT4ALL_USE_QTPDF MATCHES "^(ON|AUTO)$")
set(GPT4ALL_USING_QTPDF ON)
list(APPEND GPT4ALL_QT_COMPONENTS Pdf)
endif()
find_package(Qt6 6.8 COMPONENTS ${GPT4ALL_QT_COMPONENTS} REQUIRED)
if (QT_KNOWN_POLICY_QTP0004) # Generate a header file with the version number
qt_policy(SET QTP0004 NEW) # generate extra qmldir files on Qt 6.8+ configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/cmake/config.h.in"
"${CMAKE_CURRENT_BINARY_DIR}/config.h"
)
if(LINUX)
find_package(Qt6 6.4 COMPONENTS Core Quick WaylandCompositor QuickDialogs2 Svg HttpServer Sql Pdf LinguistTools REQUIRED)
else()
find_package(Qt6 6.4 COMPONENTS Core Quick QuickDialogs2 Svg HttpServer Sql Pdf LinguistTools REQUIRED)
endif() endif()
# Get the Qt6Core target properties # Get the Qt6Core target properties
@ -124,62 +64,15 @@ get_filename_component(Qt6_ROOT_DIR "${Qt6_ROOT_DIR}/.." ABSOLUTE)
message(STATUS "qmake binary: ${QMAKE_EXECUTABLE}") message(STATUS "qmake binary: ${QMAKE_EXECUTABLE}")
message(STATUS "Qt 6 root directory: ${Qt6_ROOT_DIR}") message(STATUS "Qt 6 root directory: ${Qt6_ROOT_DIR}")
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
set(GPT4ALL_CONFIG_FORCE_D3D12 -1)
if (NOT CMAKE_SYSTEM_NAME MATCHES Windows OR Qt6_VERSION VERSION_LESS "6.6")
# Direct3D 12 is not available.
if (GPT4ALL_FORCE_D3D12 STREQUAL "ON")
message(FATAL_ERROR "Cannot use Direct3D 12 on this platform.")
endif()
elseif (GPT4ALL_FORCE_D3D12 MATCHES "^(ON|AUTO)$")
if (GPT4ALL_FORCE_D3D12 STREQUAL "ON" OR CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|AARCH64|arm64|ARM64)$")
set(GPT4ALL_CONFIG_FORCE_D3D12 1)
endif()
endif()
# Generate a header file for configuration
configure_file(
"${CMAKE_CURRENT_SOURCE_DIR}/src/config.h.in"
"${CMAKE_CURRENT_BINARY_DIR}/config.h"
)
add_subdirectory(deps)
add_subdirectory(../gpt4all-backend llmodel) add_subdirectory(../gpt4all-backend llmodel)
if (GPT4ALL_TEST)
enable_testing()
# Llama-3.2-1B model
set(TEST_MODEL "Llama-3.2-1B-Instruct-Q4_0.gguf")
set(TEST_MODEL_MD5 "48ff0243978606fdba19d899b77802fc")
set(TEST_MODEL_PATH "${CMAKE_BINARY_DIR}/resources/${TEST_MODEL}")
set(TEST_MODEL_URL "https://huggingface.co/bartowski/Llama-3.2-1B-Instruct-GGUF/resolve/main/${TEST_MODEL}")
# Create a custom command to download the file if it does not exist or if the checksum does not match
add_custom_command(
OUTPUT "${TEST_MODEL_PATH}"
COMMAND ${CMAKE_COMMAND} -E echo "Downloading test model from ${TEST_MODEL_URL} ..."
COMMAND ${CMAKE_COMMAND} -DURL="${TEST_MODEL_URL}" -DOUTPUT_PATH="${TEST_MODEL_PATH}" -DEXPECTED_MD5="${TEST_MODEL_MD5}" -P "${CMAKE_SOURCE_DIR}/cmake/download_model.cmake"
DEPENDS "${CMAKE_SOURCE_DIR}/cmake/download_model.cmake"
)
# Define a custom target that depends on the downloaded model
add_custom_target(download_test_model
DEPENDS "${TEST_MODEL_PATH}"
)
add_subdirectory(tests)
# The 'check' target makes sure the tests and their dependencies are up-to-date before running them
add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND} --output-on-failure DEPENDS download_test_model chat gpt4all_tests)
endif()
set(CHAT_EXE_RESOURCES) set(CHAT_EXE_RESOURCES)
# Metal shader library # Metal shader library
if (APPLE) if (APPLE)
list(APPEND CHAT_EXE_RESOURCES "${GGML_METALLIB}") list(APPEND CHAT_EXE_RESOURCES "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib")
endif() endif()
# App icon # App icon
@ -193,6 +86,8 @@ elseif (APPLE)
# And the following tells CMake where to find and install the file itself. # And the following tells CMake where to find and install the file itself.
set(APP_ICON_RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.icns") set(APP_ICON_RESOURCE "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.icns")
set_source_files_properties(${APP_ICON_RESOURCE} PROPERTIES
MACOSX_PACKAGE_LOCATION "Resources")
list(APPEND CHAT_EXE_RESOURCES "${APP_ICON_RESOURCE}") list(APPEND CHAT_EXE_RESOURCES "${APP_ICON_RESOURCE}")
endif() endif()
@ -212,49 +107,26 @@ if (APPLE)
list(APPEND CHAT_EXE_RESOURCES "${LOCAL_EMBEDDING_MODEL_PATH}") list(APPEND CHAT_EXE_RESOURCES "${LOCAL_EMBEDDING_MODEL_PATH}")
endif() endif()
if (DEFINED GGML_METALLIB)
set_source_files_properties("${GGML_METALLIB}" PROPERTIES GENERATED ON)
endif()
if (APPLE)
set_source_files_properties(${CHAT_EXE_RESOURCES} PROPERTIES MACOSX_PACKAGE_LOCATION Resources)
endif()
set(MACOS_SOURCES)
if (APPLE)
find_library(COCOA_LIBRARY Cocoa)
list(APPEND MACOS_SOURCES src/macosdock.mm src/macosdock.h)
endif()
qt_add_executable(chat qt_add_executable(chat
src/main.cpp main.cpp
src/chat.cpp src/chat.h chat.h chat.cpp
src/chatapi.cpp src/chatapi.h chatllm.h chatllm.cpp
src/chatlistmodel.cpp src/chatlistmodel.h chatmodel.h chatlistmodel.h chatlistmodel.cpp
src/chatllm.cpp src/chatllm.h chatapi.h chatapi.cpp
src/chatmodel.h src/chatmodel.cpp chatviewtextprocessor.h chatviewtextprocessor.cpp
src/chatviewtextprocessor.cpp src/chatviewtextprocessor.h database.h database.cpp
src/codeinterpreter.cpp src/codeinterpreter.h download.h download.cpp
src/database.cpp src/database.h embllm.cpp embllm.h
src/download.cpp src/download.h localdocs.h localdocs.cpp localdocsmodel.h localdocsmodel.cpp
src/embllm.cpp src/embllm.h llm.h llm.cpp
src/jinja_helpers.cpp src/jinja_helpers.h modellist.h modellist.cpp
src/jinja_replacements.cpp src/jinja_replacements.h mysettings.h mysettings.cpp
src/llm.cpp src/llm.h network.h network.cpp
src/localdocs.cpp src/localdocs.h server.h server.cpp
src/localdocsmodel.cpp src/localdocsmodel.h logger.h logger.cpp
src/logger.cpp src/logger.h ${APP_ICON_RESOURCE}
src/modellist.cpp src/modellist.h
src/mysettings.cpp src/mysettings.h
src/network.cpp src/network.h
src/server.cpp src/server.h
src/tool.cpp src/tool.h
src/toolcallparser.cpp src/toolcallparser.h
src/toolmodel.cpp src/toolmodel.h
src/xlsxtomd.cpp src/xlsxtomd.h
${CHAT_EXE_RESOURCES} ${CHAT_EXE_RESOURCES}
${MACOS_SOURCES}
) )
gpt4all_add_warning_options(chat)
qt_add_qml_module(chat qt_add_qml_module(chat
URI gpt4all URI gpt4all
@ -264,15 +136,8 @@ qt_add_qml_module(chat
main.qml main.qml
qml/AddCollectionView.qml qml/AddCollectionView.qml
qml/AddModelView.qml qml/AddModelView.qml
qml/AddGPT4AllModelView.qml
qml/AddHFModelView.qml
qml/AddRemoteModelView.qml
qml/ApplicationSettings.qml qml/ApplicationSettings.qml
qml/ChatDrawer.qml qml/ChatDrawer.qml
qml/ChatCollapsibleItem.qml
qml/ChatItemView.qml
qml/ChatMessageButton.qml
qml/ChatTextItem.qml
qml/ChatView.qml qml/ChatView.qml
qml/CollectionsDrawer.qml qml/CollectionsDrawer.qml
qml/HomeView.qml qml/HomeView.qml
@ -285,21 +150,17 @@ qt_add_qml_module(chat
qml/PopupDialog.qml qml/PopupDialog.qml
qml/SettingsView.qml qml/SettingsView.qml
qml/StartupDialog.qml qml/StartupDialog.qml
qml/ConfirmationDialog.qml qml/SwitchModelDialog.qml
qml/Theme.qml qml/Theme.qml
qml/ThumbsDownDialog.qml qml/ThumbsDownDialog.qml
qml/Toast.qml qml/Toast.qml
qml/ToastManager.qml qml/ToastManager.qml
qml/MyBusyIndicator.qml qml/MyBusyIndicator.qml
qml/MyButton.qml qml/MyButton.qml
qml/MyTabButton.qml
qml/MyCheckBox.qml qml/MyCheckBox.qml
qml/MyComboBox.qml qml/MyComboBox.qml
qml/MyDialog.qml qml/MyDialog.qml
qml/MyDirectoryField.qml qml/MyDirectoryField.qml
qml/MyFileDialog.qml
qml/MyFileIcon.qml
qml/MyFolderDialog.qml
qml/MyFancyLink.qml qml/MyFancyLink.qml
qml/MyMenu.qml qml/MyMenu.qml
qml/MyMenuItem.qml qml/MyMenuItem.qml
@ -315,7 +176,6 @@ qt_add_qml_module(chat
qml/MyTextField.qml qml/MyTextField.qml
qml/MyToolButton.qml qml/MyToolButton.qml
qml/MyWelcomeButton.qml qml/MyWelcomeButton.qml
qml/RemoteModelCard.qml
RESOURCES RESOURCES
icons/antenna_1.svg icons/antenna_1.svg
icons/antenna_2.svg icons/antenna_2.svg
@ -333,12 +193,9 @@ qt_add_qml_module(chat
icons/edit.svg icons/edit.svg
icons/eject.svg icons/eject.svg
icons/email.svg icons/email.svg
icons/file-doc.svg
icons/file-docx.svg
icons/file-md.svg icons/file-md.svg
icons/file-pdf.svg icons/file-pdf.svg
icons/file-txt.svg icons/file-txt.svg
icons/file-xls.svg
icons/file.svg icons/file.svg
icons/github.svg icons/github.svg
icons/globe.svg icons/globe.svg
@ -346,7 +203,6 @@ qt_add_qml_module(chat
icons/gpt4all-48.png icons/gpt4all-48.png
icons/gpt4all.svg icons/gpt4all.svg
icons/gpt4all_transparent.svg icons/gpt4all_transparent.svg
icons/groq.svg
icons/home.svg icons/home.svg
icons/image.svg icons/image.svg
icons/info.svg icons/info.svg
@ -354,14 +210,10 @@ qt_add_qml_module(chat
icons/left_panel_open.svg icons/left_panel_open.svg
icons/local-docs.svg icons/local-docs.svg
icons/models.svg icons/models.svg
icons/mistral.svg
icons/network.svg icons/network.svg
icons/nomic_logo.svg icons/nomic_logo.svg
icons/notes.svg icons/notes.svg
icons/paperclip.svg
icons/plus.svg icons/plus.svg
icons/plus_circle.svg
icons/openai.svg
icons/recycle.svg icons/recycle.svg
icons/regenerate.svg icons/regenerate.svg
icons/search.svg icons/search.svg
@ -374,20 +226,21 @@ qt_add_qml_module(chat
icons/trash.svg icons/trash.svg
icons/twitter.svg icons/twitter.svg
icons/up_down.svg icons/up_down.svg
icons/webpage.svg
icons/you.svg icons/you.svg
) )
qt_add_translations(chat if (GPT4ALL_TRANSLATIONS)
TS_FILES qt_add_translations(chat
${CMAKE_SOURCE_DIR}/translations/gpt4all_en_US.ts TS_FILES
${CMAKE_SOURCE_DIR}/translations/gpt4all_es_MX.ts ${CMAKE_SOURCE_DIR}/translations/gpt4all_en.ts
${CMAKE_SOURCE_DIR}/translations/gpt4all_zh_CN.ts ${CMAKE_SOURCE_DIR}/translations/gpt4all_es_MX.ts
${CMAKE_SOURCE_DIR}/translations/gpt4all_zh_TW.ts ${CMAKE_SOURCE_DIR}/translations/gpt4all_zh_CN.ts
${CMAKE_SOURCE_DIR}/translations/gpt4all_ro_RO.ts ${CMAKE_SOURCE_DIR}/translations/gpt4all_zh_TW.ts
${CMAKE_SOURCE_DIR}/translations/gpt4all_it_IT.ts ${CMAKE_SOURCE_DIR}/translations/gpt4all_ro_RO.ts
${CMAKE_SOURCE_DIR}/translations/gpt4all_pt_BR.ts ${CMAKE_SOURCE_DIR}/translations/gpt4all_it_IT.ts
) ${CMAKE_SOURCE_DIR}/translations/gpt4all_pt_BR.ts
)
endif()
set_target_properties(chat PROPERTIES set_target_properties(chat PROPERTIES
WIN32_EXECUTABLE TRUE WIN32_EXECUTABLE TRUE
@ -406,20 +259,19 @@ if (APPLE)
MACOSX_BUNDLE_GUI_IDENTIFIER gpt4all MACOSX_BUNDLE_GUI_IDENTIFIER gpt4all
MACOSX_BUNDLE_BUNDLE_VERSION ${PROJECT_VERSION} MACOSX_BUNDLE_BUNDLE_VERSION ${PROJECT_VERSION}
MACOSX_BUNDLE_SHORT_VERSION_STRING ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR} MACOSX_BUNDLE_SHORT_VERSION_STRING ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}
RESOURCE "${CHAT_EXE_RESOURCES}"
OUTPUT_NAME gpt4all OUTPUT_NAME gpt4all
) )
add_dependencies(chat ggml-metal) add_dependencies(chat ggml-metal)
endif()
if (APPLE AND GPT4ALL_SIGN_INSTALL) if(NOT MAC_SIGNING_IDENTITY)
if (NOT MAC_SIGNING_IDENTITY) if(NOT DEFINED ENV{MAC_SIGNING_CERT_NAME} AND GPT4ALL_SIGN_INSTALL)
if (NOT DEFINED ENV{MAC_SIGNING_CERT_NAME})
REPORT_MISSING_SIGNING_CONTEXT() REPORT_MISSING_SIGNING_CONTEXT()
endif() endif()
set(MAC_SIGNING_IDENTITY $ENV{MAC_SIGNING_CERT_NAME}) set(MAC_SIGNING_IDENTITY $ENV{MAC_SIGNING_CERT_NAME})
endif() endif()
if (NOT MAC_SIGNING_TID) if(NOT MAC_SIGNING_TID)
if (NOT DEFINED ENV{MAC_NOTARIZATION_TID}) if(NOT DEFINED ENV{MAC_NOTARIZATION_TID} AND GPT4ALL_SIGN_INSTALL)
REPORT_MISSING_SIGNING_CONTEXT() REPORT_MISSING_SIGNING_CONTEXT()
endif() endif()
set(MAC_SIGNING_TID $ENV{MAC_NOTARIZATION_TID}) set(MAC_SIGNING_TID $ENV{MAC_NOTARIZATION_TID})
@ -438,47 +290,37 @@ endif()
target_compile_definitions(chat target_compile_definitions(chat
PRIVATE $<$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>:QT_QML_DEBUG>) PRIVATE $<$<OR:$<CONFIG:Debug>,$<CONFIG:RelWithDebInfo>>:QT_QML_DEBUG>)
target_include_directories(chat PRIVATE src)
# usearch uses the identifier 'slots' which conflicts with Qt's 'slots' keyword # usearch uses the identifier 'slots' which conflicts with Qt's 'slots' keyword
target_compile_definitions(chat PRIVATE QT_NO_SIGNALS_SLOTS_KEYWORDS) target_compile_definitions(chat PRIVATE QT_NO_SIGNALS_SLOTS_KEYWORDS)
target_include_directories(chat PRIVATE deps/usearch/include target_include_directories(chat PRIVATE usearch/include
deps/usearch/fp16/include) usearch/fp16/include)
target_link_libraries(chat if(LINUX)
PRIVATE Qt6::Core Qt6::HttpServer Qt6::Quick Qt6::Sql Qt6::Svg) target_link_libraries(chat
if (GPT4ALL_USING_QTPDF) PRIVATE Qt6::Quick Qt6::Svg Qt6::HttpServer Qt6::Sql Qt6::Pdf Qt6::WaylandCompositor)
target_compile_definitions(chat PRIVATE GPT4ALL_USE_QTPDF)
target_link_libraries(chat PRIVATE Qt6::Pdf)
else() else()
# Link PDFium target_link_libraries(chat
target_link_libraries(chat PRIVATE pdfium) PRIVATE Qt6::Quick Qt6::Svg Qt6::HttpServer Qt6::Sql Qt6::Pdf)
endif() endif()
target_link_libraries(chat target_link_libraries(chat
PRIVATE llmodel SingleApplication fmt::fmt duckx::duckx QXlsx) PRIVATE llmodel)
target_include_directories(chat PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/deps/json/include)
target_include_directories(chat PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/deps/json/include/nlohmann)
target_include_directories(chat PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/deps/minja/include)
if (APPLE)
target_link_libraries(chat PRIVATE ${COCOA_LIBRARY})
endif()
# -- install -- # -- install --
if (APPLE) set(COMPONENT_NAME_MAIN ${PROJECT_NAME})
set(GPT4ALL_LIB_DEST bin/gpt4all.app/Contents/Frameworks)
else() if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT)
set(GPT4ALL_LIB_DEST lib) set(CMAKE_INSTALL_PREFIX ${CMAKE_BINARY_DIR}/install CACHE PATH "..." FORCE)
endif() endif()
install(TARGETS chat DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN}) install(TARGETS chat DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN})
install( install(
TARGETS llmodel TARGETS llmodel
LIBRARY DESTINATION ${GPT4ALL_LIB_DEST} COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
RUNTIME DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN} # .dll RUNTIME DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN} # .dll
) )
# We should probably iterate through the list of the cmake for backend, but these need to be installed # We should probably iterate through the list of the cmake for backend, but these need to be installed
@ -501,8 +343,8 @@ endif()
install( install(
TARGETS ${MODEL_IMPL_TARGETS} TARGETS ${MODEL_IMPL_TARGETS}
LIBRARY DESTINATION ${GPT4ALL_LIB_DEST} COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll
) )
if(APPLE AND GPT4ALL_SIGN_INSTALL) if(APPLE AND GPT4ALL_SIGN_INSTALL)
@ -531,7 +373,7 @@ if (LLMODEL_CUDA)
TARGETS llamamodel-mainline-cuda TARGETS llamamodel-mainline-cuda
llamamodel-mainline-cuda-avxonly llamamodel-mainline-cuda-avxonly
RUNTIME_DEPENDENCY_SET llama-cuda-deps RUNTIME_DEPENDENCY_SET llama-cuda-deps
LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so LIBRARY DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .so/.dylib
RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll RUNTIME DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN} # .dll
) )
if (WIN32) if (WIN32)
@ -545,38 +387,65 @@ if (LLMODEL_CUDA)
endif() endif()
endif() endif()
if (NOT GPT4ALL_USING_QTPDF)
# Install PDFium
if (WIN32)
install(FILES ${PDFium_LIBRARY} DESTINATION bin COMPONENT ${COMPONENT_NAME_MAIN}) # .dll
else()
install(FILES ${PDFium_LIBRARY} DESTINATION ${GPT4ALL_LIB_DEST} COMPONENT ${COMPONENT_NAME_MAIN}) # .so/.dylib
endif()
endif()
if (NOT APPLE) if (NOT APPLE)
install(FILES "${LOCAL_EMBEDDING_MODEL_PATH}" install(FILES "${CMAKE_BINARY_DIR}/resources/${LOCAL_EMBEDDING_MODEL}"
DESTINATION resources DESTINATION resources
COMPONENT ${COMPONENT_NAME_MAIN}) COMPONENT ${COMPONENT_NAME_MAIN})
endif() endif()
if (CMAKE_SYSTEM_NAME MATCHES Linux) set(CPACK_GENERATOR "IFW")
set(CPACK_VERBATIM_VARIABLES YES)
set(CPACK_IFW_VERBOSE ON)
if(${CMAKE_SYSTEM_NAME} MATCHES Linux)
find_program(LINUXDEPLOYQT linuxdeployqt HINTS "$ENV{HOME}/dev/linuxdeployqt/build/tools/linuxdeployqt" "$ENV{HOME}/project/linuxdeployqt/bin") find_program(LINUXDEPLOYQT linuxdeployqt HINTS "$ENV{HOME}/dev/linuxdeployqt/build/tools/linuxdeployqt" "$ENV{HOME}/project/linuxdeployqt/bin")
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/deploy-qt-linux.cmake.in" configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/deploy-qt-linux.cmake.in"
"${CMAKE_BINARY_DIR}/cmake/deploy-qt-linux.cmake" @ONLY) "${CMAKE_BINARY_DIR}/cmake/deploy-qt-linux.cmake" @ONLY)
set(CPACK_PRE_BUILD_SCRIPTS ${CMAKE_BINARY_DIR}/cmake/deploy-qt-linux.cmake) set(CPACK_PRE_BUILD_SCRIPTS ${CMAKE_BINARY_DIR}/cmake/deploy-qt-linux.cmake)
elseif (CMAKE_SYSTEM_NAME MATCHES Windows) set(CPACK_IFW_ROOT "~/Qt/Tools/QtInstallerFramework/4.6")
find_program(WINDEPLOYQT windeployqt) set(CPACK_PACKAGE_FILE_NAME "${COMPONENT_NAME_MAIN}-installer-linux")
set(CPACK_IFW_TARGET_DIRECTORY "@HomeDir@/${COMPONENT_NAME_MAIN}")
elseif(${CMAKE_SYSTEM_NAME} MATCHES Windows)
find_program(WINDEPLOYQT windeployqt HINTS ${_qt_bin_dir})
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/deploy-qt-windows.cmake.in" configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/deploy-qt-windows.cmake.in"
"${CMAKE_BINARY_DIR}/cmake/deploy-qt-windows.cmake" @ONLY) "${CMAKE_BINARY_DIR}/cmake/deploy-qt-windows.cmake" @ONLY)
set(CPACK_PRE_BUILD_SCRIPTS ${CMAKE_BINARY_DIR}/cmake/deploy-qt-windows.cmake) set(CPACK_PRE_BUILD_SCRIPTS ${CMAKE_BINARY_DIR}/cmake/deploy-qt-windows.cmake)
elseif (CMAKE_SYSTEM_NAME MATCHES Darwin) set(CPACK_IFW_ROOT "C:/Qt/Tools/QtInstallerFramework/4.6")
find_program(MACDEPLOYQT macdeployqt) set(CPACK_IFW_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.ico")
set(CPACK_PACKAGE_FILE_NAME "${COMPONENT_NAME_MAIN}-installer-win64")
set(CPACK_IFW_TARGET_DIRECTORY "@HomeDir@\\${COMPONENT_NAME_MAIN}")
elseif(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
find_program(MACDEPLOYQT macdeployqt HINTS ${_qt_bin_dir})
configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/deploy-qt-mac.cmake.in" configure_file("${CMAKE_CURRENT_SOURCE_DIR}/cmake/deploy-qt-mac.cmake.in"
"${CMAKE_BINARY_DIR}/cmake/deploy-qt-mac.cmake" @ONLY) "${CMAKE_BINARY_DIR}/cmake/deploy-qt-mac.cmake" @ONLY)
set(CPACK_PRE_BUILD_SCRIPTS ${CMAKE_BINARY_DIR}/cmake/deploy-qt-mac.cmake) set(CPACK_PRE_BUILD_SCRIPTS ${CMAKE_BINARY_DIR}/cmake/deploy-qt-mac.cmake)
set(CPACK_IFW_ROOT "~/Qt/Tools/QtInstallerFramework/4.6")
set(CPACK_IFW_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.icns")
set(CPACK_PACKAGE_FILE_NAME "${COMPONENT_NAME_MAIN}-installer-darwin")
set(CPACK_IFW_TARGET_DIRECTORY "@ApplicationsDir@/${COMPONENT_NAME_MAIN}")
set(CPACK_BUNDLE_NAME ${COMPONENT_NAME_MAIN})
set(CPACK_BUNDLE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.icns")
endif() endif()
set(CPACK_PACKAGE_INSTALL_DIRECTORY ${COMPONENT_NAME_MAIN})
set(CPACK_PACKAGE_VERSION_MAJOR ${PROJECT_VERSION_MAJOR})
set(CPACK_PACKAGE_VERSION_MINOR ${PROJECT_VERSION_MINOR})
SET(CPACK_PACKAGE_VERSION_PATCH ${PROJECT_VERSION_PATCH})
set(CPACK_PACKAGE_HOMEPAGE_URL "https://gpt4all.io")
set(CPACK_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-48.png")
set(CPACK_RESOURCE_FILE_LICENSE ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE)
set(CPACK_RESOURCE_FILE_README ${CMAKE_CURRENT_SOURCE_DIR}/README.md)
set(CPACK_PACKAGE_EXECUTABLES "GPT4All")
set(CPACK_CREATE_DESKTOP_LINKS "GPT4All")
set(CPACK_IFW_PACKAGE_NAME "GPT4All")
set(CPACK_IFW_PACKAGE_TITLE "GPT4All Installer")
set(CPACK_IFW_PACKAGE_PUBLISHER "Nomic, Inc.")
set(CPACK_IFW_PRODUCT_URL "https://gpt4all.io")
set(CPACK_IFW_PACKAGE_WIZARD_STYLE "Aero")
set(CPACK_IFW_PACKAGE_LOGO "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-48.png")
set(CPACK_IFW_PACKAGE_WINDOW_ICON "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-32.png")
set(CPACK_IFW_PACKAGE_WIZARD_SHOW_PAGE_LIST OFF)
include(InstallRequiredSystemLibraries) include(InstallRequiredSystemLibraries)
include(CPack) include(CPack)
include(CPackIFW) include(CPackIFW)
@ -588,35 +457,20 @@ endif()
cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} ESSENTIAL FORCED_INSTALLATION) cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} ESSENTIAL FORCED_INSTALLATION)
cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} VERSION ${APP_VERSION}) cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} VERSION ${APP_VERSION})
cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} LICENSES "MIT LICENSE" ${CPACK_RESOURCE_FILE_LICENSE}) cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} LICENSES "MIT LICENSE" ${CPACK_RESOURCE_FILE_LICENSE})
cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/cmake/installer_gpt4all_component.qs") cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/cmake/installerscript.qs")
cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} REPLACES "gpt4all-chat") #Was used in very earliest prototypes cpack_ifw_configure_component(${COMPONENT_NAME_MAIN} REPLACES "gpt4all-chat") #Was used in very earliest prototypes
if (APPLE AND GPT4ALL_SIGN_INSTALL)
if (GPT4ALL_OFFLINE_INSTALLER)
cpack_add_component(maintenancetool HIDDEN)
else()
cpack_add_component(maintenancetool HIDDEN DOWNLOADED)
endif()
cpack_ifw_configure_component(maintenancetool ESSENTIAL FORCED_INSTALLATION)
cpack_ifw_configure_component(maintenancetool VERSION ${APP_VERSION})
cpack_ifw_configure_component(maintenancetool SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/cmake/installer_maintenancetool_component.qs")
endif()
if (GPT4ALL_LOCALHOST) if (GPT4ALL_LOCALHOST)
cpack_ifw_add_repository("GPT4AllRepository" URL "http://localhost/repository") cpack_ifw_add_repository("GPT4AllRepository" URL "http://localhost/repository")
elseif (GPT4ALL_OFFLINE_INSTALLER) elseif(GPT4ALL_OFFLINE_INSTALLER)
add_compile_definitions(GPT4ALL_OFFLINE_INSTALLER) add_compile_definitions(GPT4ALL_OFFLINE_INSTALLER)
else() else()
if (CMAKE_SYSTEM_NAME MATCHES Linux) if(${CMAKE_SYSTEM_NAME} MATCHES Linux)
cpack_ifw_add_repository("GPT4AllRepository" URL "https://gpt4all.io/installer_repos/linux/repository") cpack_ifw_add_repository("GPT4AllRepository" URL "https://gpt4all.io/installer_repos/linux/repository")
elseif (CMAKE_SYSTEM_NAME MATCHES Windows) elseif(${CMAKE_SYSTEM_NAME} MATCHES Windows)
# To sign the target on windows have to create a batch script and use it as a custom target and then use CPACK_IFW_EXTRA_TARGETS to set this extra target #To sign the target on windows have to create a batch script and use it as a custom target and then use CPACK_IFW_EXTRA_TARGETS to set this extra target
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$") cpack_ifw_add_repository("GPT4AllRepository" URL "https://gpt4all.io/installer_repos/windows/repository")
cpack_ifw_add_repository("GPT4AllRepository" URL "https://gpt4all.io/installer_repos/windows/repository") elseif(${CMAKE_SYSTEM_NAME} MATCHES Darwin)
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|AARCH64|arm64|ARM64)$") cpack_ifw_add_repository("GPT4AllRepository" URL "https://gpt4all.io/installer_repos/mac/repository")
cpack_ifw_add_repository("GPT4AllRepository" URL "https://gpt4all.io/installer_repos/windows_arm/repository") endif()
endif()
elseif (CMAKE_SYSTEM_NAME MATCHES Darwin)
cpack_ifw_add_repository("GPT4AllRepository" URL "https://gpt4all.io/installer_repos/mac/repository")
endif()
endif() endif()
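The packaging options above (`GPT4ALL_OFFLINE_INSTALLER`, `GPT4ALL_SIGN_INSTALL`, the IFW CPack generator) are driven from the configure and package steps on the command line. A rough, hedged sketch of one possible invocation follows; the build directory name and the chosen options are illustrative, not prescribed by the project:

```
# Configure the chat app with the offline-installer option enabled
cmake -S gpt4all-chat -B build -G Ninja -DCMAKE_BUILD_TYPE=Release -DGPT4ALL_OFFLINE_INSTALLER=ON

# Build the app and the llmodel backend pulled in via add_subdirectory
cmake --build build

# Produce the Qt Installer Framework package from the CPack config generated above
cpack --config build/CPackConfig.cmake
```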

45
gpt4all-chat/README.md Normal file
View File

@ -0,0 +1,45 @@
# gpt4all-chat
Cross-platform Qt-based GUI for GPT4All versions with GPT-J as the base
model. NOTE: The model seen in the screenshot is actually a preview of a
new training run for GPT4All based on GPT-J. The GPT4All project is busy
at work getting ready to release this model, including installers for all
three major OSes. In the meantime, you can try this UI out with the original
GPT-J model by following the build instructions below.
![image](https://user-images.githubusercontent.com/50458173/231464085-da9edff6-a593-410e-8f38-7513f75c8aab.png)
## Install
One click installers for macOS, Linux, and Windows at https://gpt4all.io
## Features
* Cross-platform (Linux, Windows, macOS)
* A UI designed to look and feel like the chat interfaces you've come to expect
* Check for updates so you can always stay current with the latest models
* Easy to install with precompiled binaries available for all three major desktop platforms
* Multi-model - ability to load more than one model and switch between them
* Multi-chat - a list of current and past chats and the ability to save/delete/export and switch between them
* Supports models that are supported by llama.cpp
* Model downloader in GUI featuring many popular open source models
* Settings dialog to change temp, top_p, min_p, top_k, threads, etc.
* Copy your conversation to clipboard
* RAG via LocalDocs feature
* Check for updates to get the very latest GUI
## Building and running
* Follow the visual instructions on the [build_and_run](build_and_run.md) page
## Getting the latest
If you've already checked out the source code and/or built the program, make sure to do a `git fetch` to get the latest changes, and also run `git submodule update --init --recursive` to update the submodules. (If you ever run into trouble, deinitializing via `git submodule deinit -f .` and then initializing again via `git submodule update --init --recursive` fixes most issues.)
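Putting that together, an update of an existing checkout might look roughly like the following sketch (the `git pull` step is implied rather than spelled out above):

```
# Get the latest changes from the remote and update your branch
git fetch
git pull

# Bring the submodules in line with the new checkout
git submodule update --init --recursive

# If the submodules ever get into a bad state, re-initialize them
git submodule deinit -f .
git submodule update --init --recursive
```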
## Contributing
* Pull requests welcome. See the feature wish list for ideas :)
## License
The source code of this chat interface is currently under a MIT license.

View File

@ -12,21 +12,21 @@ On Windows and Linux, building GPT4All with full GPU support requires the [Vulka
## Note for Linux users ## Note for Linux users
Linux users may install Qt via their distro's official packages instead of using the Qt installer. You need at least Qt 6.5, with support for QPdf and the Qt HTTP Server. You may build from the CLI using CMake and Ninja, or with Qt Creator as described later in this document. Linux users may install Qt via their distro's official packages instead of using the Qt installer. You need at least Qt 6.5, with support for QPdf and the Qt HTTP Server. It should be straightforward to build with just cmake and make, but you may continue to follow these instructions to build with Qt Creator.
On Arch Linux, this looks like: On Arch Linux, this looks like:
``` ```
sudo pacman -S --needed cmake gcc ninja qt6-5compat qt6-base qt6-declarative qt6-httpserver qt6-svg qtcreator sudo pacman -S --needed base-devel qt6-base qt6-declarative qt6-wayland qt6-svg qt6-httpserver qt6-webengine qt6-5compat qt6-shadertools qtcreator cmake ninja
``` ```
On Ubuntu 23.04, this looks like: On Ubuntu 23.04, this looks like:
``` ```
sudo apt install cmake g++ libgl-dev libqt6core5compat6 ninja-build qml6-module-qt5compat-graphicaleffects qt6-base-private-dev qt6-declarative-dev qt6-httpserver-dev qt6-svg-dev qtcreator sudo apt install build-essential qt6-base-dev qt6-declarative-dev qt6-wayland-dev qt6-svg-dev qt6-httpserver-dev qt6-webengine-dev libqt6core5compat6 qml6-module-qt5compat-graphicaleffects libqt6shadertools6 qtcreator cmake ninja-build
``` ```
On Fedora 39, this looks like: On Fedora 39, this looks like:
``` ```
sudo dnf install cmake gcc-c++ ninja-build qt-creator qt5-qtgraphicaleffects qt6-qt5compat qt6-qtbase-private-devel qt6-qtdeclarative-devel qt6-qthttpserver-devel qt6-qtsvg-devel sudo dnf install make gcc gcc-c++ qt6-qtbase-devel qt6-qtdeclarative-devel qt6-qtwayland-devel qt6-qtsvg-devel qt6-qthttpserver-devel qt6-qtwebengine-devel qt6-qt5compat qt5-qtgraphicaleffects qt6-qtshadertools qt-creator cmake ninja-build
``` ```
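For the CMake-and-Ninja route mentioned above, a rough sketch follows. It assumes you run it from the repository root, that the `build` directory name is arbitrary, and that Qt is found automatically (add `-DCMAKE_PREFIX_PATH=/path/to/Qt/6.x/gcc_64` if it is not, e.g. when using the Qt online installer rather than distro packages):

```
# Configure the chat UI
cmake -S gpt4all-chat -B build -G Ninja -DCMAKE_BUILD_TYPE=Release

# Build the app together with the llmodel backend
cmake --build build

# The executable is placed under build/bin (built from the 'chat' target)
./build/bin/chat
```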
## Download Qt ## Download Qt
@ -49,7 +49,10 @@ Under this release (e.g. Qt 6.5.0), select the target platform:
- On Windows, it is called "MSVC 2019 64-bit" (for 64-bit x86 CPUs). MinGW has not been tested. - On Windows, it is called "MSVC 2019 64-bit" (for 64-bit x86 CPUs). MinGW has not been tested.
Under this release, select the following additional components: Under this release, select the following additional components:
- Qt Quick 3D
- Qt Wayland Compositor (for Linux only)
- Qt 5 Compatibility Module - Qt 5 Compatibility Module
- Qt Shader Tools
- Additional Libraries: - Additional Libraries:
- Qt HTTP Server - Qt HTTP Server
- Qt PDF - Qt PDF

View File

@ -1,32 +1,24 @@
#include "chat.h" #include "chat.h"
#include "chatlistmodel.h" #include "chatlistmodel.h"
#include "mysettings.h"
#include "network.h" #include "network.h"
#include "server.h" #include "server.h"
#include "tool.h"
#include "toolcallparser.h"
#include "toolmodel.h"
#include <QByteArray>
#include <QDataStream> #include <QDataStream>
#include <QDateTime>
#include <QDebug> #include <QDebug>
#include <QFile>
#include <QFileInfo>
#include <QIODevice>
#include <QLatin1String> #include <QLatin1String>
#include <QMap> #include <QMap>
#include <QRegularExpression>
#include <QString> #include <QString>
#include <QStringList>
#include <QTextStream>
#include <Qt> #include <Qt>
#include <QtAssert> #include <QtGlobal>
#include <QtLogging> #include <QtLogging>
#include <optional>
#include <utility> #include <utility>
using namespace ToolEnums;
Chat::Chat(QObject *parent) Chat::Chat(QObject *parent)
: QObject(parent) : QObject(parent)
, m_id(Network::globalInstance()->generateUniqueId()) , m_id(Network::globalInstance()->generateUniqueId())
@ -40,7 +32,7 @@ Chat::Chat(QObject *parent)
connectLLM(); connectLLM();
} }
Chat::Chat(server_tag_t, QObject *parent) Chat::Chat(bool isServer, QObject *parent)
: QObject(parent) : QObject(parent)
, m_id(Network::globalInstance()->generateUniqueId()) , m_id(Network::globalInstance()->generateUniqueId())
, m_name(tr("Server Chat")) , m_name(tr("Server Chat"))
@ -70,23 +62,26 @@ void Chat::connectLLM()
connect(m_llmodel, &ChatLLM::responseStopped, this, &Chat::responseStopped, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::responseStopped, this, &Chat::responseStopped, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::modelLoadingError, this, &Chat::handleModelLoadingError, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::modelLoadingError, this, &Chat::handleModelLoadingError, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::modelLoadingWarning, this, &Chat::modelLoadingWarning, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::modelLoadingWarning, this, &Chat::modelLoadingWarning, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::recalcChanged, this, &Chat::handleRecalculating, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::generatedNameChanged, this, &Chat::generatedNameChanged, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::generatedNameChanged, this, &Chat::generatedNameChanged, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::generatedQuestionFinished, this, &Chat::generatedQuestionFinished, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::generatedQuestionFinished, this, &Chat::generatedQuestionFinished, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::reportSpeed, this, &Chat::handleTokenSpeedChanged, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::reportSpeed, this, &Chat::handleTokenSpeedChanged, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::loadedModelInfoChanged, this, &Chat::loadedModelInfoChanged, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::loadedModelInfoChanged, this, &Chat::loadedModelInfoChanged, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::databaseResultsChanged, this, &Chat::handleDatabaseResultsChanged, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::databaseResultsChanged, this, &Chat::handleDatabaseResultsChanged, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::modelInfoChanged, this, &Chat::handleModelChanged, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::modelInfoChanged, this, &Chat::handleModelInfoChanged, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::trySwitchContextOfLoadedModelCompleted, this, &Chat::handleTrySwitchContextOfLoadedModelCompleted, Qt::QueuedConnection); connect(m_llmodel, &ChatLLM::trySwitchContextOfLoadedModelCompleted, this, &Chat::handleTrySwitchContextOfLoadedModelCompleted, Qt::QueuedConnection);
connect(this, &Chat::promptRequested, m_llmodel, &ChatLLM::prompt, Qt::QueuedConnection); connect(this, &Chat::promptRequested, m_llmodel, &ChatLLM::prompt, Qt::QueuedConnection);
connect(this, &Chat::modelChangeRequested, m_llmodel, &ChatLLM::modelChangeRequested, Qt::QueuedConnection); connect(this, &Chat::modelChangeRequested, m_llmodel, &ChatLLM::modelChangeRequested, Qt::QueuedConnection);
connect(this, &Chat::loadDefaultModelRequested, m_llmodel, &ChatLLM::loadDefaultModel, Qt::QueuedConnection); connect(this, &Chat::loadDefaultModelRequested, m_llmodel, &ChatLLM::loadDefaultModel, Qt::QueuedConnection);
connect(this, &Chat::loadModelRequested, m_llmodel, &ChatLLM::loadModel, Qt::QueuedConnection);
connect(this, &Chat::generateNameRequested, m_llmodel, &ChatLLM::generateName, Qt::QueuedConnection); connect(this, &Chat::generateNameRequested, m_llmodel, &ChatLLM::generateName, Qt::QueuedConnection);
connect(this, &Chat::regenerateResponseRequested, m_llmodel, &ChatLLM::regenerateResponse, Qt::QueuedConnection); connect(this, &Chat::regenerateResponseRequested, m_llmodel, &ChatLLM::regenerateResponse, Qt::QueuedConnection);
connect(this, &Chat::resetResponseRequested, m_llmodel, &ChatLLM::resetResponse, Qt::QueuedConnection);
connect(this, &Chat::resetContextRequested, m_llmodel, &ChatLLM::resetContext, Qt::QueuedConnection);
connect(this, &Chat::processSystemPromptRequested, m_llmodel, &ChatLLM::processSystemPrompt, Qt::QueuedConnection);
connect(this, &Chat::collectionListChanged, m_collectionModel, &LocalDocsCollectionsModel::setCollections); connect(this, &Chat::collectionListChanged, m_collectionModel, &LocalDocsCollectionsModel::setCollections);
connect(ModelList::globalInstance(), &ModelList::modelInfoChanged, this, &Chat::handleModelInfoChanged);
} }
void Chat::reset() void Chat::reset()
@ -94,15 +89,25 @@ void Chat::reset()
stopGenerating(); stopGenerating();
// Erase our current on disk representation as we're completely resetting the chat along with id // Erase our current on disk representation as we're completely resetting the chat along with id
ChatListModel::globalInstance()->removeChatFile(this); ChatListModel::globalInstance()->removeChatFile(this);
emit resetContextRequested();
m_id = Network::globalInstance()->generateUniqueId(); m_id = Network::globalInstance()->generateUniqueId();
emit idChanged(m_id); emit idChanged(m_id);
// NOTE: We deliberately do no reset the name or creation date to indicate that this was originally // NOTE: We deliberately do no reset the name or creation date to indicate that this was originally
// an older chat that was reset for another purpose. Resetting this data will lead to the chat // an older chat that was reset for another purpose. Resetting this data will lead to the chat
// name label changing back to 'New Chat' and showing up in the chat model list as a 'New Chat' // name label changing back to 'New Chat' and showing up in the chat model list as a 'New Chat'
// further down in the list. This might surprise the user. In the future, we might get rid of // further down in the list. This might surprise the user. In the future, we might get rid of
// the "reset context" button in the UI. // the "reset context" button in the UI. Right now, by changing the model in the combobox dropdown
// we effectively do a reset context. We *have* to do this right now when switching between different
// types of models. The only way to get rid of that would be a very long recalculate where we rebuild
// the context if we switch between different types of models. Probably the right way to fix this
// is to allow switching models but throwing up a dialog warning users if we switch between types
// of models that a long recalculation will ensue.
m_chatModel->clear(); m_chatModel->clear();
m_needsSave = true; }
void Chat::processSystemPrompt()
{
emit processSystemPromptRequested();
} }
void Chat::resetResponseState() void Chat::resetResponseState()
@ -120,88 +125,45 @@ void Chat::resetResponseState()
emit responseStateChanged(); emit responseStateChanged();
} }
void Chat::newPromptResponsePair(const QString &prompt, const QList<QUrl> &attachedUrls) void Chat::prompt(const QString &prompt)
{
QStringList attachedContexts;
QList<PromptAttachment> attachments;
for (const QUrl &url : attachedUrls) {
Q_ASSERT(url.isLocalFile());
const QString localFilePath = url.toLocalFile();
const QFileInfo info(localFilePath);
Q_ASSERT(
info.suffix().toLower() == "xlsx" ||
info.suffix().toLower() == "txt" ||
info.suffix().toLower() == "md" ||
info.suffix().toLower() == "rst"
);
PromptAttachment attached;
attached.url = url;
QFile file(localFilePath);
if (file.open(QIODevice::ReadOnly)) {
attached.content = file.readAll();
file.close();
} else {
qWarning() << "ERROR: Failed to open the attachment:" << localFilePath;
continue;
}
attachments << attached;
attachedContexts << attached.processedContent();
}
QString promptPlusAttached = prompt;
if (!attachedContexts.isEmpty())
promptPlusAttached = attachedContexts.join("\n\n") + "\n\n" + prompt;
resetResponseState();
if (int count = m_chatModel->count())
m_chatModel->updateCurrentResponse(count - 1, false);
m_chatModel->appendPrompt(prompt, attachments);
m_chatModel->appendResponse();
emit promptRequested(m_collections);
m_needsSave = true;
}
void Chat::regenerateResponse(int index)
{ {
resetResponseState(); resetResponseState();
emit regenerateResponseRequested(index); emit promptRequested(m_collections, prompt);
m_needsSave = true;
} }
QVariant Chat::popPrompt(int index) void Chat::regenerateResponse()
{ {
auto content = m_llmodel->popPrompt(index); const int index = m_chatModel->count() - 1;
m_needsSave = true; m_chatModel->updateSources(index, QList<ResultInfo>());
if (content) return *content; emit regenerateResponseRequested();
return QVariant::fromValue(nullptr);
} }
void Chat::stopGenerating() void Chat::stopGenerating()
{ {
// In future if we have more than one tool we'll have to keep track of which tools are possibly
// running, but for now we only have one
Tool *toolInstance = ToolModel::globalInstance()->get(ToolCallConstants::CodeInterpreterFunction);
Q_ASSERT(toolInstance);
toolInstance->interrupt();
m_llmodel->stopGenerating(); m_llmodel->stopGenerating();
} }
QString Chat::response() const
{
return m_response;
}
Chat::ResponseState Chat::responseState() const Chat::ResponseState Chat::responseState() const
{ {
return m_responseState; return m_responseState;
} }
void Chat::handleResponseChanged() void Chat::handleResponseChanged(const QString &response)
{ {
if (m_responseState != Chat::ResponseGeneration) { if (m_responseState != Chat::ResponseGeneration) {
m_responseState = Chat::ResponseGeneration; m_responseState = Chat::ResponseGeneration;
emit responseStateChanged(); emit responseStateChanged();
} }
m_response = response;
const int index = m_chatModel->count() - 1;
m_chatModel->updateValue(index, this->response());
emit responseChanged();
} }
void Chat::handleModelLoadingPercentageChanged(float loadingPercentage) void Chat::handleModelLoadingPercentageChanged(float loadingPercentage)
@ -228,7 +190,7 @@ void Chat::handleModelLoadingPercentageChanged(float loadingPercentage)
void Chat::promptProcessing() void Chat::promptProcessing()
{ {
m_responseState = !databaseResults().isEmpty() ? Chat::LocalDocsProcessing : Chat::PromptProcessing; m_responseState = !databaseResults().isEmpty() ? Chat::LocalDocsProcessing : Chat::PromptProcessing;
emit responseStateChanged(); emit responseStateChanged();
} }
void Chat::generatingQuestions() void Chat::generatingQuestions()
@ -241,79 +203,20 @@ void Chat::responseStopped(qint64 promptResponseMs)
{ {
m_tokenSpeed = QString(); m_tokenSpeed = QString();
emit tokenSpeedChanged(); emit tokenSpeedChanged();
emit responseChanged();
m_responseInProgress = false; m_responseInProgress = false;
m_responseState = Chat::ResponseStopped; m_responseState = Chat::ResponseStopped;
emit responseInProgressChanged(); emit responseInProgressChanged();
emit responseStateChanged(); emit responseStateChanged();
if (m_generatedName.isEmpty())
emit generateNameRequested();
const QString possibleToolcall = m_chatModel->possibleToolcall(); Network::globalInstance()->trackChatEvent("response_complete", {
Network::globalInstance()->trackChatEvent("response_stopped", {
{"first", m_firstResponse}, {"first", m_firstResponse},
{"message_count", chatModel()->count()}, {"message_count", chatModel()->count()},
{"$duration", promptResponseMs / 1000.}, {"$duration", promptResponseMs / 1000.},
}); });
ToolCallParser parser;
parser.update(possibleToolcall.toUtf8());
if (parser.state() == ToolEnums::ParseState::Complete && parser.startTag() != ToolCallConstants::ThinkStartTag)
processToolCall(parser.toolCall());
else
responseComplete();
}
void Chat::processToolCall(const QString &toolCall)
{
m_responseState = Chat::ToolCallGeneration;
emit responseStateChanged();
// Regex to remove the formatting around the code
static const QRegularExpression regex("^\\s*```javascript\\s*|\\s*```\\s*$");
QString code = toolCall;
code.remove(regex);
code = code.trimmed();
// Right now the code interpreter is the only available tool
Tool *toolInstance = ToolModel::globalInstance()->get(ToolCallConstants::CodeInterpreterFunction);
Q_ASSERT(toolInstance);
connect(toolInstance, &Tool::runComplete, this, &Chat::toolCallComplete, Qt::SingleShotConnection);
// The param is the code
const ToolParam param = { "code", ToolEnums::ParamType::String, code };
m_responseInProgress = true;
emit responseInProgressChanged();
toolInstance->run({param});
}
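The static regex above only strips an optional ```javascript fence that the model tends to wrap around its generated code before the text is handed to the code interpreter. A minimal standalone sketch of that cleanup step, with an invented tool-call string:

// Sketch: stripping the Markdown fence from a model-generated tool call.
// The input string is invented; the regex mirrors the one used above.
#include <QDebug>
#include <QRegularExpression>
#include <QString>

int main()
{
    const QString toolCall = QStringLiteral("```javascript\nconsole.log(2 + 2);\n```");
    static const QRegularExpression fence("^\\s*```javascript\\s*|\\s*```\\s*$");
    QString code = toolCall;
    code.remove(fence);      // removes the opening and closing fences
    code = code.trimmed();   // -> "console.log(2 + 2);"
    qDebug().noquote() << code;
    return 0;
}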
void Chat::toolCallComplete(const ToolCallInfo &info)
{
// Update the current response with meta information about toolcall and re-parent
m_chatModel->updateToolCall(info);
++m_consecutiveToolCalls;
m_responseInProgress = false;
emit responseInProgressChanged();
// We limit the number of consecutive toolcalls otherwise we get into a potentially endless loop
if (m_consecutiveToolCalls < 3 || info.error == ToolEnums::Error::NoError) {
resetResponseState();
emit promptRequested(m_collections); // triggers a new response
return;
}
responseComplete();
}
void Chat::responseComplete()
{
if (m_generatedName.isEmpty())
emit generateNameRequested();
m_responseState = Chat::ResponseStopped;
emit responseStateChanged();
m_consecutiveToolCalls = 0;
m_firstResponse = false; m_firstResponse = false;
} }
@ -324,16 +227,36 @@ ModelInfo Chat::modelInfo() const
void Chat::setModelInfo(const ModelInfo &modelInfo) void Chat::setModelInfo(const ModelInfo &modelInfo)
{ {
if (m_modelInfo != modelInfo) { if (m_modelInfo == modelInfo && isModelLoaded())
m_modelInfo = modelInfo;
m_needsSave = true;
} else if (isModelLoaded())
return; return;
m_modelInfo = modelInfo;
emit modelInfoChanged(); emit modelInfoChanged();
emit modelChangeRequested(modelInfo); emit modelChangeRequested(modelInfo);
} }
void Chat::newPromptResponsePair(const QString &prompt)
{
resetResponseState();
m_chatModel->updateCurrentResponse(m_chatModel->count() - 1, false);
m_chatModel->appendPrompt("Prompt: ", prompt);
m_chatModel->appendResponse("Response: ", prompt);
emit resetResponseRequested();
}
void Chat::serverNewPromptResponsePair(const QString &prompt)
{
resetResponseState();
m_chatModel->updateCurrentResponse(m_chatModel->count() - 1, false);
m_chatModel->appendPrompt("Prompt: ", prompt);
m_chatModel->appendResponse("Response: ", prompt);
}
bool Chat::isRecalc() const
{
return m_llmodel->isRecalc();
}
void Chat::unloadAndDeleteLater() void Chat::unloadAndDeleteLater()
{ {
if (!isModelLoaded()) { if (!isModelLoaded()) {
@ -383,17 +306,24 @@ void Chat::trySwitchContextOfLoadedModel()
void Chat::generatedNameChanged(const QString &name) void Chat::generatedNameChanged(const QString &name)
{ {
m_generatedName = name; // Only use the first three words maximum and remove newlines and extra spaces
m_name = name; m_generatedName = name.simplified();
QStringList words = m_generatedName.split(' ', Qt::SkipEmptyParts);
int wordCount = qMin(7, words.size());
m_name = words.mid(0, wordCount).join(' ');
emit nameChanged(); emit nameChanged();
m_needsSave = true;
} }
void Chat::generatedQuestionFinished(const QString &question) void Chat::generatedQuestionFinished(const QString &question)
{ {
m_generatedQuestions << question; m_generatedQuestions << question;
emit generatedQuestionsChanged(); emit generatedQuestionsChanged();
m_needsSave = true; }
void Chat::handleRecalculating()
{
Network::globalInstance()->trackChatEvent("recalc_context", { {"length", m_chatModel->count()} });
emit recalcChanged();
} }
void Chat::handleModelLoadingError(const QString &error) void Chat::handleModelLoadingError(const QString &error)
@ -430,26 +360,17 @@ QString Chat::fallbackReason() const
void Chat::handleDatabaseResultsChanged(const QList<ResultInfo> &results) void Chat::handleDatabaseResultsChanged(const QList<ResultInfo> &results)
{ {
m_databaseResults = results; m_databaseResults = results;
m_needsSave = true; const int index = m_chatModel->count() - 1;
m_chatModel->updateSources(index, m_databaseResults);
} }
// we need to notify listeners of the modelInfo property when its properties are updated,
// since it's a gadget and can't do that on its own
void Chat::handleModelInfoChanged(const ModelInfo &modelInfo) void Chat::handleModelInfoChanged(const ModelInfo &modelInfo)
{
if (!m_modelInfo.id().isNull() && modelInfo.id() == m_modelInfo.id())
emit modelInfoChanged();
}
// react if a new model is loaded
void Chat::handleModelChanged(const ModelInfo &modelInfo)
{ {
if (m_modelInfo == modelInfo) if (m_modelInfo == modelInfo)
return; return;
m_modelInfo = modelInfo; m_modelInfo = modelInfo;
emit modelInfoChanged(); emit modelInfoChanged();
m_needsSave = true;
} }
void Chat::handleTrySwitchContextOfLoadedModelCompleted(int value) void Chat::handleTrySwitchContextOfLoadedModelCompleted(int value)
@ -464,14 +385,17 @@ bool Chat::serialize(QDataStream &stream, int version) const
stream << m_id; stream << m_id;
stream << m_name; stream << m_name;
stream << m_userName; stream << m_userName;
if (version >= 5) if (version > 4)
stream << m_modelInfo.id(); stream << m_modelInfo.id();
else else
stream << m_modelInfo.filename(); stream << m_modelInfo.filename();
if (version >= 3) if (version > 2)
stream << m_collections; stream << m_collections;
if (!m_llmodel->serialize(stream, version)) const bool serializeKV = MySettings::globalInstance()->saveChatsContext();
if (version > 5)
stream << serializeKV;
if (!m_llmodel->serialize(stream, version, serializeKV))
return false; return false;
if (!m_chatModel->serialize(stream, version)) if (!m_chatModel->serialize(stream, version))
return false; return false;
@ -490,7 +414,7 @@ bool Chat::deserialize(QDataStream &stream, int version)
QString modelId; QString modelId;
stream >> modelId; stream >> modelId;
if (version >= 5) { if (version > 4) {
if (ModelList::globalInstance()->contains(modelId)) if (ModelList::globalInstance()->contains(modelId))
m_modelInfo = ModelList::globalInstance()->modelInfo(modelId); m_modelInfo = ModelList::globalInstance()->modelInfo(modelId);
} else { } else {
@ -500,23 +424,27 @@ bool Chat::deserialize(QDataStream &stream, int version)
if (!m_modelInfo.id().isEmpty()) if (!m_modelInfo.id().isEmpty())
emit modelInfoChanged(); emit modelInfoChanged();
if (version >= 3) { bool discardKV = m_modelInfo.id().isEmpty();
if (version > 2) {
stream >> m_collections; stream >> m_collections;
emit collectionListChanged(m_collections); emit collectionListChanged(m_collections);
} }
bool deserializeKV = true;
if (version > 5)
stream >> deserializeKV;
m_llmodel->setModelInfo(m_modelInfo); m_llmodel->setModelInfo(m_modelInfo);
if (!m_llmodel->deserialize(stream, version)) if (!m_llmodel->deserialize(stream, version, deserializeKV, discardKV))
return false; return false;
if (!m_chatModel->deserialize(stream, version)) if (!m_chatModel->deserialize(stream, version))
return false; return false;
emit chatModelChanged(); m_llmodel->setStateFromText(m_chatModel->text());
if (stream.status() != QDataStream::Ok)
return false;
m_needsSave = false; emit chatModelChanged();
return true; return stream.status() == QDataStream::Ok;
} }
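Read together, Chat::serialize() and Chat::deserialize() fix the field order of one chat record inside a .chat file. The following is a rough sketch, not the application's actual loader, of reading the leading fields with the same version checks as above; the trailing ChatLLM and ChatModel payloads are elided:

#include <QDataStream>
#include <QString>
#include <QStringList>

// Sketch only: mirrors the field order written by Chat::serialize().
static bool readChatRecordHeader(QDataStream &stream, int version)
{
    QString id, name, userName, modelId;
    QStringList collections;
    stream >> id >> name >> userName;
    stream >> modelId;             // ModelInfo::id() for version >= 5, filename() before that
    if (version >= 3)
        stream >> collections;     // enabled LocalDocs collections
    // ...followed by the serialized ChatLLM state and the ChatModel messages...
    return stream.status() == QDataStream::Ok;
}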
QList<QString> Chat::collectionList() const QList<QString> Chat::collectionList() const
@ -536,7 +464,6 @@ void Chat::addCollection(const QString &collection)
m_collections.append(collection); m_collections.append(collection);
emit collectionListChanged(m_collections); emit collectionListChanged(m_collections);
m_needsSave = true;
} }
void Chat::removeCollection(const QString &collection) void Chat::removeCollection(const QString &collection)
@ -546,5 +473,4 @@ void Chat::removeCollection(const QString &collection)
m_collections.removeAll(collection); m_collections.removeAll(collection);
emit collectionListChanged(m_collections); emit collectionListChanged(m_collections);
m_needsSave = true;
} }


@ -3,26 +3,18 @@
#include "chatllm.h" #include "chatllm.h"
#include "chatmodel.h" #include "chatmodel.h"
#include "database.h" #include "database.h" // IWYU pragma: keep
#include "localdocsmodel.h" #include "localdocsmodel.h" // IWYU pragma: keep
#include "modellist.h" #include "modellist.h"
#include "tool.h"
#include <QDateTime>
#include <QList> #include <QList>
#include <QObject> #include <QObject>
#include <QQmlEngine> // IWYU pragma: keep #include <QQmlEngine>
#include <QString> #include <QString>
#include <QStringList> // IWYU pragma: keep #include <QtGlobal>
#include <QUrl>
#include <QVariant>
#include <QtTypes>
// IWYU pragma: no_forward_declare LocalDocsCollectionsModel
// IWYU pragma: no_forward_declare ToolCallInfo
class QDataStream; class QDataStream;
class Chat : public QObject class Chat : public QObject
{ {
Q_OBJECT Q_OBJECT
@ -32,13 +24,15 @@ class Chat : public QObject
Q_PROPERTY(bool isModelLoaded READ isModelLoaded NOTIFY isModelLoadedChanged) Q_PROPERTY(bool isModelLoaded READ isModelLoaded NOTIFY isModelLoadedChanged)
Q_PROPERTY(bool isCurrentlyLoading READ isCurrentlyLoading NOTIFY isCurrentlyLoadingChanged) Q_PROPERTY(bool isCurrentlyLoading READ isCurrentlyLoading NOTIFY isCurrentlyLoadingChanged)
Q_PROPERTY(float modelLoadingPercentage READ modelLoadingPercentage NOTIFY modelLoadingPercentageChanged) Q_PROPERTY(float modelLoadingPercentage READ modelLoadingPercentage NOTIFY modelLoadingPercentageChanged)
Q_PROPERTY(QString response READ response NOTIFY responseChanged)
Q_PROPERTY(ModelInfo modelInfo READ modelInfo WRITE setModelInfo NOTIFY modelInfoChanged) Q_PROPERTY(ModelInfo modelInfo READ modelInfo WRITE setModelInfo NOTIFY modelInfoChanged)
Q_PROPERTY(bool responseInProgress READ responseInProgress NOTIFY responseInProgressChanged) Q_PROPERTY(bool responseInProgress READ responseInProgress NOTIFY responseInProgressChanged)
Q_PROPERTY(bool isRecalc READ isRecalc NOTIFY recalcChanged)
Q_PROPERTY(bool isServer READ isServer NOTIFY isServerChanged) Q_PROPERTY(bool isServer READ isServer NOTIFY isServerChanged)
Q_PROPERTY(ResponseState responseState READ responseState NOTIFY responseStateChanged) Q_PROPERTY(ResponseState responseState READ responseState NOTIFY responseStateChanged)
Q_PROPERTY(QList<QString> collectionList READ collectionList NOTIFY collectionListChanged) Q_PROPERTY(QList<QString> collectionList READ collectionList NOTIFY collectionListChanged)
Q_PROPERTY(QString modelLoadingError READ modelLoadingError NOTIFY modelLoadingErrorChanged) Q_PROPERTY(QString modelLoadingError READ modelLoadingError NOTIFY modelLoadingErrorChanged)
Q_PROPERTY(QString tokenSpeed READ tokenSpeed NOTIFY tokenSpeedChanged) Q_PROPERTY(QString tokenSpeed READ tokenSpeed NOTIFY tokenSpeedChanged);
Q_PROPERTY(QString deviceBackend READ deviceBackend NOTIFY loadedModelInfoChanged) Q_PROPERTY(QString deviceBackend READ deviceBackend NOTIFY loadedModelInfoChanged)
Q_PROPERTY(QString device READ device NOTIFY loadedModelInfoChanged) Q_PROPERTY(QString device READ device NOTIFY loadedModelInfoChanged)
Q_PROPERTY(QString fallbackReason READ fallbackReason NOTIFY loadedModelInfoChanged) Q_PROPERTY(QString fallbackReason READ fallbackReason NOTIFY loadedModelInfoChanged)
@ -50,23 +44,18 @@ class Chat : public QObject
QML_UNCREATABLE("Only creatable from c++!") QML_UNCREATABLE("Only creatable from c++!")
public: public:
// tag for constructing a server chat
struct server_tag_t { explicit server_tag_t() = default; };
static inline constexpr server_tag_t server_tag = server_tag_t();
enum ResponseState { enum ResponseState {
ResponseStopped, ResponseStopped,
LocalDocsRetrieval, LocalDocsRetrieval,
LocalDocsProcessing, LocalDocsProcessing,
PromptProcessing, PromptProcessing,
GeneratingQuestions, GeneratingQuestions,
ResponseGeneration, ResponseGeneration
ToolCallGeneration
}; };
Q_ENUM(ResponseState) Q_ENUM(ResponseState)
explicit Chat(QObject *parent = nullptr); explicit Chat(QObject *parent = nullptr);
explicit Chat(server_tag_t, QObject *parent = nullptr); explicit Chat(bool isServer, QObject *parent = nullptr);
virtual ~Chat(); virtual ~Chat();
void destroy() { m_llmodel->destroy(); } void destroy() { m_llmodel->destroy(); }
void connectLLM(); void connectLLM();
@ -77,27 +66,29 @@ public:
{ {
m_userName = name; m_userName = name;
emit nameChanged(); emit nameChanged();
m_needsSave = true;
} }
ChatModel *chatModel() { return m_chatModel; } ChatModel *chatModel() { return m_chatModel; }
bool isNewChat() const { return m_name == tr("New Chat") && !m_chatModel->count(); } bool isNewChat() const { return m_name == tr("New Chat") && !m_chatModel->count(); }
Q_INVOKABLE void reset(); Q_INVOKABLE void reset();
Q_INVOKABLE void processSystemPrompt();
bool isModelLoaded() const { return m_modelLoadingPercentage == 1.0f; } bool isModelLoaded() const { return m_modelLoadingPercentage == 1.0f; }
bool isCurrentlyLoading() const { return m_modelLoadingPercentage > 0.0f && m_modelLoadingPercentage < 1.0f; } bool isCurrentlyLoading() const { return m_modelLoadingPercentage > 0.0f && m_modelLoadingPercentage < 1.0f; }
float modelLoadingPercentage() const { return m_modelLoadingPercentage; } float modelLoadingPercentage() const { return m_modelLoadingPercentage; }
Q_INVOKABLE void newPromptResponsePair(const QString &prompt, const QList<QUrl> &attachedUrls = {}); Q_INVOKABLE void prompt(const QString &prompt);
Q_INVOKABLE void regenerateResponse(int index); Q_INVOKABLE void regenerateResponse();
Q_INVOKABLE QVariant popPrompt(int index);
Q_INVOKABLE void stopGenerating(); Q_INVOKABLE void stopGenerating();
Q_INVOKABLE void newPromptResponsePair(const QString &prompt);
QList<ResultInfo> databaseResults() const { return m_databaseResults; } QList<ResultInfo> databaseResults() const { return m_databaseResults; }
QString response() const;
bool responseInProgress() const { return m_responseInProgress; } bool responseInProgress() const { return m_responseInProgress; }
ResponseState responseState() const; ResponseState responseState() const;
ModelInfo modelInfo() const; ModelInfo modelInfo() const;
void setModelInfo(const ModelInfo &modelInfo); void setModelInfo(const ModelInfo &modelInfo);
bool isRecalc() const;
Q_INVOKABLE void unloadModel(); Q_INVOKABLE void unloadModel();
Q_INVOKABLE void reloadModel(); Q_INVOKABLE void reloadModel();
@ -118,6 +109,7 @@ public:
Q_INVOKABLE bool hasCollection(const QString &collection) const; Q_INVOKABLE bool hasCollection(const QString &collection) const;
Q_INVOKABLE void addCollection(const QString &collection); Q_INVOKABLE void addCollection(const QString &collection);
Q_INVOKABLE void removeCollection(const QString &collection); Q_INVOKABLE void removeCollection(const QString &collection);
void resetResponseState();
QString modelLoadingError() const { return m_modelLoadingError; } QString modelLoadingError() const { return m_modelLoadingError; }
@ -131,11 +123,8 @@ public:
QList<QString> generatedQuestions() const { return m_generatedQuestions; } QList<QString> generatedQuestions() const { return m_generatedQuestions; }
bool needsSave() const { return m_needsSave; }
void setNeedsSave(bool n) { m_needsSave = n; }
public Q_SLOTS: public Q_SLOTS:
void resetResponseState(); void serverNewPromptResponsePair(const QString &prompt);
Q_SIGNALS: Q_SIGNALS:
void idChanged(const QString &id); void idChanged(const QString &id);
@ -145,15 +134,19 @@ Q_SIGNALS:
void isCurrentlyLoadingChanged(); void isCurrentlyLoadingChanged();
void modelLoadingPercentageChanged(); void modelLoadingPercentageChanged();
void modelLoadingWarning(const QString &warning); void modelLoadingWarning(const QString &warning);
void responseChanged();
void responseInProgressChanged(); void responseInProgressChanged();
void responseStateChanged(); void responseStateChanged();
void promptRequested(const QStringList &enabledCollections); void promptRequested(const QList<QString> &collectionList, const QString &prompt);
void regenerateResponseRequested(int index); void regenerateResponseRequested();
void resetResponseRequested(); void resetResponseRequested();
void resetContextRequested(); void resetContextRequested();
void processSystemPromptRequested();
void modelChangeRequested(const ModelInfo &modelInfo); void modelChangeRequested(const ModelInfo &modelInfo);
void modelInfoChanged(); void modelInfoChanged();
void recalcChanged();
void loadDefaultModelRequested(); void loadDefaultModelRequested();
void loadModelRequested(const ModelInfo &modelInfo);
void generateNameRequested(); void generateNameRequested();
void modelLoadingErrorChanged(); void modelLoadingErrorChanged();
void isServerChanged(); void isServerChanged();
@ -167,21 +160,18 @@ Q_SIGNALS:
void generatedQuestionsChanged(); void generatedQuestionsChanged();
private Q_SLOTS: private Q_SLOTS:
void handleResponseChanged(); void handleResponseChanged(const QString &response);
void handleModelLoadingPercentageChanged(float); void handleModelLoadingPercentageChanged(float);
void promptProcessing(); void promptProcessing();
void generatingQuestions(); void generatingQuestions();
void responseStopped(qint64 promptResponseMs); void responseStopped(qint64 promptResponseMs);
void processToolCall(const QString &toolCall);
void toolCallComplete(const ToolCallInfo &info);
void responseComplete();
void generatedNameChanged(const QString &name); void generatedNameChanged(const QString &name);
void generatedQuestionFinished(const QString &question); void generatedQuestionFinished(const QString &question);
void handleRecalculating();
void handleModelLoadingError(const QString &error); void handleModelLoadingError(const QString &error);
void handleTokenSpeedChanged(const QString &tokenSpeed); void handleTokenSpeedChanged(const QString &tokenSpeed);
void handleDatabaseResultsChanged(const QList<ResultInfo> &results); void handleDatabaseResultsChanged(const QList<ResultInfo> &results);
void handleModelInfoChanged(const ModelInfo &modelInfo); void handleModelInfoChanged(const ModelInfo &modelInfo);
void handleModelChanged(const ModelInfo &modelInfo);
void handleTrySwitchContextOfLoadedModelCompleted(int value); void handleTrySwitchContextOfLoadedModelCompleted(int value);
private: private:
@ -194,6 +184,7 @@ private:
QString m_tokenSpeed; QString m_tokenSpeed;
QString m_device; QString m_device;
QString m_fallbackReason; QString m_fallbackReason;
QString m_response;
QList<QString> m_collections; QList<QString> m_collections;
QList<QString> m_generatedQuestions; QList<QString> m_generatedQuestions;
ChatModel *m_chatModel; ChatModel *m_chatModel;
@ -209,11 +200,6 @@ private:
bool m_firstResponse = true; bool m_firstResponse = true;
int m_trySwitchContextInProgress = 0; int m_trySwitchContextInProgress = 0;
bool m_isCurrentlyLoading = false; bool m_isCurrentlyLoading = false;
// True if we need to serialize the chat to disk, because of one of two reasons:
// - The chat was freshly created during this launch.
// - The chat was changed after loading it from disk.
bool m_needsSave = true;
int m_consecutiveToolCalls = 0;
}; };
#endif // CHAT_H #endif // CHAT_H


@ -1,40 +1,29 @@
#include "chatapi.h" #include "chatapi.h"
#include "utils.h" #include "../gpt4all-backend/llmodel.h"
#include <fmt/format.h>
#include <QAnyStringView>
#include <QCoreApplication> #include <QCoreApplication>
#include <QDebug>
#include <QGuiApplication> #include <QGuiApplication>
#include <QDebug>
#include <QJsonArray> #include <QJsonArray>
#include <QJsonDocument> #include <QJsonDocument>
#include <QJsonObject> #include <QJsonObject>
#include <QJsonValue> #include <QJsonValue>
#include <QLatin1String>
#include <QNetworkAccessManager> #include <QNetworkAccessManager>
#include <QNetworkRequest> #include <QNetworkRequest>
#include <QStringView>
#include <QThread> #include <QThread>
#include <QUrl> #include <QUrl>
#include <QUtf8StringView> // IWYU pragma: keep
#include <QVariant> #include <QVariant>
#include <QXmlStreamReader>
#include <Qt> #include <Qt>
#include <QtAssert> #include <QtGlobal>
#include <QtLogging> #include <QtLogging>
#include <expected>
#include <functional>
#include <iostream> #include <iostream>
#include <utility>
using namespace Qt::Literals::StringLiterals; using namespace Qt::Literals::StringLiterals;
//#define DEBUG //#define DEBUG
ChatAPI::ChatAPI() ChatAPI::ChatAPI()
: QObject(nullptr) : QObject(nullptr)
, m_modelName("gpt-3.5-turbo") , m_modelName("gpt-3.5-turbo")
@ -62,6 +51,7 @@ bool ChatAPI::loadModel(const std::string &modelPath, int n_ctx, int ngl)
void ChatAPI::setThreadCount(int32_t n_threads) void ChatAPI::setThreadCount(int32_t n_threads)
{ {
Q_UNUSED(n_threads); Q_UNUSED(n_threads);
qt_noop();
} }
int32_t ChatAPI::threadCount() const int32_t ChatAPI::threadCount() const
@ -78,119 +68,89 @@ bool ChatAPI::isModelLoaded() const
return true; return true;
} }
static auto parsePrompt(QXmlStreamReader &xml) -> std::expected<QJsonArray, QString> // All three of the state virtual functions are handled custom inside of chatllm save/restore
size_t ChatAPI::stateSize() const
{ {
QJsonArray messages; return 0;
auto xmlError = [&xml] {
return std::unexpected(u"%1:%2: %3"_s.arg(xml.lineNumber()).arg(xml.columnNumber()).arg(xml.errorString()));
};
if (xml.hasError())
return xmlError();
if (xml.atEnd())
return messages;
// skip header
bool foundElement = false;
do {
switch (xml.readNext()) {
using enum QXmlStreamReader::TokenType;
case Invalid:
return xmlError();
case EndDocument:
return messages;
default:
foundElement = true;
case StartDocument:
case Comment:
case DTD:
case ProcessingInstruction:
;
}
} while (!foundElement);
// document body loop
bool foundRoot = false;
for (;;) {
switch (xml.tokenType()) {
using enum QXmlStreamReader::TokenType;
case StartElement:
{
auto name = xml.name();
if (!foundRoot) {
if (name != "chat"_L1)
return std::unexpected(u"unexpected tag: %1"_s.arg(name));
foundRoot = true;
} else {
if (name != "user"_L1 && name != "assistant"_L1 && name != "system"_L1)
return std::unexpected(u"unknown role: %1"_s.arg(name));
auto content = xml.readElementText();
if (xml.tokenType() != EndElement)
return xmlError();
messages << makeJsonObject({
{ "role"_L1, name.toString().trimmed() },
{ "content"_L1, content },
});
}
break;
}
case Characters:
if (!xml.isWhitespace())
return std::unexpected(u"unexpected text: %1"_s.arg(xml.text()));
case Comment:
case ProcessingInstruction:
case EndElement:
break;
case EndDocument:
return messages;
case Invalid:
return xmlError();
default:
return std::unexpected(u"unexpected token: %1"_s.arg(xml.tokenString()));
}
xml.readNext();
}
} }
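parsePrompt() expects the prompt handed to an API model to be a small XML document: a <chat> root whose children are <system>, <user>, and <assistant> elements, which it flattens into the OpenAI-style messages array. An invented illustration of that mapping:

// Illustration only: the kind of input parsePrompt() accepts, and the array it yields.
// The conversation text is made up.
static const char *examplePrompt = R"(
<chat>
  <system>You are a helpful assistant.</system>
  <user>What is the capital of France?</user>
  <assistant>Paris.</assistant>
  <user>And of Italy?</user>
</chat>)";

// Resulting QJsonArray, element by element:
//   { "role": "system",    "content": "You are a helpful assistant." }
//   { "role": "user",      "content": "What is the capital of France?" }
//   { "role": "assistant", "content": "Paris." }
//   { "role": "user",      "content": "And of Italy?" }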
void ChatAPI::prompt( size_t ChatAPI::saveState(uint8_t *dest) const
std::string_view prompt, {
const PromptCallback &promptCallback, Q_UNUSED(dest);
const ResponseCallback &responseCallback, return 0;
const PromptContext &promptCtx }
) {
Q_UNUSED(promptCallback)
if (!isModelLoaded()) size_t ChatAPI::restoreState(const uint8_t *src)
throw std::invalid_argument("Attempted to prompt an unloaded model."); {
if (!promptCtx.n_predict) Q_UNUSED(src);
return; // nothing requested return 0;
}
void ChatAPI::prompt(const std::string &prompt,
const std::string &promptTemplate,
std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback,
PromptContext &promptCtx,
bool special,
std::string *fakeReply) {
Q_UNUSED(promptCallback);
Q_UNUSED(recalculateCallback);
Q_UNUSED(special);
if (!isModelLoaded()) {
std::cerr << "ChatAPI ERROR: prompt won't work with an unloaded model!\n";
return;
}
if (!promptCtx.n_past) { m_queuedPrompts.clear(); }
Q_ASSERT(promptCtx.n_past <= m_context.size());
m_context.resize(promptCtx.n_past);
// FIXME(cebtenzzre): We're assuming people don't try to use %2 with ChatGPT. What would that even mean?
m_queuedPrompts << QString::fromStdString(promptTemplate).arg(QString::fromStdString(prompt));
if (!promptCtx.n_predict && !fakeReply) {
return; // response explicitly suppressed, queue prompt for later
}
QString formattedPrompt = m_queuedPrompts.join("");
m_queuedPrompts.clear();
if (fakeReply) {
promptCtx.n_past += 1;
m_context.append(formattedPrompt);
m_context.append(QString::fromStdString(*fakeReply));
return;
}
// FIXME: We don't set the max_tokens on purpose because in order to do so safely without encountering // FIXME: We don't set the max_tokens on purpose because in order to do so safely without encountering
// an error we need to be able to count the tokens in our prompt. The only way to do this is to use // an error we need to be able to count the tokens in our prompt. The only way to do this is to use
// the OpenAI tiktoken library or to implement our own tokenization function that matches precisely // the OpenAI tiktokken library or to implement our own tokenization function that matches precisely
// the tokenization used by the OpenAI model we're calling. OpenAI has not introduced any means of // the tokenization used by the OpenAI model we're calling. OpenAI has not introduced any means of
// using the REST API to count tokens in a prompt. // using the REST API to count tokens in a prompt.
auto root = makeJsonObject({ QJsonObject root;
{ "model"_L1, m_modelName }, root.insert("model", m_modelName);
{ "stream"_L1, true }, root.insert("stream", true);
{ "temperature"_L1, promptCtx.temp }, root.insert("temperature", promptCtx.temp);
{ "top_p"_L1, promptCtx.top_p }, root.insert("top_p", promptCtx.top_p);
});
// conversation history // conversation history
{ QJsonArray messages;
QUtf8StringView promptUtf8(prompt); for (int i = 0; i < m_context.count(); ++i) {
QXmlStreamReader xml(promptUtf8); QJsonObject message;
auto messages = parsePrompt(xml); message.insert("role", i % 2 == 0 ? "user" : "assistant");
if (!messages) { message.insert("content", m_context.at(i));
auto error = fmt::format("Failed to parse API model prompt: {}", messages.error()); messages.append(message);
qDebug().noquote() << "ChatAPI ERROR:" << error << "Prompt:\n\n" << promptUtf8 << '\n';
throw std::invalid_argument(error);
}
root.insert("messages"_L1, *messages);
} }
QJsonObject promptObject;
promptObject.insert("role", "user");
promptObject.insert("content", formattedPrompt);
messages.append(promptObject);
root.insert("messages", messages);
QJsonDocument doc(root); QJsonDocument doc(root);
#if defined(DEBUG) #if defined(DEBUG)
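The JSON body assembled here follows the standard OpenAI chat-completions shape, with max_tokens deliberately left out for the reason given in the comment above. Roughly what a request looks like for a single user turn (all values and text are illustrative, not captured from a real run):

// Illustrative request body produced by the code above (values are made up):
static const char *exampleRequestBody = R"({
  "model": "gpt-3.5-turbo",
  "stream": true,
  "temperature": 0.7,
  "top_p": 0.4,
  "messages": [
    { "role": "user", "content": "What is the capital of France?" }
  ]
})";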
@ -207,9 +167,12 @@ void ChatAPI::prompt(
connect(&worker, &ChatAPIWorker::finished, &workerThread, &QThread::quit, Qt::DirectConnection); connect(&worker, &ChatAPIWorker::finished, &workerThread, &QThread::quit, Qt::DirectConnection);
connect(this, &ChatAPI::request, &worker, &ChatAPIWorker::request, Qt::QueuedConnection); connect(this, &ChatAPI::request, &worker, &ChatAPIWorker::request, Qt::QueuedConnection);
workerThread.start(); workerThread.start();
emit request(m_apiKey, doc.toJson(QJsonDocument::Compact)); emit request(m_apiKey, &promptCtx, doc.toJson(QJsonDocument::Compact));
workerThread.wait(); workerThread.wait();
promptCtx.n_past += 1;
m_context.append(formattedPrompt);
m_context.append(worker.currentResponse());
m_responseCallback = nullptr; m_responseCallback = nullptr;
#if defined(DEBUG) #if defined(DEBUG)
@ -227,8 +190,12 @@ bool ChatAPI::callResponse(int32_t token, const std::string& string)
return m_responseCallback(token, string); return m_responseCallback(token, string);
} }
void ChatAPIWorker::request(const QString &apiKey, const QByteArray &array) void ChatAPIWorker::request(const QString &apiKey,
LLModel::PromptContext *promptCtx,
const QByteArray &array)
{ {
m_ctx = promptCtx;
QUrl apiUrl(m_chat->url()); QUrl apiUrl(m_chat->url());
const QString authorization = u"Bearer %1"_s.arg(apiKey).trimmed(); const QString authorization = u"Bearer %1"_s.arg(apiKey).trimmed();
QNetworkRequest request(apiUrl); QNetworkRequest request(apiUrl);
@ -335,6 +302,7 @@ void ChatAPIWorker::handleReadyRead()
const QJsonObject choice = choices.first().toObject(); const QJsonObject choice = choices.first().toObject();
const QJsonObject delta = choice.value("delta").toObject(); const QJsonObject delta = choice.value("delta").toObject();
const QString content = delta.value("content").toString(); const QString content = delta.value("content").toString();
Q_ASSERT(m_ctx);
m_currentResponse += content; m_currentResponse += content;
if (!m_chat->callResponse(0, content.toStdString())) { if (!m_chat->callResponse(0, content.toStdString())) {
reply->abort(); reply->abort();

144
gpt4all-chat/chatapi.h Normal file
View File

@ -0,0 +1,144 @@
#ifndef CHATAPI_H
#define CHATAPI_H
#include "../gpt4all-backend/llmodel.h"
#include <QByteArray>
#include <QNetworkReply>
#include <QObject>
#include <QString>
#include <QStringList>
#include <QList>
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <functional>
#include <string>
#include <vector>
class QNetworkAccessManager;
class ChatAPI;
class ChatAPIWorker : public QObject {
Q_OBJECT
public:
ChatAPIWorker(ChatAPI *chatAPI)
: QObject(nullptr)
, m_ctx(nullptr)
, m_networkManager(nullptr)
, m_chat(chatAPI) {}
virtual ~ChatAPIWorker() {}
QString currentResponse() const { return m_currentResponse; }
void request(const QString &apiKey,
LLModel::PromptContext *promptCtx,
const QByteArray &array);
Q_SIGNALS:
void finished();
private Q_SLOTS:
void handleFinished();
void handleReadyRead();
void handleErrorOccurred(QNetworkReply::NetworkError code);
private:
ChatAPI *m_chat;
LLModel::PromptContext *m_ctx;
QNetworkAccessManager *m_networkManager;
QString m_currentResponse;
};
class ChatAPI : public QObject, public LLModel {
Q_OBJECT
public:
ChatAPI();
virtual ~ChatAPI();
bool supportsEmbedding() const override { return false; }
bool supportsCompletion() const override { return true; }
bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override;
bool isModelLoaded() const override;
size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
size_t stateSize() const override;
size_t saveState(uint8_t *dest) const override;
size_t restoreState(const uint8_t *src) override;
void prompt(const std::string &prompt,
const std::string &promptTemplate,
std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback,
PromptContext &ctx,
bool special,
std::string *fakeReply) override;
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
void setModelName(const QString &modelName) { m_modelName = modelName; }
void setAPIKey(const QString &apiKey) { m_apiKey = apiKey; }
void setRequestURL(const QString &requestURL) { m_requestURL = requestURL; }
QString url() const { return m_requestURL; }
QList<QString> context() const { return m_context; }
void setContext(const QList<QString> &context) { m_context = context; }
bool callResponse(int32_t token, const std::string &string);
Q_SIGNALS:
void request(const QString &apiKey,
LLModel::PromptContext *ctx,
const QByteArray &array);
protected:
// We have to implement these as they are pure virtual in base class, but we don't actually use
// them as they are only called from the default implementation of 'prompt' which we override and
// completely replace
std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) override {
(void)ctx;
(void)str;
(void)special;
throw std::logic_error("not implemented");
}
std::string tokenToString(Token id) const override {
(void)id;
throw std::logic_error("not implemented");
}
Token sampleToken(PromptContext &ctx) const override {
(void)ctx;
throw std::logic_error("not implemented");
}
bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override {
(void)ctx;
(void)tokens;
throw std::logic_error("not implemented");
}
int32_t contextLength() const override {
throw std::logic_error("not implemented");
}
const std::vector<Token> &endTokens() const override {
throw std::logic_error("not implemented");
}
bool shouldAddBOS() const override {
throw std::logic_error("not implemented");
}
private:
std::function<bool(int32_t, const std::string&)> m_responseCallback;
QString m_modelName;
QString m_apiKey;
QString m_requestURL;
QList<QString> m_context;
QStringList m_queuedPrompts;
};
#endif // CHATAPI_H
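As a rough usage sketch, a caller is expected to configure a ChatAPI instance through the setters declared above before prompting it. The helper below is hypothetical, the endpoint and key are placeholders, and the real wiring lives in chatllm.cpp:

#include <QList>
#include <QString>

// Hypothetical helper: shows the intended use of the setters above, nothing more.
static ChatAPI *makeConfiguredApi(const QList<QString> &previousTurns)
{
    auto *api = new ChatAPI;
    api->setModelName(QStringLiteral("gpt-3.5-turbo"));
    api->setAPIKey(QStringLiteral("sk-..."));                                         // placeholder key
    api->setRequestURL(QStringLiteral("https://api.openai.com/v1/chat/completions")); // placeholder endpoint
    api->setContext(previousTurns);  // alternating user/assistant turns from the chat so far
    return api;
}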


@ -1,27 +1,25 @@
#include "chatlistmodel.h" #include "chatlistmodel.h"
#include "database.h" // IWYU pragma: keep
#include "mysettings.h" #include "mysettings.h"
#include <QCoreApplication>
#include <QDataStream> #include <QDataStream>
#include <QDir> #include <QDir>
#include <QElapsedTimer> #include <QElapsedTimer>
#include <QEvent>
#include <QFile> #include <QFile>
#include <QFileInfo> #include <QFileInfo>
#include <QGlobalStatic> #include <QGlobalStatic>
#include <QGuiApplication> #include <QGuiApplication>
#include <QIODevice> #include <QIODevice>
#include <QSettings> #include <QSettings>
#include <QStringList> // IWYU pragma: keep #include <QString>
#include <QStringList>
#include <Qt> #include <Qt>
#include <QtTypes>
#include <algorithm> #include <algorithm>
#define CHAT_FORMAT_MAGIC 0xF5D553CC
static constexpr quint32 CHAT_FORMAT_MAGIC = 0xF5D553CC; #define CHAT_FORMAT_VERSION 9
static constexpr qint32 CHAT_FORMAT_VERSION = 12;
class MyChatListModel: public ChatListModel { }; class MyChatListModel: public ChatListModel { };
Q_GLOBAL_STATIC(MyChatListModel, chatListModelInstance) Q_GLOBAL_STATIC(MyChatListModel, chatListModelInstance)
@ -53,12 +51,6 @@ void ChatListModel::loadChats()
connect(thread, &ChatsRestoreThread::finished, thread, &QObject::deleteLater); connect(thread, &ChatsRestoreThread::finished, thread, &QObject::deleteLater);
thread->start(); thread->start();
m_chatSaver = std::make_unique<ChatSaver>();
connect(this, &ChatListModel::requestSaveChats, m_chatSaver.get(), &ChatSaver::saveChats, Qt::QueuedConnection);
connect(m_chatSaver.get(), &ChatSaver::saveChatsFinished, this, &ChatListModel::saveChatsFinished, Qt::QueuedConnection);
// save chats on application quit
connect(QCoreApplication::instance(), &QCoreApplication::aboutToQuit, this, &ChatListModel::saveChatsSync);
connect(MySettings::globalInstance(), &MySettings::serverChatChanged, this, &ChatListModel::handleServerEnabledChanged); connect(MySettings::globalInstance(), &MySettings::serverChatChanged, this, &ChatListModel::handleServerEnabledChanged);
} }
@ -81,59 +73,33 @@ ChatSaver::ChatSaver()
m_thread.start(); m_thread.start();
} }
ChatSaver::~ChatSaver()
{
m_thread.quit();
m_thread.wait();
}
QVector<Chat *> ChatListModel::getChatsToSave() const
{
QVector<Chat *> toSave;
for (auto *chat : m_chats)
if (chat != m_serverChat && !chat->isNewChat())
toSave << chat;
return toSave;
}
void ChatListModel::saveChats() void ChatListModel::saveChats()
{ {
auto toSave = getChatsToSave(); QVector<Chat*> toSave;
for (Chat *chat : m_chats) {
if (chat == m_serverChat)
continue;
if (chat->isNewChat())
continue;
toSave.append(chat);
}
if (toSave.isEmpty()) { if (toSave.isEmpty()) {
emit saveChatsFinished(); emit saveChatsFinished();
return; return;
} }
ChatSaver *saver = new ChatSaver;
connect(this, &ChatListModel::requestSaveChats, saver, &ChatSaver::saveChats, Qt::QueuedConnection);
connect(saver, &ChatSaver::saveChatsFinished, this, &ChatListModel::saveChatsFinished, Qt::QueuedConnection);
emit requestSaveChats(toSave); emit requestSaveChats(toSave);
} }
void ChatListModel::saveChatsForQuit()
{
saveChats();
m_startedFinalSave = true;
}
void ChatListModel::saveChatsSync()
{
auto toSave = getChatsToSave();
if (!m_startedFinalSave && !toSave.isEmpty())
m_chatSaver->saveChats(toSave);
}
void ChatSaver::saveChats(const QVector<Chat *> &chats) void ChatSaver::saveChats(const QVector<Chat *> &chats)
{ {
// we can be called from the main thread instead of a worker thread at quit time, so take a lock
QMutexLocker locker(&m_mutex);
QElapsedTimer timer; QElapsedTimer timer;
timer.start(); timer.start();
const QString savePath = MySettings::globalInstance()->modelPath(); const QString savePath = MySettings::globalInstance()->modelPath();
qsizetype nSavedChats = 0;
for (Chat *chat : chats) { for (Chat *chat : chats) {
if (!chat->needsSave())
continue;
++nSavedChats;
QString fileName = "gpt4all-" + chat->id() + ".chat"; QString fileName = "gpt4all-" + chat->id() + ".chat";
QString filePath = savePath + "/" + fileName; QString filePath = savePath + "/" + fileName;
QFile originalFile(filePath); QFile originalFile(filePath);
@ -146,8 +112,8 @@ void ChatSaver::saveChats(const QVector<Chat *> &chats)
} }
QDataStream out(&tempFile); QDataStream out(&tempFile);
out << CHAT_FORMAT_MAGIC; out << (quint32)CHAT_FORMAT_MAGIC;
out << CHAT_FORMAT_VERSION; out << (qint32)CHAT_FORMAT_VERSION;
out.setVersion(QDataStream::Qt_6_2); out.setVersion(QDataStream::Qt_6_2);
qDebug() << "serializing chat" << fileName; qDebug() << "serializing chat" << fileName;
@ -157,14 +123,13 @@ void ChatSaver::saveChats(const QVector<Chat *> &chats)
continue; continue;
} }
chat->setNeedsSave(false);
if (originalFile.exists()) if (originalFile.exists())
originalFile.remove(); originalFile.remove();
tempFile.rename(filePath); tempFile.rename(filePath);
} }
qint64 elapsedTime = timer.elapsed(); qint64 elapsedTime = timer.elapsed();
qDebug() << "serializing chats took" << elapsedTime << "ms, saved" << nSavedChats << "/" << chats.size() << "chats"; qDebug() << "serializing chats took:" << elapsedTime << "ms";
emit saveChatsFinished(); emit saveChatsFinished();
} }
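Every gpt4all-*.chat file therefore begins with the same small header before the per-chat payload: the magic number, the format version, and a QDataStream pinned to Qt_6_2 for everything that follows. A sketch of writing just that header with the constants defined above (this helper itself is not part of the codebase):

#include <QDataStream>
#include <QFile>

// Sketch: the header every chat file starts with, per ChatSaver::saveChats().
// Assumes the file is already open for writing.
static bool writeChatFileHeader(QFile &file)
{
    QDataStream out(&file);
    out << quint32(0xF5D553CC);          // CHAT_FORMAT_MAGIC
    out << qint32(12);                   // CHAT_FORMAT_VERSION on the newer branch (9 on the older one)
    out.setVersion(QDataStream::Qt_6_2); // encoding for the payload that follows
    return out.status() == QDataStream::Ok;
}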
@ -229,16 +194,11 @@ void ChatsRestoreThread::run()
qint32 version; qint32 version;
in >> version; in >> version;
if (version < 1) { if (version < 1) {
qWarning() << "WARNING: Chat file version" << version << "is not supported:" << file.fileName(); qWarning() << "ERROR: Chat file has non supported version:" << file.fileName();
continue;
}
if (version > CHAT_FORMAT_VERSION) {
qWarning().nospace() << "WARNING: Chat file is from a future version (have " << version << " want "
<< CHAT_FORMAT_VERSION << "): " << file.fileName();
continue; continue;
} }
if (version < 2) if (version <= 1)
in.setVersion(QDataStream::Qt_6_2); in.setVersion(QDataStream::Qt_6_2);
FileInfo info; FileInfo info;
@ -279,21 +239,18 @@ void ChatsRestoreThread::run()
continue; continue;
} }
if (version < 2) if (version <= 1)
in.setVersion(QDataStream::Qt_6_2); in.setVersion(QDataStream::Qt_6_2);
} }
qDebug() << "deserializing chat" << f.file; qDebug() << "deserializing chat" << f.file;
auto chat = std::make_unique<Chat>(); Chat *chat = new Chat;
chat->moveToThread(qGuiApp->thread()); chat->moveToThread(qGuiApp->thread());
bool ok = chat->deserialize(in, version); if (!chat->deserialize(in, version)) {
if (!ok) {
qWarning() << "ERROR: Couldn't deserialize chat from file:" << file.fileName(); qWarning() << "ERROR: Couldn't deserialize chat from file:" << file.fileName();
} else if (!in.atEnd()) {
qWarning().nospace() << "error loading chat from " << file.fileName() << ": extra data at end of file";
} else { } else {
emit chatRestored(chat.release()); emit chatRestored(chat);
} }
if (f.oldFile) if (f.oldFile)
file.remove(); // No longer storing in this directory file.remove(); // No longer storing in this directory


@ -7,23 +7,16 @@
#include <QAbstractListModel> #include <QAbstractListModel>
#include <QByteArray> #include <QByteArray>
#include <QDate>
#include <QDebug> #include <QDebug>
#include <QHash> #include <QHash>
#include <QList> #include <QList>
#include <QMutex>
#include <QObject> #include <QObject>
#include <QString>
#include <QThread> #include <QThread>
#include <QVariant> #include <QVariant>
#include <QVector> // IWYU pragma: keep #include <QVector>
#include <Qt> #include <Qt>
#include <QtAssert> #include <QtGlobal>
#include <QtLogging> #include <QtLogging>
#include <QtPreprocessorSupport>
#include <memory>
class ChatsRestoreThread : public QThread class ChatsRestoreThread : public QThread
{ {
@ -40,7 +33,7 @@ class ChatSaver : public QObject
Q_OBJECT Q_OBJECT
public: public:
explicit ChatSaver(); explicit ChatSaver();
~ChatSaver() override; void stop();
Q_SIGNALS: Q_SIGNALS:
void saveChatsFinished(); void saveChatsFinished();
@ -50,7 +43,6 @@ public Q_SLOTS:
private: private:
QThread m_thread; QThread m_thread;
QMutex m_mutex;
}; };
class ChatListModel : public QAbstractListModel class ChatListModel : public QAbstractListModel
@ -155,7 +147,7 @@ public:
if (m_serverChat) if (m_serverChat)
return; return;
m_serverChat = new Chat(Chat::server_tag, this); m_serverChat = new Chat(true /*isServer*/, this);
beginInsertRows(QModelIndex(), m_chats.size(), m_chats.size()); beginInsertRows(QModelIndex(), m_chats.size(), m_chats.size());
m_chats.append(m_serverChat); m_chats.append(m_serverChat);
endInsertRows(); endInsertRows();
@ -237,7 +229,6 @@ public:
void removeChatFile(Chat *chat) const; void removeChatFile(Chat *chat) const;
Q_INVOKABLE void saveChats(); Q_INVOKABLE void saveChats();
Q_INVOKABLE void saveChatsForQuit();
void restoreChat(Chat *chat); void restoreChat(Chat *chat);
void chatsRestoredFinished(); void chatsRestoredFinished();
@ -247,6 +238,7 @@ public Q_SLOTS:
Q_SIGNALS: Q_SIGNALS:
void countChanged(); void countChanged();
void currentChatChanged(); void currentChatChanged();
void chatsSavedFinished();
void requestSaveChats(const QVector<Chat*> &); void requestSaveChats(const QVector<Chat*> &);
void saveChatsFinished(); void saveChatsFinished();
@ -254,9 +246,6 @@ protected:
bool eventFilter(QObject *obj, QEvent *ev) override; bool eventFilter(QObject *obj, QEvent *ev) override;
private Q_SLOTS: private Q_SLOTS:
// Used with QCoreApplication::aboutToQuit. Does not require an event loop.
void saveChatsSync();
void newChatCountChanged() void newChatCountChanged()
{ {
Q_ASSERT(m_newChat && m_newChat->chatModel()->count()); Q_ASSERT(m_newChat && m_newChat->chatModel()->count());
@ -287,16 +276,11 @@ private Q_SLOTS:
} }
} }
private:
QVector<Chat *> getChatsToSave() const;
private: private:
Chat* m_newChat = nullptr; Chat* m_newChat = nullptr;
Chat* m_serverChat = nullptr; Chat* m_serverChat = nullptr;
Chat* m_currentChat = nullptr; Chat* m_currentChat = nullptr;
QList<Chat*> m_chats; QList<Chat*> m_chats;
std::unique_ptr<ChatSaver> m_chatSaver;
bool m_startedFinalSave = false;
private: private:
explicit ChatListModel(); explicit ChatListModel();

gpt4all-chat/chatllm.cpp (new file, 1366 lines): diff suppressed because it is too large.


@ -1,94 +1,42 @@
#ifndef CHATLLM_H #ifndef CHATLLM_H
#define CHATLLM_H #define CHATLLM_H
#include "chatmodel.h" #include "database.h" // IWYU pragma: keep
#include "database.h"
#include "modellist.h" #include "modellist.h"
#include <gpt4all-backend/llmodel.h> #include "../gpt4all-backend/llmodel.h"
#include <QByteArray> #include <QByteArray>
#include <QElapsedTimer> #include <QElapsedTimer>
#include <QFileInfo> #include <QFileInfo>
#include <QList> #include <QList>
#include <QObject> #include <QObject>
#include <QPointer> #include <QPair>
#include <QString> #include <QString>
#include <QStringList> // IWYU pragma: keep
#include <QThread> #include <QThread>
#include <QVariantMap> // IWYU pragma: keep #include <QVariantMap>
#include <QtNumeric> #include <QVector>
#include <QtGlobal>
#include <atomic> #include <atomic>
#include <cstdint>
#include <memory> #include <memory>
#include <optional> #include <optional>
#include <span>
#include <string> #include <string>
#include <string_view>
#include <variant>
#include <vector>
using namespace Qt::Literals::StringLiterals; using namespace Qt::Literals::StringLiterals;
class ChatLLM;
class QDataStream; class QDataStream;
// NOTE: values serialized to disk, do not change or reuse // NOTE: values serialized to disk, do not change or reuse
enum class LLModelTypeV0 { // chat versions 2-5 enum LLModelType {
MPT = 0, GPTJ_ = 0, // no longer used
GPTJ = 1, LLAMA_ = 1,
LLAMA = 2, API_ = 2,
CHATGPT = 3, BERT_ = 3, // no longer used
REPLIT = 4,
FALCON = 5,
BERT = 6, // not used
STARCODER = 7,
};
enum class LLModelTypeV1 { // since chat version 6 (v2.5.0)
GPTJ = 0, // not for new chats
LLAMA = 1,
API = 2,
BERT = 3, // not used
// none of the below are used in new chats
REPLIT = 4,
FALCON = 5,
MPT = 6,
STARCODER = 7,
NONE = -1, // no state
}; };
inline LLModelTypeV1 parseLLModelTypeV1(int type) class ChatLLM;
{
switch (LLModelTypeV1(type)) {
case LLModelTypeV1::GPTJ:
case LLModelTypeV1::LLAMA:
case LLModelTypeV1::API:
// case LLModelTypeV1::BERT: -- not used
case LLModelTypeV1::REPLIT:
case LLModelTypeV1::FALCON:
case LLModelTypeV1::MPT:
case LLModelTypeV1::STARCODER:
return LLModelTypeV1(type);
default:
return LLModelTypeV1::NONE;
}
}
inline LLModelTypeV1 parseLLModelTypeV0(int v0)
{
switch (LLModelTypeV0(v0)) {
case LLModelTypeV0::MPT: return LLModelTypeV1::MPT;
case LLModelTypeV0::GPTJ: return LLModelTypeV1::GPTJ;
case LLModelTypeV0::LLAMA: return LLModelTypeV1::LLAMA;
case LLModelTypeV0::CHATGPT: return LLModelTypeV1::API;
case LLModelTypeV0::REPLIT: return LLModelTypeV1::REPLIT;
case LLModelTypeV0::FALCON: return LLModelTypeV1::FALCON;
// case LLModelTypeV0::BERT: -- not used
case LLModelTypeV0::STARCODER: return LLModelTypeV1::STARCODER;
default: return LLModelTypeV1::NONE;
}
}
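These two helpers exist because the integer stored on disk changed meaning at chat format version 6: older chats carry an LLModelTypeV0 value, newer ones an LLModelTypeV1, and anything unrecognized collapses to NONE. A small sketch of how a loader would choose the decoder (the function name is made up):

// Hypothetical helper: pick the right decoder for the on-disk model-type integer,
// following the comments above (V0 for chat versions 2-5, V1 from version 6 onward).
static LLModelTypeV1 modelTypeForChatVersion(int rawType, int chatVersion)
{
    return chatVersion >= 6 ? parseLLModelTypeV1(rawType)
                            : parseLLModelTypeV0(rawType);
}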
struct LLModelInfo { struct LLModelInfo {
std::unique_ptr<LLModel> model; std::unique_ptr<LLModel> model;
@ -145,6 +93,7 @@ class Chat;
class ChatLLM : public QObject class ChatLLM : public QObject
{ {
Q_OBJECT Q_OBJECT
Q_PROPERTY(bool isRecalc READ isRecalc NOTIFY recalcChanged)
Q_PROPERTY(QString deviceBackend READ deviceBackend NOTIFY loadedModelInfoChanged) Q_PROPERTY(QString deviceBackend READ deviceBackend NOTIFY loadedModelInfoChanged)
Q_PROPERTY(QString device READ device NOTIFY loadedModelInfoChanged) Q_PROPERTY(QString device READ device NOTIFY loadedModelInfoChanged)
Q_PROPERTY(QString fallbackReason READ fallbackReason NOTIFY loadedModelInfoChanged) Q_PROPERTY(QString fallbackReason READ fallbackReason NOTIFY loadedModelInfoChanged)
@ -152,14 +101,12 @@ public:
ChatLLM(Chat *parent, bool isServer = false); ChatLLM(Chat *parent, bool isServer = false);
virtual ~ChatLLM(); virtual ~ChatLLM();
static void destroyStore();
static std::optional<std::string> checkJinjaTemplateError(const std::string &source);
void destroy(); void destroy();
static void destroyStore();
bool isModelLoaded() const; bool isModelLoaded() const;
void regenerateResponse(int index); void regenerateResponse();
// used to implement edit functionality void resetResponse();
std::optional<QString> popPrompt(int index); void resetContext();
void stopGenerating() { m_stopGenerating = true; } void stopGenerating() { m_stopGenerating = true; }
@ -169,9 +116,13 @@ public:
void setForceUnloadModel(bool b) { m_forceUnloadModel = b; } void setForceUnloadModel(bool b) { m_forceUnloadModel = b; }
void setMarkedForDeletion(bool b) { m_markedForDeletion = b; } void setMarkedForDeletion(bool b) { m_markedForDeletion = b; }
QString response() const;
ModelInfo modelInfo() const; ModelInfo modelInfo() const;
void setModelInfo(const ModelInfo &info); void setModelInfo(const ModelInfo &info);
bool isRecalc() const { return m_isRecalc; }
void acquireModel(); void acquireModel();
void resetModel(); void resetModel();
@ -196,11 +147,14 @@ public:
return m_llModelInfo.fallbackReason.value_or(u""_s); return m_llModelInfo.fallbackReason.value_or(u""_s);
} }
bool serialize(QDataStream &stream, int version); QString generatedName() const { return QString::fromStdString(m_nameResponse); }
bool deserialize(QDataStream &stream, int version);
bool serialize(QDataStream &stream, int version, bool serializeKV);
bool deserialize(QDataStream &stream, int version, bool deserializeKV, bool discardKV);
void setStateFromText(const QVector<QPair<QString, QString>> &stateFromText) { m_stateFromText = stateFromText; }
public Q_SLOTS: public Q_SLOTS:
void prompt(const QStringList &enabledCollections); bool prompt(const QList<QString> &collectionList, const QString &prompt);
bool loadDefaultModel(); bool loadDefaultModel();
void trySwitchContextOfLoadedModel(const ModelInfo &modelInfo); void trySwitchContextOfLoadedModel(const ModelInfo &modelInfo);
bool loadModel(const ModelInfo &modelInfo); bool loadModel(const ModelInfo &modelInfo);
@ -208,19 +162,22 @@ public Q_SLOTS:
void unloadModel(); void unloadModel();
void reloadModel(); void reloadModel();
void generateName(); void generateName();
void generateQuestions(qint64 elapsed);
void handleChatIdChanged(const QString &id); void handleChatIdChanged(const QString &id);
void handleShouldBeLoadedChanged(); void handleShouldBeLoadedChanged();
void handleThreadStarted(); void handleThreadStarted();
void handleForceMetalChanged(bool forceMetal); void handleForceMetalChanged(bool forceMetal);
void handleDeviceChanged(); void handleDeviceChanged();
void processSystemPrompt();
void processRestoreStateFromText();
Q_SIGNALS: Q_SIGNALS:
void recalcChanged();
void loadedModelInfoChanged(); void loadedModelInfoChanged();
void modelLoadingPercentageChanged(float); void modelLoadingPercentageChanged(float);
void modelLoadingError(const QString &error); void modelLoadingError(const QString &error);
void modelLoadingWarning(const QString &warning); void modelLoadingWarning(const QString &warning);
void responseChanged(); void responseChanged(const QString &response);
void responseFailed();
void promptProcessing(); void promptProcessing();
void generatingQuestions(); void generatingQuestions();
void responseStopped(qint64 promptResponseMs); void responseStopped(qint64 promptResponseMs);
@ -239,53 +196,59 @@ Q_SIGNALS:
void modelInfoChanged(const ModelInfo &modelInfo); void modelInfoChanged(const ModelInfo &modelInfo);
protected: protected:
struct PromptResult { bool promptInternal(const QList<QString> &collectionList, const QString &prompt, const QString &promptTemplate,
QByteArray response; // raw UTF-8 int32_t n_predict, int32_t top_k, float top_p, float min_p, float temp, int32_t n_batch, float repeat_penalty,
int promptTokens; // note: counts *entire* history, even if cached int32_t repeat_penalty_tokens);
int responseTokens; bool handlePrompt(int32_t token);
}; bool handleResponse(int32_t token, const std::string &response);
bool handleRecalculate(bool isRecalc);
bool handleNamePrompt(int32_t token);
bool handleNameResponse(int32_t token, const std::string &response);
bool handleNameRecalculate(bool isRecalc);
bool handleSystemPrompt(int32_t token);
bool handleSystemResponse(int32_t token, const std::string &response);
bool handleSystemRecalculate(bool isRecalc);
bool handleRestoreStateFromTextPrompt(int32_t token);
bool handleRestoreStateFromTextResponse(int32_t token, const std::string &response);
bool handleRestoreStateFromTextRecalculate(bool isRecalc);
bool handleQuestionPrompt(int32_t token);
bool handleQuestionResponse(int32_t token, const std::string &response);
bool handleQuestionRecalculate(bool isRecalc);
void saveState();
void restoreState();
struct ChatPromptResult : PromptResult { protected:
QList<ResultInfo> databaseResults; LLModel::PromptContext m_ctx;
}; quint32 m_promptTokens;
quint32 m_promptResponseTokens;
ChatPromptResult promptInternalChat(const QStringList &enabledCollections, const LLModel::PromptContext &ctx,
qsizetype startOffset = 0);
// passing a string_view directly skips templating and uses the raw string
PromptResult promptInternal(const std::variant<std::span<const MessageItem>, std::string_view> &prompt,
const LLModel::PromptContext &ctx,
bool usedLocalDocs);
private: private:
bool loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadProps); bool loadNewModel(const ModelInfo &modelInfo, QVariantMap &modelLoadProps);
std::vector<MessageItem> forkConversation(const QString &prompt) const; std::string m_response;
std::string m_nameResponse;
// Applies the Jinja template. Query mode returns only the last message without special tokens. QString m_questionResponse;
// Returns a (# of messages, rendered prompt) pair.
std::string applyJinjaTemplate(std::span<const MessageItem> items) const;
void generateQuestions(qint64 elapsed);
protected:
QPointer<ChatModel> m_chatModel;
private:
const Chat *m_chat;
LLModelInfo m_llModelInfo; LLModelInfo m_llModelInfo;
LLModelTypeV1 m_llModelType = LLModelTypeV1::NONE; LLModelType m_llModelType;
ModelInfo m_modelInfo; ModelInfo m_modelInfo;
TokenTimer *m_timer; TokenTimer *m_timer;
QByteArray m_state;
QThread m_llmThread; QThread m_llmThread;
std::atomic<bool> m_stopGenerating; std::atomic<bool> m_stopGenerating;
std::atomic<bool> m_shouldBeLoaded; std::atomic<bool> m_shouldBeLoaded;
std::atomic<bool> m_isRecalc;
std::atomic<bool> m_forceUnloadModel; std::atomic<bool> m_forceUnloadModel;
std::atomic<bool> m_markedForDeletion; std::atomic<bool> m_markedForDeletion;
bool m_isServer; bool m_isServer;
bool m_forceMetal; bool m_forceMetal;
bool m_reloadingToChangeVariant; bool m_reloadingToChangeVariant;
friend class ChatViewResponseHandler; bool m_processedSystemPrompt;
friend class SimpleResponseHandler; bool m_restoreStateFromText;
// m_pristineLoadedState is set if saveSate is unnecessary, either because:
// - an unload was queued during LLModel::restoreState()
// - the chat will be restored from text and hasn't been interacted with yet
bool m_pristineLoadedState = false;
QVector<QPair<QString, QString>> m_stateFromText;
}; };
#endif // CHATLLM_H #endif // CHATLLM_H
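As noted above, promptInternal accepts either a span of chat messages or a raw std::string_view, and the string_view form bypasses the Jinja template entirely, presumably so text that is already fully formatted can reuse the same generation path. A self-contained sketch of that dispatch pattern follows; Message and renderTemplate are stand-ins, not the application's types.

#include <span>
#include <string>
#include <string_view>
#include <variant>

struct Message { std::string role, content; };

static std::string renderTemplate(std::span<const Message> msgs)
{
    std::string out;
    for (const auto &m : msgs)
        out += "<|" + m.role + "|>\n" + m.content + "\n"; // stand-in for the Jinja chat template
    return out;
}

static std::string buildPrompt(const std::variant<std::span<const Message>, std::string_view> &prompt)
{
    // Structured messages are rendered through the template; a raw string is used verbatim.
    if (const auto *msgs = std::get_if<std::span<const Message>>(&prompt))
        return renderTemplate(*msgs);
    return std::string(std::get<std::string_view>(prompt));
}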

gpt4all-chat/chatmodel.h (new file, 474 lines)
@ -0,0 +1,474 @@
#ifndef CHATMODEL_H
#define CHATMODEL_H
#include "database.h"
#include <QAbstractListModel>
#include <QByteArray>
#include <QDataStream>
#include <QHash>
#include <QList>
#include <QObject>
#include <QPair>
#include <QString>
#include <QVariant>
#include <QVector>
#include <Qt>
#include <QtGlobal>
struct ChatItem
{
Q_GADGET
Q_PROPERTY(int id MEMBER id)
Q_PROPERTY(QString name MEMBER name)
Q_PROPERTY(QString value MEMBER value)
Q_PROPERTY(QString prompt MEMBER prompt)
Q_PROPERTY(QString newResponse MEMBER newResponse)
Q_PROPERTY(bool currentResponse MEMBER currentResponse)
Q_PROPERTY(bool stopped MEMBER stopped)
Q_PROPERTY(bool thumbsUpState MEMBER thumbsUpState)
Q_PROPERTY(bool thumbsDownState MEMBER thumbsDownState)
Q_PROPERTY(QList<ResultInfo> sources MEMBER sources)
Q_PROPERTY(QList<ResultInfo> consolidatedSources MEMBER consolidatedSources)
public:
// TODO: Maybe we should include the model name here as well as timestamp?
int id = 0;
QString name;
QString value;
QString prompt;
QString newResponse;
QList<ResultInfo> sources;
QList<ResultInfo> consolidatedSources;
bool currentResponse = false;
bool stopped = false;
bool thumbsUpState = false;
bool thumbsDownState = false;
};
Q_DECLARE_METATYPE(ChatItem)
class ChatModel : public QAbstractListModel
{
Q_OBJECT
Q_PROPERTY(int count READ count NOTIFY countChanged)
public:
explicit ChatModel(QObject *parent = nullptr) : QAbstractListModel(parent) {}
enum Roles {
IdRole = Qt::UserRole + 1,
NameRole,
ValueRole,
PromptRole,
NewResponseRole,
CurrentResponseRole,
StoppedRole,
ThumbsUpStateRole,
ThumbsDownStateRole,
SourcesRole,
ConsolidatedSourcesRole
};
int rowCount(const QModelIndex &parent = QModelIndex()) const override
{
Q_UNUSED(parent)
return m_chatItems.size();
}
QVariant data(const QModelIndex &index, int role = Qt::DisplayRole) const override
{
if (!index.isValid() || index.row() < 0 || index.row() >= m_chatItems.size())
return QVariant();
const ChatItem &item = m_chatItems.at(index.row());
switch (role) {
case IdRole:
return item.id;
case NameRole:
return item.name;
case ValueRole:
return item.value;
case PromptRole:
return item.prompt;
case NewResponseRole:
return item.newResponse;
case CurrentResponseRole:
return item.currentResponse;
case StoppedRole:
return item.stopped;
case ThumbsUpStateRole:
return item.thumbsUpState;
case ThumbsDownStateRole:
return item.thumbsDownState;
case SourcesRole:
return QVariant::fromValue(item.sources);
case ConsolidatedSourcesRole:
return QVariant::fromValue(item.consolidatedSources);
}
return QVariant();
}
QHash<int, QByteArray> roleNames() const override
{
QHash<int, QByteArray> roles;
roles[IdRole] = "id";
roles[NameRole] = "name";
roles[ValueRole] = "value";
roles[PromptRole] = "prompt";
roles[NewResponseRole] = "newResponse";
roles[CurrentResponseRole] = "currentResponse";
roles[StoppedRole] = "stopped";
roles[ThumbsUpStateRole] = "thumbsUpState";
roles[ThumbsDownStateRole] = "thumbsDownState";
roles[SourcesRole] = "sources";
roles[ConsolidatedSourcesRole] = "consolidatedSources";
return roles;
}
void appendPrompt(const QString &name, const QString &value)
{
ChatItem item;
item.name = name;
item.value = value;
beginInsertRows(QModelIndex(), m_chatItems.size(), m_chatItems.size());
m_chatItems.append(item);
endInsertRows();
emit countChanged();
}
void appendResponse(const QString &name, const QString &prompt)
{
ChatItem item;
item.id = m_chatItems.count(); // This is only relevant for responses
item.name = name;
item.prompt = prompt;
item.currentResponse = true;
beginInsertRows(QModelIndex(), m_chatItems.size(), m_chatItems.size());
m_chatItems.append(item);
endInsertRows();
emit countChanged();
}
Q_INVOKABLE void clear()
{
if (m_chatItems.isEmpty()) return;
beginResetModel();
m_chatItems.clear();
endResetModel();
emit countChanged();
}
Q_INVOKABLE ChatItem get(int index)
{
if (index < 0 || index >= m_chatItems.size()) return ChatItem();
return m_chatItems.at(index);
}
Q_INVOKABLE void updateCurrentResponse(int index, bool b)
{
if (index < 0 || index >= m_chatItems.size()) return;
ChatItem &item = m_chatItems[index];
if (item.currentResponse != b) {
item.currentResponse = b;
emit dataChanged(createIndex(index, 0), createIndex(index, 0), {CurrentResponseRole});
}
}
Q_INVOKABLE void updateStopped(int index, bool b)
{
if (index < 0 || index >= m_chatItems.size()) return;
ChatItem &item = m_chatItems[index];
if (item.stopped != b) {
item.stopped = b;
emit dataChanged(createIndex(index, 0), createIndex(index, 0), {StoppedRole});
}
}
Q_INVOKABLE void updateValue(int index, const QString &value)
{
if (index < 0 || index >= m_chatItems.size()) return;
ChatItem &item = m_chatItems[index];
if (item.value != value) {
item.value = value;
emit dataChanged(createIndex(index, 0), createIndex(index, 0), {ValueRole});
emit valueChanged(index, value);
}
}
QList<ResultInfo> consolidateSources(const QList<ResultInfo> &sources) {
QMap<QString, ResultInfo> groupedData;
for (const ResultInfo &info : sources) {
if (groupedData.contains(info.file)) {
groupedData[info.file].text += "\n---\n" + info.text;
} else {
groupedData[info.file] = info;
}
}
QList<ResultInfo> consolidatedSources = groupedData.values();
return consolidatedSources;
}
Q_INVOKABLE void updateSources(int index, const QList<ResultInfo> &sources)
{
if (index < 0 || index >= m_chatItems.size()) return;
ChatItem &item = m_chatItems[index];
item.sources = sources;
item.consolidatedSources = consolidateSources(sources);
emit dataChanged(createIndex(index, 0), createIndex(index, 0), {SourcesRole});
emit dataChanged(createIndex(index, 0), createIndex(index, 0), {ConsolidatedSourcesRole});
}
Q_INVOKABLE void updateThumbsUpState(int index, bool b)
{
if (index < 0 || index >= m_chatItems.size()) return;
ChatItem &item = m_chatItems[index];
if (item.thumbsUpState != b) {
item.thumbsUpState = b;
emit dataChanged(createIndex(index, 0), createIndex(index, 0), {ThumbsUpStateRole});
}
}
Q_INVOKABLE void updateThumbsDownState(int index, bool b)
{
if (index < 0 || index >= m_chatItems.size()) return;
ChatItem &item = m_chatItems[index];
if (item.thumbsDownState != b) {
item.thumbsDownState = b;
emit dataChanged(createIndex(index, 0), createIndex(index, 0), {ThumbsDownStateRole});
}
}
Q_INVOKABLE void updateNewResponse(int index, const QString &newResponse)
{
if (index < 0 || index >= m_chatItems.size()) return;
ChatItem &item = m_chatItems[index];
if (item.newResponse != newResponse) {
item.newResponse = newResponse;
emit dataChanged(createIndex(index, 0), createIndex(index, 0), {NewResponseRole});
}
}
int count() const { return m_chatItems.size(); }
bool serialize(QDataStream &stream, int version) const
{
stream << count();
for (const auto &c : m_chatItems) {
stream << c.id;
stream << c.name;
stream << c.value;
stream << c.prompt;
stream << c.newResponse;
stream << c.currentResponse;
stream << c.stopped;
stream << c.thumbsUpState;
stream << c.thumbsDownState;
if (version > 7) {
stream << c.sources.size();
for (const ResultInfo &info : c.sources) {
Q_ASSERT(!info.file.isEmpty());
stream << info.collection;
stream << info.path;
stream << info.file;
stream << info.title;
stream << info.author;
stream << info.date;
stream << info.text;
stream << info.page;
stream << info.from;
stream << info.to;
}
} else if (version > 2) {
QList<QString> references;
QList<QString> referencesContext;
int validReferenceNumber = 1;
for (const ResultInfo &info : c.sources) {
if (info.file.isEmpty())
continue;
QString reference;
{
QTextStream stream(&reference);
stream << (validReferenceNumber++) << ". ";
if (!info.title.isEmpty())
stream << "\"" << info.title << "\". ";
if (!info.author.isEmpty())
stream << "By " << info.author << ". ";
if (!info.date.isEmpty())
stream << "Date: " << info.date << ". ";
stream << "In " << info.file << ". ";
if (info.page != -1)
stream << "Page " << info.page << ". ";
if (info.from != -1) {
stream << "Lines " << info.from;
if (info.to != -1)
stream << "-" << info.to;
stream << ". ";
}
stream << "[Context](context://" << validReferenceNumber - 1 << ")";
}
references.append(reference);
referencesContext.append(info.text);
}
stream << references.join("\n");
stream << referencesContext;
}
}
return stream.status() == QDataStream::Ok;
}
bool deserialize(QDataStream &stream, int version)
{
int size;
stream >> size;
for (int i = 0; i < size; ++i) {
ChatItem c;
stream >> c.id;
stream >> c.name;
stream >> c.value;
stream >> c.prompt;
stream >> c.newResponse;
stream >> c.currentResponse;
stream >> c.stopped;
stream >> c.thumbsUpState;
stream >> c.thumbsDownState;
if (version > 7) {
qsizetype count;
stream >> count;
QList<ResultInfo> sources;
for (int i = 0; i < count; ++i) {
ResultInfo info;
stream >> info.collection;
stream >> info.path;
stream >> info.file;
stream >> info.title;
stream >> info.author;
stream >> info.date;
stream >> info.text;
stream >> info.page;
stream >> info.from;
stream >> info.to;
sources.append(info);
}
c.sources = sources;
c.consolidatedSources = consolidateSources(sources);
}else if (version > 2) {
QString references;
QList<QString> referencesContext;
stream >> references;
stream >> referencesContext;
if (!references.isEmpty()) {
QList<ResultInfo> sources;
QList<QString> referenceList = references.split("\n");
// Ignore empty lines and those that begin with "---" which is no longer used
for (auto it = referenceList.begin(); it != referenceList.end();) {
if (it->trimmed().isEmpty() || it->trimmed().startsWith("---"))
it = referenceList.erase(it);
else
++it;
}
Q_ASSERT(referenceList.size() == referencesContext.size());
for (int j = 0; j < referenceList.size(); ++j) {
QString reference = referenceList[j];
QString context = referencesContext[j];
ResultInfo info;
QTextStream refStream(&reference);
QString dummy;
int validReferenceNumber;
refStream >> validReferenceNumber >> dummy;
// Extract title (between quotes)
if (reference.contains("\"")) {
int startIndex = reference.indexOf('"') + 1;
int endIndex = reference.indexOf('"', startIndex);
info.title = reference.mid(startIndex, endIndex - startIndex);
}
// Extract author (after "By " and before the next period)
if (reference.contains("By ")) {
int startIndex = reference.indexOf("By ") + 3;
int endIndex = reference.indexOf('.', startIndex);
info.author = reference.mid(startIndex, endIndex - startIndex).trimmed();
}
// Extract date (after "Date: " and before the next period)
if (reference.contains("Date: ")) {
int startIndex = reference.indexOf("Date: ") + 6;
int endIndex = reference.indexOf('.', startIndex);
info.date = reference.mid(startIndex, endIndex - startIndex).trimmed();
}
// Extract file name (after "In " and before the "[Context]")
if (reference.contains("In ") && reference.contains(". [Context]")) {
int startIndex = reference.indexOf("In ") + 3;
int endIndex = reference.indexOf(". [Context]", startIndex);
info.file = reference.mid(startIndex, endIndex - startIndex).trimmed();
}
// Extract page number (after "Page " and before the next space)
if (reference.contains("Page ")) {
int startIndex = reference.indexOf("Page ") + 5;
int endIndex = reference.indexOf(' ', startIndex);
if (endIndex == -1) endIndex = reference.length();
info.page = reference.mid(startIndex, endIndex - startIndex).toInt();
}
// Extract lines (after "Lines " and before the next space or hyphen)
if (reference.contains("Lines ")) {
int startIndex = reference.indexOf("Lines ") + 6;
int endIndex = reference.indexOf(' ', startIndex);
if (endIndex == -1) endIndex = reference.length();
int hyphenIndex = reference.indexOf('-', startIndex);
if (hyphenIndex != -1 && hyphenIndex < endIndex) {
info.from = reference.mid(startIndex, hyphenIndex - startIndex).toInt();
info.to = reference.mid(hyphenIndex + 1, endIndex - hyphenIndex - 1).toInt();
} else {
info.from = reference.mid(startIndex, endIndex - startIndex).toInt();
}
}
info.text = context;
sources.append(info);
}
c.sources = sources;
c.consolidatedSources = consolidateSources(sources);
}
}
beginInsertRows(QModelIndex(), m_chatItems.size(), m_chatItems.size());
m_chatItems.append(c);
endInsertRows();
}
emit countChanged();
return stream.status() == QDataStream::Ok;
}
QVector<QPair<QString, QString>> text() const
{
QVector<QPair<QString, QString>> result;
for (const auto &c : m_chatItems)
result << qMakePair(c.name, c.value);
return result;
}
Q_SIGNALS:
void countChanged();
void valueChanged(int index, const QString &value);
private:
QList<ChatItem> m_chatItems;
};
#endif // CHATMODEL_H
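For orientation, here is a rough sketch of how this model is typically driven: a prompt row and a response row are appended as a turn, the response row is updated as tokens arrive, and the finished transcript can be round-tripped with serialize/deserialize over a QDataStream (version 8 and later also carries the LocalDocs sources). This is illustrative usage of the class above, not code from the application.

#include "chatmodel.h"

#include <QBuffer>
#include <QDataStream>
#include <QIODevice>

void exampleChatModelUsage()
{
    ChatModel model;
    model.appendPrompt("Prompt: ", "What is Qt?");
    model.appendResponse("Response: ", "What is Qt?");

    const int row = model.count() - 1;                 // the response row just added
    model.updateValue(row, "Qt is a C++ application framework.");
    model.updateStopped(row, true);
    model.updateCurrentResponse(row, false);

    // Round-trip the transcript through the same versioned format used on disk.
    QBuffer buf;
    buf.open(QIODevice::ReadWrite);
    QDataStream out(&buf);
    model.serialize(out, /*version*/ 8);

    buf.seek(0);
    QDataStream in(&buf);
    ChatModel restored;
    restored.deserialize(in, /*version*/ 8);
}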

@ -1,32 +1,29 @@
#include "chatviewtextprocessor.h" #include "chatviewtextprocessor.h"
#include <QAbstractTextDocumentLayout>
#include <QBrush> #include <QBrush>
#include <QChar> #include <QChar>
#include <QClipboard> #include <QClipboard>
#include <QDebug>
#include <QFlag>
#include <QFont> #include <QFont>
#include <QFontMetricsF>
#include <QGuiApplication> #include <QGuiApplication>
#include <QList> // IWYU pragma: keep #include <QList>
#include <QPair> #include <QPainter>
#include <QQuickTextDocument> #include <QQuickTextDocument>
#include <QRegularExpression> #include <QRegularExpression>
#include <QStringList> // IWYU pragma: keep #include <QStringList>
#include <QTextBlock> // IWYU pragma: keep #include <QTextBlock>
#include <QTextCharFormat> // IWYU pragma: keep #include <QTextCharFormat>
#include <QTextCursor> #include <QTextCursor>
#include <QTextDocument> #include <QTextDocument>
#include <QTextDocumentFragment> #include <QTextDocumentFragment>
#include <QTextFrame> // IWYU pragma: keep #include <QTextFrame>
#include <QTextFrameFormat> // IWYU pragma: keep #include <QTextFrameFormat>
#include <QTextTableCell> #include <QTextTableCell>
#include <QtAssert> #include <QVariant>
#include <QtLogging> #include <Qt>
#include <QtGlobal>
#include <algorithm> #include <algorithm>
#include <utility>
enum Language { enum Language {
None, None,
@ -741,7 +738,7 @@ void SyntaxHighlighter::highlightBlock(const QString &text)
case Java: case Java:
rules = javaHighlightingRules(); break; rules = javaHighlightingRules(); break;
case Go: case Go:
rules = goHighlightingRules(); break; rules = javaHighlightingRules(); break;
case Json: case Json:
rules = jsonHighlightingRules(); break; rules = jsonHighlightingRules(); break;
case Latex: case Latex:
@ -970,6 +967,8 @@ void ChatViewTextProcessor::handleCodeBlocks()
cursor.setPosition(matchesCode[index].capturedEnd(), QTextCursor::KeepAnchor); cursor.setPosition(matchesCode[index].capturedEnd(), QTextCursor::KeepAnchor);
cursor.removeSelectedText(); cursor.removeSelectedText();
int startPos = cursor.position();
QTextFrameFormat frameFormat = frameFormatBase; QTextFrameFormat frameFormat = frameFormatBase;
QString capturedText = matchesCode[index].captured(1); QString capturedText = matchesCode[index].captured(1);
QString codeLanguage; QString codeLanguage;
@ -1005,7 +1004,7 @@ void ChatViewTextProcessor::handleCodeBlocks()
QTextFrame *mainFrame = cursor.currentFrame(); QTextFrame *mainFrame = cursor.currentFrame();
cursor.setCharFormat(textFormat); cursor.setCharFormat(textFormat);
cursor.insertFrame(frameFormat); QTextFrame *frame = cursor.insertFrame(frameFormat);
QTextTable *table = cursor.insertTable(codeLanguage.isEmpty() ? 1 : 2, 1, tableFormat); QTextTable *table = cursor.insertTable(codeLanguage.isEmpty() ? 1 : 2, 1, tableFormat);
if (!codeLanguage.isEmpty()) { if (!codeLanguage.isEmpty()) {
@ -1017,6 +1016,7 @@ void ChatViewTextProcessor::handleCodeBlocks()
headerCursor.insertText(codeLanguage); headerCursor.insertText(codeLanguage);
QTextTableCell copy = headerTable->cellAt(0, 1); QTextTableCell copy = headerTable->cellAt(0, 1);
QTextCursor copyCursor = copy.firstCursorPosition(); QTextCursor copyCursor = copy.firstCursorPosition();
int startPos = copyCursor.position();
CodeCopy newCopy; CodeCopy newCopy;
newCopy.text = lines.join("\n"); newCopy.text = lines.join("\n");
newCopy.startPos = copyCursor.position(); newCopy.startPos = copyCursor.position();

@ -3,15 +3,18 @@
#include <QColor> #include <QColor>
#include <QObject> #include <QObject>
#include <QQmlEngine> // IWYU pragma: keep #include <QQmlEngine>
#include <QQuickTextDocument> #include <QQuickTextDocument> // IWYU pragma: keep
#include <QRectF>
#include <QSizeF>
#include <QString> #include <QString>
#include <QSyntaxHighlighter> #include <QSyntaxHighlighter>
#include <QVector> // IWYU pragma: keep #include <QTextObjectInterface>
#include <QtTypes> #include <QVector>
// IWYU pragma: no_forward_declare QQuickTextDocument
class QPainter;
class QTextDocument;
class QTextFormat;
struct CodeColors { struct CodeColors {
Q_GADGET Q_GADGET

@ -3,7 +3,7 @@ function(sign_target_windows tgt)
add_custom_command(TARGET ${tgt} add_custom_command(TARGET ${tgt}
POST_BUILD POST_BUILD
COMMAND AzureSignTool.exe sign COMMAND AzureSignTool.exe sign
-du "https://www.nomic.ai/gpt4all" -du "https://gpt4all.io/index.html"
-kvu https://gpt4all.vault.azure.net -kvu https://gpt4all.vault.azure.net
-kvi "$Env{AZSignGUID}" -kvi "$Env{AZSignGUID}"
-kvs "$Env{AZSignPWD}" -kvs "$Env{AZSignPWD}"

@ -0,0 +1,6 @@
#ifndef CONFIG_H
#define CONFIG_H
#define APP_VERSION "@APP_VERSION@"
#endif // CONFIG_H

@ -1,2 +0,0 @@
set(OUTPUT_DIR "@CMAKE_BINARY_DIR@")
file(COPY ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/config DESTINATION ${OUTPUT_DIR}/cpack-config)

@ -1,50 +0,0 @@
set(COMPONENT_NAME_MAIN "gpt4all")
set(CPACK_GENERATOR "IFW")
set(CPACK_VERBATIM_VARIABLES YES)
set(CPACK_IFW_VERBOSE ON)
if (CMAKE_SYSTEM_NAME MATCHES Linux)
set(CPACK_IFW_ROOT "~/Qt/Tools/QtInstallerFramework/4.6")
set(CPACK_PACKAGE_FILE_NAME "${COMPONENT_NAME_MAIN}-installer-linux")
set(CPACK_IFW_TARGET_DIRECTORY "@HomeDir@/${COMPONENT_NAME_MAIN}")
elseif (CMAKE_SYSTEM_NAME MATCHES Windows)
set(CPACK_IFW_ROOT "C:/Qt/Tools/QtInstallerFramework/4.6")
set(CPACK_IFW_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.ico")
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$")
set(CPACK_PACKAGE_FILE_NAME "${COMPONENT_NAME_MAIN}-installer-win64")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|AARCH64|arm64|ARM64)$")
set(CPACK_PACKAGE_FILE_NAME "${COMPONENT_NAME_MAIN}-installer-win64-arm")
else()
message(FATAL_ERROR "Unrecognized processor: ${CMAKE_SYSTEM_PROCESSOR}")
endif()
set(CPACK_IFW_TARGET_DIRECTORY "@HomeDir@\\${COMPONENT_NAME_MAIN}")
elseif (CMAKE_SYSTEM_NAME MATCHES Darwin)
set(CPACK_IFW_ROOT "~/Qt/Tools/QtInstallerFramework/4.6")
set(CPACK_IFW_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.icns")
set(CPACK_PACKAGE_FILE_NAME "${COMPONENT_NAME_MAIN}-installer-darwin")
set(CPACK_IFW_TARGET_DIRECTORY "@ApplicationsDir@/${COMPONENT_NAME_MAIN}")
endif()
set(CPACK_COMPONENTS_ALL ${COMPONENT_NAME_MAIN}) # exclude development components
if (APPLE AND GPT4ALL_SIGN_INSTALL)
list(APPEND CPACK_COMPONENTS_ALL maintenancetool)
endif()
set(CPACK_PACKAGE_INSTALL_DIRECTORY ${COMPONENT_NAME_MAIN})
set(CPACK_PACKAGE_VERSION_MAJOR ${PROJECT_VERSION_MAJOR})
set(CPACK_PACKAGE_VERSION_MINOR ${PROJECT_VERSION_MINOR})
set(CPACK_PACKAGE_VERSION_PATCH ${PROJECT_VERSION_PATCH})
set(CPACK_PACKAGE_HOMEPAGE_URL "https://www.nomic.ai/gpt4all")
set(CPACK_PACKAGE_ICON "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-48.png")
set(CPACK_RESOURCE_FILE_LICENSE ${CMAKE_CURRENT_SOURCE_DIR}/LICENSE)
set(CPACK_PACKAGE_EXECUTABLES "GPT4All")
set(CPACK_CREATE_DESKTOP_LINKS "GPT4All")
set(CPACK_IFW_PACKAGE_NAME "GPT4All")
set(CPACK_IFW_PACKAGE_TITLE "GPT4All Installer")
set(CPACK_IFW_PACKAGE_PUBLISHER "Nomic, Inc.")
set(CPACK_IFW_PRODUCT_URL "https://www.nomic.ai/gpt4all")
set(CPACK_IFW_PACKAGE_WIZARD_STYLE "Aero")
set(CPACK_IFW_PACKAGE_LOGO "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-48.png")
set(CPACK_IFW_PACKAGE_WINDOW_ICON "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-32.png")
set(CPACK_IFW_PACKAGE_WIZARD_SHOW_PAGE_LIST OFF)
set(CPACK_IFW_PACKAGE_CONTROL_SCRIPT "${CMAKE_CURRENT_SOURCE_DIR}/cmake/installer_control.qs")

@ -1,26 +1,17 @@
set(MACDEPLOYQT "@MACDEPLOYQT@") set(MACDEPLOYQT "@MACDEPLOYQT@")
set(COMPONENT_NAME_MAIN "@COMPONENT_NAME_MAIN@") set(COMPONENT_NAME_MAIN "@COMPONENT_NAME_MAIN@")
set(CMAKE_CURRENT_SOURCE_DIR "@CMAKE_CURRENT_SOURCE_DIR@") set(CMAKE_CURRENT_SOURCE_DIR "@CMAKE_CURRENT_SOURCE_DIR@")
set(GPT4ALL_SIGN_INSTALL "@GPT4ALL_SIGN_INSTALL@")
set(GPT4ALL_SIGNING_ID "@MAC_SIGNING_IDENTITY@") set(GPT4ALL_SIGNING_ID "@MAC_SIGNING_IDENTITY@")
set(CPACK_CONFIG_DIR "@CMAKE_BINARY_DIR@") execute_process(COMMAND ${MACDEPLOYQT} ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app -qmldir=${CMAKE_CURRENT_SOURCE_DIR} -verbose=2 -sign-for-notarization=${GPT4ALL_SIGNING_ID})
if (GPT4ALL_SIGN_INSTALL) file(GLOB MYLLAMALIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libllama*)
set(MAC_NOTARIZE -sign-for-notarization=${GPT4ALL_SIGNING_ID}) file(GLOB MYLLMODELLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libllmodel.*)
endif() file(COPY ${MYLLAMALIBS}
execute_process(COMMAND ${MACDEPLOYQT} ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app -qmldir=${CMAKE_CURRENT_SOURCE_DIR} -verbose=2 ${MAC_NOTARIZE}) DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY ${MYLLMODELLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-32.png" file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-32.png"
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data) DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-48.png" file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/icons/gpt4all-48.png"
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data) DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data)
file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.icns" file(COPY "${CMAKE_CURRENT_SOURCE_DIR}/resources/gpt4all.icns"
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data) DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data)
if (GPT4ALL_SIGN_INSTALL)
# Create signed MaintenanceTool
set(MT_DATA_DIR ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/maintenancetool/data)
file(MAKE_DIRECTORY ${MT_DATA_DIR})
execute_process(
COMMAND binarycreator --config ${CPACK_CONFIG_DIR}/cpack-config/config/config.xml --create-maintenancetool --sign ${GPT4ALL_SIGNING_ID}
WORKING_DIRECTORY ${MT_DATA_DIR}
)
endif()

@ -1,12 +0,0 @@
if(NOT DEFINED URL OR NOT DEFINED OUTPUT_PATH OR NOT DEFINED EXPECTED_MD5)
message(FATAL_ERROR "Usage: cmake -DURL=<url> -DOUTPUT_PATH=<path> -DEXPECTED_MD5=<md5> -P download_model.cmake")
endif()
message(STATUS "Downloading model from ${URL} to ${OUTPUT_PATH} ...")
file(DOWNLOAD "${URL}" "${OUTPUT_PATH}" EXPECTED_MD5 "${EXPECTED_MD5}" STATUS status)
list(GET status 0 status_code)
if(NOT status_code EQUAL 0)
message(FATAL_ERROR "Failed to download model: ${status}")
endif()

@ -1,44 +0,0 @@
var finishedText = null;
function cancelInstaller(message) {
installer.setDefaultPageVisible(QInstaller.Introduction, false);
installer.setDefaultPageVisible(QInstaller.TargetDirectory, false);
installer.setDefaultPageVisible(QInstaller.ComponentSelection, false);
installer.setDefaultPageVisible(QInstaller.ReadyForInstallation, false);
installer.setDefaultPageVisible(QInstaller.StartMenuSelection, false);
installer.setDefaultPageVisible(QInstaller.PerformInstallation, false);
installer.setDefaultPageVisible(QInstaller.LicenseCheck, false);
finishedText = message;
installer.setCanceled();
}
function vercmp(a, b) {
return a.localeCompare(b, undefined, { numeric: true, sensitivity: "base" });
}
function Controller() {
}
Controller.prototype.TargetDirectoryPageCallback = function() {
var failedReq = null;
if (systemInfo.productType === "ubuntu" && vercmp(systemInfo.productVersion, "22.04") < 0) {
failedReq = "Ubuntu 22.04 LTS";
} else if (systemInfo.productType === "macos" && vercmp(systemInfo.productVersion, "12.6") < 0) {
failedReq = "macOS Monterey 12.6";
}
if (failedReq !== null) {
cancelInstaller(
"Installation cannot continue because GPT4All does not support your operating system: " +
`${systemInfo.prettyProductName}<br/><br/>` +
`GPT4All requires ${failedReq} or newer.`
);
}
}
Controller.prototype.FinishedPageCallback = function() {
const widget = gui.currentPageWidget();
if (widget != null && finishedText != null) {
widget.MessageLabel.setText(finishedText);
}
}

@ -1,19 +0,0 @@
function Component()
{
component.ifwVersion = installer.value("FrameworkVersion");
installer.installationStarted.connect(this, Component.prototype.onInstallationStarted);
}
Component.prototype.onInstallationStarted = function()
{
if (component.updateRequested() || component.installationRequested()) {
if (installer.value("os") == "win") {
component.installerbaseBinaryPath = "@TargetDir@/installerbase.exe";
} else if (installer.value("os") == "x11") {
component.installerbaseBinaryPath = "@TargetDir@/installerbase";
} else if (installer.value("os") == "mac") {
component.installerbaseBinaryPath = "@TargetDir@/MaintenanceTool.app";
}
installer.setInstallerBaseBinary(component.installerbaseBinaryPath);
}
}

@ -6,7 +6,8 @@ Component.prototype.beginInstallation = function() {
targetDirectory = installer.value("TargetDir"); targetDirectory = installer.value("TargetDir");
}; };
Component.prototype.createOperations = function() { Component.prototype.createOperations = function()
{
try { try {
// call the base create operations function // call the base create operations function
component.createOperations(); component.createOperations();
@ -29,7 +30,7 @@ Component.prototype.createOperations = function() {
"workingDirectory=" + targetDirectory + "/bin", "workingDirectory=" + targetDirectory + "/bin",
"iconPath=" + targetDirectory + "/gpt4all.ico", "iconPath=" + targetDirectory + "/gpt4all.ico",
"iconId=0", "description=Open GPT4All"); "iconId=0", "description=Open GPT4All");
} else if (systemInfo.productType === "macos") { } else if (systemInfo.productType === "macos" || systemInfo.productType === "osx") {
var gpt4allAppPath = targetDirectory + "/bin/gpt4all.app"; var gpt4allAppPath = targetDirectory + "/bin/gpt4all.app";
var symlinkPath = targetDirectory + "/../GPT4All.app"; var symlinkPath = targetDirectory + "/../GPT4All.app";
// Remove the symlink if it already exists // Remove the symlink if it already exists
@ -55,7 +56,7 @@ Component.prototype.createOperationsForArchive = function(archive)
{ {
component.createOperationsForArchive(archive); component.createOperationsForArchive(archive);
if (systemInfo.productType === "macos") { if (systemInfo.productType === "macos" || systemInfo.productType === "osx") {
var uninstallTargetDirectory = installer.value("TargetDir"); var uninstallTargetDirectory = installer.value("TargetDir");
var symlinkPath = uninstallTargetDirectory + "/../GPT4All.app"; var symlinkPath = uninstallTargetDirectory + "/../GPT4All.app";

(File diff suppressed because it is too large.)
@ -1,76 +1,52 @@
#ifndef DATABASE_H #ifndef DATABASE_H
#define DATABASE_H #define DATABASE_H
#include "embllm.h" #include "embllm.h" // IWYU pragma: keep
#include <QByteArray>
#include <QChar>
#include <QDateTime> #include <QDateTime>
#include <QElapsedTimer>
#include <QFileInfo> #include <QFileInfo>
#include <QHash> #include <QHash>
#include <QLatin1String> #include <QLatin1String>
#include <QList> #include <QList>
#include <QMap>
#include <QObject> #include <QObject>
#include <QQueue>
#include <QSet> #include <QSet>
#include <QSqlDatabase> #include <QSqlDatabase>
#include <QString> #include <QString>
#include <QStringList> // IWYU pragma: keep #include <QStringList>
#include <QThread> #include <QThread>
#include <QUrl> #include <QUrl>
#include <QVector> // IWYU pragma: keep #include <QVector>
#include <QtAssert>
#include <atomic>
#include <cstddef> #include <cstddef>
#include <list>
#include <map>
#include <memory>
#include <optional>
#include <utility>
#include <vector> // IWYU pragma: keep
using namespace Qt::Literals::StringLiterals; using namespace Qt::Literals::StringLiterals;
class Database;
class DocumentReader;
class QFileSystemWatcher; class QFileSystemWatcher;
class QSqlQuery; class QSqlError;
class QTextStream; class QTextStream;
class QTimer; class QTimer;
/* Version 0: GPT4All v2.4.3, full-text search /* Version 0: GPT4All v2.4.3, full-text search
* Version 1: GPT4All v2.5.3, embeddings in hsnwlib * Version 1: GPT4All v2.5.3, embeddings in hsnwlib
* Version 2: GPT4All v3.0.0, embeddings in sqlite * Version 2: GPT4All v3.0.0, embeddings in sqlite */
* Version 3: GPT4All v3.4.0, hybrid search
*/
// minimum supported version // minimum supported version
static const int LOCALDOCS_MIN_VER = 1; static const int LOCALDOCS_MIN_VER = 1;
// FIXME: (Adam) The next time we bump the version we should add triggers to manage the fts external
// content table as recommended in the official documentation to keep the fts index in sync
// See: https://www.sqlite.org/fts5.html#external_content_tables
// FIXME: (Adam) The fts virtual table should include the chunk_id explicitly instead of relying upon
// the id of the two tables to be in sync
// current version // current version
static const int LOCALDOCS_VERSION = 3; static const int LOCALDOCS_VERSION = 2;
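The FIXME above points at the trigger-based pattern the SQLite documentation recommends for external-content FTS5 tables: insert and delete triggers on the content table keep the full-text index in sync instead of relying on the two tables sharing row ids. Below is a hedged sketch of what that could look like through Qt's SQL layer; the chunks / chunks_fts table and column names are assumptions for illustration, not the actual LocalDocs schema.

#include <QSqlQuery>
#include <QString>

// Hypothetical schema names; see https://www.sqlite.org/fts5.html#external_content_tables
static bool installFtsSyncTriggers(QSqlQuery &q)
{
    return q.exec(QStringLiteral(
               "CREATE TRIGGER IF NOT EXISTS chunks_ai AFTER INSERT ON chunks BEGIN"
               "  INSERT INTO chunks_fts(rowid, text) VALUES (new.id, new.text);"
               " END"))
        && q.exec(QStringLiteral(
               "CREATE TRIGGER IF NOT EXISTS chunks_ad AFTER DELETE ON chunks BEGIN"
               "  INSERT INTO chunks_fts(chunks_fts, rowid, text) VALUES ('delete', old.id, old.text);"
               " END"));
}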
struct DocumentInfo struct DocumentInfo
{ {
using key_type = std::pair<int, QString>; int folder;
QFileInfo doc;
int folder; int currentPage = 0;
QFileInfo file; size_t currentPosition = 0;
bool currentlyProcessing = false; bool currentlyProcessing = false;
bool isPdf() const {
key_type key() const { return {folder, file.canonicalFilePath()}; } // for comparison return doc.suffix().compare(u"pdf"_s, Qt::CaseInsensitive) == 0;
}
bool isPdf () const { return !file.suffix().compare("pdf"_L1, Qt::CaseInsensitive); }
bool isDocx() const { return !file.suffix().compare("docx"_L1, Qt::CaseInsensitive); }
}; };
struct ResultInfo { struct ResultInfo {
@ -165,36 +141,6 @@ struct CollectionItem {
}; };
Q_DECLARE_METATYPE(CollectionItem) Q_DECLARE_METATYPE(CollectionItem)
class ChunkStreamer {
public:
enum class Status { DOC_COMPLETE, INTERRUPTED, ERROR, BINARY_SEEN };
explicit ChunkStreamer(Database *database);
~ChunkStreamer();
void setDocument(DocumentInfo doc, int documentId, const QString &embeddingModel);
std::optional<DocumentInfo::key_type> currentDocKey() const;
void reset();
Status step();
private:
Database *m_database;
std::optional<DocumentInfo::key_type> m_docKey;
std::unique_ptr<DocumentReader> m_reader; // may be invalid, always compare key first
int m_documentId;
QString m_embeddingModel;
QString m_title;
QString m_author;
QString m_subject;
QString m_keywords;
// working state
QString m_chunk; // has a trailing space for convenience
int m_nChunkWords = 0;
int m_page = 0;
};
class Database : public QObject class Database : public QObject
{ {
Q_OBJECT Q_OBJECT
@ -206,7 +152,6 @@ public:
public Q_SLOTS: public Q_SLOTS:
void start(); void start();
bool scanQueueInterrupted() const;
void scanQueueBatch(); void scanQueueBatch();
void scanDocuments(int folder_id, const QString &folder_path); void scanDocuments(int folder_id, const QString &folder_path);
void forceIndexing(const QString &collection, const QString &embedding_model); void forceIndexing(const QString &collection, const QString &embedding_model);
@ -236,12 +181,6 @@ private:
void commit(); void commit();
void rollback(); void rollback();
bool addChunk(QSqlQuery &q, int document_id, const QString &chunk_text, const QString &file,
const QString &title, const QString &author, const QString &subject, const QString &keywords,
int page, int from, int to, int words, int *chunk_id);
bool refreshDocumentIdCache(QSqlQuery &q);
bool removeChunksByDocumentId(QSqlQuery &q, int document_id);
bool sqlRemoveDocsByFolderPath(QSqlQuery &q, const QString &path);
bool hasContent(); bool hasContent();
// not found -> 0, , exists and has content -> 1, error -> -1 // not found -> 0, , exists and has content -> 1, error -> -1
int openDatabase(const QString &modelPath, bool create = true, int ver = LOCALDOCS_VERSION); int openDatabase(const QString &modelPath, bool create = true, int ver = LOCALDOCS_VERSION);
@ -255,35 +194,19 @@ private:
void appendChunk(const EmbeddingChunk &chunk); void appendChunk(const EmbeddingChunk &chunk);
void sendChunkList(); void sendChunkList();
void updateFolderToIndex(int folder_id, size_t countForFolder, bool sendChunks = true); void updateFolderToIndex(int folder_id, size_t countForFolder, bool sendChunks = true);
void handleDocumentError(const QString &errorMessage,
int document_id, const QString &document_path, const QSqlError &error);
size_t countOfDocuments(int folder_id) const; size_t countOfDocuments(int folder_id) const;
size_t countOfBytes(int folder_id) const; size_t countOfBytes(int folder_id) const;
DocumentInfo dequeueDocument(); DocumentInfo dequeueDocument();
void removeFolderFromDocumentQueue(int folder_id); void removeFolderFromDocumentQueue(int folder_id);
void enqueueDocumentInternal(DocumentInfo &&info, bool prepend = false); void enqueueDocumentInternal(const DocumentInfo &info, bool prepend = false);
void enqueueDocuments(int folder_id, std::list<DocumentInfo> &&infos); void enqueueDocuments(int folder_id, const QVector<DocumentInfo> &infos);
void scanQueue(); void scanQueue();
bool ftsIntegrityCheck();
bool cleanDB(); bool cleanDB();
void addFolderToWatch(const QString &path); void addFolderToWatch(const QString &path);
void removeFolderFromWatch(const QString &path); void removeFolderFromWatch(const QString &path);
static QList<int> searchEmbeddingsHelper(const std::vector<float> &query, QSqlQuery &q, int nNeighbors); QList<int> searchEmbeddings(const std::vector<float> &query, const QList<QString> &collections, int nNeighbors);
QList<int> searchEmbeddings(const std::vector<float> &query, const QList<QString> &collections,
int nNeighbors);
struct BM25Query {
QString input;
QString query;
bool isExact = false;
int qlength = 0;
int ilength = 0;
int rlength = 0;
};
QList<Database::BM25Query> queriesForFTS5(const QString &input);
QList<int> searchBM25(const QString &query, const QList<QString> &collections, BM25Query &bm25q, int k);
QList<int> scoreChunks(const std::vector<float> &query, const QList<int> &chunks);
float computeBM25Weight(const BM25Query &bm25q);
QList<int> reciprocalRankFusion(const std::vector<float> &query, const QList<int> &embeddingResults,
const QList<int> &bm25Results, const BM25Query &bm25q, int k);
QList<int> searchDatabase(const QString &query, const QList<QString> &collections, int k);
void setStartUpdateTime(CollectionItem &item); void setStartUpdateTime(CollectionItem &item);
void setLastUpdateTime(CollectionItem &item); void setLastUpdateTime(CollectionItem &item);
@ -300,9 +223,8 @@ private:
QSqlDatabase m_db; QSqlDatabase m_db;
int m_chunkSize; int m_chunkSize;
QStringList m_scannedFileExtensions; QStringList m_scannedFileExtensions;
QTimer *m_scanIntervalTimer; QTimer *m_scanTimer;
QElapsedTimer m_scanDurationTimer; QMap<int, QQueue<DocumentInfo>> m_docsToScan;
std::map<int, std::list<DocumentInfo>> m_docsToScan;
QList<ResultInfo> m_retrieve; QList<ResultInfo> m_retrieve;
QThread m_dbThread; QThread m_dbThread;
QFileSystemWatcher *m_watcher; QFileSystemWatcher *m_watcher;
@ -311,10 +233,6 @@ private:
QVector<EmbeddingChunk> m_chunkList; QVector<EmbeddingChunk> m_chunkList;
QHash<int, CollectionItem> m_collectionMap; // used only for tracking indexing/embedding progress QHash<int, CollectionItem> m_collectionMap; // used only for tracking indexing/embedding progress
std::atomic<bool> m_databaseValid; std::atomic<bool> m_databaseValid;
ChunkStreamer m_chunkStreamer;
QSet<int> m_documentIdCache; // cached list of documents with chunks for fast lookup
friend class ChunkStreamer;
}; };
#endif // DATABASE_H #endif // DATABASE_H
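The hybrid-search members above (searchBM25, searchEmbeddings, reciprocalRankFusion) merge a keyword ranking with a vector ranking. For reference, the textbook reciprocal-rank-fusion step looks roughly like the sketch below; the real implementation additionally applies a query-dependent BM25 weight via computeBM25Weight, so treat this as the general formula rather than the application's exact scoring.

#include <QHash>
#include <QList>

#include <algorithm>

// Generic reciprocal rank fusion over two ranked lists of chunk ids.
// 60 is the conventional RRF damping constant.
static QList<int> fuseRanks(const QList<int> &embeddingResults, const QList<int> &bm25Results, int k)
{
    QHash<int, float> score;
    for (int rank = 0; rank < embeddingResults.size(); ++rank)
        score[embeddingResults[rank]] += 1.0f / (60.0f + rank + 1);
    for (int rank = 0; rank < bm25Results.size(); ++rank)
        score[bm25Results[rank]] += 1.0f / (60.0f + rank + 1);

    QList<int> fused = score.keys();
    std::sort(fused.begin(), fused.end(),
              [&](int a, int b) { return score.value(a) > score.value(b); });
    if (fused.size() > k)
        fused.resize(k);
    return fused;
}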

View File

@ -1,51 +0,0 @@
include(FetchContent)
set(BUILD_SHARED_LIBS OFF)
set(FMT_INSTALL OFF)
add_subdirectory(fmt)
set(QAPPLICATION_CLASS QApplication)
add_subdirectory(SingleApplication)
set(DUCKX_INSTALL OFF)
add_subdirectory(DuckX)
set(QT_VERSION_MAJOR 6)
add_subdirectory(QXlsx/QXlsx)
if (NOT GPT4ALL_USING_QTPDF)
# If we do not use QtPDF, we need to get PDFium.
set(GPT4ALL_PDFIUM_TAG "chromium/6996")
if (CMAKE_SYSTEM_NAME MATCHES Linux)
FetchContent_Declare(
pdfium
URL "https://github.com/bblanchon/pdfium-binaries/releases/download/${GPT4ALL_PDFIUM_TAG}/pdfium-linux-x64.tgz"
URL_HASH "SHA256=68b381b87efed539f2e33ae1e280304c9a42643a878cc296c1d66a93b0cb4335"
)
elseif (CMAKE_SYSTEM_NAME MATCHES Windows)
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|AMD64|amd64)$")
FetchContent_Declare(
pdfium
URL "https://github.com/bblanchon/pdfium-binaries/releases/download/${GPT4ALL_PDFIUM_TAG}/pdfium-win-x64.tgz"
URL_HASH "SHA256=83e714c302ceacccf403826d5cb57ea39b77f393d83b8d5781283012774a9378"
)
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|AARCH64|arm64|ARM64)$")
FetchContent_Declare(
pdfium
URL "https://github.com/bblanchon/pdfium-binaries/releases/download/${GPT4ALL_PDFIUM_TAG}/pdfium-win-arm64.tgz"
URL_HASH "SHA256=78e77e871453a4915cbf66fb381b951c9932f88a747c6b2b33c9f27ec2371445"
)
endif()
elseif (CMAKE_SYSTEM_NAME MATCHES Darwin)
FetchContent_Declare(
pdfium
URL "https://github.com/bblanchon/pdfium-binaries/releases/download/${GPT4ALL_PDFIUM_TAG}/pdfium-mac-univ.tgz"
URL_HASH "SHA256=e7577f3242ff9c1df50025f9615673a43601a201bc51ee4792975f98920793a2"
)
endif()
FetchContent_MakeAvailable(pdfium)
find_package(PDFium REQUIRED PATHS "${pdfium_SOURCE_DIR}" NO_DEFAULT_PATH)
endif()

@ -1 +0,0 @@
Subproject commit 6e31dfb280e2107fbf4f6a15098c38b014f1bbcc

@ -1 +0,0 @@
Subproject commit 29e81b369128525749dcb6516195b6b062eda955

@ -1 +0,0 @@
Subproject commit 21bdef01eddcbd78044eea1d50b9dee08d218ff2

@ -1 +0,0 @@
Subproject commit 0c9fce2ffefecfdce794e1859584e25877b7b592

@ -1 +0,0 @@
Subproject commit 606b6347edf0758c531abb6c36743e09a4c48a84

@ -1 +0,0 @@
Subproject commit e97bb2442cd6ab3d5bb5f5a3e8a1f7d6081d613b

@ -1 +0,0 @@
Subproject commit 9e59f1036657303b29eaf709945f339e403e5f2f

@ -1,11 +0,0 @@
-r test-requirements.txt
# dev tools
flake8~=7.1
mypy~=1.12
pytype>=2024.10.11
wemake-python-styleguide~=0.19.2
# type stubs and other optional modules
types-requests~=2.32
urllib3[socks]

@ -10,37 +10,32 @@
#include <QDebug> #include <QDebug>
#include <QGlobalStatic> #include <QGlobalStatic>
#include <QGuiApplication> #include <QGuiApplication>
#include <QIODevice> // IWYU pragma: keep #include <QIODevice>
#include <QJsonArray> #include <QJsonArray>
#include <QJsonDocument> #include <QJsonDocument>
#include <QJsonObject> #include <QJsonObject>
#include <QJsonValue> #include <QJsonValue>
#include <QKeyValueIterator>
#include <QLocale> #include <QLocale>
#include <QNetworkRequest> #include <QNetworkRequest>
#include <QPair> // IWYU pragma: keep #include <QPair>
#include <QRegularExpression>
#include <QRegularExpressionMatch>
#include <QSettings> #include <QSettings>
#include <QSslConfiguration> #include <QSslConfiguration>
#include <QSslSocket> #include <QSslSocket>
#include <QStringList> // IWYU pragma: keep #include <QStringList>
#include <QTextStream> #include <QTextStream>
#include <QUrl> #include <QUrl>
#include <QVariant> #include <QVariant>
#include <QVector> // IWYU pragma: keep #include <QVector>
#include <Qt> #include <Qt>
#include <QtAssert>
#include <QtLogging> #include <QtLogging>
#include <QtMinMax>
#include <algorithm>
#include <compare> #include <compare>
#include <cstddef> #include <cstddef>
#include <utility> #include <utility>
using namespace Qt::Literals::StringLiterals; using namespace Qt::Literals::StringLiterals;
class MyDownload: public Download { }; class MyDownload: public Download { };
Q_GLOBAL_STATIC(MyDownload, downloadInstance) Q_GLOBAL_STATIC(MyDownload, downloadInstance)
Download *Download::globalInstance() Download *Download::globalInstance()
@ -63,6 +58,11 @@ Download::Download()
m_startTime = QDateTime::currentDateTime(); m_startTime = QDateTime::currentDateTime();
} }
static bool operator==(const ReleaseInfo& lhs, const ReleaseInfo& rhs)
{
return lhs.version == rhs.version;
}
std::strong_ordering Download::compareAppVersions(const QString &a, const QString &b) std::strong_ordering Download::compareAppVersions(const QString &a, const QString &b)
{ {
static QRegularExpression versionRegex(R"(^(\d+(?:\.\d+){0,2})(-.+)?$)"); static QRegularExpression versionRegex(R"(^(\d+(?:\.\d+){0,2})(-.+)?$)");
@ -396,9 +396,8 @@ void Download::parseReleaseJsonFile(const QByteArray &jsonData)
QJsonObject obj = value.toObject(); QJsonObject obj = value.toObject();
QString version = obj["version"].toString(); QString version = obj["version"].toString();
// "notes" field intentionally has a trailing newline for compatibility QString notes = obj["notes"].toString();
QString notes = obj["notes"].toString().trimmed(); QString contributors = obj["contributors"].toString();
QString contributors = obj["contributors"].toString().trimmed();
ReleaseInfo releaseInfo; ReleaseInfo releaseInfo;
releaseInfo.version = version; releaseInfo.version = version;
releaseInfo.notes = notes; releaseInfo.notes = notes;

@ -13,14 +13,10 @@
#include <QSslError> #include <QSslError>
#include <QString> #include <QString>
#include <QThread> #include <QThread>
#include <QtTypes> #include <QtGlobal>
// IWYU pragma: no_forward_declare QFile
// IWYU pragma: no_forward_declare QList
// IWYU pragma: no_forward_declare QSslError
class QByteArray; class QByteArray;
struct ReleaseInfo { struct ReleaseInfo {
Q_GADGET Q_GADGET
Q_PROPERTY(QString version MEMBER version) Q_PROPERTY(QString version MEMBER version)

@ -1,35 +1,35 @@
#include "embllm.h" #include "embllm.h"
#include "modellist.h"
#include "mysettings.h" #include "mysettings.h"
#include <gpt4all-backend/llmodel.h> #include "../gpt4all-backend/llmodel.h"
#include <QCoreApplication> #include <QCoreApplication>
#include <QDebug> #include <QDebug>
#include <QFile>
#include <QFileInfo> #include <QFileInfo>
#include <QGuiApplication> #include <QGuiApplication>
#include <QIODevice>
#include <QJsonArray> #include <QJsonArray>
#include <QJsonDocument> #include <QJsonDocument>
#include <QJsonObject> #include <QJsonObject>
#include <QJsonValue>
#include <QList> #include <QList>
#include <QMutexLocker> // IWYU pragma: keep #include <QMutexLocker>
#include <QNetworkAccessManager> #include <QNetworkAccessManager>
#include <QNetworkReply> #include <QNetworkReply>
#include <QNetworkRequest> #include <QNetworkRequest>
#include <QUrl> #include <QUrl>
#include <Qt> #include <Qt>
#include <QtAssert> #include <QtGlobal>
#include <QtLogging> #include <QtLogging>
#include <exception> #include <exception>
#include <string>
#include <utility> #include <utility>
#include <vector> #include <vector>
using namespace Qt::Literals::StringLiterals; using namespace Qt::Literals::StringLiterals;
static const QString EMBEDDING_MODEL_NAME = u"nomic-embed-text-v1.5"_s; static const QString EMBEDDING_MODEL_NAME = u"nomic-embed-text-v1.5"_s;
static const QString LOCAL_EMBEDDING_MODEL = u"nomic-embed-text-v1.5.f16.gguf"_s; static const QString LOCAL_EMBEDDING_MODEL = u"nomic-embed-text-v1.5.f16.gguf"_s;
@ -359,11 +359,8 @@ void EmbeddingLLMWorker::handleFinished()
if (retrievedData.isValid() && retrievedData.canConvert<QVector<EmbeddingChunk>>()) if (retrievedData.isValid() && retrievedData.canConvert<QVector<EmbeddingChunk>>())
chunks = retrievedData.value<QVector<EmbeddingChunk>>(); chunks = retrievedData.value<QVector<EmbeddingChunk>>();
QVariant response; QVariant response = reply->attribute(QNetworkRequest::HttpStatusCodeAttribute);
if (reply->error() != QNetworkReply::NoError) { Q_ASSERT(response.isValid());
response = reply->attribute(QNetworkRequest::HttpStatusCodeAttribute);
Q_ASSERT(response.isValid());
}
bool ok; bool ok;
int code = response.toInt(&ok); int code = response.toInt(&ok);
if (!ok || code != 200) { if (!ok || code != 200) {

@ -5,10 +5,10 @@
#include <QMutex> #include <QMutex>
#include <QObject> #include <QObject>
#include <QString> #include <QString>
#include <QStringList> // IWYU pragma: keep #include <QStringList>
#include <QThread> #include <QThread>
#include <QVariant> #include <QVariant>
#include <QVector> // IWYU pragma: keep #include <QVector>
#include <atomic> #include <atomic>
#include <vector> #include <vector>
@ -16,7 +16,6 @@
class LLModel; class LLModel;
class QNetworkAccessManager; class QNetworkAccessManager;
struct EmbeddingChunk { struct EmbeddingChunk {
QString model; // TODO(jared): use to select model QString model; // TODO(jared): use to select model
int folder_id; int folder_id;

@ -32,7 +32,7 @@
<image>https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-chat/flatpak-manifest/screenshots/model.png</image> <image>https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-chat/flatpak-manifest/screenshots/model.png</image>
</screenshot> </screenshot>
</screenshots> </screenshots>
<url type="homepage">https://www.nomic.ai/gpt4all</url> <url type="homepage">https://gpt4all.io</url>
<url type="bugtracker">https://github.com/nomic-ai/gpt4all/issues</url> <url type="bugtracker">https://github.com/nomic-ai/gpt4all/issues</url>
<url type="vcs-browser">https://github.com/nomic-ai/gpt4all</url> <url type="vcs-browser">https://github.com/nomic-ai/gpt4all</url>
<releases> <releases>

@ -1 +1,3 @@
<svg xmlns="http://www.w3.org/2000/svg" width="32" height="32" fill="#000000" viewBox="0 0 256 256"><path d="M227.31,73.37,182.63,28.68a16,16,0,0,0-22.63,0L36.69,152A15.86,15.86,0,0,0,32,163.31V208a16,16,0,0,0,16,16H92.69A15.86,15.86,0,0,0,104,219.31L227.31,96a16,16,0,0,0,0-22.63ZM92.69,208H48V163.31l88-88L180.69,120ZM192,108.68,147.31,64l24-24L216,84.68Z"></path></svg> <svg width="32" height="32" viewBox="0 0 32 32" fill="none" xmlns="http://www.w3.org/2000/svg">
<path d="M28.4138 9.17125L22.8288 3.585C22.643 3.39924 22.4225 3.25188 22.1799 3.15134C21.9372 3.0508 21.6771 2.99905 21.4144 2.99905C21.1517 2.99905 20.8916 3.0508 20.6489 3.15134C20.4062 3.25188 20.1857 3.39924 20 3.585L4.58626 19C4.39973 19.185 4.25185 19.4053 4.15121 19.648C4.05057 19.8907 3.99917 20.151 4.00001 20.4138V26C4.00001 26.5304 4.21072 27.0391 4.5858 27.4142C4.96087 27.7893 5.46958 28 6.00001 28H11.5863C11.849 28.0008 12.1093 27.9494 12.352 27.8488C12.5947 27.7482 12.815 27.6003 13 27.4138L28.4138 12C28.5995 11.8143 28.7469 11.5938 28.8474 11.3511C28.948 11.1084 28.9997 10.8483 28.9997 10.5856C28.9997 10.3229 28.948 10.0628 28.8474 9.82015C28.7469 9.57747 28.5995 9.35698 28.4138 9.17125ZM6.41376 20L17 9.41375L19.0863 11.5L8.50001 22.085L6.41376 20ZM6.00001 22.4138L9.58626 26H6.00001V22.4138ZM12 25.5863L9.91376 23.5L20.5 12.9138L22.5863 15L12 25.5863ZM24 13.5863L18.4138 8L21.4138 5L27 10.585L24 13.5863Z" fill="black"/>
</svg>

@ -1 +0,0 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 256"><rect width="256" height="256" fill="none"/><path d="M36,152v56H52a28,28,0,0,0,0-56Z" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><path d="M216,200.87A22.12,22.12,0,0,1,200,208c-13.26,0-24-12.54-24-28s10.74-28,24-28a22.12,22.12,0,0,1,16,7.13" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><path d="M48,112V40a8,8,0,0,1,8-8h96l56,56v24" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><polyline points="152 32 152 88 208 88" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><ellipse cx="128" cy="180" rx="24" ry="28" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/></svg>

@ -1 +0,0 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 256"><rect width="256" height="256" fill="none"/><line x1="152" y1="96" x2="208" y2="96" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><line x1="152" y1="160" x2="208" y2="160" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><path d="M64,72V40a8,8,0,0,1,8-8H200a8,8,0,0,1,8,8V216a8,8,0,0,1-8,8H72a8,8,0,0,1-8-8V184" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><polyline points="64 104 76 152 92 120 108 152 120 104" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><rect x="32" y="72" width="120" height="112" rx="8" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/></svg>

@ -1 +0,0 @@
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 256 256"><rect width="256" height="256" fill="none"/><polyline points="148 208 120 208 120 152" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><path d="M48,112V40a8,8,0,0,1,8-8h96l56,56v24" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><polyline points="152 32 152 88 208 88" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><line x1="48" y1="152" x2="88" y2="208" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><line x1="88" y1="152" x2="48" y2="208" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/><path d="M203.9,153.6s-29.43-7.78-31.8,11,38.43,10.12,35.78,30.72c-2.47,19.16-31.78,11-31.78,11" fill="none" stroke="currentColor" stroke-linecap="round" stroke-linejoin="round" stroke-width="16"/></svg>

@ -1,3 +0,0 @@
<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN" "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 26.3 26.3"><defs><style>.cls-1{fill:#f05237;}.cls-2{fill:#fff;}</style></defs><g id="Layer_2" data-name="Layer 2"><g id="Content"><circle class="cls-1" cx="13.15" cy="13.15" r="13.15"/><path class="cls-2" d="M13.17,6.88a4.43,4.43,0,0,0,0,8.85h1.45V14.07H13.17a2.77,2.77,0,1,1,2.77-2.76v4.07a2.74,2.74,0,0,1-4.67,2L10.1,18.51a4.37,4.37,0,0,0,3.07,1.29h.06a4.42,4.42,0,0,0,4.36-4.4V11.2a4.43,4.43,0,0,0-4.42-4.32"/></g></g></svg>

View File

@ -1 +0,0 @@
<svg viewBox="0 0 512 512" xmlns="http://www.w3.org/2000/svg" fill-rule="evenodd" clip-rule="evenodd" stroke-linejoin="round" stroke-miterlimit="2"><path d="M189.08 303.228H94.587l.044-94.446h94.497l-.048 94.446z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M283.528 397.674h-94.493l.044-94.446h94.496l-.047 94.446z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M283.575 303.228H189.08l.046-94.446h94.496l-.047 94.446z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M378.07 303.228h-94.495l.044-94.446h94.498l-.047 94.446zM189.128 208.779H94.633l.044-94.448h94.498l-.047 94.448zM378.115 208.779h-94.494l.045-94.448h94.496l-.047 94.448zM94.587 303.227H.093l.044-96.017h94.496l-.046 96.017z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M94.633 208.779H.138l.046-94.448H94.68l-.047 94.448z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M94.68 115.902H.185L.23 19.885h94.498l-.047 96.017zM472.657 114.331h-94.495l.044-94.446h94.497l-.046 94.446zM94.54 399.244H.046l.044-97.588h94.497l-.047 97.588z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M94.495 492.123H0l.044-94.446H94.54l-.045 94.446zM472.563 303.228H378.07l.044-94.446h94.496l-.047 94.446zM472.61 208.779h-94.495l.044-94.448h94.498l-.047 94.448z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M472.517 397.674h-94.494l.044-94.446h94.497l-.047 94.446z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M472.47 492.121h-94.493l.044-96.017h94.496l-.047 96.017z" fill="#1c1c1b" fill-rule="nonzero"/><path d="M228.375 303.22h-96.061l.046-94.446h96.067l-.052 94.446z" fill="#ff7000" fill-rule="nonzero"/><path d="M322.827 397.666h-94.495l.044-96.018h94.498l-.047 96.018z" fill="#ff4900" fill-rule="nonzero"/><path d="M324.444 303.22h-97.636l.046-94.446h97.638l-.048 94.446z" fill="#ff7000" fill-rule="nonzero"/><path d="M418.938 303.22h-96.064l.045-94.446h96.066l-.047 94.446z" fill="#ff7000" fill-rule="nonzero"/><path d="M228.423 208.77H132.36l.045-94.445h96.066l-.05 94.446zM418.985 208.77H322.92l.044-94.445h96.069l-.048 94.446z" fill="#ffa300" fill-rule="nonzero"/><path d="M133.883 304.79H39.392l.044-96.017h94.496l-.049 96.017z" fill="#ff7000" fill-rule="nonzero"/><path d="M133.929 208.77H39.437l.044-95.445h94.496l-.048 95.445z" fill="#ffa300" fill-rule="nonzero"/><path d="M133.976 114.325H39.484l.044-94.448h94.497l-.05 94.448zM511.954 115.325h-94.493l.044-95.448h94.497l-.048 95.448z" fill="#ffce00" fill-rule="nonzero"/><path d="M133.836 399.667H39.345l.044-96.447h94.496l-.049 96.447z" fill="#ff4900" fill-rule="nonzero"/><path d="M133.79 492.117H39.3l.044-94.448h94.496l-.049 94.448z" fill="#ff0107" fill-rule="nonzero"/><path d="M511.862 303.22h-94.495l.046-94.446h94.496l-.047 94.446z" fill="#ff7000" fill-rule="nonzero"/><path d="M511.907 208.77h-94.493l.044-94.445h94.496l-.047 94.446z" fill="#ffa300" fill-rule="nonzero"/><path d="M511.815 398.666h-94.493l.044-95.447h94.496l-.047 95.447z" fill="#ff4900" fill-rule="nonzero"/><path d="M511.77 492.117h-94.496l.046-94.448h94.496l-.047 94.448z" fill="#ff0107" fill-rule="nonzero"/></svg>

View File

@ -1,2 +0,0 @@
<?xml version="1.0" encoding="utf-8"?><!-- Uploaded to: SVG Repo, www.svgrepo.com, Generator: SVG Repo Mixer Tools -->
<svg fill="#000000" width="800px" height="800px" viewBox="0 0 24 24" role="img" xmlns="http://www.w3.org/2000/svg"><title>OpenAI icon</title><path d="M22.2819 9.8211a5.9847 5.9847 0 0 0-.5157-4.9108 6.0462 6.0462 0 0 0-6.5098-2.9A6.0651 6.0651 0 0 0 4.9807 4.1818a5.9847 5.9847 0 0 0-3.9977 2.9 6.0462 6.0462 0 0 0 .7427 7.0966 5.98 5.98 0 0 0 .511 4.9107 6.051 6.051 0 0 0 6.5146 2.9001A5.9847 5.9847 0 0 0 13.2599 24a6.0557 6.0557 0 0 0 5.7718-4.2058 5.9894 5.9894 0 0 0 3.9977-2.9001 6.0557 6.0557 0 0 0-.7475-7.0729zm-9.022 12.6081a4.4755 4.4755 0 0 1-2.8764-1.0408l.1419-.0804 4.7783-2.7582a.7948.7948 0 0 0 .3927-.6813v-6.7369l2.02 1.1686a.071.071 0 0 1 .038.052v5.5826a4.504 4.504 0 0 1-4.4945 4.4944zm-9.6607-4.1254a4.4708 4.4708 0 0 1-.5346-3.0137l.142.0852 4.783 2.7582a.7712.7712 0 0 0 .7806 0l5.8428-3.3685v2.3324a.0804.0804 0 0 1-.0332.0615L9.74 19.9502a4.4992 4.4992 0 0 1-6.1408-1.6464zM2.3408 7.8956a4.485 4.485 0 0 1 2.3655-1.9728V11.6a.7664.7664 0 0 0 .3879.6765l5.8144 3.3543-2.0201 1.1685a.0757.0757 0 0 1-.071 0l-4.8303-2.7865A4.504 4.504 0 0 1 2.3408 7.872zm16.5963 3.8558L13.1038 8.364 15.1192 7.2a.0757.0757 0 0 1 .071 0l4.8303 2.7913a4.4944 4.4944 0 0 1-.6765 8.1042v-5.6772a.79.79 0 0 0-.407-.667zm2.0107-3.0231l-.142-.0852-4.7735-2.7818a.7759.7759 0 0 0-.7854 0L9.409 9.2297V6.8974a.0662.0662 0 0 1 .0284-.0615l4.8303-2.7866a4.4992 4.4992 0 0 1 6.6802 4.66zM8.3065 12.863l-2.02-1.1638a.0804.0804 0 0 1-.038-.0567V6.0742a4.4992 4.4992 0 0 1 7.3757-3.4537l-.142.0805L8.704 5.459a.7948.7948 0 0 0-.3927.6813zm1.0976-2.3654l2.602-1.4998 2.6069 1.4998v2.9994l-2.5974 1.4997-2.6067-1.4997Z"/></svg>

View File

@ -1,45 +0,0 @@
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<svg
viewBox="0 0 256 256"
version="1.1"
id="svg6"
sodipodi:docname="paperclip-horizontal.svg"
inkscape:version="1.1.2 (0a00cf5339, 2022-02-04)"
xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape"
xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd"
xmlns="http://www.w3.org/2000/svg"
xmlns:svg="http://www.w3.org/2000/svg">
<defs
id="defs10" />
<sodipodi:namedview
id="namedview8"
pagecolor="#ffffff"
bordercolor="#666666"
borderopacity="1.0"
inkscape:pageshadow="2"
inkscape:pageopacity="0.0"
inkscape:pagecheckerboard="0"
showgrid="false"
inkscape:zoom="4.421875"
inkscape:cx="127.88693"
inkscape:cy="127.88693"
inkscape:window-width="2560"
inkscape:window-height="1495"
inkscape:window-x="0"
inkscape:window-y="0"
inkscape:window-maximized="1"
inkscape:current-layer="svg6" />
<rect
width="256"
height="256"
fill="none"
id="rect2" />
<path
d="m 144,80 v 112 a -16,16 0 0 1 -32,0 V 48 a -32,32 0 0 1 64,0 v 144 a -48,48 0 0 1 -96,0 V 80"
fill="none"
stroke="currentColor"
stroke-linecap="round"
stroke-linejoin="round"
stroke-width="16"
id="path4" />
</svg>

View File

@ -1 +0,0 @@
<svg xmlns="http://www.w3.org/2000/svg" width="32" height="32" fill="#000000" viewBox="0 0 256 256"><path d="M128,24A104,104,0,1,0,232,128,104.11,104.11,0,0,0,128,24Zm0,192a88,88,0,1,1,88-88A88.1,88.1,0,0,1,128,216Zm48-88a8,8,0,0,1-8,8H136v32a8,8,0,0,1-16,0V136H88a8,8,0,0,1,0-16h32V88a8,8,0,0,1,16,0v32h32A8,8,0,0,1,176,128Z"></path></svg>

View File

@ -1 +0,0 @@
<svg xmlns="http://www.w3.org/2000/svg" width="32" height="32" fill="#000000" viewBox="0 0 256 256"><path d="M216,40H40A16,16,0,0,0,24,56V200a16,16,0,0,0,16,16H216a16,16,0,0,0,16-16V56A16,16,0,0,0,216,40Zm0,16V88H40V56Zm0,144H40V104H216v96Z"></path></svg>

View File

@ -1,7 +1,7 @@
#include "llm.h" #include "llm.h"
#include <gpt4all-backend/llmodel.h> #include "../gpt4all-backend/llmodel.h"
#include <gpt4all-backend/sysinfo.h> #include "../gpt4all-backend/sysinfo.h"
#include <QCoreApplication> #include <QCoreApplication>
#include <QDebug> #include <QDebug>
@ -12,9 +12,6 @@
#include <QSettings> #include <QSettings>
#include <QUrl> #include <QUrl>
#include <QtLogging> #include <QtLogging>
#include <QtSystemDetection>
#include <string>
#ifdef GPT4ALL_OFFLINE_INSTALLER #ifdef GPT4ALL_OFFLINE_INSTALLER
# include <QDesktopServices> # include <QDesktopServices>
@ -22,13 +19,8 @@
# include "network.h" # include "network.h"
#endif #endif
#ifdef Q_OS_MAC
#include "macosdock.h"
#endif
using namespace Qt::Literals::StringLiterals; using namespace Qt::Literals::StringLiterals;
class MyLLM: public LLM { }; class MyLLM: public LLM { };
Q_GLOBAL_STATIC(MyLLM, llmInstance) Q_GLOBAL_STATIC(MyLLM, llmInstance)
LLM *LLM::globalInstance() LLM *LLM::globalInstance()
@ -59,7 +51,7 @@ bool LLM::checkForUpdates() const
{ {
#ifdef GPT4ALL_OFFLINE_INSTALLER #ifdef GPT4ALL_OFFLINE_INSTALLER
# pragma message(__FILE__ ": WARNING: offline installer build will not check for updates!") # pragma message(__FILE__ ": WARNING: offline installer build will not check for updates!")
return QDesktopServices::openUrl(QUrl("https://github.com/nomic-ai/gpt4all/releases")); return QDesktopServices::openUrl(QUrl("https://gpt4all.io/"));
#else #else
Network::globalInstance()->trackEvent("check_for_updates"); Network::globalInstance()->trackEvent("check_for_updates");
@ -113,21 +105,3 @@ bool LLM::isNetworkOnline() const
auto * netinfo = QNetworkInformation::instance(); auto * netinfo = QNetworkInformation::instance();
return !netinfo || netinfo->reachability() == QNetworkInformation::Reachability::Online; return !netinfo || netinfo->reachability() == QNetworkInformation::Reachability::Online;
} }
void LLM::showDockIcon() const
{
#ifdef Q_OS_MAC
MacOSDock::showIcon();
#else
qt_noop();
#endif
}
void LLM::hideDockIcon() const
{
#ifdef Q_OS_MAC
MacOSDock::hideIcon();
#else
qt_noop();
#endif
}

View File

@ -3,8 +3,7 @@
#include <QObject> #include <QObject>
#include <QString> #include <QString>
#include <QtTypes> #include <QtGlobal>
class LLM : public QObject class LLM : public QObject
{ {
@ -24,9 +23,6 @@ public:
Q_INVOKABLE QString systemTotalRAMInGBString() const; Q_INVOKABLE QString systemTotalRAMInGBString() const;
Q_INVOKABLE bool isNetworkOnline() const; Q_INVOKABLE bool isNetworkOnline() const;
Q_INVOKABLE void showDockIcon() const;
Q_INVOKABLE void hideDockIcon() const;
Q_SIGNALS: Q_SIGNALS:
void isNetworkOnlineChanged(); void isNetworkOnlineChanged();

View File

@ -5,14 +5,10 @@
#include "mysettings.h" #include "mysettings.h"
#include <QCoreApplication> #include <QCoreApplication>
#include <QDebug>
#include <QGlobalStatic> #include <QGlobalStatic>
#include <QGuiApplication> #include <QGuiApplication>
#include <QList>
#include <QUrl> #include <QUrl>
#include <Qt> #include <Qt>
#include <QtLogging>
class MyLocalDocs: public LocalDocs { }; class MyLocalDocs: public LocalDocs { };
Q_GLOBAL_STATIC(MyLocalDocs, localDocsInstance) Q_GLOBAL_STATIC(MyLocalDocs, localDocsInstance)

View File

@ -2,14 +2,11 @@
#define LOCALDOCS_H #define LOCALDOCS_H
#include "database.h" #include "database.h"
#include "localdocsmodel.h" #include "localdocsmodel.h" // IWYU pragma: keep
#include <QObject> #include <QObject>
#include <QString> #include <QString>
#include <QStringList> // IWYU pragma: keep #include <QStringList>
// IWYU pragma: no_forward_declare LocalDocsModel
class LocalDocs : public QObject class LocalDocs : public QObject
{ {

Some files were not shown because too many files have changed in this diff