Compare commits


No commits in common. "f3f25a99288dbd9f01c907b31e4b2d386f0d5a2b" and "ceef5d092200882c2e5f4f98d82f588718117cd9" have entirely different histories.

57 changed files with 4339 additions and 7910 deletions

View File

@@ -463,47 +463,50 @@ jobs:
    docker:
      - image: mcr.microsoft.com/dotnet/sdk:7.0-jammy # Ubuntu 22.04
    steps:
-      - checkout
-      - attach_workspace:
-          at: /tmp/workspace
-      - run:
-          name: "Prepare Native Libs"
-          command: |
-            cd gpt4all-bindings/csharp
-            mkdir -p runtimes/linux-x64/native
-            cp /tmp/workspace/runtimes/linux-x64/*.so runtimes/linux-x64/native/
-            ls -R runtimes
-      - restore_cache:
-          keys:
-            - gpt4all-csharp-nuget-packages-nix
-      - run:
-          name: "Install project dependencies"
-          command: |
-            cd gpt4all-bindings/csharp
-            dotnet restore Gpt4All
-      - save_cache:
-          paths:
-            - ~/.nuget/packages
-          key: gpt4all-csharp-nuget-packages-nix
-      - run:
-          name: Build C# Project
-          command: |
-            cd gpt4all-bindings/csharp
-            dotnet build Gpt4All --configuration Release --nologo
-      - run:
-          name: "Run C# Tests"
-          command: |
-            cd gpt4all-bindings/csharp
-            dotnet test Gpt4All.Tests -v n -c Release --filter "SKIP_ON_CI!=True" --logger "trx"
-      - run:
-          name: Test results
-          command: |
-            cd gpt4all-bindings/csharp/Gpt4All.Tests
-            dotnet tool install -g trx2junit
-            export PATH="$PATH:$HOME/.dotnet/tools"
-            trx2junit TestResults/*.trx
-      - store_test_results:
-          path: gpt4all-bindings/csharp/Gpt4All.Tests/TestResults
+      - when:
+          condition: << pipeline.parameters.run-csharp-workflow >>
+          steps:
+            - checkout
+            - attach_workspace:
+                at: /tmp/workspace
+            - run:
+                name: "Prepare Native Libs"
+                command: |
+                  cd gpt4all-bindings/csharp
+                  mkdir -p runtimes/linux-x64/native
+                  cp /tmp/workspace/runtimes/linux-x64/*.so runtimes/linux-x64/native/
+                  ls -R runtimes
+            - restore_cache:
+                keys:
+                  - gpt4all-csharp-nuget-packages-nix
+            - run:
+                name: "Install project dependencies"
+                command: |
+                  cd gpt4all-bindings/csharp
+                  dotnet restore Gpt4All
+            - save_cache:
+                paths:
+                  - ~/.nuget/packages
+                key: gpt4all-csharp-nuget-packages-nix
+            - run:
+                name: Build C# Project
+                command: |
+                  cd gpt4all-bindings/csharp
+                  dotnet build Gpt4All --configuration Release --nologo
+            - run:
+                name: "Run C# Tests"
+                command: |
+                  cd gpt4all-bindings/csharp
+                  dotnet test Gpt4All.Tests -v n -c Release --filter "SKIP_ON_CI!=True" --logger "trx"
+            - run:
+                name: Test results
+                command: |
+                  cd gpt4all-bindings/csharp/Gpt4All.Tests
+                  dotnet tool install -g trx2junit
+                  export PATH="$PATH:$HOME/.dotnet/tools"
+                  trx2junit TestResults/*.trx
+            - store_test_results:
+                path: gpt4all-bindings/csharp/Gpt4All.Tests/TestResults

  build-csharp-windows:
    executor:
@@ -511,99 +514,111 @@ jobs:
      size: large
    shell: powershell.exe -ExecutionPolicy Bypass
    steps:
-      - checkout
-      - restore_cache:
-          keys:
-            - gpt4all-csharp-nuget-packages-win
-      - attach_workspace:
-          at: C:\Users\circleci\workspace
-      - run:
-          name: "Prepare Native Libs"
-          command: |
-            cd gpt4all-bindings/csharp
-            mkdir -p runtimes\win-x64\native
-            cp C:\Users\circleci\workspace\runtimes\win-x64\*.dll runtimes\win-x64\native\
-            ls -R runtimes
-      - run:
-          name: "Install project dependencies"
-          command: |
-            cd gpt4all-bindings/csharp
-            dotnet.exe restore Gpt4All
-      - save_cache:
-          paths:
-            - C:\Users\circleci\.nuget\packages
-          key: gpt4all-csharp-nuget-packages-win
-      - run:
-          name: Build C# Project
-          command: |
-            cd gpt4all-bindings/csharp
-            dotnet.exe build Gpt4All --configuration Release --nologo
-      - run:
-          name: "Run C# Tests"
-          command: |
-            cd gpt4all-bindings/csharp
-            dotnet.exe test Gpt4All.Tests -v n -c Release --filter "SKIP_ON_CI!=True" --logger "trx"
-      - run:
-          name: Test results
-          command: |
-            cd gpt4all-bindings/csharp/Gpt4All.Tests
-            dotnet tool install -g trx2junit
-            $Env:Path += ";$Env:USERPROFILE\.dotnet\tools"
-            trx2junit TestResults/*.trx
-      - store_test_results:
-          path: gpt4all-bindings/csharp/Gpt4All.Tests/TestResults
+      - when:
+          condition: << pipeline.parameters.run-csharp-workflow >>
+          steps:
+            - checkout
+            - restore_cache:
+                keys:
+                  - gpt4all-csharp-nuget-packages-win
+            - attach_workspace:
+                at: C:\Users\circleci\workspace
+            - run:
+                name: "Prepare Native Libs"
+                command: |
+                  cd gpt4all-bindings/csharp
+                  mkdir -p runtimes\win-x64\native
+                  cp C:\Users\circleci\workspace\runtimes\win-x64\*.dll runtimes\win-x64\native\
+                  ls -R runtimes
+            - run:
+                name: "Install project dependencies"
+                command: |
+                  cd gpt4all-bindings/csharp
+                  dotnet.exe restore Gpt4All
+            - save_cache:
+                paths:
+                  - C:\Users\circleci\.nuget\packages
+                key: gpt4all-csharp-nuget-packages-win
+            - run:
+                name: Build C# Project
+                command: |
+                  cd gpt4all-bindings/csharp
+                  dotnet.exe build Gpt4All --configuration Release --nologo
+            - run:
+                name: "Run C# Tests"
+                command: |
+                  cd gpt4all-bindings/csharp
+                  dotnet.exe test Gpt4All.Tests -v n -c Release --filter "SKIP_ON_CI!=True" --logger "trx"
+            - run:
+                name: Test results
+                command: |
+                  cd gpt4all-bindings/csharp/Gpt4All.Tests
+                  dotnet tool install -g trx2junit
+                  $Env:Path += ";$Env:USERPROFILE\.dotnet\tools"
+                  trx2junit TestResults/*.trx
+            - store_test_results:
+                path: gpt4all-bindings/csharp/Gpt4All.Tests/TestResults

  build-csharp-macos:
    macos:
      xcode: "14.0.0"
    steps:
-      - checkout
-      - restore_cache:
-          keys:
-            - gpt4all-csharp-nuget-packages-nix
-      - run:
-          name: Install dependencies
-          command: |
-            brew install --cask dotnet-sdk
-      - attach_workspace:
-          at: /tmp/workspace
-      - run:
-          name: "Prepare Native Libs"
-          command: |
-            cd gpt4all-bindings/csharp
-            mkdir -p runtimes/osx/native
-            cp /tmp/workspace/runtimes/osx-x64/*.dylib runtimes/osx/native/
-            cp /tmp/workspace/runtimes/osx-x64/*.metal runtimes/osx/native/
-            ls -R runtimes
-      - run:
-          name: "Install project dependencies"
-          command: |
-            cd gpt4all-bindings/csharp
-            dotnet restore Gpt4All
-      - save_cache:
-          paths:
-            - ~/.nuget/packages
-          key: gpt4all-csharp-nuget-packages-nix
-      - run:
-          name: Build C# Project
-          command: |
-            cd gpt4all-bindings/csharp
-            dotnet build Gpt4All --configuration Release --nologo
-      - run:
-          name: "Run C# Tests"
-          command: |
-            cd gpt4all-bindings/csharp
-            dotnet test Gpt4All.Tests -v n -c Release --filter "SKIP_ON_CI!=True" --logger "trx"
-      - run:
-          name: Test results
-          command: |
-            cd gpt4all-bindings/csharp/Gpt4All.Tests
-            dotnet tool install -g trx2junit
-            export PATH="$PATH:$HOME/.dotnet/tools"
-            trx2junit TestResults/*.trx
-      - store_test_results:
-          path: gpt4all-bindings/csharp/Gpt4All.Tests/TestResults
+      - when:
+          condition: << pipeline.parameters.run-csharp-workflow >>
+          steps:
+            - checkout
+            - restore_cache:
+                keys:
+                  - gpt4all-csharp-nuget-packages-nix
+            - run:
+                name: Install dependencies
+                command: |
+                  brew install --cask dotnet-sdk
+            - attach_workspace:
+                at: /tmp/workspace
+            - run:
+                name: "Prepare Native Libs"
+                command: |
+                  cd gpt4all-bindings/csharp
+                  mkdir -p runtimes/osx/native
+                  cp /tmp/workspace/runtimes/osx-x64/*.dylib runtimes/osx/native/
+                  cp /tmp/workspace/runtimes/osx-x64/*.metal runtimes/osx/native/
+                  ls -R runtimes
+            - run:
+                name: "Install project dependencies"
+                command: |
+                  cd gpt4all-bindings/csharp
+                  dotnet restore Gpt4All
+            - save_cache:
+                paths:
+                  - ~/.nuget/packages
+                key: gpt4all-csharp-nuget-packages-nix
+            - run:
+                name: Build C# Project
+                command: |
+                  cd gpt4all-bindings/csharp
+                  dotnet build Gpt4All --configuration Release --nologo
+            - run:
+                name: "Run C# Tests"
+                command: |
+                  cd gpt4all-bindings/csharp
+                  dotnet test Gpt4All.Tests -v n -c Release --filter "SKIP_ON_CI!=True" --logger "trx"
+            - run:
+                name: Test results
+                command: |
+                  cd gpt4all-bindings/csharp/Gpt4All.Tests
+                  dotnet tool install -g trx2junit
+                  export PATH="$PATH:$HOME/.dotnet/tools"
+                  trx2junit TestResults/*.trx
+            - store_test_results:
+                path: gpt4all-bindings/csharp/Gpt4All.Tests/TestResults
+
+  build-nodejs-linux:
+    docker:
+      - image: circleci/node:erbium-bullseye-browsers-legacy
+    steps:
+      - when:
+          condition: << pipeline.parameters.run-ts-workflow >>
+      - checkout

  store-and-upload-nupkgs:
    docker:
      - image: mcr.microsoft.com/dotnet/sdk:6.0-jammy # Ubuntu 22.04
@@ -641,27 +656,27 @@ jobs:
          node-version: "18.16"
      - run: node --version
      - node/install-packages:
-          app-dir: gpt4all-bindings/typescript
          pkg-manager: yarn
-          override-ci-command: yarn install
-      - run: cd gpt4all-bindings/typescript
      - run:
-          command: yarn run test
-          name: Run YARN tests
-      - run:
-          command: |
-            cd gpt4all-bindings/typescript
-            # excluding llmodel. nodejs bindings dont need llmodel.dll
-            mkdir -p runtimes/win32-x64/native
-            cp /tmp/workspace/runtimes/win-x64/*-*.dll runtimes/win-x64/native/
-            mkdir -p runtimes/linux-x64/native
-            cp /tmp/workspace/runtimes/linux-x64/*-*.so runtimes/linux-x64/native/
-            mkdir -p runtimes/osx/native
-            cp /tmp/workspace/runtimes/osx-x64/*-*.dylib runtimes/osx/native/
-            cp /tmp/workspace/runtimes/osx-x64/*.metal runtimes/osx/native/
-            ls -Ra gpt4all-bindings/typescript/runtimes
-      # - run:
-      #     name: Publish to NPM
-      #     command: |
-      #       npm set //registry.npmjs.org/:_authToken=$NPM_TOKEN
-      #       npm publish
+          command: |
+            # excluding llmodel. nodejs bindings dont need llmodel.dll
+            cd gpt4all-bindings/typescript
+            mkdir -p runtimes/win32-x64/native
+            cp /tmp/workspace/runtimes/win-x64/*-*.dll runtimes/win-x64/native/
+            mkdir -p runtimes/linux-x64/native
+            cp /tmp/workspace/runtimes/linux-x64/*-*.so runtimes/linux-x64/native/
+            mkdir -p runtimes/osx/native
+            cp /tmp/workspace/runtimes/osx-x64/*-*.dylib runtimes/osx/native/
+            cp /tmp/workspace/runtimes/osx-x64/*.metal runtimes/osx/native/
+      - run:
+          name: Publish to NPM
+          command: |
+            npm set //registry.npmjs.org/:_authToken=$NPM_TOKEN
+            npm publish

workflows:
  version: 2
@@ -741,8 +756,6 @@ workflows:
          type: approval
      - nuget-hold:
          type: approval
-      - npm-hold:
-          type: approval
      - build-bindings-backend-linux:
          filters:
            branches:
@@ -768,16 +781,6 @@ workflows:
          requires:
            - hold
      # NodeJs Jobs
-      - prepare-npm-pkg:
-          filters:
-            branches:
-              only:
-          requires:
-            - node/test
-            - npm-hold
-            # - build-bindings-backend-linux
-            # - build-bindings-backend-windows-msvc
-            # - build-bindings-backend-macos
      # CSharp Jobs
      - build-csharp-linux:
          filters:
@@ -806,3 +809,4 @@ workflows:
          - build-csharp-windows
          - build-csharp-linux
          - build-csharp-macos

.gitignore (vendored): 3 lines changed
View File

@@ -1,6 +1,3 @@
-*.arrow
-squad_*
-*sbert_embedded*
*.pkl
ckpts*
.deepspeed_env

View File

@@ -1,6 +1,5 @@
cmake_minimum_required(VERSION 3.16)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

if(APPLE)
  option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
@@ -20,7 +19,7 @@ endif()
include_directories("${CMAKE_CURRENT_BINARY_DIR}")

set(LLMODEL_VERSION_MAJOR 0)
-set(LLMODEL_VERSION_MINOR 3)
+set(LLMODEL_VERSION_MINOR 2)
set(LLMODEL_VERSION_PATCH 0)
set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}")
project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)
@@ -125,10 +124,6 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
        add_library(mpt-${BUILD_VARIANT} SHARED
            mpt.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
        prepare_target(mpt ggml-230511)
-        add_library(bert-${BUILD_VARIANT} SHARED
-            bert.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
-        prepare_target(bert llama-mainline)
    endif()
endforeach()

File diff suppressed because it is too large.

View File

@@ -1,44 +0,0 @@
#ifndef BERT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#error This file is NOT meant to be included outside of bert.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define BERT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#endif
#ifndef BERT_H
#define BERT_H
#include <string>
#include <functional>
#include <vector>
#include <memory>
#include "llmodel.h"
struct BertPrivate;
class Bert : public LLModel {
public:
Bert();
~Bert();
bool supportsEmbedding() const override { return true; }
bool supportsCompletion() const override { return true; }
bool loadModel(const std::string &modelPath) override;
bool isModelLoaded() const override;
size_t requiredMem(const std::string &modelPath) override;
size_t stateSize() const override;
size_t saveState(uint8_t *dest) const override;
size_t restoreState(const uint8_t *src) override;
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
std::vector<float> embedding(const std::string &text) override;
private:
std::unique_ptr<BertPrivate> d_ptr;
protected:
std::vector<Token> tokenize(PromptContext &, const std::string&) const override;
Token sampleToken(PromptContext &ctx) const override;
std::string tokenToString(Token) const override;
bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
int32_t contextLength() const override;
const std::vector<Token>& endTokens() const override;
};
#endif // BERT_H

View File

@@ -16,8 +16,6 @@ public:
    Falcon();
    ~Falcon();
-    bool supportsEmbedding() const override { return false; }
-    bool supportsCompletion() const override { return true; }
    bool loadModel(const std::string &modelPath) override;
    bool isModelLoaded() const override;
    size_t requiredMem(const std::string &modelPath) override;

View File

@@ -15,8 +15,6 @@ public:
    GPTJ();
    ~GPTJ();
-    bool supportsEmbedding() const override { return false; }
-    bool supportsCompletion() const override { return true; }
    bool loadModel(const std::string &modelPath) override;
    bool isModelLoaded() const override;
    size_t requiredMem(const std::string &modelPath) override;

View File

@@ -15,8 +15,6 @@ public:
    LLamaModel();
    ~LLamaModel();
-    bool supportsEmbedding() const override { return false; }
-    bool supportsCompletion() const override { return true; }
    bool loadModel(const std::string &modelPath) override;
    bool isModelLoaded() const override;
    size_t requiredMem(const std::string &modelPath) override;

View File

@@ -10,19 +10,17 @@
#include <cassert>
#include <cstdlib>
#include <sstream>
-#ifdef _MSC_VER
-#include <windows.h>
-#include <processthreadsapi.h>
-#endif

std::string s_implementations_search_path = ".";

static bool has_at_least_minimal_hardware() {
-#if defined(__x86_64__) || defined(_M_X64)
+#ifdef __x86_64__
    #ifndef _MSC_VER
    return __builtin_cpu_supports("avx");
    #else
-    return IsProcessorFeaturePresent(PF_AVX_INSTRUCTIONS_AVAILABLE);
+    int cpuInfo[4];
+    __cpuid(cpuInfo, 1);
+    return cpuInfo[2] & (1 << 28);
    #endif
#else
    return true; // Don't know how to handle non-x86_64
@@ -30,53 +28,54 @@ static bool has_at_least_minimal_hardware() {
}

static bool requires_avxonly() {
-#if defined(__x86_64__) || defined(_M_X64)
+#ifdef __x86_64__
    #ifndef _MSC_VER
    return !__builtin_cpu_supports("avx2");
    #else
-    return !IsProcessorFeaturePresent(PF_AVX2_INSTRUCTIONS_AVAILABLE);
+    int cpuInfo[4];
+    __cpuidex(cpuInfo, 7, 0);
+    return !(cpuInfo[1] & (1 << 5));
    #endif
#else
    return false; // Don't know how to handle non-x86_64
#endif
}

-LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
-    : m_dlhandle(new Dlhandle(std::move(dlhandle_))) {
-    auto get_model_type = m_dlhandle->get<const char *()>("get_model_type");
+LLModel::Implementation::Implementation(Dlhandle &&dlhandle_) : dlhandle(new Dlhandle(std::move(dlhandle_))) {
+    auto get_model_type = dlhandle->get<const char *()>("get_model_type");
    assert(get_model_type);
-    m_modelType = get_model_type();
-    auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
+    modelType = get_model_type();
+    auto get_build_variant = dlhandle->get<const char *()>("get_build_variant");
    assert(get_build_variant);
-    m_buildVariant = get_build_variant();
-    m_magicMatch = m_dlhandle->get<bool(std::ifstream&)>("magic_match");
-    assert(m_magicMatch);
-    m_construct = m_dlhandle->get<LLModel *()>("construct");
-    assert(m_construct);
+    buildVariant = get_build_variant();
+    magicMatch = dlhandle->get<bool(std::ifstream&)>("magic_match");
+    assert(magicMatch);
+    construct_ = dlhandle->get<LLModel *()>("construct");
+    assert(construct_);
}

LLModel::Implementation::Implementation(Implementation &&o)
-    : m_magicMatch(o.m_magicMatch)
-    , m_construct(o.m_construct)
-    , m_modelType(o.m_modelType)
-    , m_buildVariant(o.m_buildVariant)
-    , m_dlhandle(o.m_dlhandle) {
-    o.m_dlhandle = nullptr;
+    : construct_(o.construct_)
+    , modelType(o.modelType)
+    , buildVariant(o.buildVariant)
+    , magicMatch(o.magicMatch)
+    , dlhandle(o.dlhandle) {
+    o.dlhandle = nullptr;
}

LLModel::Implementation::~Implementation() {
-    if (m_dlhandle) delete m_dlhandle;
+    if (dlhandle) delete dlhandle;
}

bool LLModel::Implementation::isImplementation(const Dlhandle &dl) {
    return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
}

-const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList() {
+const std::vector<LLModel::Implementation> &LLModel::implementationList() {
    // NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the
    // individual models without the cleanup of the static list interfering
-    static auto* libs = new std::vector<Implementation>([] () {
-        std::vector<Implementation> fres;
+    static auto* libs = new std::vector<LLModel::Implementation>([] () {
+        std::vector<LLModel::Implementation> fres;

        auto search_in_directory = [&](const std::string& paths) {
            std::stringstream ss(paths);
@@ -108,17 +107,17 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
    return *libs;
}

-const LLModel::Implementation* LLModel::Implementation::implementation(std::ifstream& f, const std::string& buildVariant) {
+const LLModel::Implementation* LLModel::implementation(std::ifstream& f, const std::string& buildVariant) {
    for (const auto& i : implementationList()) {
        f.seekg(0);
-        if (!i.m_magicMatch(f)) continue;
-        if (buildVariant != i.m_buildVariant) continue;
+        if (!i.magicMatch(f)) continue;
+        if (buildVariant != i.buildVariant) continue;
        return &i;
    }
    return nullptr;
}

-LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::string buildVariant) {
+LLModel *LLModel::construct(const std::string &modelPath, std::string buildVariant) {
    if (!has_at_least_minimal_hardware())
        return nullptr;
@@ -127,15 +126,14 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
    std::ifstream f(modelPath, std::ios::binary);
    if (!f) return nullptr;
    // Get correct implementation
-    const Implementation* impl = nullptr;
+    const LLModel::Implementation* impl = nullptr;

    #if defined(__APPLE__) && defined(__arm64__) // FIXME: See if metal works for intel macs
    if (buildVariant == "auto") {
        size_t total_mem = getSystemTotalRAMInBytes();
        impl = implementation(f, "metal");
        if(impl) {
-            LLModel* metalimpl = impl->m_construct();
-            metalimpl->m_implementation = impl;
+            LLModel* metalimpl = impl->construct();
            size_t req_mem = metalimpl->requiredMem(modelPath);
            float req_to_total = (float) req_mem / (float) total_mem;
            // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
@@ -162,17 +160,14 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::s
        if (!impl) return nullptr;
    }
    f.close();
    // Construct and return llmodel implementation
-    auto fres = impl->m_construct();
-    fres->m_implementation = impl;
-    return fres;
+    return impl->construct();
}

-void LLModel::Implementation::setImplementationsSearchPath(const std::string& path) {
+void LLModel::setImplementationsSearchPath(const std::string& path) {
    s_implementations_search_path = path;
}

-const std::string& LLModel::Implementation::implementationsSearchPath() {
+const std::string& LLModel::implementationsSearchPath() {
    return s_implementations_search_path;
}
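Editor's note, not part of the diff: the only functional change in the two detection hunks above is how the MSVC branch checks for AVX and AVX2; the "-" side asks Windows via IsProcessorFeaturePresent, the "+" side reads CPUID directly. A minimal, hypothetical C++ sketch of the CPUID variant follows, using the same bit positions as the hunk. It is not code from the repository, and it skips the OSXSAVE/XGETBV validation a fully robust detector would also perform.

```cpp
#include <cstdio>
#ifdef _MSC_VER
#include <intrin.h>   // __cpuid / __cpuidex
#endif

static bool cpu_has_avx() {
#ifndef _MSC_VER
    return __builtin_cpu_supports("avx");   // GCC/Clang path, as in the diff
#else
    int cpuInfo[4];
    __cpuid(cpuInfo, 1);                     // CPUID leaf 1: feature flags
    return cpuInfo[2] & (1 << 28);           // ECX bit 28 = AVX
#endif
}

static bool cpu_has_avx2() {
#ifndef _MSC_VER
    return __builtin_cpu_supports("avx2");
#else
    int cpuInfo[4];
    __cpuidex(cpuInfo, 7, 0);                // CPUID leaf 7, subleaf 0
    return cpuInfo[1] & (1 << 5);            // EBX bit 5 = AVX2
#endif
}

int main() {
    std::printf("AVX: %d  AVX2: %d\n", cpu_has_avx(), cpu_has_avx2());
}
```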

View File

@@ -12,34 +12,32 @@
#define LLMODEL_MAX_PROMPT_BATCH 128

class Dlhandle;
class LLModel {
public:
    using Token = int32_t;
    class Implementation {
+        LLModel *(*construct_)();
    public:
        Implementation(Dlhandle&&);
        Implementation(const Implementation&) = delete;
        Implementation(Implementation&&);
        ~Implementation();
-        std::string_view modelType() const { return m_modelType; }
-        std::string_view buildVariant() const { return m_buildVariant; }
        static bool isImplementation(const Dlhandle&);
-        static const std::vector<Implementation>& implementationList();
-        static const Implementation *implementation(std::ifstream& f, const std::string& buildVariant);
-        static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto");
-        static void setImplementationsSearchPath(const std::string& path);
-        static const std::string& implementationsSearchPath();
-    private:
-        bool (*m_magicMatch)(std::ifstream& f);
-        LLModel *(*m_construct)();
-    private:
-        std::string_view m_modelType;
-        std::string_view m_buildVariant;
-        Dlhandle *m_dlhandle;
+        std::string_view modelType, buildVariant;
+        bool (*magicMatch)(std::ifstream& f);
+        Dlhandle *dlhandle;
+        // The only way an implementation should be constructed
+        LLModel *construct() const {
+            auto fres = construct_();
+            fres->m_implementation = this;
+            return fres;
+        }
    };

    struct PromptContext {
@@ -61,25 +59,18 @@ public:
    explicit LLModel() {}
    virtual ~LLModel() {}
-    virtual bool supportsEmbedding() const = 0;
-    virtual bool supportsCompletion() const = 0;
    virtual bool loadModel(const std::string &modelPath) = 0;
    virtual bool isModelLoaded() const = 0;
    virtual size_t requiredMem(const std::string &modelPath) = 0;
    virtual size_t stateSize() const { return 0; }
    virtual size_t saveState(uint8_t */*dest*/) const { return 0; }
    virtual size_t restoreState(const uint8_t */*src*/) { return 0; }
-    // This method requires the model to return true from supportsCompletion otherwise it will throw
-    // an error
    virtual void prompt(const std::string &prompt,
                        std::function<bool(int32_t)> promptCallback,
                        std::function<bool(int32_t, const std::string&)> responseCallback,
                        std::function<bool(bool)> recalculateCallback,
                        PromptContext &ctx);
-    virtual std::vector<float> embedding(const std::string &text);
    virtual void setThreadCount(int32_t /*n_threads*/) {}
    virtual int32_t threadCount() const { return 1; }
@@ -87,6 +78,13 @@ public:
        return *m_implementation;
    }
+    static const std::vector<Implementation>& implementationList();
+    static const Implementation *implementation(std::ifstream& f, const std::string& buildVariant);
+    static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto");
+    static void setImplementationsSearchPath(const std::string& path);
+    static const std::string& implementationsSearchPath();
protected:
    // These are pure virtual because subclasses need to implement as the default implementation of
    // 'prompt' above calls these functions
@@ -102,9 +100,5 @@ protected:
    void recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate);
    const Implementation *m_implementation = nullptr;
-private:
-    friend class LLMImplementation;
};
#endif // LLMODEL_H
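Editor's note, not part of the diff: a hypothetical caller sketch written against the "-" side of this header, where the loader statics live on LLModel::Implementation and modelType() is an accessor; on the "+" side the equivalent calls are LLModel::construct, LLModel::setImplementationsSearchPath, and so on, with modelType read as a plain member. The model path and search path are placeholders and error handling is minimal; this is not code from the repository.

```cpp
#include <iostream>
#include <string>
#include "llmodel.h"

int main() {
    // Where the dynamically loaded backend libraries are searched for (placeholder path).
    LLModel::Implementation::setImplementationsSearchPath("./backends");

    // Pick a matching backend for the model file and instantiate it.
    LLModel *model = LLModel::Implementation::construct("./model.bin", "auto");
    if (!model || !model->loadModel("./model.bin")) return 1;

    std::cout << "loaded backend: " << model->implementation().modelType() << "\n";

    LLModel::PromptContext ctx;  // fields left at their defaults for brevity
    model->prompt(
        "Hello, world!",
        [](int32_t) { return true; },              // called per prompt token
        [](int32_t, const std::string &piece) {    // streamed response text
            std::cout << piece;
            return true;
        },
        [](bool) { return true; },                 // recalculate notification
        ctx);

    delete model;
}
```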

View File

@@ -29,7 +29,7 @@ llmodel_model llmodel_model_create2(const char *model_path, const char *build_va
    int error_code = 0;
    try {
-        wrapper->llModel = LLModel::Implementation::construct(model_path, build_variant);
+        wrapper->llModel = LLModel::construct(model_path, build_variant);
    } catch (const std::exception& e) {
        error_code = EINVAL;
        last_error_message = e.what();
@@ -166,25 +166,6 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
    ctx->context_erase = wrapper->promptContext.contextErase;
}

-float *llmodel_embedding(llmodel_model model, const char *text, size_t *embedding_size)
-{
-    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
-    std::vector<float> embeddingVector = wrapper->llModel->embedding(text);
-    float *embedding = (float *)malloc(embeddingVector.size() * sizeof(float));
-    if(embedding == nullptr) {
-        *embedding_size = 0;
-        return nullptr;
-    }
-    std::copy(embeddingVector.begin(), embeddingVector.end(), embedding);
-    *embedding_size = embeddingVector.size();
-    return embedding;
-}
-
-void llmodel_free_embedding(float *ptr)
-{
-    free(ptr);
-}
-
void llmodel_setThreadCount(llmodel_model model, int32_t n_threads)
{
    LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
@@ -199,10 +180,10 @@ int32_t llmodel_threadCount(llmodel_model model)
void llmodel_set_implementation_search_path(const char *path)
{
-    LLModel::Implementation::setImplementationsSearchPath(path);
+    LLModel::setImplementationsSearchPath(path);
}

const char *llmodel_get_implementation_search_path()
{
-    return LLModel::Implementation::implementationsSearchPath().c_str();
+    return LLModel::implementationsSearchPath().c_str();
}

View File

@@ -171,23 +171,6 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
                    llmodel_recalculate_callback recalculate_callback,
                    llmodel_prompt_context *ctx);

-/**
- * Generate an embedding using the model.
- * @param model A pointer to the llmodel_model instance.
- * @param text A string representing the text to generate an embedding for.
- * @param embedding_size A pointer to a size_t type that will be set by the call indicating the length
- * of the returned floating point array.
- * @return A pointer to an array of floating point values passed to the calling method which then will
- * be responsible for lifetime of this memory.
- */
-float *llmodel_embedding(llmodel_model model, const char *text, size_t *embedding_size);
-
-/**
- * Frees the memory allocated by the llmodel_embedding function.
- * @param ptr A pointer to the embedding as returned from llmodel_embedding.
- */
-void llmodel_free_embedding(float *ptr);
-
/**
 * Set the number of threads to be used by the model.
 * @param model A pointer to the llmodel_model instance.
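Editor's note, not part of the diff: a hypothetical usage sketch for the two C API functions removed above, using exactly the signatures shown in the hunk. It assumes a llmodel_model handle that was created and loaded elsewhere, since those calls fall outside this hunk.

```cpp
#include <cstddef>
#include <cstdio>
#include "llmodel_c.h"

void print_embedding(llmodel_model model, const char *text) {
    size_t embedding_size = 0;
    float *embedding = llmodel_embedding(model, text, &embedding_size);
    if (embedding == nullptr || embedding_size == 0) {
        std::fprintf(stderr, "embedding failed\n");
        return;
    }
    for (size_t i = 0; i < embedding_size; ++i)
        std::printf("%f ", embedding[i]);
    std::printf("\n");
    llmodel_free_embedding(embedding);  // the caller owns the returned buffer
}
```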

View File

@@ -33,14 +33,7 @@ void LLModel::prompt(const std::string &prompt,
                     PromptContext &promptCtx)
{
    if (!isModelLoaded()) {
-        std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
-        return;
-    }
-
-    if (!supportsCompletion()) {
-        std::string errorMessage = "ERROR: this model does not support text completion or chat!\n";
-        responseCallback(-1, errorMessage);
-        std::cerr << implementation().modelType() << errorMessage;
+        std::cerr << implementation().modelType << " ERROR: prompt won't work with an unloaded model!\n";
        return;
    }
@@ -52,8 +45,8 @@ void LLModel::prompt(const std::string &prompt,
    if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
        responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
-        std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
-            " tokens and the context window is " << promptCtx.n_ctx << "!\n";
+        std::cerr << implementation().modelType << " ERROR: The prompt is" << embd_inp.size() <<
+            "tokens and the context window is" << promptCtx.n_ctx << "!\n";
        return;
    }
@@ -71,7 +64,7 @@ void LLModel::prompt(const std::string &prompt,
        if (promptCtx.n_past + int32_t(batch.size()) > promptCtx.n_ctx) {
            const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase;
            // Erase the first percentage of context from the tokens...
-            std::cerr << implementation().modelType() << ": reached the end of the context window so resizing\n";
+            std::cerr << implementation().modelType << ": reached the end of the context window so resizing\n";
            promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint);
            promptCtx.n_past = promptCtx.tokens.size();
            recalculateContext(promptCtx, recalculateCallback);
@@ -79,7 +72,7 @@ void LLModel::prompt(const std::string &prompt,
        }

        if (!evalTokens(promptCtx, batch)) {
-            std::cerr << implementation().modelType() << " ERROR: Failed to process prompt\n";
+            std::cerr << implementation().modelType << " ERROR: Failed to process prompt\n";
            return;
        }
@@ -110,7 +103,7 @@ void LLModel::prompt(const std::string &prompt,
        if (promptCtx.n_past + 1 > promptCtx.n_ctx) {
            const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase;
            // Erase the first percentage of context from the tokens...
-            std::cerr << implementation().modelType() << ": reached the end of the context window so resizing\n";
+            std::cerr << implementation().modelType << ": reached the end of the context window so resizing\n";
            promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint);
            promptCtx.n_past = promptCtx.tokens.size();
            recalculateContext(promptCtx, recalculateCallback);
@@ -118,7 +111,7 @@ void LLModel::prompt(const std::string &prompt,
        }

        if (!evalTokens(promptCtx, { id })) {
-            std::cerr << implementation().modelType() << " ERROR: Failed to predict next token\n";
+            std::cerr << implementation().modelType << " ERROR: Failed to predict next token\n";
            return;
        }
@@ -165,12 +158,3 @@ void LLModel::prompt(const std::string &prompt,
        cachedTokens.clear();
    }
}
-
-std::vector<float> LLModel::embedding(const std::string &/*text*/)
-{
-    if (!supportsCompletion()) {
-        std::string errorMessage = "ERROR: this model does not support generating embeddings!\n";
-        std::cerr << implementation().modelType() << errorMessage;
-    }
-    return std::vector<float>();
-}

View File

@@ -15,8 +15,6 @@ public:
    MPT();
    ~MPT();
-    bool supportsEmbedding() const override { return false; }
-    bool supportsCompletion() const override { return true; }
    bool loadModel(const std::string &modelPath) override;
    bool isModelLoaded() const override;
    size_t requiredMem(const std::string &modelPath) override;

View File

@@ -17,8 +17,6 @@ public:
    Replit();
    ~Replit();
-    bool supportsEmbedding() const override { return false; }
-    bool supportsCompletion() const override { return true; }
    bool loadModel(const std::string &modelPath) override;
    bool isModelLoaded() const override;
    size_t requiredMem(const std::string & modelPath) override;

View File

@@ -1,102 +0,0 @@
import sys
import struct
import json
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer
if len(sys.argv) < 3:
print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
print(" ftype == 0 -> float32")
print(" ftype == 1 -> float16")
sys.exit(1)
# output in the same directory as the model
dir_model = sys.argv[1]
fname_out = sys.argv[1] + "/ggml-model.bin"
with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
encoder = json.load(f)
with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
with open(dir_model + "/vocab.txt", "r", encoding="utf-8") as f:
vocab = f.readlines()
# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1
if len(sys.argv) > 2:
ftype = int(sys.argv[2])
if ftype < 0 or ftype > 1:
print("Invalid ftype: " + str(ftype))
sys.exit(1)
fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"
tokenizer = AutoTokenizer.from_pretrained(dir_model)
model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True)
print (model)
print(tokenizer.encode('I believe the meaning of life is'))
list_vars = model.state_dict()
for name in list_vars.keys():
print(name, list_vars[name].shape, list_vars[name].dtype)
fout = open(fname_out, "wb")
print(hparams)
fout.write(struct.pack("i", 0x62657274)) # magic: ggml in hex
fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["max_position_embeddings"]))
fout.write(struct.pack("i", hparams["hidden_size"]))
fout.write(struct.pack("i", hparams["intermediate_size"]))
fout.write(struct.pack("i", hparams["num_attention_heads"]))
fout.write(struct.pack("i", hparams["num_hidden_layers"]))
fout.write(struct.pack("i", ftype))
for i in range(hparams["vocab_size"]):
text = vocab[i][:-1] # strips newline at the end
#print(f"{i}:{text}")
data = bytes(text, 'utf-8')
fout.write(struct.pack("i", len(data)))
fout.write(data)
for name in list_vars.keys():
data = list_vars[name].squeeze().numpy()
if name in ['embeddings.position_ids', 'pooler.dense.weight', 'pooler.dense.bias']:
continue
print("Processing variable: " + name + " with shape: ", data.shape)
n_dims = len(data.shape);
# ftype == 0 -> float32, ftype == 1 -> float16
if ftype == 1 and name[-7:] == ".weight" and n_dims == 2:
print(" Converting to float16")
data = data.astype(np.float16)
l_type = 1
else:
l_type = 0
# header
str = name.encode('utf-8')
fout.write(struct.pack("iii", n_dims, len(str), l_type))
for i in range(n_dims):
fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
fout.write(str);
# data
data.tofile(fout)
fout.close()
print("Done. Output file: " + fname_out)
print("")

View File

@@ -2,13 +2,11 @@
## What models are supported by the GPT4All ecosystem?

-Currently, there are five different model architectures that are supported:
+Currently, there are three different model architectures that are supported:

-1. GPT-J - Based off of the GPT-J architecture with examples found [here](https://huggingface.co/EleutherAI/gpt-j-6b)
-2. LLaMA - Based off of the LLaMA architecture with examples found [here](https://huggingface.co/models?sort=downloads&search=llama)
+1. GPTJ - Based off of the GPT-J architecture with examples found [here](https://huggingface.co/EleutherAI/gpt-j-6b)
+2. LLAMA - Based off of the LLAMA architecture with examples found [here](https://huggingface.co/models?sort=downloads&search=llama)
3. MPT - Based off of Mosaic ML's MPT architecture with examples found [here](https://huggingface.co/mosaicml/mpt-7b)
-4. Replit - Based off of Replit Inc.'s Replit architecture with examples found [here](https://huggingface.co/replit/replit-code-v1-3b)
-5. Falcon - Based off of TII's Falcon architecture with examples found [here](https://huggingface.co/tiiuae/falcon-40b)

## Why so many different architectures? What differentiates them?
@@ -27,10 +25,6 @@ The upstream [llama.cpp](https://github.com/ggerganov/llama.cpp) project has int
Fortunately, we have engineered a submoduling system allowing us to dynamically load different versions of the underlying library so that
GPT4All just works.

-## What are the system requirements?
-Your CPU needs to support [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) and you need enough RAM to load a model into memory.
-
## What about GPU inference?

In newer versions of llama.cpp, there has been some added support for NVIDIA GPU's for inference. We're investigating how to incorporate this into our downloadable installers.

View File

@@ -1,7 +1,8 @@
-# GPT4All Python Generation API
+# GPT4All Python API

The `GPT4All` python package provides bindings to our C/C++ model backend libraries.
The source code and local build instructions can be found [here](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python).

## Quickstart

```bash
@@ -108,5 +109,5 @@ with model.chat_session():
    print(model.current_chat_session)
```

-### API documentation
::: gpt4all.gpt4all.GPT4All

View File

@@ -1,35 +0,0 @@
# Embeddings
GPT4All supports generating high quality embeddings of arbitrary length documents of text using a CPU optimized contrastively trained [Sentence Transformer](https://www.sbert.net/). These embeddings are comparable in quality for many tasks with OpenAI.
## Quickstart
```bash
pip install gpt4all
```
### Generating embeddings
The embedding model will automatically be downloaded if not installed.
=== "Embed4All Example"
``` py
from gpt4all import GPT4All, Embed4All
text = 'The quick brown fox jumps over the lazy dog'
embedder = Embed4All()
output = embedder.embed(text)
print(output)
```
=== "Output"
```
[0.034696947783231735, -0.07192722707986832, 0.06923297047615051, ...]
```
### Speed of embedding generation
The following table lists the generation speed for text document captured on an Intel i913900HX CPU with DDR5 5600 running with 8 threads under stable load.
| Tokens | 128 | 512 | 2048 | 8129 | 16,384 |
| --------------- | ---- | ---- | ---- | ---- | ---- |
| Wall time (s) | .02 | .08 | .24 | .96 | 1.9 |
| Tokens / Second | 6508 | 6431 | 8622 | 8509 | 8369 |
### API documentation
::: gpt4all.gpt4all.Embed4All

View File

@@ -1,2 +1,2 @@
-from .gpt4all import GPT4All, Embed4All # noqa
+from .gpt4all import GPT4All # noqa
from .pyllmodel import LLModel # noqa

View File

@@ -15,36 +15,6 @@ from . import pyllmodel
# TODO: move to config
DEFAULT_MODEL_DIRECTORY = os.path.join(str(Path.home()), ".cache", "gpt4all").replace("\\", "\\\\")

-class Embed4All:
-    """
-    Python class that handles embeddings for GPT4All.
-    """
-    def __init__(
-        self,
-        n_threads: Optional[int] = None,
-    ):
-        """
-        Constructor
-        Args:
-            n_threads: number of CPU threads used by GPT4All. Default is None, then the number of threads are determined automatically.
-        """
-        self.gpt4all = GPT4All(model_name='ggml-all-MiniLM-L6-v2-f16.bin', n_threads=n_threads)
-    def embed(
-        self,
-        text: str
-    ) -> list[float]:
-        """
-        Generate an embedding.
-        Args:
-            text: The text document to generate an embedding for.
-        Returns:
-            An embedding of your document of text.
-        """
-        return self.gpt4all.model.generate_embedding(text)

class GPT4All:
    """
@@ -69,7 +39,7 @@ class GPT4All:
            model_type: Model architecture. This argument currently does not have any functionality and is just used as
                descriptive identifier for user. Default is None.
            allow_download: Allow API to download models from gpt4all.io. Default is True.
-            n_threads: number of CPU threads used by GPT4All. Default is None, then the number of threads are determined automatically.
+            n_threads: number of CPU threads used by GPT4All. Default is None, than the number of threads are determined automatically.
        """
        self.model_type = model_type
        self.model = pyllmodel.LLModel()

View File

@@ -112,19 +112,6 @@ llmodel.llmodel_prompt.argtypes = [
llmodel.llmodel_prompt.restype = None

-llmodel.llmodel_embedding.argtypes = [
-    ctypes.c_void_p,
-    ctypes.c_char_p,
-    ctypes.POINTER(ctypes.c_size_t),
-]
-llmodel.llmodel_embedding.restype = ctypes.POINTER(ctypes.c_float)
-llmodel.llmodel_free_embedding.argtypes = [
-    ctypes.POINTER(ctypes.c_float)
-]
-llmodel.llmodel_free_embedding.restype = None

llmodel.llmodel_setThreadCount.argtypes = [ctypes.c_void_p, ctypes.c_int32]
llmodel.llmodel_setThreadCount.restype = None
@@ -154,11 +141,10 @@ class LLModel:
        self.model = None
        self.model_name = None
        self.context = None
-        self.llmodel_lib = llmodel

    def __del__(self):
        if self.model is not None:
-            self.llmodel_lib.llmodel_model_destroy(self.model)
+            llmodel.llmodel_model_destroy(self.model)

    def memory_needed(self, model_path: str) -> int:
        model_path_enc = model_path.encode("utf-8")
@@ -247,17 +233,6 @@ class LLModel:
        self.context.repeat_last_n = repeat_last_n
        self.context.context_erase = context_erase

-    def generate_embedding(
-        self,
-        text: str
-    ) -> list[float]:
-        embedding_size = ctypes.c_size_t()
-        c_text = ctypes.c_char_p(text.encode('utf-8'))
-        embedding_ptr = llmodel.llmodel_embedding(self.model, c_text, ctypes.byref(embedding_size))
-        embedding_array = [embedding_ptr[i] for i in range(embedding_size.value)]
-        llmodel.llmodel_free_embedding(embedding_ptr)
-        return list(embedding_array)

    def prompt_model(
        self,
        prompt: str,

View File

@@ -1,18 +0,0 @@
import sys
from io import StringIO
from gpt4all import GPT4All, Embed4All
import time
def time_embedding(i, embedder):
text = 'foo bar ' * i
start_time = time.time()
output = embedder.embed(text)
end_time = time.time()
elapsed_time = end_time - start_time
print(f"Time report: {2 * i / elapsed_time} tokens/second with {2 * i} tokens taking {elapsed_time} seconds")
if __name__ == "__main__":
embedder = Embed4All(n_threads=8)
for i in [2**n for n in range(6, 14)]:
time_embedding(i, embedder)

View File

@@ -1,8 +1,8 @@
import sys
from io import StringIO
-from gpt4all import GPT4All, Embed4All
-import time
+from gpt4all import GPT4All

def test_inference():
    model = GPT4All(model_name='orca-mini-3b.ggmlv3.q4_0.bin')
@@ -99,11 +99,3 @@ def test_inference_mpt():
    output = model.generate(prompt)
    assert isinstance(output, str)
    assert len(output) > 0
def test_embedding():
text = 'The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox jumps over the lazy dog The quick brown fox'
embedder = Embed4All()
output = embedder.embed(text)
#for i, value in enumerate(output):
#print(f'Value at index {i}: {value}')
assert len(output) == 384

View File

@@ -10,9 +10,7 @@ use_directory_urls: false
nav:
  - 'index.md'
  - 'Bindings':
-    - 'GPT4All in Python':
-      - 'Generation': 'gpt4all_python.md'
-      - 'Embedding': 'gpt4all_python_embedding.md'
+    - 'GPT4All in Python': 'gpt4all_python.md'
    - 'GPT4ALL in NodeJs': 'gpt4all_typescript.md'
    - 'GPT4All Chat Client': 'gpt4all_chat.md'
    - 'gpt4all_cli.md'

View File

@@ -61,7 +61,7 @@ copy_prebuilt_C_lib(SRC_CLIB_DIRECtORY,
setup(
    name=package_name,
-    version="1.0.6",
+    version="1.0.3",
    description="Python bindings for GPT4All",
    author="Richard Guo",
    author_email="richard@nomic.ai",

View File

@@ -53,7 +53,7 @@ const response = await createCompletion(ll, [
* (win) msvc version 143
   * Can be obtained with visual studio 2022 build tools

-### Build (from source)
+### Build

```sh
git clone https://github.com/nomic-ai/gpt4all.git
@@ -138,7 +138,7 @@ This package is in active development, and breaking changes may happen until the
* \[ ] createTokenStream, an async iterator that streams each token emitted from the model. Planning on following this [example](https://github.com/nodejs/node-addon-examples/tree/main/threadsafe-async-iterator)
* \[ ] proper unit testing (integrate with circle ci)
* \[ ] publish to npm under alpha tag `gpt4all@alpha`
-* \[x] have more people test on other platforms (mac tester needed)
+* \[ ] have more people test on other platforms (mac tester needed)
* \[x] switch to new pluggable backend

### Documentation

View File

@@ -53,7 +53,7 @@
        '-fno-rtti',
    ],
    'cflags_cc': [
-        '-std=c++2a'
+        '-std=c++20'
    ]
  }]
]

View File

@@ -10,7 +10,6 @@ Napi::Function NodeModelWrapper::GetClass(Napi::Env env) {
    InstanceMethod("stateSize", &NodeModelWrapper::StateSize),
    InstanceMethod("raw_prompt", &NodeModelWrapper::Prompt),
    InstanceMethod("setThreadCount", &NodeModelWrapper::SetThreadCount),
-    InstanceMethod("embed", &NodeModelWrapper::GenerateEmbedding),
    InstanceMethod("threadCount", &NodeModelWrapper::ThreadCount),
    InstanceMethod("getLibraryPath", &NodeModelWrapper::GetLibraryPath),
  });
@@ -92,23 +91,6 @@ Napi::Function NodeModelWrapper::GetClass(Napi::Env env) {
    return Napi::Number::New(info.Env(), static_cast<int64_t>(llmodel_get_state_size(GetInference())));
  }

-  Napi::Value NodeModelWrapper::GenerateEmbedding(const Napi::CallbackInfo& info) {
-    auto env = info.Env();
-    std::string text = info[0].As<Napi::String>().Utf8Value();
-    size_t embedding_size = 0;
-    float* arr = llmodel_embedding(GetInference(), text.c_str(), &embedding_size);
-    auto arr_size = sizeof(arr) / sizeof(float);
-    Napi::Float32Array js_array = Napi::Float32Array::New(info.Env(), arr_size);
-    for (size_t i = 0; i < arr_size; ++i) {
-      float element = *(arr + i);
-      js_array[i] = element;
-    }
-    llmodel_free_embedding(arr);
-    return js_array;
-  }

  /**
   * Generate a response using the model.

View File

@@ -23,7 +23,6 @@ public:
    void SetThreadCount(const Napi::CallbackInfo& info);
    Napi::Value getName(const Napi::CallbackInfo& info);
    Napi::Value ThreadCount(const Napi::CallbackInfo& info);
-    Napi::Value GenerateEmbedding(const Napi::CallbackInfo& info);
    /*
     * The path that is used to search for the dynamic libraries
     */

View File

@@ -1,6 +1,6 @@
{
  "name": "gpt4all",
-  "version": "2.0.0rc",
+  "version": "2.0.0",
  "packageManager": "yarn@3.5.1",
  "main": "src/gpt4all.js",
  "repository": "nomic-ai/gpt4all",

View File

@@ -6,7 +6,7 @@ async function createPrebuilds(combinations) {
        platform,
        arch,
        napi: true,
-        targets: ["18.16.0"]
+        targets: ["18.15.0"]
    };
    try {
        await createPrebuild(opts);

File diff suppressed because it is too large.

View File

@@ -1,6 +1,5 @@
cmake_minimum_required(VERSION 3.16)
-set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

set(CMAKE_CXX_STANDARD 20)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
@@ -18,7 +17,7 @@ endif()
set(APP_VERSION_MAJOR 2)
set(APP_VERSION_MINOR 4)
-set(APP_VERSION_PATCH 14)
+set(APP_VERSION_PATCH 13)
set(APP_VERSION "${APP_VERSION_MAJOR}.${APP_VERSION_MINOR}.${APP_VERSION_PATCH}")

# Include the binary directory for the generated header file
@@ -206,8 +205,6 @@ install(TARGETS replit-mainline-default DESTINATION lib COMPONENT ${COMPONENT_NA
if(APPLE)
  install(TARGETS replit-mainline-metal DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
endif()
-install(TARGETS bert-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
-install(TARGETS bert-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})

set(CPACK_GENERATOR "IFW")
set(CPACK_VERBATIM_VARIABLES YES)

View File

@ -51,7 +51,19 @@ One click installers for macOS, Linux, and Windows at https://gpt4all.io
If you've already checked out the source code and/or built the program, make sure that when you do a git fetch to get the latest changes you also run ```git submodule update --init --recursive``` to update the submodules. If you've already checked out the source code and/or built the program, make sure that when you do a git fetch to get the latest changes you also run ```git submodule update --init --recursive``` to update the submodules.
## Manual download of models ## Manual download of models
* You can find a 'Model Explorer' on the official website where you can manually download models that we support: https://gpt4all.io/index.html * https://gpt4all.io/models/ggml-mpt-7b-chat.bin (default) (md5sum 756249d3d6abe23bde3b1ae272628640) Current best non-commercially licensable chat model based on MPT and trained by Mosaic ML.
* https://gpt4all.io/models/ggml-gpt4all-j-v1.3-groovy.bin (default) (md5sum 81a09a0ddf89690372fc296ff7f625af) Current best commercially licensable model based on GPT-J and trained by Nomic AI on the latest curated GPT4All dataset.
* https://gpt4all.io/models/ggml-gpt4all-l13b-snoozy.bin (md5sum 91f886b68fbce697e9a3cd501951e455) Current best non-commercially licensable model based on Llama 13b and trained by Nomic AI on the latest curated GPT4All dataset.
* https://gpt4all.io/models/ggml-gpt4all-j-v1.2-jazzy.bin (md5sum 879344aaa9d62fdccbda0be7a09e7976) A commercially licensable model based on GPT-J and trained by Nomic AI on the v2 GPT4All dataset.
* https://gpt4all.io/models/ggml-gpt4all-j-v1.1-breezy.bin (md5sum 61d48a82cb188cceb14ebb8082bfec37) A commercially licensable model based on GPT-J and trained by Nomic AI on the v1 GPT4All dataset.
* https://gpt4all.io/models/ggml-gpt4all-j.bin (md5sum 5b5a3f9b858d33b29b52b89692415595) A commercially licensable model based on GPT-J and trained by Nomic AI on the v0 GPT4All dataset.
* https://gpt4all.io/models/ggml-vicuna-7b-1.1-q4_2.bin (md5sum 29119f8fa11712704c6b22ac5ab792ea) A non-commercially licensable model based on Llama 7b and trained by teams from UC Berkeley, CMU, Stanford, MBZUAI, and UC San Diego.
* https://gpt4all.io/models/ggml-vicuna-13b-1.1-q4_2.bin (md5sum 95999b7b0699e2070af63bf5d34101a8) A non-commercially licensable model based on Llama 13b and trained by teams from UC Berkeley, CMU, Stanford, MBZUAI, and UC San Diego.
* https://gpt4all.io/models/ggml-wizardLM-7B.q4_2.bin (md5sum 99e6d129745a3f1fb1121abed747b05a) A non-commercially licensable model based on Llama 7b and trained by Microsoft and Peking University.
* https://gpt4all.io/models/ggml-stable-vicuna-13B.q4_2.bin (md5sum 6cb4ee297537c9133bddab9692879de0) A non-commercially licensable model based on Llama 13b and RLHF trained by Stable AI.
* https://gpt4all.io/models/ggml-mpt-7b-base.bin (md5sum 120c32a51d020066288df045ef5d52b9) A commercially licensable model base pre-trained by Mosaic ML.
* https://gpt4all.io/models/ggml-nous-gpt4-vicuna-13b.bin (md5sum d5eafd5b0bd0d615cfd5fd763f642dfe) A non-commercially licensable model based on Vicuna 13b, fine-tuned on ~180,000 instructions, trained by Nous Research.
* https://gpt4all.io/models/ggml-mpt-7b-instruct.bin (md5sum 1cfa4958f489f0a0d1ffdf6b37322809) A commercially licensable instruct model based on MPT and trained by Mosaic ML.
## Terminal Only Interface with no Qt dependency ## Terminal Only Interface with no Qt dependency

View File

@ -155,7 +155,7 @@ void ChatGPTWorker::request(const QString &apiKey,
m_ctx = promptCtx; m_ctx = promptCtx;
QUrl openaiUrl("https://api.openai.com/v1/chat/completions"); QUrl openaiUrl("https://api.openai.com/v1/chat/completions");
const QString authorization = QString("Bearer %1").arg(apiKey).trimmed(); const QString authorization = QString("Bearer %1").arg(apiKey);
QNetworkRequest request(openaiUrl); QNetworkRequest request(openaiUrl);
request.setHeader(QNetworkRequest::ContentTypeHeader, "application/json"); request.setHeader(QNetworkRequest::ContentTypeHeader, "application/json");
request.setRawHeader("Authorization", authorization.toUtf8()); request.setRawHeader("Authorization", authorization.toUtf8());
@ -244,7 +244,7 @@ void ChatGPTWorker::handleReadyRead()
void ChatGPTWorker::handleErrorOccurred(QNetworkReply::NetworkError code) void ChatGPTWorker::handleErrorOccurred(QNetworkReply::NetworkError code)
{ {
QNetworkReply *reply = qobject_cast<QNetworkReply *>(sender()); QNetworkReply *reply = qobject_cast<QNetworkReply *>(sender());
if (!reply || reply->error() == QNetworkReply::OperationCanceledError /*when we call abort on purpose*/) { if (!reply) {
emit finished(); emit finished();
return; return;
} }
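Two small hardening changes sit in this file: the API key is trimmed before it is placed in the Authorization header (a trailing newline in a pasted key otherwise breaks authentication), and an error callback fired by our own abort() is treated as a normal finish rather than a failure. A condensed Qt sketch of both, assuming nothing beyond the Qt Network classes already used above (the free-function names here are illustrative):

```cpp
#include <QNetworkAccessManager>
#include <QNetworkReply>
#include <QNetworkRequest>
#include <QUrl>

// Build the completions request the same way the worker does: JSON body, trimmed Bearer token.
QNetworkReply *postCompletions(QNetworkAccessManager &nam, const QString &apiKey, const QByteArray &body)
{
    QUrl openaiUrl("https://api.openai.com/v1/chat/completions");
    const QString authorization = QString("Bearer %1").arg(apiKey).trimmed();
    QNetworkRequest request(openaiUrl);
    request.setHeader(QNetworkRequest::ContentTypeHeader, "application/json");
    request.setRawHeader("Authorization", authorization.toUtf8());
    return nam.post(request, body);
}

// In the error handler, a cancellation we triggered ourselves is not a user-visible failure.
void handleError(QNetworkReply *reply)
{
    if (!reply || reply->error() == QNetworkReply::OperationCanceledError)
        return; // finished or aborted on purpose; nothing to report
    // ... surface reply->errorString() to the UI ...
}
```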

View File

@ -46,8 +46,6 @@ public:
ChatGPT(); ChatGPT();
virtual ~ChatGPT(); virtual ~ChatGPT();
bool supportsEmbedding() const override { return false; }
bool supportsCompletion() const override { return true; }
bool loadModel(const std::string &modelPath) override; bool loadModel(const std::string &modelPath) override;
bool isModelLoaded() const override; bool isModelLoaded() const override;
size_t requiredMem(const std::string &modelPath) override; size_t requiredMem(const std::string &modelPath) override;

View File

@ -14,7 +14,6 @@
#define REPLIT_INTERNAL_STATE_VERSION 0 #define REPLIT_INTERNAL_STATE_VERSION 0
#define LLAMA_INTERNAL_STATE_VERSION 0 #define LLAMA_INTERNAL_STATE_VERSION 0
#define FALCON_INTERNAL_STATE_VERSION 0 #define FALCON_INTERNAL_STATE_VERSION 0
#define BERT_INTERNAL_STATE_VERSION 0
class LLModelStore { class LLModelStore {
public: public:
@ -241,11 +240,11 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
#if defined(Q_OS_MAC) && defined(__arm__) #if defined(Q_OS_MAC) && defined(__arm__)
if (m_forceMetal) if (m_forceMetal)
m_llModelInfo.model = LLMImplementation::construct(filePath.toStdString(), "metal"); m_llModelInfo.model = LLModel::construct(filePath.toStdString(), "metal");
else else
m_llModelInfo.model = LLMImplementation::construct(filePath.toStdString(), "auto"); m_llModelInfo.model = LLModel::construct(filePath.toStdString(), "auto");
#else #else
m_llModelInfo.model = LLModel::Implementation::construct(filePath.toStdString(), "auto"); m_llModelInfo.model = LLModel::construct(filePath.toStdString(), "auto");
#endif #endif
if (m_llModelInfo.model) { if (m_llModelInfo.model) {
@ -259,13 +258,12 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
m_llModelInfo = LLModelInfo(); m_llModelInfo = LLModelInfo();
emit modelLoadingError(QString("Could not load model due to invalid model file for %1").arg(modelInfo.filename())); emit modelLoadingError(QString("Could not load model due to invalid model file for %1").arg(modelInfo.filename()));
} else { } else {
switch (m_llModelInfo.model->implementation().modelType()[0]) { switch (m_llModelInfo.model->implementation().modelType[0]) {
case 'L': m_llModelType = LLModelType::LLAMA_; break; case 'L': m_llModelType = LLModelType::LLAMA_; break;
case 'G': m_llModelType = LLModelType::GPTJ_; break; case 'G': m_llModelType = LLModelType::GPTJ_; break;
case 'M': m_llModelType = LLModelType::MPT_; break; case 'M': m_llModelType = LLModelType::MPT_; break;
case 'R': m_llModelType = LLModelType::REPLIT_; break; case 'R': m_llModelType = LLModelType::REPLIT_; break;
case 'F': m_llModelType = LLModelType::FALCON_; break; case 'F': m_llModelType = LLModelType::FALCON_; break;
case 'B': m_llModelType = LLModelType::BERT_; break;
default: default:
{ {
delete std::exchange(m_llModelInfo.model, nullptr); delete std::exchange(m_llModelInfo.model, nullptr);
@ -630,8 +628,8 @@ bool ChatLLM::handleNameRecalculate(bool isRecalc)
qDebug() << "name recalc" << m_llmThread.objectName() << isRecalc; qDebug() << "name recalc" << m_llmThread.objectName() << isRecalc;
#endif #endif
Q_UNUSED(isRecalc); Q_UNUSED(isRecalc);
qt_noop(); Q_UNREACHABLE();
return true; return false;
} }
bool ChatLLM::handleSystemPrompt(int32_t token) bool ChatLLM::handleSystemPrompt(int32_t token)
@ -671,8 +669,7 @@ bool ChatLLM::serialize(QDataStream &stream, int version)
case MPT_: stream << MPT_INTERNAL_STATE_VERSION; break; case MPT_: stream << MPT_INTERNAL_STATE_VERSION; break;
case GPTJ_: stream << GPTJ_INTERNAL_STATE_VERSION; break; case GPTJ_: stream << GPTJ_INTERNAL_STATE_VERSION; break;
case LLAMA_: stream << LLAMA_INTERNAL_STATE_VERSION; break; case LLAMA_: stream << LLAMA_INTERNAL_STATE_VERSION; break;
case FALCON_: stream << FALCON_INTERNAL_STATE_VERSION; break; case FALCON_: stream << LLAMA_INTERNAL_STATE_VERSION; break;
case BERT_: stream << BERT_INTERNAL_STATE_VERSION; break;
default: Q_UNREACHABLE(); default: Q_UNREACHABLE();
} }
} }
@ -791,18 +788,13 @@ void ChatLLM::processSystemPrompt()
if (!isModelLoaded() || m_processedSystemPrompt || m_isServer) if (!isModelLoaded() || m_processedSystemPrompt || m_isServer)
return; return;
const std::string systemPrompt = MySettings::globalInstance()->modelSystemPrompt(m_modelInfo).toStdString();
if (QString::fromStdString(systemPrompt).trimmed().isEmpty()) {
m_processedSystemPrompt = true;
return;
}
m_stopGenerating = false; m_stopGenerating = false;
auto promptFunc = std::bind(&ChatLLM::handleSystemPrompt, this, std::placeholders::_1); auto promptFunc = std::bind(&ChatLLM::handleSystemPrompt, this, std::placeholders::_1);
auto responseFunc = std::bind(&ChatLLM::handleSystemResponse, this, std::placeholders::_1, auto responseFunc = std::bind(&ChatLLM::handleSystemResponse, this, std::placeholders::_1,
std::placeholders::_2); std::placeholders::_2);
auto recalcFunc = std::bind(&ChatLLM::handleSystemRecalculate, this, std::placeholders::_1); auto recalcFunc = std::bind(&ChatLLM::handleSystemRecalculate, this, std::placeholders::_1);
const std::string systemPrompt = MySettings::globalInstance()->modelSystemPrompt(m_modelInfo).toStdString();
const int32_t n_predict = MySettings::globalInstance()->modelMaxLength(m_modelInfo); const int32_t n_predict = MySettings::globalInstance()->modelMaxLength(m_modelInfo);
const int32_t top_k = MySettings::globalInstance()->modelTopK(m_modelInfo); const int32_t top_k = MySettings::globalInstance()->modelTopK(m_modelInfo);
const float top_p = MySettings::globalInstance()->modelTopP(m_modelInfo); const float top_p = MySettings::globalInstance()->modelTopP(m_modelInfo);
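The early return added to processSystemPrompt() pairs with the models.json change later in this diff, where models that ship without a system prompt now carry "systemPrompt": " " (a single space): the prompt is only run through the model when it is non-empty after trimming. A one-function sketch of that guard:

```cpp
#include <QString>
#include <string>

// Mirrors the check in processSystemPrompt(): a blank or whitespace-only system prompt is skipped,
// so models configured with " " (one space) never pay the cost of an extra prompt pass.
bool hasUsableSystemPrompt(const std::string &systemPrompt)
{
    return !QString::fromStdString(systemPrompt).trimmed().isEmpty();
}
```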

View File

@ -16,7 +16,6 @@ enum LLModelType {
CHATGPT_, CHATGPT_,
REPLIT_, REPLIT_,
FALCON_, FALCON_,
BERT_
}; };
struct LLModelInfo { struct LLModelInfo {

View File

@ -7,19 +7,16 @@ file(GLOB MYMPTLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NA
file(GLOB MYLLAMALIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libllama*) file(GLOB MYLLAMALIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libllama*)
file(GLOB MYREPLITLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libreplit*) file(GLOB MYREPLITLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libreplit*)
file(GLOB MYFALCONLLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libfalcon*) file(GLOB MYFALCONLLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libfalcon*)
file(GLOB MYBERTLLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libbert*)
file(GLOB MYLLMODELLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libllmodel.*) file(GLOB MYLLMODELLIBS ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/lib/libllmodel.*)
file(COPY ${MYGPTJLIBS} file(COPY ${MYGPTJLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks) DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY ${MYMPTLIBS} file(COPY ${MYMPTLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks) DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY ${MYLLAMALIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY ${MYREPLITLIBS} file(COPY ${MYREPLITLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks) DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY ${MYFALCONLLIBS} file(COPY ${MYFALCONLLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks) DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY ${MYBERTLLIBS} file(COPY ${MYLLAMALIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks) DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)
file(COPY ${MYLLMODELLIBS} file(COPY ${MYLLMODELLIBS}
DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks) DESTINATION ${CPACK_TEMPORARY_INSTALL_DIRECTORY}/packages/${COMPONENT_NAME_MAIN}/data/bin/gpt4all.app/Contents/Frameworks)

View File

@ -8,7 +8,6 @@
#include <QFile> #include <QFile>
#include <QProcess> #include <QProcess>
#include <QResource> #include <QResource>
#include <QSettings>
#include <fstream> #include <fstream>
class MyLLM: public LLM { }; class MyLLM: public LLM { };
@ -34,7 +33,7 @@ LLM::LLM()
if (directoryExists(frameworksDir)) if (directoryExists(frameworksDir))
llmodelSearchPaths += ";" + frameworksDir; llmodelSearchPaths += ";" + frameworksDir;
#endif #endif
LLModel::Implementation::setImplementationsSearchPath(llmodelSearchPaths.toStdString()); LLModel::setImplementationsSearchPath(llmodelSearchPaths.toStdString());
#if defined(__x86_64__) #if defined(__x86_64__)
#ifndef _MSC_VER #ifndef _MSC_VER
@ -49,13 +48,7 @@ LLM::LLM()
#endif #endif
m_compatHardware = minimal; m_compatHardware = minimal;
} emit compatHardwareChanged();
bool LLM::hasSettingsAccess() const
{
QSettings settings;
settings.sync();
return settings.status() == QSettings::NoError;
} }
bool LLM::checkForUpdates() const bool LLM::checkForUpdates() const
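The new hasSettingsAccess() probe backs the startup dialog added in main.qml further down: it forces a sync of the default QSettings object and reports whether the settings file could actually be read and written. A brief sketch of how the QSettings status codes map to user-facing causes (the helper name is illustrative):

```cpp
#include <QSettings>
#include <QString>

// Distinguish the failure modes QSettings can report after a sync().
QString describeSettingsStatus()
{
    QSettings settings;
    settings.sync(); // force a read/write round trip now
    switch (settings.status()) {
    case QSettings::NoError:     return "settings file is readable and writable";
    case QSettings::AccessError: return "settings file cannot be accessed (permissions or a locked file)";
    case QSettings::FormatError: return "settings file exists but is corrupt or malformed";
    }
    return "unknown settings status";
}
```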

View File

@ -6,11 +6,12 @@
class LLM : public QObject class LLM : public QObject
{ {
Q_OBJECT Q_OBJECT
Q_PROPERTY(bool compatHardware READ compatHardware NOTIFY compatHardwareChanged)
public: public:
static LLM *globalInstance(); static LLM *globalInstance();
Q_INVOKABLE bool hasSettingsAccess() const; bool compatHardware() const { return m_compatHardware; }
Q_INVOKABLE bool compatHardware() const { return m_compatHardware; }
Q_INVOKABLE bool checkForUpdates() const; Q_INVOKABLE bool checkForUpdates() const;
Q_INVOKABLE bool directoryExists(const QString &path) const; Q_INVOKABLE bool directoryExists(const QString &path) const;
@ -21,6 +22,7 @@ public:
Q_SIGNALS: Q_SIGNALS:
void chatListModelChanged(); void chatListModelChanged();
void modelListChanged(); void modelListChanged();
void compatHardwareChanged();
private: private:
bool m_compatHardware; bool m_compatHardware;

View File

@ -89,22 +89,14 @@ Window {
property bool hasShownModelDownload: false property bool hasShownModelDownload: false
property bool hasShownFirstStart: false property bool hasShownFirstStart: false
property bool hasShownSettingsAccess: false
function startupDialogs() { function startupDialogs() {
if (!LLM.compatHardware()) { if (!LLM.compatHardware) {
Network.sendNonCompatHardware(); Network.sendNonCompatHardware();
errorCompatHardware.open(); errorCompatHardware.open();
return; return;
} }
// check if we have access to settings and if not show an error
if (!hasShownSettingsAccess && !LLM.hasSettingsAccess()) {
errorSettingsAccess.open();
hasShownSettingsAccess = true;
return;
}
// check for first time start of this version // check for first time start of this version
if (!hasShownFirstStart && Download.isFirstStart()) { if (!hasShownFirstStart && Download.isFirstStart()) {
firstStartDialog.open(); firstStartDialog.open();
@ -143,20 +135,6 @@ Window {
+ qsTr("https://en.wikipedia.org/wiki/Advanced_Vector_Extensions</a>") + qsTr("https://en.wikipedia.org/wiki/Advanced_Vector_Extensions</a>")
} }
PopupDialog {
id: errorSettingsAccess
anchors.centerIn: parent
shouldTimeOut: false
shouldShowBusy: false
modal: true
text: qsTr("<h3>Encountered an error starting up:</h3><br>")
+ qsTr("<i>\"Inability to access settings file.\"</i>")
+ qsTr("<br><br>Unfortunately, something is preventing the program from accessing ")
+ qsTr("the settings file. This could be caused by incorrect permissions in the local ")
+ qsTr("app config directory where the settings file is located. ")
+ qsTr("Check out our <a href=\"https://discord.gg/4M2QFmTt2k\">discord channel</a> for help.")
}
StartupDialog { StartupDialog {
id: firstStartDialog id: firstStartDialog
anchors.centerIn: parent anchors.centerIn: parent

View File

@ -1,16 +1,18 @@
[ [
{ {
"order": "a", "order": "a",
"md5sum": "e8d47924f433bd561cb5244557147793", "md5sum": "4acc146dd43eb02845c233c29289c7c5",
"name": "Wizard v1.1", "name": "Hermes",
"filename": "wizardlm-13b-v1.1-superhot-8k.ggmlv3.q4_0.bin", "filename": "nous-hermes-13b.ggmlv3.q4_0.bin",
"filesize": "7323310848", "filesize": "8136777088",
"requires": "2.4.7",
"ramrequired": "16", "ramrequired": "16",
"parameters": "13 billion", "parameters": "13 billion",
"quant": "q4_0", "quant": "q4_0",
"type": "LLaMA", "type": "LLaMA",
"systemPrompt": " ", "description": "<strong>Best overall model</strong><br><ul><li>Instruction based<li>Gives long responses<li>Curated with 300,000 uncensored instructions<li>Trained by Nous Research<li>Cannot be used commercially</ul>",
"description": "<strong>Best overall model</strong><br><ul><li>Instruction based<li>Gives very long responses<li>Finetuned with only 1k of high-quality data<li>Trained by Microsoft and Peking University<li>Cannot be used commercially</ul" "url": "https://huggingface.co/TheBloke/Nous-Hermes-13B-GGML/resolve/main/nous-hermes-13b.ggmlv3.q4_0.bin",
"promptTemplate": "### Instruction:\n%1\n### Response:\n"
}, },
{ {
"order": "b", "order": "b",
@ -23,29 +25,12 @@
"parameters": "7 billion", "parameters": "7 billion",
"quant": "q4_0", "quant": "q4_0",
"type": "Falcon", "type": "Falcon",
"systemPrompt": " ",
"description": "<strong>Best overall smaller model</strong><br><ul><li>Fast responses</li><li>Instruction based</li><li>Trained by TII<li>Finetuned by Nomic AI<li>Licensed for commercial use</ul>", "description": "<strong>Best overall smaller model</strong><br><ul><li>Fast responses</li><li>Instruction based</li><li>Trained by TII<li>Finetuned by Nomic AI<li>Licensed for commercial use</ul>",
"url": "https://huggingface.co/nomic-ai/gpt4all-falcon-ggml/resolve/main/ggml-model-gpt4all-falcon-q4_0.bin", "url": "https://huggingface.co/nomic-ai/gpt4all-falcon-ggml/resolve/main/ggml-model-gpt4all-falcon-q4_0.bin",
"promptTemplate": "### Instruction:\n%1\n### Response:\n" "promptTemplate": "### Instruction:\n%1\n### Response:\n"
}, },
{ {
"order": "c", "order": "c",
"md5sum": "4acc146dd43eb02845c233c29289c7c5",
"name": "Hermes",
"filename": "nous-hermes-13b.ggmlv3.q4_0.bin",
"filesize": "8136777088",
"requires": "2.4.7",
"ramrequired": "16",
"parameters": "13 billion",
"quant": "q4_0",
"type": "LLaMA",
"systemPrompt": " ",
"description": "<strong>Extremely good model</strong><br><ul><li>Instruction based<li>Gives long responses<li>Curated with 300,000 uncensored instructions<li>Trained by Nous Research<li>Cannot be used commercially</ul>",
"url": "https://huggingface.co/TheBloke/Nous-Hermes-13B-GGML/resolve/main/nous-hermes-13b.ggmlv3.q4_0.bin",
"promptTemplate": "### Instruction:\n%1\n### Response:\n"
},
{
"order": "e",
"md5sum": "81a09a0ddf89690372fc296ff7f625af", "md5sum": "81a09a0ddf89690372fc296ff7f625af",
"name": "Groovy", "name": "Groovy",
"filename": "ggml-gpt4all-j-v1.3-groovy.bin", "filename": "ggml-gpt4all-j-v1.3-groovy.bin",
@ -54,11 +39,10 @@
"parameters": "7 billion", "parameters": "7 billion",
"quant": "q4_0", "quant": "q4_0",
"type": "GPT-J", "type": "GPT-J",
"systemPrompt": " ",
"description": "<strong>Creative model can be used for commercial purposes</strong><br><ul><li>Fast responses<li>Creative responses</li><li>Instruction based</li><li>Trained by Nomic AI<li>Licensed for commercial use</ul>" "description": "<strong>Creative model can be used for commercial purposes</strong><br><ul><li>Fast responses<li>Creative responses</li><li>Instruction based</li><li>Trained by Nomic AI<li>Licensed for commercial use</ul>"
}, },
{ {
"order": "f", "order": "e",
"md5sum": "11d9f060ca24575a2c303bdc39952486", "md5sum": "11d9f060ca24575a2c303bdc39952486",
"name": "Snoozy", "name": "Snoozy",
"filename": "GPT4All-13B-snoozy.ggmlv3.q4_0.bin", "filename": "GPT4All-13B-snoozy.ggmlv3.q4_0.bin",
@ -68,12 +52,11 @@
"parameters": "13 billion", "parameters": "13 billion",
"quant": "q4_0", "quant": "q4_0",
"type": "LLaMA", "type": "LLaMA",
"systemPrompt": " ",
"description": "<strong>Very good overall model</strong><br><ul><li>Instruction based<li>Based on the same dataset as Groovy<li>Slower than Groovy, with higher quality responses<li>Trained by Nomic AI<li>Cannot be used commercially</ul>", "description": "<strong>Very good overall model</strong><br><ul><li>Instruction based<li>Based on the same dataset as Groovy<li>Slower than Groovy, with higher quality responses<li>Trained by Nomic AI<li>Cannot be used commercially</ul>",
"url": "https://huggingface.co/TheBloke/GPT4All-13B-snoozy-GGML/resolve/main/GPT4All-13B-snoozy.ggmlv3.q4_0.bin" "url": "https://huggingface.co/TheBloke/GPT4All-13B-snoozy-GGML/resolve/main/GPT4All-13B-snoozy.ggmlv3.q4_0.bin"
}, },
{ {
"order": "g", "order": "f",
"md5sum": "756249d3d6abe23bde3b1ae272628640", "md5sum": "756249d3d6abe23bde3b1ae272628640",
"name": "MPT Chat", "name": "MPT Chat",
"filename": "ggml-mpt-7b-chat.bin", "filename": "ggml-mpt-7b-chat.bin",
@ -88,9 +71,9 @@
"systemPrompt": "<|im_start|>system\n- You are a helpful assistant chatbot trained by MosaicML.\n- You answer questions.\n- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.\n- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.<|im_end|>" "systemPrompt": "<|im_start|>system\n- You are a helpful assistant chatbot trained by MosaicML.\n- You answer questions.\n- You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.\n- You are more than just an information source, you are also able to write poetry, short stories, and make jokes.<|im_end|>"
}, },
{ {
"order": "h", "order": "g",
"md5sum": "e64e74375ce9d36a3d0af3db1523fd0a", "md5sum": "e64e74375ce9d36a3d0af3db1523fd0a",
"name": "Mini Orca", "name": "Orca",
"filename": "orca-mini-7b.ggmlv3.q4_0.bin", "filename": "orca-mini-7b.ggmlv3.q4_0.bin",
"filesize": "3791749248", "filesize": "3791749248",
"requires": "2.4.7", "requires": "2.4.7",
@ -104,9 +87,9 @@
"systemPrompt": "### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n" "systemPrompt": "### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n"
}, },
{ {
"order": "i", "order": "h",
"md5sum": "6a087f7f4598fad0bb70e6cb4023645e", "md5sum": "6a087f7f4598fad0bb70e6cb4023645e",
"name": "Mini Orca (Small)", "name": "Orca (Small)",
"filename": "orca-mini-3b.ggmlv3.q4_0.bin", "filename": "orca-mini-3b.ggmlv3.q4_0.bin",
"filesize": "1928446208", "filesize": "1928446208",
"requires": "2.4.7", "requires": "2.4.7",
@ -120,9 +103,9 @@
"systemPrompt": "### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n" "systemPrompt": "### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n"
}, },
{ {
"order": "j", "order": "i",
"md5sum": "959b7f65b2d12fd1e3ff99e7493c7a3a", "md5sum": "959b7f65b2d12fd1e3ff99e7493c7a3a",
"name": "Mini Orca (Large)", "name": "Orca (Large)",
"filename": "orca-mini-13b.ggmlv3.q4_0.bin", "filename": "orca-mini-13b.ggmlv3.q4_0.bin",
"filesize": "7323329152", "filesize": "7323329152",
"requires": "2.4.7", "requires": "2.4.7",
@ -136,7 +119,7 @@
"systemPrompt": "### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n" "systemPrompt": "### System:\nYou are an AI assistant that follows instruction extremely well. Help as much as you can.\n\n"
}, },
{ {
"order": "k", "order": "j",
"md5sum": "29119f8fa11712704c6b22ac5ab792ea", "md5sum": "29119f8fa11712704c6b22ac5ab792ea",
"name": "Vicuna", "name": "Vicuna",
"filename": "ggml-vicuna-7b-1.1-q4_2.bin", "filename": "ggml-vicuna-7b-1.1-q4_2.bin",
@ -145,11 +128,10 @@
"parameters": "7 billion", "parameters": "7 billion",
"quant": "q4_2", "quant": "q4_2",
"type": "LLaMA", "type": "LLaMA",
"systemPrompt": " ",
"description": "<strong>Good small model - trained by teams from UC Berkeley, CMU, Stanford, MBZUAI, and UC San Diego</strong><br><ul><li>Instruction based<li>Cannot be used commercially</ul>" "description": "<strong>Good small model - trained by teams from UC Berkeley, CMU, Stanford, MBZUAI, and UC San Diego</strong><br><ul><li>Instruction based<li>Cannot be used commercially</ul>"
}, },
{ {
"order": "l", "order": "k",
"md5sum": "95999b7b0699e2070af63bf5d34101a8", "md5sum": "95999b7b0699e2070af63bf5d34101a8",
"name": "Vicuna (large)", "name": "Vicuna (large)",
"filename": "ggml-vicuna-13b-1.1-q4_2.bin", "filename": "ggml-vicuna-13b-1.1-q4_2.bin",
@ -158,11 +140,10 @@
"parameters": "13 billion", "parameters": "13 billion",
"quant": "q4_2", "quant": "q4_2",
"type": "LLaMA", "type": "LLaMA",
"systemPrompt": " ",
"description": "<strong>Good larger model - trained by teams from UC Berkeley, CMU, Stanford, MBZUAI, and UC San Diego</strong><br><ul><li>Instruction based<li>Cannot be used commercially</ul>" "description": "<strong>Good larger model - trained by teams from UC Berkeley, CMU, Stanford, MBZUAI, and UC San Diego</strong><br><ul><li>Instruction based<li>Cannot be used commercially</ul>"
}, },
{ {
"order": "m", "order": "l",
"md5sum": "99e6d129745a3f1fb1121abed747b05a", "md5sum": "99e6d129745a3f1fb1121abed747b05a",
"name": "Wizard", "name": "Wizard",
"filename": "ggml-wizardLM-7B.q4_2.bin", "filename": "ggml-wizardLM-7B.q4_2.bin",
@ -171,11 +152,10 @@
"parameters": "7 billion", "parameters": "7 billion",
"quant": "q4_2", "quant": "q4_2",
"type": "LLaMA", "type": "LLaMA",
"systemPrompt": " ",
"description": "<strong>Good small model - trained by by Microsoft and Peking University</strong><br><ul><li>Instruction based<li>Cannot be used commercially</ul>" "description": "<strong>Good small model - trained by by Microsoft and Peking University</strong><br><ul><li>Instruction based<li>Cannot be used commercially</ul>"
}, },
{ {
"order": "n", "order": "m",
"md5sum": "6cb4ee297537c9133bddab9692879de0", "md5sum": "6cb4ee297537c9133bddab9692879de0",
"name": "Stable Vicuna", "name": "Stable Vicuna",
"filename": "ggml-stable-vicuna-13B.q4_2.bin", "filename": "ggml-stable-vicuna-13B.q4_2.bin",
@ -188,7 +168,7 @@
"systemPrompt": "## Assistant: I am StableVicuna, a large language model created by CarperAI. I am here to chat!\n\n" "systemPrompt": "## Assistant: I am StableVicuna, a large language model created by CarperAI. I am here to chat!\n\n"
}, },
{ {
"order": "o", "order": "n",
"md5sum": "1cfa4958f489f0a0d1ffdf6b37322809", "md5sum": "1cfa4958f489f0a0d1ffdf6b37322809",
"name": "MPT Instruct", "name": "MPT Instruct",
"filename": "ggml-mpt-7b-instruct.bin", "filename": "ggml-mpt-7b-instruct.bin",
@ -198,11 +178,10 @@
"parameters": "7 billion", "parameters": "7 billion",
"quant": "q4_0", "quant": "q4_0",
"type": "MPT", "type": "MPT",
"systemPrompt": " ",
"description": "<strong>Mosaic's instruction model</strong><br><ul><li>Instruction based<li>Trained by Mosaic ML<li>Licensed for commercial use</ul>" "description": "<strong>Mosaic's instruction model</strong><br><ul><li>Instruction based<li>Trained by Mosaic ML<li>Licensed for commercial use</ul>"
}, },
{ {
"order": "p", "order": "o",
"md5sum": "120c32a51d020066288df045ef5d52b9", "md5sum": "120c32a51d020066288df045ef5d52b9",
"name": "MPT Base", "name": "MPT Base",
"filename": "ggml-mpt-7b-base.bin", "filename": "ggml-mpt-7b-base.bin",
@ -212,11 +191,10 @@
"parameters": "7 billion", "parameters": "7 billion",
"quant": "q4_0", "quant": "q4_0",
"type": "MPT", "type": "MPT",
"systemPrompt": " ",
"description": "<strong>Trained for text completion with no assistant finetuning</strong><br><ul><li>Completion based<li>Trained by Mosaic ML<li>Licensed for commercial use</ul>" "description": "<strong>Trained for text completion with no assistant finetuning</strong><br><ul><li>Completion based<li>Trained by Mosaic ML<li>Licensed for commercial use</ul>"
}, },
{ {
"order": "q", "order": "p",
"md5sum": "d5eafd5b0bd0d615cfd5fd763f642dfe", "md5sum": "d5eafd5b0bd0d615cfd5fd763f642dfe",
"name": "Nous Vicuna", "name": "Nous Vicuna",
"filename": "ggml-nous-gpt4-vicuna-13b.bin", "filename": "ggml-nous-gpt4-vicuna-13b.bin",
@ -225,11 +203,10 @@
"parameters": "13 billion", "parameters": "13 billion",
"quant": "q4_0", "quant": "q4_0",
"type": "LLaMA", "type": "LLaMA",
"systemPrompt": " ",
"description": "<strong>Trained on ~180,000 instructions</strong><br><ul><li>Instruction based<li>Trained by Nous Research<li>Cannot be used commercially</ul>" "description": "<strong>Trained on ~180,000 instructions</strong><br><ul><li>Instruction based<li>Trained by Nous Research<li>Cannot be used commercially</ul>"
}, },
{ {
"order": "r", "order": "q",
"md5sum": "489d21fd48840dcb31e5f92f453f3a20", "md5sum": "489d21fd48840dcb31e5f92f453f3a20",
"name": "Wizard Uncensored", "name": "Wizard Uncensored",
"filename": "wizardLM-13B-Uncensored.ggmlv3.q4_0.bin", "filename": "wizardLM-13B-Uncensored.ggmlv3.q4_0.bin",
@ -239,12 +216,11 @@
"parameters": "13 billion", "parameters": "13 billion",
"quant": "q4_0", "quant": "q4_0",
"type": "LLaMA", "type": "LLaMA",
"systemPrompt": " ",
"description": "<strong>Trained on uncensored assistant data and instruction data</strong><br><ul><li>Instruction based<li>Cannot be used commercially</ul>", "description": "<strong>Trained on uncensored assistant data and instruction data</strong><br><ul><li>Instruction based<li>Cannot be used commercially</ul>",
"url": "https://huggingface.co/TheBloke/WizardLM-13B-Uncensored-GGML/resolve/main/wizardLM-13B-Uncensored.ggmlv3.q4_0.bin" "url": "https://huggingface.co/TheBloke/WizardLM-13B-Uncensored-GGML/resolve/main/wizardLM-13B-Uncensored.ggmlv3.q4_0.bin"
}, },
{ {
"order": "s", "order": "r",
"md5sum": "615890cb571fcaa0f70b2f8d15ef809e", "md5sum": "615890cb571fcaa0f70b2f8d15ef809e",
"disableGUI": "true", "disableGUI": "true",
"name": "Replit", "name": "Replit",
@ -255,23 +231,7 @@
"parameters": "3 billion", "parameters": "3 billion",
"quant": "f16", "quant": "f16",
"type": "Replit", "type": "Replit",
"systemPrompt": " ",
"description": "<strong>Trained on subset of the Stack</strong><br><ul><li>Code completion based<li>Licensed for commercial use</ul>", "description": "<strong>Trained on subset of the Stack</strong><br><ul><li>Code completion based<li>Licensed for commercial use</ul>",
"url": "https://huggingface.co/nomic-ai/ggml-replit-code-v1-3b/resolve/main/ggml-replit-code-v1-3b.bin" "url": "https://huggingface.co/nomic-ai/ggml-replit-code-v1-3b/resolve/main/ggml-replit-code-v1-3b.bin"
},
{
"order": "t",
"md5sum": "031bb5d5722c08d13e3e8eaf55c37391",
"disableGUI": "true",
"name": "Bert",
"filename": "ggml-all-MiniLM-L6-v2-f16.bin",
"filesize": "45521167",
"requires": "2.4.14",
"ramrequired": "1",
"parameters": "1 million",
"quant": "f16",
"type": "Bert",
"systemPrompt": " ",
"description": "<strong>Sbert</strong><br><ul><li>For embeddings"
} }
] ]
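The reshuffled list above is easier to follow field by field: every entry carries ordering and identity (order, md5sum, name, filename, filesize), gating (requires, the minimum app version; ramrequired), model metadata (parameters, quant, type), UI text (description), and optional url, promptTemplate, and systemPrompt (where a single space means "no system prompt"). A rough Qt sketch of reading those fields, assuming only the shape shown above (the struct is illustrative, not the app's ModelInfo):

```cpp
#include <QJsonArray>
#include <QJsonDocument>
#include <QJsonObject>
#include <QString>
#include <QVector>

struct ModelEntry { // illustrative stand-in for the fields used above
    QString name, filename, md5sum, requiresVersion, url, promptTemplate, systemPrompt;
    qint64 filesize = 0;
    int ramrequired = 0;
};

QVector<ModelEntry> parseModelsJson(const QByteArray &jsonData)
{
    QVector<ModelEntry> out;
    const QJsonArray array = QJsonDocument::fromJson(jsonData).array();
    for (const auto &value : array) {
        const QJsonObject obj = value.toObject();
        ModelEntry e;
        e.name            = obj["name"].toString();
        e.filename        = obj["filename"].toString();
        e.md5sum          = obj["md5sum"].toString();
        e.requiresVersion = obj["requires"].toString();     // minimum app version that can load it
        e.url             = obj["url"].toString();          // optional download override
        e.promptTemplate  = obj["promptTemplate"].toString();
        e.systemPrompt    = obj["systemPrompt"].toString(); // " " (one space) means: no system prompt
        e.filesize        = obj["filesize"].toString().toLongLong(); // stored as strings in the JSON
        e.ramrequired     = obj["ramrequired"].toString().toInt();
        out.append(e);
    }
    return out;
}
```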

View File

@ -416,40 +416,6 @@
* Akarshan Biswas * Akarshan Biswas
* Adam Treat (Nomic AI) * Adam Treat (Nomic AI)
* Community (beta testers, bug reporters) * Community (beta testers, bug reporters)
"
},
{
"version": "2.4.13",
"notes":
"
* Fix bug with prolonging shutdown with generation
* Fix bug with update model info on deleting chats
* Fix bug with preventing closing of model download dialog
* Always allow closing the model download dialog
* Fix numerous bugs with download of models.json and provide backup option
* Add json and c# highlighting
* Fix bug with chatgpt crashing
* Fix bug with chatgpt not working for some keys
* Fix bug with mixpanel opt outs not counting
* Fix problem with OOM errors causing crash and then repeating on next start
* Fix default thread setting and provide guardrails
* Fix tap handler in settings dialog for buttons
* Fix color of some text fields on macOS for settings dialog
* Fix problem with startup dialog not closing
* Provide error dialog for settings file not accessible
* Try and fix problems with avx-only detection
* Fix showing error in model downloads unnecessarily
* Prefer 7b models to load by default
* Add Wizard v1.1 to download list
* Rename Orca models to Mini Orca
* Don't use a system prompt by default unless the model was trained with one
",
"contributors":
"
* Lakshay Kansal (Nomic AI)
* Aaron Miller (Nomic AI)
* Adam Treat (Nomic AI)
* Community (beta testers, bug reporters)
" "
} }
] ]

View File

@ -161,6 +161,16 @@ int InstalledModels::count() const
return rowCount(); return rowCount();
} }
QString InstalledModels::firstId() const
{
if (rowCount() > 0) {
QModelIndex firstIndex = index(0, 0);
return sourceModel()->data(firstIndex, ModelList::IdRole).toString();
} else {
return QString();
}
}
DownloadableModels::DownloadableModels(QObject *parent) DownloadableModels::DownloadableModels(QObject *parent)
: QSortFilterProxyModel(parent) : QSortFilterProxyModel(parent)
, m_expanded(false) , m_expanded(false)
@ -212,7 +222,6 @@ ModelList::ModelList()
: QAbstractListModel(nullptr) : QAbstractListModel(nullptr)
, m_installedModels(new InstalledModels(this)) , m_installedModels(new InstalledModels(this))
, m_downloadableModels(new DownloadableModels(this)) , m_downloadableModels(new DownloadableModels(this))
, m_asyncModelRequestOngoing(false)
{ {
m_installedModels->setSourceModel(this); m_installedModels->setSourceModel(this);
m_downloadableModels->setSourceModel(this); m_downloadableModels->setSourceModel(this);
@ -288,9 +297,12 @@ ModelInfo ModelList::defaultModelInfo() const
settings.sync(); settings.sync();
// The user default model can be set by the user in the settings dialog. The "default" user // The user default model can be set by the user in the settings dialog. The "default" user
// default model is "Application default" which signals we should use the logic here. // default model is "Application default" which signals we should use the default model that was
// specified by the models.json file.
const QString userDefaultModelName = MySettings::globalInstance()->userDefaultModel(); const QString userDefaultModelName = MySettings::globalInstance()->userDefaultModel();
const bool hasUserDefaultName = !userDefaultModelName.isEmpty() && userDefaultModelName != "Application default"; const bool hasUserDefaultName = !userDefaultModelName.isEmpty() && userDefaultModelName != "Application default";
const QString defaultModelName = settings.value("defaultModel").toString();
const bool hasDefaultName = hasUserDefaultName ? false : !defaultModelName.isEmpty();
ModelInfo *defaultModel = nullptr; ModelInfo *defaultModel = nullptr;
for (ModelInfo *info : m_models) { for (ModelInfo *info : m_models) {
@ -298,10 +310,12 @@ ModelInfo ModelList::defaultModelInfo() const
continue; continue;
defaultModel = info; defaultModel = info;
const size_t ramrequired = defaultModel->ramrequired; // If we don't have either setting, then just use the first model that is installed
if (!hasUserDefaultName && !hasDefaultName)
break;
// If we don't have either setting, then just use the first installed model that requires less than 16GB // If we don't have a user specified default, but *do* have a default setting and match, then use it
if (!hasUserDefaultName && !info->isChatGPT && ramrequired > 0 && ramrequired < 16) if (!hasUserDefaultName && hasDefaultName && (defaultModel->id() == defaultModelName))
break; break;
// If we have a user specified default and match, then use it // If we have a user specified default and match, then use it
@ -821,7 +835,7 @@ void ModelList::updateModelsFromDirectory()
for (const QString &id : modelsById) { for (const QString &id : modelsById) {
updateData(id, FilenameRole, filename); updateData(id, FilenameRole, filename);
updateData(id, ChatGPTRole, filename.startsWith("chatgpt-")); updateData(id, ChatGPTRole, filename.startsWith("chatgpt-"));
updateData(id, DirpathRole, info.dir().absolutePath() + "/"); updateData(id, DirpathRole, path);
updateData(id, FilesizeRole, toFileSize(info.size())); updateData(id, FilesizeRole, toFileSize(info.size()));
} }
} }
@ -832,6 +846,14 @@ void ModelList::updateModelsFromDirectory()
processDirectory(exePath); processDirectory(exePath);
if (localPath != exePath) if (localPath != exePath)
processDirectory(localPath); processDirectory(localPath);
if (installedModels()->count()) {
const QString firstModel =
installedModels()->firstId();
QSettings settings;
settings.setValue("defaultModel", firstModel);
settings.sync();
}
} }
void ModelList::updateModelsFromJson() void ModelList::updateModelsFromJson()
@ -877,9 +899,6 @@ void ModelList::updateModelsFromJson()
void ModelList::updateModelsFromJsonAsync() void ModelList::updateModelsFromJsonAsync()
{ {
m_asyncModelRequestOngoing = true;
emit asyncModelRequestOngoingChanged();
#if defined(USE_LOCAL_MODELSJSON) #if defined(USE_LOCAL_MODELSJSON)
QUrl jsonUrl("file://" + QDir::homePath() + "/dev/large_language_models/gpt4all/gpt4all-chat/metadata/models.json"); QUrl jsonUrl("file://" + QDir::homePath() + "/dev/large_language_models/gpt4all/gpt4all-chat/metadata/models.json");
#else #else
@ -892,37 +911,17 @@ void ModelList::updateModelsFromJsonAsync()
QNetworkReply *jsonReply = m_networkManager.get(request); QNetworkReply *jsonReply = m_networkManager.get(request);
connect(qApp, &QCoreApplication::aboutToQuit, jsonReply, &QNetworkReply::abort); connect(qApp, &QCoreApplication::aboutToQuit, jsonReply, &QNetworkReply::abort);
connect(jsonReply, &QNetworkReply::finished, this, &ModelList::handleModelsJsonDownloadFinished); connect(jsonReply, &QNetworkReply::finished, this, &ModelList::handleModelsJsonDownloadFinished);
connect(jsonReply, &QNetworkReply::errorOccurred, this, &ModelList::handleModelsJsonDownloadErrorOccurred);
} }
void ModelList::handleModelsJsonDownloadFinished() void ModelList::handleModelsJsonDownloadFinished()
{ {
QNetworkReply *jsonReply = qobject_cast<QNetworkReply *>(sender()); QNetworkReply *jsonReply = qobject_cast<QNetworkReply *>(sender());
if (!jsonReply) { if (!jsonReply)
m_asyncModelRequestOngoing = false;
emit asyncModelRequestOngoingChanged();
return; return;
}
QByteArray jsonData = jsonReply->readAll(); QByteArray jsonData = jsonReply->readAll();
jsonReply->deleteLater(); jsonReply->deleteLater();
parseModelsJsonFile(jsonData, true); parseModelsJsonFile(jsonData, true);
m_asyncModelRequestOngoing = false;
emit asyncModelRequestOngoingChanged();
}
void ModelList::handleModelsJsonDownloadErrorOccurred(QNetworkReply::NetworkError code)
{
// TODO: Show what error occurred in the GUI
m_asyncModelRequestOngoing = false;
emit asyncModelRequestOngoingChanged();
QNetworkReply *reply = qobject_cast<QNetworkReply *>(sender());
if (!reply)
return;
qWarning() << QString("ERROR: Modellist download failed with error code \"%1-%2\"")
.arg(code).arg(reply->errorString()).toStdString();
} }
void ModelList::handleSslErrors(QNetworkReply *reply, const QList<QSslError> &errors) void ModelList::handleSslErrors(QNetworkReply *reply, const QList<QSslError> &errors)
@ -1109,6 +1108,14 @@ void ModelList::parseModelsJsonFile(const QByteArray &jsonData, bool save)
updateData(id, ModelList::QuantRole, "NA"); updateData(id, ModelList::QuantRole, "NA");
updateData(id, ModelList::TypeRole, "GPT"); updateData(id, ModelList::TypeRole, "GPT");
} }
if (installedModels()->count()) {
const QString firstModel =
installedModels()->firstId();
QSettings settings;
settings.setValue("defaultModel", firstModel);
settings.sync();
}
} }
void ModelList::updateModelsFromSettings() void ModelList::updateModelsFromSettings()
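Summing up the new selection rule in defaultModelInfo() above: an explicit user choice (anything other than "Application default") wins; otherwise the first installed, non-ChatGPT model that needs less than 16 GB of RAM is used, with the last installed model as a fallback. A condensed sketch of that rule over a simplified record (the struct is illustrative, and the user-match branch is paraphrased from the comment in the hunk):

```cpp
#include <QList>
#include <QString>

struct Candidate {       // illustrative stand-in for ModelInfo
    QString name;
    bool installed = false;
    bool isChatGPT = false;
    int ramrequired = 0; // in GB; 0 or less means unknown
};

// Pick the default chat model: honor an explicit user choice, otherwise prefer a small local model.
const Candidate *pickDefault(const QList<Candidate> &models, const QString &userDefaultName)
{
    const bool hasUserDefault = !userDefaultName.isEmpty() && userDefaultName != "Application default";
    const Candidate *picked = nullptr;
    for (const Candidate &m : models) {
        if (!m.installed)
            continue;
        picked = &m; // remember the latest installed model as a fallback
        // No explicit choice: stop at the first local model that fits in under 16 GB of RAM.
        if (!hasUserDefault && !m.isChatGPT && m.ramrequired > 0 && m.ramrequired < 16)
            break;
        // Explicit choice: stop when we reach it.
        if (hasUserDefault && m.name == userDefaultName)
            break;
    }
    return picked; // may be nullptr when nothing is installed
}
```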

View File

@ -127,6 +127,7 @@ class InstalledModels : public QSortFilterProxyModel
public: public:
explicit InstalledModels(QObject *parent); explicit InstalledModels(QObject *parent);
int count() const; int count() const;
QString firstId() const;
Q_SIGNALS: Q_SIGNALS:
void countChanged(); void countChanged();
@ -168,7 +169,6 @@ class ModelList : public QAbstractListModel
Q_PROPERTY(InstalledModels* installedModels READ installedModels NOTIFY installedModelsChanged) Q_PROPERTY(InstalledModels* installedModels READ installedModels NOTIFY installedModelsChanged)
Q_PROPERTY(DownloadableModels* downloadableModels READ downloadableModels NOTIFY downloadableModelsChanged) Q_PROPERTY(DownloadableModels* downloadableModels READ downloadableModels NOTIFY downloadableModelsChanged)
Q_PROPERTY(QList<QString> userDefaultModelList READ userDefaultModelList NOTIFY userDefaultModelListChanged) Q_PROPERTY(QList<QString> userDefaultModelList READ userDefaultModelList NOTIFY userDefaultModelListChanged)
Q_PROPERTY(bool asyncModelRequestOngoing READ asyncModelRequestOngoing NOTIFY asyncModelRequestOngoingChanged)
public: public:
static ModelList *globalInstance(); static ModelList *globalInstance();
@ -296,14 +296,12 @@ public:
} }
QString incompleteDownloadPath(const QString &modelFile); QString incompleteDownloadPath(const QString &modelFile);
bool asyncModelRequestOngoing() const { return m_asyncModelRequestOngoing; }
Q_SIGNALS: Q_SIGNALS:
void countChanged(); void countChanged();
void installedModelsChanged(); void installedModelsChanged();
void downloadableModelsChanged(); void downloadableModelsChanged();
void userDefaultModelListChanged(); void userDefaultModelListChanged();
void asyncModelRequestOngoingChanged();
private Q_SLOTS: private Q_SLOTS:
void updateModelsFromJson(); void updateModelsFromJson();
@ -312,7 +310,6 @@ private Q_SLOTS:
void updateModelsFromDirectory(); void updateModelsFromDirectory();
void updateDataForSettings(); void updateDataForSettings();
void handleModelsJsonDownloadFinished(); void handleModelsJsonDownloadFinished();
void handleModelsJsonDownloadErrorOccurred(QNetworkReply::NetworkError code);
void handleSslErrors(QNetworkReply *reply, const QList<QSslError> &errors); void handleSslErrors(QNetworkReply *reply, const QList<QSslError> &errors);
private: private:
@ -331,7 +328,6 @@ private:
QList<ModelInfo*> m_models; QList<ModelInfo*> m_models;
QHash<QString, ModelInfo*> m_modelMap; QHash<QString, ModelInfo*> m_modelMap;
QFileSystemWatcher *m_watcher; QFileSystemWatcher *m_watcher;
bool m_asyncModelRequestOngoing;
private: private:
explicit ModelList(); explicit ModelList();

View File

@ -41,7 +41,7 @@ MyDialog {
} }
Label { Label {
visible: !ModelList.downloadableModels.count && !ModelList.asyncModelRequestOngoing visible: !ModelList.downloadableModels.count
Layout.fillWidth: true Layout.fillWidth: true
Layout.fillHeight: true Layout.fillHeight: true
horizontalAlignment: Qt.AlignHCenter horizontalAlignment: Qt.AlignHCenter
@ -50,15 +50,6 @@ MyDialog {
color: theme.mutedTextColor color: theme.mutedTextColor
} }
MyBusyIndicator {
visible: !ModelList.downloadableModels.count && ModelList.asyncModelRequestOngoing
running: ModelList.asyncModelRequestOngoing
Accessible.role: Accessible.Animation
Layout.alignment: Qt.AlignCenter
Accessible.name: qsTr("Busy indicator")
Accessible.description: qsTr("Displayed when the models request is ongoing")
}
ScrollView { ScrollView {
id: scrollView id: scrollView
ScrollBar.vertical.policy: ScrollBar.AlwaysOn ScrollBar.vertical.policy: ScrollBar.AlwaysOn

View File

@ -18,9 +18,6 @@ enum Language {
Go, Go,
Json, Json,
Csharp, Csharp,
Latex,
Html,
Php
}; };
static QColor keywordColor = "#2e95d3"; // blue static QColor keywordColor = "#2e95d3"; // blue
@ -36,11 +33,6 @@ static QColor commandColor = functionCallColor;
static QColor variableColor = numberColor; static QColor variableColor = numberColor;
static QColor keyColor = functionColor; static QColor keyColor = functionColor;
static QColor valueColor = stringColor; static QColor valueColor = stringColor;
static QColor parameterColor = stringColor;
static QColor attributeNameColor = numberColor;
static QColor attributeValueColor = stringColor;
static QColor specialCharacterColor = functionColor;
static QColor doctypeColor = commentColor;
static Language stringToLanguage(const QString &language) static Language stringToLanguage(const QString &language)
{ {
@ -70,12 +62,6 @@ static Language stringToLanguage(const QString &language)
return Go; return Go;
if (language == "json") if (language == "json")
return Json; return Json;
if (language == "latex")
return Latex;
if (language == "html")
return Html;
if (language == "php")
return Php;
return None; return None;
} }
@ -575,135 +561,6 @@ static QVector<HighlightingRule> bashHighlightingRules()
return highlightingRules; return highlightingRules;
} }
static QVector<HighlightingRule> latexHighlightingRules()
{
static QVector<HighlightingRule> highlightingRules;
if (highlightingRules.isEmpty()) {
HighlightingRule rule;
QTextCharFormat commandFormat;
commandFormat.setForeground(commandColor); // commandColor needs to be set to your liking
rule.pattern = QRegularExpression("\\\\[A-Za-z]+"); // Pattern for LaTeX commands
rule.format = commandFormat;
highlightingRules.append(rule);
QTextCharFormat commentFormat;
commentFormat.setForeground(commentColor); // commentColor needs to be set to your liking
rule.pattern = QRegularExpression("%[^\n]*"); // Pattern for LaTeX comments
rule.format = commentFormat;
highlightingRules.append(rule);
}
return highlightingRules;
}
static QVector<HighlightingRule> htmlHighlightingRules()
{
static QVector<HighlightingRule> highlightingRules;
if (highlightingRules.isEmpty()) {
HighlightingRule rule;
QTextCharFormat attributeNameFormat;
attributeNameFormat.setForeground(attributeNameColor);
rule.pattern = QRegularExpression("\\b(\\w+)\\s*=");
rule.format = attributeNameFormat;
highlightingRules.append(rule);
QTextCharFormat attributeValueFormat;
attributeValueFormat.setForeground(attributeValueColor);
rule.pattern = QRegularExpression("\".*?\"|'.*?'");
rule.format = attributeValueFormat;
highlightingRules.append(rule);
QTextCharFormat commentFormat;
commentFormat.setForeground(commentColor);
rule.pattern = QRegularExpression("<!--.*?-->");
rule.format = commentFormat;
highlightingRules.append(rule);
QTextCharFormat specialCharacterFormat;
specialCharacterFormat.setForeground(specialCharacterColor);
rule.pattern = QRegularExpression("&[a-zA-Z0-9#]*;");
rule.format = specialCharacterFormat;
highlightingRules.append(rule);
QTextCharFormat doctypeFormat;
doctypeFormat.setForeground(doctypeColor);
rule.pattern = QRegularExpression("<!DOCTYPE.*?>");
rule.format = doctypeFormat;
highlightingRules.append(rule);
}
return highlightingRules;
}
static QVector<HighlightingRule> phpHighlightingRules()
{
static QVector<HighlightingRule> highlightingRules;
if (highlightingRules.isEmpty()) {
HighlightingRule rule;
QTextCharFormat functionCallFormat;
functionCallFormat.setForeground(functionCallColor);
rule.pattern = QRegularExpression("\\b(\\w+)\\s*(?=\\()");
rule.format = functionCallFormat;
highlightingRules.append(rule);
QTextCharFormat functionFormat;
functionFormat.setForeground(functionColor);
rule.pattern = QRegularExpression("\\bfunction\\s+(\\w+)\\b");
rule.format = functionFormat;
highlightingRules.append(rule);
QTextCharFormat numberFormat;
numberFormat.setForeground(numberColor);
rule.pattern = QRegularExpression("\\b[0-9]*\\.?[0-9]+\\b");
rule.format = numberFormat;
highlightingRules.append(rule);
QTextCharFormat keywordFormat;
keywordFormat.setForeground(keywordColor);
QStringList keywordPatterns = {
"\\bif\\b", "\\belse\\b", "\\belseif\\b", "\\bwhile\\b", "\\bfor\\b",
"\\bforeach\\b", "\\breturn\\b", "\\bprint\\b", "\\binclude\\b", "\\brequire\\b",
"\\binclude_once\\b", "\\brequire_once\\b", "\\btry\\b", "\\bcatch\\b",
"\\bfinally\\b", "\\bcontinue\\b", "\\bbreak\\b", "\\bclass\\b", "\\bfunction\\b",
"\\bnew\\b", "\\bthrow\\b", "\\barray\\b", "\\bpublic\\b", "\\bprivate\\b",
"\\bprotected\\b", "\\bstatic\\b", "\\bglobal\\b", "\\bisset\\b", "\\bunset\\b",
"\\bnull\\b", "\\btrue\\b", "\\bfalse\\b"
};
for (const QString &pattern : keywordPatterns) {
rule.pattern = QRegularExpression(pattern);
rule.format = keywordFormat;
highlightingRules.append(rule);
}
QTextCharFormat stringFormat;
stringFormat.setForeground(stringColor);
rule.pattern = QRegularExpression("\".*?\"");
rule.format = stringFormat;
highlightingRules.append(rule);
rule.pattern = QRegularExpression("\'.*?\'");
rule.format = stringFormat;
highlightingRules.append(rule);
QTextCharFormat commentFormat;
commentFormat.setForeground(commentColor);
rule.pattern = QRegularExpression("//[^\n]*");
rule.format = commentFormat;
highlightingRules.append(rule);
rule.pattern = QRegularExpression("/\\*.*?\\*/");
rule.format = commentFormat;
highlightingRules.append(rule);
}
return highlightingRules;
}
static QVector<HighlightingRule> jsonHighlightingRules() static QVector<HighlightingRule> jsonHighlightingRules()
{ {
static QVector<HighlightingRule> highlightingRules; static QVector<HighlightingRule> highlightingRules;
@ -759,12 +616,6 @@ void SyntaxHighlighter::highlightBlock(const QString &text)
rules = javaHighlightingRules(); rules = javaHighlightingRules();
else if (block.userState() == Json) else if (block.userState() == Json)
rules = jsonHighlightingRules(); rules = jsonHighlightingRules();
else if (block.userState() == Latex)
rules = latexHighlightingRules();
else if (block.userState() == Html)
rules = htmlHighlightingRules();
else if (block.userState() == Php)
rules = phpHighlightingRules();
for (const HighlightingRule &rule : qAsConst(rules)) { for (const HighlightingRule &rule : qAsConst(rules)) {
QRegularExpressionMatchIterator matchIterator = rule.pattern.globalMatch(text); QRegularExpressionMatchIterator matchIterator = rule.pattern.globalMatch(text);
@ -970,10 +821,7 @@ void ResponseText::handleCodeBlocks()
|| firstWord == "java" || firstWord == "java"
|| firstWord == "go" || firstWord == "go"
|| firstWord == "golang" || firstWord == "golang"
|| firstWord == "json" || firstWord == "json") {
|| firstWord == "latex"
|| firstWord == "html"
|| firstWord == "php") {
codeLanguage = firstWord; codeLanguage = firstWord;
capturedText.remove(0, match.captured(0).length()); capturedText.remove(0, match.captured(0).length());
} }
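Each of the rule sets above follows the same pattern: a static QVector of rules built once, where every rule pairs a QRegularExpression with a QTextCharFormat, and highlightBlock() selects the vector by the block's language state. A minimal sketch of that pattern for a made-up language (the struct mirrors how the rules above are used; the colors are illustrative, not the app's palette):

```cpp
#include <QColor>
#include <QRegularExpression>
#include <QTextCharFormat>
#include <QVector>

struct HighlightingRule { // mirrors the rule shape used by the highlighter above
    QRegularExpression pattern;
    QTextCharFormat format;
};

static QVector<HighlightingRule> iniHighlightingRules() // hypothetical language, for illustration
{
    static QVector<HighlightingRule> rules;
    if (rules.isEmpty()) {
        HighlightingRule rule;

        QTextCharFormat sectionFormat;
        sectionFormat.setForeground(QColor("#2e95d3")); // illustrative keyword blue
        rule.pattern = QRegularExpression("^\\[[^\\]]+\\]"); // [section] headers
        rule.format = sectionFormat;
        rules.append(rule);

        QTextCharFormat commentFormat;
        commentFormat.setForeground(QColor("#808080")); // illustrative comment gray
        rule.pattern = QRegularExpression(";[^\n]*"); // ; end-of-line comments
        rule.format = commentFormat;
        rules.append(rule);
    }
    return rules;
}
```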

View File

@ -1,49 +0,0 @@
{
"train_batch_size": "auto",
"gradient_accumulation_steps": "auto",
"train_micro_batch_size_per_gpu": "auto",
"fp16": {
"enabled": "auto",
"min_loss_scale": 1,
"loss_scale_window": 1000,
"hysteresis": 2,
"initial_scale_power": 32
},
"bf16": {
"enabled": "auto"
},
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": 1,
"offload_param": {
"device": "none"
},
"offload_optimizer": {
"device": "none"
},
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"contiguous_gradients": true
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": [
0.9,
0.999
],
"eps": 1e-08
}
},
"scheduler": {
"type": "WarmupDecayLR",
"params": {
"warmup_min_lr": 0,
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"warmup_type": "linear",
"total_num_steps": "auto"
}
}
}

View File

@ -1,48 +0,0 @@
{
"train_batch_size": "auto",
"gradient_accumulation_steps": "auto",
"train_micro_batch_size_per_gpu": "auto",
"fp16": {
"enabled": "auto",
"min_loss_scale": 1,
"loss_scale_window": 1000,
"hysteresis": 2,
"initial_scale_power": 32
},
"bf16": {
"enabled": "auto"
},
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": 2,
"offload_param": {
"device": "none"
},
"offload_optimizer": {
"device": "none"
},
"allgather_partitions": true,
"allgather_bucket_size": 5e8,
"contiguous_gradients": true
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": [
0.9,
0.999
],
"eps": 1e-08
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": 0,
"warmup_max_lr": "auto",
"warmup_num_steps": "auto",
"warmup_type": "linear"
}
}
}

View File

@ -1,34 +0,0 @@
# model/tokenizer
model_name: "tiiuae/falcon-7b"
tokenizer_name: "tiiuae/falcon-7b"
gradient_checkpointing: true
save_name: "nomic-ai/gpt4all-falcon"
# dataset
streaming: false
num_proc: 64
dataset_path: "nomic-ai/gpt4all-j-prompt-generations"
revision: "v1.3-groovy"
max_length: 1024
batch_size: 32
# train dynamics
lr: 2.0e-5
min_lr: 0
weight_decay: 0.0
eval_every: 500
eval_steps: 105
save_every: 1000
log_grads_every: 500
output_dir: "ckpts/falcon"
checkpoint: "/home/paperspace/gpt4all/ckpts/mpt/step_1000"
lora: false
warmup_steps: 500
num_epochs: 2
# logging
wandb: true
wandb_entity: "gpt4all"
wandb_project_name: "gpt4all"
seed: 42

View File

@ -1,34 +0,0 @@
# model/tokenizer
model_name: "mosaicml/mpt-7b"
tokenizer_name: "mosaicml/mpt-7b"
gradient_checkpointing: false
save_name: "nomic-ai/mpt-finetuned-round2"
# dataset
streaming: false
num_proc: 64
dataset_path: "nomic-ai/gpt4all-j-prompt-generations"
revision: "v1.3-groovy"
max_length: 1024
batch_size: 8
# train dynamics
lr: 2.0e-5
min_lr: 0
weight_decay: 0.0
eval_every: 500
eval_steps: 105
save_every: 1000
log_grads_every: 500
output_dir: "ckpts/mpt"
checkpoint: null
lora: false
warmup_steps: 500
num_epochs: 2
# logging
wandb: false
wandb_entity: "gpt4all"
wandb_project_name: "gpt4all"
seed: 42

View File

@ -1,34 +0,0 @@
# model/tokenizer
model_name: "openlm-research/open_llama_7b"
tokenizer_name: "openlm-research/open_llama_7b"
gradient_checkpointing: true
save_name: "nomic-ai/gpt4all-openllama"
# dataset
streaming: false
num_proc: 64
dataset_path: "nomic-ai/gpt4all-updated"
revision: null
max_length: 1024
batch_size: 32
# train dynamics
lr: 2.0e-5
min_lr: 0
weight_decay: 0.0
eval_every: 500
log_every: 10
save_every: 1000
log_grads_every: 500
output_dir: "ckpts/falcon"
checkpoint: null
lora: false
warmup_steps: 500
num_epochs: 3
# logging
wandb: true
wandb_entity: "gpt4all"
wandb_project_name: "gpt4all"
seed: 42

View File

@ -12,7 +12,7 @@ def tokenize_inputs(config, tokenizer, examples):
     # hacky backward compatible
     different_eos = tokenizer.eos_token != "</s>"
-    out = {"labels": [], "input_ids": [], "attention_mask": []}
+    out = {"labels": [], "input_ids": []}
     for prompt, response in zip(examples["prompt"], examples["response"]):
         if different_eos:
             if response.count("</s> \n") > 0:
@ -49,10 +49,9 @@ def tokenize_inputs(config, tokenizer, examples):
             print(response)
             raise
-        padded = tokenizer.pad({"input_ids": input_tokens}, padding="max_length", max_length=max_length, return_tensors="pt")
+        input_tokens = tokenizer.pad({"input_ids": input_tokens}, padding="max_length", max_length=max_length)["input_ids"]
         out["labels"].append(labels)
-        out["input_ids"].append(padded["input_ids"])
-        out["attention_mask"].append(padded["attention_mask"])
+        out["input_ids"].append(input_tokens)
     out = {k: torch.stack(v) if isinstance(v, list) else v for k, v in out.items()}
@ -73,7 +72,7 @@ def load_data(config, tokenizer):
         dataset = load_dataset("json", data_files=files, split="train")
     else:
-        dataset = load_dataset(dataset_path, split="train", revision=config["revision"] if "revision" in config else None)
+        dataset = load_dataset(dataset_path, split="train")
     dataset = dataset.train_test_split(test_size=.05, seed=config["seed"])
@ -84,23 +83,19 @@ def load_data(config, tokenizer):
     else:
         kwargs = {}
-    cols_to_keep = ["input_ids", "labels", "attention_mask"]
     # tokenize inputs and return labels and attention mask
     train_dataset = train_dataset.map(
         lambda ele: tokenize_inputs(config, tokenizer, ele),
         batched=True,
+        remove_columns=["source", "prompt"],
         **kwargs
     )
-    remove_cols = [col for col in train_dataset.column_names if col not in cols_to_keep]
-    train_dataset = train_dataset.remove_columns(remove_cols)
     val_dataset = val_dataset.map(
         lambda ele: tokenize_inputs(config, tokenizer, ele),
         batched=True,
+        remove_columns=["source", "prompt"],
         **kwargs
     )
-    remove_cols = [col for col in val_dataset.column_names if col not in cols_to_keep]
-    val_dataset = val_dataset.remove_columns(remove_cols)
     train_dataset = train_dataset.with_format("torch")
     val_dataset = val_dataset.with_format("torch")
@ -111,14 +106,12 @@ def load_data(config, tokenizer):
         train_dataset,
         collate_fn=DefaultDataCollator(),
         batch_size=config["batch_size"],
-        shuffle=True,
     )
     val_dataloader = DataLoader(
         val_dataset,
         collate_fn=DefaultDataCollator(),
         batch_size=config["batch_size"],
-        shuffle=True,
     )
     return train_dataloader, val_dataloader
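
The substantive change in tokenize_inputs above is whether tokenizer.pad keeps an attention mask next to the padded ids or returns only input_ids. A small sketch of both calls, using gpt2 purely as a stand-in tokenizer; any causal-LM tokenizer with a pad token behaves the same way:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # example tokenizer only
tokenizer.pad_token = tokenizer.eos_token

batch_ids = tokenizer(["hello world", "short"])["input_ids"]

# One side of the diff: keep the attention mask alongside the padded ids.
padded = tokenizer.pad({"input_ids": batch_ids}, padding="max_length",
                       max_length=16, return_tensors="pt")
print(padded["input_ids"].shape, padded["attention_mask"].shape)  # (2, 16) each

# Other side: keep only the padded ids and drop the mask.
ids_only = tokenizer.pad({"input_ids": batch_ids}, padding="max_length",
                         max_length=16)["input_ids"]
print(len(ids_only), len(ids_only[0]))  # 2 16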

View File

@ -1,10 +1,10 @@
 accelerate
 datasets
-einops
 torchmetrics
 evaluate
 transformers>=4.28.0
 wandb
+pip
 peft
 nodelist-inflator
 deepspeed

View File

@ -1,5 +1,5 @@
 import os
-from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler
+from transformers import AutoModelForCausalLM, AutoTokenizer, get_scheduler, LlamaForCausalLM
 import torch
 from torch.optim import AdamW
 from argparse import ArgumentParser
@ -42,7 +42,7 @@ def train(accelerator, config):
     accelerator.print(config)
     accelerator.print(f"Using {accelerator.num_processes} GPUs")
-    tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_name'], model_max_length=config['max_length'], use_fast=False)
+    tokenizer = AutoTokenizer.from_pretrained(config['tokenizer_name'], model_max_length=config['max_length'])
     # if no pad token, set it to eos
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
@ -53,7 +53,6 @@ def train(accelerator, config):
     checkpoint = config["gradient_checkpointing"]
     model = AutoModelForCausalLM.from_pretrained(config["model_name"],
                                                  use_cache=False if checkpoint else True,
                                                  trust_remote_code=True)
@ -87,7 +86,7 @@ def train(accelerator, config):
     # decay to min_lr instead of 0
     lr_ratio = config["min_lr"] / config["lr"]
     accelerator.print(f"Len of train_dataloader: {len(train_dataloader)}")
-    total_num_steps = (len(train_dataloader) / gradient_accumulation_steps) * (config["num_epochs"])
+    total_num_steps = (len(train_dataloader) / gradient_accumulation_steps) * config["num_epochs"]
     # instead of decaying to zero, decay to ratio of min_lr / lr
     total_num_steps += int(total_num_steps * lr_ratio) + config["warmup_steps"]
     accelerator.print(f"Total training steps: {total_num_steps}")
@ -105,7 +104,7 @@ def train(accelerator, config):
         )
     else:
         scheduler = DummyScheduler(
-            optimizer, total_num_steps=total_num_steps, warmup_num_steps=config["warmup_steps"]
+            optimizer, total_num_steps=config["warmup_steps"], warmup_num_steps=config["warmup_steps"]
         )
     model, optimizer, train_dataloader, val_dataloader, scheduler = accelerator.prepare(
@ -118,34 +117,26 @@ def train(accelerator, config):
     if config["checkpoint"]:
         accelerator.load_state(config["checkpoint"])
         accelerator.print(f"Resumed from checkpoint: {config['checkpoint']}")
-        path = os.path.basename(config["checkpoint"])
+        path = os.path.basename(config["train_args"]["resume_from_checkpoint"])
         training_difference = os.path.splitext(path)[0]
         resume_step = int(training_difference.replace("step_", ""))
-        train_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
+        accelerator.skip_first_batches(train_dataloader, resume_step)
         accelerator.print(f"Resuming from step {resume_step}")
-    else:
-        resume_step = 0
     # log gradients
     if accelerator.is_main_process and config["wandb"]:
         wandb.watch(model, log_freq=config["log_grads_every"], log="all")
-    accelerator.wait_for_everyone()
-    for epoch in range(0, config["num_epochs"]):
+    for epoch in range(config["num_epochs"]):
         train_loss = MeanMetric(nan_strategy="error").to(model.device)
         for step, batch in enumerate(tqdm(train_dataloader)):
-            curr_step = epoch * len(train_dataloader) + step
             model.train()
             outputs = model(**batch)
             loss = outputs.loss
             # gather loss before backprop in case of gradient accumulation
             loss_values = accelerator.gather_for_metrics({"loss": loss.detach().float()})
-            if config["wandb"]:
-                accelerator.log({"loss": torch.mean(loss_values["loss"]).item()}, step=curr_step)
             train_loss.update(loss_values["loss"])
             loss = loss / gradient_accumulation_steps
@ -153,8 +144,9 @@ def train(accelerator, config):
             # get gradient norm of all params
             # log LR in case something weird happens
-            if step > 0 and step % (config["log_lr_every"]) == 0:
+            if step > 0 and step % (config["eval_every"] // 10) == 0:
                 if config["wandb"]:
+                    curr_step = step + epoch * len(train_dataloader)
                     accelerator.log({"lr": scheduler.get_last_lr()[0]}, step=curr_step)
             if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
@ -164,6 +156,7 @@ def train(accelerator, config):
             if step > 0 and step % config["save_every"] == 0:
+                curr_step = step + epoch * len(train_dataloader)
                 accelerator.save_state(f"{config['output_dir']}/step_{curr_step}")
             if step > 0 and (step % config["eval_every"] == 0 or step == len(train_dataloader) - 1):
@ -177,6 +170,7 @@ def train(accelerator, config):
                 }
                 if config["wandb"]:
+                    curr_step = step + epoch * len(train_dataloader)
                     accelerator.log({**log_train, **log_val}, step=curr_step)
                 accelerator.print(f"Current LR: {scheduler.get_last_lr()[0]}")
@ -187,14 +181,8 @@ def train(accelerator, config):
         accelerator.print(f"Epoch {epoch} finished")
         accelerator.print(f"Pushing to HF hub")
-        accelerator.wait_for_everyone()
         unwrapped_model = accelerator.unwrap_model(model)
-        unwrapped_model.save_pretrained(
-            f"{config['output_dir']}/epoch_{epoch}",
-            is_main_process=accelerator.is_main_process,
-            save_function=accelerator.save,
-            state_dict=accelerator.get_state_dict(model),
-        )
         try:
             if accelerator.is_main_process:
                 unwrapped_model.push_to_hub(config["save_name"] + f"-epoch_{epoch}", private=True)
@ -203,17 +191,22 @@ def train(accelerator, config):
             accelerator.print(e)
             accelerator.print(f"Failed to push to hub")
-        if config["num_epochs"] > 1:
-            accelerator.wait_for_everyone()
-            unwrapped_model = accelerator.unwrap_model(model)
-            unwrapped_model.save_pretrained(
-                f"{config['output_dir']}/final",
-                is_main_process=accelerator.is_main_process,
-                save_function=accelerator.save,
-                state_dict=accelerator.get_state_dict(model),
-            )
+        unwrapped_model.save_pretrained(
+            f"{config['output_dir']}/epoch_{epoch}",
+            is_main_process=accelerator.is_main_process,
+            save_function=accelerator.save,
+            state_dict=accelerator.get_state_dict(model),
+        )
+    accelerator.wait_for_everyone()
+    unwrapped_model = accelerator.unwrap_model(model)
+    unwrapped_model.save_pretrained(
+        f"{config['output_dir']}/final",
+        is_main_process=accelerator.is_main_process,
+        save_function=accelerator.save,
+        state_dict=accelerator.get_state_dict(model),
+    )
     accelerator.end_training()
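
One detail in the resume logic diffed above: Accelerator.skip_first_batches returns a new dataloader, so the call only takes effect if its result is reassigned, as on the "-" side; invoking it without keeping the return value leaves the original dataloader unchanged. A minimal sketch with a toy dataloader and an illustrative checkpoint name:

import os
import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

accelerator = Accelerator()
train_dataloader = accelerator.prepare(
    DataLoader(TensorDataset(torch.arange(100)), batch_size=4)
)

checkpoint = "ckpts/mpt/step_10"  # illustrative checkpoint directory name
# accelerator.load_state(checkpoint)  # would restore model/optimizer state here
resume_step = int(os.path.splitext(os.path.basename(checkpoint))[0].replace("step_", ""))

# skip_first_batches returns a new dataloader; reassign it so batches are actually skipped.
train_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
print(len(list(train_dataloader)))  # 15 of the original 25 batches remain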