Compare commits

530 Commits

Author SHA1 Message Date
Jared Van Bortel
b666d16db5
ci: update path-filtering orb to 1.3.0 (#3588)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-05-27 15:46:52 -04:00
Jared Van Bortel
cd70db29ed
readme: add Windows ARM download link
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-24 19:51:59 -05:00
Jared Van Bortel
fb72ba1ff5 chat: bump version to 3.10.1-dev0
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-24 19:44:45 -05:00
Jared Van Bortel
b968d45c11
chat: release version 3.10.0 (#3515)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-24 19:41:13 -05:00
Jared Van Bortel
228d5379cf
chat: cut v3.10.0 release (#3511)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-24 17:15:34 -05:00
Jared Van Bortel
dd820ef7c4
Italian and draft Simplified Chinese translations for v3.10.0 (#3514)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-24 17:14:10 -05:00
Jared Van Bortel
a7cbc8c3fd
Run lupdate before v3.10.0 release (#3512)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-24 15:33:27 -05:00
AT
4d171835ac
Add new remote model provider view. (#3506)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: AT <manyoso@users.noreply.github.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2025-02-24 14:59:53 -05:00
Lil Bob
0c28ee7059
Translations: Improve Chinese translation (#3467)
Signed-off-by: Junior2Ran <hdr01@126.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2025-02-20 20:44:28 -05:00
Jared Van Bortel
96aeb44210
backend: build with CUDA compute 5.0 support by default (#3499)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-19 11:27:06 -05:00
Jared Van Bortel
29f29773af
chat: require Qt 6.8 and fix #includes (#3498)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-18 13:59:50 -05:00
Jared Van Bortel
d8c04cead8
ci: use LLVM Clang 19 on macOS and Ubuntu (#3500)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-18 12:02:14 -05:00
Riccardo Giovanetti
b1cb46ec2a
Italian localization update (#3496)
Signed-off-by: Riccardo Giovanetti <riccardo.giovanetti@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2025-02-18 11:47:39 -05:00
Jared Van Bortel
b83d06e67f translations: run lupdate -no-obsolete on Simplified Chinese
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-13 11:27:04 -05:00
Jared Van Bortel
7aa339cf40 translations: run lupdate
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-13 11:26:28 -05:00
ThiloteE
1b84182030
Add replacement templates for OLMoE and granite-3.1 (#3471)
Signed-off-by: ThiloteE <73715071+ThiloteE@users.noreply.github.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2025-02-12 14:23:46 -05:00
ThiloteE
02e12089d3
Add Granite arch to model whitelist (#3487)
Signed-off-by: ThiloteE <73715071+ThiloteE@users.noreply.github.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2025-02-12 14:17:49 -05:00
Jared Van Bortel
09f37a0ff8
maintainers: remove extra bracket
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-11 14:49:46 -05:00
AT
5e7e4b3f78
Fix spacing issues with deepseek models: (#3470)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: AT <manyoso@users.noreply.github.com>
2025-02-06 12:04:32 -05:00
Jared Van Bortel
22ebd42c32
Misc fixes for undefined behavior, crashes, and build failure (#3465)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-06 11:22:52 -05:00
Jared Van Bortel
051a63f031 ci: fix scheduled workflow jobs
s/online/offline/

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-05 11:56:53 -05:00
Jared Van Bortel
26356f872e chat: bump version to 3.9.1-dev0
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-04 19:15:20 -05:00
Jared Van Bortel
22b8bc546f
chat: release version 3.9.0 (#3462)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-04 19:12:17 -05:00
Jared Van Bortel
52164142de changelog: fix missing paren
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-04 18:14:30 -05:00
Jared Van Bortel
be6347389e
chat: cut v3.9.0 release (#3461)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-04 18:09:15 -05:00
Jared Van Bortel
8c10eccd24 changelog: fix missing credit
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-04 18:08:06 -05:00
ThiloteE
6ef0bd518e
Whitelist OLMoE and Granite MoE (#3449)
Signed-off-by: ThiloteE <73715071+ThiloteE@users.noreply.github.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2025-02-04 18:00:07 -05:00
Jared Van Bortel
04dc157b98
minja: update submodule to fix {# hang (redo) (#3457)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-04 17:30:04 -05:00
Jared Van Bortel
014bf67c63
Fix PDFium abuse that leads to a crash on Windows ARM (#3460)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-04 17:29:01 -05:00
Jared Van Bortel
8c9f26e249
Ignore DeepSeek-R1 "think" content in name/follow-up responses (#3458)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-04 12:08:17 -05:00
Andriy Mulyar
d4e6a6e485
Update README.md
Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>
2025-02-03 17:40:53 -05:00
Jared Van Bortel
a081255951 Revert "minja: update submodule to fix {# hang (#3446)"
This reverts commit c38c7455d890ea242ed32bca8d9467b8768af296.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-03 12:44:27 -05:00
Jared Van Bortel
36c852b8be
chat: work around Direct3D 11 rendering artifacts on win11 arm (#3450)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-03 11:47:40 -05:00
Jared Van Bortel
c38c7455d8
minja: update submodule to fix {# hang (#3446)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-03 11:25:21 -05:00
Jared Van Bortel
9131f4c432
Fix index used by LocalDocs when tool calling/thinking is active (#3451)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-02-03 11:22:46 -05:00
Jared Van Bortel
6bfa014594 cmake: remove reference to deleted README
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-31 16:26:17 -05:00
Jared Van Bortel
5af31278b7
ci: update to Qt 6.8.2 (#3442)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-31 11:20:50 -05:00
Jared Van Bortel
a80f023ed2
chat: release version 3.8.0 (#3439)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-30 20:06:42 -05:00
Jared Van Bortel
126042fdc9 remove ancient README
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-30 19:27:44 -05:00
Jared Van Bortel
1f2712d57c
chat: fix emoji corruption (#3443)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-30 18:15:37 -05:00
Jared Van Bortel
f8f78c6677 ci: allow generate-config to run on tags
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-30 16:53:14 -05:00
Jared Van Bortel
643c733be3 ci: fix missing job_allow_tags
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-30 16:50:00 -05:00
Jared Van Bortel
0734694fb8 ci: remove conflicting pipeline.git.branch requirement
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-30 16:47:58 -05:00
Jared Van Bortel
e267512db9
chat: cut v3.8.0 release (#3441)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-30 16:37:02 -05:00
Jared Van Bortel
34037f3101
models: add DeepSeek-R1 distillations to official models list (#3437)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-30 16:23:41 -05:00
AT
007a7af1c8
Display DeepSeek-R1 thinking like Reasoner (#3440)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2025-01-30 16:11:05 -05:00
Jared Van Bortel
f914ee56c9
chat: replace Jinja2Cpp with minja (#3433)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-30 16:01:49 -05:00
Jared Van Bortel
8a0ec5c303 ci: add missing signing holds to Windows ARM builds
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-30 15:23:18 -05:00
Jared Van Bortel
c2ee252ef2 chat: bump version to 3.8.0-dev0
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-30 13:12:47 -05:00
Jared Van Bortel
64dcf7682e
ci: build offline installers when pipeline is scheduled (#3436)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-30 13:07:47 -05:00
AT
22b8278ef1
Don't block the gui thread for tool calls (#3435)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2025-01-29 18:33:08 -05:00
Jared Van Bortel
adafa17c37
ci: verify that installers we build function and are signed (#3432)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-29 11:29:20 -05:00
Jared Van Bortel
343a4b6b6a
Support DeepSeek-R1 Qwen (#3431)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-29 09:51:50 -05:00
Jared Van Bortel
6a8a840681
ci: selective signing and automatic release builds (#3430)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-28 17:41:01 -05:00
ThiloteE
88f5dac133
[Jinja] Fix typo in Phi-3.1-mini-128k-instruct replacement template (#3412)
Signed-off-by: ThiloteE <73715071+ThiloteE@users.noreply.github.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2025-01-28 16:54:15 -05:00
Jared Van Bortel
0d974297a5
codeinterpreter: permit console.log with single string arg (#3426)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-27 15:22:20 -05:00
Jared Van Bortel
4fbc20ced9
cmake: do not modify gpt4all.app after signing it (#3417)
Signed-off-by: AT <manyoso@users.noreply.github.com>
2025-01-24 14:15:24 -05:00
Jared Van Bortel
f4f7de51e7 Revert "cmake: do not modify gpt4all.app after signing it (#3413)"
This reverts commit c01ac7fa933ae135dc8d9eed9dcbc2890dff38e3.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-24 13:21:34 -05:00
Jared Van Bortel
c01ac7fa93
cmake: do not modify gpt4all.app after signing it (#3413)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Signed-off-by: AT <manyoso@users.noreply.github.com>
Co-authored-by: AT <manyoso@users.noreply.github.com>
2025-01-24 12:57:55 -05:00
Jared Van Bortel
173fdb18c2
Update to Qt 6.8.1 (#3386)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-24 10:29:59 -05:00
AT
8790586e57
Server view fix (#3411)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2025-01-24 10:29:28 -05:00
AT
b98501c786
Fix regression while using localdocs with server API. (#3410)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2025-01-24 10:26:24 -05:00
Jared Van Bortel
49df6464a7 chat: bump version to v3.7.1-dev0
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-23 15:59:59 -05:00
Jared Van Bortel
6b719e99b5 metadata: fix typo
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-23 15:22:54 -05:00
Jared Van Bortel
d85fe40de8
chat: release version 3.7.0 (#3407)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-23 15:17:13 -05:00
Jared Van Bortel
15f66570fe
ci: fix macOS codesigning (#3408)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-23 11:41:34 -05:00
Jared Van Bortel
a97a28fe4f changelog: fix reference to wrong macOS version
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-22 13:09:01 -05:00
Jared Van Bortel
df2d124c19 changelog: add missing link
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-22 11:38:26 -05:00
AT
241d5ff40b Bump version for 3.7.0 release. (#3401)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-22 10:29:50 -05:00
Jared Van Bortel
0348189cc1 jinja2cpp: update submodule to fix unused var (#3403)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-22 10:29:36 -05:00
Jared Van Bortel
4a8a51f946
jinja2cpp: update submodule for 'not X is defined' fix (#3402)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-21 17:23:54 -05:00
Riccardo Giovanetti
867b3dfceb
Italian localization update (#3389)
Signed-off-by: Riccardo Giovanetti <riccardo.giovanetti@gmail.com>
2025-01-21 16:44:18 -05:00
Jared Van Bortel
58962496b4
ci: add missing context to Windows ARM builds (#3400)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-21 13:45:42 -05:00
Jared Van Bortel
810615d97b
add Windows ARM build (#3385)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-21 11:36:27 -05:00
Jared Van Bortel
82175b27c8
Sign maintenancetool.app on macOS (#3391)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Signed-off-by: AT <manyoso@users.noreply.github.com>
Co-authored-by: AT <manyoso@users.noreply.github.com>
2025-01-21 09:27:19 -05:00
Jared Van Bortel
68047d9a60
jinja2cpp: update submodule for partial subscript crash fix (#3394)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Signed-off-by: AT <manyoso@users.noreply.github.com>
Co-authored-by: AT <manyoso@users.noreply.github.com>
2025-01-21 09:26:27 -05:00
Jared Van Bortel
c871f9eb95
Add more chat template substitutions (#3393)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-21 09:25:39 -05:00
Jared Van Bortel
93c5c001e1
ci: use the shared 'gpt4all' context for environment variables (#3392)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-17 10:57:10 -05:00
Jared Van Bortel
4812ddf1f2
Save chats on quit, even if window isn't closed first (#3387)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-16 11:59:32 -05:00
Andriy Mulyar
cc5ed4737f
Update README.md - brokenlink (#3380)
Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>
2025-01-10 11:42:06 -05:00
Jared Van Bortel
7339d42a81
jinja2cpp: update submodule for else/endif crash fix (#3373)
2025-01-07 20:52:57 -05:00
Jared Van Bortel
a0abc93701
chat templates: work around Jinja2Cpp issue with 'not X is defined' (#3372)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Signed-off-by: AT <manyoso@users.noreply.github.com>
Co-authored-by: AT <manyoso@users.noreply.github.com>
2025-01-07 18:00:10 -05:00
Jared Van Bortel
e2541a24b3
code interpreter: support variadic console.log (#3371)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2025-01-07 17:58:04 -05:00
AT
22f6a7f1bc
Properly report that the computation was timed out to the model (#3369)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2025-01-07 14:02:18 -05:00
Max Cembalest
ce6558ec94
fix: format of language and locale setting (#3370)
2025-01-07 11:03:16 -05:00
Max Cembalest
737e164352
updated settings page (#3368)
Signed-off-by: Max Cembalest <mbcembalest@gmail.com>
2025-01-07 10:23:07 -05:00
AT
c7d7345188
Release notes for v3.6.1 and bump version (#3339)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-12-20 13:37:38 -05:00
AT
13e694e6e8
ChatView: make "stop" and "copy conversation" work again (#3336)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-12-20 12:26:03 -05:00
AT
93b4093761
Release notes and latestnews for v3.6.0, and bump version. (#3331)
Signed-off-by: AT <manyoso@users.noreply.github.com>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-12-19 18:37:17 -05:00
Jared Van Bortel
183eb9fb43
qml: fix missing localdocs and prefill progress (#3330)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-19 17:22:00 -05:00
AT
2afa9f2f25
Release of 3.6.0. (#3329)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-12-19 16:48:38 -05:00
Jared Van Bortel
cefca34445 undo unintentional partial revert of #3173
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-19 16:39:56 -05:00
Jared Van Bortel
6bbeac2b9f
modellist: automatically replace known chat templates with our versions (#3327)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Signed-off-by: AT <manyoso@users.noreply.github.com>
Co-authored-by: AT <manyoso@users.noreply.github.com>
2024-12-19 16:35:37 -05:00
AT
1c89447d63
Code interpreter (#3173)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-12-19 16:31:37 -05:00
Jared Van Bortel
2efb336b8a
chatmodel: fix sources showing as unconsolidated in UI (#3328)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-19 16:27:10 -05:00
Jared Van Bortel
3819842bcc
Fix Jinja2Cpp bug that broke system msg detection in templates (#3325)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-19 15:39:33 -05:00
AT
5ab70da2ae
Fix for remote model templates when messages contain xml. (#3318)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: AT <manyoso@users.noreply.github.com>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-12-18 13:39:51 -05:00
AT
aa84e2da39
Update maintainers. (#3322)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-12-18 13:39:37 -05:00
Jared Van Bortel
0f27359c39 chat: bump version to 3.5.4-dev0
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-16 16:32:27 -05:00
Jared Van Bortel
eedd0507d9
chat: release version 3.5.3 (#3307)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-16 16:31:08 -05:00
Jared Van Bortel
680614779e
ci: downgrade Windows image to fix build (#3306)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-16 14:46:23 -05:00
AT
21c06fdebf
New v3.5.3 hotfix release. (#3304)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-12-16 11:38:06 -05:00
Jared Van Bortel
db5800356b
chat: fix localdocs breakage in v3.5.2 (#3302)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-16 11:25:19 -05:00
Jared Van Bortel
38d92cbb28
chat: release version 3.5.2 (#3296)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-13 19:23:13 -05:00
Jared Van Bortel
bbee075660 ci: attempt to fix Ubuntu build
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-13 18:23:15 -05:00
Jared Van Bortel
57b34d50ca fix chatmodel.h #includes
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-13 18:15:05 -05:00
Jared Van Bortel
0e0a56038c
chat: cut v3.5.2 release (#3292)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-13 17:50:57 -05:00
AT
9b978f25e1
Break the explore models view into two. (#3269)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Signed-off-by: Victor <158754254+SINAPSA-IC@users.noreply.github.com>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Victor <158754254+SINAPSA-IC@users.noreply.github.com>
2024-12-13 17:33:05 -05:00
Jared Van Bortel
03f7ca4409
StartupDialog: fix two untranslated strings (#3293)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-13 15:19:40 -05:00
Jared Van Bortel
b7df4ebbcb
modellist: fix cloning of chat template and system message (#3262)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-13 12:22:32 -05:00
Jared Van Bortel
f67b370f5a
Fix local server regressions caused by Jinja PR (#3256)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-13 12:19:47 -05:00
Jared Van Bortel
2c5097c9de latestnews: make it more compact
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-12 14:56:05 -05:00
AT
db7f1c5294
Bump the version to 3.5.2-dev0. (#3254)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-12-10 17:39:54 -05:00
AT
d6a4ee4531
Release notes and latestnews for v3.5.1. (#3253)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-12-10 15:05:22 -05:00
AT
0871bd1137
Update changelog and version to make 3.5.1 hotfix release. (#3252)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-12-10 12:32:20 -05:00
Jared Van Bortel
66a9ae1a80 changelog: add PR #3251
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-10 12:28:11 -05:00
Jared Van Bortel
663ea618f7
models3: fix Llama 3.2 chat template (#3251)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-10 12:27:15 -05:00
Jared Van Bortel
11f57afc58
fix several bad chat templates (#3250)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-10 12:06:26 -05:00
Jared Van Bortel
6f49984a29 metadata: fix typos in release notes
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-10 11:11:01 -05:00
AT
5878f7fe01
Fix the z-ordering of the home button. (#3246)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-12-09 18:27:53 -05:00
Jared Van Bortel
ca08174a03
chatmodel: fix incorrect currentResponse argument (#3245)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-09 18:14:01 -05:00
AT
7a1e60d1d4
Bump version to v3.5.1-dev0 (#3242)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-12-09 16:55:46 -05:00
Jared Van Bortel
f9c74f7c21
chat: release v3.5.0 (#3241)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-09 16:51:48 -05:00
Jared Van Bortel
f7440c2956
chat: cut v3.5.0 release (#3240)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-09 14:41:23 -05:00
Victor
fddc10d969
update Romanian translation for v3.5.0 (#3232)
Signed-off-by: Victor <158754254+SINAPSA-IC@users.noreply.github.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-12-09 14:32:03 -05:00
Jared Van Bortel
70cca3fdcf
fixups for GPT4All v3.5.0-rc2 (#3239)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-09 14:30:07 -05:00
Riccardo Giovanetti
7628106d55
Italian localization update (#3236)
Signed-off-by: Riccardo Giovanetti <riccardo.giovanetti@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-12-09 11:51:05 -05:00
Jared Van Bortel
7f30185317 changelog: fix parenthesis
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-09 11:20:21 -05:00
Jared Van Bortel
cddd0f7507
chat: run update_translations for v3.5.0 (#3230)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-06 16:25:09 -05:00
Jared Van Bortel
8bf55e99f1
chat: cut v3.5.0-rc2 release candidate (#3229)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-06 15:28:03 -05:00
Jared Van Bortel
9e306114d1
qml: tweaks to new edit/redo buttons (#3228)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-06 14:14:36 -05:00
AT
2b1668eff2
Animate the removal of chat items when editing prompts. (#3227)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-12-06 12:26:22 -05:00
Jared Van Bortel
6b18abb124
changelog: add more changes from #3147 (#3226)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-06 11:22:50 -05:00
Jared Van Bortel
f9863b3b89
add changelog entries for Jinja PR (#3223)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-06 11:00:29 -05:00
Jared Van Bortel
2db59f0092
chat: cut v3.5.0-rc1 release candidate (#3218)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-04 13:00:18 -05:00
Jared Van Bortel
0c70b5a5f4
llamamodel: add missing softmax to fix temperature (#3202)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-04 10:56:19 -05:00
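For context on the temperature fix above: sampling at temperature T means dividing the logits by T and then applying a softmax so the weights form a proper probability distribution; skipping the softmax distorts the effect of temperature. A minimal, self-contained Python sketch (hypothetical names, not the project's actual C++ sampling code), which also falls back to greedy selection at temperature 0 as in the related "greedy sampling when temp=0" change listed further down:

```python
# Minimal sketch of temperature sampling with an explicit softmax.
# Names are hypothetical; this is not the project's actual C++ sampling code.
import math
import random

def sample_token(logits, temperature=0.8):
    """Pick a token index from raw logits."""
    if temperature <= 0:
        # Greedy selection when temperature is zero.
        return max(range(len(logits)), key=lambda i: logits[i])
    scaled = [l / temperature for l in logits]
    m = max(scaled)                              # subtract max for stability
    exps = [math.exp(s - m) for s in scaled]
    total = sum(exps)
    probs = [e / total for e in exps]            # the softmax step
    return random.choices(range(len(probs)), weights=probs, k=1)[0]

print(sample_token([2.0, 1.0, 0.1], temperature=0.7))
print(sample_token([2.0, 1.0, 0.1], temperature=0))    # always index 0
```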
Jared Van Bortel
ffd29eae08
ci: do not run online installer or publish jobs on PR branches (#3217)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-03 19:37:22 -05:00
Jared Van Bortel
92acc7b3ac
Fixups for Jinja PR (#3215)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-12-03 19:36:53 -05:00
Jared Van Bortel
225bf6be93
Remove binary state from high-level API and use Jinja templates (#3147)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Co-authored-by: Adam Treat <treat.adam@gmail.com>
2024-11-25 10:04:17 -05:00
AT
3320094d29
Remove unused state from chatitems. (#3170)
I've verified that the code compiles, and I can't see any errors in runtime QML generation, nor can I find any references to this in QML.

Jared has also done a git search and found no evidence this was ever used.

Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-11-05 12:45:07 -05:00
AT
46cb6b0523
Remove unused state in chat.cpp that saves the chat response messages. (#3169)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-11-05 12:24:37 -05:00
AT
20a99d1794
Separate out the chat item view. (#3160)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-11-01 12:14:21 -04:00
AT
1ea2b45a78
Fix restore of default for system tray setting. (#3158)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-10-31 11:46:55 -04:00
Jared Van Bortel
f07e2e63df
Use the token cache to infer greater n_past and reuse results (#3073)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-31 11:19:12 -04:00
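To illustrate the idea behind the commit above: if the tokens already evaluated into the KV cache share a prefix with the new prompt, only the differing suffix needs to be re-processed, and n_past can start at the length of that shared prefix. A hypothetical Python sketch of that bookkeeping (not the actual C++ implementation):

```python
# Hypothetical sketch of reusing a token cache to infer a larger n_past:
# only the part of the new prompt that differs from what is already in the
# KV cache needs to be evaluated again. Not the project's actual code.
def common_prefix_len(cached, new):
    n = 0
    for a, b in zip(cached, new):
        if a != b:
            break
        n += 1
    return n

def plan_decode(cached_tokens, prompt_tokens):
    n_past = common_prefix_len(cached_tokens, prompt_tokens)
    return n_past, prompt_tokens[n_past:]        # (reused length, tokens to eval)

cache = [1, 15, 27, 42, 8]                       # tokens already evaluated
prompt = [1, 15, 27, 99, 100]                    # new prompt, 3-token shared prefix
print(plan_decode(cache, prompt))                # -> (3, [99, 100])
```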
AT
62cab695eb
Add tests for error codes with local API server (#3131)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-10-30 10:15:19 -04:00
AT
861453c4d7
Fixup docx parsing (#3140)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-10-28 13:32:16 -04:00
AT
b19db6c20d
Add txt and markdown files to attach feature. (#3135)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: AT <manyoso@users.noreply.github.com>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-10-28 11:42:46 -04:00
AT
da00527101
We can't return early here as nChunks > 0 (#3137)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-10-28 11:42:25 -04:00
Benjamin Gallois
57c0974f4a
chat: system tray icon and close to tray (#3109)
Signed-off-by: bgallois <benjamin@gallois.cc>
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Co-authored-by: Adam Treat <treat.adam@gmail.com>
2024-10-25 12:20:55 -04:00
Jared Van Bortel
62f90ff7d5
chatllm: remove use of deprecated '_qs' (#3130)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-22 13:30:26 -04:00
Jared Van Bortel
6df252bdcd
cmake: set minimum Qt version back to 6.5 (#3129)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-22 11:41:28 -04:00
Jared Van Bortel
d224a9d3a5
Fix compatibility with Qt 6.8 (#3121)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-21 16:25:28 -04:00
Jared Van Bortel
1764fca192
ci: attempt to fix flaky downloads (#3124)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-21 16:24:29 -04:00
Jared Van Bortel
044ceec7fb
Fix apparent CI failure due to "All Workflows filtered" (#3123)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-21 16:23:41 -04:00
Jared Van Bortel
adf7225f1c
codespell: update .codespellrc (#3122)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-21 13:44:56 -04:00
Jared Van Bortel
7f5f0869e7
Implement the first real test of gpt4all-chat (#3116)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-20 11:38:04 -04:00
AT
9cafd38dcf
Add test scaffolding (#3103)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-10-18 15:27:03 -04:00
Jared Van Bortel
c3357b7625
Enable more warning flags, and fix more warnings (#3065)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-18 12:11:03 -04:00
Jared Van Bortel
eed92fd5b2
chat: bump version to 3.4.3-dev0 (#3105)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-16 14:25:34 -04:00
Jared Van Bortel
80cfac7ece
chat: release v3.4.2 (#3104)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-16 14:19:11 -04:00
Jared Van Bortel
b4ad461d86 chat: cut v3.4.2 release (#3102)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-16 13:13:22 -04:00
Jared Van Bortel
36a3826d8c localdocs: avoid cases where batch can make no progress (#3094)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-16 13:13:22 -04:00
AT
f8dde82fda
Localdocs fixes (#3083)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-10-15 15:28:13 -04:00
Jared Van Bortel
1789a3c6d7
chat: release version 3.4.1 (#3082)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-11 18:25:22 -04:00
Jared Van Bortel
87b2aef85c
chat: cut version 3.4.1 (#3081)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-11 17:06:32 -04:00
Jared Van Bortel
bff2d58d02
chatviewtextprocessor: fix Go syntax highlighting (#3080)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-11 16:24:20 -04:00
Jared Van Bortel
ebda9146e7
localdocs: fix regressions caused by docx change (#3079)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-11 16:11:01 -04:00
Jared Van Bortel
9fd48eec62
latestnews: add notice about regression (#3078)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-11 11:44:49 -04:00
Jared Van Bortel
1d3f3a63a3
modellist: fix missing fname in modelsJsonCacheFile() (#3072)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-10 16:50:42 -04:00
Max Cembalest
7dbb3d298a
xlsx video (#3067)
Signed-off-by: Max Cembalest <mbcembalest@gmail.com>
2024-10-09 16:27:15 -04:00
John W. Parent
6bb42edb2c
Enable unsigned installers (#2976)
Signed-off-by: John Parent <john.parent@kitware.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-10-09 09:52:54 -04:00
Jared Van Bortel
a59ec91369
python: fix CalledProcessError on Intel Macs since v2.8.0 (#3045)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-09 09:13:33 -04:00
Jared Van Bortel
8e3108fe1f
Establish basic compiler warnings, and fix a few style issues (#3039)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-09 09:11:50 -04:00
Jared Van Bortel
3165e1d5a9
modellist: fix models.json cache location (#3052)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-09 09:11:18 -04:00
Riccardo Giovanetti
0d9b4f0ba0
Italian localization update (#3048)
Signed-off-by: Riccardo Giovanetti <riccardo.giovanetti@gmail.com>
2024-10-09 09:04:37 -04:00
AT
8729de9218
Bump version now that release is out. (#3051)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-10-08 17:09:13 -04:00
AT
630f04a079
Add a cookbook for Excel feature (#3029)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Max Cembalest <mbcembalest@gmail.com>
Co-authored-by: Max Cembalest <mbcembalest@gmail.com>
2024-10-08 16:11:15 -04:00
AT
454728371d
Add Llama 3.2 Instruct 1B and 3B to the model list. (#3049)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-10-08 16:06:50 -04:00
Jared Van Bortel
e7365338b7
chat: release version 3.4.0 (#3046)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-08 15:10:47 -04:00
Jared Van Bortel
d77d1cad88
ci: pin Vulkan SDK to the previous version (#3044)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-08 13:21:31 -04:00
AT
8c34b4a2bf
Set version for release. (#3043)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-10-08 12:18:17 -04:00
Jared Van Bortel
8f3d107a2e
modellist: fix incorrect signal use and remove invalidate calls (#3042)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-08 11:56:37 -04:00
AT
8618a1941c
Update translations. (#3037)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-10-08 11:15:50 -04:00
Victor
029a1c8e79
Update gpt4all_ro_RO.ts after v3.3.1 for v3.4.0 (#3040)
Signed-off-by: Victor <158754254+SINAPSA-IC@users.noreply.github.com>
2024-10-08 09:58:11 -04:00
不知火 Shiranui
7716dbbfba
Update zh_TW translation (#2911)
Signed-off-by: 不知火 Shiranui <supersonic@livemail.tw>
2024-10-08 09:48:31 -04:00
Jared Van Bortel
170414f529
translations: run lupdate (#3038)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-07 18:29:26 -04:00
AT
f686770ebe
Add the attached filename to the model's context. (#3028)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-07 12:53:27 -04:00
Jared Van Bortel
ec4e1e4812
Make it possible to keep some chats after downgrading GPT4All (#3030)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-04 14:25:17 -04:00
Jared Van Bortel
b850e7c867
Tweaks for Excel to Markdown conversion (#3022)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-04 14:25:00 -04:00
Andriy Mulyar
dc82f883f8
Update README.md with form anchor (#3032)
Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>
2024-10-04 08:57:27 -04:00
AT
767189d770
Small tweak to xlsx support to format the date properly. (#3025)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-10-03 13:34:30 -04:00
Andriy Mulyar
cd3d06c6db
Move newsletter link (#3027)
Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>
2024-10-03 08:56:29 -04:00
AT
447ef77c81
Add changelog entry for excel support. (#3019)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: AT <manyoso@users.noreply.github.com>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-10-01 21:23:20 -04:00
AT
db443f2090
Support attaching an Excel spreadsheet to a chat message (#3007)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-10-01 21:17:49 -04:00
AT
c11b67dfcb
Make ChatModel threadsafe to support direct access by ChatLLM (#3018)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-10-01 18:15:02 -04:00
AT
ee67cca885
chatmodel: remove the 'prompt' field from ChatItem (#3016)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-10-01 13:57:19 -04:00
Jared Van Bortel
88b95950c5
Fix loaded chats forgetting context with non-empty system prompt (#3015)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-10-01 11:25:04 -04:00
Jared Van Bortel
3025f9deff
chat: fix regression in regenerate from #2929 (#3011)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-30 19:42:10 -04:00
Jared Van Bortel
62bc84366b
ci: use 'current' for Ubuntu image version (#3009)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-30 18:56:10 -04:00
Jared Van Bortel
38140b2886
ci: fix build timeouts (#3010)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-30 18:55:43 -04:00
Jared Van Bortel
e190fd0204
localdocs: implement .docx support (#2986)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-30 18:48:13 -04:00
AT
ea1ade8668
Use different language for prompt size too large. (#3004)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-09-27 12:29:22 -04:00
Jared Van Bortel
f9d6be8afb
backend: rebase llama.cpp on upstream as of Sep 26th (#2998)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-27 12:05:59 -04:00
Jared Van Bortel
8bd937eb68
chat: release version 3.3.1 (#2997)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-27 11:44:24 -04:00
Jared Van Bortel
27478a7e00 chat(build): fix incorrect APP_VERSION_BASE
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-26 17:43:17 -04:00
Jared Van Bortel
7b793d4435
server: fix min/max min_p/top_p values (#2996)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-26 17:08:59 -04:00
Jared Van Bortel
364d9772e4
chatllm: do not pass nullptr as response callback (#2995)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-26 17:07:01 -04:00
Jared Van Bortel
50949d304e chat: bump version to 3.4.0-dev0
We forgot to bump the version as part of the last merge.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-26 16:48:57 -04:00
AT
10d2375bf3
Hybrid search (#2969)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-09-26 11:58:48 -04:00
Max Cembalest
117a8e7faa
Docs section & page for the GPT4All API server (#2990)
Signed-off-by: Max Cembalest <mbcembalest@gmail.com>
2024-09-26 11:07:49 -04:00
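As a usage note for the API server documented above: GPT4All's local server exposes an OpenAI-compatible endpoint. The sketch below assumes the server is enabled in the app's settings and listening on the documented default port 4891; the model name is a placeholder for whatever model you have installed.

```python
# Sketch of calling the local GPT4All API server's OpenAI-compatible chat
# endpoint. Assumes the server is enabled in Settings and uses the documented
# default port 4891; the model name is a placeholder for one you have installed.
import json
import urllib.request

payload = {
    "model": "Llama 3 8B Instruct",              # placeholder model name
    "messages": [{"role": "user", "content": "Say hello in five words."}],
    "max_tokens": 64,
}
req = urllib.request.Request(
    "http://localhost:4891/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    reply = json.load(resp)
print(reply["choices"][0]["message"]["content"])
```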
Ikko Eltociear Ashimine
1047c5e038
docs: update README.md (#2979)
Signed-off-by: Ikko Eltociear Ashimine <eltociear@gmail.com>
Signed-off-by: AT <manyoso@users.noreply.github.com>
Co-authored-by: AT <manyoso@users.noreply.github.com>
2024-09-23 16:12:52 -04:00
Jared Van Bortel
4dc87d9fa3
chat: release version 3.3.0 (#2965)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-23 11:51:17 -04:00
Jared Van Bortel
da21174fb1
chat: bump version to v3.3.0, again (#2974)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-20 18:29:53 -04:00
Jared Van Bortel
69782cf713
chat(build): fix broken installer on macOS (#2973)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-20 15:34:20 -04:00
Jared Van Bortel
2975768565 chat: v3.3.0 is still not ready
This reverts commit 34d3d2c55405c7ee3044ca1ec73966ba17be8d03.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-19 17:37:59 -04:00
Jay
cd224d475d
translations: remove es_MX vanished messages (#2971)
Signed-off-by: JSTayco <jstayco@protonmail.ch>
2024-09-19 17:25:55 -04:00
Jared Van Bortel
117cf297f2 changelog: fix release date
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-19 14:40:00 -04:00
Jared Van Bortel
34d3d2c554 chat: proceed with v3.3.0 release
This reverts commit 7e68fbbedd24b005542ae04730c5d3ada1b2e2d0.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-19 14:37:43 -04:00
Jared Van Bortel
5d454603d3
chat: update and improve translations for v3.3.0 (#2970)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Signed-off-by: Riccardo Giovanetti <riccardo.giovanetti@gmail.com>
Co-authored-by: Riccardo Giovanetti <riccardo.giovanetti@gmail.com>
2024-09-19 14:35:53 -04:00
Victor
3682b242e7
translations: add a missing string to Romanian (#2966)
Signed-off-by: Victor <158754254+SINAPSA-IC@users.noreply.github.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-09-18 16:10:26 -04:00
Jared Van Bortel
7e68fbbedd chat: revert v3.3.0 release for now
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-18 15:19:00 -04:00
Jared Van Bortel
ae812ae5d7
chat: tweak release notes formatting and bump version to v3.3.0 (#2964)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-18 14:18:36 -04:00
Jared Van Bortel
cc7115afeb
chat: add system requirements doc (#2955)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-13 13:55:01 -04:00
Andriy Mulyar
a2b4529945
docs: add link to YouTube video tutorial (#2954)
Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-09-12 11:38:08 -04:00
Jared Van Bortel
2528675286
chat(build): add conftest for std::optional::transform (#2952)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-11 14:59:42 -04:00
Jared Van Bortel
3ef582f272
installer: disallow installation on older macOS and Ubuntu (#2940)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-09 17:17:57 -04:00
Jared Van Bortel
eea8b81768
chat(build): remove dependency on WaylandCompositor (#2949)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-09 17:13:39 -04:00
Jared Van Bortel
08d9a401d2
mixpanel: report more information about the build and platform (#2939)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-09 17:12:12 -04:00
Jared Van Bortel
39005288c5
server: improve correctness of request parsing and responses (#2929)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-09 10:48:57 -04:00
Jared Van Bortel
1aae4ffe0a
ci: use ccache to cache compiler output (#2942)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-06 16:09:11 -04:00
Jared Van Bortel
facb706211
ci: improve readability and correctness (#2941)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-09-06 12:03:30 -04:00
AT
e48571003e
settings: tweak the name of the local server option (#2928)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-08-30 13:00:33 -04:00
Jared Van Bortel
46314dc7f3
python: warn if Microsoft Visual C++ runtime libs are not found (#2920)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-30 12:54:20 -04:00
Jared Van Bortel
55946ffc93
modellist: fix a few issues with loading remote models (#2875)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-30 12:44:10 -04:00
Jared Van Bortel
813ccaf5d1
server: do not process the system prompt twice for new models (#2924)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-30 12:30:24 -04:00
AT
2f02cd407f
Only allow a single instance of program to be run at a time (#2923)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-08-30 12:11:32 -04:00
AT
e1d49d970f
server: use configured system prompt, ignore system messages (#2921)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-08-29 12:59:13 -04:00
Jared Van Bortel
82491fe154
qml: fix copy-paste error in antenna description logic (#2922)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-29 12:10:12 -04:00
Jared Van Bortel
ed85cd8b6a
qml: dynamic min win size, smaller default size, scaling tweaks (#2904)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-28 12:49:43 -04:00
Riccardo Giovanetti
e8d74d8bf4
translations: update Italian (#2909)
Signed-off-by: Riccardo Giovanetti <riccardo.giovanetti@gmail.com>
2024-08-27 20:13:34 -04:00
Jared Van Bortel
ca151f3519
repo: organize sources, headers, and deps into subdirectories (#2917)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-27 17:22:40 -04:00
3Simplex
ed8bd4ceda
chat: fix typo "predicatable" (#2916)
Signed-off-by: 3Simplex <10260755+3Simplex@users.noreply.github.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-08-26 18:41:15 -04:00
Jared Van Bortel
bd044bef27
repo: use the new GPT4All website URL (#2915)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-26 16:27:41 -04:00
3Simplex
c9dda3df0d
Update button for offline installer now points to releases. (#2888)
Signed-off-by: 3Simplex <10260755+3Simplex@users.noreply.github.com>
2024-08-23 12:36:54 -04:00
Jared Van Bortel
221b9cff5a
models: derank Llama 3.1 to below online models (#2896)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-19 17:14:33 -04:00
Jared Van Bortel
aed6849262
readme: add blog link (#2895)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-19 15:51:47 -04:00
cosmic-snow
432430811d
ChatView: use correct plurals for "N Source(s)" (#2885)
Signed-off-by: cosmic-snow <134004613+cosmic-snow@users.noreply.github.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-08-19 12:01:18 -04:00
Victor
739121ea1e
translations: corrections for Romanian (#2890)
Signed-off-by: Victor <158754254+SINAPSA-IC@users.noreply.github.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-08-19 11:34:10 -04:00
Jared Van Bortel
10a83a8b26
chat: set the window icon on Linux (#2880)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-16 15:01:19 -04:00
不知火 Shiranui
ace79959d1
translations: fix typos in Traditional Chinese (#2852)
Signed-off-by: Shiranui <supersonic@livemail.tw>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-08-16 13:00:26 -04:00
Riccardo Giovanetti
32b56e819d
translations: cosmetic fixes for Italian (#2872)
Signed-off-by: Riccardo Giovanetti <riccardo.giovanetti@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-08-16 12:31:09 -04:00
Jared Van Bortel
3aa6806341
LocalDocsSettings: fix embedding device selection after #2690 (#2873)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-14 16:28:17 -04:00
Simon Willison
7073fe341f
Add Changelog to links on PyPI (#2860)
Signed-off-by: Simon Willison <swillison@gmail.com>
2024-08-14 16:28:04 -04:00
Jared Van Bortel
b99ca17a7d python: fix missing link in changelog
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-14 14:22:12 -04:00
Jared Van Bortel
a232befa58
python: fix py3.8 compat (#2871)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-14 13:30:14 -04:00
AT
3386ac6331
Add release notes and bump version for v3.2.1 (#2859)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: AT <manyoso@users.noreply.github.com>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-08-13 19:24:25 -04:00
Jared Van Bortel
af9416c0bf
python: fix CUDA dependency version (#2858)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-13 19:11:04 -04:00
Jared Van Bortel
3ba9c6344d
python: release version 2.8.1 (#2857)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-13 17:12:34 -04:00
Jared Van Bortel
6518b33697
llamamodel: use greedy sampling when temp=0 (#2854)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-13 17:04:50 -04:00
AT
8ccf1fa2f5
Change version to v3.2.1 for bugfix release. (#2856)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-08-13 14:59:32 -04:00
Jared Van Bortel
7463b2170b
backend(build): set CUDA arch defaults before enable_language(CUDA) (#2855)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-13 14:47:48 -04:00
Jared Van Bortel
971c83d1d3
llama.cpp: pull in fix for Kompute-related nvidia-egl crash (#2843)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-13 11:10:10 -04:00
Jared Van Bortel
be91576937
ci: use consistent build options on macOS (#2849)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-12 19:03:18 -04:00
Jared Van Bortel
932cdd8ead
latestnews: clarify how to change language (#2850)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-12 19:01:21 -04:00
AT
ceb7726f22
Add some news about our latest translation release. (#2848)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-08-12 18:15:58 -04:00
Jared Van Bortel
ea63611493
chat: add release notes for v3.2.0 and bump version (#2847)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-12 17:12:14 -04:00
Jared Van Bortel
3e0ad62fcb
ci: fix macOS target version (#2846)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-12 15:35:25 -04:00
AT
b89314df96
Change to a whitelist for released translations. (#2830)
- Change to a whitelist for released translations.
- Added changelog entry.
- Bump the version for translation release.

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: AT <manyoso@users.noreply.github.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-08-12 11:00:49 -04:00
cosmic-snow
b70d68977d
Add default CLion build folder pattern to .gitignore (#2835)
CLion uses a `cmake-build-` prefix, unlike Qt Creator.

Signed-off-by: cosmic-snow <134004613+cosmic-snow@users.noreply.github.com>
2024-08-12 10:15:05 -04:00
Victor
bc0fb53eab
GPT4All +v3.1.1: GUI: TRANSLATION: into ro_RO (#2834)
Signed-off-by: Victor <158754254+SINAPSA-IC@users.noreply.github.com>
2024-08-12 09:19:47 -04:00
Thiago Ramos
2feda2a82d
Fixed and updated some strings in pt-BR (#2836)
Signed-off-by: Thiago Ramos <thiagojramos@outlook.com>
2024-08-09 22:21:22 -04:00
Jay
bf8873098a
Small fixes for better main menu UI (#2832)
Signed-off-by: JSTayco <jstayco@protonmail.ch>
2024-08-09 15:31:41 -04:00
Jay
2df330cde3
Updated es_MX translation (#2829)
Signed-off-by: JSTayco <jstayco@protonmail.ch>
2024-08-09 15:18:27 -04:00
Victor
257a734f25
Update gpt4all_ro_RO.ts (#2831)
Removed the translation of the term "LocalDocs", leaving it as is.
Removed "chat-uri", a hybrid of two languages: "-uri" is the Romanian plural suffix attached to the loanword "chat".

Signed-off-by: Victor <158754254+SINAPSA-IC@users.noreply.github.com>
2024-08-09 14:47:05 -04:00
Adam Treat
79086e10ed Fix stray character in new ro_RO that snuck in.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-08-09 13:40:53 -04:00
Victor
1eb63dac40
Update: TRANSLATION: gpt4all_ro_RO.ts (#2828)
The translated text for the v3.1.1+ interface has been updated so that it displays correctly in Romanian (ro_RO).

2024.08.09

Signed-off-by: Victor <158754254+SINAPSA-IC@users.noreply.github.com>
2024-08-09 13:38:33 -04:00
Thiago Ramos
c54ff89c3f
Update gpt4all_pt_BR.ts (#2822)
Signed-off-by: Thiago Ramos <45890502+thiagojramos@users.noreply.github.com>
2024-08-09 11:50:45 -04:00
不知火 Shiranui
c6f111b1d5
Update zh_TW translation (#2821)
Signed-off-by: SuperSonic <supersonic@livemail.tw>
2024-08-09 11:46:45 -04:00
不知火 Shiranui
e35bc60876
Update zh_TW translation (#2820)
Signed-off-by: SuperSonic <supersonic@livemail.tw>
2024-08-09 11:01:07 -04:00
wuhanodoo
da0dddc3d4
Update gpt4all_zh_CN.ts (#2819)
Signed-off-by: wuhanodoo <99947164+wuodoo@users.noreply.github.com>
2024-08-09 11:00:06 -04:00
Riccardo Giovanetti
3f640c7fe2
Italian localization update (#2814)
Signed-off-by: Riccardo Giovanetti <riccardo.giovanetti@gmail.com>
2024-08-08 18:45:41 -04:00
Jared Van Bortel
6957706af7
chat: fix crash at startup due to missing en_US translation (#2816)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-08 18:44:15 -04:00
AT
a910d65755
Fix the translation change for the default model. (#2815)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-08-08 18:42:11 -04:00
Adam Treat
bec5045a7e Update translation files.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-08-08 17:03:07 -04:00
Jared Van Bortel
d59b1331f9
chat: translation tweaks (#2797)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-08 13:41:47 -04:00
Jared Van Bortel
0fcf1dda5f
ci: update XCode for C++20 ranges::find (#2813)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-08 12:23:11 -04:00
Jared Van Bortel
26113a17fb
don't use ranges::contains due to clang incompatibility (#2812)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-08 11:49:01 -04:00
Jared Van Bortel
c950fdd84e
changelogs: add PR 2781 (#2809)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-07 18:59:57 -04:00
Jared Van Bortel
de7cb36fcc
python: reduce size of wheels built by CI, other build tweaks (#2802)
* Read CMAKE_CUDA_ARCHITECTURES directly
* Disable CUBINs for python build in CI
* Search for CUDA 11 as well as CUDA 12

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-07 11:27:50 -04:00
Jared Van Bortel
be66ec8ab5
chat: faster KV shift, continue generating, fix stop sequences (#2781)
* Don't stop generating at end of context
* Use llama_kv_cache ops to shift context
* Fix and improve reverse prompt detection
* Replace prompt recalc callback with a flag to disallow context shift
2024-08-07 11:25:24 -04:00
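The bullet points above describe context shifting: instead of stopping when the context window is full, the oldest conversation tokens are discarded (keeping an initial block such as the system prompt) so generation can continue. A purely conceptual Python sketch of that policy, at the token-list level rather than with the real llama_kv_cache operations:

```python
# Conceptual sketch of a context shift (token-list level only; the real change
# uses llama_kv_cache operations in C++): keep an initial block such as the
# system prompt, drop a chunk of the oldest conversation tokens, and continue.
def shift_context(tokens, n_ctx, n_keep):
    if len(tokens) < n_ctx:
        return tokens                            # still room, nothing to shift
    n_discard = (len(tokens) - n_keep) // 2      # drop half of the shiftable part
    return tokens[:n_keep] + tokens[n_keep + n_discard:]

window = list(range(8))                          # pretend the 8-token window is full
print(shift_context(window, n_ctx=8, n_keep=2))  # -> [0, 1, 5, 6, 7]
```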
Jared Van Bortel
90de2d32f8
chat: add CHANGELOG.md (#2699)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-07 11:20:15 -04:00
Jared Van Bortel
80fd02bdbc
ci: explicitly build with 4 jobs on Linux (#2799)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-05 17:53:23 -04:00
Jared Van Bortel
1f2294ed73
python: prepare to release v2.8.0 (#2794)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-05 13:36:18 -04:00
Jared Van Bortel
10c3e21147
python: detect Rosetta 2 (#2793)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-05 13:24:06 -04:00
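For reference, one common way to detect Rosetta 2 translation on macOS is to query the sysctl.proc_translated flag, which is 1 when the current process runs under translation. The snippet below is illustrative only and not necessarily what the gpt4all package does:

```python
# Illustrative Rosetta 2 check on macOS: sysctl.proc_translated is 1 when the
# current process runs under translation. Not necessarily what gpt4all does.
import platform
import subprocess

def running_under_rosetta() -> bool:
    if platform.system() != "Darwin":
        return False
    try:
        out = subprocess.run(
            ["sysctl", "-n", "sysctl.proc_translated"],
            capture_output=True, text=True, check=False,
        ).stdout.strip()
    except OSError:
        return False
    return out == "1"

print(running_under_rosetta())
```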
Riccardo Giovanetti
01f51cfa1b
translations: update Italian translation (#2786)
Signed-off-by: Riccardo Giovanetti <riccardo.giovanetti@gmail.com>
2024-08-05 10:49:52 -04:00
cosmic-snow
828d83094b
docs: replace default model & link to wiki in CLI readme (#2719)
Signed-off-by: cosmic-snow <134004613+cosmic-snow@users.noreply.github.com>
2024-08-04 19:44:01 -04:00
Jared Van Bortel
51bd01ae05
backend: fix extra spaces in tokenization and a CUDA crash (#2778)
Also potentially improves accuracy of BOS insertion, token cache, and logit indexing.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-08-01 10:46:36 -04:00
Riccardo Giovanetti
da59c9f5ea
Updated Italian localization file (#2783)
Signed-off-by: Riccardo Giovanetti <riccardo.giovanetti@gmail.com>
2024-08-01 09:19:18 -04:00
Jared Van Bortel
6b8e0f7ae4
chat: fix comparison of versions with suffixes (#2772)
Pre-release and post-release suffixes are now interpreted correctly. Also fixes comparison of incomplete versions.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-30 13:20:52 -04:00
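To make the commit note above concrete: a suffix-aware comparison has to rank a pre-release such as 3.5.0-rc2 below the plain 3.5.0 release, and pad incomplete versions like 3.5 with zeros. A hypothetical Python sketch of the pre-release part of that idea (the shipped code is C++ and also handles post-release suffixes):

```python
# Hypothetical suffix-aware version comparison: a pre-release like "3.5.0-rc2"
# sorts before "3.5.0", and an incomplete "3.5" is padded with zeros.
def parse(version):
    core, _, suffix = version.partition("-")
    nums = [int(p) for p in core.split(".")]
    nums += [0] * (3 - len(nums))                # pad incomplete versions
    pre = (0, suffix) if suffix else (1, "")     # plain release outranks pre-release
    return tuple(nums), pre

def compare(a, b):
    ka, kb = parse(a), parse(b)
    return (ka > kb) - (ka < kb)

print(compare("3.5.0-rc2", "3.5.0"))             # -1: rc2 precedes the release
print(compare("3.5", "3.5.0"))                   #  0: padded to equal
print(compare("3.5.1", "3.5.0-rc1"))             #  1
```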
ThiloteE
e45685b27a
Models: Add Qwen2-1.5B-Instruct (#2759)
Signed-off-by: ThiloteE <73715071+ThiloteE@users.noreply.github.com>
2024-07-29 09:57:33 -04:00
AT
78cc324e8c
Don't compare non-numeric parts of the version string. (#2762)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-28 11:36:16 -04:00
cosmic-snow
ee6064b608
Fix LocalDocs file icons in sources display (mixed case) (#2761)
Minor cosmetic fix to the file icon shown for a LocalDocs source.
A recent commit allowed file suffixes to be mixed case; this makes the
displayed icon consistent, so that e.g. '.PDF' also uses the right icon.

Signed-off-by: Cosmic Snow <cosmic-snow@mailfence.com>
2024-07-28 10:06:30 -04:00
Adam Treat
f3734e5de8 Fix download url for llama3.1 128k.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-27 17:18:06 -04:00
Adam Treat
fb853c7f30 Replace llama 3.1 with corrected version.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-27 17:07:35 -04:00
Adam Treat
52e076e9a1 Add release notes for v3.1.1 and bump version
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-27 17:01:54 -04:00
Adam Treat
4bbd0360a0 Bump version to v3.1.1 for llama 3.1 fix
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-27 14:17:29 -04:00
AT
8c834a5177
Update llama.cpp to include upstream Llama 3.1 RoPE fix. (#2758)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-27 14:14:19 -04:00
wuhanodoo
257bedb48e
Update gpt4all_zh_CN.ts some not translated (#2749)
Signed-off-by: wuhanodoo <99947164+wuodoo@users.noreply.github.com>
2024-07-26 13:21:46 -04:00
patcher9
71c957f8ee
Update monitoring.md (#2724)
Signed-off-by: patcher9 <patcher99@dokulabs.com>
2024-07-25 19:13:00 -04:00
Tim453
7fefac74ba
Update Flatpak appdata (#2727)
- Added a maintainer for the Flatpak
- Updated screenshots
- Removed the mention of GPT-J model support
- Removed the Flatpak manifest; the current version lives at https://github.com/flathub/io.gpt4all.gpt4all

Signed-off-by: Tim <tim453@mailbox.org>
Signed-off-by: AT <manyoso@users.noreply.github.com>
Co-authored-by: AT <manyoso@users.noreply.github.com>
2024-07-25 16:05:17 -04:00
AT
241969f299
Update translations for strings in latest main. (#2735)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-25 15:47:20 -04:00
不知火 Shiranui
f9cd2e321c
feat: add openai-compatible api models (#2683)
Signed-off-by: Shiranui <supersonic@livemail.tw>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-07-25 10:02:52 -04:00
ThiloteE
6b97d0f8ea
Update README.md of gpt4all-chat (#2698)
- updates features
- removes feature wish-list
- removes any mention of gpt-j since support for it has been removed

Signed-off-by: ThiloteE <73715071+ThiloteE@users.noreply.github.com>
2024-07-25 09:20:31 -04:00
wuhanodoo
5623331a3c
Update gpt4all_zh_CN.ts, fix some not translated ,to merge (#2716)
Signed-off-by: wuhanodoo <99947164+wuodoo@users.noreply.github.com>
2024-07-25 09:19:17 -04:00
Thiago Ramos
6f15092f58
pt-BR translations (#2733)
Signed-off-by: Thiago Ramos <thiagojramos@outlook.com>
2024-07-25 09:18:44 -04:00
3Simplex
093c0332eb
Models json 310 Lowercase hash (#2732)
Signed-off-by: 3Simplex <10260755+3Simplex@users.noreply.github.com>
2024-07-24 17:05:13 -04:00
3Simplex
3dbc8da89e
updated models3.json (#2731)
Signed-off-by: 3simplex <10260755+3Simplex@users.noreply.github.com>
2024-07-24 16:32:46 -04:00
Adam Treat
1b51ef52fd Adjust the language of the latest news a bit.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-24 13:13:54 -04:00
Adam Treat
dbe953254a Update latest news.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-24 12:04:14 -04:00
AT
34a72d6f99
Bump version and release notes for v3.1.0 (#2726)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-24 12:03:10 -04:00
Adam Treat
03d460c732 Change to release v3.1.0
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-24 09:19:13 -04:00
AT
0cdd07b23f
Actually set the chat name and suggested follow-up question prompts. (#2723)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-23 21:51:36 -04:00
AT
765e055597
Change the timeout for circle ci and add a fixme. (#2722)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-23 17:01:46 -04:00
abhisomala
df510ef869
added tutorial and images for it (#2717)
* added tutorial and images for it

Signed-off-by: Max Cembalest <mbcembalest@gmail.com>

* updated images

Signed-off-by: abhisomala <68791501+abhisomala@users.noreply.github.com>
Signed-off-by: Max Cembalest <mbcembalest@gmail.com>

* Minor updates

Signed-off-by: abhisomala <68791501+abhisomala@users.noreply.github.com>
Signed-off-by: Max Cembalest <mbcembalest@gmail.com>

* fix link & indent note callouts

Signed-off-by: mcembalest <70534565+mcembalest@users.noreply.github.com>
Signed-off-by: Max Cembalest <mbcembalest@gmail.com>

* added obsidian tutorial to sidebar and fixed formatting of note boxes

Signed-off-by: Max Cembalest <mbcembalest@gmail.com>

---------

Signed-off-by: Max Cembalest <mbcembalest@gmail.com>
Signed-off-by: abhisomala <68791501+abhisomala@users.noreply.github.com>
Signed-off-by: mcembalest <70534565+mcembalest@users.noreply.github.com>
Co-authored-by: mcembalest <70534565+mcembalest@users.noreply.github.com>
Co-authored-by: Max Cembalest <mbcembalest@gmail.com>
2024-07-22 15:31:43 -04:00
Jared Van Bortel
2a7fe95ff4
llamamodel: always print special tokens (#2701)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-22 13:32:17 -04:00
Jared Van Bortel
4ca1d0411f
llamamodel: add DeepSeek-V2 to whitelist (#2702)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-22 13:32:04 -04:00
cosmic-snow
54ed30937f
Show scrollbar in CollectionsDrawer as needed (#2691)
Signed-off-by: Cosmic Snow <cosmic-snow@mailfence.com>
2024-07-19 16:56:20 -04:00
AT
f438626bc6
Update MAINTAINERS.md to add Riccardo (#2692)
Signed-off-by: AT <manyoso@users.noreply.github.com>
2024-07-19 16:43:28 -04:00
Riccardo Giovanetti
3da7800eb6
Create gpt4all_it.ts (#2700)
Created the localization file for the Italian language

Signed-off-by: Riccardo Giovanetti <riccardo.giovanetti@gmail.com>
2024-07-19 16:35:37 -04:00
Jared Van Bortel
290c629442
backend: rebase llama.cpp submodule on latest upstream (#2694)
* Adds support for GPT-NeoX, Gemma 2, OpenELM, ChatGLM, and Jais architectures (all with Kompute support)
* Also enables Kompute support for StarCoder2, XVERSE, Command R, and OLMo
* Includes a number of Kompute resource management fixes

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-19 14:52:58 -04:00
不知火 Shiranui
398ef34a87
Add zh_TW translation (#2661)
Signed-off-by: Shiranui <supersonic@livemail.tw>
2024-07-19 14:50:18 -04:00
AT
47a82a8441
Add a romanian translation file submitted by @SINAPSA_IC (#2662)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-19 14:31:03 -04:00
AT
d7f7c36bb3
Fix settings translations (#2690)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-19 14:28:54 -04:00
Jared Van Bortel
56d5a23001
chatllm: fix loading of chats after #2676 (#2693)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-18 21:03:18 -04:00
mcembalest
62abecaec8
fixed link to embeddings docs on localdocs page (#2687)
Signed-off-by: Max Cembalest <mbcembalest@gmail.com>
2024-07-17 16:36:31 -04:00
Jared Van Bortel
a840acf8fb
maintainers: add jstayco (#2686)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-17 16:11:38 -04:00
AT
a2be63e42b
Case-insensitive compare that pairs with issue #2609 and previous commit (#2684)
9e4991aced0571eb8db1fc07dfb7a5000932ed33, which allowed case-insensitive
matching of file extensions.

Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-17 16:08:09 -04:00
AT
ca72428783
Remove support for GPT-J models. (#2676)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-07-17 16:07:37 -04:00
cosmic-snow
e2ebd1ff04
localdocs: fail fast on bad path, more detailed 'invalid' error (#2679)
Signed-off-by: Cosmic Snow <cosmic-snow@mailfence.com>
2024-07-16 16:21:25 -04:00
AT
88a206ab93
settings: use enums for ChatTheme/FontSize, translate choices (#2667)
Also change SuggestionMode to work the same way.

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-07-16 16:12:44 -04:00
Jared Van Bortel
f0c754bece
main: set llmodel search path before initializing MySettings (#2677)
This fixes a regression in PR #2659.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-16 14:02:12 -04:00
Jared Van Bortel
da6e527ce2
maintainers: fix formatting and add jacoobes (#2678)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-16 13:59:16 -04:00
AT
73eef88138
Update MAINTAINERS.md
Signed-off-by: AT <manyoso@users.noreply.github.com>
2024-07-16 12:40:59 -04:00
AT
ca022344d1
Update MAINTAINERS.md
Signed-off-by: AT <manyoso@users.noreply.github.com>
2024-07-15 16:31:03 -04:00
AT
8484af71cb
Create MAINTAINERS.md
Signed-off-by: AT <manyoso@users.noreply.github.com>
2024-07-15 16:20:58 -04:00
Adam Treat
4996824ab1 Replace hyphens with underscores and fix build.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-12 17:18:01 -04:00
Adam Treat
8f261d06db Respond to translation events by invalidating the model.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-12 16:15:50 -04:00
AT
d515ad3b18
Feature: dynamic changes of language and locale at runtime issue #2644 (#2659)
This change updates the UI to allow dynamic changes of language and locale at
runtime. None of the language translations are finished or in releasable shape
yet, so this also adds a new build option that enables/disables the feature.
By default, no translations are currently built as part of a release. (A sketch
of the runtime mechanism follows this entry.)

Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-12 16:14:58 -04:00
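For reference, a minimal sketch of how runtime language switching typically works in a Qt application, assuming compiled .qm files at a resource path like :/i18n/gpt4all_<locale>.qm; the names and paths are illustrative, not the actual GPT4All implementation:

    // Swap QTranslator instances at runtime; loaded views then need a
    // retranslate pass (QML) or receive QEvent::LanguageChange (widgets).
    #include <QGuiApplication>
    #include <QTranslator>

    static QTranslator *s_translator = nullptr;

    bool setLanguage(const QString &locale)   // e.g. "it_IT", "zh_CN"
    {
        auto *tr = new QTranslator(qGuiApp);
        // Hypothetical resource path for the compiled .qm files.
        if (!tr->load(QStringLiteral(":/i18n/gpt4all_") + locale)) {
            delete tr;
            return false;                     // keep the current language
        }
        if (s_translator)
            QGuiApplication::removeTranslator(s_translator);
        QGuiApplication::installTranslator(tr);
        s_translator = tr;
        return true;
    }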
AT
0a94d7d55d
Update contributing_translations.md
Signed-off-by: AT <manyoso@users.noreply.github.com>
2024-07-12 14:27:57 -04:00
AT
6e0c066048
Add zh_CN.ts translation. (#2657)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-12 12:58:09 -04:00
Jay
66a255015f
Add Spanish (MX) Translation (#2654)
Signed-off-by: Jeremy Tayco <127801635+jstayco@users.noreply.github.com>
2024-07-11 19:55:07 -04:00
Adam Treat
08c28c249b Ignore translation files in codespell.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-11 19:06:32 -04:00
AT
95aa6fbcbb
Update contributing_translations.md
Signed-off-by: AT <manyoso@users.noreply.github.com>
2024-07-11 13:36:23 -04:00
akgom
214499ce84
Update use-local-ai-models-to-privately-chat-with-google-drive.md (#2647)
Updated screenshots for google drive guide with new app images
Signed-off-by: Max Cembalest
2024-07-11 13:22:43 -04:00
AT
607ac19dcb
Add scaffolding for translations. (#2612)
* Fix up concatenated strings in favor of arguments, remove some strings that are not meant to be translated, and add Chinese.

Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-11 13:02:54 -04:00
akgom
df5d374187
Update use-local-ai-models-to-privately-chat-with-One-Drive.md (#2646)
Signed-off-by: akgom <132290469+akgom@users.noreply.github.com>
2024-07-11 11:26:28 -04:00
AT
fa3f8e74d1
Update contributing_translations.md
Signed-off-by: AT <manyoso@users.noreply.github.com>
2024-07-11 11:22:06 -04:00
akgom
7ec67eab15
Create using-local-ai-models-to-privately-chat-with-One-Drive.md (#2637)
* Create using-local-ai-models-to-privately-chat-with-One-Drive.md
Signed-off-by: Max Cembalest
2024-07-11 11:03:05 -04:00
AT
45605a6ea1
Create contributing_translations.md
Signed-off-by: AT <manyoso@users.noreply.github.com>
2024-07-11 10:45:49 -04:00
AT
e832ebaf21
Add ability to click on links in latest news. (#2643)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-10 19:09:06 -04:00
AT
9e4991aced
Fix for issue #2609.
Provide case insensitive matching of localdocs file extensions.

Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-10 19:08:24 -04:00
Andriy Mulyar
d87484d3c9
analytics entry (#2641)
Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>
2024-07-10 18:50:16 -04:00
Jared Van Bortel
6cb3ddafd6
llama.cpp: update submodule for CPU fallback fix (#2640)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-10 17:56:19 -04:00
AT
a8a0f4635a
ci: upload installer repo as compressed archive (#2636)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-07-10 16:23:31 -04:00
AT
66bc04aa8e
chat: generate follow-up questions after response (#2634)
* user can configure the prompt and when they appear
* also make the name generation prompt configurable

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-07-10 15:45:20 -04:00
AT
ef4e362d92
ci: downgrade CUDA dep to 11.8 for compatibility (#2639)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-10 15:29:44 -04:00
AT
a8cdba0427
ChatView: move stop button to chat input box (#2561)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-07-10 15:23:35 -04:00
3Simplex
7340bf85e3
Update theme for menus (#2578)
* Add radius to selected menu items, adjust width to fit. Update colors.
* Fix server text color in chatview.

Signed-off-by: 3simplex <10260755+3Simplex@users.noreply.github.com>
2024-07-10 10:36:20 -04:00
mcembalest
0de6eba69e
formatted note callouts (#2633)
Signed-off-by: Max Cembalest <mbcembalest@gmail.com>
2024-07-10 09:55:53 -04:00
Andriy Mulyar
62d423c554
typo (#2629)
Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>
2024-07-09 22:54:43 -04:00
akgom
7f2ceff5c8
Create googledrive.md (#2621)
* Create googledrive.md

Signed-off-by: akgom <132290469+akgom@users.noreply.github.com>

* updates

Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>

---------

Signed-off-by: akgom <132290469+akgom@users.noreply.github.com>
Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>
Co-authored-by: Andriy Mulyar <andriy.mulyar@gmail.com>
2024-07-09 22:47:23 -04:00
cosmic-snow
fd005be844
GPT4All Chat server API: add errors 405 Method Not Allowed (#2615)
Add a HTTP status code 405 Method Not Allowed for each of the four handlers
when the wrong method (GET/POST) is used. This mimics the OpenAI API response.

Signed-off-by: cosmic-snow <134004613+cosmic-snow@users.noreply.github.com>
2024-07-09 13:08:20 -04:00
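For illustration, a hedged sketch of the 405 behaviour described above; the Request/Response types below are hypothetical stand-ins, not the actual GPT4All server classes:

    #include <map>
    #include <string>

    struct Request  { std::string method; std::string body; };
    struct Response { int status = 200; std::map<std::string, std::string> headers; std::string body; };

    // Completions is POST-only; anything else gets 405 plus an Allow header,
    // mirroring how the OpenAI API responds.
    Response handleCompletions(const Request &req)
    {
        if (req.method != "POST") {
            Response resp;
            resp.status = 405; // Method Not Allowed
            resp.headers["Allow"] = "POST";
            resp.body = R"({"error": {"message": "method not allowed"}})";
            return resp;
        }
        // ... normal completion handling would go here ...
        return Response{};
    }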
AT
ccb98f34e0
A better animation for when the model is thinking/responding. (#2557)
* A better animation for when the model is thinking/responding.
* ChatView: remove redundant ternary ops

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-07-09 12:13:51 -04:00
CharlesCNorton
ce4dc2e789
typo in training log documentation (#2452)
Corrected a typo in the training log documentation where "seemded" was changed to "seemed". This enhances the readability and professionalism of the document.

Signed-off-by: CharlesCNorton <135471798+CharlesCNorton@users.noreply.github.com>
2024-07-09 11:19:04 -04:00
CharlesCNorton
ea4c5546d8
fix: typos in reset context comments (#2453)
Corrected two typos and a grammatical issue in the comments within the reset context code. Specifically:
- Changed "indictate" to "indicate"
- Corrected "me might" to "we might"

Signed-off-by: CharlesCNorton <135471798+CharlesCNorton@users.noreply.github.com>
2024-07-09 11:18:37 -04:00
Hampus
b9103892b6
fix: incomplete sentence in faq (#2611)
Signed-off-by: Hampus <16954508+xdHampus@users.noreply.github.com>
2024-07-09 11:13:03 -04:00
AT
c11e0f4a98
qml: fix hang in ChatView by processing text explicitly (#2543)
Fixes #2519

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-07-08 17:24:02 -04:00
AT
64359e68e6
ChatView: fix property binding loop warning (#2601)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-08 17:04:35 -04:00
Jared Van Bortel
4853adebd9
qml: improve chats with missing models and model settings layout (#2520)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-08 17:01:30 -04:00
AT
11b58a1a15
Fix scrolling of the chat view at expense of some more memory usage. (#2555)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-08 12:26:44 -04:00
HydeZero
c73f0e5c8c
python: fix docstring grammar (#2529)
Signed-off-by: HydeZero <128327411+HydeZero@users.noreply.github.com>
2024-07-05 12:44:28 -04:00
3Simplex
ccfd0c0c54
Remove semicolons, fix indentation. (#2560)
Signed-off-by: 3simplex <10260755+3Simplex@users.noreply.github.com>
2024-07-04 19:25:29 -04:00
Jared Van Bortel
30692a2dfc
modellist: work around filtered item models getting out of sync (#2545)
This fixes an issue with the "Clone" button producing a page of blank settings.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-03 18:38:44 -04:00
AT
cd100c8339
Fix folder dialog on linux so that we can select the folder properly. (#2541)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-03 12:00:01 -04:00
AT
a34b0d78cb
Fix placement of our thumbs down dialog and datalake opt-in dialog. (#2540)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-03 11:59:48 -04:00
AT
45aa141189
Customize the menu to fit our style (#2535)
* Style and align with a rounded border for combobox popups.

* Convert this menu to use the new style as well.

Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-03 11:43:59 -04:00
Jared Van Bortel
1ce65e8ef2
release.json: update release notes for v3.0.0 (#2514)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-02 12:08:23 -04:00
mcembalest
69102a2859
small edits and placeholder gif (#2513)
* small edits and placeholder gif

Signed-off-by: Max Cembalest <max@nomic.ai>

* jul2 docs updates

Signed-off-by: Max Cembalest <max@nomic.ai>

* added video

Signed-off-by: mcembalest <70534565+mcembalest@users.noreply.github.com>
Signed-off-by: Max Cembalest <max@nomic.ai>

* quantization nits

Signed-off-by: Max Cembalest <max@nomic.ai>

---------

Signed-off-by: Max Cembalest <max@nomic.ai>
Signed-off-by: mcembalest <70534565+mcembalest@users.noreply.github.com>
2024-07-02 11:41:39 -04:00
Jared Van Bortel
b7d1b938cc
cmake: set version to 3.0.1-dev0 (#2512)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-02 11:09:26 -04:00
AT
e15df48945
Add release notes and bump version. (#2510)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-07-02 10:01:57 -04:00
John W. Parent
4c26726f3e
MacOS Build Online: no offline on (#2509)
Signed-off-by: John Parent <john.parent@kitware.com>
2024-07-01 20:03:00 -04:00
John W. Parent
f751d206bb
Online workflow (#2505)
Adds a circleci workflow to build and sign online
installers on Windows and MacOS

Signed-off-by: John Parent <john.parent@kitware.com>
2024-07-01 19:14:19 -04:00
John W. Parent
47015278f4
Ensure dotnet path in Windows signing job (#2508)
Signed-off-by: John Parent <john.parent@kitware.com>
2024-07-01 19:08:47 -04:00
Jared Van Bortel
ac7553dcaa remove console.logs from last PR
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-01 18:30:19 -04:00
Jared Van Bortel
3d496f4648
ChatDrawer: make chat name editing more obvious (#2507)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-01 18:24:04 -04:00
Jared Van Bortel
67fd391c8f
chat: fix documentation links (#2506)
Now that the documentation has been overhauled, we need to point to the
new locations.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-01 18:10:19 -04:00
John W. Parent
c0d311bc66
Add initial template windows signing flow (#2443)
Adds workflow signing Windows installers with
EV certificate from Azure Key Vault via
AzureSignTool

Adds CMake to sign Windows binaries as they're processed

Installs dotnet 8 as required by AST

Signed-off-by: John Parent <john.parent@kitware.com>
2024-07-01 17:40:02 -04:00
Adam Treat
576e167146 Bump the version to v3.0.0
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-01 16:50:28 -04:00
Jared Van Bortel
ce6680f046
chat: remove alt_logo.svg, tweak You/GPT4All icons (#2503)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-01 16:46:03 -04:00
mcembalest
b85b74d5bf
docs: bump copyright year and change site_description (#2502)
Signed-off-by: Max Cembalest <max@nomic.ai>
2024-07-01 14:34:07 -04:00
AT
ff207dc344
Fix the erase search button to use the new icon. (#2501)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-01 14:01:11 -04:00
mcembalest
125b8d50bd
mkdocs imaging requirements (#2500)
Signed-off-by: Max Cembalest <max@nomic.ai>
Signed-off-by: mcembalest <70534565+mcembalest@users.noreply.github.com>
2024-07-01 13:34:23 -04:00
mcembalest
7127539146
markdown captions (#2499)
Signed-off-by: Max Cembalest <max@nomic.ai>
2024-07-01 13:18:18 -04:00
mcembalest
5306595176
V3 docs max (#2488)
* new skeleton

Signed-off-by: Max Cembalest <max@nomic.ai>

* v3 docs

Signed-off-by: Max Cembalest <max@nomic.ai>

---------

Signed-off-by: Max Cembalest <max@nomic.ai>
2024-07-01 13:00:14 -04:00
Jared Van Bortel
bd307abfe6
backend: fix a crash on inputs greater than n_ctx (#2498)
This fixes a regression in commit 4fc4d94b ("fix chat-style prompt
templates (#1970)"), which moved some return statements into a new
function (LLModel::decodePrompt) without making them return from the
parent as well.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-07-01 11:33:46 -04:00
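The pattern behind that regression is worth spelling out; a hedged sketch with illustrative names, not the actual LLModel code:

    #include <vector>

    // The early exits were moved into a helper, so they stopped exiting the
    // caller. Returning a status and propagating it restores the old behavior.
    static bool decodePromptSketch(const std::vector<int> &tokens, int nCtx)
    {
        if ((int)tokens.size() > nCtx)
            return false;   // was a bare 'return;' before the refactor
        // ... feed tokens to the model ...
        return true;
    }

    void promptSketch(const std::vector<int> &tokens, int nCtx)
    {
        if (!decodePromptSketch(tokens, nCtx))
            return;         // the parent must bail out too, or it fails later
        // ... generate the response ...
    }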
Adam Treat
146428fa0a Bump version to rc5.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-01 08:01:57 -04:00
AT
fefc1f19c7
This is unnecessary and was a mistake to include. (#2496)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-07-01 07:58:37 -04:00
Adam Treat
85fa4436f7 Change the wording slightly
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-30 23:49:05 -04:00
Adam Treat
357825232f Add a new transparent version of our icon and display it on homepage.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-30 23:47:34 -04:00
Adam Treat
f3d6c09d5a Remove the link as it does not work yet.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-30 23:38:52 -04:00
Adam Treat
534e772914 Test update of the latest news.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-30 23:36:54 -04:00
AT
36e5803893
This bottom padding while doing text generation is no longer necessary with the tray. (#2495)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-30 23:27:30 -04:00
AT
56834a28cb
Fix response generation which was broken by themeable syntax highlighting. (#2494)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-30 20:55:42 -04:00
cosmic-snow
ab46bd5739
Add documentation link to HomeView.qml (#2486)
Additionally link to the documentation at https://docs.gpt4all.io/gpt4all_chat.html

Signed-off-by: cosmic-snow <134004613+cosmic-snow@users.noreply.github.com>
2024-06-30 19:35:31 -04:00
AT
ef6f0dd421
Fix reload button hovering and the combobox size when a model with a long name is selected. (#2493)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-30 19:26:53 -04:00
AT
37dbd56153
Latest rc5 fixes (#2492)
* Enlarge the new conversation tray a bit.

* Add themeable code syntax highlighting.

* Change the default size to a larger context chunk for localdocs.

Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-30 19:15:01 -04:00
AT
4a1a3c48e8
Latest v3.0.0 rc4 fixes (#2490)
* Use the same font size for code blocks as we do for the rest of the chat text.

* Add a conversation tray after discussion with Vincent and Andriy and after
gathering feedback from some other users. This brings the reset-context feature
back as a recycle button, and brings the copy-chat feature back to the app for v3.0.0.

Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-30 15:10:19 -04:00
Adam Treat
1bc16a2a4f Bump version to v3.0.0-rc4
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-29 18:02:45 -04:00
AT
b5fdc4c05a
Chatview and combobox UI fixes (#2489)

Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-29 18:00:52 -04:00
AT
55858f93b0
Solve a bad performance problem in text processing that leads to hangs of the UI. (#2487)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-29 17:59:45 -04:00
Adam Treat
720ea5a147 Bump the version to rc3.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-28 20:34:17 -04:00
Jared Van Bortel
a191844a3f
new UI fixes, part 5 (#2485)
additional new ui changes, part 5 (#2485)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-06-28 20:34:03 -04:00
Jared Van Bortel
22396a6fa1
AddCollectionView: fix label colors (#2484)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-06-28 17:50:12 -04:00
Jared Van Bortel
f2cad6abaa
additional new ui changes, part 4 (#2481)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-06-28 17:11:12 -04:00
Adam Treat
d893a6e5d6 Give it an empty string default.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-28 14:52:44 -04:00
Adam Treat
8fe73832a6 Fix up loading of the default model so it displays the actual model name, as per Vincent.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-28 14:52:44 -04:00
John W. Parent
23e8b187a4
Add basic signing of app bundle and binaries (#2472)
Adds verification functionality to codesign script
Adds required context to enable XCode to perform the signing
Adds install time check + signing for all binaries
Adds instructions allowing macdeployqt to sign the finalized app bundle

Signed-off-by: John Parent <john.parent@kitware.com>
2024-06-28 14:21:18 -04:00
AT
dc6d01a0bb
Change the name of this property to not conflict. (#2480)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-28 13:34:26 -04:00
Jared Van Bortel
2c8d634b5b
UI and embedding device changes for GPT4All v3.0.0-rc3 (#2477)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-06-28 12:57:57 -04:00
AT
426aa5eb47
Go ahead and try to handle links in the text by opening them externally. (#2479)
Handle links in the text by opening them externally.

Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-28 12:54:12 -04:00
Jared Van Bortel
81bbeef5b3 partially back out that change, I wasn't reading
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-06-28 11:49:12 -04:00
Jared Van Bortel
1712830228 chatviewtextprocessor: fix missing #include and simplify sort
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-06-28 11:49:12 -04:00
Adam Treat
f6f265f968 This allows support for markdown table display and <foo>
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-28 11:49:12 -04:00
Adam Treat
6f52f602ef Improved markdown support:
* Correctly displays inline code blocks with syntax highlighting turned on,
together with markdown, at the same time
* Adds a context menu item for toggling markdown on and off, which
essentially turns all text processing on or off
* Uses QTextDocument::MarkdownNoHTML to handle markdown in QTextDocument,
which allows HTML tags to display as normal text, but unfortunately does not
allow markdown tables to be displayed as markdown (a short sketch follows this entry)

Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-28 11:49:12 -04:00
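A rough illustration of the QTextDocument behaviour described above; a sketch only, not the chat view's actual text processing:

    #include <QString>
    #include <QTextDocument>

    void renderChatText(QTextDocument &doc, const QString &text, bool markdownEnabled)
    {
        if (markdownEnabled) {
            // Per the note above: with MarkdownNoHTML, HTML in the text is not
            // interpreted as markup, at the cost of markdown tables not
            // rendering as tables.
            doc.setMarkdown(text, QTextDocument::MarkdownNoHTML);
        } else {
            doc.setPlainText(text); // markdown off: no text processing at all
        }
    }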
Adam Treat
d92252cab1 Revert an incorrect renaming that slipped in.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-27 11:14:12 -04:00
Jared Van Bortel
6506ba161b
UI tweaks for GPT4All v3.0.0-rc2 (#2474)
* clickable link to get API key with hand-style mouse cursor
* remove "Force Metal" setting
* allow typing incorrect API keys (but don't accept them), add placeholder text

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-06-27 11:08:32 -04:00
Adam Treat
bed92046d0 Set the 3.0.0-rc2 version.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-27 11:00:00 -04:00
Adam Treat
a1ec6f2150 Change the divider height and color to be more consistent.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-27 07:25:39 -04:00
Adam Treat
8d6e11fcad Change to just sources after multiple feedback advising same.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-27 07:25:39 -04:00
Adam Treat
fc5dc9dd1a Fix the scrollbar so it doesn't overlap content on chat view.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-27 07:25:39 -04:00
Adam Treat
d4494602e2 Markdown support.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-27 07:25:39 -04:00
AT
23e8f43c5a
Change the way we're showing the localdocs sources. (#2475)
* Change the way we're showing the localdocs sources.

Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-26 22:00:48 -04:00
Adam Treat
31fa575c35 Place the antenna icon in the lower left, right above the Nomic logo, as per discussion.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-26 16:43:22 -04:00
Adam Treat
6d593d6ea1 Fix the thumbsdown dialog.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-26 16:37:01 -04:00
Jared Van Bortel
01870b4a46
chat: fix blank device in UI and improve Mixpanel reporting (#2409)
Also remove LLModel::hasGPUDevice.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-06-26 15:26:27 -04:00
Adam Treat
53fc2d56f6 Add a tooltip to make clear what is going on with the antenna animation.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-26 15:20:09 -04:00
Adam Treat
e5d9936d04 Update the license.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-26 15:07:51 -04:00
Adam Treat
11823022e2 Add a fixme for combobox popups in general, which are less than ideal right now.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-26 15:00:50 -04:00
Jared Van Bortel
da1823ed7a
cmake: fix CMAKE_CUDA_ARCHITECTURES default (#2421)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-06-26 14:48:18 -04:00
AT
3a61070f82
chat: fix incorrect file URIs for sources on Windows (#2469)
This was causing LocalDocs sources to not open correctly on Windows.

Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-06-26 14:48:02 -04:00
Adam Treat
c87ccf4124 Make the chatview combo scrollable.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-26 14:39:39 -04:00
Adam Treat
88f5face2b Change section headers to be lighter and smaller as per Vincent.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-26 13:48:02 -04:00
Adam Treat
f8a935d8a6 Decrease vertical size of search bar and spacing in add model view.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-26 13:00:01 -04:00
Adam Treat
029bd318e9 If the Hugging Face search doesn't give this information, then display a question mark.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-26 12:31:11 -04:00
Adam Treat
d5968f4ab2 Make the chatdrawer edit/delete icons smaller.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-26 12:19:02 -04:00
John W. Parent
30febbe3d2
Add basic Macos signing + notarizing workflow (#2319)
Adds basic CircleCI workflow to sign, notarize,
and staple MacOS app bundle and associated DMG,
then publishes signed binary in CircleCI artifacts

Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-25 20:31:51 -04:00
Jared Van Bortel
88d85be0f9
chat: fix build on Windows and Nomic Embed path on macOS (#2467)
* chat: remove unused oscompat source files

These files are no longer needed now that the hnswlib index is gone.
This fixes an issue with the Windows build as there was a compilation
error in oscompat.cpp.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* llm: fix pragma to be recognized by MSVC

Replaces this MSVC warning:
C:\msys64\home\Jared\gpt4all\gpt4all-chat\llm.cpp(53,21): warning C4081: expected '('; found 'string'

With this:
C:\msys64\home\Jared\gpt4all\gpt4all-chat\llm.cpp : warning : offline installer build will not check for updates!

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* usearch: fork usearch to fix `CreateFile` build error

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* dlhandle: fix incorrect assertion on Windows

SetErrorMode returns the previous value of the error mode flags, not an
indicator of success.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* llamamodel: fix UB in LLamaModel::embedInternal

It is undefined behavior to increment an STL iterator past the end of
the container. Use offsets to do the math instead (see the sketch after this entry).

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* cmake: install embedding model to bundle's Resources dir on macOS

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* ci: fix macOS build by explicitly installing Rosetta

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

---------

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-06-25 17:22:51 -04:00
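Two of the fixes above reduce to small patterns; hedged sketches follow (illustrative code, not the exact GPT4All sources):

    #ifdef _WIN32
    #include <windows.h>

    // SetErrorMode returns the *previous* error-mode flags, not a success flag,
    // so asserting on its return value proves nothing.
    void suppressWindowsErrorDialogs()
    {
        UINT prev = SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOOPENFILEERRORBOX);
        (void)prev; // keep only if we ever want to restore the old mode
    }
    #endif

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Never advance an iterator past end(); compute each chunk with offsets.
    // Assumes chunkSize > 0.
    void embedInChunksSketch(const std::vector<int> &tokens, std::size_t chunkSize)
    {
        for (std::size_t off = 0; off < tokens.size(); off += chunkSize) {
            std::size_t n = std::min(chunkSize, tokens.size() - off);
            auto first = tokens.begin() + off; // off < size(), so this is valid
            auto last  = first + n;            // never past end()
            // ... embed the range [first, last) ...
            (void)first; (void)last;
        }
    }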
AT
bbf0c2f246
Update qa_checklist.md
Add the directories for each OS

Signed-off-by: AT <manyoso@users.noreply.github.com>
2024-06-25 13:50:19 -04:00
AT
9363ffb958
Update qa_checklist.md
Add another step for users to shut down and retest with settings and other data carried over from a previous version

Signed-off-by: AT <manyoso@users.noreply.github.com>
2024-06-25 13:15:29 -04:00
AT
8724572d61
Create qa_checklist.md
Add a checklist for QA testing

Signed-off-by: AT <manyoso@users.noreply.github.com>
2024-06-25 13:06:52 -04:00
Jared Van Bortel
1a00882276
embllm: fix use of llama ctx before loading (#2465)
This fixes a regression in PR #2396.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-06-25 11:04:01 -04:00
AT
9273b49b62
chat: major UI redesign for v3.0.0 (#2396)
Signed-off-by: Adam Treat <treat.adam@gmail.com>
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-06-24 18:49:23 -04:00
Adam Treat
1272b694ae Add a latest news markdown file for future version.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-06-24 13:57:12 -04:00
patcher9
986d9d9bb8
docs: add description of OpenLIT GPU monitoring (#2436)
Signed-off-by: patcher9 <patcher99@dokulabs.com>
2024-06-13 11:23:32 -04:00
dependabot[bot]
b999d07d93
typescript: update braces dep to 3.0.3 (#2432)
Signed-off-by: dependabot[bot] <support@github.com>
2024-06-12 17:14:47 -04:00
Jared Van Bortel
beaede03fb
repo: remove bindings that have no maintainer (#2429)
The C#, Java, and Go bindings are now removed from the repo.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-06-11 18:11:25 -04:00
Jared Van Bortel
41c9013fa4
chat: don't use incomplete types with signals/slots/Q_INVOKABLE (#2408)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-06-06 11:59:28 -04:00
Markus Mayer
69f766cbbb
ci: update checkout action to v4 in codespell workflow (#2414)
Signed-off-by: Markus Mayer <widemeadows@gmail.com>
2024-06-05 11:34:51 -04:00
patcher9
d43bfa0a53
docs: document OpenLIT integration (#2386)
Signed-off-by: patcher9 <patcher99@dokulabs.com>
Co-authored-by: Jared Van Bortel <jared@nomic.ai>
2024-06-05 11:05:21 -04:00
Jared Van Bortel
d3d777bc51
chat: fix #includes with include-what-you-use (#2401)
Also use qGuiApp instead of qApp.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-06-04 14:47:11 -04:00
Jared Van Bortel
55d709862f Revert "typescript bindings maintenance (#2363)"
As discussed on Discord, this PR was not ready to be merged. CI fails on
it.

This reverts commit a602f7fde7303fbec408c9a299ae021b103f9a74.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-06-03 17:26:19 -04:00
Andreas Obersteiner
a602f7fde7
typescript bindings maintenance (#2363)
* remove outdated comments

Signed-off-by: limez <limez@protonmail.com>

* simpler build from source

Signed-off-by: limez <limez@protonmail.com>

* update unix build script to create .so runtimes correctly

Signed-off-by: limez <limez@protonmail.com>

* configure ci build type, use RelWithDebInfo for dev build script

Signed-off-by: limez <limez@protonmail.com>

* add clean script

Signed-off-by: limez <limez@protonmail.com>

* fix streamed token decoding / emoji

Signed-off-by: limez <limez@protonmail.com>

* remove deprecated nCtx

Signed-off-by: limez <limez@protonmail.com>

* update typings

Signed-off-by: jacob <jacoobes@sern.dev>

update typings

Signed-off-by: jacob <jacoobes@sern.dev>

* readme,mspell

Signed-off-by: jacob <jacoobes@sern.dev>

* cuda/backend logic changes + name napi methods like their js counterparts

Signed-off-by: limez <limez@protonmail.com>

* convert llmodel example into a test, separate test suite that can run in ci

Signed-off-by: limez <limez@protonmail.com>

* update examples / naming

Signed-off-by: limez <limez@protonmail.com>

* update deps, remove the need for binding.ci.gyp, make node-gyp-build fallback easier testable

Signed-off-by: limez <limez@protonmail.com>

* make sure the assert-backend-sources.js script is published, but not the others

Signed-off-by: limez <limez@protonmail.com>

* build correctly on windows (regression on node-gyp-build)

Signed-off-by: Jacob Nguyen <76754747+jacoobes@users.noreply.github.com>

* codespell

Signed-off-by: limez <limez@protonmail.com>

* make sure dlhandle.cpp gets linked correctly

Signed-off-by: limez <limez@protonmail.com>

* add include for check_cxx_compiler_flag call during aarch64 builds

Signed-off-by: limez <limez@protonmail.com>

* x86 > arm64 cross compilation of runtimes and bindings

Signed-off-by: limez <limez@protonmail.com>

* default to cpu instead of kompute on arm64

Signed-off-by: limez <limez@protonmail.com>

* formatting, more minimal example

Signed-off-by: limez <limez@protonmail.com>

---------

Signed-off-by: limez <limez@protonmail.com>
Signed-off-by: jacob <jacoobes@sern.dev>
Signed-off-by: Jacob Nguyen <76754747+jacoobes@users.noreply.github.com>
Co-authored-by: Jacob Nguyen <76754747+jacoobes@users.noreply.github.com>
Co-authored-by: jacob <jacoobes@sern.dev>
2024-06-03 11:12:55 -05:00
woheller69
f001897a1a
Fix path in Readme (#2339)
Signed-off-by: woheller69 <68678880+woheller69@users.noreply.github.com>
2024-05-31 17:20:41 -04:00
Jared Van Bortel
636307160e
backend: fix #includes with include-what-you-use (#2371)
Also fix a PARENT_SCOPE warning when building the backend.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-31 16:34:54 -04:00
Jared Van Bortel
8ba7ef4832
dlhandle: suppress DLL errors on Windows (#2389)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-31 16:33:40 -04:00
Jared Van Bortel
4e89a9c44f
backend: support non-ASCII characters in path to llmodel libs on Windows (#2388)
* backend: refactor dlhandle.h into oscompat.{cpp,h}

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* llmodel: alias std::filesystem

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* llmodel: use wide strings for paths on Windows

Using the native path representation allows us to manipulate paths and
call LoadLibraryEx without mangling non-ASCII characters (see the sketch after this entry).

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* llmodel: prefer built-in std::filesystem functionality

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* oscompat: fix string type error

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* backend: rename oscompat back to dlhandle

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* dlhandle: fix #includes

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* dlhandle: remove another #include

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* dlhandle: move dlhandle #include

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* dlhandle: remove #includes that are covered by dlhandle.h

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* llmodel: fix #include order

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

---------

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-31 13:12:28 -04:00
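A hedged sketch of the wide-path idea above, not the actual llmodel loader: keep paths as std::filesystem::path and pass the native wide string to LoadLibraryExW so non-ASCII directories are preserved. The search flags are one reasonable choice, not necessarily what the project uses.

    #ifdef _WIN32
    #include <windows.h>
    #include <filesystem>

    // Load a backend DLL from a path that may contain non-ASCII characters,
    // without converting through a narrow (ANSI) string.
    HMODULE loadBackendLibrary(const std::filesystem::path &dir, const wchar_t *name)
    {
        std::filesystem::path dll = dir / name;   // stays wide on Windows
        return LoadLibraryExW(dll.c_str(), nullptr,
                              LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR |
                              LOAD_LIBRARY_SEARCH_DEFAULT_DIRS);
    }
    #endif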
Jared Van Bortel
8a70f770a2
ci: fix Python build after CUDA PR (#2373)
Build with -DCMAKE_BUILD_TYPE=Release, and use MSVC on Windows.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-29 10:52:45 -04:00
Jared Van Bortel
e94177ee9a
llamamodel: fix embedding crash for >512 tokens after #2310 (#2383)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-29 10:51:00 -04:00
Jared Van Bortel
f047f383d0
llama.cpp: update submodule for "code" model crash workaround (#2382)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-29 10:50:00 -04:00
Jared Van Bortel
f1b4092ca6
llamamodel: fix BERT tokenization after llama.cpp update (#2381)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-28 13:11:57 -04:00
Jared Van Bortel
0b63ad5eff
chat: add release notes for v2.8.0 and bump version (#2372)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-23 10:29:25 -04:00
Jared Van Bortel
09dd3dc318
python: depend on offical NVIDIA CUDA packages (#2355)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-20 18:06:27 -04:00
Jared Van Bortel
c779d8a32d
python: init_gpu fixes (#2368)
* python: tweak GPU init failure message
* llama.cpp: update submodule for use-after-free fix

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-20 18:04:11 -04:00
Jared Van Bortel
e021fe130f
installer script: fix detection of macOS on newer QtIFW (#2361)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-17 12:28:46 -04:00
Jared Van Bortel
2025d2d15b
llmodel: add CUDA to the DLL search path if CUDA_PATH is set (#2357)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-16 17:39:49 -04:00
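Roughly, the mechanism is as follows; a sketch under the assumption that CUDA_PATH points at the CUDA Toolkit root, not the exact implementation:

    #ifdef _WIN32
    #include <windows.h>
    #include <cstdlib>
    #include <filesystem>

    // Make the CUDA runtime DLLs findable when loading the CUDA backend.
    void addCudaToDllSearchPath()
    {
        if (const char *cudaPath = std::getenv("CUDA_PATH")) {
            std::filesystem::path bin = std::filesystem::path(cudaPath) / "bin";
            // Directories added this way are searched when libraries are
            // loaded with LOAD_LIBRARY_SEARCH_USER_DIRS / SEARCH_DEFAULT_DIRS.
            AddDllDirectory(bin.c_str());
        }
    }
    #endif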
Jared Van Bortel
a92d266cea
cmake: fix Metal build after #2310 (#2350)
I don't understand why this is needed, but it works.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-15 18:12:32 -04:00
Jared Van Bortel
d2a99d9bc6
support the llama.cpp CUDA backend (#2310)
* rebase onto llama.cpp commit ggerganov/llama.cpp@d46dbc76f
* support for CUDA backend (enabled by default)
* partial support for Occam's Vulkan backend (disabled by default)
* partial support for HIP/ROCm backend (disabled by default)
* sync llama.cpp.cmake with upstream llama.cpp CMakeLists.txt
* changes to GPT4All backend, bindings, and chat UI to handle choice of llama.cpp backend (Kompute or CUDA)
* ship CUDA runtime with installed version
* make device selection in the UI on macOS actually do something
* model whitelist: remove dbrx, mamba, persimmon, plamo; add internlm and starcoder2

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-15 15:27:50 -04:00
Jared Van Bortel
a618ca5699
readme: document difference between installers (#2336)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-15 14:10:10 -04:00
Jared Van Bortel
fbbf810020
chat: fix issues with the initial "New Chat" (#2330)
* select the existing new chat if there already is one when "New Chat" is clicked
* scroll to the new chat when "New Chat" is clicked
* fix the "New Chat" being scrolled past the top of the chat list

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-15 14:09:32 -04:00
Jared Van Bortel
7e1e00f331
chat: fix issues with quickly switching between multiple chats (#2343)
* prevent load progress from getting out of sync with the current chat
* fix memory leak on exit if the LLModelStore contains a model
* do not report cancellation as a failure in console/Mixpanel
* show "waiting for model" separately from "switching context" in UI
* do not show lower "reload" button on error
* skip context switch if unload is pending
* skip unnecessary calls to LLModel::saveState

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-15 14:07:03 -04:00
Jared Van Bortel
7f1c3d4275
chatllm: fix model loading progress showing "Reload" sometimes (#2337)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-15 13:57:53 -04:00
Jared Van Bortel
9f9d8e636f
backend: do not crash if GGUF lacks general.architecture (#2346)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-15 13:57:13 -04:00
Jared Van Bortel
6d8888b267
llamamodel: free the batch in embedInternal (#2348)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-15 12:46:12 -04:00
AT
61cefcfd8a
Fix destruction and tear down of the embedding thread. (#2328)
* Fix destruction and tear down of the embedding thread.

Signed-off-by: Adam Treat <treat.adam@gmail.com>

* Fix order of deletion to prevent use after free.

Signed-off-by: Adam Treat <treat.adam@gmail.com>

---------

Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-05-15 10:01:53 -04:00
Jared Van Bortel
1427ef7195
chat: fix window icon on Windows (#2321)
* chat: fix window icon on Windows

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* chat: remove redundant copy of macOS app icon

This has been redundant since PR #2180.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

---------

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-09 13:42:46 -04:00
Tim453
69720fedaa
Update appdata.xml (#2307)
2024-05-09 12:51:38 -04:00
Jared Van Bortel
86560f3952
maint: remove Docker API server and related references (#2314)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-09 12:50:26 -04:00
Jared Van Bortel
5fb9d17c00
chatllm: use a better prompt for the generated chat name (#2322)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-09 09:38:19 -04:00
Jared Van Bortel
f26e8d0d87
chat: do not allow sending a message while the LLM is responding (#2323)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-09 09:37:36 -04:00
Jared Van Bortel
d54e644d05
ChatView: make context menus more intuitive (#2324)
* ChatView: fix deprecation warning

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* ChatView: make context menus more intuitive

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

---------

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-09 09:35:54 -04:00
Jared Van Bortel
cef74c2be2
readme: cleanup and modernization (#2308)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-06 13:29:37 -04:00
Jared Van Bortel
26eaf598b4
chat: add release notes for v2.7.5 and bump version (#2300)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-03 09:54:09 -04:00
Andriy Mulyar
d7c47fb6f7
Update README.md (#2301)
Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>
2024-05-02 20:02:19 -04:00
Jared Van Bortel
577ebd4826
mixpanel: report cpu_supports_avx2 on startup (#2299)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-02 16:09:41 -04:00
Jared Van Bortel
855fd22417
localdocs: load model before checking which model is loaded (#2284)
* localdocs: load model before checking what we loaded

Fixes "WARNING: Request to generate sync embeddings for non-local model
invalid"

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* fix inverted assertion

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

---------

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-02 09:30:36 -04:00
Jared Van Bortel
adaecb7a72
mixpanel: improved GPU device statistics (plus GPU sort order fix) (#2297)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-01 16:15:48 -04:00
Jared Van Bortel
27c561aeb7
mixpanel: fix opt-out events after #2238 (#2296)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-05-01 12:08:40 -04:00
Noofbiz
1b87aa2dbc
fixed bindings to match new API (#2240)
* fixed bindings to match new API

Signed-off-by: Jerry Caligiure <jerry@noof.biz>

* added update to readme

Signed-off-by: Jerry Caligiure <jerry@noof.biz>

---------

Signed-off-by: Jerry Caligiure <jerry@noof.biz>
Co-authored-by: Jerry Caligiure <jerry@noof.biz>
2024-04-29 08:49:26 -04:00
Jared Van Bortel
6f38fde80b
mixpanel: fix doc_collections_total of localdocs_startup (#2270)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-04-26 14:05:47 -04:00
Jared Van Bortel
a14193623a
chat: add release notes for v2.7.4 and bump version (#2269)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-04-26 12:55:54 -04:00
Jared Van Bortel
4f3c9bbe3e
network: fix use of GNU asm statement with MSVC (#2267)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-04-26 11:22:24 -04:00
Jared Van Bortel
c622921894
improve mixpanel usage statistics (#2238)
Other changes:
- Always display first start dialog if privacy options are unset (e.g. if the user closed GPT4All without selecting them)
- LocalDocs scanQueue is now always deferred
- Fix a potential crash in magic_match
- LocalDocs indexing is now started after the first start dialog is dismissed so usage stats are included

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-04-25 13:16:52 -04:00
Jared Van Bortel
4193533154
models.json: add Phi-3 Mini Instruct (#2252)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-04-23 18:53:09 -04:00
Ikko Eltociear Ashimine
baf1dfc5d7
docs: update README.md (#2250)
minor fix

Signed-off-by: Ikko Eltociear Ashimine <eltociear@gmail.com>
2024-04-23 13:26:47 -04:00
Jared Van Bortel
0b78b79b1c
models.json: add Llama 3 Instruct 8B (#2242)
Other changes:
* fix 'requires' for models with %2 in template
* move Ghost 7B to the appropriate location in the file based on where it actually appears in the UI

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-04-19 13:09:44 -04:00
Jared Van Bortel
aac00d019a
chat: temporarily revert some UI changes before next release (#2234)
* chat: revert PR #2187

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* chat: revert PR #2148

This reverts commit f571e7e450bf04888026ef1cffbe688abb51a995.

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

---------

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-04-18 14:52:29 -04:00
Jared Van Bortel
ba53ab5da0
python: do not print GPU name with verbose=False, expose this info via properties (#2222)
* llamamodel: only print device used in verbose mode

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* python: expose backend and device via GPT4All properties

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* backend: const correctness fixes

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* python: bump version

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* python: typing fixups

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* python: fix segfault with closed GPT4All

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

---------

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-04-18 14:52:02 -04:00
Jared Van Bortel
271d752701
localdocs: small but important fixes to local docs (#2236)
* chat: use .rmodel extension for Nomic Embed

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

* database: fix order of SQL arguments in updateDocument (a short sketch follows this entry)

Signed-off-by: Jared Van Bortel <jared@nomic.ai>

---------

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-04-18 14:51:13 -04:00
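The SQL-argument fix is the classic positional-placeholder mismatch; a minimal hedged sketch with QSqlQuery (the table and column names are made up, not the real schema):

    #include <QSqlQuery>
    #include <QString>
    #include <QVariant>

    // With '?' placeholders, addBindValue() calls must match the order in
    // which the placeholders appear in the statement.
    bool updateDocumentSketch(QSqlQuery &q, int docId, qint64 docTime, const QString &text)
    {
        q.prepare(QStringLiteral(
            "UPDATE documents SET document_time = ?, document_text = ? WHERE id = ?"));
        q.addBindValue(docTime); // 1st '?'
        q.addBindValue(text);    // 2nd '?'
        q.addBindValue(docId);   // 3rd '?' -- swapping any of these silently corrupts rows
        return q.exec();
    }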
Jared Van Bortel
be93ee75de
responsetext : fix markdown code block trimming (#2232)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-04-18 14:50:32 -04:00
Andriy Mulyar
4ebb0c6ac0
Remove town hall announcement from readme (#2237)
Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>
2024-04-18 12:54:50 -04:00
Andriy Mulyar
2c4c101b2e
Roadmap update (#2230)
* Roadmap update

Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>

* Spelling error

Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>

* Update README.md

Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>

* Update README.md

Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>

---------

Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>
2024-04-17 12:19:57 -04:00
Jared Van Bortel
38cc778a0c
models.json: use simpler system prompt for Mistral OpenOrca (#2220)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-04-15 18:02:51 -04:00
Adam Treat
94a9943782 Change the behavior of the show references setting for LocalDocs.
Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-04-15 14:30:26 -05:00
Adam Treat
e27653219b Fix bugs with the context link text for LocalDocs so that context links
work persistently across application loads, and fix a scrolling bug with
context links.

Signed-off-by: Adam Treat <treat.adam@gmail.com>
2024-04-15 14:30:26 -05:00
Jared Van Bortel
ac498f79ac
fix regressions in system prompt handling (#2219)
* python: fix system prompt being ignored
* fix unintended whitespace after system prompt

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-04-15 11:39:48 -04:00
dependabot[bot]
2273cf145e build(deps): bump tar in /gpt4all-bindings/typescript
Bumps [tar](https://github.com/isaacs/node-tar) from 6.2.0 to 6.2.1.
- [Release notes](https://github.com/isaacs/node-tar/releases)
- [Changelog](https://github.com/isaacs/node-tar/blob/main/CHANGELOG.md)
- [Commits](https://github.com/isaacs/node-tar/compare/v6.2.0...v6.2.1)

---
updated-dependencies:
- dependency-name: tar
  dependency-type: indirect
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-04-15 08:37:39 -05:00
Jared Van Bortel
3f8257c563
llamamodel: fix semantic typo in nomic client dynamic mode (#2216)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-04-12 17:25:15 -04:00
Jared Van Bortel
46818e466e
python: embedding cancel callback for nomic client dynamic mode (#2214)
Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-04-12 16:00:39 -04:00
Jared Van Bortel
459289b94c
embed4all: small fixes related to nomic client local embeddings (#2213)
* actually submit larger batches with increased n_ctx
* fix crash when llama_tokenize returns no tokens

Signed-off-by: Jared Van Bortel <jared@nomic.ai>
2024-04-12 10:54:15 -04:00
Andriy Mulyar
1e4c62027b
Update README.md (#2209)
Signed-off-by: Andriy Mulyar <andriy.mulyar@gmail.com>
2024-04-10 16:27:46 -04:00
479 changed files with 56780 additions and 25538 deletions

View File

@ -1,13 +1,17 @@
 version: 2.1
 setup: true
 orbs:
-  path-filtering: circleci/path-filtering@0.0.1
+  path-filtering: circleci/path-filtering@1.3.0
 workflows:
   version: 2.1
   generate-config:
     jobs:
       - path-filtering/filter:
+          filters:
+            tags:
+              only:
+                - /.*/
           base-revision: main
           config-path: .circleci/continue_config.yml
           mapping: |
@ -15,6 +19,4 @@ workflows:
             gpt4all-backend/.* run-all-workflows true
             gpt4all-bindings/python/.* run-python-workflow true
             gpt4all-bindings/typescript/.* run-ts-workflow true
-            gpt4all-bindings/csharp/.* run-csharp-workflow true
             gpt4all-chat/.* run-chat-workflow true
-            .* run-default-workflow true

File diff suppressed because it is too large

View File

@ -0,0 +1,17 @@
import re
import sys

ID_REG = r"id: (.*)"


def main() -> None:
    notary_log = sys.argv[1]
    with open(notary_log, "r") as f:
        notary_output = f.read()
    id_m = re.search(ID_REG, notary_output)
    if id_m:
        print(id_m.group(1))
    else:
        raise RuntimeError("Unable to parse ID from notarization logs")


if __name__ == "__main__":
    main()

View File

@ -1,3 +1,3 @@
 [codespell]
-ignore-words-list = blong, afterall, som, assistent, crasher
-skip = .git,*.pdf,*.svg,*.lock
+ignore-words-list = blong, afterall, assistent, crasher, requestor
+skip = ./.git,./gpt4all-chat/translations,*.pdf,*.svg,*.lock

View File

@ -14,6 +14,6 @@ jobs:
     steps:
       - name: Checkout
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - name: Codespell
         uses: codespell-project/actions-codespell@v2

2
.gitignore vendored
View File

@ -181,6 +181,8 @@ CMakeLists.txt.user
 gpt4all-chat/models/*
 build_*
 build-*
+cmake-build-*
+/gpt4all-chat/tests/python/config.py
 
 # IntelliJ
 .idea/

23
.gitmodules vendored
View File

@ -1,4 +1,25 @@
[submodule "llama.cpp-mainline"] [submodule "llama.cpp-mainline"]
path = gpt4all-backend/llama.cpp-mainline path = gpt4all-backend/deps/llama.cpp-mainline
url = https://github.com/nomic-ai/llama.cpp.git url = https://github.com/nomic-ai/llama.cpp.git
branch = master branch = master
[submodule "gpt4all-chat/usearch"]
path = gpt4all-chat/deps/usearch
url = https://github.com/nomic-ai/usearch.git
[submodule "gpt4all-chat/deps/SingleApplication"]
path = gpt4all-chat/deps/SingleApplication
url = https://github.com/nomic-ai/SingleApplication.git
[submodule "gpt4all-chat/deps/fmt"]
path = gpt4all-chat/deps/fmt
url = https://github.com/fmtlib/fmt.git
[submodule "gpt4all-chat/deps/DuckX"]
path = gpt4all-chat/deps/DuckX
url = https://github.com/nomic-ai/DuckX.git
[submodule "gpt4all-chat/deps/QXlsx"]
path = gpt4all-chat/deps/QXlsx
url = https://github.com/nomic-ai/QXlsx.git
[submodule "gpt4all-chat/deps/minja"]
path = gpt4all-chat/deps/minja
url = https://github.com/nomic-ai/minja.git
[submodule "gpt4all-chat/deps/json"]
path = gpt4all-chat/deps/json
url = https://github.com/nlohmann/json.git

View File

@ -1,30 +0,0 @@
Software for Open Models License (SOM)
Version 1.0 dated August 30th, 2023
This license governs use of the accompanying Software. If you use the Software, you accept this license. If you do not accept the license, do not use the Software.
This license is intended to encourage open release of models created, modified, processed, or otherwise used via the Software under open licensing terms, and should be interpreted in light of that intent.
1. Definitions
The “Licensor” is the person or entity who is making the Software available under this license. “Software” is the software made available by Licensor under this license.
A “Model” is the output of a machine learning algorithm, and excludes the Software.
“Model Source Materials” must include the Model and model weights, and may include any input data, input data descriptions, documentation or training descriptions for the Model.
“Open Licensing Terms” means: (a) any open source license approved by the Open Source Initiative, or (b) any other terms that make the Model Source Materials publicly available free of charge, and allow recipients to use, modify and distribute the Model Source Materials. Terms described in (b) may include reasonable restrictions such as non-commercial or non-production limitations, or require use in compliance with law.
2. Grant of Rights. Subject to the conditions and limitations in section 3:
(A) Copyright Grant. Licensor grants you a non-exclusive, worldwide, royalty-free copyright license to copy, modify, and distribute the Software and any modifications of the Software you create under this license. The foregoing license includes without limitation the right to create, modify, and use Models using this Software.
(B) Patent Grant. Licensor grants you a non-exclusive, worldwide, royalty-free license, under any patents owned or controlled by Licensor, to make, have made, use, sell, offer for sale, import, or otherwise exploit the Software. No license is granted to patent rights that are not embodied in the operation of the Software in the form provided by Licensor.
3. Conditions and Limitations
(A) Model Licensing and Access. If you use the Software to create, modify, process, or otherwise use any Model, including usage to create inferences with a Model, whether or not you make the Model available to others, you must make that Model Source Materials publicly available under Open Licensing Terms.
(B) No Re-Licensing. If you redistribute the Software, or modifications to the Software made under the license granted above, you must make it available only under the terms of this license. You may offer additional terms such as warranties, maintenance and support, but You, and not Licensor, are responsible for performing such terms.
(C) No Trademark License. This license does not grant you rights to use the Licensors name, logo, or trademarks.
(D) If you assert in writing a claim against any person or entity alleging that the use of the Software infringes any patent, all of your licenses to the Software under Section 2 end automatically as of the date you asserted the claim.
(E) If you distribute any portion of the Software, you must retain all copyright, patent, trademark, and attribution notices that are present in the Software, and you must include a copy of this license.
(F) The Software is licensed “as-is.” You bear the entire risk of using it. Licensor gives You no express warranties, guarantees or conditions. You may have additional consumer rights under your local laws that this license cannot change. To the extent permitted under your local laws, the Licensor disclaims and excludes the implied warranties of merchantability, fitness for a particular purpose and non-infringement. To the extent this disclaimer is unlawful, you, and not Licensor, are responsible for any liability.

77
MAINTAINERS.md Normal file
View File

@ -0,0 +1,77 @@
# MAINTAINERS
## Rules
* All content inside GPT4All shall have a documented maintainer
* If a maintainer decides to retire or resign a call for volunteers will go
out
* If no further maintainer can be found in a reasonable time frame, then the
content will be marked deprecated and removed in time
## Job
Maintainers will be...
1. Responsible for overseeing content under their stewardship
2. Responsible for triaging new issues, reviewing PRs, assigning priority
to tasks
3. Responsible for keeping content in sufficient quality in a timely fashion
## List
Adam Treat ([@manyoso](https://github.com/manyoso))<br/>
E-mail: adam@nomic.ai<br/>
Discord: `@gonzochess75`
- Overall project maintainer
- Chat UI
Jared Van Bortel ([@cebtenzzre](https://github.com/cebtenzzre))<br/>
E-mail: jared@nomic.ai<br/>
Discord: `@cebtenzzre`
- gpt4all-backend
- Python binding
- Python CLI app
Jacob Nguyen ([@jacoobes](https://github.com/jacoobes))<br/>
Discord: `@jacoobes`<br/>
E-mail: `jacoobes@sern.dev`
- TypeScript binding
Dominik ([@cosmic-snow](https://github.com/cosmic-snow))<br/>
E-mail: cosmic-snow@mailfence.com<br/>
Discord: `@cosmic__snow`
- Community documentation (GitHub Wiki)
Max Cembalest ([@mcembalest](https://github.com/mcembalest))<br/>
E-mail: max@nomic.ai<br/>
Discord: `@maxcembalest.`
- Official documentation (gpt4all-bindings/python/docs -> https://docs.gpt4all.io/)
Thiago Ramos ([@thiagojramos](https://github.com/thiagojramos))<br/>
E-mail: thiagojramos@outlook.com<br/>
- pt\_BR translation
不知火 Shiranui ([@supersonictw](https://github.com/supersonictw))<br/>
E-mail: supersonic@livemail.tw<br/>
Discord: `@supersonictw`
- zh\_TW translation
Jeremy Tayco ([@jstayco](https://github.com/jstayco))<br/>
E-mail: jstayco@protonmail.ch<br/>
Discord: `@vertana`
- es\_MX translation
Riccardo Giovanetti ([@Harvester62](https://github.com/Harvester62))<br/>
E-mail: riccardo.giovanetti@gmail.com<br/>
Discord: `@harvester62`
- it\_IT translation
Tim ([@Tim453](https://github.com/Tim453))<br/>
E-mail: tim453@mailbox.org<br/>
Discord: `@Tim453`
- Flatpak
Jack ([@wuodoo](https://github.com/wuodoo))<br/>
E-mail: 2296103047@qq.com<br/>
Discord: `@mikage`
- zh\_CN translation

150
README.md
View File

@ -1,87 +1,117 @@
<h1 align="center">GPT4All</h1> <h1 align="center">GPT4All</h1>
<p align="center">Open-source large language models that run locally on your CPU and nearly any GPU</p>
<p align="center"> <p align="center">
<a href="https://gpt4all.io">GPT4All Website and Models</a> Now with support for DeepSeek R1 Distillations
</p> </p>
<p align="center"> <p align="center">
<a href="https://docs.gpt4all.io">GPT4All Documentation</a> <a href="https://www.nomic.ai/gpt4all">Website</a> &bull; <a href="https://docs.gpt4all.io">Documentation</a> &bull; <a href="https://discord.gg/mGZE39AS3e">Discord</a> &bull; <a href="https://www.youtube.com/watch?v=gQcZDXRVJok">YouTube Tutorial</a>
</p> </p>
<p align="center"> <p align="center">
<a href="https://discord.gg/mGZE39AS3e">Discord</a> GPT4All runs large language models (LLMs) privately on everyday desktops & laptops.
</p>
<p align="center">
No API calls or GPUs required - you can just download the application and <a href="https://docs.gpt4all.io/gpt4all_desktop/quickstart.html#quickstart">get started</a>.
</p> </p>
<p align="center"> <p align="center">
<a href="https://python.langchain.com/en/latest/modules/models/llms/integrations/gpt4all.html">🦜️🔗 Official Langchain Backend</a> Read about what's new in <a href="https://www.nomic.ai/blog/tag/gpt4all">our blog</a>.
</p> </p>
<p align="center">
<a href="https://nomic.ai/gpt4all/#newsletter-form">Subscribe to the newsletter</a>
</p>
https://github.com/nomic-ai/gpt4all/assets/70534565/513a0f15-4964-4109-89e4-4f9a9011f311
<p align="center"> <p align="center">
GPT4All is made possible by our compute partner <a href="https://www.paperspace.com/">Paperspace</a>. GPT4All is made possible by our compute partner <a href="https://www.paperspace.com/">Paperspace</a>.
</p> </p>
<p align="center"> ## Download Links
<a href="https://www.phorm.ai/query?projectId=755eecd3-24ad-49cc-abf4-0ab84caacf63"><img src="https://img.shields.io/badge/Phorm-Ask_AI-%23F2777A.svg?&logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iNSIgaGVpZ2h0PSI0IiBmaWxsPSJub25lIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPgogIDxwYXRoIGQ9Ik00LjQzIDEuODgyYTEuNDQgMS40NCAwIDAgMS0uMDk4LjQyNmMtLjA1LjEyMy0uMTE1LjIzLS4xOTIuMzIyLS4wNzUuMDktLjE2LjE2NS0uMjU1LjIyNmExLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxMmMtLjA5OS4wMTItLjE5Mi4wMTQtLjI3OS4wMDZsLTEuNTkzLS4xNHYtLjQwNmgxLjY1OGMuMDkuMDAxLjE3LS4xNjkuMjQ2LS4xOTFhLjYwMy42MDMgMCAwIDAgLjItLjEwNi41MjkuNTI5IDAgMCAwIC4xMzgtLjE3LjY1NC42NTQgMCAwIDAgLjA2NS0uMjRsLjAyOC0uMzJhLjkzLjkzIDAgMCAwLS4wMzYtLjI0OS41NjcuNTY3IDAgMCAwLS4xMDMtLjIuNTAyLjUwMiAwIDAgMC0uMTY4LS4xMzguNjA4LjYwOCAwIDAgMC0uMjQtLjA2N0wyLjQzNy43MjkgMS42MjUuNjcxYS4zMjIuMzIyIDAgMCAwLS4yMzIuMDU4LjM3NS4zNzUgMCAwIDAtLjExNi4yMzJsLS4xMTYgMS40NS0uMDU4LjY5Ny0uMDU4Ljc1NEwuNzA1IDRsLS4zNTctLjA3OUwuNjAyLjkwNkMuNjE3LjcyNi42NjMuNTc0LjczOS40NTRhLjk1OC45NTggMCAwIDEgLjI3NC0uMjg1Ljk3MS45NzEgMCAwIDEgLjMzNy0uMTRjLjExOS0uMDI2LjIyNy0uMDM0LjMyNS0uMDI2TDMuMjMyLjE2Yy4xNTkuMDE0LjMzNi4wMy40NTkuMDgyYTEuMTczIDEuMTczIDAgMCAxIC41NDUuNDQ3Yy4wNi4wOTQuMTA5LjE5Mi4xNDQuMjkzYTEuMzkyIDEuMzkyIDAgMCAxIC4wNzguNThsLS4wMjkuMzJaIiBmaWxsPSIjRjI3NzdBIi8+CiAgPHBhdGggZD0iTTQuMDgyIDIuMDA3YTEuNDU1IDEuNDU1IDAgMCAxLS4wOTguNDI3Yy0uMDUuMTI0LS4xMTQuMjMyLS4xOTIuMzI0YTEuMTMgMS4xMyAwIDAgMS0uMjU0LjIyNyAxLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxNGMtLjEuMDEyLS4xOTMuMDE0LS4yOC4wMDZsLTEuNTYtLjEwOC4wMzQtLjQwNi4wMy0uMzQ4IDEuNTU5LjE1NGMuMDkgMCAuMTczLS4wMS4yNDgtLjAzM2EuNjAzLjYwMyAwIDAgMCAuMi0uMTA2LjUzMi41MzIgMCAwIDAgLjEzOS0uMTcyLjY2LjY2IDAgMCAwIC4wNjQtLjI0MWwuMDI5LS4zMjFhLjk0Ljk0IDAgMCAwLS4wMzYtLjI1LjU3LjU3IDAgMCAwLS4xMDMtLjIwMi41MDIuNTAyIDAgMCAwLS4xNjgtLjEzOC42MDUuNjA1IDAgMCAwLS4yNC0uMDY3TDEuMjczLjgyN2MtLjA5NC0uMDA4LS4xNjguMDEtLjIyMS4wNTUtLjA1My4wNDUtLjA4NC4xMTQtLjA5Mi4yMDZMLjcwNSA0IDAgMy45MzhsLjI1NS0yLjkxMUExLjAxIDEuMDEgMCAwIDEgLjM5My41NzIuOTYyLjk2MiAwIDAgMSAuNjY2LjI4NmEuOTcuOTcgMCAwIDEgLjMzOC0uMTRDMS4xMjIuMTIgMS4yMy4xMSAxLjMyOC4xMTlsMS41OTMuMTRjLjE2LjAxNC4zLjA0Ny40MjMuMWExLjE3IDEuMTcgMCAwIDEgLjU0NS40NDhjLjA2MS4wOTUuMTA5LjE5My4xNDQuMjk1YTEuNDA2IDEuNDA2IDAgMCAxIC4wNzcuNTgzbC0uMDI4LjMyMloiIGZpbGw9IndoaXRlIi8+CiAgPHBhdGggZD0iTTQuMDgyIDIuMDA3YTEuNDU1IDEuNDU1IDAgMCAxLS4wOTguNDI3Yy0uMDUuMTI0LS4xMTQuMjMyLS4xOTIuMzI0YTEuMTMgMS4xMyAwIDAgMS0uMjU0LjIyNyAxLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxNGMtLjEuMDEyLS4xOTMuMDE0LS4yOC4wMDZsLTEuNTYtLjEwOC4wMzQtLjQwNi4wMy0uMzQ4IDEuNTU5LjE1NGMuMDkgMCAuMTczLS4wMS4yNDgtLjAzM2EuNjAzLjYwMyAwIDAgMCAuMi0uMTA2LjUzMi41MzIgMCAwIDAgLjEzOS0uMTcyLjY2LjY2IDAgMCAwIC4wNjQtLjI0MWwuMDI5LS4zMjFhLjk0Ljk0IDAgMCAwLS4wMzYtLjI1LjU3LjU3IDAgMCAwLS4xMDMtLjIwMi41MDIuNTAyIDAgMCAwLS4xNjgtLjEzOC42MDUuNjA1IDAgMCAwLS4yNC0uMDY3TDEuMjczLjgyN2MtLjA5NC0uMDA4LS4xNjguMDEtLjIyMS4wNTUtLjA1My4wNDUtLjA4NC4xMTQtLjA5Mi4yMDZMLjcwNSA0IDAgMy45MzhsLjI1NS0yLjkxMUExLjAxIDEuMDEgMCAwIDEgLjM5My41NzIuOTYyLjk2MiAwIDAgMSAuNjY2LjI4NmEuOTcuOTcgMCAwIDEgLjMzOC0uMTRDMS4xMjIuMTIgMS4yMy4xMSAxLjMyOC4xMTlsMS41OTMuMTRjLjE2LjAxNC4zLjA0Ny40MjMuMWExLjE3IDEuMTcgMCAwIDEgLjU0NS40NDhjLjA2MS4wOTUuMTA5LjE5My4xNDQuMjk1YTEuNDA2IDEuNDA2IDAgMCAxIC4wNzcuNTgzbC0uMDI4LjMyMloiIGZpbGw9IndoaXRlIi8+Cjwvc3ZnPgo=" alt="phorm.ai"></a>
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-win64.exe">
<img src="gpt4all-bindings/python/docs/assets/windows.png" style="height: 1em; width: auto" /> Windows Installer
</a> &mdash;
</p>
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-win64-arm.exe">
<img src="gpt4all-bindings/python/docs/assets/windows.png" style="height: 1em; width: auto" /> Windows ARM Installer
</a> &mdash;
</p>
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-darwin.dmg">
<img src="gpt4all-bindings/python/docs/assets/mac.png" style="height: 1em; width: auto" /> macOS Installer
</a> &mdash;
</p>
<p>
&mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-linux.run">
<img src="gpt4all-bindings/python/docs/assets/ubuntu.svg" style="height: 1em; width: auto" /> Ubuntu Installer
</a> &mdash;
</p>
<p>
The Windows and Linux builds require Intel Core i3 2nd Gen / AMD Bulldozer, or better.
</p>
<p>
The Windows ARM build supports Qualcomm Snapdragon and Microsoft SQ1/SQ2 processors.
</p>
<p>
The Linux build is x86-64 only (no ARM).
</p>
<p>
The macOS build requires Monterey 12.6 or newer. Best results with Apple Silicon M-series processors.
</p> </p>
<p align="center"> See the full [System Requirements](gpt4all-chat/system_requirements.md) for more details.
<img width="600" height="365" src="https://user-images.githubusercontent.com/13879686/231876409-e3de1934-93bb-4b4b-9013-b491a969ebbc.gif">
</p> <br/>
<p align="center"> <br/>
Run on an M1 macOS Device (not sped up!) <p>
<a href='https://flathub.org/apps/io.gpt4all.gpt4all'>
<img style="height: 2em; width: auto" alt='Get it on Flathub' src='https://flathub.org/api/badge'><br/>
Flathub (community maintained)
</a>
</p> </p>
## GPT4All: An ecosystem of open-source on-edge large language models. ## Install GPT4All Python
> [!IMPORTANT] `gpt4all` gives you access to LLMs with our Python client around [`llama.cpp`](https://github.com/ggerganov/llama.cpp) implementations.
> GPT4All v2.5.0 and newer only supports models in GGUF format (.gguf). Models used with a previous version of GPT4All (.bin extension) will no longer work.
GPT4All is an ecosystem to run **powerful** and **customized** large language models that work locally on consumer grade CPUs and any GPU. Note that your CPU needs to support [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions). Nomic contributes to open source software like [`llama.cpp`](https://github.com/ggerganov/llama.cpp) to make LLMs accessible and efficient **for all**.
Learn more in the [documentation](https://docs.gpt4all.io). ```bash
pip install gpt4all
```
A GPT4All model is a 3GB - 8GB file that you can download and plug into the GPT4All open-source ecosystem software. **Nomic AI** supports and maintains this software ecosystem to enforce quality and security alongside spearheading the effort to allow any person or enterprise to easily train and deploy their own on-edge large language models. ```python
from gpt4all import GPT4All
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf") # downloads / loads a 4.66GB LLM
with model.chat_session():
print(model.generate("How can I run LLMs efficiently on my laptop?", max_tokens=1024))
```
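Token streaming is also supported by the Python client. A minimal sketch, assuming the same model file as above and the binding's `streaming=True` flag:

```python
from gpt4all import GPT4All

model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf")  # downloads / loads the same 4.66GB LLM
with model.chat_session():
    # streaming=True yields tokens as they are generated instead of returning one final string
    for token in model.generate("Why run an LLM locally?", max_tokens=256, streaming=True):
        print(token, end="", flush=True)
```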
### What's New ([Issue Tracker](https://github.com/orgs/nomic-ai/projects/2))
## Integrations
:parrot::link: [Langchain](https://python.langchain.com/v0.2/docs/integrations/providers/gpt4all/)
:card_file_box: [Weaviate Vector Database](https://github.com/weaviate/weaviate) - [module docs](https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules/text2vec-gpt4all)
:telescope: [OpenLIT (OTel-native Monitoring)](https://github.com/openlit/openlit) - [Docs](https://docs.openlit.io/latest/integrations/gpt4all)
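As a concrete example of the LangChain integration linked above, here is a minimal sketch; the exact import path and parameters depend on your installed LangChain version, and the model path is a placeholder for a GGUF file you have already downloaded:

```python
# Assumes `pip install langchain-community gpt4all`; the model path below is a placeholder.
from langchain_community.llms import GPT4All

llm = GPT4All(model="/path/to/Meta-Llama-3-8B-Instruct.Q4_0.gguf", max_tokens=256)
print(llm.invoke("Name three benefits of running an LLM locally."))
```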
## Release History
- **July 2nd, 2024**: V3.0.0 Release
- Fresh redesign of the chat application UI
- Improved user workflow for LocalDocs
- Expanded access to more model architectures
- **October 19th, 2023**: GGUF Support Launches with Support for: - **October 19th, 2023**: GGUF Support Launches with Support for:
- Mistral 7b base model, an updated model gallery on [gpt4all.io](https://gpt4all.io), several new local code models including Rift Coder v1.5 - Mistral 7b base model, an updated model gallery on our website, several new local code models including Rift Coder v1.5
- [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) support for Q4\_0 and Q4\_1 quantizations in GGUF. - [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) support for Q4\_0 and Q4\_1 quantizations in GGUF.
- Offline build support for running old versions of the GPT4All Local LLM Chat Client. - Offline build support for running old versions of the GPT4All Local LLM Chat Client.
- **September 18th, 2023**: [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) launches supporting local LLM inference on AMD, Intel, Samsung, Qualcomm and NVIDIA GPUs. - **September 18th, 2023**: [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) launches supporting local LLM inference on NVIDIA and AMD GPUs.
- **August 15th, 2023**: GPT4All API launches allowing inference of local LLMs from docker containers. - **July 2023**: Stable support for LocalDocs, a feature that allows you to privately and locally chat with your data.
- **July 2023**: Stable support for LocalDocs, a GPT4All Plugin that allows you to privately and locally chat with your data. - **June 28th, 2023**: [Docker-based API server] launches allowing inference of local LLMs from an OpenAI-compatible HTTP endpoint.
[Docker-based API server]: https://github.com/nomic-ai/gpt4all/tree/cef74c2be20f5b697055d5b8b506861c7b997fab/gpt4all-api
### Chat Client
Run any GPT4All model natively on your home desktop with the auto-updating desktop chat client. See <a href="https://gpt4all.io">GPT4All Website</a> for a full list of open-source models you can run with this powerful desktop application.
Direct Installer Links:
* [macOS](https://gpt4all.io/installers/gpt4all-installer-darwin.dmg)
* [Windows](https://gpt4all.io/installers/gpt4all-installer-win64.exe)
* [Ubuntu](https://gpt4all.io/installers/gpt4all-installer-linux.run)
Find the most up-to-date information on the [GPT4All Website](https://gpt4all.io/)
### Chat Client building and running
* Follow the visual instructions on the chat client [build_and_run](gpt4all-chat/build_and_run.md) page
### Bindings
* <a href="https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python/README.md">:snake: Official Python Bindings</a> [![Downloads](https://static.pepy.tech/badge/gpt4all/week)](https://pepy.tech/project/gpt4all)
* <a href="https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/typescript">:computer: Official Typescript Bindings</a>
* <a href="https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/golang">:computer: Official GoLang Bindings</a>
* <a href="https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/csharp">:computer: Official C# Bindings</a>
* <a href="https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/java">:computer: Official Java Bindings</a>
### Integrations
* 🗃️ [Weaviate Vector Database](https://github.com/weaviate/weaviate) - [module docs](https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules/text2vec-gpt4all)
## Contributing ## Contributing
GPT4All welcomes contributions, involvement, and discussion from the open source community! GPT4All welcomes contributions, involvement, and discussion from the open source community!
@ -91,20 +121,6 @@ Check project discord, with project owners, or through existing issues/PRs to av
Please make sure to tag all of the above with relevant project identifiers or your contribution could potentially get lost. Please make sure to tag all of the above with relevant project identifiers or your contribution could potentially get lost.
Example tags: `backend`, `bindings`, `python-bindings`, `documentation`, etc. Example tags: `backend`, `bindings`, `python-bindings`, `documentation`, etc.
## Technical Reports
<p align="center">
<a href="https://gpt4all.io/reports/GPT4All_Technical_Report_3.pdf">:green_book: Technical Report 3: GPT4All Snoozy and Groovy </a>
</p>
<p align="center">
<a href="https://static.nomic.ai/gpt4all/2023_GPT4All-J_Technical_Report_2.pdf">:green_book: Technical Report 2: GPT4All-J </a>
</p>
<p align="center">
<a href="https://s3.amazonaws.com/static.nomic.ai/gpt4all/2023_GPT4All_Technical_Report.pdf">:green_book: Technical Report 1: GPT4All</a>
</p>
## Citation ## Citation
If you utilize this repository, models or data in a downstream project, please consider citing it with: If you utilize this repository, models or data in a downstream project, please consider citing it with:

41
common/common.cmake Normal file
View File

@ -0,0 +1,41 @@
function(gpt4all_add_warning_options target)
if (MSVC)
return()
endif()
target_compile_options("${target}" PRIVATE
# base options
-Wall
-Wextra
# extra options
-Wcast-align
-Wextra-semi
-Wformat=2
-Wmissing-include-dirs
-Wsuggest-override
-Wvla
# errors
-Werror=format-security
-Werror=init-self
-Werror=pointer-arith
-Werror=undef
# disabled warnings
-Wno-sign-compare
-Wno-unused-parameter
)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
target_compile_options("${target}" PRIVATE
-Wduplicated-branches
-Wduplicated-cond
-Wlogical-op
-Wno-reorder
-Wno-null-dereference
)
elseif (CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$")
target_compile_options("${target}" PRIVATE
-Wunreachable-code-break
-Wunreachable-code-return
-Werror=pointer-integer-compare
-Wno-reorder-ctor
)
endif()
endfunction()

112
gpt4all-api/.gitignore vendored
View File

@ -1,112 +0,0 @@
# Byte-compiled / optimized / DLL files
__pycache__/
app/__pycache__/
gpt4all_api/__pycache__/
gpt4all_api/app/api_v1/__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# VS Code
.vscode/
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
*.lock
*.cache

View File

@ -1,7 +0,0 @@
[settings]
known_third_party=geopy,nltk,np,numpy,pandas,pysbd,fire,torch
line_length=120
include_trailing_comma=True
multi_line_output=3
use_parentheses=True

View File

@ -1,13 +0,0 @@
Copyright 2023 Nomic, Inc.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

View File

@ -1,90 +0,0 @@
# GPT4All REST API
NOTICE: We are considering deprecating this API, as it has become challenging to maintain and test. If you are interested in maintaining it, taking it over, or discussing the future of this API, please speak up in the Discord channel.
This directory contains the source code to build and run Docker images that run a FastAPI app
for serving inference from GPT4All models. The API matches the OpenAI API spec.
## Tutorial
The following tutorial assumes that you have checked out this repo and cd'd into it.
### Starting the app
First change your working directory to `gpt4all/gpt4all-api`.
Now you can build the FastAPI Docker image. You only have to do this on the initial build or when you add new dependencies to the requirements.txt file:
```bash
DOCKER_BUILDKIT=1 docker build -t gpt4all_api --progress plain -f gpt4all_api/Dockerfile.buildkit .
```
Then, start the backend with:
```bash
docker compose up --build
```
This will run both the API and the locally hosted GPU inference server. If you want to run the API without the GPU inference server, you can run:
```bash
docker compose up --build gpt4all_api
```
To run the API with the GPU inference server, you will need to include environment variables (like the `MODEL_ID`). Edit the `.env` file and run
```bash
docker compose --env-file .env up --build
```
#### Spinning up your app
Run `docker compose up` to spin up the backend. Monitor the logs for errors in case you forgot to set an environment variable above.
#### Development
Run
```bash
docker compose up --build
```
and edit files in the `app` directory. The API will hot-reload on changes.
You can run the unit tests with
```bash
make test
```
#### Viewing API documentation
Once the FastAPI app is started, you can access its documentation and test the search endpoint by going to:
```
localhost:80/docs
```
This documentation should match the OpenAI OpenAPI spec located at https://github.com/openai/openai-openapi/blob/master/openapi.yaml
#### Running inference
```python
import openai
openai.api_base = "http://localhost:4891/v1"
openai.api_key = "not needed for a local LLM"
def test_completion():
model = "gpt4all-j-v1.3-groovy"
prompt = "Who is Michael Jordan?"
response = openai.Completion.create(
model=model,
prompt=prompt,
max_tokens=50,
temperature=0.28,
top_p=0.95,
n=1,
echo=True,
stream=False
)
assert len(response['choices'][0]['text']) > len(prompt)
print(response)
```
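The same server also exposes an OpenAI-style chat completions route. A sketch using the pinned `openai==0.28.0` client; the model name must match the model the server was booted with (here the example model from above):

```python
import openai

openai.api_base = "http://localhost:4891/v1"
openai.api_key = "not needed for a local LLM"

def test_chat_completion():
    response = openai.ChatCompletion.create(
        model="gpt4all-j-v1.3-groovy",  # must match the server's configured model
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Who is Michael Jordan?"},
        ],
    )
    # The response mirrors the OpenAI spec: choices[0].message holds the assistant reply.
    assert response.choices[0].message.role == "assistant"
    print(response.choices[0].message.content)
```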

View File

@ -1,24 +0,0 @@
version: "3.8"
services:
gpt4all_gpu:
image: ghcr.io/huggingface/text-generation-inference:0.9.3
container_name: gpt4all_gpu
restart: always #restart on error (usually code compilation from save during bad state)
environment:
- HUGGING_FACE_HUB_TOKEN=token
- USE_FLASH_ATTENTION=false
- MODEL_ID=''
- NUM_SHARD=1
command: --model-id $MODEL_ID --num-shard $NUM_SHARD
volumes:
- ./:/data
ports:
- "8080:80"
shm_size: 1g
deploy:
resources:
reservations:
devices:
- driver: nvidia
capabilities: [gpu]

View File

@ -1,22 +0,0 @@
version: "3.8"
services:
gpt4all_api:
image: gpt4all_api
container_name: gpt4all_api
restart: always #restart on error (usually code compilation from save during bad state)
ports:
- "4891:4891"
env_file:
- .env
environment:
- APP_ENVIRONMENT=dev
- WEB_CONCURRENCY=2
- LOGLEVEL=debug
- PORT=4891
- model=${MODEL_BIN} # using variable from .env file
- inference_mode=cpu
volumes:
- './gpt4all_api/app:/app'
- './gpt4all_api/models:/models' # models are mounted in the container
command: ["/start-reload.sh"]

View File

@ -1,17 +0,0 @@
# syntax=docker/dockerfile:1.0.0-experimental
FROM tiangolo/uvicorn-gunicorn:python3.11
# Put first so anytime this file changes other cached layers are invalidated.
COPY gpt4all_api/requirements.txt /requirements.txt
RUN pip install --upgrade pip
# Run various pip install commands with ssh keys from host machine.
RUN --mount=type=ssh pip install -r /requirements.txt && \
rm -Rf /root/.cache && rm -Rf /tmp/pip-install*
# Finally, copy app and client.
COPY gpt4all_api/app /app
RUN mkdir -p /models

View File

@ -1 +0,0 @@
# FastAPI app for serving GPT4All models

View File

@ -1,9 +0,0 @@
from api_v1.routes import chat, completions, engines, health
from fastapi import APIRouter
router = APIRouter()
router.include_router(chat.router)
router.include_router(completions.router)
router.include_router(engines.router)
router.include_router(health.router)

View File

@ -1,29 +0,0 @@
import logging
from api_v1.settings import settings
from fastapi import HTTPException
from fastapi.responses import JSONResponse
from starlette.requests import Request
log = logging.getLogger(__name__)
startup_msg_fmt = """
Starting up GPT4All API
"""
async def on_http_error(request: Request, exc: HTTPException):
return JSONResponse({'detail': exc.detail}, status_code=exc.status_code)
async def on_startup(app):
startup_msg = startup_msg_fmt.format(settings=settings)
log.info(startup_msg)
def startup_event_handler(app):
async def start_app() -> None:
await on_startup(app)
return start_app

View File

@ -1,103 +0,0 @@
import logging
import time
from typing import List
from uuid import uuid4
from fastapi import APIRouter, HTTPException
from gpt4all import GPT4All
from pydantic import BaseModel, Field
from api_v1.settings import settings
from fastapi.responses import StreamingResponse
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
### This should follow https://github.com/openai/openai-openapi/blob/master/openapi.yaml
class ChatCompletionMessage(BaseModel):
role: str
content: str
class ChatCompletionRequest(BaseModel):
model: str = Field(settings.model, description='The model to generate a completion from.')
messages: List[ChatCompletionMessage] = Field(..., description='Messages for the chat completion.')
temperature: float = Field(settings.temp, description='Model temperature')
class ChatCompletionChoice(BaseModel):
message: ChatCompletionMessage
index: int
logprobs: float
finish_reason: str
class ChatCompletionUsage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class ChatCompletionResponse(BaseModel):
id: str
object: str = 'text_completion'
created: int
model: str
choices: List[ChatCompletionChoice]
usage: ChatCompletionUsage
router = APIRouter(prefix="/chat", tags=["Completions Endpoints"])
@router.post("/completions", response_model=ChatCompletionResponse)
async def chat_completion(request: ChatCompletionRequest):
'''
Completes a GPT4All model response based on the last message in the chat.
'''
# GPU is not implemented yet
if settings.inference_mode == "gpu":
raise HTTPException(status_code=400,
detail=f"Not implemented yet: Can only infer in CPU mode.")
# we only support the configured model
if request.model != settings.model:
raise HTTPException(status_code=400,
detail=f"The GPT4All inference server is booted to only infer: `{settings.model}`")
# run only if we have a message
if request.messages:
model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)
# format system message and conversation history correctly
formatted_messages = ""
for message in request.messages:
formatted_messages += f"<|im_start|>{message.role}\n{message.content}<|im_end|>\n"
# the LLM will complete the response of the assistant
formatted_messages += "<|im_start|>assistant\n"
response = model.generate(
prompt=formatted_messages,
temp=request.temperature
)
# the LLM may continue to hallucinate the conversation, but we want only the first response
# so, cut off everything after first <|im_end|>
index = response.find("<|im_end|>")
response_content = response[:index].strip()
else:
response_content = "No messages received."
# Create a chat message for the response
response_message = ChatCompletionMessage(role="assistant", content=response_content)
# Create a choice object with the response message
response_choice = ChatCompletionChoice(
message=response_message,
index=0,
logprobs=-1.0, # Placeholder value
finish_reason="length" # Placeholder value
)
# Create the response object
chat_response = ChatCompletionResponse(
id=str(uuid4()),
created=int(time.time()),
model=request.model,
choices=[response_choice],
usage=ChatCompletionUsage(prompt_tokens=0, completion_tokens=0, total_tokens=0), # Placeholder values
)
return chat_response

View File

@ -1,215 +0,0 @@
import json
from typing import List, Dict, Iterable, AsyncIterable
import logging
import time
from typing import Dict, List, Union, Optional
from uuid import uuid4
import aiohttp
import asyncio
from api_v1.settings import settings
from fastapi import APIRouter, Depends, Response, Security, status, HTTPException
from fastapi.responses import StreamingResponse
from gpt4all import GPT4All
from pydantic import BaseModel, Field
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
### This should follow https://github.com/openai/openai-openapi/blob/master/openapi.yaml
class CompletionRequest(BaseModel):
model: str = Field(settings.model, description='The model to generate a completion from.')
prompt: Union[List[str], str] = Field(..., description='The prompt to begin completing from.')
max_tokens: int = Field(None, description='Max tokens to generate')
temperature: float = Field(settings.temp, description='Model temperature')
top_p: Optional[float] = Field(settings.top_p, description='top_p')
top_k: Optional[int] = Field(settings.top_k, description='top_k')
n: int = Field(1, description='How many completions to generate for each prompt')
stream: bool = Field(False, description='Stream responses')
repeat_penalty: float = Field(settings.repeat_penalty, description='Repeat penalty')
class CompletionChoice(BaseModel):
text: str
index: int
logprobs: float
finish_reason: str
class CompletionUsage(BaseModel):
prompt_tokens: int
completion_tokens: int
total_tokens: int
class CompletionResponse(BaseModel):
id: str
object: str = 'text_completion'
created: int
model: str
choices: List[CompletionChoice]
usage: CompletionUsage
class CompletionStreamResponse(BaseModel):
id: str
object: str = 'text_completion'
created: int
model: str
choices: List[CompletionChoice]
router = APIRouter(prefix="/completions", tags=["Completion Endpoints"])
def stream_completion(output: Iterable, base_response: CompletionStreamResponse):
"""
Streams a GPT4All output to the client.
Args:
output: The output of GPT4All.generate(), which is an iterable of tokens.
base_response: The base response object, which is cloned and modified for each token.
Returns:
A Generator of CompletionStreamResponse objects, which are serialized to JSON Event Stream format.
"""
for token in output:
chunk = base_response.copy()
chunk.choices = [dict(CompletionChoice(
text=token,
index=0,
logprobs=-1,
finish_reason=''
))]
yield f"data: {json.dumps(dict(chunk))}\n\n"
async def gpu_infer(payload, header):
async with aiohttp.ClientSession() as session:
try:
async with session.post(
settings.hf_inference_server_host, headers=header, data=json.dumps(payload)
) as response:
resp = await response.json()
return resp
except aiohttp.ClientError as e:
# Handle client-side errors (e.g., connection error, invalid URL)
logger.error(f"Client error: {e}")
except aiohttp.ServerError as e:
# Handle server-side errors (e.g., internal server error)
logger.error(f"Server error: {e}")
except json.JSONDecodeError as e:
# Handle JSON decoding errors
logger.error(f"JSON decoding error: {e}")
except Exception as e:
# Handle other unexpected exceptions
logger.error(f"Unexpected error: {e}")
@router.post("/", response_model=CompletionResponse)
async def completions(request: CompletionRequest):
'''
Completes a GPT4All model response.
'''
if settings.inference_mode == "gpu":
params = request.dict(exclude={'model', 'prompt', 'max_tokens', 'n'})
params["max_new_tokens"] = request.max_tokens
params["num_return_sequences"] = request.n
header = {"Content-Type": "application/json"}
if isinstance(request.prompt, list):
tasks = []
for prompt in request.prompt:
payload = {"parameters": params}
payload["inputs"] = prompt
task = gpu_infer(payload, header)
tasks.append(task)
results = await asyncio.gather(*tasks)
choices = []
for response in results:
scores = response["scores"] if "scores" in response else -1.0
choices.append(
dict(
CompletionChoice(
text=response["generated_text"], index=0, logprobs=scores, finish_reason='stop'
)
)
)
return CompletionResponse(
id=str(uuid4()),
created=time.time(),
model=request.model,
choices=choices,
usage={'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0},
)
else:
payload = {"parameters": params}
# If streaming, we need to return a StreamingResponse
payload["inputs"] = request.prompt
resp = await gpu_infer(payload, header)
output = resp["generated_text"]
# this returns all logprobs
scores = resp["scores"] if "scores" in resp else -1.0
return CompletionResponse(
id=str(uuid4()),
created=time.time(),
model=request.model,
choices=[dict(CompletionChoice(text=output, index=0, logprobs=scores, finish_reason='stop'))],
usage={'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0},
)
else:
if request.model != settings.model:
raise HTTPException(status_code=400,
detail=f"The GPT4All inference server is booted to only infer: `{settings.model}`")
if isinstance(request.prompt, list):
if len(request.prompt) > 1:
raise HTTPException(status_code=400, detail="Can only infer one inference per request in CPU mode.")
else:
request.prompt = request.prompt[0]
model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)
output = model.generate(prompt=request.prompt,
max_tokens=request.max_tokens,
streaming=request.stream,
top_k=request.top_k,
top_p=request.top_p,
temp=request.temperature,
)
# If streaming, we need to return a StreamingResponse
if request.stream:
base_chunk = CompletionStreamResponse(
id=str(uuid4()),
created=time.time(),
model=request.model,
choices=[]
)
return StreamingResponse((response for response in stream_completion(output, base_chunk)),
media_type="text/event-stream")
else:
return CompletionResponse(
id=str(uuid4()),
created=time.time(),
model=request.model,
choices=[dict(CompletionChoice(
text=output,
index=0,
logprobs=-1,
finish_reason='stop'
))],
usage={
'prompt_tokens': 0, # TODO how to compute this?
'completion_tokens': 0,
'total_tokens': 0
}
)

View File

@ -1,65 +0,0 @@
from typing import List, Union
from fastapi import APIRouter
from api_v1.settings import settings
from gpt4all import Embed4All
from pydantic import BaseModel, Field
### This should follow https://github.com/openai/openai-openapi/blob/master/openapi.yaml
class EmbeddingRequest(BaseModel):
model: str = Field(
settings.model, description="The model to generate an embedding from."
)
input: Union[str, List[str], List[int], List[List[int]]] = Field(
..., description="Input text to embed, encoded as a string or array of tokens."
)
class EmbeddingUsage(BaseModel):
prompt_tokens: int = 0
total_tokens: int = 0
class Embedding(BaseModel):
index: int = 0
object: str = "embedding"
embedding: List[float]
class EmbeddingResponse(BaseModel):
object: str = "list"
model: str
data: List[Embedding]
usage: EmbeddingUsage
router = APIRouter(prefix="/embeddings", tags=["Embedding Endpoints"])
embedder = Embed4All()
def get_embedding(data: EmbeddingRequest) -> EmbeddingResponse:
"""
Calculates the embedding for the given input using a specified model.
Args:
data (EmbeddingRequest): An EmbeddingRequest object containing the input data
and model name.
Returns:
EmbeddingResponse: An EmbeddingResponse object encapsulating the calculated embedding,
usage info, and the model name.
"""
embedding = embedder.embed(data.input)
return EmbeddingResponse(
data=[Embedding(embedding=embedding)], usage=EmbeddingUsage(), model=data.model
)
@router.post("/", response_model=EmbeddingResponse)
def embeddings(data: EmbeddingRequest):
"""
Creates a GPT4All embedding
"""
return get_embedding(data)

View File

@ -1,39 +0,0 @@
import requests
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel, Field
from typing import List, Dict
# Define the router for the engines module
router = APIRouter(prefix="/engines", tags=["Search Endpoints"])
# Define the models for the engines module
class ListEnginesResponse(BaseModel):
data: List[Dict] = Field(..., description="All available models.")
class EngineResponse(BaseModel):
data: List[Dict] = Field(..., description="All available models.")
# Define the routes for the engines module
@router.get("/", response_model=ListEnginesResponse)
async def list_engines():
try:
response = requests.get('https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-chat/metadata/models2.json')
response.raise_for_status() # This will raise an HTTPError if the HTTP request returned an unsuccessful status code
engines = response.json()
return ListEnginesResponse(data=engines)
except requests.RequestException as e:
logger.error(f"Error fetching engine list: {e}")
raise HTTPException(status_code=500, detail="Error fetching engine list")
# Define the routes for the engines module
@router.get("/{engine_id}", response_model=EngineResponse)
async def retrieve_engine(engine_id: str):
try:
# Implement logic to fetch a specific engine's details
# This is a placeholder, replace with your actual data retrieval logic
engine_details = {"id": engine_id, "name": "Engine Name", "description": "Engine Description"}
return EngineResponse(data=[engine_details])
except Exception as e:
logger.error(f"Error fetching engine details: {e}")
raise HTTPException(status_code=500, detail=f"Error fetching details for engine {engine_id}")

View File

@ -1,13 +0,0 @@
import logging
from fastapi import APIRouter
from fastapi.responses import JSONResponse
log = logging.getLogger(__name__)
router = APIRouter(prefix="/health", tags=["Health"])
@router.get('/', response_class=JSONResponse)
async def health_check():
"""Runs a health check on this instance of the API."""
return JSONResponse({'status': 'ok'}, headers={'Access-Control-Allow-Origin': '*'})

View File

@ -1,19 +0,0 @@
from pydantic import BaseSettings
class Settings(BaseSettings):
app_environment = 'dev'
model: str = 'ggml-mpt-7b-chat.bin'
gpt4all_path: str = '/models'
inference_mode: str = "cpu"
hf_inference_server_host: str = "http://gpt4all_gpu:80/generate"
sentry_dns: str = None
temp: float = 0.18
top_p: float = 1.0
top_k: int = 50
repeat_penalty: float = 1.18
settings = Settings()

View File

@ -1,3 +0,0 @@
desc = 'GPT4All API'
endpoint_paths = {'health': '/health'}

View File

@ -1,84 +0,0 @@
import logging
import os
import docs
from api_v1 import events
from api_v1.api import router as v1_router
from api_v1.settings import settings
from fastapi import FastAPI, HTTPException, Request
from fastapi.logger import logger as fastapi_logger
from starlette.middleware.cors import CORSMiddleware
logger = logging.getLogger(__name__)
app = FastAPI(title='GPT4All API', description=docs.desc)
# CORS Configuration (in case you want to deploy)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["GET", "POST", "OPTIONS"],
allow_headers=["*"],
)
logger.info('Adding v1 endpoints..')
# add v1
app.include_router(v1_router, prefix='/v1')
app.add_event_handler('startup', events.startup_event_handler(app))
app.add_exception_handler(HTTPException, events.on_http_error)
@app.on_event("startup")
async def startup():
global model
if settings.inference_mode == "cpu":
logger.info(f"Downloading/fetching model: {os.path.join(settings.gpt4all_path, settings.model)}")
from gpt4all import GPT4All
model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)
logger.info(f"GPT4All API is ready to infer from {settings.model} on CPU.")
else:
# is it possible to do this once the server is up?
## TODO block until HF inference server is up.
logger.info(f"GPT4All API is ready to infer from {settings.model} on CPU.")
@app.on_event("shutdown")
async def shutdown():
logger.info("Shutting down API")
if settings.sentry_dns is not None:
import sentry_sdk
def traces_sampler(sampling_context):
if 'health' in sampling_context['transaction_context']['name']:
return False
sentry_sdk.init(
dsn=settings.sentry_dns, traces_sample_rate=0.1, traces_sampler=traces_sampler, send_default_pii=False
)
# This is needed to get logs to show up in the app
if "gunicorn" in os.environ.get("SERVER_SOFTWARE", ""):
gunicorn_error_logger = logging.getLogger("gunicorn.error")
gunicorn_logger = logging.getLogger("gunicorn")
root_logger = logging.getLogger()
fastapi_logger.setLevel(gunicorn_logger.level)
fastapi_logger.handlers = gunicorn_error_logger.handlers
root_logger.setLevel(gunicorn_logger.level)
uvicorn_logger = logging.getLogger("uvicorn.access")
uvicorn_logger.handlers = gunicorn_error_logger.handlers
else:
# https://github.com/tiangolo/fastapi/issues/2019
LOG_FORMAT2 = (
"[%(asctime)s %(process)d:%(threadName)s] %(name)s - %(levelname)s - %(message)s | %(filename)s:%(lineno)d"
)
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT2)

View File

@ -1,93 +0,0 @@
"""
Use the OpenAI python API to test gpt4all models.
"""
from typing import List, get_args
import os
from dotenv import load_dotenv
import openai
openai.api_base = "http://localhost:4891/v1"
openai.api_key = "not needed for a local LLM"
# Load the .env file
env_path = 'gpt4all-api/gpt4all_api/.env'
load_dotenv(dotenv_path=env_path)
# Fetch MODEL_ID from .env file
model_id = os.getenv('MODEL_BIN', 'default_model_id')
embedding = os.getenv('EMBEDDING', 'default_embedding_model_id')
print (model_id)
print (embedding)
def test_completion():
model = model_id
prompt = "Who is Michael Jordan?"
response = openai.Completion.create(
model=model, prompt=prompt, max_tokens=50, temperature=0.28, top_p=0.95, n=1, echo=True, stream=False
)
assert len(response['choices'][0]['text']) > len(prompt)
def test_streaming_completion():
model = model_id
prompt = "Who is Michael Jordan?"
tokens = []
for resp in openai.Completion.create(
model=model,
prompt=prompt,
max_tokens=50,
temperature=0.28,
top_p=0.95,
n=1,
echo=True,
stream=True):
tokens.append(resp.choices[0].text)
assert (len(tokens) > 0)
assert (len("".join(tokens)) > len(prompt))
# Modified batched test; there were problems with a KeyError in the response
def test_batched_completion():
model = model_id # replace with your specific model ID
prompt = "Who is Michael Jordan?"
responses = []
# Loop to create completions one at a time
for _ in range(3):
response = openai.Completion.create(
model=model, prompt=prompt, max_tokens=50, temperature=0.28, top_p=0.95, n=1, echo=True, stream=False
)
responses.append(response)
# Assertions to check the responses
for response in responses:
assert len(response['choices'][0]['text']) > len(prompt)
assert len(responses) == 3
def test_embedding():
model = embedding
prompt = "Who is Michael Jordan?"
response = openai.Embedding.create(model=model, input=prompt)
output = response["data"][0]["embedding"]
args = get_args(List[float])
assert response["model"] == model
assert isinstance(output, list)
assert all(isinstance(x, args) for x in output)
def test_chat_completion():
model = model_id
response = openai.ChatCompletion.create(
model=model,
messages=[
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": "Knock knock."},
{"role": "assistant", "content": "Who's there?"},
{"role": "user", "content": "Orange."},
]
)
assert response.choices[0].message.role == "assistant"
assert len(response.choices[0].message.content) > 0

View File

@ -1,3 +0,0 @@
# Add your GGUF-compatible LLM here, e.g. MODEL_BIN="mistral-7b-instruct-v0.1.Q4_0", and rename this file to ".env"
# Make sure this LLM matches the model you placed inside the models folder
MODEL_BIN=""

View File

@ -1 +0,0 @@
### Drop GGUF-compatible models here; make sure the filename matches MODEL_BIN in your .env file

View File

@ -1,13 +0,0 @@
aiohttp>=3.6.2
aiofiles
pydantic>=1.4.0,<2.0.0
requests>=2.24.0
ujson>=2.0.2
fastapi>=0.95.0
Jinja2>=3.0
gpt4all>=1.0.0
pytest
openai==0.28.0
black
isort
python-dotenv

View File

@ -1,46 +0,0 @@
ROOT_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
APP_NAME:=gpt4all_api
PYTHON:=python3.8
SHELL := /bin/bash
all: dependencies
fresh: clean dependencies
testenv: clean_testenv test_build
docker compose -f docker-compose.yaml up --build
testenv_gpu: clean_testenv test_build
docker compose -f docker-compose.yaml -f docker-compose.gpu.yaml up --build
testenv_d: clean_testenv test_build
docker compose env up --build -d
test:
docker compose exec $(APP_NAME) pytest -svv --disable-warnings -p no:cacheprovider /app/tests
test_build:
DOCKER_BUILDKIT=1 docker build -t $(APP_NAME) --progress plain -f $(APP_NAME)/Dockerfile.buildkit .
clean_testenv:
docker compose down -v
fresh_testenv: clean_testenv testenv
venv:
if [ ! -d $(ROOT_DIR)/venv ]; then $(PYTHON) -m venv $(ROOT_DIR)/venv; fi
dependencies: venv
source $(ROOT_DIR)/venv/bin/activate; $(PYTHON) -m pip install -r $(ROOT_DIR)/$(APP_NAME)/requirements.txt
clean: clean_testenv
# Remove existing environment
rm -rf $(ROOT_DIR)/venv;
rm -rf $(ROOT_DIR)/$(APP_NAME)/*.pyc;
black:
source $(ROOT_DIR)/venv/bin/activate; black -l 120 -S --target-version py38 $(APP_NAME)
isort:
source $(ROOT_DIR)/venv/bin/activate; isort --ignore-whitespace --atomic -w 120 $(APP_NAME)

View File

@ -1,9 +1,20 @@
cmake_minimum_required(VERSION 3.16) cmake_minimum_required(VERSION 3.23) # for FILE_SET
include(../common/common.cmake)
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if (APPLE) if (APPLE)
option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON) option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
else()
option(LLMODEL_KOMPUTE "llmodel: use Kompute" ON)
option(LLMODEL_VULKAN "llmodel: use Vulkan" OFF)
option(LLMODEL_CUDA "llmodel: use CUDA" ON)
option(LLMODEL_ROCM "llmodel: use ROCm" OFF)
endif()
if (APPLE)
if (BUILD_UNIVERSAL) if (BUILD_UNIVERSAL)
# Build a Universal binary on macOS # Build a Universal binary on macOS
# This requires that the found Qt library is compiled as Universal binaries. # This requires that the found Qt library is compiled as Universal binaries.
@ -25,7 +36,7 @@ set(LLMODEL_VERSION_PATCH 0)
set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}") set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}")
project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C) project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)
set(CMAKE_CXX_STANDARD 20) set(CMAKE_CXX_STANDARD 23)
set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
set(BUILD_SHARED_LIBS ON) set(BUILD_SHARED_LIBS ON)
@ -39,36 +50,88 @@ else()
message(STATUS "Interprocedural optimization support detected") message(STATUS "Interprocedural optimization support detected")
endif() endif()
set(DIRECTORY deps/llama.cpp-mainline)
include(llama.cpp.cmake) include(llama.cpp.cmake)
set(BUILD_VARIANTS default avxonly) set(BUILD_VARIANTS)
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") if (APPLE)
set(BUILD_VARIANTS ${BUILD_VARIANTS} metal) list(APPEND BUILD_VARIANTS metal)
endif() endif()
if (LLMODEL_KOMPUTE)
list(APPEND BUILD_VARIANTS kompute kompute-avxonly)
else()
list(PREPEND BUILD_VARIANTS cpu cpu-avxonly)
endif()
if (LLMODEL_VULKAN)
list(APPEND BUILD_VARIANTS vulkan vulkan-avxonly)
endif()
if (LLMODEL_CUDA)
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
set(CMAKE_VERBOSE_MAKEFILE ON) # Defaults must be set before enable_language(CUDA).
# Keep this in sync with the arch list in ggml/src/CMakeLists.txt (plus 5.0 for non-F16 branch).
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard
# 60 == f16 CUDA intrinsics
# 61 == integer CUDA intrinsics
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics
else()
set(CMAKE_CUDA_ARCHITECTURES "50;52;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
endif()
endif()
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
include(CheckLanguage)
check_language(CUDA)
if (NOT CMAKE_CUDA_COMPILER)
message(WARNING "CUDA Toolkit not found. To build without CUDA, use -DLLMODEL_CUDA=OFF.")
endif()
enable_language(CUDA)
list(APPEND BUILD_VARIANTS cuda cuda-avxonly)
endif()
if (LLMODEL_ROCM)
enable_language(HIP)
list(APPEND BUILD_VARIANTS rocm rocm-avxonly)
endif()
# Go through each build variant # Go through each build variant
foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS) foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
# Determine flags # Determine flags
if (BUILD_VARIANT STREQUAL avxonly) if (BUILD_VARIANT MATCHES avxonly)
set(GPT4ALL_ALLOW_NON_AVX NO) set(GPT4ALL_ALLOW_NON_AVX OFF)
else() else()
set(GPT4ALL_ALLOW_NON_AVX YES) set(GPT4ALL_ALLOW_NON_AVX ON)
endif() endif()
set(LLAMA_AVX2 ${GPT4ALL_ALLOW_NON_AVX}) set(GGML_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX}) set(GGML_F16C ${GPT4ALL_ALLOW_NON_AVX})
set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX}) set(GGML_FMA ${GPT4ALL_ALLOW_NON_AVX})
if (BUILD_VARIANT STREQUAL metal) set(GGML_METAL OFF)
set(LLAMA_METAL YES) set(GGML_KOMPUTE OFF)
else() set(GGML_VULKAN OFF)
set(LLAMA_METAL NO) set(GGML_CUDA OFF)
set(GGML_ROCM OFF)
if (BUILD_VARIANT MATCHES metal)
set(GGML_METAL ON)
elseif (BUILD_VARIANT MATCHES kompute)
set(GGML_KOMPUTE ON)
elseif (BUILD_VARIANT MATCHES vulkan)
set(GGML_VULKAN ON)
elseif (BUILD_VARIANT MATCHES cuda)
set(GGML_CUDA ON)
elseif (BUILD_VARIANT MATCHES rocm)
set(GGML_HIPBLAS ON)
endif() endif()
# Include GGML # Include GGML
set(LLAMA_K_QUANTS YES) include_ggml(-mainline-${BUILD_VARIANT})
include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON)
if (BUILD_VARIANT MATCHES metal)
set(GGML_METALLIB "${GGML_METALLIB}" PARENT_SCOPE)
endif()
# Function for preparing individual implementations # Function for preparing individual implementations
function(prepare_target TARGET_NAME BASE_LIB) function(prepare_target TARGET_NAME BASE_LIB)
@ -88,24 +151,35 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
# Add each individual implementations # Add each individual implementations
add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED
llamamodel.cpp llmodel_shared.cpp) src/llamamodel.cpp src/llmodel_shared.cpp)
gpt4all_add_warning_options(llamamodel-mainline-${BUILD_VARIANT})
target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999) LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
target_include_directories(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
src include/gpt4all-backend
)
prepare_target(llamamodel-mainline llama-mainline) prepare_target(llamamodel-mainline llama-mainline)
if (NOT LLAMA_METAL) if (NOT PROJECT_IS_TOP_LEVEL AND BUILD_VARIANT STREQUAL cuda)
add_library(gptj-${BUILD_VARIANT} SHARED set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
gptj.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
prepare_target(gptj llama-mainline)
endif() endif()
endforeach() endforeach()
add_library(llmodel add_library(llmodel
llmodel.h llmodel.cpp llmodel_shared.cpp src/dlhandle.cpp
llmodel_c.h llmodel_c.cpp src/llmodel.cpp
dlhandle.h src/llmodel_c.cpp
src/llmodel_shared.cpp
)
gpt4all_add_warning_options(llmodel)
target_sources(llmodel PUBLIC
FILE_SET public_headers TYPE HEADERS BASE_DIRS include
FILES include/gpt4all-backend/llmodel.h
include/gpt4all-backend/llmodel_c.h
include/gpt4all-backend/sysinfo.h
) )
target_compile_definitions(llmodel PRIVATE LIB_FILE_EXT="${CMAKE_SHARED_LIBRARY_SUFFIX}") target_compile_definitions(llmodel PRIVATE LIB_FILE_EXT="${CMAKE_SHARED_LIBRARY_SUFFIX}")
target_include_directories(llmodel PRIVATE src include/gpt4all-backend)
set_target_properties(llmodel PROPERTIES set_target_properties(llmodel PROPERTIES
VERSION ${PROJECT_VERSION} VERSION ${PROJECT_VERSION}

View File

@ -27,7 +27,7 @@ Unfortunately, no for three reasons:
# What is being done to make them more compatible? # What is being done to make them more compatible?
A few things. Number one, we are maintaining compatibility with our current model zoo by way of the submodule pinning. However, we are also exploring how we can update to newer versions of llama.cpp without breaking our current models. This might involve an additional magic header check or it could possibly involve keeping the currently pinned submodule and also adding a new submodule with later changes and differienting them with namespaces or some other manner. Investigations continue. A few things. Number one, we are maintaining compatibility with our current model zoo by way of the submodule pinning. However, we are also exploring how we can update to newer versions of llama.cpp without breaking our current models. This might involve an additional magic header check or it could possibly involve keeping the currently pinned submodule and also adding a new submodule with later changes and differentiating them with namespaces or some other manner. Investigations continue.
# What about GPU inference? # What about GPU inference?

@ -0,0 +1 @@
Subproject commit 11f734c3b0334dbae4823b4a7467764e447fc6d6

View File

@ -1,108 +0,0 @@
#ifndef DLHANDLE_H
#define DLHANDLE_H
#ifndef _WIN32
#include <string>
#include <stdexcept>
#include <utility>
#include <dlfcn.h>
class Dlhandle {
void *chandle;
public:
class Exception : public std::runtime_error {
public:
using std::runtime_error::runtime_error;
};
Dlhandle() : chandle(nullptr) {}
Dlhandle(const std::string& fpath, int flags = RTLD_LAZY | RTLD_LOCAL) {
chandle = dlopen(fpath.c_str(), flags);
if (!chandle) {
throw Exception("dlopen(\""+fpath+"\"): "+dlerror());
}
}
Dlhandle(const Dlhandle& o) = delete;
Dlhandle(Dlhandle&& o) : chandle(o.chandle) {
o.chandle = nullptr;
}
~Dlhandle() {
if (chandle) dlclose(chandle);
}
auto operator =(Dlhandle&& o) {
chandle = std::exchange(o.chandle, nullptr);
}
bool is_valid() const {
return chandle != nullptr;
}
operator bool() const {
return is_valid();
}
template<typename T>
T* get(const std::string& fname) const {
auto fres = reinterpret_cast<T*>(dlsym(chandle, fname.c_str()));
return (dlerror()==NULL)?fres:nullptr;
}
auto get_fnc(const std::string& fname) const {
return get<void*(...)>(fname);
}
};
#else
#include <algorithm>
#include <filesystem>
#include <string>
#include <exception>
#include <stdexcept>
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <libloaderapi.h>
class Dlhandle {
HMODULE chandle;
public:
class Exception : public std::runtime_error {
public:
using std::runtime_error::runtime_error;
};
Dlhandle() : chandle(nullptr) {}
Dlhandle(const std::string& fpath) {
std::string afpath = std::filesystem::absolute(fpath).string();
std::replace(afpath.begin(), afpath.end(), '/', '\\');
chandle = LoadLibraryExA(afpath.c_str(), NULL, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR);
if (!chandle) {
throw Exception("dlopen(\""+fpath+"\"): Error");
}
}
Dlhandle(const Dlhandle& o) = delete;
Dlhandle(Dlhandle&& o) : chandle(o.chandle) {
o.chandle = nullptr;
}
~Dlhandle() {
if (chandle) FreeLibrary(chandle);
}
bool is_valid() const {
return chandle != nullptr;
}
template<typename T>
T* get(const std::string& fname) const {
return reinterpret_cast<T*>(GetProcAddress(chandle, fname.c_str()));
}
auto get_fnc(const std::string& fname) const {
return get<void*(...)>(fname);
}
};
#endif
#endif // DLHANDLE_H

View File

@ -1,837 +0,0 @@
#define GPTJ_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#include "gptj_impl.h"
#include "utils.h"
#include "llmodel_shared.h"
#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <map>
#include <string>
#include <vector>
#include <iostream>
#if defined(_WIN32) && defined(_MSC_VER)
#define WIN32_LEAN_AND_MEAN
#ifndef NOMINMAX
#define NOMINMAX
#endif
#include <windows.h>
#include <io.h>
#include <stdio.h>
#else
#include <unistd.h>
#endif
#include <sstream>
#include <unordered_set>
#include <ggml.h>
namespace {
const char *modelType_ = "GPT-J";
}
// default hparams (GPT-J 6B)
struct gptj_hparams {
int32_t n_vocab = 50400;
int32_t n_ctx = 2048;
int32_t n_embd = 4096;
int32_t n_head = 16;
int32_t n_layer = 28;
int32_t n_rot = 64;
float norm_eps = 1e-5;
};
struct gptj_layer {
// normalization
struct ggml_tensor * ln_1_g;
struct ggml_tensor * ln_1_b;
// attention
struct ggml_tensor * c_attn_q_proj_w;
struct ggml_tensor * c_attn_k_proj_w;
struct ggml_tensor * c_attn_v_proj_w;
struct ggml_tensor * c_attn_proj_w;
// ff
struct ggml_tensor * c_mlp_fc_w;
struct ggml_tensor * c_mlp_fc_b;
struct ggml_tensor * c_mlp_proj_w;
struct ggml_tensor * c_mlp_proj_b;
};
struct gptj_model {
gptj_hparams hparams;
// normalization
struct ggml_tensor * ln_f_g;
struct ggml_tensor * ln_f_b;
struct ggml_tensor * wte; // position embedding
struct ggml_tensor * lmh_g; // language model head
struct ggml_tensor * lmh_b; // language model bias
std::vector<gptj_layer> layers;
// key + value memory
struct llm_kv_cache kv_self;
//
struct ggml_context * ctx;
std::map<std::string, struct ggml_tensor *> tensors;
llm_buffer eval_buf;
llm_buffer scr0_buf;
llm_buffer scr1_buf;
~gptj_model() {
if (ctx) {
ggml_free(ctx);
}
}
};
static bool kv_cache_init(
const struct gptj_hparams & hparams,
struct llm_kv_cache & cache,
ggml_type wtype,
int n_ctx) {
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int64_t n_mem = (int64_t)n_layer*n_ctx;
const int64_t n_elements = n_embd*n_mem;
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2_MiB);
struct ggml_init_params params;
params.mem_size = cache.buf.size;
params.mem_buffer = cache.buf.addr;
params.no_alloc = false;
cache.ctx = ggml_init(params);
if (!cache.ctx) {
fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
return false;
}
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
return true;
}
// load the model's weights from a file path
bool gptj_model_load(const std::string &fname, gptj_model & model, gpt_vocab & vocab, size_t * mem_req = nullptr) {
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
if(mem_req != nullptr) {
*mem_req = 0;
}
// create the ggml context
struct gguf_init_params params = {
/*.no_alloc = */ false,
/*.ctx = */ &model.ctx,
};
gguf_context *ggufctx = gguf_init_from_file(fname.c_str(), params);
if (!ggufctx) {
fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__);
return false;
}
// load hparams
{
auto & hparams = model.hparams;
bool ok = false;
int keyidx;
do {
keyidx = gguf_find_key(ggufctx, "gptj.context_length");
if (keyidx == -1) { break; }
hparams.n_ctx = gguf_get_val_u32(ggufctx, keyidx);
keyidx = gguf_find_key(ggufctx, "gptj.embedding_length");
if (keyidx == -1) { break; }
hparams.n_embd = gguf_get_val_u32(ggufctx, keyidx);
keyidx = gguf_find_key(ggufctx, "gptj.attention.head_count");
if (keyidx == -1) { break; }
hparams.n_head = gguf_get_val_u32(ggufctx, keyidx);
keyidx = gguf_find_key(ggufctx, "gptj.block_count");
if (keyidx == -1) { break; }
hparams.n_layer = gguf_get_val_u32(ggufctx, keyidx);
keyidx = gguf_find_key(ggufctx, "gptj.rope.dimension_count");
if (keyidx == -1) { break; }
hparams.n_rot = gguf_get_val_u32(ggufctx, keyidx);
keyidx = gguf_find_key(ggufctx, "gptj.attention.layer_norm_epsilon");
if (keyidx == -1) { break; }
hparams.norm_eps = gguf_get_val_f32(ggufctx, keyidx);
ok = true;
} while (false);
if (!ok) {
fprintf(stderr, "%s: required hparam missing!\n", __func__);
return false;
}
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
}
// load vocab
{
auto & hparams = model.hparams;
int keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.model");
if (keyidx == -1) {
fprintf(stderr, "%s: tokenizer model not found!\n", __func__);
return false;
}
if (strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
fprintf(stderr, "%s: tokenizer model not supported!\n", __func__);
return false;
}
int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");
if (tokens_keyidx == -1) {
fprintf(stderr, "%s: gpt2 tokenizer vocab not found!\n", __func__);
return false;
}
hparams.n_vocab = gguf_get_arr_n(ggufctx, tokens_keyidx);
printf("%s: gpt2 tokenizer vocab = %d\n", __func__, int(hparams.n_vocab));
for (int i = 0; i < hparams.n_vocab; i++) {
std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
vocab.token_to_id[word] = i;
vocab.id_to_token[i] = word;
}
}
auto & ctx = model.ctx;
size_t ctx_size = ggml_get_mem_size(ctx);
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size / (1024.0 * 1024.0));
if (mem_req != nullptr) {
*mem_req = ctx_size;
gguf_free(ggufctx);
return false;
}
// prepare memory for the weights
{
const auto & hparams = model.hparams;
model.layers.resize(hparams.n_layer);
model.wte = ggml_get_tensor(ctx, "token_embd.weight");
model.ln_f_g = ggml_get_tensor(ctx, "output_norm.weight");
model.ln_f_b = ggml_get_tensor(ctx, "output_norm.bias");
model.lmh_g = ggml_get_tensor(ctx, "output.weight");
model.lmh_b = ggml_get_tensor(ctx, "output.bias");
auto name = [](int i, std::string n) {
static std::string key;
key = "blk." + std::to_string(i) + "." + n;
return key.c_str();
};
for (int i = 0; i < hparams.n_layer; ++i) {
auto & layer = model.layers[i];
layer.ln_1_g = ggml_get_tensor(ctx, name(i, "attn_norm.weight"));
layer.ln_1_b = ggml_get_tensor(ctx, name(i, "attn_norm.bias"));
layer.c_attn_q_proj_w = ggml_get_tensor(ctx, name(i, "attn_q.weight"));
layer.c_attn_k_proj_w = ggml_get_tensor(ctx, name(i, "attn_k.weight"));
layer.c_attn_v_proj_w = ggml_get_tensor(ctx, name(i, "attn_v.weight"));
layer.c_attn_proj_w = ggml_get_tensor(ctx, name(i, "attn_output.weight"));
layer.c_mlp_fc_w = ggml_get_tensor(ctx, name(i, "ffn_up.weight"));
layer.c_mlp_fc_b = ggml_get_tensor(ctx, name(i, "ffn_up.bias"));
layer.c_mlp_proj_w = ggml_get_tensor(ctx, name(i, "ffn_down.weight"));
layer.c_mlp_proj_b = ggml_get_tensor(ctx, name(i, "ffn_down.bias"));
}
}
// key + value memory
{
const auto & hparams = model.hparams;
if (!kv_cache_init(hparams, model.kv_self, GGML_TYPE_F16, model.hparams.n_ctx)) {
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
ggml_free(ctx);
return false;
}
const size_t memory_size = ggml_nbytes(model.kv_self.k) + ggml_nbytes(model.kv_self.v);
printf("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
}
model.scr0_buf.resize(256u * 1024 * 1024);
model.scr1_buf.resize(256u * 1024 * 1024);
return true;
}
// evaluate the transformer
//
// - model: the model
// - n_threads: number of threads to use
// - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted logits for the next token
//
// The GPT-J model requires about 16MB of memory per input token.
//
bool gptj_eval(
gptj_model & model,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
std::vector<float> & embd_w,
size_t & mem_per_token) {
const int N = embd_inp.size();
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_head = hparams.n_head;
const int n_vocab = hparams.n_vocab;
const int n_rot = hparams.n_rot;
const size_t init_buf_size = 1024_MiB;
if (!model.eval_buf.addr || model.eval_buf.size < init_buf_size)
model.eval_buf.resize(init_buf_size);
if (mem_per_token > 0 && mem_per_token*N > model.eval_buf.size) {
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, model.eval_buf.size, buf_size_new);
// reallocate
model.eval_buf.resize(buf_size_new);
if (model.eval_buf.addr == nullptr) {
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, model.eval_buf.size);
return false;
}
}
struct ggml_init_params params = {
.mem_size = model.eval_buf.size,
.mem_buffer = model.eval_buf.addr,
.no_alloc = false
};
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
// KQ_pos - contains the positions
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
int * data = (int *) KQ_pos->data;
for (int i = 0; i < N; ++i) {
data[i] = n_past + i;
}
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
// wte
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur;
ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
// norm
{
cur = ggml_norm(ctx0, inpL, model.hparams.norm_eps);
// cur = ln_1_g*cur + ln_1_b
cur = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
cur),
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
}
struct ggml_tensor * inpSA = cur;
// self-attention
{
struct ggml_tensor * Qcur = ggml_rope(
ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N),
KQ_pos, n_rot, 0, 0
);
struct ggml_tensor * Kcur = ggml_rope(
ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N),
KQ_pos, n_rot, 0, 0
);
// store key and value to memory
{
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur));
struct ggml_tensor * k = ggml_view_1d(ctx0, model.kv_self.k, N*n_embd, (ggml_element_size(model.kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, model.kv_self.v, N, n_embd,
( n_ctx)*ggml_element_size(model.kv_self.v),
(il*n_ctx)*ggml_element_size(model.kv_self.v)*n_embd + n_past*ggml_element_size(model.kv_self.v));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
}
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
struct ggml_tensor * K =
ggml_permute(ctx0,
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.kv_self.k)*n_embd),
n_embd/n_head, n_head, n_past + N),
0, 2, 1, 3);
// K * Q
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
// KQ_scaled = KQ / sqrt(n_embd/n_head)
struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));
// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
// KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
struct ggml_tensor * V =
ggml_view_3d(ctx0, model.kv_self.v,
n_past + N, n_embd/n_head, n_head,
n_ctx*ggml_element_size(model.kv_self.v),
n_ctx*ggml_element_size(model.kv_self.v)*n_embd/n_head,
il*n_ctx*ggml_element_size(model.kv_self.v)*n_embd);
// KQV = transpose(V) * KQ_soft_max
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
// KQV_merged = KQV.permute(0, 2, 1, 3)
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
// cur = KQV_merged.contiguous().view(n_embd, N)
cur = ggml_cpy(ctx0,
KQV_merged,
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
// projection (no bias)
cur = ggml_mul_mat(ctx0,
model.layers[il].c_attn_proj_w,
cur);
}
struct ggml_tensor * inpFF = cur;
ggml_set_scratch(ctx0, {0, model.scr1_buf.size, model.scr1_buf.addr, });
// feed-forward network
// this is independent of the self-attention result, so it could be done in parallel to the self-attention
{
// note here we pass inpSA instead of cur
cur = ggml_mul_mat(ctx0,
model.layers[il].c_mlp_fc_w,
inpSA);
cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
cur);
// GELU activation
cur = ggml_gelu(ctx0, cur);
// projection
// cur = proj_w*cur + proj_b
cur = ggml_mul_mat(ctx0,
model.layers[il].c_mlp_proj_w,
cur);
cur = ggml_add(ctx0,
ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
cur);
}
// self-attention + FF
cur = ggml_add(ctx0, cur, inpFF);
// input for next layer
inpL = ggml_add(ctx0, cur, inpL);
}
ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
// norm
{
inpL = ggml_norm(ctx0, inpL, model.hparams.norm_eps);
// inpL = ln_f_g*inpL + ln_f_b
inpL = ggml_add(ctx0,
ggml_mul(ctx0,
ggml_repeat(ctx0, model.ln_f_g, inpL),
inpL),
ggml_repeat(ctx0, model.ln_f_b, inpL));
}
ggml_set_scratch(ctx0, { 0, 0, nullptr, });
// lm_head
{
inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
inpL = ggml_add(ctx0,
ggml_repeat(ctx0, model.lmh_b, inpL),
inpL);
}
// logits -> probs
//inpL = ggml_soft_max(ctx0, inpL);
ggml_build_forward_expand(gf, inpL);
// run the computation
{
std::unique_ptr<uint8_t []> data;
auto plan = ggml_graph_plan(gf, n_threads);
if (plan.work_size > 0) {
data.reset(new uint8_t[plan.work_size]);
plan.work_data = data.get();
}
ggml_graph_compute(gf, &plan);
}
//if (n_past%100 == 0) {
// ggml_graph_print (gf);
// ggml_graph_dump_dot(gf, NULL, "gpt-2.dot");
//}
//embd_w.resize(n_vocab*N);
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
// return result for just the last token
embd_w.resize(n_vocab);
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
if (mem_per_token == 0) {
mem_per_token = ggml_used_mem(ctx0)/N;
}
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
ggml_free(ctx0);
return true;
}
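// Illustrative sketch (not part of the original source): the usual calling pattern for
// gptj_eval(). A first call with a few dummy tokens primes mem_per_token so later calls
// can size eval_buf correctly; the second call then evaluates the real prompt with
// n_past = 0 because nothing is in the KV cache yet.
[[maybe_unused]] static bool gptj_eval_example(gptj_model & model, int n_threads,
                                               const std::vector<gpt_vocab::id> & prompt_tokens,
                                               std::vector<float> & logits) {
    size_t mem_per_token = 0;
    if (!gptj_eval(model, n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token))
        return false; // warm-up pass failed
    return gptj_eval(model, n_threads, 0, prompt_tokens, logits, mem_per_token);
}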
#define GPTJ_MAX_RNG_STATE (64*1024)
size_t gptj_get_state_size(const gptj_model &model)
{
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
// for reference, std::mt19937(1337) serializes to 6701 bytes.
const size_t s_rng_size = sizeof(size_t);
const size_t s_rng = GPTJ_MAX_RNG_STATE;
const size_t s_kv_size = sizeof(size_t);
const size_t s_kv_ntok = sizeof(int);
const size_t s_kv = model.kv_self.buf.size;
const size_t s_total = (
+ s_rng_size
+ s_rng
+ s_kv_size
+ s_kv_ntok
+ s_kv
);
fflush(stdout);
return s_total;
}
size_t gptj_copy_state_data(const gptj_model &model, const std::mt19937 &rng, uint8_t *dest)
{
uint8_t * out = dest;
fflush(stdout);
// copy rng
{
std::stringstream rng_ss;
rng_ss << rng;
const size_t rng_size = rng_ss.str().size();
char rng_buf[GPTJ_MAX_RNG_STATE];
memset(&rng_buf[0], 0, GPTJ_MAX_RNG_STATE);
memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
memcpy(out, &rng_buf[0], GPTJ_MAX_RNG_STATE); out += GPTJ_MAX_RNG_STATE;
}
// copy kv cache
{
const size_t kv_size = model.kv_self.buf.size;
const int kv_ntok = model.kv_self.n;
memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
if (kv_size) {
memcpy(out, model.kv_self.buf.addr, kv_size); out += kv_size;
}
}
const size_t written = out - dest;
assert(written == gptj_get_state_size(model));
fflush(stdout);
return written;
}
size_t gptj_set_state_data(gptj_model *model, std::mt19937 *rng, const uint8_t *src)
{
const uint8_t * in = src;
// set rng
{
size_t rng_size;
char rng_buf[GPTJ_MAX_RNG_STATE];
memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
memcpy(&rng_buf[0], in, GPTJ_MAX_RNG_STATE); in += GPTJ_MAX_RNG_STATE;
std::stringstream rng_ss;
rng_ss.str(std::string(&rng_buf[0], rng_size));
rng_ss >> *rng;
assert(rng_ss.fail() == false);
}
// set kv cache
{
size_t kv_size;
int kv_ntok;
memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
if (kv_size) {
assert(model->kv_self.buf.size == kv_size);
void * k_data = model->kv_self.k->data; // remember data pointers
void * v_data = model->kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
memcpy(model->kv_self.buf.addr, in, kv_size); in += kv_size;
model->kv_self.k->data = k_data; // restore correct data pointers
model->kv_self.v->data = v_data;
}
model->kv_self.n = kv_ntok;
}
const size_t nread = in - src;
assert(nread == gptj_get_state_size(*model));
fflush(stdout);
return nread;
}
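// Illustrative sketch (not part of the original source): a save/restore round trip of
// the model state using the three helpers above. The buffer is sized with
// gptj_get_state_size(), and the RNG state travels alongside the KV cache.
[[maybe_unused]] static bool gptj_state_roundtrip_example(gptj_model & model, std::mt19937 & rng) {
    std::vector<uint8_t> buf(gptj_get_state_size(model));
    const size_t written = gptj_copy_state_data(model, rng, buf.data());
    const size_t read    = gptj_set_state_data(&model, &rng, buf.data());
    return written == buf.size() && read == written;
}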
struct GPTJPrivate {
const std::string modelPath;
bool modelLoaded;
gpt_vocab vocab;
gptj_model *model = nullptr;
int64_t n_threads = 0;
size_t mem_per_token = 0;
std::mt19937 rng;
};
GPTJ::GPTJ()
: d_ptr(new GPTJPrivate) {
d_ptr->model = new gptj_model;
d_ptr->model->ctx = nullptr;
d_ptr->modelLoaded = false;
}
size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
(void)n_ctx;
(void)ngl;
gptj_model dummy_model;
gpt_vocab dummy_vocab;
size_t mem_req;
gptj_model_load(modelPath, dummy_model, dummy_vocab, &mem_req);
return mem_req;
}
bool GPTJ::loadModel(const std::string &modelPath, int n_ctx, int ngl) {
(void)n_ctx;
(void)ngl;
d_ptr->modelLoaded = false;
std::mt19937 rng(time(NULL));
d_ptr->rng = rng;
// load the model
bool ok = gptj_model_load(modelPath, *d_ptr->model, d_ptr->vocab);
fflush(stdout);
if (!ok) {
std::cerr << "GPT-J ERROR: failed to load model from " << modelPath;
return false;
}
d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
d_ptr->modelLoaded = true;
return true;
}
void GPTJ::setThreadCount(int32_t n_threads) {
d_ptr->n_threads = n_threads;
}
int32_t GPTJ::threadCount() const
{
return d_ptr->n_threads;
}
GPTJ::~GPTJ()
{
delete d_ptr->model;
}
bool GPTJ::isModelLoaded() const
{
return d_ptr->modelLoaded;
}
size_t GPTJ::stateSize() const
{
return gptj_get_state_size(*d_ptr->model);
}
size_t GPTJ::saveState(uint8_t *dest) const
{
return gptj_copy_state_data(*d_ptr->model, d_ptr->rng, dest);
}
size_t GPTJ::restoreState(const uint8_t *src)
{
return gptj_set_state_data(d_ptr->model, &d_ptr->rng, src);
}
std::vector<LLModel::Token> GPTJ::tokenize(PromptContext &ctx, const std::string &str, bool special) const
{
(void)ctx;
(void)special;
return ::gpt_tokenize(d_ptr->vocab, str);
}
LLModel::Token GPTJ::sampleToken(PromptContext &promptCtx) const
{
const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
return gpt_sample_top_k_top_p(d_ptr->model->hparams.n_vocab,
promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
n_prev_toks,
promptCtx.logits,
promptCtx.top_k, promptCtx.top_p, promptCtx.temp,
promptCtx.repeat_penalty,
d_ptr->rng);
}
std::string GPTJ::tokenToString(Token id) const
{
return d_ptr->vocab.id_to_token[id];
}
bool GPTJ::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
{
// determine the required inference memory per token:
static bool initialized = false;
if (!initialized) {
gptj_eval(*d_ptr->model, d_ptr->n_threads, 0, { 0, 1, 2, 3 }, ctx.logits,
d_ptr->mem_per_token);
initialized = true;
}
return gptj_eval(*d_ptr->model, d_ptr->n_threads, ctx.n_past, tokens, ctx.logits, d_ptr->mem_per_token);
}
int32_t GPTJ::contextLength() const
{
return d_ptr->model->hparams.n_ctx;
}
const std::vector<LLModel::Token> &GPTJ::endTokens() const
{
static const std::vector<LLModel::Token> fres = {50256};
return fres;
}
std::string get_arch_name(gguf_context *ctx_gguf) {
    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
    if (kid == -1 || gguf_get_kv_type(ctx_gguf, kid) != GGUF_TYPE_STRING) {
        throw std::runtime_error("ERROR: Can't get general architecture from gguf file.");
    }
    return gguf_get_val_str(ctx_gguf, kid);
}
#if defined(_WIN32)
#define DLL_EXPORT __declspec(dllexport)
#else
#define DLL_EXPORT __attribute__ ((visibility ("default")))
#endif
extern "C" {
DLL_EXPORT bool is_g4a_backend_model_implementation() {
return true;
}
DLL_EXPORT const char *get_model_type() {
return modelType_;
}
DLL_EXPORT const char *get_build_variant() {
return GGML_BUILD_VARIANT;
}
DLL_EXPORT bool magic_match(const char * fname) {
struct ggml_context * ctx_meta = NULL;
struct gguf_init_params params = {
/*.no_alloc = */ true,
/*.ctx = */ &ctx_meta,
};
gguf_context *ctx_gguf = gguf_init_from_file(fname, params);
if (!ctx_gguf)
return false;
bool isValid = gguf_get_version(ctx_gguf) <= 3;
isValid = isValid && get_arch_name(ctx_gguf) == "gptj";
gguf_free(ctx_gguf);
return isValid;
}
DLL_EXPORT LLModel *construct() {
return new GPTJ;
}
}

View File

@@ -1,42 +0,0 @@
#ifndef GPTJ_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#error This file is NOT meant to be included outside of gptj.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define GPTJ_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#endif
#ifndef GPTJ_H
#define GPTJ_H
#include <string>
#include <functional>
#include <vector>
#include "llmodel.h"
struct GPTJPrivate;
class GPTJ : public LLModel {
public:
GPTJ();
~GPTJ();
bool supportsEmbedding() const override { return false; }
bool supportsCompletion() const override { return true; }
bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override;
bool isModelLoaded() const override;
size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
size_t stateSize() const override;
size_t saveState(uint8_t *dest) const override;
size_t restoreState(const uint8_t *src) override;
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
private:
GPTJPrivate *d_ptr;
protected:
std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) const override;
Token sampleToken(PromptContext &ctx) const override;
std::string tokenToString(Token id) const override;
bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
int32_t contextLength() const override;
const std::vector<Token> &endTokens() const override;
bool shouldAddBOS() const override { return false; }
};
#endif // GPTJ_H

View File

@@ -0,0 +1,273 @@
#ifndef LLMODEL_H
#define LLMODEL_H
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <expected>
#include <functional>
#include <optional>
#include <span>
#include <stdexcept>
#include <string>
#include <string_view>
#include <unordered_map>
#include <utility>
#include <vector>
class Dlhandle;
using namespace std::string_literals;
#define LLMODEL_MAX_PROMPT_BATCH 128
class LLModel {
public:
using Token = int32_t;
using PromptCallback = std::function<bool(std::span<const Token> batch, bool cached)>;
using ResponseCallback = std::function<bool(Token token, std::string_view piece)>;
using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend);
using ProgressCallback = std::function<bool(float progress)>;
class BadArchError: public std::runtime_error {
public:
BadArchError(std::string arch)
: runtime_error("Unsupported model architecture: " + arch)
, m_arch(std::move(arch))
{}
const std::string &arch() const noexcept { return m_arch; }
private:
std::string m_arch;
};
class MissingImplementationError: public std::runtime_error {
public:
using std::runtime_error::runtime_error;
};
class UnsupportedModelError: public std::runtime_error {
public:
using std::runtime_error::runtime_error;
};
struct GPUDevice {
const char *backend;
int index;
int type;
size_t heapSize;
std::string name;
std::string vendor;
GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor):
backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
vendor(std::move(vendor)) {}
std::string selectionName() const
{
assert(backend == "cuda"s || backend == "kompute"s);
return backendName() + ": " + name;
}
std::string backendName() const { return backendIdToName(backend); }
static std::string backendIdToName(const std::string &backend) { return s_backendNames.at(backend); }
static std::string updateSelectionName(const std::string &name) {
if (name == "Auto" || name == "CPU" || name == "Metal")
return name;
auto it = std::find_if(s_backendNames.begin(), s_backendNames.end(), [&name](const auto &entry) {
return name.starts_with(entry.second + ": ");
});
if (it != s_backendNames.end())
return name;
return "Vulkan: " + name; // previously, there were only Vulkan devices
}
private:
static inline const std::unordered_map<std::string, std::string> s_backendNames {
{"cpu", "CPU"}, {"metal", "Metal"}, {"cuda", "CUDA"}, {"kompute", "Vulkan"},
};
};
class Implementation {
public:
Implementation(const Implementation &) = delete;
Implementation(Implementation &&);
~Implementation();
std::string_view modelType() const { return m_modelType; }
std::string_view buildVariant() const { return m_buildVariant; }
static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
static int32_t maxContextLength(const std::string &modelPath);
static int32_t layerCount(const std::string &modelPath);
static bool isEmbeddingModel(const std::string &modelPath);
static auto chatTemplate(const char *modelPath) -> std::expected<std::string, std::string>;
static void setImplementationsSearchPath(const std::string &path);
static const std::string &implementationsSearchPath();
static bool hasSupportedCPU();
// 0 for no, 1 for yes, -1 for non-x86_64
static int cpuSupportsAVX2();
private:
Implementation(Dlhandle &&);
static const std::vector<Implementation> &implementationList();
static const Implementation *implementation(const char *fname, const std::string &buildVariant);
static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);
char *(*m_getFileArch)(const char *fname);
bool (*m_isArchSupported)(const char *arch);
LLModel *(*m_construct)();
std::string_view m_modelType;
std::string_view m_buildVariant;
Dlhandle *m_dlhandle;
};
struct PromptContext {
int32_t n_predict = 200;
int32_t top_k = 40;
float top_p = 0.9f;
float min_p = 0.0f;
float temp = 0.9f;
int32_t n_batch = 9;
float repeat_penalty = 1.10f;
int32_t repeat_last_n = 64; // last n tokens to penalize
float contextErase = 0.5f; // percent of context to erase if we exceed the context window
};
explicit LLModel() {}
virtual ~LLModel() {}
virtual bool supportsEmbedding() const = 0;
virtual bool supportsCompletion() const = 0;
virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0;
virtual bool isModelBlacklisted(const std::string &modelPath) const { (void)modelPath; return false; }
virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; }
virtual bool isModelLoaded() const = 0;
virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
virtual size_t stateSize() const = 0;
virtual size_t saveState(std::span<uint8_t> stateOut, std::vector<Token> &inputTokensOut) const = 0;
virtual size_t restoreState(std::span<const uint8_t> state, std::span<const Token> inputTokens) = 0;
// This method requires the model to return true from supportsCompletion otherwise it will throw
// an error
virtual void prompt(std::string_view prompt,
const PromptCallback &promptCallback,
const ResponseCallback &responseCallback,
const PromptContext &ctx);
virtual int32_t countPromptTokens(std::string_view prompt) const;
virtual size_t embeddingSize() const {
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}
// user-specified prefix
virtual void embed(const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix,
int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false,
EmbedCancelCallback *cancelCb = nullptr);
// automatic prefix
virtual void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval,
int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false);
virtual void setThreadCount(int32_t n_threads) { (void)n_threads; }
virtual int32_t threadCount() const { return 1; }
const Implementation &implementation() const {
return *m_implementation;
}
virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const {
(void)memoryRequired;
return {};
}
virtual bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const {
(void)memoryRequired;
(void)name;
return false;
}
virtual bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const {
(void)device;
if (unavail_reason) {
*unavail_reason = "model has no GPU support";
}
return false;
}
virtual bool usingGPUDevice() const { return false; }
virtual const char *backendName() const { return "cpu"; }
virtual const char *gpuDeviceName() const { return nullptr; }
void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; }
virtual int32_t contextLength() const = 0;
virtual auto specialTokens() -> std::unordered_map<std::string, std::string> const = 0;
protected:
// These are pure virtual because subclasses need to implement as the default implementation of
// 'prompt' above calls these functions
virtual std::vector<Token> tokenize(std::string_view str) const = 0;
virtual bool isSpecialToken(Token id) const = 0;
virtual std::string tokenToString(Token id) const = 0;
virtual void initSampler(const PromptContext &ctx) = 0;
virtual Token sampleToken() const = 0;
virtual bool evalTokens(int32_t nPast, std::span<const Token> tokens) const = 0;
virtual void shiftContext(const PromptContext &promptCtx, int32_t *nPast) = 0;
virtual int32_t inputLength() const = 0;
virtual int32_t computeModelInputPosition(std::span<const Token> input) const = 0;
virtual void setModelInputPosition(int32_t pos) = 0;
virtual void appendInputToken(Token tok) = 0;
virtual std::span<const Token> inputTokens() const = 0;
virtual const std::vector<Token> &endTokens() const = 0;
virtual bool shouldAddBOS() const = 0;
virtual int32_t maxContextLength(std::string const &modelPath) const
{
(void)modelPath;
return -1;
}
virtual int32_t layerCount(std::string const &modelPath) const
{
(void)modelPath;
return -1;
}
virtual auto chatTemplate(const char *modelPath) const -> std::expected<std::string, std::string>
{
(void)modelPath;
return std::unexpected("not implemented");
}
const Implementation *m_implementation = nullptr;
ProgressCallback m_progressCallback;
static bool staticProgressCallback(float progress, void* ctx)
{
LLModel* model = static_cast<LLModel*>(ctx);
if (model && model->m_progressCallback)
return model->m_progressCallback(progress);
return true;
}
// prefill context with prompt
auto decodePrompt(const PromptCallback &promptCallback,
const PromptContext &promptCtx,
std::vector<Token> embd_inp)
-> std::optional<int32_t>;
// generate a response
void generateResponse(const ResponseCallback &responseCallback,
const PromptContext &promptCtx,
int32_t nPast);
friend class LLMImplementation;
};
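// Illustrative sketch (not part of the original header): a minimal way a caller might
// drive LLModel::prompt() with the callback types declared above, assuming `model` is a
// loaded instance obtained from LLModel::Implementation::construct().
inline void llmodel_prompt_usage_example(LLModel &model, std::string_view userPrompt)
{
    LLModel::PromptContext ctx; // default sampling settings from the struct above
    auto onPrompt = [](std::span<const LLModel::Token>, bool /*cached*/) { return true; };
    auto onResponse = [](LLModel::Token, std::string_view piece) {
        (void)piece; // stream `piece` to the caller here; returning false stops generation early
        return true;
    };
    model.prompt(userPrompt, onPrompt, onResponse, ctx);
}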
#endif // LLMODEL_H

View File

@@ -1,9 +1,9 @@
 #ifndef LLMODEL_C_H
 #define LLMODEL_C_H
-#include <stdint.h>
-#include <stddef.h>
 #include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
 #ifdef __GNUC__
 #define DEPRECATED __attribute__ ((deprecated))
@@ -23,6 +23,11 @@ extern "C" {
  */
 typedef void *llmodel_model;
+/**
+ * A token.
+ */
+typedef int32_t token_t;
 /**
  * llmodel_prompt_context structure for holding the prompt context.
  * NOTE: The implementation takes care of all the memory handling of the raw logits pointer and the
@@ -30,12 +35,6 @@ typedef void *llmodel_model;
  * behavior.
  */
 struct llmodel_prompt_context {
-    float *logits;          // logits of current context
-    size_t logits_size;     // the size of the raw logits vector
-    int32_t *tokens;        // current tokens in the context window
-    size_t tokens_size;     // the size of the raw tokens vector
-    int32_t n_past;         // number of tokens in past conversation
-    int32_t n_ctx;          // number of tokens possible in context window
     int32_t n_predict;      // number of tokens to predict
     int32_t top_k;          // top k logits to sample from
     float top_p;            // nucleus sampling probability threshold
@@ -48,6 +47,7 @@ struct llmodel_prompt_context {
 };
 struct llmodel_gpu_device {
+    const char * backend;
     int index;
     int type;               // same as VkPhysicalDeviceType
     size_t heapSize;
@@ -62,10 +62,12 @@ typedef struct llmodel_gpu_device llmodel_gpu_device;
 /**
  * Callback type for prompt processing.
- * @param token_id The token id of the prompt.
+ * @param token_ids An array of token ids of the prompt.
+ * @param n_token_ids The number of tokens in the array.
+ * @param cached Whether the tokens were already in cache.
  * @return a bool indicating whether the model should keep processing.
  */
-typedef bool (*llmodel_prompt_callback)(int32_t token_id);
+typedef bool (*llmodel_prompt_callback)(const token_t *token_ids, size_t n_token_ids, bool cached);
 /**
  * Callback type for response.
@@ -73,14 +75,18 @@ typedef bool (*llmodel_prompt_callback)(int32_t token_id);
  * @param response The response string. NOTE: a token_id of -1 indicates the string is an error string.
  * @return a bool indicating whether the model should keep generating.
  */
-typedef bool (*llmodel_response_callback)(int32_t token_id, const char *response);
+typedef bool (*llmodel_response_callback)(token_t token_id, const char *response);
 /**
- * Callback type for recalculation of context.
- * @param whether the model is recalculating the context.
- * @return a bool indicating whether the model should keep generating.
+ * Embedding cancellation callback for use with llmodel_embed.
+ * @param batch_sizes The number of tokens in each batch that will be embedded.
+ * @param n_batch The number of batches that will be embedded.
+ * @param backend The backend that will be used for embedding. One of "cpu", "kompute", "cuda", or "metal".
+ * @return True to cancel llmodel_embed, false to continue.
  */
-typedef bool (*llmodel_recalculate_callback)(bool is_recalculating);
+typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend);
+typedef void (*llmodel_special_token_callback)(const char *name, const char *token);
 /**
  * Create a llmodel instance.
@@ -94,11 +100,11 @@ DEPRECATED llmodel_model llmodel_model_create(const char *model_path);
  * Create a llmodel instance.
  * Recognises correct model type from file at model_path
  * @param model_path A string representing the path to the model file; will only be used to detect model type.
- * @param build_variant A string representing the implementation to use (auto, default, avxonly, ...),
+ * @param backend A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
  * @param error A pointer to a string; will only be set on error.
  * @return A pointer to the llmodel_model instance; NULL on error.
  */
-llmodel_model llmodel_model_create2(const char *model_path, const char *build_variant, const char **error);
+llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error);
 /**
  * Destroy a llmodel instance.
@@ -140,46 +146,57 @@ bool llmodel_isModelLoaded(llmodel_model model);
  * @param model A pointer to the llmodel_model instance.
  * @return the size in bytes of the internal state of the model
  */
-uint64_t llmodel_get_state_size(llmodel_model model);
+uint64_t llmodel_state_get_size(llmodel_model model);
 /**
- * Saves the internal state of the model to the specified destination address.
+ * Saves the internal state of the model.
  * NOTE: This state data is specific to the type of model you have created.
  * @param model A pointer to the llmodel_model instance.
- * @param dest A pointer to the destination.
- * @return the number of bytes copied
+ * @param state Where to store the state. This must be a buffer of at least llmodel_state_get_size() bytes.
+ * @param state_size The size of the destination for the state.
+ * @param input_tokens_out Where to store the address of the token cache state. This is dynamically allocated and must
+ *   be freed with llmodel_state_free_input_tokens.
+ * @param n_input_tokens Where to store the size of the token cache state.
+ * @return The number of bytes copied. On error, zero is returned, the token cache is set to NULL, and the token cache
+ *   size is set to zero.
 */
-uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest);
+uint64_t llmodel_state_get_data(llmodel_model model, uint8_t *state_out, uint64_t state_size,
+                                token_t **input_tokens_out, uint64_t *n_input_tokens);
+/**
+ * Frees the temporary token cache buffer created by a call to llmodel_state_get_data().
+ * @param input_tokens The token cache buffer.
+ */
+void llmodel_state_free_input_tokens(token_t *input_tokens);
 /**
  * Restores the internal state of the model using data from the specified address.
  * NOTE: This state data is specific to the type of model you have created.
  * @param model A pointer to the llmodel_model instance.
- * @param src A pointer to the src.
- * @return the number of bytes read
+ * @param state A pointer to the state data.
+ * @param state_size The size of the state data.
+ * @param input_tokens The token cache associated with the saved state.
+ * @param n_input_tokens The number of tokens in input_tokens.
+ * @return The number of bytes read, or zero on error.
 */
-uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src);
+uint64_t llmodel_state_set_data(llmodel_model model, const uint8_t *state, uint64_t state_size,
                                 const token_t *input_tokens, uint64_t n_input_tokens);
 /**
  * Generate a response using the model.
  * @param model A pointer to the llmodel_model instance.
  * @param prompt A string representing the input prompt.
- * @param prompt_template A string representing the input prompt template.
  * @param prompt_callback A callback function for handling the processing of prompt.
  * @param response_callback A callback function for handling the generated response.
- * @param recalculate_callback A callback function for handling recalculation requests.
- * @param special True if special tokens in the prompt should be processed, false otherwise.
- * @param fake_reply A string to insert into context as the model's reply, or NULL to generate one.
  * @param ctx A pointer to the llmodel_prompt_context structure.
+ * @param error A pointer to a string; will only be set on error.
 */
-void llmodel_prompt(llmodel_model model, const char *prompt,
-                    const char *prompt_template,
+bool llmodel_prompt(llmodel_model model,
+                    const char *prompt,
                     llmodel_prompt_callback prompt_callback,
                     llmodel_response_callback response_callback,
-                    llmodel_recalculate_callback recalculate_callback,
                     llmodel_prompt_context *ctx,
-                    bool special,
-                    const char *fake_reply);
+                    const char **error);
 /**
  * Generate an embedding using the model.
@@ -198,12 +215,14 @@ void llmodel_prompt(llmodel_model model, const char *prompt,
  *        truncate.
  * @param atlas Try to be fully compatible with the Atlas API. Currently, this means texts longer than 8192 tokens with
  *        long_text_mode="mean" will raise an error. Disabled by default.
+ * @param cancel_cb Cancellation callback, or NULL. See the documentation of llmodel_emb_cancel_callback.
  * @param error Return location for a malloc()ed string that will be set on error, or NULL.
  * @return A pointer to an array of floating point values passed to the calling method which then will
  *         be responsible for lifetime of this memory. NULL if an error occurred.
 */
 float *llmodel_embed(llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix,
-                     int dimensionality, size_t *token_count, bool do_mean, bool atlas, const char **error);
+                     int dimensionality, size_t *token_count, bool do_mean, bool atlas,
+                     llmodel_emb_cancel_callback cancel_cb, const char **error);
 /**
  * Frees the memory allocated by the llmodel_embedding function.
@@ -280,9 +299,18 @@ bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gp
 bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device);
 /**
- * @return True if a GPU device is successfully initialized, false otherwise.
+ * @return The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal".
 */
-bool llmodel_has_gpu_device(llmodel_model model);
+const char *llmodel_model_backend_name(llmodel_model model);
+/**
+ * @return The name of the GPU device currently in use, or NULL for backends other than Kompute.
+ */
+const char *llmodel_model_gpu_device_name(llmodel_model model);
+int32_t llmodel_count_prompt_tokens(llmodel_model model, const char *prompt, const char **error);
+void llmodel_model_foreach_special_token(llmodel_model model, llmodel_special_token_callback callback);
 #ifdef __cplusplus
 }
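/* Illustrative sketch (not part of the diff above): how a C++ caller might exercise the
 * new state API declared in this header. `model` is assumed to be a loaded llmodel_model,
 * <vector> is assumed to be included, and error handling is abbreviated. */
static uint64_t llmodel_c_state_roundtrip_example(llmodel_model model) {
    uint64_t size = llmodel_state_get_size(model);
    std::vector<uint8_t> buf(size);
    token_t *tokens = nullptr;
    uint64_t n_tokens = 0;
    uint64_t written = llmodel_state_get_data(model, buf.data(), size, &tokens, &n_tokens);
    if (written == 0)
        return 0; // nothing was saved
    uint64_t read = llmodel_state_set_data(model, buf.data(), written, tokens, n_tokens);
    llmodel_state_free_input_tokens(tokens); // free the token cache copy made by get_data
    return read;
}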

View File

@@ -2,9 +2,9 @@
 #define SYSINFO_H
 #include <fstream>
-#include <string>
-#include <sstream>
 #include <iomanip>
+#include <sstream>
+#include <string>
 #if defined(__linux__)
 # include <unistd.h>
@@ -12,6 +12,10 @@
 # include <sys/types.h>
 # include <sys/sysctl.h>
 #elif defined(_WIN32)
+# define WIN32_LEAN_AND_MEAN
+# ifndef NOMINMAX
+#   define NOMINMAX
+# endif
 # include <windows.h>
 #endif

@@ -1 +0,0 @@
Subproject commit e3c4f65d786d26f1daa7aebfb1b67cd6c31ea082

File diff suppressed because it is too large

View File

@@ -1,247 +0,0 @@
#include "llmodel.h"
#include "dlhandle.h"
#include "sysinfo.h"
#include <cassert>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <memory>
#include <regex>
#include <sstream>
#include <string>
#include <vector>
#ifdef _MSC_VER
#include <intrin.h>
#endif
std::string s_implementations_search_path = ".";
#if !(defined(__x86_64__) || defined(_M_X64))
// irrelevant on non-x86_64
#define cpu_supports_avx() -1
#define cpu_supports_avx2() -1
#elif defined(_MSC_VER)
// MSVC
static int get_cpu_info(int func_id, int reg_id) {
int info[4];
__cpuid(info, func_id);
return info[reg_id];
}
// AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX
#define cpu_supports_avx() (get_cpu_info(1, 2) & (1 << 28))
// AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX
#define cpu_supports_avx2() (get_cpu_info(7, 1) & (1 << 5))
#else
// gcc/clang
#define cpu_supports_avx() __builtin_cpu_supports("avx")
#define cpu_supports_avx2() __builtin_cpu_supports("avx2")
#endif
LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
: m_dlhandle(new Dlhandle(std::move(dlhandle_))) {
auto get_model_type = m_dlhandle->get<const char *()>("get_model_type");
assert(get_model_type);
m_modelType = get_model_type();
auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
assert(get_build_variant);
m_buildVariant = get_build_variant();
m_magicMatch = m_dlhandle->get<bool(const char*)>("magic_match");
assert(m_magicMatch);
m_construct = m_dlhandle->get<LLModel *()>("construct");
assert(m_construct);
}
LLModel::Implementation::Implementation(Implementation &&o)
: m_magicMatch(o.m_magicMatch)
, m_construct(o.m_construct)
, m_modelType(o.m_modelType)
, m_buildVariant(o.m_buildVariant)
, m_dlhandle(o.m_dlhandle) {
o.m_dlhandle = nullptr;
}
LLModel::Implementation::~Implementation() {
delete m_dlhandle;
}
static bool isImplementation(const Dlhandle &dl) {
return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
}
const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList() {
if (cpu_supports_avx() == 0) {
throw std::runtime_error("CPU does not support AVX");
}
// NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the
// individual models without the cleanup of the static list interfering
static auto* libs = new std::vector<Implementation>([] () {
std::vector<Implementation> fres;
std::string impl_name_re = "(gptj|llamamodel-mainline)";
if (cpu_supports_avx2() == 0) {
impl_name_re += "-avxonly";
} else {
impl_name_re += "-(default|metal)";
}
std::regex re(impl_name_re);
auto search_in_directory = [&](const std::string& paths) {
std::stringstream ss(paths);
std::string path;
// Split the paths string by the delimiter and process each path.
while (std::getline(ss, path, ';')) {
std::filesystem::path fs_path(path);
// Iterate over all libraries
for (const auto& f : std::filesystem::directory_iterator(fs_path)) {
const std::filesystem::path& p = f.path();
if (p.extension() != LIB_FILE_EXT) continue;
if (!std::regex_search(p.stem().string(), re)) continue;
// Add to list if model implementation
try {
Dlhandle dl(p.string());
if (!isImplementation(dl))
continue;
fres.emplace_back(Implementation(std::move(dl)));
} catch (...) {}
}
}
};
search_in_directory(s_implementations_search_path);
return fres;
}());
// Return static result
return *libs;
}
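// Illustrative sketch (not part of the original source): which library stems the regex
// built above accepts on an AVX2-capable CPU, where the "-(default|metal)" suffix branch
// is chosen. File extensions are stripped via path::stem() before matching.
[[maybe_unused]] static bool implementation_name_examples() {
    const std::regex re("(gptj|llamamodel-mainline)-(default|metal)");
    return  std::regex_search(std::string("libgptj-default"), re)
        &&  std::regex_search(std::string("libllamamodel-mainline-metal"), re)
        && !std::regex_search(std::string("libllamamodel-mainline-avxonly"), re) // avxonly is only used without AVX2
        && !std::regex_search(std::string("libgptj-cuda"), re);                  // this older loader knows no CUDA variant
}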
const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
bool buildVariantMatched = false;
for (const auto& i : implementationList()) {
if (buildVariant != i.m_buildVariant) continue;
buildVariantMatched = true;
if (!i.m_magicMatch(fname)) continue;
return &i;
}
if (!buildVariantMatched)
throw std::runtime_error("Could not find any implementations for build variant: " + buildVariant);
return nullptr; // unsupported model format
}
LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::string buildVariant, int n_ctx) {
// Get correct implementation
const Implementation* impl = nullptr;
#if defined(__APPLE__) && defined(__arm64__) // FIXME: See if metal works for intel macs
if (buildVariant == "auto") {
size_t total_mem = getSystemTotalRAMInBytes();
impl = implementation(modelPath.c_str(), "metal");
if(impl) {
LLModel* metalimpl = impl->m_construct();
metalimpl->m_implementation = impl;
/* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
* load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
* most (all?) places where this is called, causing underestimation of required
* memory. */
size_t req_mem = metalimpl->requiredMem(modelPath, n_ctx, 100);
float req_to_total = (float) req_mem / (float) total_mem;
// on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
if (req_to_total >= 0.53) {
delete metalimpl;
impl = nullptr;
} else {
return metalimpl;
}
}
}
#else
(void)n_ctx;
#endif
if (!impl) {
//TODO: Auto-detect CUDA/OpenCL
if (buildVariant == "auto") {
if (cpu_supports_avx2() == 0) {
buildVariant = "avxonly";
} else {
buildVariant = "default";
}
}
impl = implementation(modelPath.c_str(), buildVariant);
if (!impl) return nullptr;
}
// Construct and return llmodel implementation
auto fres = impl->m_construct();
fres->m_implementation = impl;
return fres;
}
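// Illustrative sketch (not part of the original source): a typical consumer flow for
// this older loader API. The search path and call parameters are placeholders; ngl is
// the number of GPU layers to offload and is ignored by CPU-only backends.
[[maybe_unused]] static LLModel *llmodel_load_example(const std::string &modelPath) {
    LLModel::Implementation::setImplementationsSearchPath("."); // directory holding the backend libraries
    LLModel *model = LLModel::Implementation::construct(modelPath, "auto", 2048);
    if (model && !model->loadModel(modelPath, /*n_ctx*/ 2048, /*ngl*/ 100)) {
        delete model;
        model = nullptr;
    }
    return model;
}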
LLModel *LLModel::Implementation::constructDefaultLlama() {
static std::unique_ptr<LLModel> llama([]() -> LLModel * {
const std::vector<LLModel::Implementation> *impls;
try {
impls = &implementationList();
} catch (const std::runtime_error &e) {
std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
return nullptr;
}
const LLModel::Implementation *impl = nullptr;
for (const auto &i: *impls) {
if (i.m_buildVariant == "metal" || i.m_modelType != "LLaMA") continue;
impl = &i;
}
if (!impl) {
std::cerr << __func__ << ": could not find llama.cpp implementation\n";
return nullptr;
}
auto fres = impl->m_construct();
fres->m_implementation = impl;
return fres;
}());
return llama.get();
}
std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired) {
auto *llama = constructDefaultLlama();
if (llama) { return llama->availableGPUDevices(memoryRequired); }
return {};
}
int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) {
auto *llama = constructDefaultLlama();
return llama ? llama->maxContextLength(modelPath) : -1;
}
int32_t LLModel::Implementation::layerCount(const std::string &modelPath) {
auto *llama = constructDefaultLlama();
return llama ? llama->layerCount(modelPath) : -1;
}
bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath) {
auto *llama = constructDefaultLlama();
return llama && llama->isEmbeddingModel(modelPath);
}
void LLModel::Implementation::setImplementationsSearchPath(const std::string& path) {
s_implementations_search_path = path;
}
const std::string& LLModel::Implementation::implementationsSearchPath() {
return s_implementations_search_path;
}
bool LLModel::Implementation::hasSupportedCPU() {
return cpu_supports_avx() != 0;
}

View File

@@ -1,200 +0,0 @@
#ifndef LLMODEL_H
#define LLMODEL_H
#include <cstdint>
#include <fstream>
#include <functional>
#include <limits>
#include <optional>
#include <string>
#include <string_view>
#include <vector>
#define LLMODEL_MAX_PROMPT_BATCH 128
class Dlhandle;
class LLModel {
public:
using Token = int32_t;
struct GPUDevice {
int index;
int type;
size_t heapSize;
std::string name;
std::string vendor;
GPUDevice(int index, int type, size_t heapSize, std::string name, std::string vendor):
index(index), type(type), heapSize(heapSize), name(std::move(name)), vendor(std::move(vendor)) {}
};
class Implementation {
public:
Implementation(const Implementation &) = delete;
Implementation(Implementation &&);
~Implementation();
std::string_view modelType() const { return m_modelType; }
std::string_view buildVariant() const { return m_buildVariant; }
static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto", int n_ctx = 2048);
static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
static int32_t maxContextLength(const std::string &modelPath);
static int32_t layerCount(const std::string &modelPath);
static bool isEmbeddingModel(const std::string &modelPath);
static void setImplementationsSearchPath(const std::string &path);
static const std::string &implementationsSearchPath();
static bool hasSupportedCPU();
private:
Implementation(Dlhandle &&);
static const std::vector<Implementation> &implementationList();
static const Implementation *implementation(const char *fname, const std::string &buildVariant);
static LLModel *constructDefaultLlama();
bool (*m_magicMatch)(const char *fname);
LLModel *(*m_construct)();
std::string_view m_modelType;
std::string_view m_buildVariant;
Dlhandle *m_dlhandle;
};
struct PromptContext {
std::vector<float> logits; // logits of current context
std::vector<int32_t> tokens; // current tokens in the context window
int32_t n_past = 0; // number of tokens in past conversation
int32_t n_ctx = 0; // number of tokens possible in context window
int32_t n_predict = 200;
int32_t top_k = 40;
float top_p = 0.9f;
float min_p = 0.0f;
float temp = 0.9f;
int32_t n_batch = 9;
float repeat_penalty = 1.10f;
int32_t repeat_last_n = 64; // last n tokens to penalize
float contextErase = 0.75f; // percent of context to erase if we exceed the context window
int32_t n_last_batch_tokens = 0;
};
using ProgressCallback = std::function<bool(float progress)>;
explicit LLModel() {}
virtual ~LLModel() {}
virtual bool supportsEmbedding() const = 0;
virtual bool supportsCompletion() const = 0;
virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0;
virtual bool isModelBlacklisted(const std::string &modelPath) const { (void)modelPath; return false; };
virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; }
virtual bool isModelLoaded() const = 0;
virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
virtual size_t stateSize() const { return 0; }
virtual size_t saveState(uint8_t *dest) const { (void)dest; return 0; }
virtual size_t restoreState(const uint8_t *src) { (void)src; return 0; }
// This method requires the model to return true from supportsCompletion otherwise it will throw
// an error
virtual void prompt(const std::string &prompt,
const std::string &promptTemplate,
std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback,
PromptContext &ctx,
bool special = false,
std::string *fakeReply = nullptr);
virtual size_t embeddingSize() const {
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}
// user-specified prefix
virtual void embed(const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix,
int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false);
// automatic prefix
virtual void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval,
int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false);
virtual void setThreadCount(int32_t n_threads) { (void)n_threads; }
virtual int32_t threadCount() const { return 1; }
const Implementation &implementation() const {
return *m_implementation;
}
virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const {
(void)memoryRequired;
return {};
}
virtual bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const {
(void)memoryRequired;
(void)name;
return false;
}
virtual bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const {
(void)device;
if (unavail_reason) {
*unavail_reason = "model has no GPU support";
}
return false;
}
virtual bool hasGPUDevice() { return false; }
virtual bool usingGPUDevice() { return false; }
void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; }
protected:
// These are pure virtual because subclasses need to implement as the default implementation of
// 'prompt' above calls these functions
virtual std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special = false) const = 0;
virtual std::string tokenToString(Token id) const = 0;
virtual Token sampleToken(PromptContext &ctx) const = 0;
virtual bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const = 0;
virtual int32_t contextLength() const = 0;
virtual const std::vector<Token> &endTokens() const = 0;
virtual bool shouldAddBOS() const = 0;
virtual int32_t maxContextLength(std::string const &modelPath) const
{
(void)modelPath;
return -1;
}
virtual int32_t layerCount(std::string const &modelPath) const
{
(void)modelPath;
return -1;
}
// This is a helper function called from the default implementation of 'prompt' but it can be
// shared by all base classes so it isn't virtual
void recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate);
const Implementation *m_implementation = nullptr;
ProgressCallback m_progressCallback;
static bool staticProgressCallback(float progress, void* ctx)
{
LLModel* model = static_cast<LLModel*>(ctx);
if (model && model->m_progressCallback)
return model->m_progressCallback(progress);
return true;
}
void decodePrompt(std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback,
PromptContext &promptCtx,
std::vector<Token> embd_inp);
void generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback,
PromptContext &promptCtx);
private:
friend class LLMImplementation;
};
#endif // LLMODEL_H

View File

@@ -1,297 +0,0 @@
#include "llmodel.h"
#include <cassert>
#include <iostream>
#include <regex>
#include <string>
#include <unordered_set>
// TODO(cebtenzzre): replace this with llama_kv_cache_seq_shift for llamamodel (GPT-J needs this as-is)
void LLModel::recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate) {
int n_keep = shouldAddBOS();
const int32_t n_discard = (promptCtx.n_ctx - n_keep) * promptCtx.contextErase;
// Erase the first percentage of context from the tokens
std::cerr << implementation().modelType() << ": reached the end of the context window so resizing\n";
promptCtx.tokens.erase(promptCtx.tokens.begin() + n_keep, promptCtx.tokens.begin() + n_keep + n_discard);
size_t i = n_keep;
promptCtx.n_past = n_keep;
while (i < promptCtx.tokens.size()) {
size_t batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size());
std::vector<int32_t> batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end);
assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx);
if (!evalTokens(promptCtx, batch)) {
std::cerr << "LLModel ERROR: Failed to process prompt\n";
goto stop_generating;
}
promptCtx.n_past += batch.size();
if (!recalculate(true))
goto stop_generating;
i = batch_end;
}
assert(promptCtx.n_past == int32_t(promptCtx.tokens.size()));
stop_generating:
recalculate(false);
}
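// Illustrative note (not part of the original source): with the defaults used by this
// old API (n_ctx = 2048, contextErase = 0.75) and a model that adds a BOS token
// (n_keep = 1), n_discard = (2048 - 1) * 0.75 = 1535, so roughly the oldest three
// quarters of the window are dropped and the remaining ~512 tokens are re-evaluated in
// n_batch-sized chunks by the loop above.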
static bool parsePromptTemplate(const std::string &tmpl, std::vector<std::smatch> &placeholders, std::string &err) {
static const std::regex placeholderRegex(R"(%[1-2](?![0-9]))");
auto it = std::sregex_iterator(tmpl.begin(), tmpl.end(), placeholderRegex);
placeholders.clear();
placeholders.insert(placeholders.end(), it, std::sregex_iterator());
if (placeholders.size() > 2) {
err = "ERROR: expected at most two placeholders, got " + std::to_string(placeholders.size());
return false;
}
if (placeholders.size() >= 1 && placeholders[0].str() != "%1") {
err = "ERROR: first placeholder must be %1, got " + placeholders[0].str();
return false;
}
if (placeholders.size() >= 2 && placeholders[1].str() != "%2") {
err = "ERROR: second placeholder must be %2, got " + placeholders[1].str();
return false;
}
return true;
}
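To make the placeholder rules above concrete, here is an illustrative sketch of inputs parsePromptTemplate accepts and rejects (it is a file-local helper, so this is a sketch rather than public API usage):
static void templateExamples()
{
    std::vector<std::smatch> ph;
    std::string err;
    parsePromptTemplate("### Human:\n%1\n### Assistant:\n%2", ph, err); // ok: %1 then %2
    parsePromptTemplate("%2 first, then %1", ph, err);                  // rejected: first placeholder must be %1
    parsePromptTemplate("%1 and %2 and %1 again", ph, err);             // rejected: more than two placeholders
    // "%10" is not treated as a placeholder thanks to the (?![0-9]) lookahead.
}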
void LLModel::prompt(const std::string &prompt,
const std::string &promptTemplate,
std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback,
PromptContext &promptCtx,
bool special,
std::string *fakeReply)
{
if (!isModelLoaded()) {
std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
return;
}
if (!supportsCompletion()) {
std::string errorMessage = "ERROR: this model does not support text completion or chat!";
responseCallback(-1, errorMessage);
std::cerr << implementation().modelType() << " " << errorMessage << "\n";
return;
}
// parse the prompt template
std::vector<std::smatch> placeholders;
{
std::string err;
if (!parsePromptTemplate(promptTemplate, placeholders, err)) {
responseCallback(-1, err);
std::cerr << err << "\n";
return;
}
}
auto old_n_past = promptCtx.n_past; // prepare to fake n_past for tokenize
// tokenize the user prompt
std::vector<Token> embd_inp;
if (placeholders.empty()) {
// this is unusual, but well-defined
std::cerr << __func__ << ": prompt template has no placeholder\n";
embd_inp = tokenize(promptCtx, promptTemplate, true);
} else {
// template: beginning of user prompt
const auto &phUser = placeholders[0];
std::string userPrefix(phUser.prefix());
if (!userPrefix.empty()) {
embd_inp = tokenize(promptCtx, userPrefix, true);
promptCtx.n_past += embd_inp.size();
}
// user input (shouldn't have special token processing)
auto tokens = tokenize(promptCtx, prompt, special);
embd_inp.insert(embd_inp.end(), tokens.begin(), tokens.end());
promptCtx.n_past += tokens.size();
// template: end of user prompt + start of assistant prompt
size_t start = phUser.position() + phUser.length();
size_t end = placeholders.size() >= 2 ? placeholders[1].position() : promptTemplate.length();
auto userToAsst = promptTemplate.substr(start, end - start);
if (!userToAsst.empty()) {
tokens = tokenize(promptCtx, userToAsst, true);
embd_inp.insert(embd_inp.end(), tokens.begin(), tokens.end());
promptCtx.n_past += tokens.size();
}
}
promptCtx.n_past = old_n_past; // restore n_past so decodePrompt can increment it
// decode the user prompt
decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp);
// decode the assistant's reply, either generated or spoofed
if (fakeReply == nullptr) {
generateResponse(responseCallback, recalculateCallback, promptCtx);
} else {
embd_inp = tokenize(promptCtx, *fakeReply, false);
decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp);
}
// decode the rest of the prompt template
// template: end of assistant prompt
std::string asstSuffix;
if (placeholders.size() >= 2) {
size_t start = placeholders[1].position() + placeholders[1].length();
asstSuffix = promptTemplate.substr(start);
} else {
asstSuffix = "\n\n"; // default to a blank line, good for e.g. Alpaca
}
if (!asstSuffix.empty()) {
embd_inp = tokenize(promptCtx, asstSuffix, true);
decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp);
}
}
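A sketch of how a caller drives this prompt() entry point; `model` stands for any loaded LLModel, the strings are illustrative, and PromptContext is assumed to default-construct with usable sampling settings:
#include <iostream>

static void askOldApi(LLModel &model)
{
    LLModel::PromptContext promptCtx;
    model.prompt(
        "Why is the sky blue?",                  // user input, substituted for %1
        "### Human:\n%1\n### Assistant:\n%2",    // prompt template
        [](int32_t /*tokenId*/) { return true; },               // promptCallback
        [](int32_t, const std::string &piece) {                 // responseCallback
            std::cout << piece << std::flush;
            return true;                                        // false stops generation
        },
        [](bool /*isRecalculating*/) { return true; },          // recalculateCallback
        promptCtx,
        /*special*/ false,
        /*fakeReply*/ nullptr);
}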
void LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback,
PromptContext &promptCtx,
std::vector<Token> embd_inp) {
// save the context size
promptCtx.n_ctx = contextLength();
if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
" tokens and the context window is " << promptCtx.n_ctx << "!\n";
return;
}
promptCtx.n_predict = std::min(promptCtx.n_predict, promptCtx.n_ctx - (int) embd_inp.size());
promptCtx.n_past = std::min(promptCtx.n_past, promptCtx.n_ctx);
promptCtx.n_batch = std::min(promptCtx.n_batch, LLMODEL_MAX_PROMPT_BATCH);
// process the prompt in batches
size_t i = 0;
while (i < embd_inp.size()) {
size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size());
std::vector<Token> batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);
// Check if the context has run out...
if (promptCtx.n_past + int32_t(batch.size()) > promptCtx.n_ctx) {
recalculateContext(promptCtx, recalculateCallback);
assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx);
}
if (!evalTokens(promptCtx, batch)) {
std::cerr << implementation().modelType() << " ERROR: Failed to process prompt\n";
return;
}
size_t tokens = batch_end - i;
for (size_t t = 0; t < tokens; ++t) {
if (int32_t(promptCtx.tokens.size()) == promptCtx.n_ctx)
promptCtx.tokens.erase(promptCtx.tokens.begin());
promptCtx.tokens.push_back(batch.at(t));
promptCtx.n_past += 1;
if (!promptCallback(batch.at(t)))
return;
}
i = batch_end;
}
}
void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
std::function<bool(bool)> recalculateCallback,
PromptContext &promptCtx) {
std::string cachedResponse;
std::vector<Token> cachedTokens;
std::unordered_set<std::string> reversePrompts
= { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context" };
// predict next tokens
for (int i = 0; i < promptCtx.n_predict; i++) {
// sample next token
auto id = sampleToken(promptCtx);
// Check if the context has run out...
if (promptCtx.n_past + 1 > promptCtx.n_ctx) {
recalculateContext(promptCtx, recalculateCallback);
assert(promptCtx.n_past + 1 <= promptCtx.n_ctx);
}
if (!evalTokens(promptCtx, { id })) {
std::cerr << implementation().modelType() << " ERROR: Failed to predict next token\n";
return;
}
// display text
for (const auto token : endTokens()) {
if (id == token) return;
}
const std::string str = tokenToString(id);
// Check if the provided str is part of our reverse prompts
bool foundPartialReversePrompt = false;
const std::string completed = cachedResponse + std::string(str);
if (reversePrompts.find(completed) != reversePrompts.end())
return;
// Check if it partially matches our reverse prompts and if so, cache
for (const auto& s : reversePrompts) {
if (s.compare(0, completed.size(), completed) == 0) {
foundPartialReversePrompt = true;
cachedResponse = completed;
break;
}
}
// Regardless the token gets added to our cache
cachedTokens.push_back(id);
// Continue if we have found a partial match
if (foundPartialReversePrompt)
continue;
// Empty the cache
for (auto t : cachedTokens) {
if (int32_t(promptCtx.tokens.size()) == promptCtx.n_ctx)
promptCtx.tokens.erase(promptCtx.tokens.begin());
promptCtx.tokens.push_back(t);
promptCtx.n_past += 1;
//TODO: Conversion to std::string can be avoided here...
if (!responseCallback(t, std::string(tokenToString(t))))
return;
}
cachedTokens.clear();
}
}
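The reverse-prompt handling above withholds output while the accumulated text could still grow into one of the stop strings. A standalone sketch of that prefix test:
#include <string>
#include <unordered_set>

// True if `completed` is a prefix of any reverse prompt, i.e. the response must
// be buffered until the match either completes or falls apart.
static bool isPartialReversePrompt(const std::unordered_set<std::string> &reversePrompts,
                                   const std::string &completed)
{
    for (const auto &s : reversePrompts)
        if (s.compare(0, completed.size(), completed) == 0)
            return true;
    return false;
}
// isPartialReversePrompt({"### Human"}, "### Hu") == true  -> keep buffering
// isPartialReversePrompt({"### Human"}, "Hello")  == false -> flush the cache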
void LLModel::embed(
const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
size_t *tokenCount, bool doMean, bool atlas
) {
(void)texts;
(void)embeddings;
(void)prefix;
(void)dimensionality;
(void)tokenCount;
(void)doMean;
(void)atlas;
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}
void LLModel::embed(
const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
bool doMean, bool atlas
) {
(void)texts;
(void)embeddings;
(void)isRetrieval;
(void)dimensionality;
(void)tokenCount;
(void)doMean;
(void)atlas;
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}

View File

@ -1,46 +0,0 @@
#pragma once
#include <cstdint>
#include <cstddef>
#include <vector>
#include <ggml.h>
struct llm_buffer {
uint8_t * addr = NULL;
size_t size = 0;
void resize(size_t size) {
delete[] addr;
addr = new uint8_t[size];
this->size = size;
}
~llm_buffer() {
delete[] addr;
}
};
struct llm_kv_cache {
struct ggml_tensor * k;
struct ggml_tensor * v;
struct ggml_context * ctx = NULL;
llm_buffer buf;
int n; // number of tokens currently in the cache
~llm_kv_cache() {
if (ctx) {
ggml_free(ctx);
}
}
};
inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.addr;
}
ggml_graph_compute(graph, &plan);
}
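A brief sketch of using the helper above; building the graph and its ggml_context is out of scope here, so `gf` is only a stand-in:
static void runGraph(struct ggml_cgraph *gf /* built elsewhere with ggml_* ops */)
{
    llm_buffer work_buf;  // scratch buffer; the helper resizes it to the planned work size
    if (gf)
        ggml_graph_compute_g4a(work_buf, gf, /*n_threads=*/4);
}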

View File

@ -1,140 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations
import json
import struct
import sys
from pathlib import Path
import gguf
import numpy as np
from transformers import AutoConfig, AutoModel, AutoTokenizer
if not 2 <= len(sys.argv) < 4:
print("Usage: {} dir-model [ftype]\n".format(Path(__file__).name))
print(" ftype == 0 -> float32")
print(" ftype == 1 -> float16")
sys.exit(1)
# output in the same directory as the model
dir_model = Path(sys.argv[1])
with open(dir_model / "vocab.txt", encoding="utf-8") as f:
vocab = f.readlines()
# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1
if len(sys.argv) > 2:
ftype = int(sys.argv[2])
if ftype < 0 or ftype > 1:
print("Invalid ftype: " + str(ftype))
sys.exit(1)
fname_out = dir_model / ("ggml-model-" + ftype_str[ftype] + ".gguf")
ARCH = gguf.MODEL_ARCH.BERT
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
config = AutoConfig.from_pretrained(dir_model)
block_count = config.num_hidden_layers
gguf_writer.add_name("BERT")
gguf_writer.add_context_length(config.max_position_embeddings)
gguf_writer.add_embedding_length(config.hidden_size)
gguf_writer.add_feed_forward_length(config.intermediate_size)
gguf_writer.add_block_count(block_count)
gguf_writer.add_head_count(config.num_attention_heads)
gguf_writer.add_file_type(ftype)
print("gguf: get tokenizer metadata")
try:
with open(dir_model / "tokenizer.json", encoding="utf-8") as f:
tokenizer_json = json.load(f)
except FileNotFoundError as e:
print(f'Error: Missing {e.filename!r}', file=sys.stderr)
sys.exit(1)
print("gguf: get wordpiece tokenizer vocab")
tokenizer = AutoTokenizer.from_pretrained(dir_model)
print(tokenizer.encode('I believe the meaning of life is'))
tokens: list[bytearray] = []
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
# The number of tokens in tokenizer.json can differ from the expected vocab size.
# This causes downstream issues with mismatched tensor sizes when running the inference
for i in range(config.vocab_size):
try:
text = reverse_vocab[i]
except KeyError:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)
tokens.append(text)
gguf_writer.add_tokenizer_model("bert") # wordpiece
gguf_writer.add_token_list(tokens)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)
print("gguf: get tensor metadata")
model = AutoModel.from_pretrained(dir_model, config=config, low_cpu_mem_usage=True)
print(model)
tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
list_vars = model.state_dict()
for name in list_vars.keys():
print(name, list_vars[name].shape, list_vars[name].dtype)
for name in list_vars.keys():
data = list_vars[name].squeeze().numpy()
if name in ['embeddings.position_ids', 'pooler.dense.weight', 'pooler.dense.bias']:
continue
print("Processing variable:", name, "with shape:", data.shape)
n_dims = len(data.shape)
# ftype == 0 -> float32, ftype == 1 -> float16
if ftype == 1 and name[-7:] == ".weight" and n_dims == 2:
print(" Converting to float16")
data = data.astype(np.float16)
l_type = 1
else:
l_type = 0
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print()

View File

@ -1,165 +0,0 @@
#!/usr/bin/env python3
# Convert GPT-J-6B h5 transformer model to ggml format
#
# Load the model using GPTJForCausalLM.
# Iterate over all variables and write them to a binary file.
#
# For each variable, write the following:
# - Number of dimensions (int)
# - Name length (int)
# - Dimensions (int[n_dims])
# - Name (char[name_length])
# - Data (float[n_dims])
#
# By default, the bigger matrices are converted to 16-bit floats.
# This can be disabled by adding the "ftype" CLI argument.
#
# At the start of the ggml file we write the model parameters
# and vocabulary.
#
from __future__ import annotations
import sys
import struct
import json
from pathlib import Path
import gguf
import numpy as np
from transformers import AutoConfig, AutoTokenizer, GPTJForCausalLM
from transformers.models.gpt2 import tokenization_gpt2
if not 2 <= len(sys.argv) < 4:
print("Usage: python {} dir-model [ftype]\n".format(Path(__file__).name))
print(" ftype == 0 -> float32")
print(" ftype == 1 -> float16")
sys.exit(1)
# output in the same directory as the model
dir_model = Path(sys.argv[1])
fname_out = dir_model / "ggml-model.gguf"
# possible data types
# ftype == 0 -> float32
# ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1
if len(sys.argv) > 2:
ftype = int(sys.argv[2])
if ftype < 0 or ftype > 1:
print("Invalid ftype: " + str(ftype))
sys.exit(1)
fname_out = dir_model / ("ggml-model-" + ftype_str[ftype] + ".gguf")
ARCH = gguf.MODEL_ARCH.GPTJ
gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
print("gguf: get model metadata")
config = AutoConfig.from_pretrained(dir_model)
block_count = config.n_layer
gguf_writer.add_name("GPT-J")
gguf_writer.add_context_length(config.n_positions)
gguf_writer.add_embedding_length(config.n_embd)
gguf_writer.add_block_count(block_count)
gguf_writer.add_feed_forward_length(4 * config.n_embd)
gguf_writer.add_head_count(config.n_head)
gguf_writer.add_rope_dimension_count(config.rotary_dim)
gguf_writer.add_layer_norm_eps(config.layer_norm_epsilon)
gguf_writer.add_file_type(ftype)
print("gguf: get gpt2 tokenizer vocab")
tokenizer = AutoTokenizer.from_pretrained(dir_model)
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = tokenization_gpt2.bytes_to_unicode()
byte_decoder = {v: k for k, v in byte_encoder.items()}
tokens: list[bytearray] = []
for i in range(config.vocab_size):
if i in reverse_vocab:
try:
text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
except KeyError:
text = bytearray()
for c in reverse_vocab[i]:
if ord(c) < 256: # single byte character
text.append(byte_decoder[c])
else: # multibyte special token character
text.extend(c.encode('utf-8'))
else:
print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
pad_token = f"[PAD{i}]".encode("utf8")
text = bytearray(pad_token)
tokens.append(text)
gguf_writer.add_tokenizer_model("gpt2")
gguf_writer.add_token_list(tokens)
special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
special_vocab.add_to_gguf(gguf_writer)
print("gguf: get tensor metadata")
model = GPTJForCausalLM.from_pretrained(dir_model, config=config, low_cpu_mem_usage=True)
#print (model)
tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
list_vars = model.state_dict()
#print (list_vars)
for name in list_vars.keys():
data = list_vars[name].squeeze().numpy()
print("Processing variable:", name, "with shape:", data.shape)
# we don't need these
if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
print(" Skipping variable:", name)
continue
n_dims = len(data.shape)
# ftype == 0 -> float32, ftype == 1 -> float16
ftype_cur = 0
if ftype == 1 and name[-7:] == ".weight" and n_dims == 2:
print(" Converting to float16")
data = data.astype(np.float16)
ftype_cur = 1
elif ftype == 1 or data.dtype != np.float32:
print(" Converting to float32")
data = data.astype(np.float32)
ftype_cur = 0
# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{fname_out}'")
print()

View File

@ -0,0 +1,73 @@
#include "dlhandle.h"
#include <string>
#ifndef _WIN32
# include <dlfcn.h>
#else
# include <cassert>
# include <sstream>
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
#endif
using namespace std::string_literals;
namespace fs = std::filesystem;
#ifndef _WIN32
Dlhandle::Dlhandle(const fs::path &fpath)
{
chandle = dlopen(fpath.c_str(), RTLD_LAZY | RTLD_LOCAL);
if (!chandle) {
throw Exception("dlopen: "s + dlerror());
}
}
Dlhandle::~Dlhandle()
{
if (chandle) dlclose(chandle);
}
void *Dlhandle::get_internal(const char *symbol) const
{
return dlsym(chandle, symbol);
}
#else // defined(_WIN32)
Dlhandle::Dlhandle(const fs::path &fpath)
{
fs::path afpath = fs::absolute(fpath);
// Suppress the "Entry Point Not Found" dialog, caused by outdated nvcuda.dll from the GPU driver
UINT lastErrorMode = GetErrorMode();
SetErrorMode(lastErrorMode | SEM_FAILCRITICALERRORS);
chandle = LoadLibraryExW(afpath.c_str(), NULL, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR);
SetErrorMode(lastErrorMode);
if (!chandle) {
DWORD err = GetLastError();
std::ostringstream ss;
ss << "LoadLibraryExW failed with error 0x" << std::hex << err;
throw Exception(ss.str());
}
}
Dlhandle::~Dlhandle()
{
if (chandle) FreeLibrary(HMODULE(chandle));
}
void *Dlhandle::get_internal(const char *symbol) const
{
return GetProcAddress(HMODULE(chandle), symbol);
}
#endif // defined(_WIN32)

View File

@ -0,0 +1,47 @@
#pragma once
#include <filesystem>
#include <stdexcept>
#include <string>
#include <utility>
namespace fs = std::filesystem;
class Dlhandle {
void *chandle = nullptr;
public:
class Exception : public std::runtime_error {
public:
using std::runtime_error::runtime_error;
};
Dlhandle() = default;
Dlhandle(const fs::path &fpath);
Dlhandle(const Dlhandle &o) = delete;
Dlhandle(Dlhandle &&o)
: chandle(o.chandle)
{
o.chandle = nullptr;
}
~Dlhandle();
Dlhandle &operator=(Dlhandle &&o) {
chandle = std::exchange(o.chandle, nullptr);
return *this;
}
template <typename T>
T *get(const std::string &symbol) const {
return reinterpret_cast<T *>(get_internal(symbol.c_str()));
}
auto get_fnc(const std::string &symbol) const {
return get<void*(...)>(symbol);
}
private:
void *get_internal(const char *symbol) const;
};
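A short usage sketch for the wrapper above; the library filename is illustrative, while get_model_type matches the symbol the backend loader looks up elsewhere:
#include <iostream>

static void probeBackend()
{
    try {
        Dlhandle dl("libllamamodel-mainline-cpu.so");  // illustrative filename/extension
        if (auto *get_model_type = dl.get<const char *()>("get_model_type"))
            std::cerr << "model type: " << get_model_type() << "\n";
    } catch (const Dlhandle::Exception &e) {
        std::cerr << "failed to load backend: " << e.what() << "\n";
    }
}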

View File

@ -4,12 +4,15 @@
 #ifndef LLAMAMODEL_H
 #define LLAMAMODEL_H
-#include <functional>
-#include <memory>
-#include <string>
-#include <vector>
 #include "llmodel.h"
+#include <memory>
+#include <span>
+#include <string>
+#include <string_view>
+#include <vector>
+#include <unordered_map>
 struct LLamaPrivate;
 struct EmbModelSpec;
@ -26,42 +29,56 @@ public:
 bool isModelLoaded() const override;
 size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
 size_t stateSize() const override;
-size_t saveState(uint8_t *dest) const override;
-size_t restoreState(const uint8_t *src) override;
+size_t saveState(std::span<uint8_t> stateOut, std::vector<Token> &inputTokensOut) const override;
+size_t restoreState(std::span<const uint8_t> state, std::span<const Token> inputTokens) override;
 void setThreadCount(int32_t n_threads) override;
 int32_t threadCount() const override;
-std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const override;
+std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
 bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override;
 bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override;
-bool hasGPUDevice() override;
-bool usingGPUDevice() override;
+bool usingGPUDevice() const override;
+const char *backendName() const override;
+const char *gpuDeviceName() const override;
 size_t embeddingSize() const override;
 // user-specified prefix
 void embed(const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix,
-int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
+int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false,
+EmbedCancelCallback *cancelCb = nullptr) override;
 // automatic prefix
 void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality = -1,
 size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;
+int32_t contextLength() const override;
+auto specialTokens() -> std::unordered_map<std::string, std::string> const override;
+protected:
+std::vector<Token> tokenize(std::string_view str) const override;
+bool isSpecialToken(Token id) const override;
+std::string tokenToString(Token id) const override;
+void initSampler(const PromptContext &ctx) override;
+Token sampleToken() const override;
+bool evalTokens(int32_t nPast, std::span<const Token> tokens) const override;
+void shiftContext(const PromptContext &promptCtx, int32_t *nPast) override;
+int32_t inputLength() const override;
+int32_t computeModelInputPosition(std::span<const Token> input) const override;
+void setModelInputPosition(int32_t pos) override;
+void appendInputToken(Token tok) override;
+std::span<const Token> inputTokens() const override;
+const std::vector<Token> &endTokens() const override;
+bool shouldAddBOS() const override;
+int32_t maxContextLength(std::string const &modelPath) const override;
+int32_t layerCount(std::string const &modelPath) const override;
+auto chatTemplate(const char *modelPath) const -> std::expected<std::string, std::string> override;
+void embedInternal(const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
+size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb,
+const EmbModelSpec *spec);
 private:
 std::unique_ptr<LLamaPrivate> d_ptr;
 bool m_supportsEmbedding = false;
 bool m_supportsCompletion = false;
-protected:
-std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) const override;
-std::string tokenToString(Token id) const override;
-Token sampleToken(PromptContext &ctx) const override;
-bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
-int32_t contextLength() const override;
-const std::vector<Token> &endTokens() const override;
-bool shouldAddBOS() const override;
-int32_t maxContextLength(std::string const &modelPath) const override;
-int32_t layerCount(std::string const &modelPath) const override;
-void embedInternal(const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
-size_t *tokenCount, bool doMean, bool atlas, const EmbModelSpec *spec);
 };
 #endif // LLAMAMODEL_H

View File

@ -0,0 +1,358 @@
#include "llmodel.h"
#include "dlhandle.h"
#include <cassert>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <iterator>
#include <memory>
#include <optional>
#include <regex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>
#ifdef _WIN32
# define WIN32_LEAN_AND_MEAN
# ifndef NOMINMAX
# define NOMINMAX
# endif
# include <windows.h>
#endif
#ifdef _MSC_VER
# include <intrin.h>
#endif
#if defined(__APPLE__) && defined(__aarch64__)
# include "sysinfo.h" // for getSystemTotalRAMInBytes
#endif
namespace fs = std::filesystem;
#ifndef __APPLE__
static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"};
#elif defined(__aarch64__)
static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"};
#else
static const std::string DEFAULT_BACKENDS[] = {"cpu"};
#endif
std::string s_implementations_search_path = ".";
#if !(defined(__x86_64__) || defined(_M_X64))
// irrelevant on non-x86_64
#define cpu_supports_avx() -1
#define cpu_supports_avx2() -1
#elif defined(_MSC_VER)
// MSVC
static int get_cpu_info(int func_id, int reg_id) {
int info[4];
__cpuid(info, func_id);
return info[reg_id];
}
// AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX
#define cpu_supports_avx() !!(get_cpu_info(1, 2) & (1 << 28))
// AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX
#define cpu_supports_avx2() !!(get_cpu_info(7, 1) & (1 << 5))
#else
// gcc/clang
#define cpu_supports_avx() !!__builtin_cpu_supports("avx")
#define cpu_supports_avx2() !!__builtin_cpu_supports("avx2")
#endif
LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
: m_dlhandle(new Dlhandle(std::move(dlhandle_))) {
auto get_model_type = m_dlhandle->get<const char *()>("get_model_type");
assert(get_model_type);
m_modelType = get_model_type();
auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
assert(get_build_variant);
m_buildVariant = get_build_variant();
m_getFileArch = m_dlhandle->get<char *(const char *)>("get_file_arch");
assert(m_getFileArch);
m_isArchSupported = m_dlhandle->get<bool(const char *)>("is_arch_supported");
assert(m_isArchSupported);
m_construct = m_dlhandle->get<LLModel *()>("construct");
assert(m_construct);
}
LLModel::Implementation::Implementation(Implementation &&o)
: m_getFileArch(o.m_getFileArch)
, m_isArchSupported(o.m_isArchSupported)
, m_construct(o.m_construct)
, m_modelType(o.m_modelType)
, m_buildVariant(o.m_buildVariant)
, m_dlhandle(o.m_dlhandle) {
o.m_dlhandle = nullptr;
}
LLModel::Implementation::~Implementation()
{
delete m_dlhandle;
}
static bool isImplementation(const Dlhandle &dl)
{
return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
}
// Add the CUDA Toolkit to the DLL search path on Windows.
// This is necessary for chat.exe to find CUDA when started from Qt Creator.
static void addCudaSearchPath()
{
#ifdef _WIN32
if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) {
auto libDir = std::wstring(cudaPath) + L"\\bin";
if (!AddDllDirectory(libDir.c_str())) {
auto err = GetLastError();
std::wcerr << L"AddDllDirectory(\"" << libDir << L"\") failed with error 0x" << std::hex << err << L"\n";
}
}
#endif
}
const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList()
{
if (cpu_supports_avx() == 0) {
throw std::runtime_error("CPU does not support AVX");
}
// NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the
// individual models without the cleanup of the static list interfering
static auto* libs = new std::vector<Implementation>([] () {
std::vector<Implementation> fres;
addCudaSearchPath();
std::string impl_name_re = "llamamodel-mainline-(cpu|metal|kompute|vulkan|cuda)";
if (cpu_supports_avx2() == 0) {
impl_name_re += "-avxonly";
}
std::regex re(impl_name_re);
auto search_in_directory = [&](const std::string& paths) {
std::stringstream ss(paths);
std::string path;
// Split the paths string by the delimiter and process each path.
while (std::getline(ss, path, ';')) {
fs::directory_iterator iter;
try {
iter = fs::directory_iterator(std::u8string(path.begin(), path.end()));
} catch (const fs::filesystem_error &) {
continue; // skip nonexistent path
}
// Iterate over all libraries
for (const auto &f : iter) {
const fs::path &p = f.path();
if (p.extension() != LIB_FILE_EXT) continue;
if (!std::regex_search(p.stem().string(), re)) continue;
// Add to list if model implementation
Dlhandle dl;
try {
dl = Dlhandle(p);
} catch (const Dlhandle::Exception &e) {
std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n";
continue;
}
if (!isImplementation(dl)) {
std::cerr << "Not an implementation: " << p.filename().string() << "\n";
continue;
}
fres.emplace_back(Implementation(std::move(dl)));
}
}
};
search_in_directory(s_implementations_search_path);
return fres;
}());
// Return static result
return *libs;
}
static std::string applyCPUVariant(const std::string &buildVariant)
{
if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
return buildVariant + "-avxonly";
}
return buildVariant;
}
const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant)
{
bool buildVariantMatched = false;
std::optional<std::string> archName;
for (const auto& i : implementationList()) {
if (buildVariant != i.m_buildVariant) continue;
buildVariantMatched = true;
char *arch = i.m_getFileArch(fname);
if (!arch) continue;
archName = arch;
bool archSupported = i.m_isArchSupported(arch);
free(arch);
if (archSupported) return &i;
}
if (!buildVariantMatched)
return nullptr;
if (!archName)
throw UnsupportedModelError("Unsupported file format");
throw BadArchError(std::move(*archName));
}
LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx)
{
std::vector<std::string> desiredBackends;
if (backend != "auto") {
desiredBackends.push_back(backend);
} else {
desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
}
for (const auto &desiredBackend: desiredBackends) {
const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));
if (impl) {
// Construct llmodel implementation
auto *fres = impl->m_construct();
fres->m_implementation = impl;
#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs
/* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
* load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
* most (all?) places where this is called, causing underestimation of required
* memory. */
if (backend == "auto" && desiredBackend == "metal") {
// on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100);
if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) {
delete fres;
continue;
}
}
#else
(void)n_ctx;
#endif
return fres;
}
}
throw MissingImplementationError("Could not find any implementations for backend: " + backend);
}
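A sketch of going through the registry above from application code; the model path is illustrative and any failure surfaces as an exception:
#include <iostream>

static LLModel *tryConstruct()
{
    try {
        // "auto" walks DEFAULT_BACKENDS; "cpu", "kompute", "cuda" or "metal" force a backend.
        return LLModel::Implementation::construct("/path/to/model.gguf", "auto", /*n_ctx=*/2048);
    } catch (const std::exception &e) {
        std::cerr << "backend selection failed: " << e.what() << "\n";
        return nullptr;
    }
}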
LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::string> &backend)
{
static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;
const std::vector<Implementation> *impls;
try {
impls = &implementationList();
} catch (const std::runtime_error &e) {
std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
return nullptr;
}
std::vector<std::string> desiredBackends;
if (backend) {
desiredBackends.push_back(backend.value());
} else {
desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
}
const Implementation *impl = nullptr;
for (const auto &desiredBackend: desiredBackends) {
auto cacheIt = implCache.find(desiredBackend);
if (cacheIt != implCache.end())
return cacheIt->second.get(); // cached
for (const auto &i: *impls) {
if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
impl = &i;
break;
}
}
if (impl) {
auto *fres = impl->m_construct();
fres->m_implementation = impl;
implCache[desiredBackend] = std::unique_ptr<LLModel>(fres);
return fres;
}
}
std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") << "\n";
return nullptr;
}
std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired)
{
std::vector<LLModel::GPUDevice> devices;
#ifndef __APPLE__
static const std::string backends[] = {"kompute", "cuda"};
for (const auto &backend: backends) {
auto *llama = constructGlobalLlama(backend);
if (llama) {
auto backendDevs = llama->availableGPUDevices(memoryRequired);
devices.insert(devices.end(), backendDevs.begin(), backendDevs.end());
}
}
#endif
return devices;
}
int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath)
{
auto *llama = constructGlobalLlama();
return llama ? llama->maxContextLength(modelPath) : -1;
}
int32_t LLModel::Implementation::layerCount(const std::string &modelPath)
{
auto *llama = constructGlobalLlama();
return llama ? llama->layerCount(modelPath) : -1;
}
bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath)
{
auto *llama = constructGlobalLlama();
return llama && llama->isEmbeddingModel(modelPath);
}
auto LLModel::Implementation::chatTemplate(const char *modelPath) -> std::expected<std::string, std::string>
{
auto *llama = constructGlobalLlama();
return llama ? llama->chatTemplate(modelPath) : std::unexpected("backend not available");
}
void LLModel::Implementation::setImplementationsSearchPath(const std::string& path)
{
s_implementations_search_path = path;
}
const std::string& LLModel::Implementation::implementationsSearchPath()
{
return s_implementations_search_path;
}
bool LLModel::Implementation::hasSupportedCPU()
{
return cpu_supports_avx() != 0;
}
int LLModel::Implementation::cpuSupportsAVX2()
{
return cpu_supports_avx2();
}
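Two of the static entry points above are usually the first calls an application makes: point the loader at the directory holding the backend libraries (multiple paths are ';'-separated, as parsed in search_in_directory), then enumerate GPUs. A sketch with an illustrative path:
#include <iostream>

static void listDevices()
{
    LLModel::Implementation::setImplementationsSearchPath("/opt/gpt4all/backends;.");
    for (const auto &dev : LLModel::Implementation::availableGPUDevices(/*memoryRequired=*/0))
        std::cerr << dev.backend << " device " << dev.index << ": heap " << dev.heapSize << " bytes\n";
}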

View File

@ -1,20 +1,31 @@
#include "llmodel_c.h" #include "llmodel_c.h"
#include "llmodel.h" #include "llmodel.h"
#include <cerrno> #include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring> #include <cstring>
#include <exception>
#include <iostream> #include <iostream>
#include <memory> #include <memory>
#include <optional> #include <optional>
#include <utility> #include <string>
#include <string_view>
#include <vector>
#include <span>
namespace ranges = std::ranges;
static_assert(sizeof(token_t) == sizeof(LLModel::Token));
struct LLModelWrapper { struct LLModelWrapper {
LLModel *llModel = nullptr; LLModel *llModel = nullptr;
LLModel::PromptContext promptContext;
~LLModelWrapper() { delete llModel; } ~LLModelWrapper() { delete llModel; }
}; };
llmodel_model llmodel_model_create(const char *model_path) { llmodel_model llmodel_model_create(const char *model_path)
{
const char *error; const char *error;
auto fres = llmodel_model_create2(model_path, "auto", &error); auto fres = llmodel_model_create2(model_path, "auto", &error);
if (!fres) { if (!fres) {
@ -23,7 +34,8 @@ llmodel_model llmodel_model_create(const char *model_path) {
 return fres;
 }
-static void llmodel_set_error(const char **errptr, const char *message) {
+static void llmodel_set_error(const char **errptr, const char *message)
+{
 thread_local static std::string last_error_message;
 if (errptr) {
 last_error_message = message;
@ -31,26 +43,23 @@ static void llmodel_set_error(const char **errptr, const char *message) {
 }
 }
-llmodel_model llmodel_model_create2(const char *model_path, const char *build_variant, const char **error) {
+llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error)
+{
 LLModel *llModel;
 try {
-llModel = LLModel::Implementation::construct(model_path, build_variant);
+llModel = LLModel::Implementation::construct(model_path, backend);
 } catch (const std::exception& e) {
 llmodel_set_error(error, e.what());
 return nullptr;
 }
-if (!llModel) {
-llmodel_set_error(error, "Model format not supported (no matching implementation found)");
-return nullptr;
-}
 auto wrapper = new LLModelWrapper;
 wrapper->llModel = llModel;
 return wrapper;
 }
-void llmodel_model_destroy(llmodel_model model) {
+void llmodel_model_destroy(llmodel_model model)
+{
 delete static_cast<LLModelWrapper *>(model);
 }
@ -79,87 +88,85 @@ bool llmodel_isModelLoaded(llmodel_model model)
 return wrapper->llModel->isModelLoaded();
 }
-uint64_t llmodel_get_state_size(llmodel_model model)
+uint64_t llmodel_state_get_size(llmodel_model model)
 {
 auto *wrapper = static_cast<LLModelWrapper *>(model);
 return wrapper->llModel->stateSize();
 }
-uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest)
+uint64_t llmodel_state_get_data(llmodel_model model, uint8_t *state_out, uint64_t state_size,
+token_t **input_tokens_out, uint64_t *n_input_tokens)
 {
 auto *wrapper = static_cast<LLModelWrapper *>(model);
-return wrapper->llModel->saveState(dest);
+std::vector<LLModel::Token> inputTokens;
+auto bytesWritten = wrapper->llModel->saveState({state_out, size_t(state_size)}, inputTokens);
+if (bytesWritten) {
+auto *buf = new LLModel::Token[inputTokens.size()];
+ranges::copy(inputTokens, buf);
+*input_tokens_out = buf;
+*n_input_tokens = uint64_t(inputTokens.size());
+} else {
+*input_tokens_out = nullptr;
+*n_input_tokens = 0;
+}
+return bytesWritten;
 }
-uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src)
+void llmodel_state_free_input_tokens(LLModel::Token *input_tokens)
+{
+delete[] input_tokens;
+}
+uint64_t llmodel_state_set_data(llmodel_model model, const uint8_t *state, uint64_t state_size,
+const token_t *input_tokens, uint64_t n_input_tokens)
 {
 auto *wrapper = static_cast<LLModelWrapper *>(model);
-return wrapper->llModel->restoreState(src);
+return wrapper->llModel->restoreState({state, size_t(state_size)}, {input_tokens, size_t(n_input_tokens)});
 }
-void llmodel_prompt(llmodel_model model, const char *prompt,
-const char *prompt_template,
+bool llmodel_prompt(llmodel_model model,
+const char *prompt,
 llmodel_prompt_callback prompt_callback,
 llmodel_response_callback response_callback,
-llmodel_recalculate_callback recalculate_callback,
 llmodel_prompt_context *ctx,
-bool special,
-const char *fake_reply)
+const char **error)
 {
 auto *wrapper = static_cast<LLModelWrapper *>(model);
-auto response_func = [response_callback](int32_t token_id, const std::string &response) {
-return response_callback(token_id, response.c_str());
+// Copy the C prompt context
+LLModel::PromptContext promptContext {
+.n_predict = ctx->n_predict,
+.top_k = ctx->top_k,
+.top_p = ctx->top_p,
+.min_p = ctx->min_p,
+.temp = ctx->temp,
+.n_batch = ctx->n_batch,
+.repeat_penalty = ctx->repeat_penalty,
+.repeat_last_n = ctx->repeat_last_n,
+.contextErase = ctx->context_erase,
 };
-if (size_t(ctx->n_past) < wrapper->promptContext.tokens.size())
-wrapper->promptContext.tokens.resize(ctx->n_past);
-// Copy the C prompt context
-wrapper->promptContext.n_past = ctx->n_past;
-wrapper->promptContext.n_ctx = ctx->n_ctx;
-wrapper->promptContext.n_predict = ctx->n_predict;
-wrapper->promptContext.top_k = ctx->top_k;
-wrapper->promptContext.top_p = ctx->top_p;
-wrapper->promptContext.min_p = ctx->min_p;
-wrapper->promptContext.temp = ctx->temp;
-wrapper->promptContext.n_batch = ctx->n_batch;
-wrapper->promptContext.repeat_penalty = ctx->repeat_penalty;
-wrapper->promptContext.repeat_last_n = ctx->repeat_last_n;
-wrapper->promptContext.contextErase = ctx->context_erase;
-std::string fake_reply_str;
-if (fake_reply) { fake_reply_str = fake_reply; }
-auto *fake_reply_p = fake_reply ? &fake_reply_str : nullptr;
+auto prompt_func = [prompt_callback](std::span<const LLModel::Token> token_ids, bool cached) {
+return prompt_callback(token_ids.data(), token_ids.size(), cached);
+};
+auto response_func = [response_callback](LLModel::Token token_id, std::string_view piece) {
+return response_callback(token_id, piece.data());
+};
 // Call the C++ prompt method
-wrapper->llModel->prompt(prompt, prompt_template, prompt_callback, response_func, recalculate_callback,
-wrapper->promptContext, special, fake_reply_p);
-// Update the C context by giving access to the wrappers raw pointers to std::vector data
-// which involves no copies
-ctx->logits = wrapper->promptContext.logits.data();
-ctx->logits_size = wrapper->promptContext.logits.size();
-ctx->tokens = wrapper->promptContext.tokens.data();
-ctx->tokens_size = wrapper->promptContext.tokens.size();
-// Update the rest of the C prompt context
-ctx->n_past = wrapper->promptContext.n_past;
-ctx->n_ctx = wrapper->promptContext.n_ctx;
-ctx->n_predict = wrapper->promptContext.n_predict;
-ctx->top_k = wrapper->promptContext.top_k;
-ctx->top_p = wrapper->promptContext.top_p;
-ctx->min_p = wrapper->promptContext.min_p;
-ctx->temp = wrapper->promptContext.temp;
-ctx->n_batch = wrapper->promptContext.n_batch;
-ctx->repeat_penalty = wrapper->promptContext.repeat_penalty;
-ctx->repeat_last_n = wrapper->promptContext.repeat_last_n;
-ctx->context_erase = wrapper->promptContext.contextErase;
+try {
+wrapper->llModel->prompt(prompt, prompt_func, response_func, promptContext);
+} catch (std::exception const &e) {
+llmodel_set_error(error, e.what());
+return false;
+}
+return true;
 }
 float *llmodel_embed(
 llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix, int dimensionality,
-size_t *token_count, bool do_mean, bool atlas, const char **error
+size_t *token_count, bool do_mean, bool atlas, llmodel_emb_cancel_callback cancel_cb, const char **error
 ) {
 auto *wrapper = static_cast<LLModelWrapper *>(model);
@ -185,7 +192,7 @@ float *llmodel_embed(
 if (prefix) { prefixStr = prefix; }
 embedding = new float[embd_size];
-wrapper->llModel->embed(textsVec, embedding, prefixStr, dimensionality, token_count, do_mean, atlas);
+wrapper->llModel->embed(textsVec, embedding, prefixStr, dimensionality, token_count, do_mean, atlas, cancel_cb);
 } catch (std::exception const &e) {
 llmodel_set_error(error, e.what());
 return nullptr;
@ -253,6 +260,7 @@ struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired,
 for (unsigned i = 0; i < devices.size(); i++) {
 const auto &dev = devices[i];
 auto &cdev = c_devices[i];
+cdev.backend = dev.backend;
 cdev.index = dev.index;
 cdev.type = dev.type;
 cdev.heapSize = dev.heapSize;
@ -281,8 +289,32 @@ bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)
 return wrapper->llModel->initializeGPUDevice(device);
 }
-bool llmodel_has_gpu_device(llmodel_model model)
+const char *llmodel_model_backend_name(llmodel_model model)
 {
-auto *wrapper = static_cast<LLModelWrapper *>(model);
-return wrapper->llModel->hasGPUDevice();
+const auto *wrapper = static_cast<LLModelWrapper *>(model);
+return wrapper->llModel->backendName();
+}
+const char *llmodel_model_gpu_device_name(llmodel_model model)
+{
+const auto *wrapper = static_cast<LLModelWrapper *>(model);
+return wrapper->llModel->gpuDeviceName();
+}
+int32_t llmodel_count_prompt_tokens(llmodel_model model, const char *prompt, const char **error)
+{
+auto *wrapper = static_cast<const LLModelWrapper *>(model);
+try {
+return wrapper->llModel->countPromptTokens(prompt);
+} catch (const std::exception& e) {
+llmodel_set_error(error, e.what());
+return -1;
+}
+}
+void llmodel_model_foreach_special_token(llmodel_model model, llmodel_special_token_callback callback)
+{
+auto *wrapper = static_cast<const LLModelWrapper *>(model);
+for (auto &[name, token] : wrapper->llModel->specialTokens())
+callback(name.c_str(), token.c_str());
 }
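The hunk above replaces the old save/restore pair with a state API that also hands back the input-token history. A sketch of the new call sequence, assuming `model` is an already-loaded llmodel_model, that token_t matches LLModel::Token as the static_assert requires, and ignoring error handling:
#include <cstdint>
#include <vector>

static void snapshotAndRestore(llmodel_model model)
{
    uint64_t size = llmodel_state_get_size(model);
    std::vector<uint8_t> state(size);
    token_t *input_tokens = nullptr;
    uint64_t n_input_tokens = 0;
    uint64_t written = llmodel_state_get_data(model, state.data(), size, &input_tokens, &n_input_tokens);
    if (written) {
        // ...prompt some more, then roll the model back to the snapshot:
        llmodel_state_set_data(model, state.data(), written, input_tokens, n_input_tokens);
        llmodel_state_free_input_tokens(input_tokens);
    }
}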

View File

@ -0,0 +1,298 @@
#include "llmodel.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <optional>
#include <ranges>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>
namespace ranges = std::ranges;
namespace views = std::ranges::views;
void LLModel::prompt(
std::string_view prompt,
const PromptCallback &promptCallback,
const ResponseCallback &responseCallback,
const PromptContext &promptCtx
) {
if (!isModelLoaded())
throw std::invalid_argument("Attempted to prompt an unloaded model.");
if (!supportsCompletion())
throw std::invalid_argument("Not a text completion model.");
if (!promptCtx.n_batch)
throw std::invalid_argument("Batch size cannot be zero.");
if (!promptCtx.n_predict)
return; // nothing requested
auto embd_inp = tokenize(prompt);
if (embd_inp.empty())
throw std::invalid_argument("Prompt tokenized to zero tokens.");
if (auto res = decodePrompt(promptCallback, promptCtx, std::move(embd_inp)))
generateResponse(responseCallback, promptCtx, /*n_past*/ *res);
}
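A sketch of driving the reworked prompt() above; `model` is a loaded LLModel, the chat-templated string is illustrative, the default PromptContext sampling settings are assumed to be acceptable, and the callback signatures follow how the C wrapper adapts them:
#include <iostream>
#include <span>

static void ask(LLModel &model)
{
    LLModel::PromptContext promptCtx;
    model.prompt(
        "<|im_start|>user\nWhy is the sky blue?<|im_end|>\n<|im_start|>assistant\n",
        [](std::span<const LLModel::Token>, bool /*cached*/) { return true; },           // prompt progress
        [](LLModel::Token, std::string_view piece) { std::cout << piece; return true; }, // streamed response
        promptCtx);
}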
int32_t LLModel::countPromptTokens(std::string_view prompt) const
{
if (!isModelLoaded())
throw std::invalid_argument("Attempted to tokenize with an unloaded model.");
return int32_t(tokenize(prompt).size());
}
auto LLModel::decodePrompt(
const PromptCallback &promptCallback,
const PromptContext &promptCtx,
std::vector<Token> embd_inp
) -> std::optional<int32_t>
{
assert(!embd_inp.empty());
int32_t nCtx = contextLength();
int32_t n_batch = std::min(promptCtx.n_batch, LLMODEL_MAX_PROMPT_BATCH);
// Find the greatest n_past where the beginning of embd_inp matches the end of the token cache, starting at the
// requested n_past.
// This is used to skip unnecessary work when the prompt shares a common prefix with the previous result.
int32_t nPast = computeModelInputPosition(embd_inp);
// always decode up to a full batch before generating, even if cached
nPast -= std::min(n_batch, nPast);
// TODO(jared): generalize this to find the smallest new_embd_inp.size() - nPast given the cache
if (!nPast && int32_t(embd_inp.size()) > nCtx) {
// no cache hit -> shift the input before even processing
int32_t nKeep = shouldAddBOS();
auto newLength = int32_t(nCtx * (1.f - promptCtx.contextErase));
int32_t nDiscard = int32_t(embd_inp.size()) - std::max(1, std::min(nCtx, newLength));
// execute the callback even for skipped tokens. this misrepresents the position of BOS but we don't care
auto discardedTokens = embd_inp | views::drop(nKeep) | views::take(nDiscard);
if (!promptCallback(discardedTokens, true))
return std::nullopt;
// erase nDiscard tokens
embd_inp.erase(discardedTokens.begin(), discardedTokens.end());
assert(int32_t(embd_inp.size()) <= nCtx);
// check the cache again, just in case
nPast = computeModelInputPosition(embd_inp);
nPast -= std::min(n_batch, nPast);
}
setModelInputPosition(nPast);
// execute the callback even for skipped tokens
if (!promptCallback(embd_inp | views::take(nPast), true))
return std::nullopt;
// process the prompt in batches
for (int32_t i = nPast; i < embd_inp.size();) {
auto batch_end = std::min(i + n_batch, int32_t(embd_inp.size()));
std::span batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);
// Check if the context has run out...
if (nPast + int32_t(batch.size()) > nCtx) {
shiftContext(promptCtx, &nPast);
assert(nPast + int32_t(batch.size()) <= nCtx);
}
// FIXME(Adam): We should find a way to bubble these strings to the UI level to allow for translation
if (!evalTokens(nPast, batch))
throw std::runtime_error("An internal error was encountered during prompt processing.");
for (auto &tok : batch) {
appendInputToken(tok);
nPast++;
if (!promptCallback({ &tok, 1 }, false))
return std::nullopt;
}
i = batch_end;
}
return nPast;
}
/*
* If string s overlaps with the string key such that some prefix of the key is at the end
* of the string, return the position in s where the first match starts. Otherwise, return
* std::string::npos. Examples:
* s = "bfo", key = "foo" -> 1
* s = "fooa", key = "foo" -> npos
*/
static std::string::size_type stringsOverlap(const std::string &s, const std::string &key)
{
if (s.empty() || key.empty())
throw std::invalid_argument("arguments to stringsOverlap must not be empty");
for (int start = std::max(0, int(s.size()) - int(key.size())); start < s.size(); start++) {
if (s.compare(start, s.size(), key, 0, s.size() - start) == 0)
return start;
}
return std::string::npos;
}
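Quick checks that line up with the examples in the comment above (assert comes from the <cassert> already included at the top of this file):
static void stringsOverlapExamples()
{
    assert(stringsOverlap("bfo",  "foo") == 1);                 // trailing "fo" starts a match
    assert(stringsOverlap("fooa", "foo") == std::string::npos); // "foo" does not end the string
    assert(stringsOverlap("abc### Hu", "### Human") == 3);      // partial stop sequence begins at index 3
}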
void LLModel::generateResponse(
const ResponseCallback &responseCallback,
const PromptContext &promptCtx,
int32_t nPast
) {
static const char *stopSequences[] {
"### System", "### Instruction", "### Human", "### User", "### Response", "### Assistant", "### Context",
"<|im_start|>", "<|im_end|>", "<|endoftext|>",
};
initSampler(promptCtx);
std::string cachedResponse;
std::vector<Token> cachedTokens;
int n_predicted = 0;
// Predict next tokens
for (bool stop = false; !stop;) {
// Sample next token
std::optional<Token> new_tok = sampleToken();
std::string new_piece = tokenToString(new_tok.value());
cachedTokens.push_back(new_tok.value());
cachedResponse += new_piece;
auto accept = [this, &promptCtx, &new_tok, &nPast] {
// Shift context if out of space
if (nPast >= contextLength()) {
shiftContext(promptCtx, &nPast);
assert(nPast < contextLength());
}
// Accept the token
Token tok = std::exchange(new_tok, std::nullopt).value();
if (!evalTokens(nPast, { &tok, 1 }))
throw std::runtime_error("An internal error was encountered during response generation.");
appendInputToken(tok);
nPast++;
};
// Check for EOS
auto lengthLimit = std::string::npos;
for (const auto token : endTokens()) {
if (new_tok == token) {
stop = true;
lengthLimit = cachedResponse.size() - new_piece.size();
}
}
if (lengthLimit != std::string::npos) {
// EOS matched
} else if (!isSpecialToken(new_tok.value())) {
// Check if the response contains a stop sequence
for (const auto &p : stopSequences) {
auto match = cachedResponse.find(p);
if (match != std::string::npos) stop = true;
lengthLimit = std::min(lengthLimit, match);
if (match == 0) break;
}
// Check if the response matches the start of a stop sequence
if (lengthLimit == std::string::npos) {
for (const auto &p : stopSequences) {
auto match = stringsOverlap(cachedResponse, p);
lengthLimit = std::min(lengthLimit, match);
if (match == 0) break;
}
}
} else if (ranges::find(stopSequences, new_piece) < std::end(stopSequences)) {
// Special tokens must exactly match a stop sequence
stop = true;
lengthLimit = cachedResponse.size() - new_piece.size();
}
// Empty the cache, up to the length limit
std::string::size_type responseLength = 0;
while (!cachedTokens.empty()) {
Token tok = cachedTokens.front();
std::string piece = tokenToString(tok);
// Stop if the piece (or part of it) does not fit within the length limit
if (responseLength + (stop ? 1 : piece.size()) > lengthLimit)
break;
// Remove token from cache
assert(cachedResponse.starts_with(piece));
cachedTokens.erase(cachedTokens.begin(), cachedTokens.begin() + 1);
cachedResponse.erase(cachedResponse.begin(), cachedResponse.begin() + piece.size());
// Accept the token, if needed (not cached)
if (cachedTokens.empty() && new_tok)
accept();
// Send the token
if (!responseCallback(tok, piece) || ++n_predicted >= promptCtx.n_predict) {
stop = true;
break;
}
// FIXME(jared): we could avoid printing partial stop sequences if we didn't have to
// output token IDs and could cache a partial token for the next prompt call
responseLength += piece.size();
}
assert(cachedTokens.empty() == cachedResponse.empty());
// Accept the token, if needed (in cache)
if (new_tok) {
assert(!cachedTokens.empty() && cachedTokens.back() == new_tok);
if (stop) {
cachedTokens.pop_back();
} else {
accept();
}
}
}
if (inputLength() < cachedTokens.size()) {
/* This is theoretically possible if the longest stop sequence is greater than
* n_ctx * contextErase tokens. */
throw std::runtime_error("shifted too much context, can't go back");
}
#ifndef NDEBUG
auto inp = inputTokens();
auto discard_start = inp.end() - cachedTokens.size();
assert(std::equal(discard_start, inp.end(), cachedTokens.begin()));
#endif
}
void LLModel::embed(
const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb
) {
(void)texts;
(void)embeddings;
(void)prefix;
(void)dimensionality;
(void)tokenCount;
(void)doMean;
(void)atlas;
(void)cancelCb;
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}
void LLModel::embed(
const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
bool doMean, bool atlas
) {
(void)texts;
(void)embeddings;
(void)isRetrieval;
(void)dimensionality;
(void)tokenCount;
(void)doMean;
(void)atlas;
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}

View File

@ -0,0 +1,17 @@
#pragma once
#include <cassert>
#ifdef NDEBUG
# ifdef __has_builtin
# if __has_builtin(__builtin_unreachable)
# define UNREACHABLE() __builtin_unreachable()
# else
# define UNREACHABLE() do {} while (0)
# endif
# else
# define UNREACHABLE() do {} while (0)
# endif
#else
# define UNREACHABLE() assert(!"Unreachable statement was reached")
#endif
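Typical use of the macro above is to mark switch cases that are impossible by construction, so debug builds assert and release builds can optimize; the enum below is illustrative:
enum class Side { Left, Right };

static int sign(Side s)
{
    switch (s) {
    case Side::Left:  return -1;
    case Side::Right: return  1;
    }
    UNREACHABLE();
    return 0; // silences compilers that do not treat the macro as noreturn
}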

View File

@ -1,328 +0,0 @@
#include "utils.h"
#include <fstream>
#include <regex>
void replace(std::string & str, const std::string & needle, const std::string & replacement) {
size_t pos = 0;
while ((pos = str.find(needle, pos)) != std::string::npos) {
str.replace(pos, needle.length(), replacement);
pos += replacement.length();
}
}
std::map<std::string, int32_t> json_parse(const std::string & fname) {
std::map<std::string, int32_t> result;
// read file into string
std::string json;
{
std::ifstream ifs(fname);
if (!ifs) {
fprintf(stderr, "Failed to open %s\n", fname.c_str());
exit(1);
}
json = std::string((std::istreambuf_iterator<char>(ifs)),
(std::istreambuf_iterator<char>()));
}
if (json[0] != '{') {
return result;
}
// parse json
{
bool has_key = false;
bool in_token = false;
std::string str_key = "";
std::string str_val = "";
int n = json.size();
for (int i = 1; i < n; ++i) {
if (!in_token) {
if (json[i] == ' ') continue;
if (json[i] == '"') {
in_token = true;
continue;
}
} else {
if (json[i] == '\\' && i+1 < n) {
if (has_key == false) {
str_key += json[i];
} else {
str_val += json[i];
}
++i;
} else if (json[i] == '"') {
if (has_key == false) {
has_key = true;
++i;
while (json[i] == ' ') ++i;
++i; // :
while (json[i] == ' ') ++i;
if (json[i] != '\"') {
while (json[i] != ',' && json[i] != '}') {
str_val += json[i++];
}
has_key = false;
} else {
in_token = true;
continue;
}
} else {
has_key = false;
}
::replace(str_key, "\\u0120", " " ); // \u0120 -> space
::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
::replace(str_key, "\\\"", "\""); // \\\" -> "
try {
result[str_key] = std::stoi(str_val);
} catch (...) {
//fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
}
str_key = "";
str_val = "";
in_token = false;
continue;
}
if (has_key == false) {
str_key += json[i];
} else {
str_val += json[i];
}
}
}
}
return result;
}
std::vector<gpt_vocab::id> gpt_tokenize_inner(const gpt_vocab & vocab, const std::string & text) {
std::vector<std::string> words;
// first split the text into words
{
std::string str = text;
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
std::regex re(pat);
std::smatch m;
while (std::regex_search(str, m, re)) {
for (auto x : m) {
words.push_back(x);
}
str = m.suffix();
}
}
// find the longest tokens that form the words:
std::vector<gpt_vocab::id> tokens;
for (const auto & word : words) {
if (word.size() == 0) continue;
int i = 0;
int n = word.size();
while (i < n) {
int j = n;
while (j > i) {
auto it = vocab.token_to_id.find(word.substr(i, j-i));
if (it != vocab.token_to_id.end()) {
tokens.push_back(it->second);
i = j;
break;
}
--j;
}
if (i == n) {
break;
}
if (j == i) {
auto sub = word.substr(i, 1);
if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
tokens.push_back(vocab.token_to_id.at(sub));
} else {
fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
}
++i;
}
}
}
return tokens;
}
std::string regex_escape(const std::string &s) {
static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
return std::regex_replace(s, metacharacters, "\\$&");
}
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
// Generate the subpattern from the special_tokens vector if it's not empty
if (!vocab.special_tokens.empty()) {
std::vector<gpt_vocab::id> out;
std::vector<std::string> chunks;
std::string str = text;
std::string special_tokens_subpattern;
for (const auto &token : vocab.special_tokens) {
if (!special_tokens_subpattern.empty()) {
special_tokens_subpattern += "|";
}
special_tokens_subpattern += regex_escape(token);
}
std::regex re(special_tokens_subpattern);
std::smatch m;
while (std::regex_search(str, m, re)) {
auto tok = vocab.token_to_id.find(m.str());
if (tok != vocab.token_to_id.end()) {
auto tokid = tok->second;
auto pfxtoks = gpt_tokenize_inner(vocab, m.prefix());
out.insert(out.end(), pfxtoks.begin(), pfxtoks.end());
out.push_back(tokid);
str = m.suffix();
}
}
if (!str.empty()) {
auto tokrest = gpt_tokenize_inner(vocab, str);
out.insert(out.end(), tokrest.begin(), tokrest.end());
}
return out;
} else {
return gpt_tokenize_inner(vocab, text);
}
}
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
vocab.token_to_id = ::json_parse(fname);
for (const auto & kv : vocab.token_to_id) {
vocab.id_to_token[kv.second] = kv.first;
}
printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());
// print the vocabulary
//for (auto kv : vocab.token_to_id) {
// printf("'%s' -> %d\n", kv.first.data(), kv.second);
//}
return true;
}
gpt_vocab::id gpt_sample_top_k_top_p(
const size_t actualVocabSize,
const int32_t * last_n_tokens_data,
int last_n_tokens_size,
const std::vector<float> logits,
int top_k,
double top_p,
double temp,
float repeat_penalty,
std::mt19937 & rng) {
int n_logits = actualVocabSize;
const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
const auto * plogits = logits.data();
if (temp <= 0) {
// select the token with the highest logit directly
float max_logit = plogits[0];
gpt_vocab::id max_id = 0;
for (int i = 1; i < n_logits; ++i) {
if (plogits[i] > max_logit) {
max_logit = plogits[i];
max_id = i;
}
}
return max_id;
}
std::vector<std::pair<double, gpt_vocab::id>> logits_id;
logits_id.reserve(n_logits);
{
const float scale = 1.0f/temp;
for (int i = 0; i < n_logits; ++i) {
// repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
// credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
// if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
if (plogits[i] < 0.0f) {
logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
} else {
logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
}
} else {
logits_id.push_back(std::make_pair(plogits[i]*scale, i));
}
}
}
// find the top K tokens
std::partial_sort(
logits_id.begin(),
logits_id.begin() + top_k, logits_id.end(),
[](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
return a.first > b.first;
});
logits_id.resize(top_k);
double maxl = -INFINITY;
for (const auto & kv : logits_id) {
maxl = std::max(maxl, kv.first);
}
// compute probs for the top K tokens
std::vector<double> probs;
probs.reserve(logits_id.size());
double sum = 0.0;
for (const auto & kv : logits_id) {
double p = exp(kv.first - maxl);
probs.push_back(p);
sum += p;
}
// normalize the probs
for (auto & p : probs) {
p /= sum;
}
if (top_p < 1.0f) {
double cumsum = 0.0f;
for (int i = 0; i < top_k; i++) {
cumsum += probs[i];
if (cumsum >= top_p) {
top_k = i + 1;
probs.resize(top_k);
logits_id.resize(top_k);
break;
}
}
cumsum = 1.0/cumsum;
for (int i = 0; i < (int) probs.size(); i++) {
probs[i] *= cumsum;
}
}
//printf("\n");
//for (int i = 0; i < (int) probs.size(); i++) {
// printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
//}
//exit(0);
std::discrete_distribution<> dist(probs.begin(), probs.end());
int idx = dist(rng);
return logits_id[idx].second;
}

View File

@ -1,97 +0,0 @@
// Various helper functions and utilities
#pragma once
#include <string>
#include <map>
#include <vector>
#include <random>
#include <thread>
//
// General purpose inline functions
//
constexpr inline unsigned long long operator ""_MiB(unsigned long long bytes) {
return bytes*1024*1024;
}
//
// CLI argument parsing
//
struct gpt_params {
int32_t seed = -1; // RNG seed
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
int32_t n_predict = 200; // new tokens to predict
// sampling parameters
int32_t top_k = 40;
float top_p = 0.9f;
float temp = 0.9f;
int32_t n_batch = 8; // batch size for prompt processing
std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
std::string prompt;
};
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
std::string gpt_random_prompt(std::mt19937 & rng);
//
// Vocab utils
//
struct gpt_vocab {
using id = int32_t;
using token = std::string;
std::map<token, id> token_to_id;
std::map<id, token> id_to_token;
std::vector<std::string> special_tokens;
void add_special_token(const std::string &token) {
special_tokens.push_back(token);
}
};
void replace(std::string & str, const std::string & needle, const std::string & replacement);
// poor-man's JSON parsing
std::map<std::string, int32_t> json_parse(const std::string & fname);
// split text into tokens
//
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
//
// Regex (Python):
// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
//
// Regex (C++):
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
// load the tokens from encoder.json
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
// sample next token given probabilities for each embedding
//
// - consider only the top K tokens
// - from them, consider only the top tokens with cumulative probability > P
//
// TODO: not sure if this implementation is correct
//
gpt_vocab::id gpt_sample_top_k_top_p(
const size_t actualVocabSize,
const int32_t * last_n_tokens_data,
int last_n_tokens_size,
const std::vector<float> logits,
int top_k,
double top_p,
double temp,
float repeat_penalty,
std::mt19937 & rng);
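Taken together, the declarations above describe a small pipeline: load a vocabulary from `encoder.json`, tokenize text with the GPT-2 style regex, then pick the next token with top-k/top-p filtering, temperature scaling, and a repetition penalty. A rough, hypothetical driver under these declarations (the vocabulary path is illustrative and the logits are placeholders; a real caller would take them from a model's forward pass):

```cpp
#include "utils.h"

#include <cstdio>
#include <random>
#include <vector>

int main()
{
    gpt_vocab vocab;
    if (!gpt_vocab_init("models/gpt-2-117M/encoder.json", vocab)) // illustrative path
        return 1;

    // Tokenize the prompt with the GPT-2 style splitter declared above.
    std::vector<gpt_vocab::id> tokens = gpt_tokenize(vocab, "Hello, world");

    // Placeholder logits; in practice these come from the model's forward pass.
    std::vector<float> logits(vocab.id_to_token.size(), 0.0f);

    std::mt19937 rng(42);
    gpt_vocab::id next = gpt_sample_top_k_top_p(
        logits.size(),                       // actualVocabSize
        tokens.data(), (int) tokens.size(),  // recently seen tokens to penalize
        logits,
        /*top_k=*/40, /*top_p=*/0.9, /*temp=*/0.9, /*repeat_penalty=*/1.1,
        rng);
    std::printf("sampled token id: %d\n", next);
    return 0;
}
```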

View File

@ -1,3 +1,21 @@
-# GPT4All Bindings
+# GPT4All Language Bindings
-This directory will contain language specific bindings on top of the C/C++ model backends.
+These are the language bindings for the GPT4All backend. They provide functionality to load GPT4All models (and other llama.cpp models), generate text, and (in the case of the Python bindings) embed text as a vector representation.
-We will have one directory per language binding (e.g. Python, Typescript, Golang, etc.).
+See their respective folders for language-specific documentation.
### Languages
- [Python](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python) (Nomic official, maintained by [@cebtenzzre](https://github.com/cebtenzzre))
- [Node.js/Typescript](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/typescript) (community, maintained by [@jacoobes](https://github.com/jacoobes) and [@iimez](https://github.com/iimez))
<br/>
<br/>
<details><summary><b>Archived Bindings</b></summary>
<br/>
The following bindings have been removed from this repository due to lack of maintenance. If adopted, they can be brought back; feel free to message a developer on Discord if you are interested in maintaining one of them. Below are links to their last available version (not necessarily the last working version).
- C#: [41c9013f](https://github.com/nomic-ai/gpt4all/tree/41c9013fa46a194b3e4fee6ced1b9d1b65e177ac/gpt4all-bindings/csharp)
- Java: [41c9013f](https://github.com/nomic-ai/gpt4all/tree/41c9013fa46a194b3e4fee6ced1b9d1b65e177ac/gpt4all-bindings/java)
- Go: [41c9013f](https://github.com/nomic-ai/gpt4all/tree/41c9013fa46a194b3e4fee6ced1b9d1b65e177ac/gpt4all-bindings/golang)
</details>

View File

@ -2,8 +2,7 @@
GPT4All on the command-line.
-## Documentation
-<https://docs.gpt4all.io/gpt4all_cli.html>
+More details on the [wiki](https://github.com/nomic-ai/gpt4all/wiki/Python-CLI).
## Quickstart
@ -34,11 +33,11 @@ python -m pip install --user --upgrade gpt4all typer
# run the CLI
python app.py repl
```
-By default, it will automatically download the `groovy` model to `.cache/gpt4all/` in your user
-directory, if necessary.
+By default, it will automatically download the `Mistral Instruct` model to `.cache/gpt4all/` in your
+user directory, if necessary.
If you have already saved a model beforehand, specify its path with the `-m`/`--model` argument,
for example:
```shell
-python app.py repl --model /home/user/my-gpt4all-models/gpt4all-13b-snoozy-q4_0.gguf
+python app.py repl --model /home/user/my-gpt4all-models/mistral-7b-instruct-v0.1.Q4_0.gguf
```

View File

@ -113,10 +113,7 @@ def _old_loop(gpt4all_instance):
full_response = gpt4all_instance.chat_completion(
MESSAGES,
# preferential kwargs for chat ux
-logits_size=0,
-tokens_size=0,
n_past=0,
-n_ctx=0,
n_predict=200,
top_k=40,
top_p=0.9,

View File

@ -1,348 +0,0 @@
# EditorConfig is awesome: https://EditorConfig.org
# top-most EditorConfig file
root = true
# Don't use tabs for indentation.
[*]
indent_style = space
# (Please don't specify an indent_size here; that has too many unintended consequences.)
# Code files
[*.{cs,csx,vb,vbx}]
indent_size = 4
insert_final_newline = true
charset = utf-8-bom
# XML project files
[*.{csproj,vbproj,vcxproj,vcxproj.filters,proj,projitems,shproj}]
indent_size = 4
# XML config files
[*.{props,targets,ruleset,config,nuspec,resx,vsixmanifest,vsct}]
indent_size = 2
# JSON files
[*.json]
indent_size = 2
# Powershell files
[*.ps1]
indent_size = 2
# Shell script files
[*.sh]
end_of_line = lf
indent_size = 2
insert_final_newline = true
# Dotnet code style settings:
[*.{cs,vb}]
# IDE0055: Fix formatting
dotnet_diagnostic.IDE0055.severity = error
dotnet_diagnostic.CS1573.severity = suggestion
dotnet_diagnostic.CS1591.severity = suggestion
# Sort using and Import directives with System.* appearing first
dotnet_sort_system_directives_first = true
dotnet_separate_import_directive_groups = false
# Avoid "this." and "Me." if not necessary
dotnet_style_qualification_for_field = false:suggestion
dotnet_style_qualification_for_property = false:suggestion
dotnet_style_qualification_for_method = false:suggestion
dotnet_style_qualification_for_event = false:suggestion
# Use language keywords instead of framework type names for type references
dotnet_style_predefined_type_for_locals_parameters_members = true:warning
dotnet_style_predefined_type_for_member_access = true:warning
# Suggest more modern language features when available
dotnet_style_object_initializer = true:suggestion
dotnet_style_collection_initializer = true:suggestion
dotnet_style_coalesce_expression = true:suggestion
dotnet_style_null_propagation = true:suggestion
dotnet_style_explicit_tuple_names = true:suggestion
# Whitespace options
dotnet_style_allow_multiple_blank_lines_experimental = false
# Private fields are camelCase with '_' prefix
dotnet_naming_rule.private_members_with_underscore.symbols = private_fields
dotnet_naming_rule.private_members_with_underscore.style = prefix_underscore
dotnet_naming_rule.private_members_with_underscore.severity = error
dotnet_naming_symbols.private_fields.applicable_kinds = field
dotnet_naming_symbols.private_fields.applicable_accessibilities = private
dotnet_naming_style.prefix_underscore.capitalization = camel_case
dotnet_naming_style.prefix_underscore.required_prefix = _
# Non-private static fields are PascalCase
dotnet_naming_rule.non_private_static_fields_should_be_pascal_case.severity = suggestion
dotnet_naming_rule.non_private_static_fields_should_be_pascal_case.symbols = non_private_static_fields
dotnet_naming_rule.non_private_static_fields_should_be_pascal_case.style = non_private_static_field_style
dotnet_naming_symbols.non_private_static_fields.applicable_kinds = field
dotnet_naming_symbols.non_private_static_fields.applicable_accessibilities = public, protected, internal, protected_internal, private_protected
dotnet_naming_symbols.non_private_static_fields.required_modifiers = static
dotnet_naming_style.non_private_static_field_style.capitalization = pascal_case
# Non-private readonly fields are PascalCase
dotnet_naming_rule.non_private_readonly_fields_should_be_pascal_case.severity = suggestion
dotnet_naming_rule.non_private_readonly_fields_should_be_pascal_case.symbols = non_private_readonly_fields
dotnet_naming_rule.non_private_readonly_fields_should_be_pascal_case.style = non_private_static_field_style
dotnet_naming_symbols.non_private_readonly_fields.applicable_kinds = field
dotnet_naming_symbols.non_private_readonly_fields.applicable_accessibilities = public, protected, internal, protected_internal, private_protected
dotnet_naming_symbols.non_private_readonly_fields.required_modifiers = readonly
dotnet_naming_style.non_private_readonly_field_style.capitalization = pascal_case
# Constants are PascalCase
dotnet_naming_rule.constants_should_be_pascal_case.severity = suggestion
dotnet_naming_rule.constants_should_be_pascal_case.symbols = constants
dotnet_naming_rule.constants_should_be_pascal_case.style = non_private_static_field_style
dotnet_naming_symbols.constants.applicable_kinds = field, local
dotnet_naming_symbols.constants.required_modifiers = const
dotnet_naming_style.constant_style.capitalization = pascal_case
# Static fields are camelCase and start with s_
dotnet_naming_rule.static_fields_should_be_camel_case.severity = none
dotnet_naming_rule.static_fields_should_be_camel_case.symbols = static_fields
dotnet_naming_rule.static_fields_should_be_camel_case.style = static_field_style
dotnet_naming_symbols.static_fields.applicable_kinds = field
dotnet_naming_symbols.static_fields.required_modifiers = static
dotnet_naming_style.static_field_style.capitalization = camel_case
dotnet_naming_style.static_field_style.required_prefix = s_
# Instance fields are camelCase and start with _
dotnet_naming_rule.instance_fields_should_be_camel_case.severity = none
dotnet_naming_rule.instance_fields_should_be_camel_case.symbols = instance_fields
dotnet_naming_rule.instance_fields_should_be_camel_case.style = instance_field_style
dotnet_naming_symbols.instance_fields.applicable_kinds = field
dotnet_naming_style.instance_field_style.capitalization = camel_case
dotnet_naming_style.instance_field_style.required_prefix = _
# Locals and parameters are camelCase
dotnet_naming_rule.locals_should_be_camel_case.severity = suggestion
dotnet_naming_rule.locals_should_be_camel_case.symbols = locals_and_parameters
dotnet_naming_rule.locals_should_be_camel_case.style = camel_case_style
dotnet_naming_symbols.locals_and_parameters.applicable_kinds = parameter, local
dotnet_naming_style.camel_case_style.capitalization = camel_case
# Local functions are PascalCase
dotnet_naming_rule.local_functions_should_be_pascal_case.severity = suggestion
dotnet_naming_rule.local_functions_should_be_pascal_case.symbols = local_functions
dotnet_naming_rule.local_functions_should_be_pascal_case.style = non_private_static_field_style
dotnet_naming_symbols.local_functions.applicable_kinds = local_function
dotnet_naming_style.local_function_style.capitalization = pascal_case
# By default, name items with PascalCase
dotnet_naming_rule.members_should_be_pascal_case.severity = suggestion
dotnet_naming_rule.members_should_be_pascal_case.symbols = all_members
dotnet_naming_rule.members_should_be_pascal_case.style = non_private_static_field_style
dotnet_naming_symbols.all_members.applicable_kinds = *
dotnet_naming_style.pascal_case_style.capitalization = pascal_case
# error RS2008: Enable analyzer release tracking for the analyzer project containing rule '{0}'
dotnet_diagnostic.RS2008.severity = none
# IDE0073: File header
dotnet_diagnostic.IDE0073.severity = none
#file_header_template = Licensed to the .NET Foundation under one or more agreements.\nThe .NET Foundation licenses this file to you under the MIT license.\nSee the LICENSE file in the project root for more information.
# IDE0035: Remove unreachable code
dotnet_diagnostic.IDE0035.severity = warning
# IDE0036: Order modifiers
dotnet_diagnostic.IDE0036.severity = warning
# IDE0043: Format string contains invalid placeholder
dotnet_diagnostic.IDE0043.severity = warning
# IDE0044: Make field readonly
dotnet_diagnostic.IDE0044.severity = warning
# IDE1006: Naming rule violation
#dotnet_diagnostic.IDE1006.severity = none
# RS0016: Only enable if API files are present
dotnet_public_api_analyzer.require_api_files = true
dotnet_style_operator_placement_when_wrapping = beginning_of_line
tab_width = 4
end_of_line = crlf
dotnet_style_prefer_is_null_check_over_reference_equality_method = true:suggestion
dotnet_style_prefer_auto_properties = true:silent
dotnet_style_prefer_simplified_boolean_expressions = true:suggestion
dotnet_style_prefer_conditional_expression_over_assignment = true:silent
dotnet_style_prefer_conditional_expression_over_return = true:silent
dotnet_style_prefer_inferred_tuple_names = true:suggestion
dotnet_style_prefer_inferred_anonymous_type_member_names = true:suggestion
dotnet_style_prefer_compound_assignment = true:suggestion
dotnet_style_prefer_simplified_interpolation = true:suggestion
dotnet_style_namespace_match_folder = true:suggestion
# CSharp code style settings:
[*.cs]
# Newline settings
csharp_new_line_before_open_brace = all
csharp_new_line_before_else = true
csharp_new_line_before_catch = true
csharp_new_line_before_finally = true
csharp_new_line_before_members_in_object_initializers = true
csharp_new_line_before_members_in_anonymous_types = true
csharp_new_line_between_query_expression_clauses = true
# Indentation preferences
csharp_indent_block_contents = true
csharp_indent_braces = false
csharp_indent_case_contents = true
csharp_indent_case_contents_when_block = true
csharp_indent_switch_labels = true
csharp_indent_labels = flush_left
# Whitespace options
csharp_style_allow_embedded_statements_on_same_line_experimental = false
csharp_style_allow_blank_lines_between_consecutive_braces_experimental = false
csharp_style_allow_blank_line_after_colon_in_constructor_initializer_experimental = false
# Prefer "var" everywhere
csharp_style_var_for_built_in_types = true:suggestion
csharp_style_var_when_type_is_apparent = true:suggestion
csharp_style_var_elsewhere = true:suggestion
# Prefer method-like constructs to have a block body
csharp_style_expression_bodied_methods = false:none
csharp_style_expression_bodied_constructors = false:none
csharp_style_expression_bodied_operators = false:none
# Prefer property-like constructs to have an expression-body
csharp_style_expression_bodied_properties = true:none
csharp_style_expression_bodied_indexers = true:none
csharp_style_expression_bodied_accessors = true:none
# Suggest more modern language features when available
csharp_style_pattern_matching_over_is_with_cast_check = true:suggestion
csharp_style_pattern_matching_over_as_with_null_check = true:suggestion
csharp_style_inlined_variable_declaration = true:suggestion
csharp_style_throw_expression = true:suggestion
csharp_style_conditional_delegate_call = true:suggestion
# Space preferences
csharp_space_after_cast = false
csharp_space_after_colon_in_inheritance_clause = true
csharp_space_after_comma = true
csharp_space_after_dot = false
csharp_space_after_keywords_in_control_flow_statements = true
csharp_space_after_semicolon_in_for_statement = true
csharp_space_around_binary_operators = before_and_after
csharp_space_around_declaration_statements = do_not_ignore
csharp_space_before_colon_in_inheritance_clause = true
csharp_space_before_comma = false
csharp_space_before_dot = false
csharp_space_before_open_square_brackets = false
csharp_space_before_semicolon_in_for_statement = false
csharp_space_between_empty_square_brackets = false
csharp_space_between_method_call_empty_parameter_list_parentheses = false
csharp_space_between_method_call_name_and_opening_parenthesis = false
csharp_space_between_method_call_parameter_list_parentheses = false
csharp_space_between_method_declaration_empty_parameter_list_parentheses = false
csharp_space_between_method_declaration_name_and_open_parenthesis = false
csharp_space_between_method_declaration_parameter_list_parentheses = false
csharp_space_between_parentheses = false
csharp_space_between_square_brackets = false
# Blocks are allowed
csharp_prefer_braces = true:silent
csharp_preserve_single_line_blocks = true
csharp_preserve_single_line_statements = true
# Target-typed new expression
csharp_style_implicit_object_creation_when_type_is_apparent = true:suggestion
# Currently only enabled for C# due to crash in VB analyzer. VB can be enabled once
# https://github.com/dotnet/roslyn/pull/54259 has been published.
dotnet_style_allow_statement_immediately_after_block_experimental = false
dotnet_diagnostic.RCS0003.severity=warning
dotnet_diagnostic.RCS1036.severity=error
dotnet_diagnostic.IDE0005.severity=warning
dotnet_diagnostic.IDE0007.severity=error
csharp_using_directive_placement = outside_namespace:silent
csharp_prefer_simple_using_statement = true:suggestion
csharp_style_namespace_declarations = block_scoped:silent
csharp_style_expression_bodied_lambdas = true:silent
csharp_style_expression_bodied_local_functions = false:silent
csharp_style_prefer_null_check_over_type_check = true:suggestion
dotnet_diagnostic.RCS1075.severity = suggestion
[src/CodeStyle/**.{cs,vb}]
# warning RS0005: Do not use generic CodeAction.Create to create CodeAction
dotnet_diagnostic.RS0005.severity = none
[src/{Analyzers,CodeStyle,Features,Workspaces,EditorFeatures,VisualStudio}/**/*.{cs,vb}]
# IDE0011: Add braces
csharp_prefer_braces = when_multiline:warning
# NOTE: We need the below severity entry for Add Braces due to https://github.com/dotnet/roslyn/issues/44201
dotnet_diagnostic.IDE0011.severity = warning
# IDE0040: Add accessibility modifiers
dotnet_diagnostic.IDE0040.severity = warning
# CONSIDER: Are IDE0051 and IDE0052 too noisy to be warnings for IDE editing scenarios? Should they be made build-only warnings?
# IDE0051: Remove unused private member
dotnet_diagnostic.IDE0051.severity = warning
# IDE0052: Remove unread private member
dotnet_diagnostic.IDE0052.severity = warning
# IDE0059: Unnecessary assignment to a value
dotnet_diagnostic.IDE0059.severity = warning
# IDE0060: Remove unused parameter
dotnet_diagnostic.IDE0060.severity = warning
# CA1012: Abstract types should not have public constructors
dotnet_diagnostic.CA1012.severity = warning
# CA1822: Make member static
dotnet_diagnostic.CA1822.severity = warning
# Prefer "var" everywhere
dotnet_diagnostic.IDE0007.severity = warning
csharp_style_var_for_built_in_types = true:warning
csharp_style_var_when_type_is_apparent = true:warning
csharp_style_var_elsewhere = true:warning
# dotnet_style_allow_multiple_blank_lines_experimental
dotnet_diagnostic.IDE2000.severity = warning
# csharp_style_allow_embedded_statements_on_same_line_experimental
dotnet_diagnostic.IDE2001.severity = warning
# csharp_style_allow_blank_lines_between_consecutive_braces_experimental
dotnet_diagnostic.IDE2002.severity = warning
# dotnet_style_allow_statement_immediately_after_block_experimental
dotnet_diagnostic.IDE2003.severity = warning
# csharp_style_allow_blank_line_after_colon_in_constructor_initializer_experimental
dotnet_diagnostic.IDE2004.severity = warning
[src/{VisualStudio}/**/*.{cs,vb}]
# CA1822: Make member static
# There is a risk of accidentally breaking an internal API that partners rely on though IVT.
dotnet_code_quality.CA1822.api_surface = private

View File

@ -1,379 +0,0 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
runtimes
**/*nuget
*.zip
include/
*.exp
*.lib
*.dll
# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates
# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs
# Mono auto generated files
mono_crash.*
Tests/**/launchSettings.json
# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Ww][Ii][Nn]32/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Oo]ut/
[Ll]og/
[Ll]ogs/
# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/
# Visual Studio 2017 auto generated files
Generated\ Files/
# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*
# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml
# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c
# Benchmark Results
BenchmarkDotNet.Artifacts/
# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/
# ASP.NET Scaffolding
ScaffoldingReadMe.txt
# StyleCop
StyleCopReport.xml
# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc
# Chutzpah Test files
_Chutzpah*
# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb
# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap
# Visual Studio Trace Files
*.e2e
# TFS 2012 Local Workspace
$tf/
# Guidance Automation Toolkit
*.gpState
# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user
# TeamCity is a build add-in
_TeamCity*
# DotCover is a Code Coverage Tool
*.dotCover
# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json
# Coverlet is a free, cross platform Code Coverage Tool
coverage*.json
coverage*.xml
coverage*.info
# Visual Studio code coverage results
*.coverage
*.coveragexml
# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*
# MightyMoose
*.mm.*
AutoTest.Net/
# Web workbench (sass)
.sass-cache/
# Installshield output folder
[Ee]xpress/
# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html
# Click-Once directory
publish/
# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj
# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/
# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets
# Microsoft Azure Build Output
csx/
*.build.csdef
# Microsoft Azure Emulator
ecf/
rcf/
# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload
# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/
# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs
# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk
# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/
# RIA/Silverlight projects
Generated_Code/
# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak
# SQL Server files
*.mdf
*.ldf
*.ndf
# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl
# Microsoft Fakes
FakesAssemblies/
# GhostDoc plugin setting file
*.GhostDoc.xml
# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/
# Visual Studio 6 build log
*.plg
# Visual Studio 6 workspace options file
*.opt
# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw
# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions
# Paket dependency manager
.paket/paket.exe
paket-files/
# FAKE - F# Make
.fake/
# CodeRush personal settings
.cr/personal
# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc
# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config
# Tabs Studio
*.tss
# Telerik's JustMock configuration file
*.jmconfig
# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs
# OpenCover UI analysis results
OpenCover/
# Azure Stream Analytics local run output
ASALocalRun/
# MSBuild Binary and Structured Log
*.binlog
# NVidia Nsight GPU debugger configuration file
*.nvuser
# MFractors (Xamarin productivity tool) working folder
.mfractor/
# Local History for Visual Studio
.localhistory/
# BeatPulse healthcheck temp database
healthchecksdb
# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/
# Ionide (cross platform F# VS Code tools) working folder
.ionide/
# Fody - auto-generated XML schema
FodyWeavers.xsd
# JetBrains Rider
.idea
# Visual Studio Code
.vscode

View File

@ -1,44 +0,0 @@
<?xml version="1.0" encoding="utf-8"?>
<Project>
<PropertyGroup>
<Company></Company>
<Copyright></Copyright>
<NeutralLanguage>en-US</NeutralLanguage>
<Version>0.6.4-alpha</Version>
<VersionSuffix>$(VersionSuffix)</VersionSuffix>
<Version Condition=" '$(VersionSuffix)' != '' ">$(Version)$(VersionSuffix)</Version>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<RepositoryUrl></RepositoryUrl>
<RepositoryType>git</RepositoryType>
<IncludeSymbols>true</IncludeSymbols>
<IncludeSource>true</IncludeSource>
<AnalysisLevel>latest-minimum</AnalysisLevel>
<EnforceCodeStyleInBuild>true</EnforceCodeStyleInBuild>
</PropertyGroup>
<ItemGroup>
<Using Include="System"/>
</ItemGroup>
<PropertyGroup>
<LangVersion>preview</LangVersion>
<Features>strict</Features>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Roslynator.Analyzers" Version="4.2.0">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
</PackageReference>
<PackageReference Include="Roslynator.CodeAnalysis.Analyzers" Version="4.2.0">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
</PackageReference>
<PackageReference Include="Roslynator.Formatting.Analyzers" Version="4.2.0">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
</PackageReference>
</ItemGroup>
</Project>

View File

@ -1,33 +0,0 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<OutputType>Exe</OutputType>
<TargetFramework>net8.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
</PropertyGroup>
<ItemGroup>
<ProjectReference Include="..\Gpt4All\Gpt4All.csproj" />
</ItemGroup>
<ItemGroup>
<!-- Windows -->
<None Include="..\runtimes\win-x64\native\*.dll" Pack="true" PackagePath="runtimes\win-x64\native\%(Filename)%(Extension)" />
<!-- Linux -->
<None Include="..\runtimes\linux-x64\native\*.so" Pack="true" PackagePath="runtimes\linux-x64\native\%(Filename)%(Extension)" />
<!-- MacOS -->
<None Include="..\runtimes\osx\native\*.dylib" Pack="true" PackagePath="runtimes\osx\native\%(Filename)%(Extension)" />
</ItemGroup>
<ItemGroup>
<!-- Windows -->
<None Condition="$([MSBuild]::IsOSPlatform('Windows'))" Include="..\runtimes\win-x64\native\*.dll" Visible="False" CopyToOutputDirectory="PreserveNewest" />
<!-- Linux -->
<None Condition="$([MSBuild]::IsOSPlatform('Linux'))" Include="..\runtimes\linux-x64\native\*.so" Visible="False" CopyToOutputDirectory="PreserveNewest" />
<!-- MacOS -->
<None Condition="$([MSBuild]::IsOSPlatform('OSX'))" Include="..\runtimes\osx\native\*.dylib" Visible="False" CopyToOutputDirectory="PreserveNewest" />
<Content Condition="$([MSBuild]::IsOSPlatform('OSX'))" Include="..\runtimes\osx\native\*.metal" Visible="False" CopyToOutputDirectory="PreserveNewest" />
</ItemGroup>
</Project>

View File

@ -1,22 +0,0 @@
using Gpt4All;
var modelFactory = new Gpt4AllModelFactory();
if (args.Length < 2)
{
Console.WriteLine($"Usage: Gpt4All.Samples <model-path> <prompt>");
return;
}
var modelPath = args[0];
var prompt = args[1];
using var model = modelFactory.LoadModel(modelPath);
var result = await model.GetStreamingPredictionAsync(
prompt,
PredictRequestOptions.Defaults);
await foreach (var token in result.GetPredictionStreamingAsync())
{
Console.Write(token);
}

View File

@ -1,9 +0,0 @@
namespace Gpt4All.Tests;
public static class Constants
{
public const string MODELS_BASE_DIR = "../../../models";
public const string LLAMA_MODEL_PATH = $"{MODELS_BASE_DIR}/ggml-gpt4all-l13b-snoozy.bin";
public const string GPTJ_MODEL_PATH = $"{MODELS_BASE_DIR}/ggml-gpt4all-j-v1.3-groovy.bin";
public const string MPT_MODEL_PATH = $"{MODELS_BASE_DIR}/ggml-mpt-7b-chat.bin";
}

View File

@ -1,60 +0,0 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
<Nullable>enable</Nullable>
<IsPackable>false</IsPackable>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
</PropertyGroup>
<ItemGroup>
<PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.6.2" />
<PackageReference Include="xunit" Version="2.4.2" />
<PackageReference Include="xunit.runner.visualstudio" Version="2.4.5">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<PrivateAssets>all</PrivateAssets>
</PackageReference>
<PackageReference Include="coverlet.collector" Version="6.0.0">
<IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
<PrivateAssets>all</PrivateAssets>
</PackageReference>
</ItemGroup>
<ItemGroup>
<ProjectReference Include="..\Gpt4All\Gpt4All.csproj" />
</ItemGroup>
<ItemGroup>
<!-- Windows -->
<None Include="..\runtimes\win-x64\native\*.dll" Pack="true" PackagePath="runtimes\win-x64\native\%(Filename)%(Extension)" />
<!-- Linux -->
<None Include="..\runtimes\linux-x64\native\*.so" Pack="true" PackagePath="runtimes\linux-x64\native\%(Filename)%(Extension)" />
<!-- MacOS -->
<None Include="..\runtimes\osx\native\*.dylib" Pack="true" PackagePath="runtimes\osx\native\%(Filename)%(Extension)" />
</ItemGroup>
<ItemGroup>
<!-- Windows -->
<None Condition="$([MSBuild]::IsOSPlatform('Windows'))" Include="..\runtimes\win-x64\native\*.dll" Visible="False" CopyToOutputDirectory="PreserveNewest" />
<!-- Linux -->
<None Condition="$([MSBuild]::IsOSPlatform('Linux'))" Include="..\runtimes\linux-x64\native\*.so" Visible="False" CopyToOutputDirectory="PreserveNewest" />
<!-- MacOS -->
<None Condition="$([MSBuild]::IsOSPlatform('OSX'))" Include="..\runtimes\osx\native\*.dylib" Visible="False" CopyToOutputDirectory="PreserveNewest" />
</ItemGroup>
<ItemGroup>
<PackageReference Update="Roslynator.Analyzers" Version="4.3.0">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
</PackageReference>
<PackageReference Update="Roslynator.CodeAnalysis.Analyzers" Version="4.3.0">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
</PackageReference>
<PackageReference Update="Roslynator.Formatting.Analyzers" Version="4.3.0">
<PrivateAssets>all</PrivateAssets>
<IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
</PackageReference>
</ItemGroup>
</Project>

View File

@ -1,34 +0,0 @@
using Xunit;
namespace Gpt4All.Tests;
public class ModelFactoryTests
{
private readonly Gpt4AllModelFactory _modelFactory;
public ModelFactoryTests()
{
_modelFactory = new Gpt4AllModelFactory();
}
[Fact]
[Trait(Traits.SkipOnCI, "True")]
public void CanLoadLlamaModel()
{
using var model = _modelFactory.LoadModel(Constants.LLAMA_MODEL_PATH);
}
[Fact]
[Trait(Traits.SkipOnCI, "True")]
public void CanLoadGptjModel()
{
using var model = _modelFactory.LoadModel(Constants.GPTJ_MODEL_PATH);
}
[Fact]
[Trait(Traits.SkipOnCI, "True")]
public void CanLoadMptModel()
{
using var model = _modelFactory.LoadModel(Constants.MPT_MODEL_PATH);
}
}

View File

@ -1,56 +0,0 @@
using System.IO;
using Gpt4All.LibraryLoader;
using Xunit;
namespace Gpt4All.Tests;
public class NativeLibraryLoaderTests
{
[Fact]
public void NativeLibraryShouldLoad()
{
var result = NativeLibraryLoader.LoadNativeLibrary(bypassLoading: false);
Assert.True(result.IsSuccess);
}
private const string LLModelLib = "libllmodel.{0}";
[PlatformSpecificFact(Platforms.Windows)]
public void NativeLibraryShouldLoad_Windows()
{
var libraryLoader = new WindowsLibraryLoader();
var libraryPath = Path.Combine(
Environment.CurrentDirectory,
string.Format(LLModelLib, "dll"));
var result = libraryLoader.OpenLibrary(libraryPath);
Assert.True(result.IsSuccess);
}
[PlatformSpecificFact(Platforms.Linux)]
public void NativeLibraryShouldLoad_Linux()
{
var libraryLoader = new LinuxLibraryLoader();
var libraryPath = Path.Combine(
Environment.CurrentDirectory,
string.Format(LLModelLib, "so"));
var result = libraryLoader.OpenLibrary(libraryPath);
Assert.True(result.IsSuccess);
}
[PlatformSpecificFact(Platforms.MacOS)]
public void NativeLibraryShouldLoad_MacOS()
{
var libraryLoader = new MacOsLibraryLoader();
var libraryPath = Path.Combine(
Environment.CurrentDirectory,
string.Format(LLModelLib, "dylib"));
var result = libraryLoader.OpenLibrary(libraryPath);
Assert.True(result.IsSuccess);
}
}

View File

@ -1,27 +0,0 @@
using Xunit;
namespace Gpt4All.Tests;
public static class Platforms
{
public const string Windows = "windows";
public const string Linux = "linux";
public const string MacOS = "macOS";
}
/// <summary>
/// This attribute ensures the Fact is only run on the specified platform.
/// </summary>
/// <remarks>
/// <see cref="OperatingSystem.IsOSPlatform(string)"/> for info about the platform string.
/// </remarks>
public class PlatformSpecificFactAttribute : FactAttribute
{
public PlatformSpecificFactAttribute(string platform)
{
if (!OperatingSystem.IsOSPlatform(platform))
{
Skip = $"Test only runs on {platform}.";
}
}
}

View File

@ -1,6 +0,0 @@
namespace Gpt4All.Tests;
public static class Traits
{
public const string SkipOnCI = "SKIP_ON_CI";
}

View File

@ -1,47 +0,0 @@

Microsoft Visual Studio Solution File, Format Version 12.00
# Visual Studio Version 17
VisualStudioVersion = 17.5.33516.290
MinimumVisualStudioVersion = 10.0.40219.1
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Gpt4All.Samples", "Gpt4All.Samples\Gpt4All.Samples.csproj", "{59864AE8-E45D-42F7-A7C0-1308EF185F39}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{DA396C11-CEAD-4368-8234-FB12255A30D2}"
ProjectSection(SolutionItems) = preProject
.gitignore = .gitignore
build_linux.sh = build_linux.sh
build_win-mingw.ps1 = build_win-mingw.ps1
build_win-msvc.ps1 = build_win-msvc.ps1
docs\gpt4all_csharp.md = docs\gpt4all_csharp.md
README.md = README.md
EndProjectSection
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Gpt4All", "Gpt4All\Gpt4All.csproj", "{6015C62B-2008-426B-A334-740D6F1FE38B}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Gpt4All.Tests", "Gpt4All.Tests\Gpt4All.Tests.csproj", "{33A72341-52C1-4EAE-878B-A98BC77F686A}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Release|Any CPU = Release|Any CPU
EndGlobalSection
GlobalSection(ProjectConfigurationPlatforms) = postSolution
{59864AE8-E45D-42F7-A7C0-1308EF185F39}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{59864AE8-E45D-42F7-A7C0-1308EF185F39}.Debug|Any CPU.Build.0 = Debug|Any CPU
{59864AE8-E45D-42F7-A7C0-1308EF185F39}.Release|Any CPU.ActiveCfg = Release|Any CPU
{59864AE8-E45D-42F7-A7C0-1308EF185F39}.Release|Any CPU.Build.0 = Release|Any CPU
{6015C62B-2008-426B-A334-740D6F1FE38B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{6015C62B-2008-426B-A334-740D6F1FE38B}.Debug|Any CPU.Build.0 = Debug|Any CPU
{6015C62B-2008-426B-A334-740D6F1FE38B}.Release|Any CPU.ActiveCfg = Release|Any CPU
{6015C62B-2008-426B-A334-740D6F1FE38B}.Release|Any CPU.Build.0 = Release|Any CPU
{33A72341-52C1-4EAE-878B-A98BC77F686A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{33A72341-52C1-4EAE-878B-A98BC77F686A}.Debug|Any CPU.Build.0 = Debug|Any CPU
{33A72341-52C1-4EAE-878B-A98BC77F686A}.Release|Any CPU.ActiveCfg = Release|Any CPU
{33A72341-52C1-4EAE-878B-A98BC77F686A}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {17632027-F4C2-4903-B88F-310CE3DE386B}
EndGlobalSection
EndGlobal

View File

@ -1,29 +0,0 @@
namespace Gpt4All.Bindings;
/// <summary>
/// Represents the interface exposed by the universal wrapper for GPT4All language models built around the llmodel C API.
/// </summary>
public interface ILLModel : IDisposable
{
ulong GetStateSizeBytes();
int GetThreadCount();
void SetThreadCount(int threadCount);
bool IsLoaded();
bool Load(string modelPath);
void Prompt(
string text,
LLModelPromptContext context,
Func<ModelPromptEventArgs, bool>? promptCallback = null,
Func<ModelResponseEventArgs, bool>? responseCallback = null,
Func<ModelRecalculatingEventArgs, bool>? recalculateCallback = null,
CancellationToken cancellationToken = default);
unsafe ulong RestoreStateData(byte* destination);
unsafe ulong SaveStateData(byte* source);
}

View File

@ -1,212 +0,0 @@
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
namespace Gpt4All.Bindings;
/// <summary>
/// Arguments for the response processing callback
/// </summary>
/// <param name="TokenId">The token id of the response</param>
/// <param name="Response"> The response string. NOTE: a token_id of -1 indicates the string is an error string</param>
/// <return>
/// A bool indicating whether the model should keep generating
/// </return>
public record ModelResponseEventArgs(int TokenId, string Response)
{
public bool IsError => TokenId == -1;
}
/// <summary>
/// Arguments for the prompt processing callback
/// </summary>
/// <param name="TokenId">The token id of the prompt</param>
/// <return>
/// A bool indicating whether the model should keep processing
/// </return>
public record ModelPromptEventArgs(int TokenId)
{
}
/// <summary>
/// Arguments for the recalculating callback
/// </summary>
/// <param name="IsRecalculating"> whether the model is recalculating the context.</param>
/// <return>
/// A bool indicating whether the model should keep generating
/// </return>
public record ModelRecalculatingEventArgs(bool IsRecalculating);
/// <summary>
/// Base class and universal wrapper for GPT4All language models built around the llmodel C API.
/// </summary>
public class LLModel : ILLModel
{
protected readonly IntPtr _handle;
private readonly ILogger _logger;
private bool _disposed;
internal LLModel(IntPtr handle, ILogger? logger = null)
{
_handle = handle;
_logger = logger ?? NullLogger.Instance;
}
/// <summary>
/// Create a new model from a pointer
/// </summary>
/// <param name="handle">Pointer to underlying model</param>
public static LLModel Create(IntPtr handle, ILogger? logger = null)
{
return new LLModel(handle, logger: logger);
}
/// <summary>
/// Generate a response using the model
/// </summary>
/// <param name="text">The input prompt</param>
/// <param name="context">The prompt context</param>
/// <param name="promptCallback">A callback function for handling the processing of the prompt</param>
/// <param name="responseCallback">A callback function for handling the generated response</param>
/// <param name="recalculateCallback">A callback function for handling recalculation requests</param>
/// <param name="cancellationToken"></param>
public void Prompt(
string text,
LLModelPromptContext context,
Func<ModelPromptEventArgs, bool>? promptCallback = null,
Func<ModelResponseEventArgs, bool>? responseCallback = null,
Func<ModelRecalculatingEventArgs, bool>? recalculateCallback = null,
CancellationToken cancellationToken = default)
{
GC.KeepAlive(promptCallback);
GC.KeepAlive(responseCallback);
GC.KeepAlive(recalculateCallback);
GC.KeepAlive(cancellationToken);
_logger.LogInformation("Prompt input='{Prompt}' ctx={Context}", text, context.Dump());
NativeMethods.llmodel_prompt(
_handle,
text,
(tokenId) =>
{
if (cancellationToken.IsCancellationRequested) return false;
if (promptCallback == null) return true;
var args = new ModelPromptEventArgs(tokenId);
return promptCallback(args);
},
(tokenId, response) =>
{
if (cancellationToken.IsCancellationRequested)
{
_logger.LogDebug("ResponseCallback evt=CancellationRequested");
return false;
}
if (responseCallback == null) return true;
var args = new ModelResponseEventArgs(tokenId, response);
return responseCallback(args);
},
(isRecalculating) =>
{
if (cancellationToken.IsCancellationRequested) return false;
if (recalculateCallback == null) return true;
var args = new ModelRecalculatingEventArgs(isRecalculating);
return recalculateCallback(args);
},
ref context.UnderlyingContext
);
}
/// <summary>
/// Set the number of threads to be used by the model.
/// </summary>
/// <param name="threadCount">The new thread count</param>
public void SetThreadCount(int threadCount)
{
NativeMethods.llmodel_setThreadCount(_handle, threadCount);
}
/// <summary>
/// Get the number of threads used by the model.
/// </summary>
/// <returns>the number of threads used by the model</returns>
public int GetThreadCount()
{
return NativeMethods.llmodel_threadCount(_handle);
}
/// <summary>
/// Get the size of the internal state of the model.
/// </summary>
/// <remarks>
/// This state data is specific to the type of model you have created.
/// </remarks>
/// <returns>the size in bytes of the internal state of the model</returns>
public ulong GetStateSizeBytes()
{
return NativeMethods.llmodel_get_state_size(_handle);
}
/// <summary>
/// Saves the internal state of the model to the specified destination address.
/// </summary>
/// <param name="source">A pointer to the src</param>
/// <returns>The number of bytes copied</returns>
public unsafe ulong SaveStateData(byte* source)
{
return NativeMethods.llmodel_save_state_data(_handle, source);
}
/// <summary>
/// Restores the internal state of the model using data from the specified address.
/// </summary>
/// <param name="destination">A pointer to destination</param>
/// <returns>the number of bytes read</returns>
public unsafe ulong RestoreStateData(byte* destination)
{
return NativeMethods.llmodel_restore_state_data(_handle, destination);
}
/// <summary>
/// Check if the model is loaded.
/// </summary>
/// <returns>true if the model was loaded successfully, false otherwise.</returns>
public bool IsLoaded()
{
return NativeMethods.llmodel_isModelLoaded(_handle);
}
/// <summary>
/// Load the model from a file.
/// </summary>
/// <param name="modelPath">The path to the model file.</param>
/// <returns>true if the model was loaded successfully, false otherwise.</returns>
public bool Load(string modelPath)
{
return NativeMethods.llmodel_loadModel(_handle, modelPath, 2048, 100);
}
protected void Destroy()
{
NativeMethods.llmodel_model_destroy(_handle);
}
protected virtual void Dispose(bool disposing)
{
if (_disposed) return;
if (disposing)
{
// dispose managed state
}
Destroy();
_disposed = true;
}
public void Dispose()
{
Dispose(disposing: true);
GC.SuppressFinalize(this);
}
}

View File

@ -1,147 +0,0 @@
namespace Gpt4All.Bindings;
/// <summary>
/// Wrapper around the llmodel_prompt_context structure for holding the prompt context.
/// </summary>
/// <remarks>
/// The implementation takes care of all the memory handling of the raw logits pointer and the
/// raw tokens pointer. Attempting to resize them or modify them in any way can lead to undefined behavior.
/// </remarks>
public unsafe class LLModelPromptContext
{
private llmodel_prompt_context _ctx;
internal ref llmodel_prompt_context UnderlyingContext => ref _ctx;
public LLModelPromptContext()
{
_ctx = new();
}
/// <summary>
/// logits of current context
/// </summary>
public Span<float> Logits => new(_ctx.logits, (int)_ctx.logits_size);
/// <summary>
/// the size of the raw logits vector
/// </summary>
public nuint LogitsSize
{
get => _ctx.logits_size;
set => _ctx.logits_size = value;
}
/// <summary>
/// current tokens in the context window
/// </summary>
public Span<int> Tokens => new(_ctx.tokens, (int)_ctx.tokens_size);
/// <summary>
/// the size of the raw tokens vector
/// </summary>
public nuint TokensSize
{
get => _ctx.tokens_size;
set => _ctx.tokens_size = value;
}
/// <summary>
/// top k logits to sample from
/// </summary>
public int TopK
{
get => _ctx.top_k;
set => _ctx.top_k = value;
}
/// <summary>
/// nucleus sampling probability threshold
/// </summary>
public float TopP
{
get => _ctx.top_p;
set => _ctx.top_p = value;
}
/// <summary>
/// min p sampling probability threshold
/// </summary>
public float MinP
{
get => _ctx.min_p;
set => _ctx.min_p = value;
}
/// <summary>
/// temperature to adjust model's output distribution
/// </summary>
public float Temperature
{
get => _ctx.temp;
set => _ctx.temp = value;
}
/// <summary>
/// number of tokens in past conversation
/// </summary>
public int PastNum
{
get => _ctx.n_past;
set => _ctx.n_past = value;
}
/// <summary>
/// batch size used for prompt processing (number of prompt tokens evaluated at once)
/// </summary>
public int Batches
{
get => _ctx.n_batch;
set => _ctx.n_batch = value;
}
/// <summary>
/// number of tokens to predict
/// </summary>
public int TokensToPredict
{
get => _ctx.n_predict;
set => _ctx.n_predict = value;
}
/// <summary>
/// penalty factor for repeated tokens
/// </summary>
public float RepeatPenalty
{
get => _ctx.repeat_penalty;
set => _ctx.repeat_penalty = value;
}
/// <summary>
/// last n tokens to penalize
/// </summary>
public int RepeatLastN
{
get => _ctx.repeat_last_n;
set => _ctx.repeat_last_n = value;
}
/// <summary>
/// number of tokens possible in context window
/// </summary>
public int ContextSize
{
get => _ctx.n_ctx;
set => _ctx.n_ctx = value;
}
/// <summary>
/// percent of context to erase if we exceed the context window
/// </summary>
public float ContextErase
{
get => _ctx.context_erase;
set => _ctx.context_erase = value;
}
}

View File

@ -1,112 +0,0 @@
using System.Runtime.InteropServices;
namespace Gpt4All.Bindings;
public unsafe partial struct llmodel_prompt_context
{
public float* logits;
[NativeTypeName("size_t")]
public nuint logits_size;
[NativeTypeName("int32_t *")]
public int* tokens;
[NativeTypeName("size_t")]
public nuint tokens_size;
[NativeTypeName("int32_t")]
public int n_past;
[NativeTypeName("int32_t")]
public int n_ctx;
[NativeTypeName("int32_t")]
public int n_predict;
[NativeTypeName("int32_t")]
public int top_k;
public float top_p;
public float min_p;
public float temp;
[NativeTypeName("int32_t")]
public int n_batch;
public float repeat_penalty;
[NativeTypeName("int32_t")]
public int repeat_last_n;
public float context_erase;
}
#pragma warning disable CA2101
internal static unsafe partial class NativeMethods
{
[UnmanagedFunctionPointer(CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.I1)]
public delegate bool LlmodelResponseCallback(int token_id, [MarshalAs(UnmanagedType.LPUTF8Str)] string response);
[UnmanagedFunctionPointer(CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.I1)]
public delegate bool LlmodelPromptCallback(int token_id);
[UnmanagedFunctionPointer(CallingConvention.Cdecl)]
[return: MarshalAs(UnmanagedType.I1)]
public delegate bool LlmodelRecalculateCallback(bool isRecalculating);
[DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true, BestFitMapping = false, ThrowOnUnmappableChar = true)]
[return: NativeTypeName("llmodel_model")]
public static extern IntPtr llmodel_model_create2(
[NativeTypeName("const char *")][MarshalAs(UnmanagedType.LPUTF8Str)] string model_path,
[NativeTypeName("const char *")][MarshalAs(UnmanagedType.LPUTF8Str)] string build_variant,
out IntPtr error);
[DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]
public static extern void llmodel_model_destroy([NativeTypeName("llmodel_model")] IntPtr model);
[DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true, BestFitMapping = false, ThrowOnUnmappableChar = true)]
[return: MarshalAs(UnmanagedType.I1)]
public static extern bool llmodel_loadModel(
[NativeTypeName("llmodel_model")] IntPtr model,
[NativeTypeName("const char *")][MarshalAs(UnmanagedType.LPUTF8Str)] string model_path,
[NativeTypeName("int32_t")] int n_ctx,
[NativeTypeName("int32_t")] int ngl);
[DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]
[return: MarshalAs(UnmanagedType.I1)]
public static extern bool llmodel_isModelLoaded([NativeTypeName("llmodel_model")] IntPtr model);
[DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]
[return: NativeTypeName("uint64_t")]
public static extern ulong llmodel_get_state_size([NativeTypeName("llmodel_model")] IntPtr model);
[DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]
[return: NativeTypeName("uint64_t")]
public static extern ulong llmodel_save_state_data([NativeTypeName("llmodel_model")] IntPtr model, [NativeTypeName("uint8_t *")] byte* dest);
[DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]
[return: NativeTypeName("uint64_t")]
public static extern ulong llmodel_restore_state_data([NativeTypeName("llmodel_model")] IntPtr model, [NativeTypeName("const uint8_t *")] byte* src);
[DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true, BestFitMapping = false, ThrowOnUnmappableChar = true)]
public static extern void llmodel_prompt(
[NativeTypeName("llmodel_model")] IntPtr model,
[NativeTypeName("const char *")][MarshalAs(UnmanagedType.LPUTF8Str)] string prompt,
LlmodelPromptCallback prompt_callback,
LlmodelResponseCallback response_callback,
LlmodelRecalculateCallback recalculate_callback,
ref llmodel_prompt_context ctx);
[DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]
public static extern void llmodel_setThreadCount([NativeTypeName("llmodel_model")] IntPtr model, [NativeTypeName("int32_t")] int n_threads);
[DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]
[return: NativeTypeName("int32_t")]
public static extern int llmodel_threadCount([NativeTypeName("llmodel_model")] IntPtr model);
}
#pragma warning restore CA2101
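
For orientation, a minimal sketch (not part of this diff) of exercising these raw bindings directly; it must live inside the Gpt4All assembly because NativeMethods is internal, and the model path is a placeholder:

using System;
using System.Runtime.InteropServices;
using Gpt4All.Bindings;

internal static class RawBindingSmokeTest
{
    // Creates, loads, queries, and destroys a model via the P/Invoke surface above.
    internal static void Run(string modelPath)
    {
        IntPtr model = NativeMethods.llmodel_model_create2(modelPath, "auto", out IntPtr error);
        if (error != IntPtr.Zero)
            throw new InvalidOperationException(Marshal.PtrToStringAnsi(error));

        if (!NativeMethods.llmodel_loadModel(model, modelPath, 2048, 100))
            throw new InvalidOperationException($"Failed to load '{modelPath}'");

        NativeMethods.llmodel_setThreadCount(model, Environment.ProcessorCount);
        Console.WriteLine($"loaded={NativeMethods.llmodel_isModelLoaded(model)} threads={NativeMethods.llmodel_threadCount(model)}");

        NativeMethods.llmodel_model_destroy(model);
    }
}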

View File

@@ -1,21 +0,0 @@
using System.Diagnostics;
namespace Gpt4All.Bindings;
/// <summary>Defines the type of a member as it was used in the native signature.</summary>
[AttributeUsage(AttributeTargets.Struct | AttributeTargets.Enum | AttributeTargets.Property | AttributeTargets.Field | AttributeTargets.Parameter | AttributeTargets.ReturnValue, AllowMultiple = false, Inherited = true)]
[Conditional("DEBUG")]
internal sealed partial class NativeTypeNameAttribute : Attribute
{
private readonly string _name;
/// <summary>Initializes a new instance of the <see cref="NativeTypeNameAttribute" /> class.</summary>
/// <param name="name">The name of the type that was used in the native signature.</param>
public NativeTypeNameAttribute(string name)
{
_name = name;
}
/// <summary>Gets the name of the type that was used in the native signature.</summary>
public string Name => _name;
}
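
A small illustration (hypothetical struct, not from the repository) of how the attribute is applied: it only documents the original C type, and because of [Conditional("DEBUG")] the annotation is omitted from release builds:

using Gpt4All.Bindings;

internal struct ExampleNativeStruct
{
    [NativeTypeName("int32_t")]
    public int n_threads;       // 32-bit signed integer in the native header

    [NativeTypeName("size_t")]
    public nuint buffer_size;   // pointer-sized unsigned integer in the native header
}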

View File

@@ -1,27 +0,0 @@
using Gpt4All.Bindings;
namespace Gpt4All;
internal static class LLPromptContextExtensions
{
public static string Dump(this LLModelPromptContext context)
{
var ctx = context.UnderlyingContext;
return @$"
{{
logits_size = {ctx.logits_size}
tokens_size = {ctx.tokens_size}
n_past = {ctx.n_past}
n_ctx = {ctx.n_ctx}
n_predict = {ctx.n_predict}
top_k = {ctx.top_k}
top_p = {ctx.top_p}
min_p = {ctx.min_p}
temp = {ctx.temp}
n_batch = {ctx.n_batch}
repeat_penalty = {ctx.repeat_penalty}
repeat_last_n = {ctx.repeat_last_n}
context_erase = {ctx.context_erase}
}}";
}
}
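
A hedged usage sketch (illustrative only; the extension class is internal, so this assumes code inside the Gpt4All assembly): Dump() is convenient for tracing the effective sampling parameters before a prompt call.

using Gpt4All;
using Gpt4All.Bindings;
using Microsoft.Extensions.Logging;

internal static class PromptTracing
{
    internal static void LogContext(ILogger logger, LLModelPromptContext context)
    {
        // Renders logits_size, tokens_size, n_ctx, top_k/top_p/min_p, temp, etc.
        logger.LogDebug("Prompt context: {PromptContext}", context.Dump());
    }
}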

View File

@@ -1,26 +0,0 @@
using Gpt4All.Bindings;
namespace Gpt4All;
public static class PredictRequestOptionsExtensions
{
public static LLModelPromptContext ToPromptContext(this PredictRequestOptions opts)
{
return new LLModelPromptContext
{
LogitsSize = opts.LogitsSize,
TokensSize = opts.TokensSize,
TopK = opts.TopK,
TopP = opts.TopP,
MinP = opts.MinP,
PastNum = opts.PastConversationTokensNum,
RepeatPenalty = opts.RepeatPenalty,
Temperature = opts.Temperature,
RepeatLastN = opts.RepeatLastN,
Batches = opts.Batches,
ContextErase = opts.ContextErase,
ContextSize = opts.ContextSize,
TokensToPredict = opts.TokensToPredict
};
}
}
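
A hedged usage sketch; PredictRequestOptions itself is not shown in this diff, so the object-initializer syntax and the property values below are assumptions for illustration:

using Gpt4All;
using Gpt4All.Bindings;

var opts = new PredictRequestOptions
{
    Temperature = 0.7f,    // hypothetical values
    TopK = 40,
    TokensToPredict = 256,
};
LLModelPromptContext context = opts.ToPromptContext();
// context now carries the same settings in the shape expected by llmodel_prompt.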

View File

@@ -1,21 +0,0 @@
--config
exclude-funcs-with-body
--with-access-specifier
*=Public
--include-directory
..\..\..\gpt4all-backend\
--file
..\..\..\gpt4all-backend\llmodel_c.h
--libraryPath
libllmodel
--remap
sbyte*=IntPtr
void*=IntPtr
--namespace
Gpt4All.Bindings
--methodClassName
NativeMethods
--output
.\Bindings\NativeMethods.cs
--output-mode
CSharp
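
These switches form a response file for ClangSharpPInvokeGenerator, which regenerates .\Bindings\NativeMethods.cs from llmodel_c.h. A hedged invocation sketch follows; the tool-install command reflects the public ClangSharp tooling, and the GenerateNativeMethods.rsp file name is an assumption (the real file name is not visible in this diff):

dotnet tool install --global ClangSharpPInvokeGenerator
ClangSharpPInvokeGenerator @GenerateNativeMethods.rsp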

View File

@@ -1,135 +0,0 @@
using System.Diagnostics;
using System.Runtime.CompilerServices;
using Gpt4All.Bindings;
using Microsoft.Extensions.Logging;
using Microsoft.Extensions.Logging.Abstractions;
[assembly: InternalsVisibleTo("Gpt4All.Tests")]
namespace Gpt4All;
public class Gpt4All : IGpt4AllModel
{
private readonly ILLModel _model;
private readonly ILogger _logger;
private const string ResponseErrorMessage =
"The model reported an error during token generation error={ResponseError}";
/// <inheritdoc/>
public IPromptFormatter? PromptFormatter { get; set; }
internal Gpt4All(ILLModel model, ILogger? logger = null)
{
_model = model;
_logger = logger ?? NullLogger.Instance;
PromptFormatter = new DefaultPromptFormatter();
}
private string FormatPrompt(string prompt)
{
if (PromptFormatter == null) return prompt;
return PromptFormatter.FormatPrompt(prompt);
}
public Task<ITextPredictionResult> GetPredictionAsync(string text, PredictRequestOptions opts, CancellationToken cancellationToken = default)
{
ArgumentNullException.ThrowIfNull(text);
return Task.Run(() =>
{
_logger.LogInformation("Start prediction task");
var sw = Stopwatch.StartNew();
var result = new TextPredictionResult();
var context = opts.ToPromptContext();
var prompt = FormatPrompt(text);
try
{
_model.Prompt(prompt, context, responseCallback: e =>
{
if (e.IsError)
{
_logger.LogWarning(ResponseErrorMessage, e.Response);
result.Success = false;
result.ErrorMessage = e.Response;
return false;
}
result.Append(e.Response);
return true;
}, cancellationToken: cancellationToken);
}
catch (Exception e)
{
_logger.LogError(e, "Prompt error");
result.Success = false;
}
sw.Stop();
_logger.LogInformation("Prediction task completed elapsed={Elapsed}s", sw.Elapsed.TotalSeconds);
return (ITextPredictionResult)result;
}, CancellationToken.None);
}
public Task<ITextPredictionStreamingResult> GetStreamingPredictionAsync(string text, PredictRequestOptions opts, CancellationToken cancellationToken = default)
{
ArgumentNullException.ThrowIfNull(text);
var result = new TextPredictionStreamingResult();
_ = Task.Run(() =>
{
_logger.LogInformation("Start streaming prediction task");
var sw = Stopwatch.StartNew();
try
{
var context = opts.ToPromptContext();
var prompt = FormatPrompt(text);
_model.Prompt(prompt, context, responseCallback: e =>
{
if (e.IsError)
{
_logger.LogWarning(ResponseErrorMessage, e.Response);
result.Success = false;
result.ErrorMessage = e.Response;
return false;
}
result.Append(e.Response);
return true;
}, cancellationToken: cancellationToken);
}
catch (Exception e)
{
_logger.LogError(e, "Prompt error");
result.Success = false;
}
finally
{
result.Complete();
sw.Stop();
_logger.LogInformation("Prediction task completed elapsed={Elapsed}s", sw.Elapsed.TotalSeconds);
}
}, CancellationToken.None);
return Task.FromResult((ITextPredictionStreamingResult)result);
}
protected virtual void Dispose(bool disposing)
{
if (disposing)
{
_model.Dispose();
}
}
public void Dispose()
{
Dispose(true);
GC.SuppressFinalize(this);
}
}
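
A hedged end-to-end sketch of the non-streaming path. The model file name is a placeholder, and constructing PredictRequestOptions with a parameterless initializer is an assumption, since its definition does not appear in this diff; the factory type is shown further down.

using Gpt4All;

var factory = new Gpt4AllModelFactory();
using IGpt4AllModel model = factory.LoadModel("model.gguf");   // placeholder path

var opts = new PredictRequestOptions();                        // assumed defaults
ITextPredictionResult result = await model.GetPredictionAsync("Name three planets.", opts);

if (result.Success)
    Console.WriteLine(await result.GetPredictionAsync());
else
    Console.WriteLine($"Prediction failed: {result.ErrorMessage}");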

View File

@@ -1,23 +0,0 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<AllowUnsafeBlocks>true</AllowUnsafeBlocks>
<GenerateDocumentationFile>true</GenerateDocumentationFile>
<TargetFramework>net8.0</TargetFramework>
</PropertyGroup>
<ItemGroup>
<!-- Windows -->
<None Include="..\runtimes\win-x64\native\*.dll" Pack="true" PackagePath="runtimes\win-x64\native\%(Filename)%(Extension)" />
<!-- Linux -->
<None Include="..\runtimes\linux-x64\native\*.so" Pack="true" PackagePath="runtimes\linux-x64\native\%(Filename)%(Extension)" />
<!-- MacOS -->
<None Include="..\runtimes\osx\native\*.dylib" Pack="true" PackagePath="runtimes\osx\native\%(Filename)%(Extension)" />
<Content Include="..\runtimes\osx\native\*.metal" Pack="true" PackagePath="contentFiles\any\any;content">
<PackageCopyToOutput>true</PackageCopyToOutput>
</Content>
</ItemGroup>
<ItemGroup>
<PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="7.0.0" />
</ItemGroup>
</Project>
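
For a consuming application, a hedged project-file sketch; the package ID and floating version are assumptions, since the published identifier is not stated in this diff:

<Project Sdk="Microsoft.NET.Sdk">
  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net8.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
  </PropertyGroup>
  <ItemGroup>
    <!-- Hypothetical reference to the package produced by the project above -->
    <PackageReference Include="Gpt4All" Version="*" />
  </ItemGroup>
</Project>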

View File

@@ -1,6 +0,0 @@
namespace Gpt4All.LibraryLoader;
public interface ILibraryLoader
{
LoadResult OpenLibrary(string? fileName);
}
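
A hedged sketch of a custom implementation registered through NativeLibraryLoader.SetLibraryLoader (shown later in this diff); delegating to .NET's built-in NativeLibrary API is an illustration, not the repository's approach:

using System.Runtime.InteropServices;
using Gpt4All.LibraryLoader;

public sealed class NetNativeLibraryLoader : ILibraryLoader
{
    public LoadResult OpenLibrary(string? fileName)
    {
        if (string.IsNullOrEmpty(fileName))
            return LoadResult.Failure("No native library path was provided");

        // NativeLibrary.TryLoad handles the platform-specific dlopen/LoadLibrary details.
        return NativeLibrary.TryLoad(fileName, out _)
            ? LoadResult.Success
            : LoadResult.Failure($"NativeLibrary.TryLoad failed for '{fileName}'");
    }
}

// Registration, before any model factory is constructed:
// NativeLibraryLoader.SetLibraryLoader(new NetNativeLibraryLoader());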

View File

@@ -1,53 +0,0 @@
using System.Runtime.InteropServices;
namespace Gpt4All.LibraryLoader;
internal class LinuxLibraryLoader : ILibraryLoader
{
#pragma warning disable CA2101
[DllImport("libdl.so", ExactSpelling = true, CharSet = CharSet.Auto, EntryPoint = "dlopen")]
#pragma warning restore CA2101
public static extern IntPtr NativeOpenLibraryLibdl(string? filename, int flags);
#pragma warning disable CA2101
[DllImport("libdl.so.2", ExactSpelling = true, CharSet = CharSet.Auto, EntryPoint = "dlopen")]
#pragma warning restore CA2101
public static extern IntPtr NativeOpenLibraryLibdl2(string? filename, int flags);
[DllImport("libdl.so", ExactSpelling = true, CharSet = CharSet.Auto, EntryPoint = "dlerror")]
public static extern IntPtr GetLoadError();
[DllImport("libdl.so.2", ExactSpelling = true, CharSet = CharSet.Auto, EntryPoint = "dlerror")]
public static extern IntPtr GetLoadError2();
public LoadResult OpenLibrary(string? fileName)
{
IntPtr loadedLib;
try
{
// open with the RTLD_LAZY flag (0x00001)
loadedLib = NativeOpenLibraryLibdl2(fileName, 0x00001);
}
catch (DllNotFoundException)
{
loadedLib = NativeOpenLibraryLibdl(fileName, 0x00001);
}
if (loadedLib == IntPtr.Zero)
{
string errorMessage;
try
{
errorMessage = Marshal.PtrToStringAnsi(GetLoadError2()) ?? "Unknown error";
}
catch (DllNotFoundException)
{
errorMessage = Marshal.PtrToStringAnsi(GetLoadError()) ?? "Unknown error";
}
return LoadResult.Failure(errorMessage);
}
return LoadResult.Success;
}
}

View File

@@ -1,20 +0,0 @@
namespace Gpt4All.LibraryLoader;
public class LoadResult
{
private LoadResult(bool isSuccess, string? errorMessage)
{
IsSuccess = isSuccess;
ErrorMessage = errorMessage;
}
public static LoadResult Success { get; } = new(true, null);
public static LoadResult Failure(string errorMessage)
{
return new(false, errorMessage);
}
public bool IsSuccess { get; }
public string? ErrorMessage { get; }
}

View File

@@ -1,28 +0,0 @@
using System.Runtime.InteropServices;
namespace Gpt4All.LibraryLoader;
internal class MacOsLibraryLoader : ILibraryLoader
{
#pragma warning disable CA2101
[DllImport("libdl.dylib", ExactSpelling = true, CharSet = CharSet.Auto, EntryPoint = "dlopen")]
#pragma warning restore CA2101
public static extern IntPtr NativeOpenLibraryLibdl(string? filename, int flags);
[DllImport("libdl.dylib", ExactSpelling = true, CharSet = CharSet.Auto, EntryPoint = "dlerror")]
public static extern IntPtr GetLoadError();
public LoadResult OpenLibrary(string? fileName)
{
var loadedLib = NativeOpenLibraryLibdl(fileName, 0x00001);
if (loadedLib == IntPtr.Zero)
{
var errorMessage = Marshal.PtrToStringAnsi(GetLoadError()) ?? "Unknown error";
return LoadResult.Failure(errorMessage);
}
return LoadResult.Success;
}
}

View File

@@ -1,81 +0,0 @@
#if !IOS && !MACCATALYST && !TVOS && !ANDROID
using System.Runtime.InteropServices;
#endif
namespace Gpt4All.LibraryLoader;
public static class NativeLibraryLoader
{
private static ILibraryLoader? defaultLibraryLoader;
/// <summary>
/// Sets the library loader used to load the native libraries. Override this only if you need custom loading behavior.
/// </summary>
/// <param name="libraryLoader">The library loader to be used.</param>
public static void SetLibraryLoader(ILibraryLoader libraryLoader)
{
defaultLibraryLoader = libraryLoader;
}
internal static LoadResult LoadNativeLibrary(string? path = default, bool bypassLoading = true)
{
// If the user has handled loading the library themselves, we don't need to do anything.
if (bypassLoading)
{
return LoadResult.Success;
}
var architecture = RuntimeInformation.OSArchitecture switch
{
Architecture.X64 => "x64",
Architecture.X86 => "x86",
Architecture.Arm => "arm",
Architecture.Arm64 => "arm64",
_ => throw new PlatformNotSupportedException(
$"Unsupported OS architecture: {RuntimeInformation.OSArchitecture}")
};
var (platform, extension) = Environment.OSVersion.Platform switch
{
_ when RuntimeInformation.IsOSPlatform(OSPlatform.Windows) => ("win", "dll"),
_ when RuntimeInformation.IsOSPlatform(OSPlatform.Linux) => ("linux", "so"),
_ when RuntimeInformation.IsOSPlatform(OSPlatform.OSX) => ("osx", "dylib"),
_ => throw new PlatformNotSupportedException(
$"Unsupported OS platform: {RuntimeInformation.OSDescription}")
};
// If the user hasn't set the path, we'll try to find it ourselves.
if (string.IsNullOrEmpty(path))
{
var libraryName = "libllmodel";
var assemblySearchPath = new[]
{
AppDomain.CurrentDomain.RelativeSearchPath,
Path.GetDirectoryName(typeof(NativeLibraryLoader).Assembly.Location),
Path.GetDirectoryName(Environment.GetCommandLineArgs()[0])
}.FirstOrDefault(it => !string.IsNullOrEmpty(it));
// Search for the library within the assembly search path; if it isn't found there, fall back to the default runtimes path.
path = Directory.EnumerateFiles(assemblySearchPath ?? string.Empty, $"{libraryName}.{extension}", SearchOption.AllDirectories).FirstOrDefault() ?? Path.Combine("runtimes", $"{platform}-{architecture}", $"{libraryName}.{extension}");
}
if (defaultLibraryLoader != null)
{
return defaultLibraryLoader.OpenLibrary(path);
}
if (!File.Exists(path))
{
throw new FileNotFoundException($"Native library not found at path {path}. " +
$"Verify that you have included the native Gpt4All library in your application.");
}
ILibraryLoader libraryLoader = platform switch
{
"win" => new WindowsLibraryLoader(),
"osx" => new MacOsLibraryLoader(),
"linux" => new LinuxLibraryLoader(),
_ => throw new PlatformNotSupportedException($"Currently {platform} platform is not supported")
};
return libraryLoader.OpenLibrary(path);
}
}

View File

@@ -1,24 +0,0 @@
using System.ComponentModel;
using System.Runtime.InteropServices;
namespace Gpt4All.LibraryLoader;
internal class WindowsLibraryLoader : ILibraryLoader
{
public LoadResult OpenLibrary(string? fileName)
{
var loadedLib = LoadLibrary(fileName);
if (loadedLib == IntPtr.Zero)
{
var errorCode = Marshal.GetLastWin32Error();
var errorMessage = new Win32Exception(errorCode).Message;
return LoadResult.Failure(errorMessage);
}
return LoadResult.Success;
}
[DllImport("kernel32", SetLastError = true, CharSet = CharSet.Auto)]
private static extern IntPtr LoadLibrary([MarshalAs(UnmanagedType.LPWStr)] string? lpFileName);
}

View File

@@ -1,16 +0,0 @@
namespace Gpt4All;
public class DefaultPromptFormatter : IPromptFormatter
{
public string FormatPrompt(string prompt)
{
return $"""
### Instruction:
The prompt below is a question to answer, a task to complete, or a conversation
to respond to; decide which and write an appropriate response.
### Prompt:
{prompt}
### Response:
""";
}
}

View File

@@ -1,62 +0,0 @@
using System.Diagnostics;
using Microsoft.Extensions.Logging.Abstractions;
using Microsoft.Extensions.Logging;
using Gpt4All.Bindings;
using Gpt4All.LibraryLoader;
using System.Runtime.InteropServices;
namespace Gpt4All;
public class Gpt4AllModelFactory : IGpt4AllModelFactory
{
private readonly ILoggerFactory _loggerFactory;
private readonly ILogger _logger;
private static bool bypassLoading;
private static string? libraryPath;
private static readonly Lazy<LoadResult> libraryLoaded = new(() =>
{
return NativeLibraryLoader.LoadNativeLibrary(Gpt4AllModelFactory.libraryPath, Gpt4AllModelFactory.bypassLoading);
}, true);
public Gpt4AllModelFactory(string? libraryPath = default, bool bypassLoading = true, ILoggerFactory? loggerFactory = null)
{
_loggerFactory = loggerFactory ?? NullLoggerFactory.Instance;
_logger = _loggerFactory.CreateLogger<Gpt4AllModelFactory>();
Gpt4AllModelFactory.libraryPath = libraryPath;
Gpt4AllModelFactory.bypassLoading = bypassLoading;
if (!libraryLoaded.Value.IsSuccess)
{
throw new Exception($"Failed to load native gpt4all library. Error: {libraryLoaded.Value.ErrorMessage}");
}
}
private Gpt4All CreateModel(string modelPath)
{
_logger.LogInformation("Creating model path={ModelPath}", modelPath);
IntPtr error;
var handle = NativeMethods.llmodel_model_create2(modelPath, "auto", out error);
if (error != IntPtr.Zero)
{
throw new Exception(Marshal.PtrToStringAnsi(error));
}
_logger.LogDebug("Model created handle=0x{ModelHandle:X8}", handle);
_logger.LogInformation("Model loading started");
var loadedSuccessfully = NativeMethods.llmodel_loadModel(handle, modelPath, 2048, 100);
_logger.LogInformation("Model loading completed success={ModelLoadSuccess}", loadedSuccessfully);
if (!loadedSuccessfully)
{
throw new Exception($"Failed to load model: '{modelPath}'");
}
var logger = _loggerFactory.CreateLogger<LLModel>();
var underlyingModel = LLModel.Create(handle, logger: logger);
Debug.Assert(underlyingModel.IsLoaded());
return new Gpt4All(underlyingModel, logger: logger);
}
public IGpt4AllModel LoadModel(string modelPath) => CreateModel(modelPath);
}
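
A hedged configuration sketch: the paths are placeholders, and LoggerFactory.Create with AddConsole assumes the calling application references the Microsoft.Extensions.Logging and Microsoft.Extensions.Logging.Console packages (the binding project itself only references the abstractions):

using Gpt4All;
using Microsoft.Extensions.Logging;

using var loggerFactory = LoggerFactory.Create(builder => builder.AddConsole());

// bypassLoading: false makes NativeLibraryLoader locate and dlopen/LoadLibrary libllmodel itself.
var factory = new Gpt4AllModelFactory(
    libraryPath: "runtimes/linux-x64/native/libllmodel.so",   // placeholder
    bypassLoading: false,
    loggerFactory: loggerFactory);

using var model = factory.LoadModel("model.gguf");            // placeholder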

View File

@@ -1,10 +0,0 @@
namespace Gpt4All;
public interface IGpt4AllModel : ITextPrediction, IDisposable
{
/// <summary>
/// The prompt formatter used to format the prompt before it is fed to the model.
/// If null, no transformation is applied.
/// </summary>
IPromptFormatter? PromptFormatter { get; set; }
}

View File

@@ -1,6 +0,0 @@
namespace Gpt4All;
public interface IGpt4AllModelFactory
{
IGpt4AllModel LoadModel(string modelPath);
}

View File

@@ -1,14 +0,0 @@
namespace Gpt4All;
/// <summary>
/// Formats a prompt
/// </summary>
public interface IPromptFormatter
{
/// <summary>
/// Format the provided prompt
/// </summary>
/// <param name="prompt">the input prompt</param>
/// <returns>The formatted prompt</returns>
string FormatPrompt(string prompt);
}
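
A hedged sketch of a custom formatter; this passthrough variant is illustrative and is equivalent to setting PromptFormatter to null on IGpt4AllModel, per the docs above:

using Gpt4All;

public sealed class PassthroughPromptFormatter : IPromptFormatter
{
    // Returns the prompt unchanged, bypassing the default instruction template.
    public string FormatPrompt(string prompt) => prompt;
}

// Usage:
// model.PromptFormatter = new PassthroughPromptFormatter();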

View File

@@ -1,6 +0,0 @@
namespace Gpt4All;
public record ModelOptions
{
public int Threads { get; init; } = 4;
}

View File

@@ -1,31 +0,0 @@
namespace Gpt4All;
/// <summary>
/// Interface for text prediction services
/// </summary>
public interface ITextPrediction
{
/// <summary>
/// Get prediction results for the prompt and provided options.
/// </summary>
/// <param name="text">The text to complete</param>
/// <param name="opts">The prediction settings</param>
/// <param name="cancellation">The <see cref="CancellationToken"/> for cancellation requests. The default is <see cref="CancellationToken.None"/>.</param>
/// <returns>The prediction result generated by the model</returns>
Task<ITextPredictionResult> GetPredictionAsync(
string text,
PredictRequestOptions opts,
CancellationToken cancellation = default);
/// <summary>
/// Get streaming prediction results for the prompt and provided options.
/// </summary>
/// <param name="text">The text to complete</param>
/// <param name="opts">The prediction settings</param>
/// <param name="cancellationToken">The <see cref="CancellationToken"/> for cancellation requests. The default is <see cref="CancellationToken.None"/>.</param>
/// <returns>The prediction result generated by the model</returns>
Task<ITextPredictionStreamingResult> GetStreamingPredictionAsync(
string text,
PredictRequestOptions opts,
CancellationToken cancellationToken = default);
}

View File

@@ -1,10 +0,0 @@
namespace Gpt4All;
public interface ITextPredictionResult
{
bool Success { get; }
string? ErrorMessage { get; }
Task<string> GetPredictionAsync(CancellationToken cancellationToken = default);
}

Some files were not shown because too many files have changed in this diff.