2025-06-25 00:03:34 -04:00
450 changed files with 23530 additions and 54781 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -1,17 +1,13 @@
 version: 2.1
 setup: true
 orbs:
-  path-filtering: circleci/path-filtering@1.3.0
+  path-filtering: circleci/path-filtering@0.0.1

 workflows:
  version: 2.1
  generate-config:
    jobs:
      - path-filtering/filter:
-          filters:
-            tags:
-              only:
-                - /.*/
          base-revision: main
          config-path: .circleci/continue_config.yml
          mapping: |
@ -19,4 +15,6 @@ workflows:
            gpt4all-backend/.* run-all-workflows true
            gpt4all-bindings/python/.* run-python-workflow true
            gpt4all-bindings/typescript/.* run-ts-workflow true
+            gpt4all-bindings/csharp/.* run-csharp-workflow true
            gpt4all-chat/.* run-chat-workflow true
+            .* run-default-workflow true
--- a/.circleci/continue_config.yml
+++ b/.circleci/continue_config.yml
--- a/.circleci/grab_notary_id.py
+++ b/.circleci/grab_notary_id.py
@ -1,17 +0,0 @@
-import re
-import sys
-
-ID_REG = r"id: (.*)"
-
-def main() -> None:
-    notary_log = sys.argv[1]
-    with open(notary_log, "r") as f:
-        notary_output = f.read()
-        id_m = re.search(ID_REG, notary_output)
-        if id_m:
-            print(id_m.group(1))
-        else:
-            raise RuntimeError("Unable to parse ID from notarization logs")
-
-if __name__ == "__main__":
-    main()
--- a/.codespellrc
+++ b/.codespellrc
@ -1,3 +1,3 @@
 [codespell]
-ignore-words-list = blong, afterall, assistent, crasher, requestor
-skip = ./.git,./gpt4all-chat/translations,*.pdf,*.svg,*.lock
+ignore-words-list = blong, afterall, som, assistent, crasher
+skip = .git,*.pdf,*.svg,*.lock
--- a/.github/workflows/codespell.yml
+++ b/.github/workflows/codespell.yml
@ -14,6 +14,6 @@ jobs:

    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
      - name: Codespell
        uses: codespell-project/actions-codespell@v2
--- a/.gitignore
+++ b/.gitignore
@ -181,8 +181,6 @@ CMakeLists.txt.user
 gpt4all-chat/models/*
 build_*
 build-*
-cmake-build-*
-/gpt4all-chat/tests/python/config.py

 # IntelliJ
 .idea/
--- a/.gitmodules
+++ b/.gitmodules
@ -1,25 +1,4 @@
 [submodule "llama.cpp-mainline"]
-	path = gpt4all-backend/deps/llama.cpp-mainline
+	path = gpt4all-backend/llama.cpp-mainline
 	url = https://github.com/nomic-ai/llama.cpp.git
 	branch = master
-[submodule "gpt4all-chat/usearch"]
-	path = gpt4all-chat/deps/usearch
-	url = https://github.com/nomic-ai/usearch.git
-[submodule "gpt4all-chat/deps/SingleApplication"]
-	path = gpt4all-chat/deps/SingleApplication
-	url = https://github.com/nomic-ai/SingleApplication.git
-[submodule "gpt4all-chat/deps/fmt"]
-	path = gpt4all-chat/deps/fmt
-	url = https://github.com/fmtlib/fmt.git
-[submodule "gpt4all-chat/deps/DuckX"]
-	path = gpt4all-chat/deps/DuckX
-	url = https://github.com/nomic-ai/DuckX.git
-[submodule "gpt4all-chat/deps/QXlsx"]
-	path = gpt4all-chat/deps/QXlsx
-	url = https://github.com/nomic-ai/QXlsx.git
-[submodule "gpt4all-chat/deps/minja"]
-	path = gpt4all-chat/deps/minja
-	url = https://github.com/nomic-ai/minja.git
-[submodule "gpt4all-chat/deps/json"]
-	path = gpt4all-chat/deps/json
-	url = https://github.com/nlohmann/json.git
--- a/MAINTAINERS.md
+++ b/MAINTAINERS.md
@ -1,77 +0,0 @@
-# MAINTAINERS
-
-## Rules
-
-* All content inside GPT4All shall have a documented maintainer
-* If a maintainer decides to retire or resign a call for volunteers will go
-  out
-* If no further maintainer can be found in a reasonable time frame, then the
-  content will be marked deprecated and removed in time
-
-## Job
-
-Maintainers will be...
-
-1. Responsible for overseeing content under their stewardship
-2. Responsible for triaging new issues, reviewing PRs, assigning priority
-   to tasks
-3. Responsible for keeping content in sufficient quality in a timely fashion
-
-## List
-
-Adam Treat ([@manyoso](https://github.com/manyoso))<br/>
-E-mail: adam@nomic.ai<br/>
-Discord: `@gonzochess75`
- Overall project maintainer
- Chat UI
-
-Jared Van Bortel ([@cebtenzzre](https://github.com/cebtenzzre))<br/>
-E-mail: jared@nomic.ai<br/>
-Discord: `@cebtenzzre`
- gpt4all-backend
- Python binding
- Python CLI app
-
-Jacob Nguyen ([@jacoobes](https://github.com/jacoobes))<br/>
-Discord: `@jacoobes`<br/>
-E-mail: `jacoobes@sern.dev`
- TypeScript binding
-
-Dominik ([@cosmic-snow](https://github.com/cosmic-snow))<br/>
-E-mail: cosmic-snow@mailfence.com<br/>
-Discord: `@cosmic__snow`
- Community documentation (GitHub Wiki)
-
-Max Cembalest ([@mcembalest](https://github.com/mcembalest))<br/>
-E-mail: max@nomic.ai<br/>
-Discord: `@maxcembalest.`
- Official documentation (gpt4all-bindings/python/docs -> https://docs.gpt4all.io/)
-
-Thiago Ramos ([@thiagojramos](https://github.com/thiagojramos))<br/>
-E-mail: thiagojramos@outlook.com<br/>
- pt\_BR translation
-
-不知火 Shiranui ([@supersonictw](https://github.com/supersonictw))<br/>
-E-mail: supersonic@livemail.tw<br/>
-Discord: `@supersonictw`
- zh\_TW translation
-
-Jeremy Tayco ([@jstayco](https://github.com/jstayco))<br/>
-E-mail: jstayco@protonmail.ch<br/>
-Discord: `@vertana`
- es\_MX translation
-
-Riccardo Giovanetti ([@Harvester62](https://github.com/Harvester62))<br/>
-E-mail: riccardo.giovanetti@gmail.com<br/>
-Discord: `@harvester62`
- it\_IT translation
-
-Tim ([@Tim453](https://github.com/Tim453))<br/>
-E-mail: tim453@mailbox.org<br/>
-Discord: `@Tim453`
- Flatpak
-
-Jack ([@wuodoo](https://github.com/wuodoo))<br/>
-E-mail: 2296103047@qq.com<br/>
-Discord: `@mikage`
- zh\_CN translation
--- a/README.md
+++ b/README.md
@ -1,110 +1,48 @@
 <h1 align="center">GPT4All</h1>
-
+<p align="center">Privacy-oriented software for chatting with large language models that run on your own computer.</p>
 <p align="center">
-  Now with support for DeepSeek R1 Distillations
-</p>
-
-<p align="center">
-  <a href="https://www.nomic.ai/gpt4all">Website</a> &bull; <a href="https://docs.gpt4all.io">Documentation</a> &bull; <a href="https://discord.gg/mGZE39AS3e">Discord</a> &bull; <a href="https://www.youtube.com/watch?v=gQcZDXRVJok">YouTube Tutorial</a>
-</p>
-
-<p align="center">
-  GPT4All runs large language models (LLMs) privately on everyday desktops & laptops.
+  <a href="https://gpt4all.io">Official Website</a> &bull; <a href="https://docs.gpt4all.io">Documentation</a> &bull; <a href="https://discord.gg/mGZE39AS3e">Discord</a>
 </p>
 <p align="center">
-  No API calls or GPUs required - you can just download the application and <a href="https://docs.gpt4all.io/gpt4all_desktop/quickstart.html#quickstart">get started</a>.
-</p>
-
-<p align="center">
-  Read about what's new in <a href="https://www.nomic.ai/blog/tag/gpt4all">our blog</a>.
+  Official Download Links: <a href="https://gpt4all.io/installers/gpt4all-installer-win64.exe">Windows</a> &mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-darwin.dmg">macOS</a> &mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-linux.run">Ubuntu</a>
 </p>
 <p align="center">
-  <a href="https://nomic.ai/gpt4all/#newsletter-form">Subscribe to the newsletter</a>
+  <b>NEW:</b> <a href="https://forms.nomic.ai/gpt4all-release-notes-signup">Subscribe to our mailing list</a> for updates and news!
 </p>
-
-https://github.com/nomic-ai/gpt4all/assets/70534565/513a0f15-4964-4109-89e4-4f9a9011f311
-
 <p align="center">
 GPT4All is made possible by our compute partner <a href="https://www.paperspace.com/">Paperspace</a>.
 </p>
-
-## Download Links
-
-<p>
-  &mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-win64.exe">
-    <img src="gpt4all-bindings/python/docs/assets/windows.png" style="height: 1em; width: auto" /> Windows Installer
-  </a> &mdash;
-</p>
-<p>
-  &mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-win64-arm.exe">
-    <img src="gpt4all-bindings/python/docs/assets/windows.png" style="height: 1em; width: auto" /> Windows ARM Installer
-  </a> &mdash;
-</p>
-<p>
-  &mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-darwin.dmg">
-    <img src="gpt4all-bindings/python/docs/assets/mac.png" style="height: 1em; width: auto" /> macOS Installer
-  </a> &mdash;
-</p>
-<p>
-  &mdash; <a href="https://gpt4all.io/installers/gpt4all-installer-linux.run">
-    <img src="gpt4all-bindings/python/docs/assets/ubuntu.svg" style="height: 1em; width: auto" /> Ubuntu Installer
-  </a> &mdash;
-</p>
-<p>
-  The Windows and Linux builds require Intel Core i3 2nd Gen / AMD Bulldozer, or better.
-</p>
-<p>
-  The Windows ARM build supports Qualcomm Snapdragon and Microsoft SQ1/SQ2 processors.
-</p>
-<p>
-  The Linux build is x86-64 only (no ARM).
-</p>
-<p>
-  The macOS build requires Monterey 12.6 or newer. Best results with Apple Silicon M-series processors.
+<p align="center">
+ <a href="https://www.phorm.ai/query?projectId=755eecd3-24ad-49cc-abf4-0ab84caacf63"><img src="https://img.shields.io/badge/Phorm-Ask_AI-%23F2777A.svg" alt="phorm.ai"></a>
 </p>

-See the full [System Requirements](gpt4all-chat/system_requirements.md) for more details.
-
-<br/>
-<br/>
-<p>
-  <a href='https://flathub.org/apps/io.gpt4all.gpt4all'>
-    <img style="height: 2em; width: auto" alt='Get it on Flathub' src='https://flathub.org/api/badge'><br/>
-    Flathub (community maintained)
-  </a>
+<p align="center">
+  <img width="auto" height="400" src="https://github.com/nomic-ai/gpt4all/assets/14168726/495fce3e-769b-4e5a-a394-99f072ac4d29">
+</p>
+<p align="center">
+Run on an M2 MacBook Pro (not sped up!)
 </p>

-## Install GPT4All Python

-`gpt4all` gives you access to LLMs with our Python client around [`llama.cpp`](https://github.com/ggerganov/llama.cpp) implementations. 
+## About GPT4All

-Nomic contributes to open source software like [`llama.cpp`](https://github.com/ggerganov/llama.cpp) to make LLMs accessible and efficient **for all**.
+GPT4All is an ecosystem to run **powerful** and **customized** large language models that work locally on consumer grade CPUs and NVIDIA and AMD GPUs. Note that your CPU needs to support [AVX instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions).

-```bash
-pip install gpt4all
-```
+Learn more in the [documentation](https://docs.gpt4all.io).

-```python
-from gpt4all import GPT4All
-model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf") # downloads / loads a 4.66GB LLM
-with model.chat_session():
-    print(model.generate("How can I run LLMs efficiently on my laptop?", max_tokens=1024))
-```
+A GPT4All model is a 3GB - 8GB file that you can download and plug into the GPT4All software. **Nomic AI** supports and maintains this software ecosystem to enforce quality and security alongside spearheading the effort to allow any person or enterprise to easily deploy their own on-edge large language models.


-## Integrations
+### Installation

-:parrot::link: [Langchain](https://python.langchain.com/v0.2/docs/integrations/providers/gpt4all/)
-:card_file_box: [Weaviate Vector Database](https://github.com/weaviate/weaviate) - [module docs](https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules/text2vec-gpt4all)
-:telescope: [OpenLIT (OTel-native Monitoring)](https://github.com/openlit/openlit) - [Docs](https://docs.openlit.io/latest/integrations/gpt4all)
+The recommended way to install GPT4All is to use one of the online installers linked above in this README, which are also available at the [GPT4All website](https://gpt4all.io/). These require an internet connection at install time, are slightly easier to use on macOS due to code signing, and provide a version of GPT4All that can check for updates.

-## Release History
- **July 2nd, 2024**: V3.0.0 Release
-    - Fresh redesign of the chat application UI
-    - Improved user workflow for LocalDocs
-    - Expanded access to more model architectures
+An alternative way to install GPT4All is to use one of the offline installers available on the [Releases page](https://github.com/nomic-ai/gpt4all/releases). These do not require an internet connection at install time, and can be used to install an older version of GPT4All if so desired. But using these requires acknowledging a security warning on macOS, and they provide a version of GPT4All that is unable to notify you of updates, so you should enable notifications for Releases on this repository (Watch > Custom > Releases) or sign up for announcements in our [Discord server](https://discord.gg/mGZE39AS3e).
+
+
+### What's New
 - **October 19th, 2023**: GGUF Support Launches with Support for:
-    - Mistral 7b base model, an updated model gallery on our website, several new local code models including Rift Coder v1.5
+    - Mistral 7b base model, an updated model gallery on [gpt4all.io](https://gpt4all.io), several new local code models including Rift Coder v1.5
    - [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) support for Q4\_0 and Q4\_1 quantizations in GGUF.
    - Offline build support for running old versions of the GPT4All Local LLM Chat Client.
 - **September 18th, 2023**: [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) launches supporting local LLM inference on NVIDIA and AMD GPUs.
@ -113,6 +51,24 @@ with model.chat_session():

 [Docker-based API server]: https://github.com/nomic-ai/gpt4all/tree/cef74c2be20f5b697055d5b8b506861c7b997fab/gpt4all-api

+
+### Building From Source
+
+* Follow the instructions [here](gpt4all-chat/build_and_run.md) to build the GPT4All Chat UI from source.
+
+
+### Bindings
+
+* :snake: <a href="https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python">Official Python Bindings</a> [![Downloads](https://static.pepy.tech/badge/gpt4all/week)](https://pepy.tech/project/gpt4all)
+* :computer: <a href="https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/typescript">Typescript Bindings</a>
+
+
+### Integrations
+
+* :parrot::link: [Langchain](https://python.langchain.com/en/latest/modules/models/llms/integrations/gpt4all.html)
+* :card_file_box: [Weaviate Vector Database](https://github.com/weaviate/weaviate) - [module docs](https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules/text2vec-gpt4all)
+
+
 ## Contributing
 GPT4All welcomes contributions, involvement, and discussion from the open source community!
 Please see CONTRIBUTING.md and follow the issues, bug reports, and PR markdown templates.
@ -121,6 +77,74 @@ Check project discord, with project owners, or through existing issues/PRs to av
 Please make sure to tag all of the above with relevant project identifiers or your contribution could potentially get lost.
 Example tags: `backend`, `bindings`, `python-bindings`, `documentation`, etc.

+
+## GPT4All 2024 Roadmap
+To contribute to the development of any of the below roadmap items, make or find the corresponding issue and cross-reference the [in-progress task](https://github.com/orgs/nomic-ai/projects/2/views/1).
+
+Each item should have an issue link below.
+
+- Chat UI Language Localization (localize UI into the native languages of users)
+    - [ ] Chinese
+    - [ ] German
+    - [ ] French
+    - [ ] Portuguese
+    - [ ] Your native language here. 
+- UI Redesign: an internal effort at Nomic to improve the UI/UX of gpt4all for all users.
+    - [ ] Design new user interface and gather community feedback
+    - [ ] Implement the new user interface and experience.
+- Installer and Update Improvements
+    - [ ] Seamless native installation and update process on OSX
+    - [ ] Seamless native installation and update process on Windows
+    - [ ] Seamless native installation and update process on Linux
+- Model discoverability improvements:
+    - [x] Support huggingface model discoverability
+    - [ ] Support Nomic hosted model discoverability
+- LocalDocs (towards a local perplexity)
+    - Multilingual LocalDocs Support
+        - [ ] Create a multilingual experience
+        - [ ] Incorporate a multilingual embedding model
+        - [ ] Specify a preferred multilingual LLM for localdocs
+    - Improved RAG techniques
+        - [ ] Query augmentation and re-writing
+        - [ ] Improved chunking and text extraction from arbitrary modalities
+            - [ ] Custom PDF extractor past the QT default (charts, tables, text)
+        - [ ] Faster indexing and local exact search with v1.5 hamming embeddings and reranking (skip ANN index construction!)
+    - Support queries like 'summarize X document'
+    - Multimodal LocalDocs support with Nomic Embed
+    - Nomic Dataset Integration with real-time LocalDocs
+        - [ ] Include an option to allow the export of private LocalDocs collections to Nomic Atlas for debugging data/chat quality
+        - [ ] Allow optional sharing of LocalDocs collections between users.
+        - [ ] Allow the import of a LocalDocs collection from an Atlas Dataset
+            - Chat with live version of Wikipedia, Chat with Pubmed, chat with the latest snapshot of world news.
+- First class Multilingual LLM Support
+    - [ ] Recommend and set a default LLM for German
+    - [ ] Recommend and set a default LLM for English
+    - [ ] Recommend and set a default LLM for Chinese
+    - [ ] Recommend and set a default LLM for Spanish
+
+- Server Mode improvements
+    - Improved UI and new requested features:
+        - [ ] Fix outstanding bugs and feature requests around networking configurations.
+        - [ ] Support Nomic Embed inferencing
+        - [ ] First class documentation
+        - [ ] Improving developer use and quality of server mode (e.g. support larger batches)
+
+
+## Technical Reports
+
+<p align="center">
+<a href="https://gpt4all.io/reports/GPT4All_Technical_Report_3.pdf">:green_book: Technical Report 3: GPT4All Snoozy and Groovy </a>
+</p>
+
+<p align="center">
+<a href="https://static.nomic.ai/gpt4all/2023_GPT4All-J_Technical_Report_2.pdf">:green_book: Technical Report 2: GPT4All-J </a>
+</p>
+
+<p align="center">
+<a href="https://s3.amazonaws.com/static.nomic.ai/gpt4all/2023_GPT4All_Technical_Report.pdf">:green_book: Technical Report 1: GPT4All</a>
+</p>
+
+
 ## Citation

 If you utilize this repository, models or data in a downstream project, please consider citing it with:
--- a/common/common.cmake
+++ b/common/common.cmake
@ -1,41 +0,0 @@
-function(gpt4all_add_warning_options target)
-    if (MSVC)
-        return()
-    endif()
-    target_compile_options("${target}" PRIVATE
-        # base options
-        -Wall
-        -Wextra
-        # extra options
-        -Wcast-align
-        -Wextra-semi
-        -Wformat=2
-        -Wmissing-include-dirs
-        -Wsuggest-override
-        -Wvla
-        # errors
-        -Werror=format-security
-        -Werror=init-self
-        -Werror=pointer-arith
-        -Werror=undef
-        # disabled warnings
-        -Wno-sign-compare
-        -Wno-unused-parameter
-    )
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-        target_compile_options("${target}" PRIVATE
-            -Wduplicated-branches
-            -Wduplicated-cond
-            -Wlogical-op
-            -Wno-reorder
-            -Wno-null-dereference
-        )
-    elseif (CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$")
-        target_compile_options("${target}" PRIVATE
-            -Wunreachable-code-break
-            -Wunreachable-code-return
-            -Werror=pointer-integer-compare
-            -Wno-reorder-ctor
-        )
-    endif()
-endfunction()
--- a/gpt4all-backend/CMakeLists.txt
+++ b/gpt4all-backend/CMakeLists.txt
@ -1,7 +1,4 @@
-cmake_minimum_required(VERSION 3.23)  # for FILE_SET
-
-include(../common/common.cmake)
-
+cmake_minimum_required(VERSION 3.16)
 set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)

@ -36,7 +33,7 @@ set(LLMODEL_VERSION_PATCH 0)
 set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}")
 project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)

-set(CMAKE_CXX_STANDARD 23)
+set(CMAKE_CXX_STANDARD 20)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
 set(BUILD_SHARED_LIBS ON)
@ -50,15 +47,17 @@ else()
    message(STATUS "Interprocedural optimization support detected")
 endif()

-set(DIRECTORY deps/llama.cpp-mainline)
+set(DIRECTORY llama.cpp-mainline)
 include(llama.cpp.cmake)

 set(BUILD_VARIANTS)
+set(GPTJ_BUILD_VARIANT cpu)
 if (APPLE)
    list(APPEND BUILD_VARIANTS metal)
 endif()
 if (LLMODEL_KOMPUTE)
    list(APPEND BUILD_VARIANTS kompute kompute-avxonly)
+    set(GPTJ_BUILD_VARIANT kompute)
 else()
    list(PREPEND BUILD_VARIANTS cpu cpu-avxonly)
 endif()
@ -66,24 +65,6 @@ if (LLMODEL_VULKAN)
    list(APPEND BUILD_VARIANTS vulkan vulkan-avxonly)
 endif()
 if (LLMODEL_CUDA)
-    cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES
-
-    # Defaults must be set before enable_language(CUDA).
-    # Keep this in sync with the arch list in ggml/src/CMakeLists.txt (plus 5.0 for non-F16 branch).
-    if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
-        # 52 == lowest CUDA 12 standard
-        # 60 == f16 CUDA intrinsics
-        # 61 == integer CUDA intrinsics
-        # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
-        if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
-            set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics
-        else()
-            set(CMAKE_CUDA_ARCHITECTURES "50;52;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
-            #set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
-        endif()
-    endif()
-    message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
-
    include(CheckLanguage)
    check_language(CUDA)
    if (NOT CMAKE_CUDA_COMPILER)
@ -97,6 +78,8 @@ if (LLMODEL_ROCM)
    list(APPEND BUILD_VARIANTS rocm rocm-avxonly)
 endif()

+set(CMAKE_VERBOSE_MAKEFILE ON)
+
 # Go through each build variant
 foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
    # Determine flags
@ -105,34 +88,30 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
    else()
        set(GPT4ALL_ALLOW_NON_AVX ON)
    endif()
-    set(GGML_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
-    set(GGML_F16C ${GPT4ALL_ALLOW_NON_AVX})
-    set(GGML_FMA  ${GPT4ALL_ALLOW_NON_AVX})
+    set(LLAMA_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
+    set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX})
+    set(LLAMA_FMA  ${GPT4ALL_ALLOW_NON_AVX})

-    set(GGML_METAL   OFF)
-    set(GGML_KOMPUTE OFF)
-    set(GGML_VULKAN  OFF)
-    set(GGML_CUDA    OFF)
-    set(GGML_ROCM    OFF)
+    set(LLAMA_METAL   OFF)
+    set(LLAMA_KOMPUTE OFF)
+    set(LLAMA_VULKAN  OFF)
+    set(LLAMA_CUDA    OFF)
+    set(LLAMA_ROCM    OFF)
    if (BUILD_VARIANT MATCHES metal)
-        set(GGML_METAL   ON)
+        set(LLAMA_METAL   ON)
    elseif (BUILD_VARIANT MATCHES kompute)
-        set(GGML_KOMPUTE ON)
+        set(LLAMA_KOMPUTE ON)
    elseif (BUILD_VARIANT MATCHES vulkan)
-        set(GGML_VULKAN  ON)
+        set(LLAMA_VULKAN  ON)
    elseif (BUILD_VARIANT MATCHES cuda)
-        set(GGML_CUDA    ON)
+        set(LLAMA_CUDA    ON)
    elseif (BUILD_VARIANT MATCHES rocm)
-        set(GGML_HIPBLAS ON)
+        set(LLAMA_HIPBLAS ON)
    endif()

    # Include GGML
    include_ggml(-mainline-${BUILD_VARIANT})

-    if (BUILD_VARIANT MATCHES metal)
-        set(GGML_METALLIB "${GGML_METALLIB}" PARENT_SCOPE)
-    endif()
-
    # Function for preparing individual implementations
    function(prepare_target TARGET_NAME BASE_LIB)
        set(TARGET_NAME ${TARGET_NAME}-${BUILD_VARIANT})
@ -151,35 +130,28 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)

    # Add each individual implementations
    add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED
-        src/llamamodel.cpp src/llmodel_shared.cpp)
-    gpt4all_add_warning_options(llamamodel-mainline-${BUILD_VARIANT})
+        llamamodel.cpp llmodel_shared.cpp)
    target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
        LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
-    target_include_directories(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
-        src include/gpt4all-backend
-    )
    prepare_target(llamamodel-mainline llama-mainline)

-    if (NOT PROJECT_IS_TOP_LEVEL AND BUILD_VARIANT STREQUAL cuda)
+    if (BUILD_VARIANT MATCHES ${GPTJ_BUILD_VARIANT})
+        add_library(gptj-${BUILD_VARIANT} SHARED
+            gptj.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
+        prepare_target(gptj llama-mainline)
+    endif()
+
+    if (BUILD_VARIANT STREQUAL cuda)
        set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
    endif()
 endforeach()

 add_library(llmodel
-    src/dlhandle.cpp
-    src/llmodel.cpp
-    src/llmodel_c.cpp
-    src/llmodel_shared.cpp
-)
-gpt4all_add_warning_options(llmodel)
-target_sources(llmodel PUBLIC
-    FILE_SET public_headers TYPE HEADERS BASE_DIRS include
-    FILES include/gpt4all-backend/llmodel.h
-          include/gpt4all-backend/llmodel_c.h
-          include/gpt4all-backend/sysinfo.h
+    llmodel.h llmodel.cpp llmodel_shared.cpp
+    llmodel_c.h llmodel_c.cpp
+    dlhandle.h
 )
 target_compile_definitions(llmodel PRIVATE LIB_FILE_EXT="${CMAKE_SHARED_LIBRARY_SUFFIX}")
-target_include_directories(llmodel PRIVATE src include/gpt4all-backend)

 set_target_properties(llmodel PROPERTIES
                              VERSION ${PROJECT_VERSION}
--- a/gpt4all-backend/README.md
+++ b/gpt4all-backend/README.md
@ -27,7 +27,7 @@ Unfortunately, no for three reasons:

 # What is being done to make them more compatible?

-A few things. Number one, we are maintaining compatibility with our current model zoo by way of the submodule pinning. However, we are also exploring how we can update to newer versions of llama.cpp without breaking our current models. This might involve an additional magic header check or it could possibly involve keeping the currently pinned submodule and also adding a new submodule with later changes and differentiating them with namespaces or some other manner. Investigations continue.
+A few things. Number one, we are maintaining compatibility with our current model zoo by way of the submodule pinning. However, we are also exploring how we can update to newer versions of llama.cpp without breaking our current models. This might involve an additional magic header check or it could possibly involve keeping the currently pinned submodule and also adding a new submodule with later changes and differentiating them with namespaces or some other manner. Investigations continue.

 # What about GPU inference?

--- a/gpt4all-backend/deps/llama.cpp-mainline
+++ b/gpt4all-backend/deps/llama.cpp-mainline
@ -1 +0,0 @@
-Subproject commit 11f734c3b0334dbae4823b4a7467764e447fc6d6
--- a/gpt4all-backend/dlhandle.h
+++ b/gpt4all-backend/dlhandle.h
@ -0,0 +1,109 @@
+#ifndef DLHANDLE_H
+#define DLHANDLE_H
+#ifndef _WIN32
+#include <string>
+#include <stdexcept>
+#include <utility>
+#include <dlfcn.h>
+
+
+
+/// Move-only owner of a shared-library handle obtained from dlopen().
+/// The handle, if any, is released with dlclose() on destruction.
+class Dlhandle {
+    void *chandle;
+
+public:
+    /// Thrown by the loading constructor when dlopen() fails.
+    class Exception : public std::runtime_error {
+    public:
+        using std::runtime_error::runtime_error;
+    };
+
+    /// Creates an empty handle; is_valid() returns false.
+    Dlhandle() : chandle(nullptr) {}
+    /// Loads the shared object at `fpath` using the given dlopen() flags.
+    /// Throws Exception carrying the dlerror() text when loading fails.
+    Dlhandle(const std::string& fpath, int flags = RTLD_LAZY | RTLD_LOCAL) {
+        chandle = dlopen(fpath.c_str(), flags);
+        if (!chandle) {
+            // dlerror() may return NULL; never feed a null pointer to std::string.
+            const char *err = dlerror();
+            throw Exception("dlopen(\""+fpath+"\"): "+(err ? err : "unknown error"));
+        }
+    }
+    Dlhandle(const Dlhandle& o) = delete;
+    /// Takes over o's handle and leaves o empty.
+    Dlhandle(Dlhandle&& o) : chandle(o.chandle) {
+        o.chandle = nullptr;
+    }
+    ~Dlhandle() {
+        if (chandle) dlclose(chandle);
+    }
+
+    /// Move assignment: closes the library currently owned (if any), then
+    /// takes over o's handle and leaves o empty. Returns *this.
+    Dlhandle& operator =(Dlhandle&& o) {
+        if (this != &o) {
+            // Without this dlclose() the previously owned library would leak.
+            if (chandle) dlclose(chandle);
+            chandle = std::exchange(o.chandle, nullptr);
+        }
+        return *this;
+    }
+
+    /// True when a library handle is held.
+    bool is_valid() const {
+        return chandle != nullptr;
+    }
+    operator bool() const {
+        return is_valid();
+    }
+
+    /// Looks up symbol `fname` and returns it as T*, or nullptr when dlsym()
+    /// reports an error.
+    template<typename T>
+    T* get(const std::string& fname) const {
+        // Clear any error left over from an earlier dl* call so the check
+        // below reflects this dlsym() call only.
+        dlerror();
+        auto fres = reinterpret_cast<T*>(dlsym(chandle, fname.c_str()));
+        return (dlerror()==NULL)?fres:nullptr;
+    }
+    /// Convenience wrapper around get() for a variadic function returning void*.
+    auto get_fnc(const std::string& fname) const {
+        return get<void*(...)>(fname);
+    }
+};
+#else
+#include <algorithm>
+#include <filesystem>
+#include <string>
+#include <exception>
+#include <stdexcept>
+
+#define WIN32_LEAN_AND_MEAN
+#ifndef NOMINMAX
+#   define NOMINMAX
+#endif
+#include <windows.h>
+#include <libloaderapi.h>
+
+
+/// Move-only owner of a module handle obtained from LoadLibraryExA().
+/// The handle, if any, is released with FreeLibrary() on destruction.
+class Dlhandle {
+    HMODULE chandle;
+
+public:
+    /// Thrown by the loading constructor when the library cannot be loaded.
+    class Exception : public std::runtime_error {
+    public:
+        using std::runtime_error::runtime_error;
+    };
+
+    /// Creates an empty handle; is_valid() returns false.
+    Dlhandle() : chandle(nullptr) {}
+    /// Loads the library at `fpath`; throws Exception when loading fails.
+    Dlhandle(const std::string& fpath) {
+        // The loader is handed an absolute path using backslash separators.
+        std::string afpath = std::filesystem::absolute(fpath).string();
+        std::replace(afpath.begin(), afpath.end(), '/', '\\');
+        chandle = LoadLibraryExA(afpath.c_str(), NULL, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR);
+        if (!chandle) {
+            throw Exception("dlopen(\""+fpath+"\"): Error");
+        }
+    }
+    Dlhandle(const Dlhandle& o) = delete;
+    /// Takes over o's handle and leaves o empty.
+    Dlhandle(Dlhandle&& o) : chandle(o.chandle) {
+        o.chandle = nullptr;
+    }
+    ~Dlhandle() {
+        if (chandle) FreeLibrary(chandle);
+    }
+
+    /// Move assignment, matching the non-Windows Dlhandle: frees the library
+    /// currently owned (if any), then takes over o's handle. Returns *this.
+    Dlhandle& operator =(Dlhandle&& o) {
+        if (this != &o) {
+            if (chandle) FreeLibrary(chandle);
+            chandle = o.chandle;
+            o.chandle = nullptr;
+        }
+        return *this;
+    }
+
+    /// True when a library handle is held.
+    bool is_valid() const {
+        return chandle != nullptr;
+    }
+    /// Same as is_valid(); provided so both platform variants share one API.
+    operator bool() const {
+        return is_valid();
+    }
+
+    /// Looks up symbol `fname` with GetProcAddress() and returns it as T*.
+    template<typename T>
+    T* get(const std::string& fname) const {
+        return reinterpret_cast<T*>(GetProcAddress(chandle, fname.c_str()));
+    }
+    /// Convenience wrapper around get() for a variadic function returning void*.
+    auto get_fnc(const std::string& fname) const {
+        return get<void*(...)>(fname);
+    }
+};
+#endif
+#endif // DLHANDLE_H
--- a/gpt4all-backend/gptj.cpp
+++ b/gpt4all-backend/gptj.cpp
@ -0,0 +1,847 @@
+#define GPTJ_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
+#include "gptj_impl.h"
+
+#include "utils.h"
+#include "llmodel_shared.h"
+
+#include <cassert>
+#include <cinttypes>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <map>
+#include <string>
+#include <vector>
+#include <iostream>
+#if defined(_WIN32) && defined(_MSC_VER)
+    #define WIN32_LEAN_AND_MEAN
+    #ifndef NOMINMAX
+        #define NOMINMAX
+    #endif
+    #include <windows.h>
+    #include <io.h>
+    #include <stdio.h>
+#else
+    #include <unistd.h>
+#endif
+#include <sstream>
+#include <unordered_set>
+#include <ggml.h>
+
+
+namespace {
+// Model type string handed out by the exported get_model_type() entry point.
+const char *modelType_ = "GPT-J";
+}
+
+// default hparams (GPT-J 6B)
+// gptj_model_load() overwrites every field from the GGUF file: the keys are noted
+// per field below, and n_vocab is taken from the length of the tokenizer token array.
+struct gptj_hparams {
+    int32_t n_vocab = 50400; // number of entries in "tokenizer.ggml.tokens"
+    int32_t n_ctx   = 2048;  // "gptj.context_length"
+    int32_t n_embd  = 4096;  // "gptj.embedding_length"
+    int32_t n_head  = 16;    // "gptj.attention.head_count"
+    int32_t n_layer = 28;    // "gptj.block_count"
+    int32_t n_rot   = 64;    // "gptj.rope.dimension_count"
+    float norm_eps  = 1e-5;  // "gptj.attention.layer_norm_epsilon"
+};
+
+// Weight tensors of one transformer block. gptj_model_load() fills each pointer by
+// looking up "blk.<i>.<name>" in the model's ggml context; the <name> part is noted
+// next to each field.
+struct gptj_layer {
+    // normalization
+    struct ggml_tensor * ln_1_g; // attn_norm.weight
+    struct ggml_tensor * ln_1_b; // attn_norm.bias
+
+    // attention
+    struct ggml_tensor * c_attn_q_proj_w; // attn_q.weight
+    struct ggml_tensor * c_attn_k_proj_w; // attn_k.weight
+    struct ggml_tensor * c_attn_v_proj_w; // attn_v.weight
+
+    struct ggml_tensor * c_attn_proj_w; // attn_output.weight
+
+    // ff
+    struct ggml_tensor * c_mlp_fc_w; // ffn_up.weight
+    struct ggml_tensor * c_mlp_fc_b; // ffn_up.bias
+
+    struct ggml_tensor * c_mlp_proj_w; // ffn_down.weight
+    struct ggml_tensor * c_mlp_proj_b; // ffn_down.bias
+};
+
+// Everything belonging to one GPT-J model: hyperparameters, weight tensors, the
+// key/value cache and the work buffers used during evaluation.
+//
+// All raw pointers start out as nullptr so that a default-constructed model can be
+// destroyed safely: the destructor only frees `ctx` when it is non-null.
+struct gptj_model {
+    gptj_hparams hparams;
+
+    // normalization
+    struct ggml_tensor * ln_f_g = nullptr; // "output_norm.weight"
+    struct ggml_tensor * ln_f_b = nullptr; // "output_norm.bias"
+
+    struct ggml_tensor * wte = nullptr; // token embedding ("token_embd.weight")
+
+    struct ggml_tensor * lmh_g = nullptr; // language model head ("output.weight")
+    struct ggml_tensor * lmh_b = nullptr; // language model bias ("output.bias")
+
+    std::vector<gptj_layer> layers;
+
+    // key + value memory
+    struct llm_kv_cache kv_self;
+
+    // ggml context holding the weight tensors; freed by the destructor
+    struct ggml_context * ctx = nullptr;
+    std::map<std::string, struct ggml_tensor *> tensors;
+
+    llm_buffer eval_buf; // backing memory of the per-evaluation ggml context
+    llm_buffer scr0_buf; // scratch buffer 0 used by gptj_eval()
+    llm_buffer scr1_buf; // scratch buffer 1 used by gptj_eval()
+
+    ~gptj_model() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+    }
+};
+
+// Sets up a key/value cache: sizes its backing buffer, creates a ggml context on
+// top of that buffer and allocates the two 1-D cache tensors `k` and `v`, each with
+// n_embd * n_layer * n_ctx elements of type `wtype`.
+//
+// Returns false, after logging to stderr, when the ggml context cannot be created.
+static bool kv_cache_init(
+        const struct gptj_hparams & hparams,
+              struct llm_kv_cache & cache,
+                         ggml_type   wtype,
+                               int   n_ctx) {
+    // one cache row of n_embd values per (layer, position) pair
+    const int64_t n_rows  = (int64_t)hparams.n_layer*n_ctx;
+    const int64_t n_cells = hparams.n_embd*n_rows;
+
+    // space for both tensors plus an extra 2_MiB
+    cache.buf.resize(2u*n_cells*ggml_type_size(wtype) + 2_MiB);
+
+    struct ggml_init_params init = {
+        .mem_size   = cache.buf.size,
+        .mem_buffer = cache.buf.addr,
+        .no_alloc   = false
+    };
+
+    cache.ctx = ggml_init(init);
+    if (cache.ctx == nullptr) {
+        fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
+        return false;
+    }
+
+    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_cells);
+    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_cells);
+    return true;
+}
+
+// Loads a GPT-J model from the GGUF file `fname`.
+//
+//   - fname:   path of the GGUF file
+//   - model:   receives the hyperparameters, weight tensors, KV cache and scratch buffers
+//   - vocab:   receives the token <-> id mappings of the gpt2 tokenizer
+//   - mem_req: optional; when non-null the function only measures. It stores the size
+//              of the ggml context in *mem_req and returns false without preparing
+//              weights, KV cache or scratch buffers. *mem_req is 0 if loading failed
+//              before the size was known.
+//
+// Returns true when the model was fully loaded, false otherwise (including the
+// measuring mode described above). The gguf metadata context is released on every
+// exit path; the ggml context in model.ctx stays owned by `model`.
+bool gptj_model_load(const std::string &fname, gptj_model & model, gpt_vocab & vocab, size_t * mem_req = nullptr) {
+    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+    if(mem_req != nullptr) {
+        *mem_req = 0;
+    }
+
+    // create the ggml context
+    struct gguf_init_params params = {
+        /*.no_alloc = */ false,
+        /*.ctx      = */ &model.ctx,
+    };
+
+    gguf_context *ggufctx = gguf_init_from_file(fname.c_str(), params);
+    if (!ggufctx) {
+        fprintf(stderr, "%s: gguf_init_from_file() failed\n", __func__);
+        // No context was handed over; make sure ~gptj_model() does not free a stale
+        // or never-initialized pointer.
+        // NOTE(review): assumes gguf_init_from_file() releases anything it allocated
+        // before failing - confirm against the ggml version in use.
+        model.ctx = nullptr;
+        return false;
+    }
+
+    // every error exit below must release the gguf metadata context
+    auto fail = [ggufctx]() {
+        gguf_free(ggufctx);
+        return false;
+    };
+
+    // load hparams
+    {
+        auto & hparams = model.hparams;
+
+        bool ok = false;
+        int keyidx;
+
+        // do/while(false) so that the first missing key can break out early
+        do {
+            keyidx = gguf_find_key(ggufctx, "gptj.context_length");
+            if (keyidx == -1) { break; }
+            hparams.n_ctx = gguf_get_val_u32(ggufctx, keyidx);
+
+            keyidx = gguf_find_key(ggufctx, "gptj.embedding_length");
+            if (keyidx == -1) { break; }
+            hparams.n_embd = gguf_get_val_u32(ggufctx, keyidx);
+
+            keyidx = gguf_find_key(ggufctx, "gptj.attention.head_count");
+            if (keyidx == -1) { break; }
+            hparams.n_head = gguf_get_val_u32(ggufctx, keyidx);
+
+            keyidx = gguf_find_key(ggufctx, "gptj.block_count");
+            if (keyidx == -1) { break; }
+            hparams.n_layer = gguf_get_val_u32(ggufctx, keyidx);
+
+            keyidx = gguf_find_key(ggufctx, "gptj.rope.dimension_count");
+            if (keyidx == -1) { break; }
+            hparams.n_rot = gguf_get_val_u32(ggufctx, keyidx);
+
+            keyidx = gguf_find_key(ggufctx, "gptj.attention.layer_norm_epsilon");
+            if (keyidx == -1) { break; }
+            hparams.norm_eps = gguf_get_val_f32(ggufctx, keyidx);
+
+            ok = true;
+        } while (false);
+
+        if (!ok) {
+            fprintf(stderr, "%s: required hparam missing!\n", __func__);
+            return fail();
+        }
+
+        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
+        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
+        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
+        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+        printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
+    }
+
+    // load vocab
+    {
+        auto & hparams = model.hparams;
+
+        int keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.model");
+        if (keyidx == -1) {
+            fprintf(stderr, "%s: tokenizer model not found!\n", __func__);
+            return fail();
+        }
+        if (strcmp(gguf_get_val_str(ggufctx, keyidx), "gpt2") != 0) {
+            fprintf(stderr, "%s: tokenizer model not supported!\n", __func__);
+            return fail();
+        }
+
+        int tokens_keyidx = gguf_find_key(ggufctx, "tokenizer.ggml.tokens");
+        if (tokens_keyidx == -1) {
+            fprintf(stderr, "%s: gpt2 tokenizer vocab not found!\n", __func__);
+            return fail();
+        }
+
+        hparams.n_vocab = gguf_get_arr_n(ggufctx, tokens_keyidx);
+        printf("%s: gpt2 tokenizer vocab = %d\n", __func__, int(hparams.n_vocab));
+
+        // the token strings are copied into std::string, so they stay valid after
+        // the gguf context has been released
+        for (int i = 0; i < hparams.n_vocab; i++) {
+            std::string word = gguf_get_arr_str(ggufctx, tokens_keyidx, i);
+            vocab.token_to_id[word] = i;
+            vocab.id_to_token[i] = word;
+        }
+    }
+
+    auto & ctx = model.ctx;
+
+    size_t ctx_size = ggml_get_mem_size(ctx);
+    printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size / (1024.0 * 1024.0));
+
+    // measuring mode: report the size and stop here
+    if (mem_req != nullptr) {
+        *mem_req = ctx_size;
+        gguf_free(ggufctx);
+        return false;
+    }
+
+    // prepare memory for the weights
+    {
+        const auto & hparams = model.hparams;
+        model.layers.resize(hparams.n_layer);
+
+        model.wte    = ggml_get_tensor(ctx, "token_embd.weight");
+
+        model.ln_f_g = ggml_get_tensor(ctx, "output_norm.weight");
+        model.ln_f_b = ggml_get_tensor(ctx, "output_norm.bias");
+
+        model.lmh_g  = ggml_get_tensor(ctx, "output.weight");
+        model.lmh_b  = ggml_get_tensor(ctx, "output.bias");
+
+        // builds "blk.<i>.<n>"; the returned pointer refers to a static buffer that
+        // is overwritten by the next call, so it must be consumed immediately
+        auto name = [](int i, std::string n) {
+            static std::string key;
+            key = "blk." + std::to_string(i) + "." + n;
+            return key.c_str();
+        };
+
+        for (int i = 0; i < hparams.n_layer; ++i) {
+            auto & layer = model.layers[i];
+
+            layer.ln_1_g          = ggml_get_tensor(ctx, name(i, "attn_norm.weight"));
+            layer.ln_1_b          = ggml_get_tensor(ctx, name(i, "attn_norm.bias"));
+
+            layer.c_attn_q_proj_w = ggml_get_tensor(ctx, name(i, "attn_q.weight"));
+            layer.c_attn_k_proj_w = ggml_get_tensor(ctx, name(i, "attn_k.weight"));
+            layer.c_attn_v_proj_w = ggml_get_tensor(ctx, name(i, "attn_v.weight"));
+
+            layer.c_attn_proj_w   = ggml_get_tensor(ctx, name(i, "attn_output.weight"));
+
+            layer.c_mlp_fc_w      = ggml_get_tensor(ctx, name(i, "ffn_up.weight"));
+            layer.c_mlp_fc_b      = ggml_get_tensor(ctx, name(i, "ffn_up.bias"));
+
+            layer.c_mlp_proj_w    = ggml_get_tensor(ctx, name(i, "ffn_down.weight"));
+            layer.c_mlp_proj_b    = ggml_get_tensor(ctx, name(i, "ffn_down.bias"));
+        }
+    }
+
+    // key + value memory
+    {
+        const auto & hparams = model.hparams;
+        if (!kv_cache_init(hparams, model.kv_self, GGML_TYPE_F16, model.hparams.n_ctx)) {
+            fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
+            ggml_free(ctx);
+            // `ctx` aliases model.ctx: reset it so ~gptj_model() does not free it again
+            ctx = nullptr;
+            return fail();
+        }
+
+        const size_t memory_size = ggml_nbytes(model.kv_self.k) + ggml_nbytes(model.kv_self.v);
+        printf("%s: kv self size  = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+    }
+
+    model.scr0_buf.resize(256u * 1024 * 1024);
+    model.scr1_buf.resize(256u * 1024 * 1024);
+
+    // the metadata is no longer needed; the tensors live in model.ctx
+    gguf_free(ggufctx);
+    return true;
+}
+
+// evaluate the transformer
+//
+//   - model:         the model
+//   - n_threads:     number of threads to use
+//   - n_past:        the context size so far
+//   - embd_inp:      the ids of the tokens to evaluate (must not be empty)
+//   - embd_w:        receives the logits for the last token of embd_inp (n_vocab values)
+//   - mem_per_token: in/out; when 0 it is set to the memory used by this evaluation
+//                    divided by the number of tokens, otherwise it is used to grow
+//                    the evaluation buffer before the graph is built
+//
+// Returns false when embd_inp is empty, when the evaluation buffer cannot be
+// allocated or when the ggml context cannot be created; true otherwise.
+//
+// The GPT-J model requires about 16MB of memory per input token.
+//
+bool gptj_eval(
+        gptj_model & model,
+        const int n_threads,
+        const int n_past,
+        const std::vector<gpt_vocab::id> & embd_inp,
+              std::vector<float>         & embd_w,
+              size_t                     & mem_per_token) {
+    const int N = embd_inp.size();
+
+    // An empty batch would divide by zero in the mem_per_token estimate and read
+    // the logits from a negative offset, so reject it up front.
+    if (N <= 0) {
+        fprintf(stderr, "%s: no tokens to evaluate\n", __func__);
+        return false;
+    }
+
+    const auto & hparams = model.hparams;
+
+    const int n_embd  = hparams.n_embd;
+    const int n_layer = hparams.n_layer;
+    const int n_ctx   = hparams.n_ctx;
+    const int n_head  = hparams.n_head;
+    const int n_vocab = hparams.n_vocab;
+    const int n_rot   = hparams.n_rot;
+
+    const size_t init_buf_size = 1024_MiB;
+    if (!model.eval_buf.addr || model.eval_buf.size < init_buf_size)
+        model.eval_buf.resize(init_buf_size);
+
+    if (mem_per_token > 0 && mem_per_token*N > model.eval_buf.size) {
+        const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
+        printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, model.eval_buf.size, buf_size_new);
+
+        // reallocate
+        model.eval_buf.resize(buf_size_new);
+        if (model.eval_buf.addr == nullptr) {
+            fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, model.eval_buf.size);
+            return false;
+        }
+    }
+
+    struct ggml_init_params params = {
+        .mem_size   = model.eval_buf.size,
+        .mem_buffer = model.eval_buf.addr,
+        .no_alloc = false
+    };
+
+    struct ggml_context * ctx0 = ggml_init(params);
+    if (!ctx0) {
+        fprintf(stderr, "%s: failed to initialize ggml context\n", __func__);
+        return false;
+    }
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    // KQ_pos - contains the positions
+    struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    int * data = (int *) KQ_pos->data;
+    for (int i = 0; i < N; ++i) {
+        data[i] = n_past + i;
+    }
+
+    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
+
+    // wte
+    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
+
+    for (int il = 0; il < n_layer; ++il) {
+        struct ggml_tensor * cur;
+        ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
+        // norm
+        {
+            cur = ggml_norm(ctx0, inpL, model.hparams.norm_eps);
+
+            // cur = ln_1_g*cur + ln_1_b
+            cur = ggml_add(ctx0,
+                    ggml_mul(ctx0,
+                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
+                        cur),
+                    ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
+        }
+
+        // the normalized input feeds both the attention and the feed-forward branch
+        struct ggml_tensor * inpSA = cur;
+
+        // self-attention
+        {
+            struct ggml_tensor * Qcur = ggml_rope(
+                ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur), n_embd/n_head, n_head, N),
+                KQ_pos, n_rot, 0, 0
+            );
+            struct ggml_tensor * Kcur = ggml_rope(
+                ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur), n_embd/n_head, n_head, N),
+                KQ_pos, n_rot, 0, 0
+            );
+
+            // store key and value to memory
+            {
+                struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur));
+
+                struct ggml_tensor * k = ggml_view_1d(ctx0, model.kv_self.k, N*n_embd, (ggml_element_size(model.kv_self.k)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_2d(ctx0, model.kv_self.v, N, n_embd,
+                        (   n_ctx)*ggml_element_size(model.kv_self.v),
+                        (il*n_ctx)*ggml_element_size(model.kv_self.v)*n_embd + n_past*ggml_element_size(model.kv_self.v));
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+            }
+
+            // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
+            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
+
+            // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
+            struct ggml_tensor * K =
+                ggml_permute(ctx0,
+                        ggml_reshape_3d(ctx0,
+                            ggml_view_1d(ctx0, model.kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.kv_self.k)*n_embd),
+                            n_embd/n_head, n_head, n_past + N),
+                        0, 2, 1, 3);
+
+            // K * Q
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+
+            // KQ_scaled = KQ / sqrt(n_embd/n_head)
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, 1.0f/sqrt(float(n_embd)/n_head));
+
+            // KQ_masked = mask_past(KQ_scaled)
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+
+            // KQ = soft_max(KQ_masked)
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+
+            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
+            struct ggml_tensor * V =
+                ggml_view_3d(ctx0, model.kv_self.v,
+                        n_past + N, n_embd/n_head, n_head,
+                        n_ctx*ggml_element_size(model.kv_self.v),
+                        n_ctx*ggml_element_size(model.kv_self.v)*n_embd/n_head,
+                        il*n_ctx*ggml_element_size(model.kv_self.v)*n_embd);
+
+            // KQV = transpose(V) * KQ_soft_max
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+
+            // KQV_merged = KQV.permute(0, 2, 1, 3)
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+
+            // cur = KQV_merged.contiguous().view(n_embd, N)
+            cur = ggml_cpy(ctx0,
+                    KQV_merged,
+                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+
+            // projection (no bias)
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].c_attn_proj_w,
+                    cur);
+        }
+
+        // attention output, added to the feed-forward output further down
+        struct ggml_tensor * inpFF = cur;
+
+        ggml_set_scratch(ctx0, {0, model.scr1_buf.size, model.scr1_buf.addr, });
+        // feed-forward network
+        // this is independent of the self-attention result, so it could be done in parallel to the self-attention
+        {
+            // note here we pass inpSA instead of cur
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].c_mlp_fc_w,
+                    inpSA);
+
+            cur = ggml_add(ctx0,
+                    ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
+                    cur);
+
+            // GELU activation
+            cur = ggml_gelu(ctx0, cur);
+
+            // projection
+            // cur = proj_w*cur + proj_b
+            cur = ggml_mul_mat(ctx0,
+                    model.layers[il].c_mlp_proj_w,
+                    cur);
+
+            cur = ggml_add(ctx0,
+                    ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
+                    cur);
+        }
+
+        // self-attention + FF
+        cur  = ggml_add(ctx0, cur, inpFF);
+
+        // input for next layer
+        inpL = ggml_add(ctx0, cur, inpL);
+    }
+
+    ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
+
+    // norm
+    {
+        inpL = ggml_norm(ctx0, inpL, model.hparams.norm_eps);
+
+        // inpL = ln_f_g*inpL + ln_f_b
+        inpL = ggml_add(ctx0,
+                ggml_mul(ctx0,
+                    ggml_repeat(ctx0, model.ln_f_g, inpL),
+                    inpL),
+                ggml_repeat(ctx0, model.ln_f_b, inpL));
+    }
+
+    // switch the scratch buffer off for the final projection
+    ggml_set_scratch(ctx0, { 0, 0, nullptr, });
+
+    // lm_head
+    {
+        inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
+
+        inpL = ggml_add(ctx0,
+                ggml_repeat(ctx0, model.lmh_b, inpL),
+                inpL);
+    }
+
+    // logits -> probs
+    //inpL = ggml_soft_max(ctx0, inpL);
+
+    ggml_build_forward_expand(gf, inpL);
+
+    // run the computation
+    {
+        // work buffer requested by the plan; released when this scope ends
+        std::unique_ptr<uint8_t []> data;
+        auto plan = ggml_graph_plan(gf, n_threads);
+        if (plan.work_size > 0) {
+            data.reset(new uint8_t[plan.work_size]);
+            plan.work_data = data.get();
+        }
+        ggml_graph_compute(gf, &plan);
+    }
+
+    //if (n_past%100 == 0) {
+    //    ggml_graph_print   (gf);
+    //    ggml_graph_dump_dot(gf, NULL, "gpt-2.dot");
+    //}
+
+    //embd_w.resize(n_vocab*N);
+    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+
+    // return result for just the last token
+    embd_w.resize(n_vocab);
+    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+
+    if (mem_per_token == 0) {
+        mem_per_token = ggml_used_mem(ctx0)/N;
+    }
+    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
+
+    ggml_free(ctx0);
+
+    return true;
+}
+
+// Size in bytes of the fixed slot that holds the serialized rng state.
+// Parenthesized so the macro expands safely inside any expression.
+#define GPTJ_MAX_RNG_STATE (64*1024)
+
+// Returns the number of bytes written by gptj_copy_state_data() and consumed by
+// gptj_set_state_data(): rng size field + fixed rng slot + kv size field +
+// kv token count + the raw kv cache buffer.
+size_t gptj_get_state_size(const gptj_model &model)
+{
+    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
+    // for reference, std::mt19937(1337) serializes to 6701 bytes.
+    const size_t s_rng_size        = sizeof(size_t);
+    const size_t s_rng             = GPTJ_MAX_RNG_STATE;
+    const size_t s_kv_size         = sizeof(size_t);
+    const size_t s_kv_ntok         = sizeof(int);
+    const size_t s_kv              = model.kv_self.buf.size;
+    const size_t s_total = (
+        + s_rng_size
+        + s_rng
+        + s_kv_size
+        + s_kv_ntok
+        + s_kv
+    );
+    fflush(stdout);
+    return s_total;
+}
+
+// Serializes the rng state and the KV cache of `model` into `dest`.
+//
+// Layout: [size_t rng_size][GPTJ_MAX_RNG_STATE bytes: rng text, zero padded]
+//         [size_t kv_size][int kv_ntok][kv_size bytes: raw kv cache buffer]
+//
+// `dest` must provide at least gptj_get_state_size(model) bytes.
+// Returns the number of bytes written, or 0 (nothing written) when the serialized
+// rng does not fit into its fixed-size slot.
+size_t gptj_copy_state_data(const gptj_model &model, const std::mt19937 &rng, uint8_t *dest)
+{
+    uint8_t * out = dest;
+    const size_t max_rng = GPTJ_MAX_RNG_STATE;
+    fflush(stdout);
+    // copy rng
+    {
+        std::stringstream rng_ss;
+        rng_ss << rng;
+
+        // take a single copy of the serialized text
+        const std::string rng_str  = rng_ss.str();
+        const size_t      rng_size = rng_str.size();
+
+        // the slot has a fixed size; refuse to overflow it
+        if (rng_size > max_rng) {
+            fprintf(stderr, "%s: serialized rng state (%zu bytes) exceeds %zu bytes\n", __func__, rng_size, max_rng);
+            return 0;
+        }
+
+        memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
+
+        // serialized text followed by zero padding up to the slot size
+        memcpy(out, rng_str.data(), rng_size);
+        memset(out + rng_size, 0, max_rng - rng_size);
+        out += max_rng;
+    }
+
+    // copy kv cache
+    {
+        const size_t kv_size = model.kv_self.buf.size;
+        const int    kv_ntok = model.kv_self.n;
+
+        memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
+        memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
+
+        if (kv_size) {
+            memcpy(out, model.kv_self.buf.addr, kv_size); out += kv_size;
+        }
+    }
+
+    const size_t written  = out - dest;
+    assert(written == gptj_get_state_size(model));
+    fflush(stdout);
+    return written;
+}
+
+// Restores the rng state and the KV cache of `model` from a buffer produced by
+// gptj_copy_state_data().
+//
+// Both size fields are validated before anything is modified, because they come
+// from the input buffer: the rng size must fit its fixed slot and a non-zero kv
+// size must equal the size of the model's kv cache buffer.
+//
+// Returns the number of bytes consumed, or 0 when the buffer is rejected (in which
+// case neither `rng` nor `model` has been changed).
+size_t gptj_set_state_data(gptj_model *model, std::mt19937 *rng, const uint8_t *src)
+{
+    const uint8_t * in = src;
+    const size_t max_rng = GPTJ_MAX_RNG_STATE;
+
+    // peek at both size fields so that a bad buffer is rejected up front
+    size_t rng_size;
+    size_t kv_size;
+    memcpy(&rng_size, src, sizeof(rng_size));
+    memcpy(&kv_size,  src + sizeof(rng_size) + max_rng, sizeof(kv_size));
+
+    if (rng_size > max_rng) {
+        fprintf(stderr, "%s: invalid rng state size %zu\n", __func__, rng_size);
+        return 0;
+    }
+    if (kv_size && model->kv_self.buf.size != kv_size) {
+        fprintf(stderr, "%s: kv cache size mismatch (expected %zu, got %zu)\n", __func__, model->kv_self.buf.size, kv_size);
+        return 0;
+    }
+
+    // set rng
+    {
+        in += sizeof(rng_size);
+
+        std::stringstream rng_ss;
+        rng_ss.str(std::string(reinterpret_cast<const char *>(in), rng_size));
+        in += max_rng;
+
+        // parse into a temporary so a malformed state leaves *rng untouched
+        std::mt19937 restored;
+        rng_ss >> restored;
+        if (rng_ss.fail()) {
+            fprintf(stderr, "%s: failed to parse rng state\n", __func__);
+            return 0;
+        }
+        *rng = restored;
+    }
+
+    // set kv cache
+    {
+        int kv_ntok;
+
+        in += sizeof(kv_size);
+        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+
+        if (kv_size) {
+            void * k_data = model->kv_self.k->data; // remember data pointers
+            void * v_data = model->kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
+
+            memcpy(model->kv_self.buf.addr, in, kv_size); in += kv_size;
+
+            model->kv_self.k->data = k_data; // restore correct data pointers
+            model->kv_self.v->data = v_data;
+
+        }
+
+        model->kv_self.n = kv_ntok;
+    }
+
+    const size_t nread    = in - src;
+    assert(nread == gptj_get_state_size(*model));
+    fflush(stdout);
+    return nread;
+}
+
+// Private state of a GPTJ instance, reached through GPTJ::d_ptr.
+struct GPTJPrivate {
+    const std::string modelPath;
+    bool modelLoaded;             // set to false by GPTJ(), true after a successful loadModel()
+    gpt_vocab vocab;              // filled by gptj_model_load()
+    gptj_model *model = nullptr;  // allocated in GPTJ(), deleted in ~GPTJ()
+    int64_t n_threads = 0;        // set by loadModel() and setThreadCount()
+    size_t mem_per_token = 0;     // passed to gptj_eval(), which fills it in while it is 0
+    std::mt19937 rng;             // re-seeded from time(NULL) by loadModel()
+};
+
+// Creates the private state together with an empty model that has no ggml
+// context yet; nothing is loaded until loadModel() is called.
+GPTJ::GPTJ()
+    : d_ptr(new GPTJPrivate)
+{
+    gptj_model *fresh = new gptj_model;
+    fresh->ctx = nullptr; // no weights loaded yet
+
+    d_ptr->model       = fresh;
+    d_ptr->modelLoaded = false;
+}
+
+// Reports the size of the ggml context needed for the model at `modelPath`, by
+// running gptj_model_load() in its measuring mode on a throw-away model.
+// `n_ctx` and `ngl` are ignored. Returns 0 when the file cannot be read far
+// enough to determine the size.
+size_t GPTJ::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
+    (void)n_ctx;
+    (void)ngl;
+    gptj_model dummy_model;
+    // ~gptj_model() frees `ctx` when it is non-null, so it must not be left
+    // indeterminate if loading bails out before a context exists
+    dummy_model.ctx = nullptr;
+    gpt_vocab dummy_vocab;
+    size_t mem_req = 0;
+    gptj_model_load(modelPath, dummy_model, dummy_vocab, &mem_req);
+    return mem_req;
+}
+
+// Loads the model at `modelPath` into this instance and re-seeds the rng from the
+// current time. `n_ctx` and `ngl` are ignored.
+// Returns true on success; on failure an error is written to stderr and the
+// instance stays in the "not loaded" state.
+bool GPTJ::loadModel(const std::string &modelPath, int n_ctx, int ngl) {
+    (void)n_ctx;
+    (void)ngl;
+    d_ptr->modelLoaded = false;
+
+    std::mt19937 rng(time(NULL));
+    d_ptr->rng = rng;
+
+    // load the model
+    bool ok = gptj_model_load(modelPath, *d_ptr->model, d_ptr->vocab);
+    fflush(stdout);
+    if (!ok) {
+        // terminate the line so the message does not run into later output
+        std::cerr << "GPT-J ERROR: failed to load model from " <<  modelPath << '\n';
+        return false;
+    }
+
+    // default to at most 4 threads; hardware_concurrency() may return 0 when the
+    // value cannot be determined, in which case fall back to a single thread
+    const int32_t hw_threads = (int32_t) std::thread::hardware_concurrency();
+    d_ptr->n_threads = hw_threads > 0 ? std::min(4, hw_threads) : 1;
+    d_ptr->modelLoaded = true;
+    return true;
+}
+
+// Stores the thread count that evalTokens() passes on to gptj_eval().
+void GPTJ::setThreadCount(int32_t n_threads) {
+    d_ptr->n_threads = n_threads;
+}
+
+// Returns the stored thread count (kept as int64_t internally, returned as int32_t).
+int32_t GPTJ::threadCount() const
+{
+    return d_ptr->n_threads;
+}
+
+// Releases the model and the private state allocated by the constructor.
+GPTJ::~GPTJ()
+{
+    delete d_ptr->model;
+    // the GPTJPrivate object itself was allocated with `new` in GPTJ()
+    delete d_ptr;
+}
+
+// True once loadModel() has completed successfully.
+bool GPTJ::isModelLoaded() const
+{
+    return d_ptr->modelLoaded;
+}
+
+// Number of bytes saveState() writes; forwards to gptj_get_state_size().
+size_t GPTJ::stateSize() const
+{
+    return gptj_get_state_size(*d_ptr->model);
+}
+
+// Serializes the rng and KV cache into `dest` via gptj_copy_state_data() and
+// returns that function's result.
+size_t GPTJ::saveState(uint8_t *dest) const
+{
+    return gptj_copy_state_data(*d_ptr->model, d_ptr->rng, dest);
+}
+
+// Restores the rng and KV cache from `src` via gptj_set_state_data() and returns
+// that function's result.
+size_t GPTJ::restoreState(const uint8_t *src)
+{
+    return gptj_set_state_data(d_ptr->model, &d_ptr->rng, src);
+}
+
+// Tokenizes `str` with the loaded vocabulary through gpt_tokenize().
+// `ctx` and `special` are not used by this implementation.
+std::vector<LLModel::Token> GPTJ::tokenize(PromptContext &ctx, const std::string &str, bool special) const
+{
+    (void)ctx;
+    (void)special;
+    return ::gpt_tokenize(d_ptr->vocab, str);
+}
+
+// Samples the next token from promptCtx.logits with gpt_sample_top_k_top_p(),
+// handing it the sampling settings of `promptCtx` and a window made of the last
+// repeat_last_n tokens of the history (or the whole history if it is shorter).
+LLModel::Token GPTJ::sampleToken(PromptContext &promptCtx) const
+{
+    auto &history = promptCtx.tokens;
+
+    const size_t window_len   = std::min((size_t) promptCtx.repeat_last_n, history.size());
+    auto        *window_begin = history.data() + history.size() - window_len;
+
+    return gpt_sample_top_k_top_p(
+        d_ptr->model->hparams.n_vocab,
+        window_begin,
+        window_len,
+        promptCtx.logits,
+        promptCtx.top_k, promptCtx.top_p, promptCtx.temp,
+        promptCtx.repeat_penalty,
+        d_ptr->rng);
+}
+
+// Returns the text of token `id`, or an empty string when the id is not part of
+// the vocabulary. Uses find() rather than operator[] so that looking up an
+// unknown id does not insert a new entry into the vocabulary.
+std::string GPTJ::tokenToString(Token id) const
+{
+    const auto &id_to_token = d_ptr->vocab.id_to_token;
+    const auto  it          = id_to_token.find(id);
+    if (it == id_to_token.end()) {
+        return std::string();
+    }
+    return it->second;
+}
+
+// Evaluates `tokens` at position ctx.n_past and stores the logits of the last
+// token in ctx.logits. Returns the result of gptj_eval().
+bool GPTJ::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
+{
+    // determine the required inference memory per token:
+    // a one-off warm-up run over the dummy tokens {0, 1, 2, 3} at position 0; its
+    // return value is ignored and ctx.logits is overwritten by the real run below.
+    // NOTE(review): `initialized` is a function-local static, so it is shared by all
+    // GPTJ instances - only the first instance to get here performs the warm-up.
+    // gptj_eval() itself fills mem_per_token whenever it is still 0.
+    // NOTE(review): the static flag is read and written without synchronization -
+    // confirm evalTokens() is never called concurrently.
+    static bool initialized = false;
+    if (!initialized) {
+        gptj_eval(*d_ptr->model, d_ptr->n_threads, 0, { 0, 1, 2, 3 }, ctx.logits,
+            d_ptr->mem_per_token);
+        initialized = true;
+    }
+
+    return gptj_eval(*d_ptr->model, d_ptr->n_threads, ctx.n_past, tokens, ctx.logits, d_ptr->mem_per_token);
+}
+
+// Returns the model's n_ctx hyperparameter.
+int32_t GPTJ::contextLength() const
+{
+    return d_ptr->model->hparams.n_ctx;
+}
+
+// Returns the list of end tokens for this model: the single hard-coded id 50256.
+// The vector is a function-local static, so the returned reference stays valid.
+const std::vector<LLModel::Token> &GPTJ::endTokens() const
+{
+    static const std::vector<LLModel::Token> fres = {50256};
+    return fres;
+}
+
+// Reads the "general.architecture" value from a GGUF context.
+// Throws std::runtime_error when the key is absent or its value is not a string.
+// The returned pointer is whatever gguf_get_val_str() hands back for that key.
+const char *get_arch_name(gguf_context *ctx_gguf) {
+    const int key_index = gguf_find_key(ctx_gguf, "general.architecture");
+    if (key_index == -1) {
+        throw std::runtime_error("key not found in model: general.architecture");
+    }
+
+    if (gguf_get_kv_type(ctx_gguf, key_index) != GGUF_TYPE_STRING) {
+        throw std::runtime_error("key general.architecture has wrong type");
+    }
+
+    return gguf_get_val_str(ctx_gguf, key_index);
+}
+
+#if defined(_WIN32)
+#define DLL_EXPORT __declspec(dllexport)
+#else
+#define DLL_EXPORT __attribute__ ((visibility ("default")))
+#endif
+
+extern "C" {
+// Exported marker symbol; always returns true.
+DLL_EXPORT bool is_g4a_backend_model_implementation() {
+    return true;
+}
+
+// Returns the model type string of this implementation ("GPT-J").
+DLL_EXPORT const char *get_model_type() {
+    return modelType_;
+}
+
+// Returns the GGML_BUILD_VARIANT value this library was compiled with.
+DLL_EXPORT const char *get_build_variant() {
+    return GGML_BUILD_VARIANT;
+}
+
+// Returns a strdup()-allocated copy of the "general.architecture" value of the
+// GGUF file `fname`, or nullptr when the file cannot be opened, has a GGUF version
+// above 3, or the key cannot be read.
+DLL_EXPORT char *get_file_arch(const char *fname) {
+    struct ggml_context * ctx_meta = NULL;
+    struct gguf_init_params params = {
+        /*.no_alloc = */ true,
+        /*.ctx      = */ &ctx_meta,
+    };
+    gguf_context *ctx_gguf = gguf_init_from_file(fname, params);
+
+    char *arch = nullptr;
+    if (ctx_gguf && gguf_get_version(ctx_gguf) <= 3) {
+        try {
+            arch = strdup(get_arch_name(ctx_gguf));
+        } catch (const std::runtime_error &) {
+            // cannot read key -> return null
+        }
+    }
+
+    // Release the ggml context that a successful init handed over through
+    // params.ctx; it was never freed before.
+    // NOTE(review): only done after a successful init - ctx_meta is assumed not to
+    // be owned by the caller when gguf_init_from_file() fails; confirm against ggml.
+    if (ctx_gguf && ctx_meta) {
+        ggml_free(ctx_meta);
+    }
+    gguf_free(ctx_gguf);
+    return arch;
+}
+
+// Returns true only for the architecture name "gptj".
+// A null `arch` (get_file_arch() returns null on failure) is reported as unsupported.
+DLL_EXPORT bool is_arch_supported(const char *arch) {
+    return arch != nullptr && !strcmp(arch, "gptj");
+}
+
+// Creates a new GPTJ instance with `new` and returns it as an LLModel pointer.
+DLL_EXPORT LLModel *construct() {
+    return new GPTJ;
+}
+}
--- a/gpt4all-backend/gptj_impl.h
+++ b/gpt4all-backend/gptj_impl.h
@ -0,0 +1,42 @@
+#ifndef GPTJ_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
+#error This file is NOT meant to be included outside of gptj.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define GPTJ_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
+#endif
+#ifndef GPTJ_H
+#define GPTJ_H
+
+#include <string>
+#include <functional>
+#include <vector>
+#include "llmodel.h"
+
+struct GPTJPrivate;
+// LLModel implementation for GPT-J models. Declares no embedding support
+// (supportsEmbedding() is false) and completion support (supportsCompletion() is true).
+// All state lives behind d_ptr (GPTJPrivate, declared above).
+class GPTJ : public LLModel {
+public:
+    GPTJ();
+    ~GPTJ();
+
+    bool supportsEmbedding() const override { return false; }
+    bool supportsCompletion() const override { return true; }
+    bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override;
+    bool isModelLoaded() const override;
+    size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
+    // state (de)serialization
+    size_t stateSize() const override;
+    size_t saveState(uint8_t *dest) const override;
+    size_t restoreState(const uint8_t *src) override;
+    void setThreadCount(int32_t n_threads) override;
+    int32_t threadCount() const override;
+
+private:
+    // pointer to the private implementation state
+    GPTJPrivate *d_ptr;
+
+protected:
+    std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) const override;
+    Token sampleToken(PromptContext &ctx) const override;
+    std::string tokenToString(Token id) const override;
+    bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
+    int32_t contextLength() const override;
+    const std::vector<Token> &endTokens() const override;
+    // this model reports that no BOS token should be added
+    bool shouldAddBOS() const override { return false; }
+};
+
+#endif // GPTJ_H
--- a/gpt4all-backend/llama.cpp-mainline
+++ b/gpt4all-backend/llama.cpp-mainline
@ -0,0 +1 @@
+Subproject commit fadf1135a54e80188d644df42ad6a53bf986e8b0
--- a/gpt4all-backend/llama.cpp.cmake
+++ b/gpt4all-backend/llama.cpp.cmake
@ -7,7 +7,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
 #
 # some of the options here are commented out so they can be set "dynamically" before calling include_ggml()

-set(GGML_LLAMAFILE_DEFAULT ON)
+set(LLAMA_LLAMAFILE_DEFAULT ON)

 # general
 option(LLAMA_STATIC                     "llama: static link libraries"                          OFF)
@ -22,15 +22,15 @@ option(LLAMA_GPROF                      "llama: enable gprof"
 option(LLAMA_FATAL_WARNINGS             "llama: enable -Werror flag"                            OFF)

 # instruction set specific
-#option(GGML_AVX                     "ggml: enable AVX"                                     ON)
-#option(GGML_AVX2                    "ggml: enable AVX2"                                    ON)
-#option(GGML_AVX512                  "ggml: enable AVX512"                                  OFF)
-#option(GGML_AVX512_VBMI             "ggml: enable AVX512-VBMI"                             OFF)
-#option(GGML_AVX512_VNNI             "ggml: enable AVX512-VNNI"                             OFF)
-#option(GGML_FMA                     "ggml: enable FMA"                                     ON)
+#option(LLAMA_AVX                    "llama: enable AVX"                                     ON)
+#option(LLAMA_AVX2                   "llama: enable AVX2"                                    ON)
+#option(LLAMA_AVX512                 "llama: enable AVX512"                                  OFF)
+#option(LLAMA_AVX512_VBMI            "llama: enable AVX512-VBMI"                             OFF)
+#option(LLAMA_AVX512_VNNI            "llama: enable AVX512-VNNI"                             OFF)
+#option(LLAMA_FMA                    "llama: enable FMA"                                     ON)
 # in MSVC F16C is implied with AVX2/AVX512
 #if (NOT MSVC)
-#    option(GGML_F16C                "ggml: enable F16C"                                    ON)
+#    option(LLAMA_F16C               "llama: enable F16C"                                    ON)
 #endif()

 if (WIN32)
@ -38,46 +38,40 @@ if (WIN32)
 endif()

 # 3rd party libs
-option(GGML_ACCELERATE                      "ggml: enable Accelerate framework"               ON)
-option(GGML_BLAS                            "ggml: use BLAS"                                  OFF)
-option(GGML_LLAMAFILE                       "ggml: use llamafile SGEMM"                       ${GGML_LLAMAFILE_DEFAULT})
-set(GGML_BLAS_VENDOR "Generic" CACHE STRING "ggml: BLAS library vendor")
-
-#option(GGML_CUDA                            "ggml: use CUDA"                                  OFF)
-option(GGML_CUDA_FORCE_DMMV                 "ggml: use dmmv instead of mmvq CUDA kernels"     OFF)
-option(GGML_CUDA_FORCE_MMQ                  "ggml: use mmq kernels instead of cuBLAS"         OFF)
-option(GGML_CUDA_FORCE_CUBLAS               "ggml: always use cuBLAS instead of mmq kernels"  OFF)
-set   (GGML_CUDA_DMMV_X   "32" CACHE STRING "ggml: x stride for dmmv CUDA kernels")
-set   (GGML_CUDA_MMV_Y     "1" CACHE STRING "ggml: y block size for mmv CUDA kernels")
-option(GGML_CUDA_F16                        "ggml: use 16 bit floats for some calculations"   OFF)
-set   (GGML_CUDA_KQUANTS_ITER "2" CACHE STRING
-                                            "ggml: iters./thread per block for Q2_K/Q6_K")
-set   (GGML_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
-                                            "ggml: max. batch size for using peer access")
-option(GGML_CUDA_NO_PEER_COPY               "ggml: do not use peer to peer copies"            OFF)
-option(GGML_CUDA_NO_VMM                     "ggml: do not try to use CUDA VMM"                OFF)
-option(GGML_CUDA_FA_ALL_QUANTS              "ggml: compile all quants for FlashAttention"     OFF)
-option(GGML_CUDA_USE_GRAPHS                 "ggml: use CUDA graphs (llama.cpp only)"          OFF)
-
-#option(GGML_HIPBLAS                         "ggml: use hipBLAS"                               OFF)
-option(GGML_HIP_UMA                         "ggml: use HIP unified memory architecture"       OFF)
-#option(GGML_VULKAN                          "ggml: use Vulkan"                                OFF)
-option(GGML_VULKAN_CHECK_RESULTS            "ggml: run Vulkan op checks"                      OFF)
-option(GGML_VULKAN_DEBUG                    "ggml: enable Vulkan debug output"                OFF)
-option(GGML_VULKAN_VALIDATE                 "ggml: enable Vulkan validation"                  OFF)
-option(GGML_VULKAN_RUN_TESTS                "ggml: run Vulkan tests"                          OFF)
-#option(GGML_METAL                           "ggml: use Metal"                                 ${GGML_METAL_DEFAULT})
-option(GGML_METAL_NDEBUG                    "ggml: disable Metal debugging"                   OFF)
-option(GGML_METAL_SHADER_DEBUG              "ggml: compile Metal with -fno-fast-math"         OFF)
-set(GGML_METAL_MACOSX_VERSION_MIN "" CACHE STRING
-                                            "ggml: metal minimum macOS version")
-set(GGML_METAL_STD "" CACHE STRING          "ggml: metal standard version (-std flag)")
-#option(GGML_KOMPUTE                        "ggml: use Kompute"                               OFF)
-option(GGML_QKK_64                          "ggml: use super-block size of 64 for k-quants"   OFF)
-set(GGML_SCHED_MAX_COPIES  "4" CACHE STRING "ggml: max input copies for pipeline parallelism")
+option(LLAMA_ACCELERATE                      "llama: enable Accelerate framework"               ON)
+option(LLAMA_BLAS                            "llama: use BLAS"                                  OFF)
+option(LLAMA_LLAMAFILE                       "llama: use llamafile SGEMM"                       ${LLAMA_LLAMAFILE_DEFAULT})
+set(LLAMA_BLAS_VENDOR "Generic" CACHE STRING "llama: BLAS library vendor")
+#option(LLAMA_CUDA                            "llama: use CUDA"                                  OFF)
+option(LLAMA_CUDA_FORCE_DMMV                 "llama: use dmmv instead of mmvq CUDA kernels"     OFF)
+option(LLAMA_CUDA_FORCE_MMQ                  "llama: use mmq kernels instead of cuBLAS"         OFF)
+set(LLAMA_CUDA_DMMV_X      "32" CACHE STRING "llama: x stride for dmmv CUDA kernels")
+set(LLAMA_CUDA_MMV_Y        "1" CACHE STRING "llama: y block size for mmv CUDA kernels")
+option(LLAMA_CUDA_F16                        "llama: use 16 bit floats for some calculations"   OFF)
+set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K")
+set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
+                                             "llama: max. batch size for using peer access")
+option(LLAMA_CUDA_NO_PEER_COPY               "llama: do not use peer to peer copies"            OFF)
+#option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
+option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
+#option(LLAMA_CLBLAST                         "llama: use CLBlast"                               OFF)
+#option(LLAMA_VULKAN                          "llama: use Vulkan"                                OFF)
+option(LLAMA_VULKAN_CHECK_RESULTS            "llama: run Vulkan op checks"                      OFF)
+option(LLAMA_VULKAN_DEBUG                    "llama: enable Vulkan debug output"                OFF)
+option(LLAMA_VULKAN_VALIDATE                 "llama: enable Vulkan validation"                  OFF)
+option(LLAMA_VULKAN_RUN_TESTS                "llama: run Vulkan tests"                          OFF)
+#option(LLAMA_METAL                           "llama: use Metal"                                 ${LLAMA_METAL_DEFAULT})
+option(LLAMA_METAL_NDEBUG                    "llama: disable Metal debugging"                   OFF)
+option(LLAMA_METAL_SHADER_DEBUG              "llama: compile Metal with -fno-fast-math"         OFF)
+set(LLAMA_METAL_MACOSX_VERSION_MIN "" CACHE STRING
+                                             "llama: metal minimum macOS version")
+set(LLAMA_METAL_STD "" CACHE STRING          "llama: metal standard version (-std flag)")
+#option(LLAMA_KOMPUTE                         "llama: use Kompute"                               OFF)
+option(LLAMA_QKK_64                          "llama: use super-block size of 64 for k-quants"   OFF)
+set(LLAMA_SCHED_MAX_COPIES  "4" CACHE STRING "llama: max input copies for pipeline parallelism")

 # add perf arguments
-option(LLAMA_PERF                           "llama: enable perf"                               OFF)
+option(LLAMA_PERF                            "llama: enable perf"                               OFF)

 #
 # Compile flags
@ -86,14 +80,14 @@ option(LLAMA_PERF                           "llama: enable perf"
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)

-list(APPEND GGML_COMPILE_DEFS GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})
+list(APPEND GGML_COMPILE_DEFS GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES})

 # enable libstdc++ assertions for debug builds
 if (CMAKE_SYSTEM_NAME MATCHES "Linux")
    list(APPEND GGML_COMPILE_DEFS $<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
 endif()

-if (APPLE AND GGML_ACCELERATE)
+if (APPLE AND LLAMA_ACCELERATE)
    find_library(ACCELERATE_FRAMEWORK Accelerate)
    if (ACCELERATE_FRAMEWORK)
        message(STATUS "Accelerate framework found")
@ -107,7 +101,7 @@ if (APPLE AND GGML_ACCELERATE)
    endif()
 endif()

-if (GGML_BLAS)
+if (LLAMA_BLAS)
    if (LLAMA_STATIC)
        set(BLA_STATIC ON)
    endif()
@ -115,7 +109,7 @@ if (GGML_BLAS)
        set(BLA_SIZEOF_INTEGER 8)
    endif()

-    set(BLA_VENDOR ${GGML_BLAS_VENDOR})
+    set(BLA_VENDOR ${LLAMA_BLAS_VENDOR})
    find_package(BLAS)

    if (BLAS_FOUND)
@ -125,24 +119,24 @@ if (GGML_BLAS)
            # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
            # see https://gitlab.kitware.com/cmake/cmake/-/issues/20268
            find_package(PkgConfig REQUIRED)
-            if (${GGML_BLAS_VENDOR} MATCHES "Generic")
+            if (${LLAMA_BLAS_VENDOR} MATCHES "Generic")
                pkg_check_modules(DepBLAS REQUIRED blas)
-            elseif (${GGML_BLAS_VENDOR} MATCHES "OpenBLAS")
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "OpenBLAS")
                # As of OpenBLAS v0.3.22, the 64-bit variant's pkg-config file is named openblas64.pc
                pkg_check_modules(DepBLAS openblas64)
                if (NOT DepBLAS_FOUND)
                    pkg_check_modules(DepBLAS REQUIRED openblas)
                endif()
-            elseif (${GGML_BLAS_VENDOR} MATCHES "FLAME")
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FLAME")
                pkg_check_modules(DepBLAS REQUIRED blis)
-            elseif (${GGML_BLAS_VENDOR} MATCHES "ATLAS")
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "ATLAS")
                pkg_check_modules(DepBLAS REQUIRED blas-atlas)
-            elseif (${GGML_BLAS_VENDOR} MATCHES "FlexiBLAS")
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "FlexiBLAS")
                pkg_check_modules(DepBLAS REQUIRED flexiblas_api)
-            elseif (${GGML_BLAS_VENDOR} MATCHES "Intel")
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "Intel")
                # all Intel* libraries share the same include path
                pkg_check_modules(DepBLAS REQUIRED mkl-sdl)
-            elseif (${GGML_BLAS_VENDOR} MATCHES "NVHPC")
+            elseif (${LLAMA_BLAS_VENDOR} MATCHES "NVHPC")
                # NVHPC doesn't provide a pkg-config file;
                # set BLAS_INCLUDE_DIRS yourself instead
                if ("${NVHPC_VERSION}" STREQUAL "")
@ -176,7 +170,7 @@ if (GGML_BLAS)

        list(APPEND GGML_COMPILE_DEFS GGML_USE_OPENBLAS)

-        if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${GGML_BLAS_VENDOR} MATCHES "Generic" OR ${GGML_BLAS_VENDOR} MATCHES "Intel"))
+        if (${BLAS_INCLUDE_DIRS} MATCHES "mkl" AND (${LLAMA_BLAS_VENDOR} MATCHES "Generic" OR ${LLAMA_BLAS_VENDOR} MATCHES "Intel"))
            list(APPEND GGML_COMPILE_DEFS GGML_BLAS_USE_MKL)
        endif()

@ -185,18 +179,18 @@ if (GGML_BLAS)
    else()
        message(WARNING "BLAS not found, please refer to "
        "https://cmake.org/cmake/help/latest/module/FindBLAS.html#blas-lapack-vendors"
-        " to set correct GGML_BLAS_VENDOR")
+        " to set correct LLAMA_BLAS_VENDOR")
    endif()
 endif()

-if (GGML_LLAMAFILE)
+if (LLAMA_LLAMAFILE)
    list(APPEND GGML_COMPILE_DEFS GGML_USE_LLAMAFILE)

-    set(GGML_HEADERS_LLAMAFILE ${DIRECTORY}/ggml/src/llamafile/sgemm.h)
-    set(GGML_SOURCES_LLAMAFILE ${DIRECTORY}/ggml/src/llamafile/sgemm.cpp)
+    set(GGML_HEADERS_LLAMAFILE ${DIRECTORY}/sgemm.h)
+    set(GGML_SOURCES_LLAMAFILE ${DIRECTORY}/sgemm.cpp)
 endif()

-if (GGML_QKK_64)
+if (LLAMA_QKK_64)
    list(APPEND GGML_COMPILE_DEFS GGML_QKK_64)
 endif()

@ -367,9 +361,8 @@ function(include_ggml SUFFIX)
    # libraries
    #

-    if (GGML_CUDA)
-        cmake_minimum_required(VERSION 3.18)  # for CMAKE_CUDA_ARCHITECTURES
-
+    if (LLAMA_CUDA)
+        cmake_minimum_required(VERSION 3.17)
        get_property(LANGS GLOBAL PROPERTY ENABLED_LANGUAGES)
        if (NOT CUDA IN_LIST LANGS)
            message(FATAL_ERROR "The CUDA language must be enabled.")
@ -378,64 +371,26 @@ function(include_ggml SUFFIX)
        find_package(CUDAToolkit REQUIRED)
        set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)

-        # architectures are set in gpt4all-backend/CMakeLists.txt
+        set(GGML_HEADERS_CUDA ${DIRECTORY}/ggml-cuda.h)

-        set(GGML_HEADERS_CUDA ${DIRECTORY}/ggml/include/ggml-cuda.h)
-        file(GLOB   GGML_HEADERS_CUDA "${DIRECTORY}/ggml/src/ggml-cuda/*.cuh")
-        list(APPEND GGML_HEADERS_CUDA "${DIRECTORY}/ggml/include/ggml-cuda.h")
-
-        file(GLOB   GGML_SOURCES_CUDA "${DIRECTORY}/ggml/src/ggml-cuda/*.cu")
-        list(APPEND GGML_SOURCES_CUDA "${DIRECTORY}/ggml/src/ggml-cuda.cu")
-        file(GLOB   SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-wmma*.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        file(GLOB   SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/mmq*.cu")
-        list(APPEND GGML_SOURCES_CUDA ${SRCS})
-
-        if (GGML_CUDA_FA_ALL_QUANTS)
-            file(GLOB   SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-            add_compile_definitions(GGML_CUDA_FA_ALL_QUANTS)
-        else()
-            file(GLOB   SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*q4_0-q4_0.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-            file(GLOB   SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*q8_0-q8_0.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-            file(GLOB   SRCS "${DIRECTORY}/ggml/src/ggml-cuda/template-instances/fattn-vec*f16-f16.cu")
-            list(APPEND GGML_SOURCES_CUDA ${SRCS})
-        endif()
+        file(GLOB GGML_SOURCES_CUDA "${DIRECTORY}/ggml-cuda/*.cu")
+        list(APPEND GGML_SOURCES_CUDA "${DIRECTORY}/ggml-cuda.cu")

        list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_CUDA)
-
-        list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
-        list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
-        list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
-        list(APPEND GGML_COMPILE_DEFS GGML_CUDA_PEER_MAX_BATCH_SIZE=${GGML_CUDA_PEER_MAX_BATCH_SIZE})
-
-        if (GGML_CUDA_USE_GRAPHS)
-            list(APPEND GGML_COMPILE_DEFS GGML_CUDA_USE_GRAPHS)
-        endif()
-
-        if (GGML_CUDA_FORCE_DMMV)
+        if (LLAMA_CUDA_FORCE_DMMV)
            list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_DMMV)
        endif()
-
-        if (GGML_CUDA_FORCE_MMQ)
+        if (LLAMA_CUDA_FORCE_MMQ)
            list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_MMQ)
        endif()
-
-        if (GGML_CUDA_FORCE_CUBLAS)
-            list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_CUBLAS)
-        endif()
-
-        if (GGML_CUDA_NO_VMM)
-            list(APPEND GGML_COMPILE_DEFS GGML_CUDA_NO_VMM)
-        endif()
-
-        if (GGML_CUDA_F16)
+        list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+        list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+        if (LLAMA_CUDA_F16)
            list(APPEND GGML_COMPILE_DEFS GGML_CUDA_F16)
        endif()
-
-        if (GGML_CUDA_NO_PEER_COPY)
+        list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})
+        list(APPEND GGML_COMPILE_DEFS GGML_CUDA_PEER_MAX_BATCH_SIZE=${LLAMA_CUDA_PEER_MAX_BATCH_SIZE})
+        if (LLAMA_CUDA_NO_PEER_COPY)
            list(APPEND GGML_COMPILE_DEFS GGML_CUDA_NO_PEER_COPY)
        endif()

@ -451,36 +406,63 @@ function(include_ggml SUFFIX)
        endif()

        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
+
+        if (DEFINED CMAKE_CUDA_ARCHITECTURES)
+            set(GGML_CUDA_ARCHITECTURES "${CMAKE_CUDA_ARCHITECTURES}")
+        else()
+            # 52 == lowest CUDA 12 standard
+            # 60 == f16 CUDA intrinsics
+            # 61 == integer CUDA intrinsics
+            # 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
+            if (LLAMA_CUDA_F16 OR LLAMA_CUDA_DMMV_F16)
+                set(GGML_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics
+            else()
+                set(GGML_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics
+                #set(GGML_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work
+            endif()
+        endif()
+        message(STATUS "Using CUDA architectures: ${GGML_CUDA_ARCHITECTURES}")
    endif()

-    if (GGML_VULKAN)
+    if (LLAMA_CLBLAST)
+        find_package(CLBlast REQUIRED)
+
+        set(GGML_HEADERS_OPENCL ${DIRECTORY}/ggml-opencl.h)
+        set(GGML_SOURCES_OPENCL ${DIRECTORY}/ggml-opencl.cpp)
+
+        list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_CLBLAST)
+
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} clblast)
+    endif()
+
+    if (LLAMA_VULKAN)
        find_package(Vulkan REQUIRED)

-        set(GGML_HEADERS_VULKAN ${DIRECTORY}/ggml/include/ggml-vulkan.h)
-        set(GGML_SOURCES_VULKAN ${DIRECTORY}/ggml/src/ggml-vulkan.cpp)
+        set(GGML_HEADERS_VULKAN ${DIRECTORY}/ggml-vulkan.h)
+        set(GGML_SOURCES_VULKAN ${DIRECTORY}/ggml-vulkan.cpp)

        list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_VULKAN)

-        if (GGML_VULKAN_CHECK_RESULTS)
+        if (LLAMA_VULKAN_CHECK_RESULTS)
            list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_CHECK_RESULTS)
        endif()

-        if (GGML_VULKAN_DEBUG)
+        if (LLAMA_VULKAN_DEBUG)
            list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_DEBUG)
        endif()

-        if (GGML_VULKAN_VALIDATE)
+        if (LLAMA_VULKAN_VALIDATE)
            list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_VALIDATE)
        endif()

-        if (GGML_VULKAN_RUN_TESTS)
+        if (LLAMA_VULKAN_RUN_TESTS)
            list(APPEND GGML_COMPILE_DEFS GGML_VULKAN_RUN_TESTS)
        endif()

        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} Vulkan::Vulkan)
    endif()

-    if (GGML_HIPBLAS)
+    if (LLAMA_HIPBLAS)
        if ($ENV{ROCM_PATH})
            set(ROCM_PATH $ENV{ROCM_PATH})
        else()
@ -510,32 +492,32 @@ function(include_ggml SUFFIX)

        message(STATUS "HIP and hipBLAS found")

-        set(GGML_HEADERS_ROCM ${DIRECTORY}/ggml/include/ggml-cuda.h)
+        set(GGML_HEADERS_ROCM ${DIRECTORY}/ggml-cuda.h)

-        file(GLOB GGML_SOURCES_ROCM "${DIRECTORY}/ggml/src/ggml-rocm/*.cu")
-        list(APPEND GGML_SOURCES_ROCM "${DIRECTORY}/ggml/src/ggml-rocm.cu")
+        file(GLOB GGML_SOURCES_ROCM "${DIRECTORY}/ggml-rocm/*.cu")
+        list(APPEND GGML_SOURCES_ROCM "${DIRECTORY}/ggml-rocm.cu")

        list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_HIPBLAS GGML_USE_CUDA)

-        if (GGML_HIP_UMA)
+        if (LLAMA_HIP_UMA)
            list(APPEND GGML_COMPILE_DEFS GGML_HIP_UMA)
        endif()

-        if (GGML_CUDA_FORCE_DMMV)
+        if (LLAMA_CUDA_FORCE_DMMV)
            list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_DMMV)
        endif()

-        if (GGML_CUDA_FORCE_MMQ)
+        if (LLAMA_CUDA_FORCE_MMQ)
            list(APPEND GGML_COMPILE_DEFS GGML_CUDA_FORCE_MMQ)
        endif()

-        if (GGML_CUDA_NO_PEER_COPY)
+        if (LLAMA_CUDA_NO_PEER_COPY)
            list(APPEND GGML_COMPILE_DEFS GGML_CUDA_NO_PEER_COPY)
        endif()

-        list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${GGML_CUDA_DMMV_X})
-        list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${GGML_CUDA_MMV_Y})
-        list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${GGML_CUDA_KQUANTS_ITER})
+        list(APPEND GGML_COMPILE_DEFS GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
+        list(APPEND GGML_COMPILE_DEFS GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
+        list(APPEND GGML_COMPILE_DEFS K_QUANTS_PER_ITERATION=${LLAMA_CUDA_KQUANTS_ITER})

        if (CXX_IS_HIPCC)
            set_source_files_properties(${GGML_SOURCES_ROCM} PROPERTIES LANGUAGE CXX)
@ -553,9 +535,9 @@ function(include_ggml SUFFIX)

    set(LLAMA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/${DIRECTORY})

-    if (GGML_KOMPUTE AND NOT GGML_KOMPUTE_ONCE)
+    if (LLAMA_KOMPUTE AND NOT GGML_KOMPUTE_ONCE)
        set(GGML_KOMPUTE_ONCE ON PARENT_SCOPE)
-        if (NOT EXISTS "${LLAMA_DIR}/ggml/src/kompute/CMakeLists.txt")
+        if (NOT EXISTS "${LLAMA_DIR}/kompute/CMakeLists.txt")
            message(FATAL_ERROR "Kompute not found")
        endif()
        message(STATUS "Kompute found")
@ -579,12 +561,12 @@ function(include_ggml SUFFIX)
                set(spv_file ${CMAKE_CURRENT_BINARY_DIR}/${OP_FILE}.spv)
                add_custom_command(
                    OUTPUT ${spv_file}
-                    DEPENDS ${LLAMA_DIR}/ggml/src/kompute-shaders/${source}
-                        ${LLAMA_DIR}/ggml/src/kompute-shaders/common.comp
-                        ${LLAMA_DIR}/ggml/src/kompute-shaders/op_getrows.comp
-                        ${LLAMA_DIR}/ggml/src/kompute-shaders/op_mul_mv_q_n_pre.comp
-                        ${LLAMA_DIR}/ggml/src/kompute-shaders/op_mul_mv_q_n.comp
-                    COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/ggml/src/kompute-shaders/${source}
+                    DEPENDS ${LLAMA_DIR}/${source}
+                        ${LLAMA_DIR}/kompute-shaders/common.comp
+                        ${LLAMA_DIR}/kompute-shaders/op_getrows.comp
+                        ${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n_pre.comp
+                        ${LLAMA_DIR}/kompute-shaders/op_mul_mv_q_n.comp
+                    COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/${source}
                    COMMENT "Compiling ${source} to ${source}.spv"
                    )

@ -630,39 +612,39 @@ function(include_ggml SUFFIX)
        set(KOMPUTE_OPT_BUILT_IN_VULKAN_HEADER_TAG "v1.3.239" CACHE STRING "Kompute Vulkan headers tag")
        set(KOMPUTE_OPT_LOG_LEVEL Critical CACHE STRING "Kompute log level")
        set(FMT_INSTALL OFF)
-        add_subdirectory(${LLAMA_DIR}/ggml/src/kompute)
+        add_subdirectory(${LLAMA_DIR}/kompute)

        # Compile our shaders
        compile_shader(SOURCES
-            op_scale.comp
-            op_scale_8.comp
-            op_add.comp
-            op_addrow.comp
-            op_mul.comp
-            op_silu.comp
-            op_relu.comp
-            op_gelu.comp
-            op_softmax.comp
-            op_norm.comp
-            op_rmsnorm.comp
-            op_diagmask.comp
-            op_mul_mat_mat_f32.comp
-            op_mul_mat_f16.comp
-            op_mul_mat_q8_0.comp
-            op_mul_mat_q4_0.comp
-            op_mul_mat_q4_1.comp
-            op_mul_mat_q6_k.comp
-            op_getrows_f32.comp
-            op_getrows_f16.comp
-            op_getrows_q4_0.comp
-            op_getrows_q4_1.comp
-            op_getrows_q6_k.comp
-            op_rope_f16.comp
-            op_rope_f32.comp
-            op_cpy_f16_f16.comp
-            op_cpy_f16_f32.comp
-            op_cpy_f32_f16.comp
-            op_cpy_f32_f32.comp
+            kompute-shaders/op_scale.comp
+            kompute-shaders/op_scale_8.comp
+            kompute-shaders/op_add.comp
+            kompute-shaders/op_addrow.comp
+            kompute-shaders/op_mul.comp
+            kompute-shaders/op_silu.comp
+            kompute-shaders/op_relu.comp
+            kompute-shaders/op_gelu.comp
+            kompute-shaders/op_softmax.comp
+            kompute-shaders/op_norm.comp
+            kompute-shaders/op_rmsnorm.comp
+            kompute-shaders/op_diagmask.comp
+            kompute-shaders/op_mul_mat_mat_f32.comp
+            kompute-shaders/op_mul_mat_f16.comp
+            kompute-shaders/op_mul_mat_q8_0.comp
+            kompute-shaders/op_mul_mat_q4_0.comp
+            kompute-shaders/op_mul_mat_q4_1.comp
+            kompute-shaders/op_mul_mat_q6_k.comp
+            kompute-shaders/op_getrows_f32.comp
+            kompute-shaders/op_getrows_f16.comp
+            kompute-shaders/op_getrows_q4_0.comp
+            kompute-shaders/op_getrows_q4_1.comp
+            kompute-shaders/op_getrows_q6_k.comp
+            kompute-shaders/op_rope_f16.comp
+            kompute-shaders/op_rope_f32.comp
+            kompute-shaders/op_cpy_f16_f16.comp
+            kompute-shaders/op_cpy_f16_f32.comp
+            kompute-shaders/op_cpy_f32_f16.comp
+            kompute-shaders/op_cpy_f32_f32.comp
        )

        # Create a custom target for our generated shaders
@ -707,12 +689,12 @@ function(include_ggml SUFFIX)
        )
    endif()

-    if (GGML_KOMPUTE)
+    if (LLAMA_KOMPUTE)
        list(APPEND GGML_COMPILE_DEFS VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)

        # Add the stamp to the main sources to ensure dependency tracking
-        set(GGML_SOURCES_KOMPUTE ${LLAMA_DIR}/ggml/src/ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
-        set(GGML_HEADERS_KOMPUTE ${LLAMA_DIR}/ggml/include/ggml-kompute.h)
+        set(GGML_SOURCES_KOMPUTE ${LLAMA_DIR}/ggml-kompute.cpp ${CMAKE_CURRENT_BINARY_DIR}/ggml-kompute.stamp)
+        set(GGML_HEADERS_KOMPUTE ${LLAMA_DIR}/ggml-kompute.h)

        list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_KOMPUTE)

@ -721,7 +703,7 @@ function(include_ggml SUFFIX)

    set(CUDA_CXX_FLAGS "")

-    if (GGML_CUDA)
+    if (LLAMA_CUDA)
        set(CUDA_FLAGS -use_fast_math)

        if (LLAMA_FATAL_WARNINGS)
@ -768,25 +750,25 @@ function(include_ggml SUFFIX)
        endif()
    endif()

-    if (GGML_METAL)
+    if (LLAMA_METAL)
        find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
        find_library(METAL_FRAMEWORK    Metal      REQUIRED)
        find_library(METALKIT_FRAMEWORK MetalKit   REQUIRED)

        message(STATUS "Metal framework found")
-        set(GGML_HEADERS_METAL ${DIRECTORY}/ggml/include/ggml-metal.h)
-        set(GGML_SOURCES_METAL ${DIRECTORY}/ggml/src/ggml-metal.m)
+        set(GGML_HEADERS_METAL ${DIRECTORY}/ggml-metal.h)
+        set(GGML_SOURCES_METAL ${DIRECTORY}/ggml-metal.m)

        list(APPEND GGML_COMPILE_DEFS_PUBLIC GGML_USE_METAL)
-        if (GGML_METAL_NDEBUG)
+        if (LLAMA_METAL_NDEBUG)
            list(APPEND GGML_COMPILE_DEFS GGML_METAL_NDEBUG)
        endif()

        # copy ggml-common.h and ggml-metal.metal to bin directory
-        configure_file(${DIRECTORY}/ggml/src/ggml-common.h    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h    COPYONLY)
-        configure_file(${DIRECTORY}/ggml/src/ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)
+        configure_file(${DIRECTORY}/ggml-common.h    ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h    COPYONLY)
+        configure_file(${DIRECTORY}/ggml-metal.metal ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal COPYONLY)

-        if (GGML_METAL_SHADER_DEBUG)
+        if (LLAMA_METAL_SHADER_DEBUG)
            # custom command to do the following:
            #   xcrun -sdk macosx metal    -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
            #   xcrun -sdk macosx metallib                   ggml-metal.air   -o default.metallib
@ -802,17 +784,16 @@ function(include_ggml SUFFIX)
        endif()

        # Append macOS metal versioning flags
-        if (GGML_METAL_MACOSX_VERSION_MIN)
-            message(STATUS "Adding -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN} flag to metal compilation")
-            list(APPEND XC_FLAGS -mmacosx-version-min=${GGML_METAL_MACOSX_VERSION_MIN})
+        if (LLAMA_METAL_MACOSX_VERSION_MIN)
+            message(STATUS "Adding -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN} flag to metal compilation")
+            list(APPEND XC_FLAGS -mmacosx-version-min=${LLAMA_METAL_MACOSX_VERSION_MIN})
        endif()
-        if (GGML_METAL_STD)
-            message(STATUS "Adding -std=${GGML_METAL_STD} flag to metal compilation")
-            list(APPEND XC_FLAGS -std=${GGML_METAL_STD})
+        if (LLAMA_METAL_STD)
+            message(STATUS "Adding -std=${LLAMA_METAL_STD} flag to metal compilation")
+            list(APPEND XC_FLAGS -std=${LLAMA_METAL_STD})
        endif()

-        set(GGML_METALLIB "${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib")
-        set(GGML_METALLIB "${GGML_METALLIB}" PARENT_SCOPE)
+        set(GGML_METALLIB ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/default.metallib)
        add_custom_command(
            OUTPUT ${GGML_METALLIB}
            COMMAND xcrun -sdk macosx metal    ${XC_FLAGS} -c ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal -o ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
@ -820,9 +801,10 @@ function(include_ggml SUFFIX)
            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.air
            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-common.h
            COMMAND rm -f ${CMAKE_RUNTIME_OUTPUT_DIRECTORY}/ggml-metal.metal
-            DEPENDS ${DIRECTORY}/ggml/src/ggml-metal.metal ${DIRECTORY}/ggml/src/ggml-common.h
+            DEPENDS ${DIRECTORY}/ggml-metal.metal ${DIRECTORY}/ggml-common.h
            COMMENT "Compiling Metal kernels"
            )
+        set_source_files_properties(${GGML_METALLIB} DIRECTORY ${CMAKE_SOURCE_DIR} PROPERTIES GENERATED ON)

        add_custom_target(
            ggml-metal ALL
@ -873,49 +855,49 @@ function(include_ggml SUFFIX)
             CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|i686|AMD64)$"))
        message(STATUS "x86 detected")
        if (MSVC)
-            if (GGML_AVX512)
+            if (LLAMA_AVX512)
                list(APPEND ARCH_FLAGS /arch:AVX512)
                # MSVC has no compile-time flags for enabling specific
                # AVX512 extensions, nor does it define the
                # macros corresponding to those extensions.
                # Define them manually.
-                if (GGML_AVX512_VBMI)
+                if (LLAMA_AVX512_VBMI)
                    list(APPEND GGML_COMPILE_DEFS $<$<COMPILE_LANGUAGE:C>:__AVX512VBMI__>)
                    list(APPEND GGML_COMPILE_DEFS $<$<COMPILE_LANGUAGE:CXX>:__AVX512VBMI__>)
                endif()
-                if (GGML_AVX512_VNNI)
+                if (LLAMA_AVX512_VNNI)
                    list(APPEND GGML_COMPILE_DEFS $<$<COMPILE_LANGUAGE:C>:__AVX512VNNI__>)
                    list(APPEND GGML_COMPILE_DEFS $<$<COMPILE_LANGUAGE:CXX>:__AVX512VNNI__>)
                endif()
-            elseif (GGML_AVX2)
+            elseif (LLAMA_AVX2)
                list(APPEND ARCH_FLAGS /arch:AVX2)
-            elseif (GGML_AVX)
+            elseif (LLAMA_AVX)
                list(APPEND ARCH_FLAGS /arch:AVX)
            endif()
        else()
-            if (GGML_NATIVE)
+            if (LLAMA_NATIVE)
                list(APPEND ARCH_FLAGS -march=native)
            endif()
-            if (GGML_F16C)
+            if (LLAMA_F16C)
                list(APPEND ARCH_FLAGS -mf16c)
            endif()
-            if (GGML_FMA)
+            if (LLAMA_FMA)
                list(APPEND ARCH_FLAGS -mfma)
            endif()
-            if (GGML_AVX)
+            if (LLAMA_AVX)
                list(APPEND ARCH_FLAGS -mavx)
            endif()
-            if (GGML_AVX2)
+            if (LLAMA_AVX2)
                list(APPEND ARCH_FLAGS -mavx2)
            endif()
-            if (GGML_AVX512)
+            if (LLAMA_AVX512)
                list(APPEND ARCH_FLAGS -mavx512f)
                list(APPEND ARCH_FLAGS -mavx512bw)
            endif()
-            if (GGML_AVX512_VBMI)
+            if (LLAMA_AVX512_VBMI)
                list(APPEND ARCH_FLAGS -mavx512vbmi)
            endif()
-            if (GGML_AVX512_VNNI)
+            if (LLAMA_AVX512_VNNI)
                list(APPEND ARCH_FLAGS -mavx512vnni)
            endif()
        endif()
@ -934,7 +916,7 @@ function(include_ggml SUFFIX)
    list(APPEND GGML_COMPILE_OPTS "$<$<COMPILE_LANGUAGE:CXX>:${ARCH_FLAGS}>")
    list(APPEND GGML_COMPILE_OPTS "$<$<COMPILE_LANGUAGE:C>:${ARCH_FLAGS}>")

-    if (GGML_CUDA)
+    if (LLAMA_CUDA)
        list(APPEND CUDA_CXX_FLAGS ${ARCH_FLAGS})
        list(JOIN CUDA_CXX_FLAGS " " CUDA_CXX_FLAGS_JOINED)  # pass host compiler flags as a single argument
        if (NOT CUDA_CXX_FLAGS_JOINED STREQUAL "")
@ -946,26 +928,24 @@ function(include_ggml SUFFIX)
    # ggml

    add_library(ggml${SUFFIX} OBJECT
-                ${DIRECTORY}/ggml/include/ggml.h
-                ${DIRECTORY}/ggml/include/ggml-alloc.h
-                ${DIRECTORY}/ggml/include/ggml-backend.h
-                ${DIRECTORY}/ggml/src/ggml.c
-                ${DIRECTORY}/ggml/src/ggml-alloc.c
-                ${DIRECTORY}/ggml/src/ggml-backend.c
-                ${DIRECTORY}/ggml/src/ggml-quants.c
-                ${DIRECTORY}/ggml/src/ggml-quants.h
+                ${DIRECTORY}/ggml.c
+                ${DIRECTORY}/ggml.h
+                ${DIRECTORY}/ggml-alloc.c
+                ${DIRECTORY}/ggml-alloc.h
+                ${DIRECTORY}/ggml-backend.c
+                ${DIRECTORY}/ggml-backend.h
+                ${DIRECTORY}/ggml-quants.c
+                ${DIRECTORY}/ggml-quants.h
                ${GGML_SOURCES_CUDA}      ${GGML_HEADERS_CUDA}
+                ${GGML_SOURCES_OPENCL}    ${GGML_HEADERS_OPENCL}
                ${GGML_SOURCES_METAL}     ${GGML_HEADERS_METAL}
                ${GGML_SOURCES_KOMPUTE}   ${GGML_HEADERS_KOMPUTE}
                ${GGML_SOURCES_VULKAN}    ${GGML_HEADERS_VULKAN}
                ${GGML_SOURCES_ROCM}      ${GGML_HEADERS_ROCM}
                ${GGML_SOURCES_LLAMAFILE} ${GGML_HEADERS_LLAMAFILE}
-                ${DIRECTORY}/ggml/src/ggml-aarch64.c
-                ${DIRECTORY}/ggml/src/ggml-aarch64.h
                )

-    target_include_directories(ggml${SUFFIX} PUBLIC ${DIRECTORY}/ggml/include ${LLAMA_EXTRA_INCLUDES})
-    target_include_directories(ggml${SUFFIX} PRIVATE ${DIRECTORY}/ggml/src)
+    target_include_directories(ggml${SUFFIX} PUBLIC ${DIRECTORY} ${LLAMA_EXTRA_INCLUDES})
    target_compile_features(ggml${SUFFIX} PUBLIC c_std_11) # don't bump

    target_link_libraries(ggml${SUFFIX} PUBLIC Threads::Threads ${LLAMA_EXTRA_LIBS})
@ -977,18 +957,14 @@ function(include_ggml SUFFIX)
    # llama

    add_library(llama${SUFFIX} STATIC
-                ${DIRECTORY}/include/llama.h
-                ${DIRECTORY}/src/llama-grammar.cpp
-                ${DIRECTORY}/src/llama-sampling.cpp
-                ${DIRECTORY}/src/llama-vocab.cpp
-                ${DIRECTORY}/src/llama.cpp
-                ${DIRECTORY}/src/unicode-data.cpp
-                ${DIRECTORY}/src/unicode.cpp
-                ${DIRECTORY}/src/unicode.h
+                ${DIRECTORY}/llama.cpp
+                ${DIRECTORY}/llama.h
+                ${DIRECTORY}/unicode.h
+                ${DIRECTORY}/unicode.cpp
+                ${DIRECTORY}/unicode-data.cpp
                )

-    target_include_directories(llama${SUFFIX} PUBLIC  ${DIRECTORY}/include ${DIRECTORY}/ggml/include)
-    target_include_directories(llama${SUFFIX} PRIVATE ${DIRECTORY}/src)
+    target_include_directories(llama${SUFFIX} PUBLIC ${DIRECTORY})
    target_compile_features   (llama${SUFFIX} PUBLIC cxx_std_11) # don't bump

    target_link_libraries(llama${SUFFIX} PRIVATE
@ -1009,6 +985,9 @@ function(include_ggml SUFFIX)
        C_STANDARD 11
        C_STANDARD_REQUIRED true
        )
+    if (GGML_CUDA_ARCHITECTURES)
+        set_property(TARGET ggml${SUFFIX} llama${SUFFIX} PROPERTY CUDA_ARCHITECTURES "${GGML_CUDA_ARCHITECTURES}")
+    endif()

    target_compile_options(ggml${SUFFIX} PRIVATE "${GGML_COMPILE_OPTS}")
    target_compile_options(llama${SUFFIX} PRIVATE "${GGML_COMPILE_OPTS}")
--- a/gpt4all-backend/src/llamamodel.cpp
+++ b/gpt4all-backend/src/llamamodel.cpp
@ -1,45 +1,36 @@
 #define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
 #include "llamamodel_impl.h"

-#include "llmodel.h"
-#include "utils.h"
-
-#include <ggml.h>
-#include <llama.h>
-
-#include <algorithm>
 #include <cassert>
 #include <cmath>
-#include <cstdint>
 #include <cstdio>
-#include <cstdlib>
 #include <cstring>
 #include <fstream>
-#include <functional>
 #include <initializer_list>
 #include <iomanip>
 #include <iostream>
-#include <iterator>
-#include <memory>
+#include <map>
 #include <numeric>
-#include <optional>
+#include <random>
 #include <sstream>
 #include <stdexcept>
 #include <string>
 #include <thread>
+#include <unordered_set>
 #include <vector>

+#include <llama.h>
+#include <ggml.h>
 #ifdef GGML_USE_KOMPUTE
 #   include <ggml-kompute.h>
-#elif defined(GGML_USE_VULKAN)
+#elif defined(GGML_USE_VULKAN)
 #   include <ggml-vulkan.h>
-#elif defined(GGML_USE_CUDA)
+#elif defined(GGML_USE_CUDA)
 #   include <ggml-cuda.h>
 #endif

 using namespace std::string_literals;

-
 // Maximum supported GGUF version
 static constexpr int GGUF_VER_MAX = 3;

@ -52,16 +43,14 @@ static const std::vector<const char *> KNOWN_ARCHES {
    // "grok", -- 314B parameters
    "gpt2",
    // "gptj", -- no inference code
-    "gptneox",
-    "granite",
-    "granitemoe",
+    // "gptneox", -- no inference code
    "mpt",
    "baichuan",
    "starcoder",
+    // "persimmon", -- CUDA generates garbage
    "refact",
    "bert",
    "nomic-bert",
-    // "jina-bert-v2", -- Assertion `i01 >= 0 && i01 < ne01' failed.
    "bloom",
    "stablelm",
    "qwen",
@ -75,66 +64,36 @@ static const std::vector<const char *> KNOWN_ARCHES {
    "internlm2",
    // "minicpm", -- CUDA generates garbage
    "gemma",
-    "gemma2",
    "starcoder2",
    // "mamba", -- CUDA missing SSM_CONV
    "xverse",
    "command-r",
    // "dbrx", -- 16x12B parameters
    "olmo",
-    "olmoe",
-    "openelm",
-    // "arctic", -- 10B+128x3.66B parameters
-    "deepseek2",
-    "chatglm",
-    // "bitnet", -- tensor not within file bounds?
-    // "t5", -- seq2seq model
-    "jais",
 };

 static const std::vector<const char *> EMBEDDING_ARCHES {
    "bert", "nomic-bert",
 };

-static bool is_embedding_arch(const std::string &arch)
-{
+static bool is_embedding_arch(const std::string &arch) {
    return std::find(EMBEDDING_ARCHES.begin(), EMBEDDING_ARCHES.end(), arch) < EMBEDDING_ARCHES.end();
 }

-static bool llama_verbose()
-{
+static bool llama_verbose() {
    const char* var = getenv("GPT4ALL_VERBOSE_LLAMACPP");
    return var && *var;
 }

-static void llama_log_callback(ggml_log_level level, const char *text, void *userdata, bool warn)
-{
+static void llama_log_callback(enum ggml_log_level level, const char *text, void *userdata) {
    (void)userdata;
-
-    static ggml_log_level lastlevel = GGML_LOG_LEVEL_NONE;
-    if (!llama_verbose()) {
-        auto efflevel = level == GGML_LOG_LEVEL_CONT ? lastlevel : level;
-        lastlevel = efflevel;
-        switch (efflevel) {
-            case GGML_LOG_LEVEL_CONT:
-                UNREACHABLE();
-                break;
-            case GGML_LOG_LEVEL_WARN:
-                if (warn) break;
-                [[fallthrough]];
-            case GGML_LOG_LEVEL_NONE: // not used?
-            case GGML_LOG_LEVEL_INFO:
-            case GGML_LOG_LEVEL_DEBUG:
-                return; // suppress
-            case GGML_LOG_LEVEL_ERROR:
-                ;
-        }
+    if (llama_verbose() || level <= GGML_LOG_LEVEL_ERROR) {
+        fputs(text, stderr);
    }
-
-    fputs(text, stderr);
 }

 struct gpt_params {
+    int32_t seed          = -1;   // RNG seed
    int32_t n_keep        = 0;    // number of tokens to keep from initial prompt

    // sampling parameters
@ -149,8 +108,38 @@ struct gpt_params {
    bool use_mlock         = false; // use mlock to keep model in memory
 };

-const char *get_arch_name(gguf_context *ctx_gguf)
-{
+// Sample the next token from the logits at batch index `pos`: apply the repeat penalty over the last
+// `last_n_tokens_size` tokens, then top-k, tail-free, typical, top-p and min-p filtering, then temperature.
+// A non-positive `temp` picks the highest-logit remaining candidate instead of scaling logits by it.
+static int llama_sample_top_p_top_k(
+        llama_context *ctx, const llama_token *last_n_tokens_data, int last_n_tokens_size,
+        int top_k, float top_p, float min_p, float temp, float repeat_penalty, int32_t pos) {
+    auto logits = llama_get_logits_ith(ctx, pos);
+    auto n_vocab = llama_n_vocab(llama_get_model(ctx));
+    // Populate initial list of all candidates
+    std::vector<llama_token_data> candidates;
+    candidates.reserve(n_vocab);
+    for (int token_id = 0; token_id < n_vocab; token_id++) {
+        candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
+    }
+    llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
+    // Sample repeat penalty
+    llama_sample_repetition_penalties(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty, 0.0f, 0.0f);
+    // Candidate filtering; tail-free and typical are passed 1.0f (presumably a no-op -- confirm against llama.cpp)
+    llama_sample_top_k(ctx, &candidates_p, top_k, 1);
+    llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);
+    llama_sample_typical(ctx, &candidates_p, 1.0f, 1);
+    llama_sample_top_p(ctx, &candidates_p, top_p, 1);
+    llama_sample_min_p(ctx, &candidates_p, min_p, 1);
+    if (temp <= 0.0f) {
+        // a zero temperature would turn the logits into inf/NaN; select deterministically instead
+        return llama_sample_token_greedy(ctx, &candidates_p);
+    }
+    llama_sample_temp(ctx, &candidates_p, temp);
+    return llama_sample_token(ctx, &candidates_p);
+}
+
+const char *get_arch_name(gguf_context *ctx_gguf) {
    const int kid = gguf_find_key(ctx_gguf, "general.architecture");
    if (kid == -1)
        throw std::runtime_error("key not found in model: general.architecture");
@ -162,8 +151,7 @@ const char *get_arch_name(gguf_context *ctx_gguf)
    return gguf_get_val_str(ctx_gguf, kid);
 }

-static gguf_context *load_gguf(const char *fname)
-{
+static gguf_context *load_gguf(const char *fname) {
    struct gguf_init_params params = {
        /*.no_alloc = */ true,
        /*.ctx      = */ nullptr,
@ -184,8 +172,7 @@ static gguf_context *load_gguf(const char *fname)
    return ctx;
 }

-static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey)
-{
+static int32_t get_arch_key_u32(std::string const &modelPath, std::string const &archKey) {
    int32_t value = -1;
    std::string arch;

@ -205,7 +192,7 @@ static int32_t get_arch_key_u32(std::string const &modelPath, std::string const
        if (keyidx != -1) {
            value = gguf_get_val_u32(ctx, keyidx);
        } else {
-            std::cerr << __func__ << ": " << key << " not found in " << modelPath << "\n";
+            std::cerr << __func__ << ": " << key << " not found in " << modelPath << "\n";
        }
    }

@ -215,27 +202,21 @@ cleanup:
 }

 struct LLamaPrivate {
-    bool                         modelLoaded  = false;
-    int                          device       = -1;
-    std::string                  deviceName;
-    int64_t                      n_threads    = 0;
-    std::vector<LLModel::Token>  end_tokens;
-    const char                  *backend_name = nullptr;
-    std::vector<LLModel::Token>  inputTokens;
-
-    llama_model          *model        = nullptr;
-    llama_context        *ctx          = nullptr;
-    llama_model_params    model_params;
-    llama_context_params  ctx_params;
-    llama_sampler        *sampler_chain;
+    const std::string modelPath;
+    bool modelLoaded = false;
+    int device = -1;
+    std::string deviceName;
+    llama_model *model = nullptr;
+    llama_context *ctx = nullptr;
+    llama_model_params model_params;
+    llama_context_params ctx_params;
+    int64_t n_threads = 0;
+    std::vector<LLModel::Token> end_tokens;
+    const char *backend_name = nullptr;
 };

 LLamaModel::LLamaModel()
-    : d_ptr(std::make_unique<LLamaPrivate>())
-{
-    auto sparams = llama_sampler_chain_default_params();
-    d_ptr->sampler_chain = llama_sampler_chain_init(sparams);
-}
+    : d_ptr(new LLamaPrivate) {}

 // default hparams (LLaMA 7B)
 struct llama_file_hparams {
@ -248,8 +229,7 @@ struct llama_file_hparams {
    enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
 };

-size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
-{
+size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl) {
    // TODO(cebtenzzre): update to GGUF
    (void)ngl; // FIXME(cetenzzre): use this value
    auto fin = std::ifstream(modelPath, std::ios::binary);
@ -273,8 +253,7 @@ size_t LLamaModel::requiredMem(const std::string &modelPath, int n_ctx, int ngl)
    return filesize + est_kvcache_size;
 }

-bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const
-{
+bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const {
    auto * ctx = load_gguf(modelPath.c_str());
    if (!ctx) {
        std::cerr << __func__ << ": failed to load " << modelPath << "\n";
@ -310,8 +289,7 @@ bool LLamaModel::isModelBlacklisted(const std::string &modelPath) const
    return res;
 }

-bool LLamaModel::isEmbeddingModel(const std::string &modelPath) const
-{
+bool LLamaModel::isEmbeddingModel(const std::string &modelPath) const {
    bool result = false;
    std::string arch;

@ -376,11 +354,6 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
        d_ptr->model_params.main_gpu = d_ptr->device;
        d_ptr->model_params.n_gpu_layers = ngl;
        d_ptr->model_params.split_mode = LLAMA_SPLIT_MODE_NONE;
-    } else {
-#ifdef GGML_USE_CUDA
-        std::cerr << "Llama ERROR: CUDA loadModel was called without a device\n";
-        return false;
-#endif // GGML_USE_CUDA
    }
 #elif defined(GGML_USE_METAL)
    (void)ngl;
@ -393,17 +366,15 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
    // always fully offload on Metal
    // TODO(cebtenzzre): use this parameter to allow using more than 53% of system RAM to load a model
    d_ptr->model_params.n_gpu_layers = 100;
-#else // !KOMPUTE && !VULKAN && !CUDA && !METAL
+#else
    (void)ngl;
 #endif

-    d_ptr->model = llama_load_model_from_file(modelPath.c_str(), d_ptr->model_params);
+    d_ptr->model = llama_load_model_from_file_gpt4all(modelPath.c_str(), &d_ptr->model_params);
    if (!d_ptr->model) {
        fflush(stdout);
-#ifndef GGML_USE_CUDA
        d_ptr->device = -1;
        d_ptr->deviceName.clear();
-#endif
        std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
        return false;
    }
@ -415,8 +386,7 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
    bool isEmbedding = is_embedding_arch(llama_model_arch(d_ptr->model));
    const int n_ctx_train = llama_n_ctx_train(d_ptr->model);
    if (isEmbedding) {
-        d_ptr->ctx_params.n_batch  = n_ctx;
-        d_ptr->ctx_params.n_ubatch = n_ctx;
+        d_ptr->ctx_params.n_batch = n_ctx;
    } else {
        if (n_ctx > n_ctx_train) {
            std::cerr << "warning: model was trained on only " << n_ctx_train << " context tokens ("
@ -424,9 +394,10 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
        }
    }

-    d_ptr->ctx_params.n_ctx  = n_ctx;
-    d_ptr->ctx_params.type_k = params.kv_type;
-    d_ptr->ctx_params.type_v = params.kv_type;
+    d_ptr->ctx_params.n_ctx   = n_ctx;
+    d_ptr->ctx_params.seed    = params.seed;
+    d_ptr->ctx_params.type_k  = params.kv_type;
+    d_ptr->ctx_params.type_v  = params.kv_type;

    // The new batch API provides space for n_vocab*n_tokens logits. Tell llama.cpp early
    // that we want this many logits so the state serializes consistently.
@ -445,10 +416,8 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
        std::cerr << "LLAMA ERROR: failed to init context for model " <<  modelPath << std::endl;
        llama_free_model(d_ptr->model);
        d_ptr->model = nullptr;
-#ifndef GGML_USE_CUDA
        d_ptr->device = -1;
        d_ptr->deviceName.clear();
-#endif
        return false;
    }

@ -475,14 +444,12 @@ bool LLamaModel::loadModel(const std::string &modelPath, int n_ctx, int ngl)
    return true;
 }

-void LLamaModel::setThreadCount(int32_t n_threads)
-{
+void LLamaModel::setThreadCount(int32_t n_threads) {
    d_ptr->n_threads = n_threads;
    llama_set_n_threads(d_ptr->ctx, n_threads, n_threads);
 }

-int32_t LLamaModel::threadCount() const
-{
+int32_t LLamaModel::threadCount() const {
    return d_ptr->n_threads;
 }

@ -492,7 +459,6 @@ LLamaModel::~LLamaModel()
        llama_free(d_ptr->ctx);
    }
    llama_free_model(d_ptr->model);
-    llama_sampler_free(d_ptr->sampler_chain);
 }

 bool LLamaModel::isModelLoaded() const
@ -502,48 +468,38 @@ bool LLamaModel::isModelLoaded() const

 size_t LLamaModel::stateSize() const
 {
-    return llama_state_get_size(d_ptr->ctx);
+    return llama_get_state_size(d_ptr->ctx);
 }

-size_t LLamaModel::saveState(std::span<uint8_t> stateOut, std::vector<Token> &inputTokensOut) const
+size_t LLamaModel::saveState(uint8_t *dest) const
 {
-    size_t bytesWritten = llama_state_get_data(d_ptr->ctx, stateOut.data(), stateOut.size());
-    if (bytesWritten)
-        inputTokensOut.assign(d_ptr->inputTokens.begin(), d_ptr->inputTokens.end());
-    return bytesWritten;
+    return llama_copy_state_data(d_ptr->ctx, dest);
 }

-size_t LLamaModel::restoreState(std::span<const uint8_t> state, std::span<const Token> inputTokens)
+size_t LLamaModel::restoreState(const uint8_t *src)
 {
-    size_t bytesRead = llama_state_set_data(d_ptr->ctx, state.data(), state.size());
-    if (bytesRead)
-        d_ptr->inputTokens.assign(inputTokens.begin(), inputTokens.end());
-    return bytesRead;
+    // const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
+    return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
 }

-std::vector<LLModel::Token> LLamaModel::tokenize(std::string_view str) const
+std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str, bool special) const
 {
-    std::vector<LLModel::Token> fres(str.length() + 4);
-    int32_t fres_len = llama_tokenize(
-        d_ptr->model, str.data(), str.length(), fres.data(), fres.size(), /*add_special*/ true, /*parse_special*/ true
-    );
+    const bool wantBOS = ctx.n_past == 0 && ctx.tokens.empty();
+    const bool useBOS = wantBOS && shouldAddBOS();
+    auto strCat = wantBOS && !special ? " " + str : str; // insert leading space ourselves, llama.cpp fork doesn't anymore
+    std::vector<LLModel::Token> fres(strCat.size()+4);
+    auto fres_len = llama_tokenize(d_ptr->model, strCat.c_str(), strCat.length(), fres.data(), fres.size(), useBOS, special);
    fres.resize(fres_len);
    return fres;
 }

-bool LLamaModel::isSpecialToken(Token id) const
-{
-    return llama_token_get_attr(d_ptr->model, id)
-        & (LLAMA_TOKEN_ATTR_CONTROL | LLAMA_TOKEN_ATTR_USER_DEFINED | LLAMA_TOKEN_ATTR_UNKNOWN);
-}
-
 std::string LLamaModel::tokenToString(Token id) const
 {
    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), 0, true);
+    const int n_tokens = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
    if (n_tokens < 0) {
        result.resize(-n_tokens);
-        int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), 0, true);
+        int check = llama_token_to_piece(d_ptr->model, id, result.data(), result.size(), false);
        GGML_ASSERT(check == -n_tokens);
    }
    else {
@ -553,66 +509,27 @@ std::string LLamaModel::tokenToString(Token id) const
    return std::string(result.data(), result.size());
 }

-void LLamaModel::initSampler(const PromptContext &promptCtx)
+LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
 {
-    auto *model = d_ptr->model;
-    auto *chain = d_ptr->sampler_chain;
-
-    // clear sampler chain
-    for (int i = llama_sampler_chain_n(chain) - 1; i >= 0; i--) {
-        auto *smpl = llama_sampler_chain_remove(chain, i);
-        llama_sampler_free(smpl);
-    }
-
-    // build new chain
-    llama_sampler_chain_add(chain,
-        llama_sampler_init_penalties(
-            llama_n_vocab(model),
-            llama_token_eos(model),
-            llama_token_nl(model),
-            promptCtx.repeat_last_n,
-            promptCtx.repeat_penalty,
-            // TODO(jared): consider making the below configurable
-            /*penalty_freq*/    0.0f,
-            /*penalty_present*/ 0.0f,
-            /*penalize_nl*/     true,
-            /*ignore_eos*/      false
-        )
-    );
-    if (promptCtx.temp == 0.0f) {
-        llama_sampler_chain_add(chain, llama_sampler_init_greedy());
-    } else {
-        struct llama_sampler *samplers[] = {
-            llama_sampler_init_top_k(promptCtx.top_k),
-            llama_sampler_init_top_p(promptCtx.top_p, 1),
-            llama_sampler_init_min_p(promptCtx.min_p, 1),
-            llama_sampler_init_temp(promptCtx.temp),
-            llama_sampler_init_softmax(),
-            llama_sampler_init_dist(LLAMA_DEFAULT_SEED),
-        };
-        for (auto *smpl : samplers)
-            llama_sampler_chain_add(chain, smpl);
-    }
+    const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
+    return llama_sample_top_p_top_k(d_ptr->ctx,
+        promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
+        n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.min_p, promptCtx.temp,
+        promptCtx.repeat_penalty, promptCtx.n_last_batch_tokens - 1);
 }

-LLModel::Token LLamaModel::sampleToken() const
+bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
 {
-    return llama_sampler_sample(d_ptr->sampler_chain, d_ptr->ctx, -1);
-}
-
-bool LLamaModel::evalTokens(int32_t nPast, std::span<const Token> tokens) const
-{
-    assert(!tokens.empty());
-
-    llama_kv_cache_seq_rm(d_ptr->ctx, 0, nPast, -1);
+    llama_kv_cache_seq_rm(d_ptr->ctx, 0, ctx.n_past, -1);

    llama_batch batch = llama_batch_init(tokens.size(), 0, 1);

    batch.n_tokens = tokens.size();
+    ctx.n_last_batch_tokens = tokens.size();

    for (int32_t i = 0; i < batch.n_tokens; i++) {
        batch.token   [i] = tokens[i];
-        batch.pos     [i] = nPast + i;
+        batch.pos     [i] = ctx.n_past + i;
        batch.n_seq_id[i] = 1;
        batch.seq_id  [i][0] = 0;
        batch.logits  [i] = false;
@ -626,86 +543,11 @@ bool LLamaModel::evalTokens(int32_t nPast, std::span<const Token> tokens) const
    return res == 0;
 }

-void LLamaModel::shiftContext(const PromptContext &promptCtx, int32_t *nPast)
-{
-    // infinite text generation via context shifting
-
-    // erase up to n_ctx*contextErase tokens
-    int n_keep = shouldAddBOS();
-    int n_past = *nPast;
-    int n_discard = std::min(n_past - n_keep, int(contextLength() * promptCtx.contextErase));
-
-    assert(n_discard > 0);
-    if (n_discard <= 0)
-        return;
-
-    std::cerr << "Llama: context full, swapping: n_past = " << n_past << ", n_keep = " << n_keep
-              << ", n_discard = " << n_discard << "\n";
-
-    // erase the first n_discard tokens from the context
-    llama_kv_cache_seq_rm (d_ptr->ctx, 0, n_keep,             n_keep + n_discard);
-    llama_kv_cache_seq_add(d_ptr->ctx, 0, n_keep + n_discard, n_past,             -n_discard);
-
-    auto &inp = d_ptr->inputTokens;
-    inp.erase(inp.begin() + n_keep, inp.begin() + n_keep + n_discard);
-    *nPast = inp.size();
-}
-
 int32_t LLamaModel::contextLength() const
 {
    return llama_n_ctx(d_ptr->ctx);
 }

-auto LLamaModel::specialTokens() -> std::unordered_map<std::string, std::string> const
-{
-    if (!d_ptr->model)
-        throw std::logic_error("model not loaded");
-
-    std::unordered_map<std::string, std::string> tokens;
-    if (auto id = llama_token_bos(d_ptr->model); id != LLAMA_TOKEN_NULL)
-        tokens.emplace("bos_token", tokenToString(id));
-    if (auto id = llama_token_eos(d_ptr->model); id != LLAMA_TOKEN_NULL)
-        tokens.emplace("eos_token", tokenToString(id));
-    return tokens;
-}
-
-int32_t LLamaModel::inputLength() const
-{
-    return d_ptr->inputTokens.size();
-}
-
-int32_t LLamaModel::computeModelInputPosition(std::span<const Token> input) const
-{
-    // find common prefix
-    auto cacheIt = d_ptr->inputTokens.begin();
-    auto inputIt = input.begin();
-    while (cacheIt < d_ptr->inputTokens.end() && inputIt < input.end() && *cacheIt == *inputIt) {
-        ++cacheIt; ++inputIt;
-    }
-    // tell the caller to ignore the tokens between [begin, inputIt)
-    return inputIt - input.begin();
-}
-
-void LLamaModel::setModelInputPosition(int32_t pos)
-{
-    auto &inp = d_ptr->inputTokens;
-    assert(pos >= 0);
-    assert(pos <= inp.size());
-    // truncate token cache to end at the new n_past
-    if (pos < inp.size())
-        inp.resize(pos);
-}
-
-void LLamaModel::appendInputToken(Token tok)
-{
-    d_ptr->inputTokens.push_back(tok);
-}
-
-auto LLamaModel::inputTokens() const -> std::span<const Token>
-{
-    return d_ptr->inputTokens;
-}
-
 const std::vector<LLModel::Token> &LLamaModel::endTokens() const
 {
    return d_ptr->end_tokens;
@ -713,7 +555,10 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const

 bool LLamaModel::shouldAddBOS() const
 {
-    return llama_add_bos_token(d_ptr->model);
+    int add_bos = llama_add_bos_token(d_ptr->model);
+    if (add_bos != -1) { return add_bos; }
+    auto vocab_type = llama_vocab_type(d_ptr->model);
+    return vocab_type == LLAMA_VOCAB_TYPE_SPM || vocab_type == LLAMA_VOCAB_TYPE_WPM;
 }

 int32_t LLamaModel::maxContextLength(std::string const &modelPath) const
@ -726,40 +571,8 @@ int32_t LLamaModel::layerCount(std::string const &modelPath) const
    return get_arch_key_u32(modelPath, "block_count");
 }

-// TODO(jared): reduce redundant code and operations by combining all metadata getters for unloaded
-//              models into a class that keeps the model file open
-auto LLamaModel::chatTemplate(const char *modelPath) const -> std::expected<std::string, std::string>
-{
-    auto *ctx = load_gguf(modelPath);
-    if (!ctx)
-        return std::unexpected("failed to open model file");
-
-    std::expected<std::string, std::string> result;
-    enum gguf_type ktype;
-    const int kid = gguf_find_key(ctx, "tokenizer.chat_template");
-    if (kid == -1) {
-        result = std::unexpected("key not found");
-        goto cleanup;
-    }
-
-    ktype = gguf_get_kv_type(ctx, kid);
-    if (ktype != GGUF_TYPE_STRING) {
-        result = std::unexpected(
-            "expected key type STRING (" + std::to_string(GGUF_TYPE_STRING) + "), got " + std::to_string(ktype)
-        );
-        goto cleanup;
-    }
-
-    result = gguf_get_val_str(ctx, kid);
-
-cleanup:
-    gguf_free(ctx);
-    return result;
-}
-
 #ifdef GGML_USE_VULKAN
-static const char *getVulkanVendorName(uint32_t vendorID)
-{
+static const char *getVulkanVendorName(uint32_t vendorID) {
    switch (vendorID) {
        case 0x10DE: return "nvidia";
        case 0x1002: return "amd";
@ -889,30 +702,41 @@ bool LLamaModel::initializeGPUDevice(int device, std::string *unavail_reason) co
 #endif
 }

-bool LLamaModel::usingGPUDevice() const
+bool LLamaModel::hasGPUDevice() const
 {
-    if (!d_ptr->model)
-        return false;
-
-    bool usingGPU = llama_model_using_gpu(d_ptr->model);
-#ifdef GGML_USE_KOMPUTE
-    assert(!usingGPU || ggml_vk_has_device());
+#if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
+    return d_ptr->device != -1;
+#else
+    return false;
 #endif
-    return usingGPU;
 }

-const char *LLamaModel::backendName() const
+bool LLamaModel::usingGPUDevice() const
 {
+    bool hasDevice;
+
+#ifdef GGML_USE_KOMPUTE
+    hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
+    assert(!hasDevice || ggml_vk_has_device());
+#elif defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
+    hasDevice = hasGPUDevice() && d_ptr->model_params.n_gpu_layers > 0;
+#elif defined(GGML_USE_METAL)
+    hasDevice = true;
+#else
+    hasDevice = false;
+#endif
+
+    return hasDevice;
+}
+
+const char *LLamaModel::backendName() const {
    return d_ptr->backend_name;
 }

-const char *LLamaModel::gpuDeviceName() const
-{
+const char *LLamaModel::gpuDeviceName() const {
    if (usingGPUDevice()) {
 #if defined(GGML_USE_KOMPUTE) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CUDA)
        return d_ptr->deviceName.c_str();
-#elif defined(GGML_USE_METAL)
-        return "Metal";
 #endif
    }
    return nullptr;
@ -935,15 +759,13 @@ void llama_batch_add(
    batch.n_tokens++;
 }

-static void batch_add_seq(llama_batch &batch, const std::vector<LLModel::Token> &tokens, int seq_id)
-{
+static void batch_add_seq(llama_batch &batch, const std::vector<LLModel::Token> &tokens, int seq_id) {
    for (unsigned i = 0; i < tokens.size(); i++) {
        llama_batch_add(batch, tokens[i], i, { seq_id }, i == tokens.size() - 1);
    }
 }

-size_t LLamaModel::embeddingSize() const
-{
+size_t LLamaModel::embeddingSize() const {
    return llama_n_embd(d_ptr->model);
 }

@ -1063,14 +885,12 @@ void LLamaModel::embed(
 // MD5 hash of "nomic empty"
 static const char EMPTY_PLACEHOLDER[] = "24df574ea1c998de59d5be15e769658e";

-auto product(double a) -> std::function<double(double)>
-{
+auto product(double a) -> std::function<double(double)> {
    return [a](double b) { return a * b; };
 }

 template <typename T>
-double getL2NormScale(T *start, T *end)
-{
+double getL2NormScale(T *start, T *end) {
    double magnitude = std::sqrt(std::inner_product(start, end, start, 0.0));
    return 1.0 / std::max(magnitude, 1e-12);
 }
@ -1086,7 +906,7 @@ void LLamaModel::embedInternal(
    const llama_token bos_token = llama_token_bos(d_ptr->model);
    const llama_token eos_token = llama_token_eos(d_ptr->model);

-    bool useBOS = llama_add_bos_token(d_ptr->model);
+    bool useBOS = shouldAddBOS();
    bool useEOS = llama_vocab_type(d_ptr->model) == LLAMA_VOCAB_TYPE_WPM;

    // no EOS, optional BOS
@ -1094,20 +914,17 @@ void LLamaModel::embedInternal(
        if (!text.empty() && text[0] != ' ') {
            text = ' ' + text; // normalize for SPM - our fork of llama.cpp doesn't add a space prefix
        }
+        wantBOS &= useBOS;

        tokens.resize(text.length()+4);
-        int32_t n_tokens = llama_tokenize_gpt4all(
-            d_ptr->model, text.c_str(), text.length(), tokens.data(), tokens.size(), /*add_special*/ wantBOS,
-            /*parse_special*/ false, /*insert_space*/ false
-        );
+        int32_t n_tokens = llama_tokenize(d_ptr->model, text.c_str(), text.length(), tokens.data(), tokens.size(), wantBOS, false);
        if (n_tokens) {
            (void)eos_token;
-            (void)useBOS;
-            assert((useEOS && wantBOS && useBOS) == (eos_token != -1 && tokens[n_tokens - 1] == eos_token));
-            if (useEOS && wantBOS)
-                n_tokens--; // erase EOS/SEP
+            assert(useEOS == (eos_token != -1 && tokens[n_tokens - 1] == eos_token));
+            tokens.resize(n_tokens - useEOS); // erase EOS/SEP
+        } else {
+            tokens.clear();
        }
-        tokens.resize(n_tokens);
    };

    // tokenize the texts
@ -1159,14 +976,14 @@ void LLamaModel::embedInternal(
    size_t totalTokens = 0;
    for (unsigned i = 0; i < inputs.size(); i++) {
        auto &input = inputs[i];
-        for (unsigned j = 0; j < input.size(); j += max_len) {
-            if (j) { j -= chunkOverlap; }
-            unsigned end = std::min(j + max_len, unsigned(input.size()));
+        for (auto it = input.begin(); it < input.end(); it += std::min(size_t(max_len), size_t(input.end() - it))) {
+            if (it > input.begin()) { it -= chunkOverlap; } // step back so consecutive chunks overlap
+            auto end = it + std::min(size_t(max_len), size_t(input.end() - it)); // clamp: never form an iterator past end()
            batches.push_back({ i, {} });
            auto &batch = batches.back().batch;
            batch = prefixTokens;
-            batch.insert(batch.end(), input.begin() + j, input.begin() + end);
-            totalTokens += end - j;
+            batch.insert(batch.end(), it, end);
+            totalTokens += end - it;
            batch.push_back(eos_token);
            if (!doMean) { break; /* limit text to one chunk */ }
        }
@ -1281,23 +1098,19 @@ void LLamaModel::embedInternal(
 #endif

 extern "C" {
-DLL_EXPORT bool is_g4a_backend_model_implementation()
-{
+DLL_EXPORT bool is_g4a_backend_model_implementation() {
    return true;
 }

-DLL_EXPORT const char *get_model_type()
-{
+DLL_EXPORT const char *get_model_type() {
    return modelType_;
 }

-DLL_EXPORT const char *get_build_variant()
-{
+DLL_EXPORT const char *get_build_variant() {
    return GGML_BUILD_VARIANT;
 }

-DLL_EXPORT char *get_file_arch(const char *fname)
-{
+DLL_EXPORT char *get_file_arch(const char *fname) {
    char *arch = nullptr;
    std::string archStr;

@ -1322,17 +1135,12 @@ cleanup:
    return arch;
 }

-DLL_EXPORT bool is_arch_supported(const char *arch)
-{
+DLL_EXPORT bool is_arch_supported(const char *arch) {
    return std::find(KNOWN_ARCHES.begin(), KNOWN_ARCHES.end(), std::string(arch)) < KNOWN_ARCHES.end();
 }

-DLL_EXPORT LLModel *construct()
-{
-    llama_log_set([](auto l, auto t, auto u) { llama_log_callback(l, t, u, false); }, nullptr);
-#ifdef GGML_USE_CUDA
-    ggml_backend_cuda_log_set_callback([](auto l, auto t, auto u) { llama_log_callback(l, t, u, true); }, nullptr);
-#endif
+DLL_EXPORT LLModel *construct() {
+    llama_log_set(llama_log_callback, nullptr);
    return new LLamaModel;
 }
 }
--- a/gpt4all-backend/src/llamamodel_impl.h
+++ b/gpt4all-backend/src/llamamodel_impl.h
@ -4,14 +4,11 @@
 #ifndef LLAMAMODEL_H
 #define LLAMAMODEL_H

-#include "llmodel.h"
-
+#include <functional>
 #include <memory>
-#include <span>
 #include <string>
-#include <string_view>
 #include <vector>
-#include <unordered_map>
+#include "llmodel.h"

 struct LLamaPrivate;
 struct EmbModelSpec;
@ -29,13 +26,14 @@ public:
    bool isModelLoaded() const override;
    size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
    size_t stateSize() const override;
-    size_t saveState(std::span<uint8_t> stateOut, std::vector<Token> &inputTokensOut) const override;
-    size_t restoreState(std::span<const uint8_t> state, std::span<const Token> inputTokens) override;
+    size_t saveState(uint8_t *dest) const override;
+    size_t restoreState(const uint8_t *src) override;
    void setThreadCount(int32_t n_threads) override;
    int32_t threadCount() const override;
    std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
    bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override;
    bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override;
+    bool hasGPUDevice() const override;
    bool usingGPUDevice() const override;
    const char *backendName() const override;
    const char *gpuDeviceName() const override;
@ -49,36 +47,25 @@ public:
    void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality = -1,
               size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;

-    int32_t contextLength() const override;
-    auto specialTokens() -> std::unordered_map<std::string, std::string> const override;
-
-protected:
-    std::vector<Token> tokenize(std::string_view str) const override;
-    bool isSpecialToken(Token id) const override;
-    std::string tokenToString(Token id) const override;
-    void initSampler(const PromptContext &ctx) override;
-    Token sampleToken() const override;
-    bool evalTokens(int32_t nPast, std::span<const Token> tokens) const override;
-    void shiftContext(const PromptContext &promptCtx, int32_t *nPast) override;
-    int32_t inputLength() const override;
-    int32_t computeModelInputPosition(std::span<const Token> input) const override;
-    void setModelInputPosition(int32_t pos) override;
-    void appendInputToken(Token tok) override;
-    std::span<const Token> inputTokens() const override;
-    const std::vector<Token> &endTokens() const override;
-    bool shouldAddBOS() const override;
-    int32_t maxContextLength(std::string const &modelPath) const override;
-    int32_t layerCount(std::string const &modelPath) const override;
-    auto chatTemplate(const char *modelPath) const -> std::expected<std::string, std::string> override;
-
-    void embedInternal(const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
-                       size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb,
-                       const EmbModelSpec *spec);
-
 private:
    std::unique_ptr<LLamaPrivate> d_ptr;
    bool m_supportsEmbedding = false;
    bool m_supportsCompletion = false;
+
+protected:
+    std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special) const override;
+    std::string tokenToString(Token id) const override;
+    Token sampleToken(PromptContext &ctx) const override;
+    bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
+    int32_t contextLength() const override;
+    const std::vector<Token> &endTokens() const override;
+    bool shouldAddBOS() const override;
+    int32_t maxContextLength(std::string const &modelPath) const override;
+    int32_t layerCount(std::string const &modelPath) const override;
+
+    void embedInternal(const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
+                       size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb,
+                       const EmbModelSpec *spec);
 };

 #endif // LLAMAMODEL_H
--- a/gpt4all-backend/src/llmodel.cpp
+++ b/gpt4all-backend/src/llmodel.cpp
@ -1,13 +1,12 @@
 #include "llmodel.h"
-
 #include "dlhandle.h"
+#include "sysinfo.h"

 #include <cassert>
 #include <cstdlib>
 #include <filesystem>
 #include <fstream>
 #include <iostream>
-#include <iterator>
 #include <memory>
 #include <optional>
 #include <regex>
@ -28,12 +27,6 @@
 #   include <intrin.h>
 #endif

-#if defined(__APPLE__) && defined(__aarch64__)
-#   include "sysinfo.h" // for getSystemTotalRAMInBytes
-#endif
-
-namespace fs = std::filesystem;
-
 #ifndef __APPLE__
 static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"};
 #elif defined(__aarch64__)
@ -92,20 +85,17 @@ LLModel::Implementation::Implementation(Implementation &&o)
    o.m_dlhandle = nullptr;
 }

-LLModel::Implementation::~Implementation()
-{
+LLModel::Implementation::~Implementation() {
    delete m_dlhandle;
 }

-static bool isImplementation(const Dlhandle &dl)
-{
+static bool isImplementation(const Dlhandle &dl) {
    return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
 }

 // Add the CUDA Toolkit to the DLL search path on Windows.
 // This is necessary for chat.exe to find CUDA when started from Qt Creator.
-static void addCudaSearchPath()
-{
+static void addCudaSearchPath() {
 #ifdef _WIN32
    if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) {
        auto libDir = std::wstring(cudaPath) + L"\\bin";
@ -117,8 +107,7 @@ static void addCudaSearchPath()
 #endif
 }

-const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList()
-{
+const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList() {
    if (cpu_supports_avx() == 0) {
        throw std::runtime_error("CPU does not support AVX");
    }
@ -130,7 +119,7 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat

        addCudaSearchPath();

-        std::string impl_name_re = "llamamodel-mainline-(cpu|metal|kompute|vulkan|cuda)";
+        std::string impl_name_re = "(gptj|llamamodel-mainline)-(cpu|metal|kompute|vulkan|cuda)";
        if (cpu_supports_avx2() == 0) {
            impl_name_re += "-avxonly";
        }
@ -140,32 +129,21 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
            std::string path;
            // Split the paths string by the delimiter and process each path.
            while (std::getline(ss, path, ';')) {
-                fs::directory_iterator iter;
-                try {
-                    iter = fs::directory_iterator(std::u8string(path.begin(), path.end()));
-                } catch (const fs::filesystem_error &) {
-                    continue; // skip nonexistent path
-                }
+                std::filesystem::path fs_path(path);
                // Iterate over all libraries
-                for (const auto &f : iter) {
-                    const fs::path &p = f.path();
+                for (const auto& f : std::filesystem::directory_iterator(fs_path)) {
+                    const std::filesystem::path& p = f.path();

                    if (p.extension() != LIB_FILE_EXT) continue;
                    if (!std::regex_search(p.stem().string(), re)) continue;

                    // Add to list if model implementation
-                    Dlhandle dl;
                    try {
-                        dl = Dlhandle(p);
-                    } catch (const Dlhandle::Exception &e) {
-                        std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n";
-                        continue;
-                    }
-                    if (!isImplementation(dl)) {
-                        std::cerr << "Not an implementation: " << p.filename().string() << "\n";
-                        continue;
-                    }
-                    fres.emplace_back(Implementation(std::move(dl)));
+                        Dlhandle dl(p.string());
+                        if (!isImplementation(dl))
+                            continue;
+                        fres.emplace_back(Implementation(std::move(dl)));
+                    } catch (...) {}
                }
            }
        };
@ -178,16 +156,14 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
    return *libs;
 }

-static std::string applyCPUVariant(const std::string &buildVariant)
-{
+static std::string applyCPUVariant(const std::string &buildVariant) {
    if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
        return buildVariant + "-avxonly";
    }
    return buildVariant;
 }

-const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant)
-{
+const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
    bool buildVariantMatched = false;
    std::optional<std::string> archName;
    for (const auto& i : implementationList()) {
@ -211,8 +187,7 @@ const LLModel::Implementation* LLModel::Implementation::implementation(const cha
    throw BadArchError(std::move(*archName));
 }

-LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx)
-{
+LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx) {
    std::vector<std::string> desiredBackends;
    if (backend != "auto") {
        desiredBackends.push_back(backend);
@ -252,8 +227,7 @@ LLModel *LLModel::Implementation::construct(const std::string &modelPath, const
    throw MissingImplementationError("Could not find any implementations for backend: " + backend);
 }

-LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::string> &backend)
-{
+LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::string> &backend) {
    static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;

    const std::vector<Implementation> *impls;
@ -297,8 +271,7 @@ LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::
    return nullptr;
 }

-std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired)
-{
+std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired) {
    std::vector<LLModel::GPUDevice> devices;
 #ifndef __APPLE__
    static const std::string backends[] = {"kompute", "cuda"};
@ -313,46 +286,33 @@ std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(siz
    return devices;
 }

-int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath)
-{
+int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath) {
    auto *llama = constructGlobalLlama();
    return llama ? llama->maxContextLength(modelPath) : -1;
 }

-int32_t LLModel::Implementation::layerCount(const std::string &modelPath)
-{
+int32_t LLModel::Implementation::layerCount(const std::string &modelPath) {
    auto *llama = constructGlobalLlama();
    return llama ? llama->layerCount(modelPath) : -1;
 }

-bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath)
-{
+bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath) {
    auto *llama = constructGlobalLlama();
    return llama && llama->isEmbeddingModel(modelPath);
 }

-auto LLModel::Implementation::chatTemplate(const char *modelPath) -> std::expected<std::string, std::string>
-{
-    auto *llama = constructGlobalLlama();
-    return llama ? llama->chatTemplate(modelPath) : std::unexpected("backend not available");
-}
-
-void LLModel::Implementation::setImplementationsSearchPath(const std::string& path)
-{
+void LLModel::Implementation::setImplementationsSearchPath(const std::string& path) {
    s_implementations_search_path = path;
 }

-const std::string& LLModel::Implementation::implementationsSearchPath()
-{
+const std::string& LLModel::Implementation::implementationsSearchPath() {
    return s_implementations_search_path;
 }

-bool LLModel::Implementation::hasSupportedCPU()
-{
+bool LLModel::Implementation::hasSupportedCPU() {
    return cpu_supports_avx() != 0;
 }

-int LLModel::Implementation::cpuSupportsAVX2()
-{
+int LLModel::Implementation::cpuSupportsAVX2() {
    return cpu_supports_avx2();
 }
--- a/gpt4all-backend/include/gpt4all-backend/llmodel.h
+++ b/gpt4all-backend/include/gpt4all-backend/llmodel.h
@ -2,33 +2,24 @@
 #define LLMODEL_H

 #include <algorithm>
-#include <cassert>
-#include <cstddef>
 #include <cstdint>
-#include <expected>
+#include <fstream>
 #include <functional>
+#include <limits>
 #include <optional>
-#include <span>
-#include <stdexcept>
 #include <string>
 #include <string_view>
 #include <unordered_map>
-#include <utility>
 #include <vector>

-class Dlhandle;
-
 using namespace std::string_literals;

 #define LLMODEL_MAX_PROMPT_BATCH 128

+class Dlhandle;
 class LLModel {
 public:
    using Token = int32_t;
-    using PromptCallback      = std::function<bool(std::span<const Token> batch, bool cached)>;
-    using ResponseCallback    = std::function<bool(Token token, std::string_view piece)>;
-    using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend);
-    using ProgressCallback    = std::function<bool(float progress)>;

    class BadArchError: public std::runtime_error {
    public:
@ -65,30 +56,23 @@ public:
            backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
            vendor(std::move(vendor)) {}

-        std::string selectionName() const
-        {
-            assert(backend == "cuda"s || backend == "kompute"s);
-            return backendName() + ": " + name;
-        }
-
-        std::string backendName() const { return backendIdToName(backend); }
-
-        static std::string backendIdToName(const std::string &backend) { return s_backendNames.at(backend); }
+        std::string selectionName() const { return m_backendNames.at(backend) + ": " + name; }
+        std::string reportedName()  const { return name + " (" + m_backendNames.at(backend) + ")"; }

        static std::string updateSelectionName(const std::string &name) {
            if (name == "Auto" || name == "CPU" || name == "Metal")
                return name;
-            auto it = std::find_if(s_backendNames.begin(), s_backendNames.end(), [&name](const auto &entry) {
+            auto it = std::find_if(m_backendNames.begin(), m_backendNames.end(), [&name](const auto &entry) {
                return name.starts_with(entry.second + ": ");
            });
-            if (it != s_backendNames.end())
+            if (it != m_backendNames.end())
                return name;
            return "Vulkan: " + name; // previously, there were only Vulkan devices
        }

    private:
-        static inline const std::unordered_map<std::string, std::string> s_backendNames {
-            {"cpu", "CPU"}, {"metal", "Metal"}, {"cuda", "CUDA"}, {"kompute", "Vulkan"},
+        static inline const std::unordered_map<std::string, std::string> m_backendNames {
+            {"cuda", "CUDA"}, {"kompute", "Vulkan"},
        };
    };

@ -106,7 +90,6 @@ public:
        static int32_t maxContextLength(const std::string &modelPath);
        static int32_t layerCount(const std::string &modelPath);
        static bool isEmbeddingModel(const std::string &modelPath);
-        static auto chatTemplate(const char *modelPath) -> std::expected<std::string, std::string>;
        static void setImplementationsSearchPath(const std::string &path);
        static const std::string &implementationsSearchPath();
        static bool hasSupportedCPU();
@ -130,6 +113,10 @@ public:
    };

    struct PromptContext {
+        std::vector<float> logits;      // logits of current context
+        std::vector<int32_t> tokens;    // current tokens in the context window
+        int32_t n_past = 0;             // number of tokens in past conversation
+        int32_t n_ctx = 0;              // number of tokens possible in context window
        int32_t n_predict = 200;
        int32_t top_k = 40;
        float   top_p = 0.9f;
@ -138,31 +125,38 @@ public:
        int32_t n_batch = 9;
        float   repeat_penalty = 1.10f;
        int32_t repeat_last_n = 64;     // last n tokens to penalize
-        float   contextErase = 0.5f;    // percent of context to erase if we exceed the context window
+        float   contextErase = 0.75f;   // percent of context to erase if we exceed the context window
+        int32_t n_last_batch_tokens = 0;
    };

+    using ProgressCallback = std::function<bool(float progress)>;
+
    explicit LLModel() {}
    virtual ~LLModel() {}

    virtual bool supportsEmbedding() const = 0;
    virtual bool supportsCompletion() const = 0;
    virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0;
-    virtual bool isModelBlacklisted(const std::string &modelPath) const { (void)modelPath; return false; }
+    virtual bool isModelBlacklisted(const std::string &modelPath) const { (void)modelPath; return false; };
    virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; }
    virtual bool isModelLoaded() const = 0;
    virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
-    virtual size_t stateSize() const = 0;
-    virtual size_t saveState(std::span<uint8_t> stateOut, std::vector<Token> &inputTokensOut) const = 0;
-    virtual size_t restoreState(std::span<const uint8_t> state, std::span<const Token> inputTokens) = 0;
+    virtual size_t stateSize() const { return 0; }
+    virtual size_t saveState(uint8_t *dest) const { (void)dest; return 0; }
+    virtual size_t restoreState(const uint8_t *src) { (void)src; return 0; }

    // This method requires the model to return true from supportsCompletion otherwise it will throw
    // an error
-    virtual void prompt(std::string_view        prompt,
-                        const PromptCallback   &promptCallback,
-                        const ResponseCallback &responseCallback,
-                        const PromptContext    &ctx);
+    virtual void prompt(const std::string &prompt,
+                        const std::string &promptTemplate,
+                        std::function<bool(int32_t)> promptCallback,
+                        std::function<bool(int32_t, const std::string&)> responseCallback,
+                        std::function<bool(bool)> recalculateCallback,
+                        PromptContext &ctx,
+                        bool special = false,
+                        std::string *fakeReply = nullptr);

-    virtual int32_t countPromptTokens(std::string_view prompt) const;
+    using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend);

    virtual size_t embeddingSize() const {
        throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
@ -201,30 +195,21 @@ public:
        return false;
    }

+    virtual bool hasGPUDevice() const { return false; }
    virtual bool usingGPUDevice() const { return false; }
    virtual const char *backendName() const { return "cpu"; }
    virtual const char *gpuDeviceName() const { return nullptr; }

    void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; }

-    virtual int32_t contextLength() const = 0;
-    virtual auto specialTokens() -> std::unordered_map<std::string, std::string> const = 0;
-
 protected:
    // These are pure virtual because subclasses need to implement as the default implementation of
    // 'prompt' above calls these functions
-    virtual std::vector<Token> tokenize(std::string_view str) const = 0;
-    virtual bool isSpecialToken(Token id) const = 0;
+    virtual std::vector<Token> tokenize(PromptContext &ctx, const std::string &str, bool special = false) const = 0;
    virtual std::string tokenToString(Token id) const = 0;
-    virtual void initSampler(const PromptContext &ctx) = 0;
-    virtual Token sampleToken() const = 0;
-    virtual bool evalTokens(int32_t nPast, std::span<const Token> tokens) const = 0;
-    virtual void shiftContext(const PromptContext &promptCtx, int32_t *nPast) = 0;
-    virtual int32_t inputLength() const = 0;
-    virtual int32_t computeModelInputPosition(std::span<const Token> input) const = 0;
-    virtual void setModelInputPosition(int32_t pos) = 0;
-    virtual void appendInputToken(Token tok) = 0;
-    virtual std::span<const Token> inputTokens() const = 0;
+    virtual Token sampleToken(PromptContext &ctx) const = 0;
+    virtual bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const = 0;
+    virtual int32_t contextLength() const = 0;
    virtual const std::vector<Token> &endTokens() const = 0;
    virtual bool shouldAddBOS() const = 0;

@ -240,11 +225,9 @@ protected:
        return -1;
    }

-    virtual auto chatTemplate(const char *modelPath) const -> std::expected<std::string, std::string>
-    {
-        (void)modelPath;
-        return std::unexpected("not implemented");
-    }
+    // This is a helper function called from the default implementation of 'prompt' but it can be
+    // shared by all derived classes so it isn't virtual
+    void recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate);

    const Implementation *m_implementation = nullptr;

@ -257,16 +240,16 @@ protected:
        return true;
    }

-    // prefill context with prompt
-    auto decodePrompt(const PromptCallback &promptCallback,
-                      const PromptContext  &promptCtx,
-                      std::vector<Token>    embd_inp)
-        -> std::optional<int32_t>;
-    // generate a response
-    void generateResponse(const ResponseCallback &responseCallback,
-                          const PromptContext    &promptCtx,
-                          int32_t                 nPast);
+    void decodePrompt(std::function<bool(int32_t)> promptCallback,
+                      std::function<bool(int32_t, const std::string&)> responseCallback,
+                      std::function<bool(bool)> recalculateCallback,
+                      PromptContext &promptCtx,
+                      std::vector<Token> embd_inp);
+    void generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
+                          std::function<bool(bool)> recalculateCallback,
+                          PromptContext &promptCtx);

+private:
    friend class LLMImplementation;
 };

--- a/gpt4all-backend/src/llmodel_c.cpp
+++ b/gpt4all-backend/src/llmodel_c.cpp
@ -1,31 +1,20 @@
 #include "llmodel_c.h"
-
 #include "llmodel.h"

-#include <algorithm>
-#include <cstdio>
-#include <cstdlib>
+#include <cerrno>
 #include <cstring>
-#include <exception>
 #include <iostream>
 #include <memory>
 #include <optional>
-#include <string>
-#include <string_view>
-#include <vector>
-#include <span>
-
-namespace ranges = std::ranges;
-
-static_assert(sizeof(token_t) == sizeof(LLModel::Token));
+#include <utility>

 struct LLModelWrapper {
    LLModel *llModel = nullptr;
+    LLModel::PromptContext promptContext;
    ~LLModelWrapper() { delete llModel; }
 };

-llmodel_model llmodel_model_create(const char *model_path)
-{
+llmodel_model llmodel_model_create(const char *model_path) {
    const char *error;
    auto fres = llmodel_model_create2(model_path, "auto", &error);
    if (!fres) {
@ -34,8 +23,7 @@ llmodel_model llmodel_model_create(const char *model_path)
    return fres;
 }

-static void llmodel_set_error(const char **errptr, const char *message)
-{
+static void llmodel_set_error(const char **errptr, const char *message) {
    thread_local static std::string last_error_message;
    if (errptr) {
        last_error_message = message;
@ -43,8 +31,7 @@ static void llmodel_set_error(const char **errptr, const char *message)
    }
 }

-llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error)
-{
+llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error) {
    LLModel *llModel;
    try {
        llModel = LLModel::Implementation::construct(model_path, backend);
@ -58,8 +45,7 @@ llmodel_model llmodel_model_create2(const char *model_path, const char *backend,
    return wrapper;
 }

-void llmodel_model_destroy(llmodel_model model)
-{
+void llmodel_model_destroy(llmodel_model model) {
    delete static_cast<LLModelWrapper *>(model);
 }

@ -88,80 +74,82 @@ bool llmodel_isModelLoaded(llmodel_model model)
    return wrapper->llModel->isModelLoaded();
 }

-uint64_t llmodel_state_get_size(llmodel_model model)
+uint64_t llmodel_get_state_size(llmodel_model model)
 {
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->stateSize();
 }

-uint64_t llmodel_state_get_data(llmodel_model model, uint8_t *state_out, uint64_t state_size,
-                                token_t **input_tokens_out, uint64_t *n_input_tokens)
+uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest)
 {
    auto *wrapper = static_cast<LLModelWrapper *>(model);
-    std::vector<LLModel::Token> inputTokens;
-    auto bytesWritten = wrapper->llModel->saveState({state_out, size_t(state_size)}, inputTokens);
-    if (bytesWritten) {
-        auto *buf = new LLModel::Token[inputTokens.size()];
-        ranges::copy(inputTokens, buf);
-        *input_tokens_out = buf;
-        *n_input_tokens = uint64_t(inputTokens.size());
-    } else {
-        *input_tokens_out = nullptr;
-        *n_input_tokens = 0;
-    }
-    return bytesWritten;
+    return wrapper->llModel->saveState(dest);
 }

-void llmodel_state_free_input_tokens(LLModel::Token *input_tokens)
-{
-    delete[] input_tokens;
-}
-
-uint64_t llmodel_state_set_data(llmodel_model model, const uint8_t *state, uint64_t state_size,
-                                const token_t *input_tokens, uint64_t n_input_tokens)
+uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src)
 {
    auto *wrapper = static_cast<LLModelWrapper *>(model);
-    return wrapper->llModel->restoreState({state, size_t(state_size)}, {input_tokens, size_t(n_input_tokens)});
+    return wrapper->llModel->restoreState(src);
 }

-bool llmodel_prompt(llmodel_model               model,
-                    const char                 *prompt,
-                    llmodel_prompt_callback     prompt_callback,
-                    llmodel_response_callback   response_callback,
-                    llmodel_prompt_context     *ctx,
-                    const char                **error)
+void llmodel_prompt(llmodel_model model, const char *prompt,
+                    const char *prompt_template,
+                    llmodel_prompt_callback prompt_callback,
+                    llmodel_response_callback response_callback,
+                    llmodel_recalculate_callback recalculate_callback,
+                    llmodel_prompt_context *ctx,
+                    bool special,
+                    const char *fake_reply)
 {
    auto *wrapper = static_cast<LLModelWrapper *>(model);

+    auto response_func = [response_callback](int32_t token_id, const std::string &response) {
+        return response_callback(token_id, response.c_str());
+    };
+
+    if (size_t(ctx->n_past) < wrapper->promptContext.tokens.size())
+        wrapper->promptContext.tokens.resize(ctx->n_past);
+
    // Copy the C prompt context
-    LLModel::PromptContext promptContext {
-        .n_predict      = ctx->n_predict,
-        .top_k          = ctx->top_k,
-        .top_p          = ctx->top_p,
-        .min_p          = ctx->min_p,
-        .temp           = ctx->temp,
-        .n_batch        = ctx->n_batch,
-        .repeat_penalty = ctx->repeat_penalty,
-        .repeat_last_n  = ctx->repeat_last_n,
-        .contextErase   = ctx->context_erase,
-    };
+    wrapper->promptContext.n_past = ctx->n_past;
+    wrapper->promptContext.n_ctx = ctx->n_ctx;
+    wrapper->promptContext.n_predict = ctx->n_predict;
+    wrapper->promptContext.top_k = ctx->top_k;
+    wrapper->promptContext.top_p = ctx->top_p;
+    wrapper->promptContext.min_p = ctx->min_p;
+    wrapper->promptContext.temp = ctx->temp;
+    wrapper->promptContext.n_batch = ctx->n_batch;
+    wrapper->promptContext.repeat_penalty = ctx->repeat_penalty;
+    wrapper->promptContext.repeat_last_n = ctx->repeat_last_n;
+    wrapper->promptContext.contextErase = ctx->context_erase;

-    auto prompt_func = [prompt_callback](std::span<const LLModel::Token> token_ids, bool cached) {
-        return prompt_callback(token_ids.data(), token_ids.size(), cached);
-    };
-    auto response_func = [response_callback](LLModel::Token token_id, std::string_view piece) {
-        return response_callback(token_id, piece.data());
-    };
+    std::string fake_reply_str;
+    if (fake_reply) { fake_reply_str = fake_reply; }
+    auto *fake_reply_p = fake_reply ? &fake_reply_str : nullptr;

    // Call the C++ prompt method
-    try {
-        wrapper->llModel->prompt(prompt, prompt_func, response_func, promptContext);
-    } catch (std::exception const &e) {
-        llmodel_set_error(error, e.what());
-        return false;
-    }
+    wrapper->llModel->prompt(prompt, prompt_template, prompt_callback, response_func, recalculate_callback,
+                             wrapper->promptContext, special, fake_reply_p);

-    return true;
+    // Update the C context by giving access to the wrapper's raw pointers to std::vector data
+    // which involves no copies
+    ctx->logits = wrapper->promptContext.logits.data();
+    ctx->logits_size = wrapper->promptContext.logits.size();
+    ctx->tokens = wrapper->promptContext.tokens.data();
+    ctx->tokens_size = wrapper->promptContext.tokens.size();
+
+    // Update the rest of the C prompt context
+    ctx->n_past = wrapper->promptContext.n_past;
+    ctx->n_ctx = wrapper->promptContext.n_ctx;
+    ctx->n_predict = wrapper->promptContext.n_predict;
+    ctx->top_k = wrapper->promptContext.top_k;
+    ctx->top_p = wrapper->promptContext.top_p;
+    ctx->min_p = wrapper->promptContext.min_p;
+    ctx->temp = wrapper->promptContext.temp;
+    ctx->n_batch = wrapper->promptContext.n_batch;
+    ctx->repeat_penalty = wrapper->promptContext.repeat_penalty;
+    ctx->repeat_last_n = wrapper->promptContext.repeat_last_n;
+    ctx->context_erase = wrapper->promptContext.contextErase;
 }

 float *llmodel_embed(
@ -289,6 +277,12 @@ bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)
    return wrapper->llModel->initializeGPUDevice(device);
 }

+bool llmodel_has_gpu_device(llmodel_model model)
+{
+    const auto *wrapper = static_cast<LLModelWrapper *>(model);
+    return wrapper->llModel->hasGPUDevice();
+}
+
 const char *llmodel_model_backend_name(llmodel_model model)
 {
    const auto *wrapper = static_cast<LLModelWrapper *>(model);
@ -300,21 +294,3 @@ const char *llmodel_model_gpu_device_name(llmodel_model model)
    const auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->gpuDeviceName();
 }
-
-int32_t llmodel_count_prompt_tokens(llmodel_model model, const char *prompt, const char **error)
-{
-    auto *wrapper = static_cast<const LLModelWrapper *>(model);
-    try {
-        return wrapper->llModel->countPromptTokens(prompt);
-    } catch (const std::exception& e) {
-        llmodel_set_error(error, e.what());
-        return -1;
-    }
-}
-
-void llmodel_model_foreach_special_token(llmodel_model model, llmodel_special_token_callback callback)
-{
-    auto *wrapper = static_cast<const LLModelWrapper *>(model);
-    for (auto &[name, token] : wrapper->llModel->specialTokens())
-        callback(name.c_str(), token.c_str());
-}
--- a/gpt4all-backend/include/gpt4all-backend/llmodel_c.h
+++ b/gpt4all-backend/include/gpt4all-backend/llmodel_c.h
@ -1,9 +1,9 @@
 #ifndef LLMODEL_C_H
 #define LLMODEL_C_H

-#include <stdbool.h>
-#include <stddef.h>
 #include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>

 #ifdef __GNUC__
 #define DEPRECATED __attribute__ ((deprecated))
@ -23,11 +23,6 @@ extern "C" {
 */
 typedef void *llmodel_model;

-/**
- * A token.
- */
-typedef int32_t token_t;
-
 /**
 * llmodel_prompt_context structure for holding the prompt context.
 * NOTE: The implementation takes care of all the memory handling of the raw logits pointer and the
@ -35,15 +30,21 @@ typedef int32_t token_t;
 * behavior.
 */
 struct llmodel_prompt_context {
+    float *logits;          // logits of current context
+    size_t logits_size;     // the size of the raw logits vector
+    int32_t *tokens;        // current tokens in the context window
+    size_t tokens_size;     // the size of the raw tokens vector
+    int32_t n_past;         // number of tokens in past conversation
+    int32_t n_ctx;          // number of tokens possible in context window
    int32_t n_predict;      // number of tokens to predict
    int32_t top_k;          // top k logits to sample from
-    float   top_p;          // nucleus sampling probability threshold
-    float   min_p;          // Min P sampling
-    float   temp;           // temperature to adjust model's output distribution
+    float top_p;            // nucleus sampling probability threshold
+    float min_p;            // Min P sampling
+    float temp;             // temperature to adjust model's output distribution
    int32_t n_batch;        // number of predictions to generate in parallel
-    float   repeat_penalty; // penalty factor for repeated tokens
+    float repeat_penalty;   // penalty factor for repeated tokens
    int32_t repeat_last_n;  // last n tokens to penalize
-    float   context_erase;  // percent of context to erase if we exceed the context window
+    float context_erase;    // percent of context to erase if we exceed the context window
 };

 struct llmodel_gpu_device {
@ -62,12 +63,10 @@ typedef struct llmodel_gpu_device llmodel_gpu_device;

 /**
 * Callback type for prompt processing.
- * @param token_ids An array of token ids of the prompt.
- * @param n_token_ids The number of tokens in the array.
- * @param cached Whether the tokens were already in cache.
+ * @param token_id The token id of the prompt.
 * @return a bool indicating whether the model should keep processing.
 */
-typedef bool (*llmodel_prompt_callback)(const token_t *token_ids, size_t n_token_ids, bool cached);
+typedef bool (*llmodel_prompt_callback)(int32_t token_id);

 /**
 * Callback type for response.
@ -75,7 +74,14 @@ typedef bool (*llmodel_prompt_callback)(const token_t *token_ids, size_t n_token
 * @param response The response string. NOTE: a token_id of -1 indicates the string is an error string.
 * @return a bool indicating whether the model should keep generating.
 */
-typedef bool (*llmodel_response_callback)(token_t token_id, const char *response);
+typedef bool (*llmodel_response_callback)(int32_t token_id, const char *response);
+
+/**
+ * Callback type for recalculation of context.
+ * @param is_recalculating Whether the model is recalculating the context.
+ * @return a bool indicating whether the model should keep generating.
+ */
+typedef bool (*llmodel_recalculate_callback)(bool is_recalculating);

 /**
 * Embedding cancellation callback for use with llmodel_embed.
@ -86,8 +92,6 @@ typedef bool (*llmodel_response_callback)(token_t token_id, const char *response
 */
 typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend);

-typedef void (*llmodel_special_token_callback)(const char *name, const char *token);
-
 /**
 * Create a llmodel instance.
 * Recognises correct model type from file at model_path
@ -146,57 +150,46 @@ bool llmodel_isModelLoaded(llmodel_model model);
 * @param model A pointer to the llmodel_model instance.
 * @return the size in bytes of the internal state of the model
 */
-uint64_t llmodel_state_get_size(llmodel_model model);
+uint64_t llmodel_get_state_size(llmodel_model model);

 /**
- * Saves the internal state of the model.
+ * Saves the internal state of the model to the specified destination address.
 * NOTE: This state data is specific to the type of model you have created.
 * @param model A pointer to the llmodel_model instance.
- * @param state Where to store the state. This must be a buffer of at least llmodel_state_get_size() bytes.
- * @param state_size The size of the destination for the state.
- * @param input_tokens_out Where to store the address of the token cache state. This is dynamically allocated and must
- * be freed with llmodel_state_free_input_tokens.
- * @param n_input_tokens Where to store the size of the token cache state.
- * @return The number of bytes copied. On error, zero is returned, the token cache is set to NULL, and the token cache
- * size is set to zero.
+ * @param dest A pointer to the destination.
+ * @return the number of bytes copied
 */
-uint64_t llmodel_state_get_data(llmodel_model model, uint8_t *state_out, uint64_t state_size,
-                                token_t **input_tokens_out, uint64_t *n_input_tokens);
-
-/**
- * Frees the temporary token cache buffer created by a call to llmodel_state_get_data().
- * @param input_tokens The token cache buffer.
- */
-void llmodel_state_free_input_tokens(token_t *input_tokens);
+uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest);

 /**
 * Restores the internal state of the model using data from the specified address.
 * NOTE: This state data is specific to the type of model you have created.
 * @param model A pointer to the llmodel_model instance.
- * @param state A pointer to the state data.
- * @param state_size The size of the state data.
- * @param input_tokens The token cache associated with the saved state.
- * @param n_input_tokens The number of tokens in input_tokens.
- * @return The number of bytes read, or zero on error.
+ * @param src A pointer to the src.
+ * @return the number of bytes read
 */
-uint64_t llmodel_state_set_data(llmodel_model model, const uint8_t *state, uint64_t state_size,
-                                const token_t *input_tokens, uint64_t n_input_tokens);
+uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src);

 /**
 * Generate a response using the model.
 * @param model A pointer to the llmodel_model instance.
 * @param prompt A string representing the input prompt.
+ * @param prompt_template A string representing the input prompt template.
 * @param prompt_callback A callback function for handling the processing of prompt.
 * @param response_callback A callback function for handling the generated response.
+ * @param recalculate_callback A callback function for handling recalculation requests.
+ * @param special True if special tokens in the prompt should be processed, false otherwise.
+ * @param fake_reply A string to insert into context as the model's reply, or NULL to generate one.
 * @param ctx A pointer to the llmodel_prompt_context structure.
- * @param error A pointer to a string; will only be set on error.
 */
-bool llmodel_prompt(llmodel_model               model,
-                    const char                 *prompt,
-                    llmodel_prompt_callback     prompt_callback,
-                    llmodel_response_callback   response_callback,
-                    llmodel_prompt_context     *ctx,
-                    const char                **error);
+void llmodel_prompt(llmodel_model model, const char *prompt,
+                    const char *prompt_template,
+                    llmodel_prompt_callback prompt_callback,
+                    llmodel_response_callback response_callback,
+                    llmodel_recalculate_callback recalculate_callback,
+                    llmodel_prompt_context *ctx,
+                    bool special,
+                    const char *fake_reply);

 /**
 * Generate an embedding using the model.
@ -298,6 +291,11 @@ bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gp
 */
 bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device);

+/**
+ * @return True if a GPU device is successfully initialized, false otherwise.
+ */
+bool llmodel_has_gpu_device(llmodel_model model);
+
 /**
 * @return The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal".
 */
@ -308,10 +306,6 @@ const char *llmodel_model_backend_name(llmodel_model model);
 */
 const char *llmodel_model_gpu_device_name(llmodel_model model);

-int32_t llmodel_count_prompt_tokens(llmodel_model model, const char *prompt, const char **error);
-
-void llmodel_model_foreach_special_token(llmodel_model model, llmodel_special_token_callback callback);
-
 #ifdef __cplusplus
 }
 #endif
--- a/gpt4all-backend/llmodel_shared.cpp
+++ b/gpt4all-backend/llmodel_shared.cpp
@ -0,0 +1,298 @@
+#include "llmodel.h"
+
+#include <cassert>
+#include <iostream>
+#include <regex>
+#include <string>
+#include <unordered_set>
+
+// TODO(cebtenzzre): replace this with llama_kv_cache_seq_shift for llamamodel (GPT-J needs this as-is)
+//
+// Makes room in a full context window: erases a contextErase-sized fraction of the stored tokens that
+// follow the first n_keep tokens (n_keep is 1 when shouldAddBOS() is true, otherwise 0), then re-evaluates
+// the remaining tokens in n_batch-sized batches, rebuilding promptCtx.n_past as it goes.
+//
+// recalculate(true) is called after every re-evaluated batch and may return false to abort early.
+// recalculate(false) is always called exactly once before returning, whether or not the pass completed.
+void LLModel::recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate) {
+    int n_keep = shouldAddBOS();
+    const int32_t n_discard = (promptCtx.n_ctx - n_keep) * promptCtx.contextErase;
+
+    // Erase the first percentage of context from the tokens
+    std::cerr << implementation().modelType() << ": reached the end of the context window so resizing\n";
+
+    // Clamp the erased range to the tokens that are actually stored. Passing iterators beyond end() (or a
+    // reversed range) to vector::erase is undefined behavior, which is what would happen if the token list
+    // were shorter than n_keep + n_discard or if n_discard came out negative.
+    const size_t n_tokens    = promptCtx.tokens.size();
+    const size_t erase_begin = std::min(size_t(n_keep), n_tokens);
+    const size_t erase_count = n_discard > 0 ? size_t(n_discard) : 0;
+    const size_t erase_end   = std::min(erase_begin + erase_count, n_tokens);
+    promptCtx.tokens.erase(promptCtx.tokens.begin() + erase_begin, promptCtx.tokens.begin() + erase_end);
+
+    // In the normal case erase_begin == n_keep; it only differs when fewer than n_keep tokens are stored.
+    size_t i = erase_begin;
+    promptCtx.n_past = int32_t(erase_begin);
+    while (i < promptCtx.tokens.size()) {
+        size_t batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size());
+        std::vector<int32_t> batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end);
+        assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx);
+        if (!evalTokens(promptCtx, batch)) {
+            std::cerr << "LLModel ERROR: Failed to process prompt\n";
+            goto stop_generating;
+        }
+        promptCtx.n_past += batch.size();
+        if (!recalculate(true))
+            goto stop_generating;
+        i = batch_end;
+    }
+    assert(promptCtx.n_past == int32_t(promptCtx.tokens.size()));
+
+stop_generating:
+    // always tell the caller that recalculation is over, even after an error or an abort
+    recalculate(false);
+}
+
+// Collects the %1 / %2 placeholders of a prompt template, in order of appearance, into `placeholders`.
+// A placeholder is "%1" or "%2" that is not immediately followed by another digit.
+// Returns true when the template is well-formed: at most two placeholders, the first being %1 and the
+// second being %2. Otherwise returns false and stores a description of the problem in `err`.
+// The stored matches refer to `tmpl`, which must therefore outlive them.
+static bool parsePromptTemplate(const std::string &tmpl, std::vector<std::smatch> &placeholders, std::string &err) {
+    static const std::regex placeholderRegex(R"(%[1-2](?![0-9]))");
+
+    placeholders.clear();
+    for (std::sregex_iterator match(tmpl.begin(), tmpl.end(), placeholderRegex), last; match != last; ++match)
+        placeholders.push_back(*match);
+
+    const size_t count = placeholders.size();
+    if (count > 2) {
+        err = "ERROR: expected at most two placeholders, got " + std::to_string(count);
+        return false;
+    }
+
+    if (count >= 1) {
+        const std::string first = placeholders[0].str();
+        if (first != "%1") {
+            err = "ERROR: first placeholder must be %1, got " + first;
+            return false;
+        }
+    }
+
+    if (count == 2) {
+        const std::string second = placeholders[1].str();
+        if (second != "%2") {
+            err = "ERROR: second placeholder must be %2, got " + second;
+            return false;
+        }
+    }
+
+    return true;
+}
+
+// Runs one prompt/response exchange against the loaded model.
+//
+// The prompt template may contain up to two placeholders: %1 marks where the user prompt is inserted and
+// %2 marks where the assistant reply goes. The template text before %2 (with the prompt substituted for
+// %1) is tokenized and decoded first; then the reply is either generated or, when fakeReply is non-null,
+// decoded from *fakeReply; finally the template text after %2 is decoded (a blank line is used when the
+// template has no %2).
+//
+// An unloaded model is only logged to stderr. A model without completion support or a malformed template
+// is additionally reported through responseCallback with a token id of -1.
+//
+// NOTE(review): decodePrompt and generateResponse return void, so when one of them stops early (callback
+// returned false, evaluation failed) the later stages below still run — confirm this is intended.
+void LLModel::prompt(const std::string &prompt,
+                     const std::string &promptTemplate,
+                     std::function<bool(int32_t)> promptCallback,
+                     std::function<bool(int32_t, const std::string&)> responseCallback,
+                     std::function<bool(bool)> recalculateCallback,
+                     PromptContext &promptCtx,
+                     bool special,
+                     std::string *fakeReply)
+{
+    if (!isModelLoaded()) {
+        std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
+        return;
+    }
+
+    if (!supportsCompletion()) {
+        std::string errorMessage = "ERROR: this model does not support text completion or chat!";
+        responseCallback(-1, errorMessage);
+        std::cerr << implementation().modelType() << " " << errorMessage << "\n";
+        return;
+    }
+
+    // parse the prompt template
+    std::vector<std::smatch> placeholders;
+    {
+        std::string err;
+        if (!parsePromptTemplate(promptTemplate, placeholders, err)) {
+            responseCallback(-1, err);
+            std::cerr << err << "\n";
+            return;
+        }
+    }
+
+    // tokenize() receives promptCtx, so n_past is advanced after each tokenized piece and put back once
+    // tokenization is done (presumably so the tokenizer can tell whether it is at the start of the
+    // context — confirm against the tokenize implementations)
+    auto old_n_past = promptCtx.n_past; // prepare to fake n_past for tokenize
+
+    // tokenize the user prompt
+    std::vector<Token> embd_inp;
+    if (placeholders.empty()) {
+        // this is unusual, but well-defined
+        // (the template itself is tokenized; `prompt` is not used in this branch)
+        std::cerr << __func__ << ": prompt template has no placeholder\n";
+        embd_inp = tokenize(promptCtx, promptTemplate, true);
+    } else {
+        // template: beginning of user prompt
+        const auto &phUser = placeholders[0];
+        std::string userPrefix(phUser.prefix());
+        if (!userPrefix.empty()) {
+            embd_inp = tokenize(promptCtx, userPrefix, true);
+            promptCtx.n_past += embd_inp.size();
+        }
+
+        // user input (special tokens are only processed when the caller passes special=true)
+        auto tokens = tokenize(promptCtx, prompt, special);
+        embd_inp.insert(embd_inp.end(), tokens.begin(), tokens.end());
+        promptCtx.n_past += tokens.size();
+
+        // template: end of user prompt + start of assistant prompt
+        // (everything between %1 and %2, or between %1 and the end of the template when there is no %2)
+        size_t start = phUser.position() + phUser.length();
+        size_t end = placeholders.size() >= 2 ? placeholders[1].position() : promptTemplate.length();
+        auto userToAsst = promptTemplate.substr(start, end - start);
+        if (!userToAsst.empty()) {
+            tokens = tokenize(promptCtx, userToAsst, true);
+            embd_inp.insert(embd_inp.end(), tokens.begin(), tokens.end());
+            promptCtx.n_past += tokens.size();
+        }
+    }
+
+    promptCtx.n_past = old_n_past; // restore n_past so decodePrompt can increment it
+
+    // decode the user prompt
+    decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp);
+
+    // decode the assistant's reply, either generated or spoofed
+    if (fakeReply == nullptr) {
+        generateResponse(responseCallback, recalculateCallback, promptCtx);
+    } else {
+        // the fake reply is tokenized without special-token processing and reported via promptCallback
+        embd_inp = tokenize(promptCtx, *fakeReply, false);
+        decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp);
+    }
+
+    // decode the rest of the prompt template
+    // template: end of assistant prompt
+    std::string asstSuffix;
+    if (placeholders.size() >= 2) {
+        size_t start = placeholders[1].position() + placeholders[1].length();
+        asstSuffix = promptTemplate.substr(start);
+    } else {
+        asstSuffix = "\n\n"; // default to a blank line, good for e.g. Alpaca
+    }
+    if (!asstSuffix.empty()) {
+        embd_inp = tokenize(promptCtx, asstSuffix, true);
+        decodePrompt(promptCallback, responseCallback, recalculateCallback, promptCtx, embd_inp);
+    }
+}
+
+// Feeds already-tokenized input through the model without sampling a reply.
+//
+// embd_inp is evaluated in batches of at most n_batch tokens. After each batch every token is appended to
+// promptCtx.tokens (dropping the oldest entry once n_ctx tokens are stored), n_past is incremented and
+// promptCallback is invoked with the token id; a false return stops processing immediately.
+//
+// Input longer than n_ctx - 4 tokens is rejected: responseCallback receives token id -1 with an error
+// string and nothing is evaluated.
+//
+// Side effects on promptCtx besides tokens/n_past: n_ctx is overwritten with contextLength(), and
+// n_predict, n_past and n_batch are clamped.
+void LLModel::decodePrompt(std::function<bool(int32_t)> promptCallback,
+                           std::function<bool(int32_t, const std::string&)> responseCallback,
+                           std::function<bool(bool)> recalculateCallback,
+                           PromptContext &promptCtx,
+                           std::vector<Token> embd_inp) {
+    // save the context size
+    promptCtx.n_ctx = contextLength();
+
+    if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
+        responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
+        std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
+            " tokens and the context window is " << promptCtx.n_ctx << "!\n";
+        return;
+    }
+
+    // never predict more tokens than fit next to this input, and keep n_past / n_batch within their limits
+    promptCtx.n_predict = std::min(promptCtx.n_predict, promptCtx.n_ctx - (int) embd_inp.size());
+    promptCtx.n_past = std::min(promptCtx.n_past, promptCtx.n_ctx);
+    promptCtx.n_batch = std::min(promptCtx.n_batch, LLMODEL_MAX_PROMPT_BATCH);
+
+    // process the prompt in batches
+    size_t i = 0;
+    while (i < embd_inp.size()) {
+        size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size());
+        std::vector<Token> batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);
+
+        // Check if the context has run out...
+        if (promptCtx.n_past + int32_t(batch.size()) > promptCtx.n_ctx) {
+            recalculateContext(promptCtx, recalculateCallback);
+            assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx);
+        }
+
+        if (!evalTokens(promptCtx, batch)) {
+            std::cerr << implementation().modelType() << " ERROR: Failed to process prompt\n";
+            return;
+        }
+
+        // record the evaluated tokens one by one so the callback can stop us between tokens
+        size_t tokens = batch_end - i;
+        for (size_t t = 0; t < tokens; ++t) {
+            if (int32_t(promptCtx.tokens.size()) == promptCtx.n_ctx)
+                promptCtx.tokens.erase(promptCtx.tokens.begin());
+            promptCtx.tokens.push_back(batch.at(t));
+            promptCtx.n_past += 1;
+            if (!promptCallback(batch.at(t)))
+                return;
+        }
+        i = batch_end;
+    }
+}
+
+// Samples and evaluates up to promptCtx.n_predict tokens, streaming them through responseCallback.
+//
+// Generation stops when an end token is sampled, when the accumulated text equals one of the built-in
+// reverse prompts ("### Instruction", "### Human", ...), when evaluation fails, or when responseCallback
+// returns false. Tokens whose text could still be the beginning of a reverse prompt are held back in a
+// small cache and only reported once the match is ruled out.
+//
+// NOTE(review): evalTokens runs for every sampled token, but n_past and promptCtx.tokens are only advanced
+// when the cache is flushed. If evalTokens derives the token position from n_past, held-back tokens are
+// evaluated at a stale position — confirm against the evalTokens implementations.
+void LLModel::generateResponse(std::function<bool(int32_t, const std::string&)> responseCallback,
+                               std::function<bool(bool)> recalculateCallback,
+                               PromptContext &promptCtx) {
+    std::string cachedResponse;
+    std::vector<Token> cachedTokens;
+    std::unordered_set<std::string> reversePrompts
+        = { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context" };
+
+    // predict next tokens
+    for (int i = 0; i < promptCtx.n_predict; i++) {
+
+        // sample next token
+        auto id = sampleToken(promptCtx);
+
+        // Check if the context has run out...
+        if (promptCtx.n_past + 1 > promptCtx.n_ctx) {
+            recalculateContext(promptCtx, recalculateCallback);
+            assert(promptCtx.n_past + 1 <= promptCtx.n_ctx);
+        }
+
+        if (!evalTokens(promptCtx, { id })) {
+            std::cerr << implementation().modelType() << " ERROR: Failed to predict next token\n";
+            return;
+        }
+
+        // an end token finishes the response; it is not reported to the callback
+        for (const auto token : endTokens()) {
+            if (id == token) return;
+        }
+
+        const std::string str = tokenToString(id);
+
+        // Check if the provided str is part of our reverse prompts
+        bool foundPartialReversePrompt = false;
+        const std::string completed = cachedResponse + std::string(str);
+        if (reversePrompts.find(completed) != reversePrompts.end())
+            return;
+
+        // Check if it partially matches our reverse prompts and if so, cache
+        for (const auto& s : reversePrompts) {
+            if (s.compare(0, completed.size(), completed) == 0) {
+                foundPartialReversePrompt = true;
+                cachedResponse = completed;
+                break;
+            }
+        }
+
+        // Regardless the token gets added to our cache
+        cachedTokens.push_back(id);
+
+        // Continue if we have found a partial match
+        if (foundPartialReversePrompt)
+            continue;
+
+        // Empty the cache
+        for (auto t : cachedTokens) {
+            if (int32_t(promptCtx.tokens.size()) == promptCtx.n_ctx)
+                promptCtx.tokens.erase(promptCtx.tokens.begin());
+            promptCtx.tokens.push_back(t);
+            promptCtx.n_past += 1;
+            //TODO: Conversion to std::string can be avoided here...
+            if (!responseCallback(t, std::string(tokenToString(t))))
+                return;
+        }
+        cachedTokens.clear();
+        // The held-back text has been released, so the partial match must start over. Without this reset
+        // the stale prefix would be prepended to every later token and no reverse prompt could ever match
+        // again after the first false start.
+        cachedResponse.clear();
+    }
+}
+
+// Default implementation of the prefix-based embedding entry point: always throws std::logic_error naming
+// the model type. None of the arguments are read.
+void LLModel::embed(
+    const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
+    size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb
+) {
+    // reference every parameter once so unused-parameter warnings stay quiet
+    static_cast<void>(texts);
+    static_cast<void>(embeddings);
+    static_cast<void>(prefix);
+    static_cast<void>(dimensionality);
+    static_cast<void>(tokenCount);
+    static_cast<void>(doMean);
+    static_cast<void>(atlas);
+    static_cast<void>(cancelCb);
+
+    std::string message(implementation().modelType());
+    message += " does not support embeddings";
+    throw std::logic_error(message);
+}
+
+// Default implementation of the isRetrieval-based embedding entry point: always throws std::logic_error
+// naming the model type. None of the arguments are read.
+void LLModel::embed(
+    const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
+    bool doMean, bool atlas
+) {
+    // reference every parameter once so unused-parameter warnings stay quiet
+    static_cast<void>(texts);
+    static_cast<void>(embeddings);
+    static_cast<void>(isRetrieval);
+    static_cast<void>(dimensionality);
+    static_cast<void>(tokenCount);
+    static_cast<void>(doMean);
+    static_cast<void>(atlas);
+
+    std::string message(implementation().modelType());
+    message += " does not support embeddings";
+    throw std::logic_error(message);
+}
--- a/gpt4all-backend/llmodel_shared.h
+++ b/gpt4all-backend/llmodel_shared.h
@ -0,0 +1,46 @@
+#pragma once
+#include <cstdint>
+#include <cstddef>
+#include <vector>
+#include <ggml.h>
+
+// Owning, heap-allocated byte buffer. The storage is released when the buffer is destroyed.
+//
+// NOTE(review): the implicitly generated copy operations copy the raw pointer, so copying an llm_buffer
+// would lead to a double delete[]. Nothing in this header copies one; confirm no caller does either.
+struct llm_buffer {
+    uint8_t * addr = NULL; // start of the allocation, or NULL when nothing is allocated
+    size_t size = 0;       // number of bytes allocated at addr
+
+    // Replaces the storage with a fresh allocation of `size` bytes. The previous contents are discarded,
+    // not copied, and the new bytes are not initialized.
+    void resize(size_t size) {
+        delete[] addr;
+        // Put the buffer into the empty state before allocating: if `new` throws, the destructor must not
+        // delete[] the block that was just released.
+        addr = NULL;
+        this->size = 0;
+
+        addr = new uint8_t[size];
+        this->size = size;
+    }
+
+    ~llm_buffer() {
+        delete[] addr;
+    }
+};
+
+// Bundles the tensors, ggml context and backing buffer of a KV cache. The ggml context, when set, is
+// released with ggml_free() on destruction; `buf` releases its own storage.
+// All members start out null/zero so that a default-constructed cache never exposes indeterminate values.
+struct llm_kv_cache {
+    struct ggml_tensor * k = NULL; // presumably the key tensor — confirm against the model code
+    struct ggml_tensor * v = NULL; // presumably the value tensor — confirm against the model code
+
+    struct ggml_context * ctx = NULL;
+
+    llm_buffer buf;
+
+    int n = 0; // number of tokens currently in the cache
+
+    ~llm_kv_cache() {
+        if (ctx) {
+            ggml_free(ctx);
+        }
+    }
+};
+
+// Plans and runs a ggml graph on n_threads threads. When the plan asks for work memory, `buf` is resized to
+// the requested size (discarding its previous contents) and handed to the plan as its work area.
+inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads) {
+    ggml_cplan cplan = ggml_graph_plan(graph, n_threads);
+
+    const bool needs_work_buffer = cplan.work_size > 0;
+    if (needs_work_buffer) {
+        buf.resize(cplan.work_size);
+        cplan.work_data = buf.addr;
+    }
+
+    ggml_graph_compute(graph, &cplan);
+}
--- a/gpt4all-backend/scripts/convert_bert_hf_to_gguf.py
+++ b/gpt4all-backend/scripts/convert_bert_hf_to_gguf.py
@ -0,0 +1,140 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import json
+import struct
+import sys
+from pathlib import Path
+
+import gguf
+import numpy as np
+from transformers import AutoConfig, AutoModel, AutoTokenizer
+
+
+# Command line: <script> dir-model [ftype]
+if not 2 <= len(sys.argv) < 4:
+    print("Usage: {} dir-model [ftype]\n".format(Path(__file__).name))
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+    sys.exit(1)
+
+# output in the same directory as the model
+dir_model = Path(sys.argv[1])
+
+# NOTE(review): `vocab` is not referenced again in this script; reading vocab.txt here only has the effect
+# of failing early when the file is missing.
+with open(dir_model / "vocab.txt", encoding="utf-8") as f:
+    vocab = f.readlines()
+
+# possible data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+# float16 is the default when no ftype argument is given
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+        sys.exit(1)
+
+fname_out = dir_model / ("ggml-model-" + ftype_str[ftype] + ".gguf")
+
+
+ARCH = gguf.MODEL_ARCH.BERT
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+config = AutoConfig.from_pretrained(dir_model)
+
+block_count = config.num_hidden_layers
+gguf_writer.add_name("BERT")
+gguf_writer.add_context_length(config.max_position_embeddings)
+gguf_writer.add_embedding_length(config.hidden_size)
+gguf_writer.add_feed_forward_length(config.intermediate_size)
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_head_count(config.num_attention_heads)
+gguf_writer.add_file_type(ftype)
+
+print("gguf: get tokenizer metadata")
+
+# NOTE(review): `tokenizer_json` is not used afterwards; this block only verifies that tokenizer.json
+# exists and is valid JSON.
+try:
+    with open(dir_model / "tokenizer.json", encoding="utf-8") as f:
+        tokenizer_json = json.load(f)
+except FileNotFoundError as e:
+    print(f'Error: Missing {e.filename!r}', file=sys.stderr)
+    sys.exit(1)
+
+print("gguf: get wordpiece tokenizer vocab")
+
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+print(tokenizer.encode('I believe the meaning of life is'))
+
+# entries are vocabulary strings, or bytearrays for the padding tokens generated below
+tokens: list[str | bytearray] = []
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+for i in range(config.vocab_size):
+    try:
+        text = reverse_vocab[i]
+    except KeyError:
+        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+        pad_token = f"[PAD{i}]".encode("utf8")
+        text = bytearray(pad_token)
+
+    tokens.append(text)
+
+gguf_writer.add_tokenizer_model("bert")  # wordpiece
+gguf_writer.add_token_list(tokens)
+
+special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+special_vocab.add_to_gguf(gguf_writer)
+
+print("gguf: get tensor metadata")
+
+model = AutoModel.from_pretrained(dir_model, config=config, low_cpu_mem_usage=True)
+print(model)
+
+tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
+
+list_vars = model.state_dict()
+for name in list_vars.keys():
+    print(name, list_vars[name].shape, list_vars[name].dtype)
+
+# tensors that are deliberately left out of the exported file
+skipped_tensors = ('embeddings.position_ids', 'pooler.dense.weight', 'pooler.dense.bias')
+
+for name, tensor in list_vars.items():
+    if name in skipped_tensors:
+        continue
+    data = tensor.squeeze().numpy()
+    print("Processing variable:", name, "with shape:", data.shape)
+
+    n_dims = len(data.shape)
+
+    # ftype == 0 -> float32, ftype == 1 -> float16
+    # only 2-D ".weight" tensors are stored as float16; everything else keeps its dtype
+    if ftype == 1 and name[-7:] == ".weight" and n_dims == 2:
+        print("  Converting to float16")
+        data = data.astype(np.float16)
+
+    # map tensor names
+    new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+    if new_name is None:
+        # an unmappable tensor is a conversion failure: report it on stderr and exit non-zero
+        print("Can not map tensor '" + name + "'", file=sys.stderr)
+        sys.exit(1)
+
+    gguf_writer.add_tensor(new_name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print()
--- a/gpt4all-backend/scripts/convert_gptj_to_gguf.py
+++ b/gpt4all-backend/scripts/convert_gptj_to_gguf.py
@ -0,0 +1,165 @@
+#!/usr/bin/env python3
+# Convert GPT-J-6B h5 transformer model to GGUF format
+#
+# Load the model using GPTJForCausalLM.
+# Iterate over all variables and write them to a binary file.
+#
+# For each variable, write the following:
+#   - Number of dimensions (int)
+#   - Name length (int)
+#   - Dimensions (int[n_dims])
+#   - Name (char[name_length])
+#   - Data (float[n_dims])
+#
+# By default, the bigger matrices are converted to 16-bit floats.
+# This can be disabled by adding the "ftype" CLI argument.
+#
+# At the start of the GGUF file we write the model parameters
+# and vocabulary.
+#
+
+from __future__ import annotations
+
+import sys
+import struct
+import json
+from pathlib import Path
+
+import gguf
+import numpy as np
+from transformers import AutoConfig, AutoTokenizer, GPTJForCausalLM
+from transformers.models.gpt2 import tokenization_gpt2
+
+
+# Command line: python <script> dir-model [ftype]
+if not 2 <= len(sys.argv) < 4:
+    print("Usage: python {} dir-model [ftype]\n".format(Path(__file__).name))
+    print("  ftype == 0 -> float32")
+    print("  ftype == 1 -> float16")
+    sys.exit(1)
+
+# output in the same directory as the model
+dir_model = Path(sys.argv[1])
+
+# possible data types
+#   ftype == 0 -> float32
+#   ftype == 1 -> float16
+#
+# map from ftype to string
+ftype_str = ["f32", "f16"]
+
+# float16 is the default when no ftype argument is given
+ftype = 1
+if len(sys.argv) > 2:
+    ftype = int(sys.argv[2])
+    if ftype < 0 or ftype > 1:
+        print("Invalid ftype: " + str(ftype))
+        sys.exit(1)
+
+fname_out = dir_model / ("ggml-model-" + ftype_str[ftype] + ".gguf")
+
+
+ARCH = gguf.MODEL_ARCH.GPTJ
+gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH])
+
+print("gguf: get model metadata")
+
+config = AutoConfig.from_pretrained(dir_model)
+
+block_count = config.n_layer
+gguf_writer.add_name("GPT-J")
+gguf_writer.add_context_length(config.n_positions)
+gguf_writer.add_embedding_length(config.n_embd)
+gguf_writer.add_block_count(block_count)
+gguf_writer.add_feed_forward_length(4 * config.n_embd)
+gguf_writer.add_head_count(config.n_head)
+gguf_writer.add_rope_dimension_count(config.rotary_dim)
+gguf_writer.add_layer_norm_eps(config.layer_norm_epsilon)
+gguf_writer.add_file_type(ftype)
+
+print("gguf: get gpt2 tokenizer vocab")
+
+tokenizer = AutoTokenizer.from_pretrained(dir_model)
+
+reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
+byte_encoder = tokenization_gpt2.bytes_to_unicode()
+byte_decoder = {v: k for k, v in byte_encoder.items()}
+
+tokens: list[bytearray] = []
+
+for i in range(config.vocab_size):
+    if i in reverse_vocab:
+        try:
+            # map every character of the token back to its byte value
+            text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
+        except KeyError:
+            # at least one character has no byte_decoder entry: rebuild the token character by character
+            text = bytearray()
+            for c in reverse_vocab[i]:
+                if ord(c) < 256:  # single byte character
+                    text.append(byte_decoder[c])
+                else:  # multibyte special token character
+                    text.extend(c.encode('utf-8'))
+    else:
+        print(f"Key {i} not in tokenizer vocabulary. Padding with an arbitrary token.")
+        pad_token = f"[PAD{i}]".encode("utf8")
+        text = bytearray(pad_token)
+
+    tokens.append(text)
+
+
+gguf_writer.add_tokenizer_model("gpt2")
+gguf_writer.add_token_list(tokens)
+
+special_vocab = gguf.SpecialVocab(dir_model, load_merges=True)
+special_vocab.add_to_gguf(gguf_writer)
+
+print("gguf: get tensor metadata")
+
+model = GPTJForCausalLM.from_pretrained(dir_model, config=config, low_cpu_mem_usage=True)
+#print (model)
+
+tensor_map = gguf.get_tensor_name_map(ARCH, block_count)
+
+list_vars = model.state_dict()
+#print (list_vars)
+
+for name in list_vars.keys():
+    data = list_vars[name].squeeze().numpy()
+    print("Processing variable:", name, "with shape:", data.shape)
+
+    # we don't need these
+    if name.endswith("attn.masked_bias") or name.endswith(".attn.bias"):
+        print("  Skipping variable:", name)
+        continue
+
+    n_dims = len(data.shape)
+
+    # ftype == 0 -> float32, ftype == 1 -> float16
+    # 2-D ".weight" tensors become float16 when requested; in float16 mode every other tensor, and in
+    # float32 mode every tensor that is not already float32, is converted to float32
+    if ftype == 1 and name[-7:] == ".weight" and n_dims == 2:
+        print("  Converting to float16")
+        data = data.astype(np.float16)
+    elif ftype == 1 or data.dtype != np.float32:
+        print("  Converting to float32")
+        data = data.astype(np.float32)
+
+    # map tensor names
+    new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
+    if new_name is None:
+        # an unmappable tensor is a conversion failure: report it on stderr and exit non-zero
+        print("Can not map tensor '" + name + "'", file=sys.stderr)
+        sys.exit(1)
+
+    gguf_writer.add_tensor(new_name, data)
+
+
+print("gguf: write header")
+gguf_writer.write_header_to_file()
+print("gguf: write metadata")
+gguf_writer.write_kv_data_to_file()
+print("gguf: write tensors")
+gguf_writer.write_tensors_to_file()
+
+gguf_writer.close()
+
+print(f"gguf: model successfully exported to '{fname_out}'")
+print()
--- a/gpt4all-backend/src/dlhandle.cpp
+++ b/gpt4all-backend/src/dlhandle.cpp
@ -1,73 +0,0 @@
-#include "dlhandle.h"
-
-#include <string>
-
-#ifndef _WIN32
-#   include <dlfcn.h>
-#else
-#   include <cassert>
-#   include <sstream>
-#   define WIN32_LEAN_AND_MEAN
-#   ifndef NOMINMAX
-#       define NOMINMAX
-#   endif
-#   include <windows.h>
-#endif
-
-using namespace std::string_literals;
-namespace fs = std::filesystem;
-
-
-#ifndef _WIN32
-
-Dlhandle::Dlhandle(const fs::path &fpath)
-{
-    chandle = dlopen(fpath.c_str(), RTLD_LAZY | RTLD_LOCAL);
-    if (!chandle) {
-        throw Exception("dlopen: "s + dlerror());
-    }
-}
-
-Dlhandle::~Dlhandle()
-{
-    if (chandle) dlclose(chandle);
-}
-
-void *Dlhandle::get_internal(const char *symbol) const
-{
-    return dlsym(chandle, symbol);
-}
-
-#else // defined(_WIN32)
-
-Dlhandle::Dlhandle(const fs::path &fpath)
-{
-    fs::path afpath = fs::absolute(fpath);
-
-    // Suppress the "Entry Point Not Found" dialog, caused by outdated nvcuda.dll from the GPU driver
-    UINT lastErrorMode = GetErrorMode();
-    SetErrorMode(lastErrorMode | SEM_FAILCRITICALERRORS);
-
-    chandle = LoadLibraryExW(afpath.c_str(), NULL, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR);
-
-    SetErrorMode(lastErrorMode);
-
-    if (!chandle) {
-        DWORD err = GetLastError();
-        std::ostringstream ss;
-        ss << "LoadLibraryExW failed with error 0x" << std::hex << err;
-        throw Exception(ss.str());
-    }
-}
-
-Dlhandle::~Dlhandle()
-{
-    if (chandle) FreeLibrary(HMODULE(chandle));
-}
-
-void *Dlhandle::get_internal(const char *symbol) const
-{
-    return GetProcAddress(HMODULE(chandle), symbol);
-}
-
-#endif // defined(_WIN32)
--- a/gpt4all-backend/src/dlhandle.h
+++ b/gpt4all-backend/src/dlhandle.h
@ -1,47 +0,0 @@
-#pragma once
-
-#include <filesystem>
-#include <stdexcept>
-#include <string>
-#include <utility>
-
-namespace fs = std::filesystem;
-
-
-class Dlhandle {
-    void *chandle = nullptr;
-
-public:
-    class Exception : public std::runtime_error {
-    public:
-        using std::runtime_error::runtime_error;
-    };
-
-    Dlhandle() = default;
-    Dlhandle(const fs::path &fpath);
-    Dlhandle(const Dlhandle &o) = delete;
-    Dlhandle(Dlhandle &&o)
-        : chandle(o.chandle)
-    {
-        o.chandle = nullptr;
-    }
-
-    ~Dlhandle();
-
-    Dlhandle &operator=(Dlhandle &&o) {
-        chandle = std::exchange(o.chandle, nullptr);
-        return *this;
-    }
-
-    template <typename T>
-    T *get(const std::string &symbol) const {
-        return reinterpret_cast<T *>(get_internal(symbol.c_str()));
-    }
-
-    auto get_fnc(const std::string &symbol) const {
-        return get<void*(...)>(symbol);
-    }
-
-private:
-    void *get_internal(const char *symbol) const;
-};
--- a/gpt4all-backend/src/llmodel_shared.cpp
+++ b/gpt4all-backend/src/llmodel_shared.cpp
@ -1,298 +0,0 @@
-#include "llmodel.h"
-
-#include <algorithm>
-#include <cassert>
-#include <cstddef>
-#include <cstdint>
-#include <iostream>
-#include <iterator>
-#include <optional>
-#include <ranges>
-#include <stdexcept>
-#include <string>
-#include <string_view>
-#include <vector>
-
-namespace ranges = std::ranges;
-namespace views  = std::ranges::views;
-
-void LLModel::prompt(
-    std::string_view        prompt,
-    const PromptCallback   &promptCallback,
-    const ResponseCallback &responseCallback,
-    const PromptContext    &promptCtx
-) {
-    if (!isModelLoaded())
-        throw std::invalid_argument("Attempted to prompt an unloaded model.");
-    if (!supportsCompletion())
-        throw std::invalid_argument("Not a text completion model.");
-    if (!promptCtx.n_batch)
-        throw std::invalid_argument("Batch size cannot be zero.");
-    if (!promptCtx.n_predict)
-        return; // nothing requested
-
-    auto embd_inp = tokenize(prompt);
-    if (embd_inp.empty())
-        throw std::invalid_argument("Prompt tokenized to zero tokens.");
-
-    if (auto res = decodePrompt(promptCallback, promptCtx, std::move(embd_inp)))
-        generateResponse(responseCallback, promptCtx, /*n_past*/ *res);
-}
-
-int32_t LLModel::countPromptTokens(std::string_view prompt) const
-{
-    if (!isModelLoaded())
-        throw std::invalid_argument("Attempted to tokenize with an unloaded model.");
-    return int32_t(tokenize(prompt).size());
-}
-
-auto LLModel::decodePrompt(
-    const PromptCallback &promptCallback,
-    const PromptContext  &promptCtx,
-    std::vector<Token>    embd_inp
-) -> std::optional<int32_t>
-{
-    assert(!embd_inp.empty());
-
-    int32_t nCtx = contextLength();
-    int32_t n_batch = std::min(promptCtx.n_batch, LLMODEL_MAX_PROMPT_BATCH);
-
-    // Find the greatest n_past where the beginning of embd_inp matches the end of the token cache, starting at the
-    // requested n_past.
-    // This is used to skip unnecessary work when the prompt shares a common prefix with the previous result.
-    int32_t nPast = computeModelInputPosition(embd_inp);
-
-    // always decode up to a full batch before generating, even if cached
-    nPast -= std::min(n_batch, nPast);
-
-    // TODO(jared): generalize this to find the smallest new_embd_inp.size() - nPast given the cache
-    if (!nPast && int32_t(embd_inp.size()) > nCtx) {
-        // no cache hit -> shift the input before even processing
-
-        int32_t nKeep     = shouldAddBOS();
-        auto    newLength = int32_t(nCtx * (1.f - promptCtx.contextErase));
-        int32_t nDiscard  = int32_t(embd_inp.size()) - std::max(1, std::min(nCtx, newLength));
-
-        // execute the callback even for skipped tokens. this misrepresents the position of BOS but we don't care
-        auto discardedTokens = embd_inp | views::drop(nKeep) | views::take(nDiscard);
-        if (!promptCallback(discardedTokens, true))
-            return std::nullopt;
-
-        // erase nDiscard tokens
-        embd_inp.erase(discardedTokens.begin(), discardedTokens.end());
-        assert(int32_t(embd_inp.size()) <= nCtx);
-
-        // check the cache again, just in case
-        nPast = computeModelInputPosition(embd_inp);
-        nPast -= std::min(n_batch, nPast);
-    }
-
-    setModelInputPosition(nPast);
-
-    // execute the callback even for skipped tokens
-    if (!promptCallback(embd_inp | views::take(nPast), true))
-        return std::nullopt;
-
-    // process the prompt in batches
-    for (int32_t i = nPast; i < embd_inp.size();) {
-        auto batch_end = std::min(i + n_batch, int32_t(embd_inp.size()));
-        std::span batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);
-
-        // Check if the context has run out...
-        if (nPast + int32_t(batch.size()) > nCtx) {
-            shiftContext(promptCtx, &nPast);
-            assert(nPast + int32_t(batch.size()) <= nCtx);
-        }
-
-        // FIXME(Adam): We should find a way to bubble these strings to the UI level to allow for translation
-        if (!evalTokens(nPast, batch))
-            throw std::runtime_error("An internal error was encountered during prompt processing.");
-
-        for (auto &tok : batch) {
-            appendInputToken(tok);
-            nPast++;
-            if (!promptCallback({ &tok, 1 }, false))
-                return std::nullopt;
-        }
-        i = batch_end;
-    }
-
-    return nPast;
-}
-
-/*
- * If string s overlaps with the string key such that some prefix of the key is at the end
- * of the string, return the position in s where the first match starts. Otherwise, return
- * std::string::npos. Examples:
- * s = "bfo",  key = "foo" -> 1
- * s = "fooa", key = "foo" -> npos
- */
-static std::string::size_type stringsOverlap(const std::string &s, const std::string &key)
-{
-    if (s.empty() || key.empty())
-        throw std::invalid_argument("arguments to stringsOverlap must not be empty");
-
-    for (int start = std::max(0, int(s.size()) - int(key.size())); start < s.size(); start++) {
-        if (s.compare(start, s.size(), key, 0, s.size() - start) == 0)
-            return start;
-    }
-    return std::string::npos;
-}
-
-void LLModel::generateResponse(
-    const ResponseCallback &responseCallback,
-    const PromptContext    &promptCtx,
-    int32_t                 nPast
-) {
-    static const char *stopSequences[] {
-        "### System", "### Instruction", "### Human", "### User", "### Response", "### Assistant", "### Context",
-        "<|im_start|>", "<|im_end|>", "<|endoftext|>",
-    };
-
-    initSampler(promptCtx);
-
-    std::string cachedResponse;
-    std::vector<Token> cachedTokens;
-    int n_predicted = 0;
-
-    // Predict next tokens
-    for (bool stop = false; !stop;) {
-        // Sample next token
-        std::optional<Token> new_tok = sampleToken();
-        std::string new_piece = tokenToString(new_tok.value());
-        cachedTokens.push_back(new_tok.value());
-        cachedResponse += new_piece;
-
-        auto accept = [this, &promptCtx, &new_tok, &nPast] {
-            // Shift context if out of space
-            if (nPast >= contextLength()) {
-                shiftContext(promptCtx, &nPast);
-                assert(nPast < contextLength());
-            }
-
-            // Accept the token
-            Token tok = std::exchange(new_tok, std::nullopt).value();
-            if (!evalTokens(nPast, { &tok, 1 }))
-                throw std::runtime_error("An internal error was encountered during response generation.");
-
-            appendInputToken(tok);
-            nPast++;
-        };
-
-        // Check for EOS
-        auto lengthLimit = std::string::npos;
-        for (const auto token : endTokens()) {
-            if (new_tok == token) {
-                stop = true;
-                lengthLimit = cachedResponse.size() - new_piece.size();
-            }
-        }
-
-        if (lengthLimit != std::string::npos) {
-            // EOS matched
-        } else if (!isSpecialToken(new_tok.value())) {
-            // Check if the response contains a stop sequence
-            for (const auto &p : stopSequences) {
-                auto match = cachedResponse.find(p);
-                if (match != std::string::npos) stop = true;
-                lengthLimit = std::min(lengthLimit, match);
-                if (match == 0) break;
-            }
-
-            // Check if the response matches the start of a stop sequence
-            if (lengthLimit == std::string::npos) {
-                for (const auto &p : stopSequences) {
-                    auto match = stringsOverlap(cachedResponse, p);
-                    lengthLimit = std::min(lengthLimit, match);
-                    if (match == 0) break;
-                }
-            }
-        } else if (ranges::find(stopSequences, new_piece) < std::end(stopSequences)) {
-            // Special tokens must exactly match a stop sequence
-            stop = true;
-            lengthLimit = cachedResponse.size() - new_piece.size();
-        }
-
-        // Empty the cache, up to the length limit
-        std::string::size_type responseLength = 0;
-        while (!cachedTokens.empty()) {
-            Token tok = cachedTokens.front();
-            std::string piece = tokenToString(tok);
-
-            // Stop if the piece (or part of it) does not fit within the length limit
-            if (responseLength + (stop ? 1 : piece.size()) > lengthLimit)
-                break;
-
-            // Remove token from cache
-            assert(cachedResponse.starts_with(piece));
-            cachedTokens.erase(cachedTokens.begin(), cachedTokens.begin() + 1);
-            cachedResponse.erase(cachedResponse.begin(), cachedResponse.begin() + piece.size());
-
-            // Accept the token, if needed (not cached)
-            if (cachedTokens.empty() && new_tok)
-                accept();
-
-            // Send the token
-            if (!responseCallback(tok, piece) || ++n_predicted >= promptCtx.n_predict) {
-                stop = true;
-                break;
-            }
-
-            // FIXME(jared): we could avoid printing partial stop sequences if we didn't have to
-            // output token IDs and could cache a partial token for the next prompt call
-            responseLength += piece.size();
-        }
-        assert(cachedTokens.empty() == cachedResponse.empty());
-
-        // Accept the token, if needed (in cache)
-        if (new_tok) {
-            assert(!cachedTokens.empty() && cachedTokens.back() == new_tok);
-            if (stop) {
-                cachedTokens.pop_back();
-            } else {
-                accept();
-            }
-        }
-    }
-
-    if (inputLength() < cachedTokens.size()) {
-        /* This is theoretically possible if the longest stop sequence is greater than
-         * n_ctx * contextErase tokens. */
-        throw std::runtime_error("shifted too much context, can't go back");
-    }
-
-#ifndef NDEBUG
-    auto inp = inputTokens();
-    auto discard_start = inp.end() - cachedTokens.size();
-    assert(std::equal(discard_start, inp.end(), cachedTokens.begin()));
-#endif
-}
-
-void LLModel::embed(
-    const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
-    size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb
-) {
-    (void)texts;
-    (void)embeddings;
-    (void)prefix;
-    (void)dimensionality;
-    (void)tokenCount;
-    (void)doMean;
-    (void)atlas;
-    (void)cancelCb;
-    throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
-}
-
-void LLModel::embed(
-    const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
-    bool doMean, bool atlas
-) {
-    (void)texts;
-    (void)embeddings;
-    (void)isRetrieval;
-    (void)dimensionality;
-    (void)tokenCount;
-    (void)doMean;
-    (void)atlas;
-    throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
-}
--- a/gpt4all-backend/src/utils.h
+++ b/gpt4all-backend/src/utils.h
@ -1,17 +0,0 @@
-#pragma once
-
-#include <cassert>
-
-#ifdef NDEBUG
-#   ifdef __has_builtin
-#       if __has_builtin(__builtin_unreachable)
-#           define UNREACHABLE() __builtin_unreachable()
-#       else
-#           define UNREACHABLE() do {} while (0)
-#       endif
-#   else
-#       define UNREACHABLE() do {} while (0)
-#   endif
-#else
-#   define UNREACHABLE() assert(!"Unreachable statement was reached")
-#endif
--- a/gpt4all-backend/include/gpt4all-backend/sysinfo.h
+++ b/gpt4all-backend/include/gpt4all-backend/sysinfo.h
@ -2,21 +2,17 @@
 #define SYSINFO_H

 #include <fstream>
-#include <iomanip>
-#include <sstream>
 #include <string>
+#include <sstream>
+#include <iomanip>

 #if defined(__linux__)
-#   include <unistd.h>
+#include <unistd.h>
 #elif defined(__APPLE__)
-#   include <sys/types.h>
-#   include <sys/sysctl.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
 #elif defined(_WIN32)
-#   define WIN32_LEAN_AND_MEAN
-#   ifndef NOMINMAX
-#       define NOMINMAX
-#   endif
-#   include <windows.h>
+#include <windows.h>
 #endif

 static long long getSystemTotalRAMInBytes()
--- a/gpt4all-backend/utils.cpp
+++ b/gpt4all-backend/utils.cpp
@ -0,0 +1,328 @@
+#include "utils.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <fstream>
+#include <iterator>
+#include <regex>
+
+// Replaces every occurrence of `needle` in `str` with `replacement`, in place.
+// Scanning resumes after each inserted replacement, so text introduced by a
+// replacement is never matched again. An empty `needle` is a no-op: find()
+// would otherwise succeed at every position and the loop would never end.
+void replace(std::string & str, const std::string & needle, const std::string & replacement) {
+    if (needle.empty()) {
+        return;
+    }
+
+    size_t pos = 0;
+    while ((pos = str.find(needle, pos)) != std::string::npos) {
+        str.replace(pos, needle.length(), replacement);
+        pos += replacement.length();
+    }
+}
+
+// Minimal ("poor-man's") parser for a flat JSON object that maps string keys to integers.
+// gpt_vocab_init() uses it to load the token -> id table.
+//
+//   - terminates the process with exit(1) if `fname` cannot be opened
+//   - returns an empty map if the content does not start with '{'
+//   - entries whose value std::stoi cannot convert are silently skipped
+//   - in keys, the sequences \u0120, \u010a and \" are rewritten to ' ', '\n' and '"'
+//
+// This is not a general JSON parser: nested objects, arrays and other escape sequences are
+// not interpreted. Every look-ahead scan is bounds-checked so that truncated or malformed
+// input cannot make the parser read past the end of the buffer.
+std::map<std::string, int32_t> json_parse(const std::string & fname) {
+    std::map<std::string, int32_t> result;
+
+    // read file into string
+    std::string json;
+    {
+        std::ifstream ifs(fname);
+        if (!ifs) {
+            fprintf(stderr, "Failed to open %s\n", fname.c_str());
+            exit(1);
+        }
+
+        json = std::string((std::istreambuf_iterator<char>(ifs)),
+                (std::istreambuf_iterator<char>()));
+    }
+
+    if (json.empty() || json[0] != '{') {
+        return result;
+    }
+
+    // parse json
+    {
+        bool has_key  = false; // a complete key has been read and its value is pending
+        bool in_token = false; // currently inside a quoted string
+
+        std::string str_key = "";
+        std::string str_val = "";
+
+        int n = json.size();
+        for (int i = 1; i < n; ++i) {
+            if (!in_token) {
+                if (json[i] == ' ') continue;
+                if (json[i] == '"') {
+                    in_token = true;
+                    continue;
+                }
+            } else {
+                if (json[i] == '\\' && i+1 < n) {
+                    // keep the backslash here; the escaped character itself is appended
+                    // by the common code at the bottom of the loop body
+                    if (has_key == false) {
+                        str_key += json[i];
+                    } else {
+                        str_val += json[i];
+                    }
+                    ++i;
+                } else if (json[i] == '"') {
+                    if (has_key == false) {
+                        // closing quote of the key: step over `:` and surrounding spaces
+                        has_key = true;
+                        ++i;
+                        while (i < n && json[i] == ' ') ++i;
+                        ++i; // :
+                        while (i < n && json[i] == ' ') ++i;
+                        if (i >= n || json[i] != '\"') {
+                            // unquoted value: read up to the next ',' or '}' (or end of input)
+                            while (i < n && json[i] != ',' && json[i] != '}') {
+                                str_val += json[i++];
+                            }
+                            has_key = false;
+                        } else {
+                            // quoted value: keep collecting characters into str_val
+                            in_token = true;
+                            continue;
+                        }
+                    } else {
+                        // closing quote of a quoted value
+                        has_key = false;
+                    }
+
+                    ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
+                    ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
+                    ::replace(str_key, "\\\"",    "\""); // \\\"   -> "
+
+                    try {
+                        result[str_key] = std::stoi(str_val);
+                    } catch (...) {
+                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
+
+                    }
+                    str_key = "";
+                    str_val = "";
+                    in_token = false;
+                    continue;
+                }
+                if (has_key == false) {
+                    str_key += json[i];
+                } else {
+                    str_val += json[i];
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
+// Tokenizes `text` without any special-token handling.
+//
+// The text is first split into "words" with the GPT-2 style pre-tokenization pattern. Each
+// word is then covered left to right with the longest substring found in vocab.token_to_id.
+// A character that starts no known token is reported on stderr and skipped.
+std::vector<gpt_vocab::id> gpt_tokenize_inner(const gpt_vocab & vocab, const std::string & text) {
+    // Compiling a std::regex is expensive and the pattern never changes: build it once.
+    static const std::regex re(
+        R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)");
+
+    // first split the text into words
+    // (iterating over `text` directly avoids re-copying the remaining suffix after each match;
+    //  the pattern has no capture groups, so each match contributes exactly one word)
+    std::vector<std::string> words;
+    for (std::sregex_iterator m(text.begin(), text.end(), re), last; m != last; ++m) {
+        words.push_back(m->str());
+    }
+
+    // find the longest tokens that form the words:
+    std::vector<gpt_vocab::id> tokens;
+    for (const auto & word : words) {
+        if (word.size() == 0) continue;
+
+        const int n = word.size();
+        int i = 0;
+        while (i < n) {
+            // try the longest candidate starting at i first, then shrink it
+            bool matched = false;
+            for (int j = n; j > i; --j) {
+                auto it = vocab.token_to_id.find(word.substr(i, j-i));
+                if (it != vocab.token_to_id.end()) {
+                    tokens.push_back(it->second);
+                    i = j;
+                    matched = true;
+                    break;
+                }
+            }
+
+            // Only skip a character when nothing matched at all. Previously this step also ran
+            // right after a successful match (where j == i holds as well) and consumed the next
+            // character as a single-character token instead of starting a new longest match.
+            if (!matched) {
+                fprintf(stderr, "%s: unknown token '%s'\n", __func__, word.substr(i, 1).c_str());
+                ++i;
+            }
+        }
+    }
+
+    return tokens;
+}
+
+// Returns a copy of `s` in which every ECMAScript regex metacharacter is preceded by a
+// backslash, so that the result matches `s` literally when used as a pattern.
+std::string regex_escape(const std::string &s) {
+    // The backslash itself must be in the class as well; otherwise a string containing one
+    // would be read as the start of a regex escape sequence.
+    static const std::regex metacharacters(R"([\\\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
+    return std::regex_replace(s, metacharacters, "\\$&");
+}
+
+// Tokenizes `text`. Each occurrence of a registered special token (vocab.special_tokens)
+// that has an id in vocab.token_to_id is emitted as that single id; the text between such
+// occurrences is tokenized with gpt_tokenize_inner().
+//
+// A special token that has no id is treated as ordinary text. Previously such a match never
+// advanced the scan position, so the loop did not terminate.
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
+    // Generate the subpattern from the special_tokens vector if it's not empty
+    std::string special_tokens_subpattern;
+    for (const auto &token : vocab.special_tokens) {
+        if (token.empty()) {
+            continue; // an empty alternative would match everywhere without consuming input
+        }
+        if (!special_tokens_subpattern.empty()) {
+            special_tokens_subpattern += "|";
+        }
+        special_tokens_subpattern += regex_escape(token);
+    }
+    if (special_tokens_subpattern.empty()) {
+        return gpt_tokenize_inner(vocab, text);
+    }
+
+    const std::regex re(special_tokens_subpattern);
+
+    std::vector<gpt_vocab::id> out;
+    std::string str = text;
+    std::string pending; // ordinary text collected since the last emitted special token
+    std::smatch m;
+    while (std::regex_search(str, m, re)) {
+        pending += m.prefix().str();
+
+        auto tok = vocab.token_to_id.find(m.str());
+        if (tok != vocab.token_to_id.end()) {
+            auto pfxtoks = gpt_tokenize_inner(vocab, pending);
+            out.insert(out.end(), pfxtoks.begin(), pfxtoks.end());
+            out.push_back(tok->second);
+            pending.clear();
+        } else {
+            // registered as special but unknown to the vocabulary: keep it as plain text
+            pending += m.str();
+        }
+
+        // always move past the match; `m` refers into `str`, so copy the suffix out first
+        std::string rest = m.suffix().str();
+        str = rest;
+    }
+
+    pending += str;
+    if (!pending.empty()) {
+        auto tokrest = gpt_tokenize_inner(vocab, pending);
+        out.insert(out.end(), tokrest.begin(), tokrest.end());
+    }
+    return out;
+}
+
+
+// Fills `vocab` from the JSON vocabulary file `fname`: token_to_id is read with json_parse()
+// and id_to_token is derived from it as the inverse mapping. Progress is printed to stdout.
+// Always returns true.
+bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
+    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());
+
+    vocab.token_to_id = ::json_parse(fname);
+
+    // build the reverse lookup table
+    for (auto entry = vocab.token_to_id.begin(); entry != vocab.token_to_id.end(); ++entry) {
+        vocab.id_to_token[entry->second] = entry->first;
+    }
+
+    const int vocab_size = (int) vocab.token_to_id.size();
+    printf("%s: vocab size = %d\n", __func__, vocab_size);
+
+    // print the vocabulary
+    //for (auto kv : vocab.token_to_id) {
+    //    printf("'%s' -> %d\n", kv.first.data(), kv.second);
+    //}
+
+    return true;
+}
+
+// Samples the next token id from raw logits.
+//
+//   actualVocabSize     number of leading entries of `logits` to consider
+//   last_n_tokens_data  window of recent token ids that receive the repetition penalty
+//   last_n_tokens_size  number of ids in that window
+//   logits              one raw score per token id; must not be empty
+//   top_k               keep only the K highest-scoring tokens; values <= 0 or larger than the
+//                       number of logits mean "no top-k limit"
+//   top_p               keep the smallest leading set of those tokens whose cumulative
+//                       probability reaches P (only applied when top_p < 1)
+//   temp                temperature; temp <= 0 returns the highest-logit token directly,
+//                       without penalty or sampling
+//   repeat_penalty      penalty applied to the scaled logits of tokens in the window
+//   rng                 random number generator used for the final draw
+gpt_vocab::id gpt_sample_top_k_top_p(
+        const size_t actualVocabSize,
+        const int32_t * last_n_tokens_data,
+        int   last_n_tokens_size,
+        const std::vector<float> logits,
+        int    top_k,
+        double top_p,
+        double temp,
+        float repeat_penalty,
+        std::mt19937 & rng) {
+    // never read past the end of `logits`, even if the caller reports a larger vocabulary
+    const int n_logits = (int) std::min(actualVocabSize, logits.size());
+    const auto * plogits = logits.data();
+
+    // the penalty window is only searched, so it is used in place instead of being copied
+    const int32_t * last_n_begin = last_n_tokens_data;
+    const int32_t * last_n_end   = last_n_tokens_data + std::max(0, last_n_tokens_size);
+
+    if (temp <= 0) {
+        // select the token with the highest logit directly
+        float max_logit = plogits[0];
+        gpt_vocab::id max_id = 0;
+
+        for (int i = 1; i < n_logits; ++i) {
+            if (plogits[i] > max_logit) {
+                max_logit = plogits[i];
+                max_id = i;
+            }
+        }
+        return max_id;
+    }
+
+    // partial_sort() and the resize() below require 0 < top_k <= n_logits
+    if (top_k <= 0 || top_k > n_logits) {
+        top_k = n_logits;
+    }
+
+    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
+    logits_id.reserve(n_logits);
+
+    {
+        const float scale = 1.0f/temp;
+        for (int i = 0; i < n_logits; ++i) {
+            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
+            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
+            if (std::find(last_n_begin, last_n_end, i) != last_n_end) {
+                // if the score is negative, the penalty has to be multiplied (not divided)
+                // to reduce the probability of the previously seen token
+                if (plogits[i] < 0.0f) {
+                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
+                } else {
+                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
+                }
+            } else {
+                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
+            }
+        }
+    }
+
+    // find the top K tokens
+    std::partial_sort(
+            logits_id.begin(),
+            logits_id.begin() + top_k, logits_id.end(),
+            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
+        return a.first > b.first;
+    });
+
+    logits_id.resize(top_k);
+
+    double maxl = -INFINITY;
+    for (const auto & kv : logits_id) {
+        maxl = std::max(maxl, kv.first);
+    }
+
+    // compute probs for the top K tokens
+    // (subtracting the maximum keeps exp() from overflowing)
+    std::vector<double> probs;
+    probs.reserve(logits_id.size());
+
+    double sum = 0.0;
+    for (const auto & kv : logits_id) {
+        double p = exp(kv.first - maxl);
+        probs.push_back(p);
+        sum += p;
+    }
+
+    // normalize the probs
+    for (auto & p : probs) {
+        p /= sum;
+    }
+
+    if (top_p < 1.0f) {
+        // keep the shortest prefix whose cumulative probability reaches top_p ...
+        double cumsum = 0.0f;
+        for (int i = 0; i < top_k; i++) {
+            cumsum += probs[i];
+            if (cumsum >= top_p) {
+                top_k = i + 1;
+                probs.resize(top_k);
+                logits_id.resize(top_k);
+                break;
+            }
+        }
+
+        // ... and renormalize the surviving probabilities so they sum to 1 again
+        cumsum = 1.0/cumsum;
+        for (int i = 0; i < (int) probs.size(); i++) {
+            probs[i] *= cumsum;
+        }
+    }
+
+    //printf("\n");
+    //for (int i = 0; i < (int) probs.size(); i++) {
+    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
+    //}
+    //exit(0);
+
+    std::discrete_distribution<> dist(probs.begin(), probs.end());
+    int idx = dist(rng);
+
+    return logits_id[idx].second;
+}
--- a/gpt4all-backend/utils.h
+++ b/gpt4all-backend/utils.h
@ -0,0 +1,97 @@
+// Various helper functions and utilities
+
+#pragma once
+
+#include <string>
+#include <map>
+#include <vector>
+#include <random>
+#include <thread>
+
+//
+// General purpose inline functions
+//
+// User-defined literal: `N_MiB` is N mebibytes expressed in bytes (N * 1024 * 1024).
+constexpr inline unsigned long long operator ""_MiB(unsigned long long mebibytes) {
+    return mebibytes * (1024ULL * 1024ULL);
+}
+
+//
+// CLI argument parsing
+//
+
+// Command-line configurable generation parameters and their defaults.
+struct gpt_params {
+    int32_t seed      = -1; // RNG seed
+    // std::thread::hardware_concurrency() may return 0 when the core count cannot be
+    // determined, so clamp the default to the range [1, 4] instead of only capping it at 4
+    int32_t n_threads = std::max<int32_t>(1, std::min<int32_t>(4, (int32_t) std::thread::hardware_concurrency()));
+    int32_t n_predict = 200; // new tokens to predict
+
+    // sampling parameters
+    int32_t top_k = 40;
+    float   top_p = 0.9f;
+    float   temp  = 0.9f;
+
+    int32_t n_batch = 8; // batch size for prompt processing
+
+    std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
+    std::string prompt;
+};
+
+bool gpt_params_parse(int argc, char ** argv, gpt_params & params);
+
+void gpt_print_usage(int argc, char ** argv, const gpt_params & params);
+
+std::string gpt_random_prompt(std::mt19937 & rng);
+
+//
+// Vocab utils
+//
+
+// Vocabulary tables (token text <-> id) plus a list of registered special tokens.
+struct gpt_vocab {
+    typedef int32_t     id;
+    typedef std::string token;
+
+    std::map<token, id> token_to_id; // token text -> id
+    std::map<id, token> id_to_token; // id -> token text
+    std::vector<std::string> special_tokens;
+
+    // Appends `token` to special_tokens; the lookup tables are not modified.
+    void add_special_token(const std::string &token) {
+        special_tokens.emplace_back(token);
+    }
+};
+
+void replace(std::string & str, const std::string & needle, const std::string & replacement);
+
+// poor-man's JSON parsing
+std::map<std::string, int32_t> json_parse(const std::string & fname);
+
+// split text into tokens
+//
+// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
+//
+// Regex (Python):
+// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
+//
+// Regex (C++):
+// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
+//
+std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);
+
+// load the tokens from encoder.json
+bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);
+
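+// sample the next token id from the raw logits (only the first actualVocabSize are used)
+//
+//   - if temp <= 0, return the token with the highest logit directly
+//   - otherwise scale the logits by 1/temp and apply repeat_penalty to every token
+//     that occurs in the last_n_tokens window
+//   - consider only the top K tokens
+//   - from them, keep the smallest leading set whose cumulative probability reaches P
+//
+// TODO: not sure if this implementation is correct
+//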
+gpt_vocab::id gpt_sample_top_k_top_p(
+        const size_t actualVocabSize,
+        const int32_t * last_n_tokens_data,
+        int   last_n_tokens_size,
+        const std::vector<float> logits,
+        int    top_k,
+        double top_p,
+        double temp,
+        float repeat_penalty,
+        std::mt19937 & rng);
--- a/gpt4all-bindings/README.md
+++ b/gpt4all-bindings/README.md
@ -1,21 +1,3 @@
-# GPT4All Language Bindings
-These are the language bindings for the GPT4All backend. They provide functionality to load GPT4All models (and other llama.cpp models), generate text, and (in the case of the Python bindings) embed text as a vector representation.
-
-See their respective folders for language-specific documentation.
-
-### Languages
- [Python](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python) (Nomic official, maintained by [@cebtenzzre](https://github.com/cebtenzzre))
- [Node.js/Typescript](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/typescript) (community, maintained by [@jacoobes](https://github.com/jacoobes) and [@iimez](https://github.com/iimez))
-
-<br/>
-<br/>
-
-<details><summary><b>Archived Bindings</b></summary>
-<br/>
-
-The following bindings have been removed from this repository due to lack of maintenance. If adopted, they can be brought back&mdash;feel free to message a developer on Dicsord if you are interested in maintaining one of them. Below are links to their last available version (not necessarily the last working version).
- C#: [41c9013f](https://github.com/nomic-ai/gpt4all/tree/41c9013fa46a194b3e4fee6ced1b9d1b65e177ac/gpt4all-bindings/csharp)
- Java: [41c9013f](https://github.com/nomic-ai/gpt4all/tree/41c9013fa46a194b3e4fee6ced1b9d1b65e177ac/gpt4all-bindings/java)
- Go: [41c9013f](https://github.com/nomic-ai/gpt4all/tree/41c9013fa46a194b3e4fee6ced1b9d1b65e177ac/gpt4all-bindings/golang)
-
-</details>
+# GPT4All Bindings
+This directory will contain language-specific bindings on top of the C/C++ model backends.
+We will have one directory per language binding (e.g. Python, TypeScript, Golang, etc.).
--- a/gpt4all-bindings/cli/README.md
+++ b/gpt4all-bindings/cli/README.md
@ -2,7 +2,8 @@

 GPT4All on the command-line.

-More details on the [wiki](https://github.com/nomic-ai/gpt4all/wiki/Python-CLI).
+## Documentation
+<https://docs.gpt4all.io/gpt4all_cli.html>

 ## Quickstart

@ -33,11 +34,11 @@ python -m pip install --user --upgrade gpt4all typer
 # run the CLI
 python app.py repl
 ```
-By default, it will automatically download the `Mistral Instruct` model to `.cache/gpt4all/` in your
-user directory, if necessary.
+By default, it will automatically download the `groovy` model to `.cache/gpt4all/` in your user
+directory, if necessary.

 If you have already saved a model beforehand, specify its path with the `-m`/`--model` argument,
 for example:
 ```shell
-python app.py repl --model /home/user/my-gpt4all-models/mistral-7b-instruct-v0.1.Q4_0.gguf
+python app.py repl --model /home/user/my-gpt4all-models/gpt4all-13b-snoozy-q4_0.gguf
 ```
--- a/gpt4all-bindings/cli/app.py
+++ b/gpt4all-bindings/cli/app.py
@ -113,7 +113,10 @@ def _old_loop(gpt4all_instance):
        full_response = gpt4all_instance.chat_completion(
            MESSAGES,
            # preferential kwargs for chat ux
+            logits_size=0,
+            tokens_size=0,
            n_past=0,
+            n_ctx=0,
            n_predict=200,
            top_k=40,
            top_p=0.9,
--- a/gpt4all-bindings/csharp/.editorconfig
+++ b/gpt4all-bindings/csharp/.editorconfig
@ -0,0 +1,348 @@
+# EditorConfig is awesome: https://EditorConfig.org
+
+# top-most EditorConfig file
+root = true
+
+# Don't use tabs for indentation.
+[*]
+indent_style = space
+# (Please don't specify an indent_size here; that has too many unintended consequences.)
+
+# Code files
+[*.{cs,csx,vb,vbx}]
+indent_size = 4
+insert_final_newline = true
+charset = utf-8-bom
+
+# XML project files
+[*.{csproj,vbproj,vcxproj,vcxproj.filters,proj,projitems,shproj}]
+indent_size = 4
+
+# XML config files
+[*.{props,targets,ruleset,config,nuspec,resx,vsixmanifest,vsct}]
+indent_size = 2
+
+# JSON files
+[*.json]
+indent_size = 2
+
+# Powershell files
+[*.ps1]
+indent_size = 2
+
+# Shell script files
+[*.sh]
+end_of_line = lf
+indent_size = 2
+insert_final_newline = true
+
+# Dotnet code style settings:
+[*.{cs,vb}]
+
+# IDE0055: Fix formatting
+dotnet_diagnostic.IDE0055.severity = error
+dotnet_diagnostic.CS1573.severity = suggestion
+dotnet_diagnostic.CS1591.severity = suggestion
+
+# Sort using and Import directives with System.* appearing first
+dotnet_sort_system_directives_first = true
+dotnet_separate_import_directive_groups = false
+
+# Avoid "this." and "Me." if not necessary
+dotnet_style_qualification_for_field = false:suggestion
+dotnet_style_qualification_for_property = false:suggestion
+dotnet_style_qualification_for_method = false:suggestion
+dotnet_style_qualification_for_event = false:suggestion
+
+# Use language keywords instead of framework type names for type references
+dotnet_style_predefined_type_for_locals_parameters_members = true:warning
+dotnet_style_predefined_type_for_member_access = true:warning
+
+# Suggest more modern language features when available
+dotnet_style_object_initializer = true:suggestion
+dotnet_style_collection_initializer = true:suggestion
+dotnet_style_coalesce_expression = true:suggestion
+dotnet_style_null_propagation = true:suggestion
+dotnet_style_explicit_tuple_names = true:suggestion
+
+# Whitespace options
+dotnet_style_allow_multiple_blank_lines_experimental = false
+
+# Private fields are camelCase with '_' prefix
+dotnet_naming_rule.private_members_with_underscore.symbols  = private_fields
+dotnet_naming_rule.private_members_with_underscore.style    = prefix_underscore
+dotnet_naming_rule.private_members_with_underscore.severity = error
+dotnet_naming_symbols.private_fields.applicable_kinds           = field
+dotnet_naming_symbols.private_fields.applicable_accessibilities = private
+dotnet_naming_style.prefix_underscore.capitalization = camel_case
+dotnet_naming_style.prefix_underscore.required_prefix = _
+
+# Non-private static fields are PascalCase
+dotnet_naming_rule.non_private_static_fields_should_be_pascal_case.severity = suggestion
+dotnet_naming_rule.non_private_static_fields_should_be_pascal_case.symbols = non_private_static_fields
+dotnet_naming_rule.non_private_static_fields_should_be_pascal_case.style = non_private_static_field_style
+
+dotnet_naming_symbols.non_private_static_fields.applicable_kinds = field
+dotnet_naming_symbols.non_private_static_fields.applicable_accessibilities = public, protected, internal, protected_internal, private_protected
+dotnet_naming_symbols.non_private_static_fields.required_modifiers = static
+
+dotnet_naming_style.non_private_static_field_style.capitalization = pascal_case
+
+# Non-private readonly fields are PascalCase
+dotnet_naming_rule.non_private_readonly_fields_should_be_pascal_case.severity = suggestion
+dotnet_naming_rule.non_private_readonly_fields_should_be_pascal_case.symbols = non_private_readonly_fields
+dotnet_naming_rule.non_private_readonly_fields_should_be_pascal_case.style = non_private_readonly_field_style
+
+dotnet_naming_symbols.non_private_readonly_fields.applicable_kinds = field
+dotnet_naming_symbols.non_private_readonly_fields.applicable_accessibilities = public, protected, internal, protected_internal, private_protected
+dotnet_naming_symbols.non_private_readonly_fields.required_modifiers = readonly
+
+dotnet_naming_style.non_private_readonly_field_style.capitalization = pascal_case
+
+# Constants are PascalCase
+dotnet_naming_rule.constants_should_be_pascal_case.severity = suggestion
+dotnet_naming_rule.constants_should_be_pascal_case.symbols = constants
+dotnet_naming_rule.constants_should_be_pascal_case.style = constant_style
+
+dotnet_naming_symbols.constants.applicable_kinds = field, local
+dotnet_naming_symbols.constants.required_modifiers = const
+
+dotnet_naming_style.constant_style.capitalization = pascal_case
+
+# Static fields are camelCase and start with s_
+dotnet_naming_rule.static_fields_should_be_camel_case.severity = none
+dotnet_naming_rule.static_fields_should_be_camel_case.symbols = static_fields
+dotnet_naming_rule.static_fields_should_be_camel_case.style = static_field_style
+
+dotnet_naming_symbols.static_fields.applicable_kinds = field
+dotnet_naming_symbols.static_fields.required_modifiers = static
+
+dotnet_naming_style.static_field_style.capitalization = camel_case
+dotnet_naming_style.static_field_style.required_prefix = s_
+
+# Instance fields are camelCase and start with _
+dotnet_naming_rule.instance_fields_should_be_camel_case.severity = none
+dotnet_naming_rule.instance_fields_should_be_camel_case.symbols = instance_fields
+dotnet_naming_rule.instance_fields_should_be_camel_case.style = instance_field_style
+
+dotnet_naming_symbols.instance_fields.applicable_kinds = field
+
+dotnet_naming_style.instance_field_style.capitalization = camel_case
+dotnet_naming_style.instance_field_style.required_prefix = _
+
+# Locals and parameters are camelCase
+dotnet_naming_rule.locals_should_be_camel_case.severity = suggestion
+dotnet_naming_rule.locals_should_be_camel_case.symbols = locals_and_parameters
+dotnet_naming_rule.locals_should_be_camel_case.style = camel_case_style
+
+dotnet_naming_symbols.locals_and_parameters.applicable_kinds = parameter, local
+
+dotnet_naming_style.camel_case_style.capitalization = camel_case
+
+# Local functions are PascalCase
+dotnet_naming_rule.local_functions_should_be_pascal_case.severity = suggestion
+dotnet_naming_rule.local_functions_should_be_pascal_case.symbols = local_functions
+dotnet_naming_rule.local_functions_should_be_pascal_case.style = local_function_style
+
+dotnet_naming_symbols.local_functions.applicable_kinds = local_function
+
+dotnet_naming_style.local_function_style.capitalization = pascal_case
+
+# By default, name items with PascalCase
+dotnet_naming_rule.members_should_be_pascal_case.severity = suggestion
+dotnet_naming_rule.members_should_be_pascal_case.symbols = all_members
+dotnet_naming_rule.members_should_be_pascal_case.style = pascal_case_style
+
+dotnet_naming_symbols.all_members.applicable_kinds = *
+
+dotnet_naming_style.pascal_case_style.capitalization = pascal_case
+
+# error RS2008: Enable analyzer release tracking for the analyzer project containing rule '{0}'
+dotnet_diagnostic.RS2008.severity = none
+
+# IDE0073: File header
+dotnet_diagnostic.IDE0073.severity = none
+#file_header_template = Licensed to the .NET Foundation under one or more agreements.\nThe .NET Foundation licenses this file to you under the MIT license.\nSee the LICENSE file in the project root for more information.
+
+# IDE0035: Remove unreachable code
+dotnet_diagnostic.IDE0035.severity = warning
+
+# IDE0036: Order modifiers
+dotnet_diagnostic.IDE0036.severity = warning
+
+# IDE0043: Format string contains invalid placeholder
+dotnet_diagnostic.IDE0043.severity = warning
+
+# IDE0044: Make field readonly
+dotnet_diagnostic.IDE0044.severity = warning
+
+# IDE1006: Naming rule violation
+#dotnet_diagnostic.IDE1006.severity = none
+
+# RS0016: Only enable if API files are present
+dotnet_public_api_analyzer.require_api_files = true
+dotnet_style_operator_placement_when_wrapping = beginning_of_line
+tab_width = 4
+end_of_line = crlf
+dotnet_style_prefer_is_null_check_over_reference_equality_method = true:suggestion
+dotnet_style_prefer_auto_properties = true:silent
+dotnet_style_prefer_simplified_boolean_expressions = true:suggestion
+dotnet_style_prefer_conditional_expression_over_assignment = true:silent
+dotnet_style_prefer_conditional_expression_over_return = true:silent
+dotnet_style_prefer_inferred_tuple_names = true:suggestion
+dotnet_style_prefer_inferred_anonymous_type_member_names = true:suggestion
+dotnet_style_prefer_compound_assignment = true:suggestion
+dotnet_style_prefer_simplified_interpolation = true:suggestion
+dotnet_style_namespace_match_folder = true:suggestion
+
+# CSharp code style settings:
+[*.cs]
+# Newline settings
+csharp_new_line_before_open_brace = all
+csharp_new_line_before_else = true
+csharp_new_line_before_catch = true
+csharp_new_line_before_finally = true
+csharp_new_line_before_members_in_object_initializers = true
+csharp_new_line_before_members_in_anonymous_types = true
+csharp_new_line_between_query_expression_clauses = true
+
+# Indentation preferences
+csharp_indent_block_contents = true
+csharp_indent_braces = false
+csharp_indent_case_contents = true
+csharp_indent_case_contents_when_block = true
+csharp_indent_switch_labels = true
+csharp_indent_labels = flush_left
+
+# Whitespace options
+csharp_style_allow_embedded_statements_on_same_line_experimental = false
+csharp_style_allow_blank_lines_between_consecutive_braces_experimental = false
+csharp_style_allow_blank_line_after_colon_in_constructor_initializer_experimental = false
+
+# Prefer "var" everywhere
+csharp_style_var_for_built_in_types = true:suggestion
+csharp_style_var_when_type_is_apparent = true:suggestion
+csharp_style_var_elsewhere = true:suggestion
+
+# Prefer method-like constructs to have a block body
+csharp_style_expression_bodied_methods = false:none
+csharp_style_expression_bodied_constructors = false:none
+csharp_style_expression_bodied_operators = false:none
+
+# Prefer property-like constructs to have an expression-body
+csharp_style_expression_bodied_properties = true:none
+csharp_style_expression_bodied_indexers = true:none
+csharp_style_expression_bodied_accessors = true:none
+
+# Suggest more modern language features when available
+csharp_style_pattern_matching_over_is_with_cast_check = true:suggestion
+csharp_style_pattern_matching_over_as_with_null_check = true:suggestion
+csharp_style_inlined_variable_declaration = true:suggestion
+csharp_style_throw_expression = true:suggestion
+csharp_style_conditional_delegate_call = true:suggestion
+
+# Space preferences
+csharp_space_after_cast = false
+csharp_space_after_colon_in_inheritance_clause = true
+csharp_space_after_comma = true
+csharp_space_after_dot = false
+csharp_space_after_keywords_in_control_flow_statements = true
+csharp_space_after_semicolon_in_for_statement = true
+csharp_space_around_binary_operators = before_and_after
+csharp_space_around_declaration_statements = do_not_ignore
+csharp_space_before_colon_in_inheritance_clause = true
+csharp_space_before_comma = false
+csharp_space_before_dot = false
+csharp_space_before_open_square_brackets = false
+csharp_space_before_semicolon_in_for_statement = false
+csharp_space_between_empty_square_brackets = false
+csharp_space_between_method_call_empty_parameter_list_parentheses = false
+csharp_space_between_method_call_name_and_opening_parenthesis = false
+csharp_space_between_method_call_parameter_list_parentheses = false
+csharp_space_between_method_declaration_empty_parameter_list_parentheses = false
+csharp_space_between_method_declaration_name_and_open_parenthesis = false
+csharp_space_between_method_declaration_parameter_list_parentheses = false
+csharp_space_between_parentheses = false
+csharp_space_between_square_brackets = false
+
+# Blocks are allowed
+csharp_prefer_braces = true:silent
+csharp_preserve_single_line_blocks = true
+csharp_preserve_single_line_statements = true
+
+# Target-typed new expression
+csharp_style_implicit_object_creation_when_type_is_apparent = true:suggestion
+
+# Currently only enabled for C# due to crash in VB analyzer.  VB can be enabled once
+# https://github.com/dotnet/roslyn/pull/54259 has been published.
+dotnet_style_allow_statement_immediately_after_block_experimental = false
+dotnet_diagnostic.RCS0003.severity=warning
+dotnet_diagnostic.RCS1036.severity=error
+dotnet_diagnostic.IDE0005.severity=warning
+dotnet_diagnostic.IDE0007.severity=error
+csharp_using_directive_placement = outside_namespace:silent
+csharp_prefer_simple_using_statement = true:suggestion
+csharp_style_namespace_declarations = block_scoped:silent
+csharp_style_expression_bodied_lambdas = true:silent
+csharp_style_expression_bodied_local_functions = false:silent
+csharp_style_prefer_null_check_over_type_check = true:suggestion
+dotnet_diagnostic.RCS1075.severity = suggestion
+
+[src/CodeStyle/**.{cs,vb}]
+# warning RS0005: Do not use generic CodeAction.Create to create CodeAction
+dotnet_diagnostic.RS0005.severity = none
+
+[src/{Analyzers,CodeStyle,Features,Workspaces,EditorFeatures,VisualStudio}/**/*.{cs,vb}]
+
+# IDE0011: Add braces
+csharp_prefer_braces = when_multiline:warning
+# NOTE: We need the below severity entry for Add Braces due to https://github.com/dotnet/roslyn/issues/44201
+dotnet_diagnostic.IDE0011.severity = warning
+
+# IDE0040: Add accessibility modifiers
+dotnet_diagnostic.IDE0040.severity = warning
+
+# CONSIDER: Are IDE0051 and IDE0052 too noisy to be warnings for IDE editing scenarios? Should they be made build-only warnings?
+# IDE0051: Remove unused private member
+dotnet_diagnostic.IDE0051.severity = warning
+
+# IDE0052: Remove unread private member
+dotnet_diagnostic.IDE0052.severity = warning
+
+# IDE0059: Unnecessary assignment to a value
+dotnet_diagnostic.IDE0059.severity = warning
+
+# IDE0060: Remove unused parameter
+dotnet_diagnostic.IDE0060.severity = warning
+
+# CA1012: Abstract types should not have public constructors
+dotnet_diagnostic.CA1012.severity = warning
+
+# CA1822: Make member static
+dotnet_diagnostic.CA1822.severity = warning
+
+# Prefer "var" everywhere
+dotnet_diagnostic.IDE0007.severity = warning
+csharp_style_var_for_built_in_types = true:warning
+csharp_style_var_when_type_is_apparent = true:warning
+csharp_style_var_elsewhere = true:warning
+
+# dotnet_style_allow_multiple_blank_lines_experimental
+dotnet_diagnostic.IDE2000.severity = warning
+
+# csharp_style_allow_embedded_statements_on_same_line_experimental
+dotnet_diagnostic.IDE2001.severity = warning
+
+# csharp_style_allow_blank_lines_between_consecutive_braces_experimental
+dotnet_diagnostic.IDE2002.severity = warning
+
+# dotnet_style_allow_statement_immediately_after_block_experimental
+dotnet_diagnostic.IDE2003.severity = warning
+
+# csharp_style_allow_blank_line_after_colon_in_constructor_initializer_experimental
+dotnet_diagnostic.IDE2004.severity = warning
+
+[src/{VisualStudio}/**/*.{cs,vb}]
+# CA1822: Make member static
+# There is a risk of accidentally breaking an internal API that partners rely on through IVT.
+dotnet_code_quality.CA1822.api_surface = private
--- a/gpt4all-bindings/csharp/.gitignore
+++ b/gpt4all-bindings/csharp/.gitignore
@ -0,0 +1,379 @@
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+##
+## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore
+
+runtimes
+**/*nuget
+
+*.zip
+include/
+*.exp
+*.lib
+*.dll
+
+# User-specific files
+*.rsuser
+*.suo
+*.user
+*.userosscache
+*.sln.docstates
+
+# User-specific files (MonoDevelop/Xamarin Studio)
+*.userprefs
+
+# Mono auto generated files
+mono_crash.*
+Tests/**/launchSettings.json
+
+# Build results
+[Dd]ebug/
+[Dd]ebugPublic/
+[Rr]elease/
+[Rr]eleases/
+x64/
+x86/
+[Ww][Ii][Nn]32/
+[Aa][Rr][Mm]/
+[Aa][Rr][Mm]64/
+bld/
+[Bb]in/
+[Oo]bj/
+[Oo]ut/
+[Ll]og/
+[Ll]ogs/
+
+# Visual Studio 2015/2017 cache/options directory
+.vs/
+# Uncomment if you have tasks that create the project's static files in wwwroot
+#wwwroot/
+
+# Visual Studio 2017 auto generated files
+Generated\ Files/
+
+# MSTest test Results
+[Tt]est[Rr]esult*/
+[Bb]uild[Ll]og.*
+
+# NUnit
+*.VisualState.xml
+TestResult.xml
+nunit-*.xml
+
+# Build Results of an ATL Project
+[Dd]ebugPS/
+[Rr]eleasePS/
+dlldata.c
+
+# Benchmark Results
+BenchmarkDotNet.Artifacts/
+
+# .NET Core
+project.lock.json
+project.fragment.lock.json
+artifacts/
+
+# ASP.NET Scaffolding
+ScaffoldingReadMe.txt
+
+# StyleCop
+StyleCopReport.xml
+
+# Files built by Visual Studio
+*_i.c
+*_p.c
+*_h.h
+*.ilk
+*.meta
+*.obj
+*.iobj
+*.pch
+*.pdb
+*.ipdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.tmp_proj
+*_wpftmp.csproj
+*.log
+*.vspscc
+*.vssscc
+.builds
+*.pidb
+*.svclog
+*.scc
+
+# Chutzpah Test files
+_Chutzpah*
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opendb
+*.opensdf
+*.sdf
+*.cachefile
+*.VC.db
+*.VC.VC.opendb
+
+# Visual Studio profiler
+*.psess
+*.vsp
+*.vspx
+*.sap
+
+# Visual Studio Trace Files
+*.e2e
+
+# TFS 2012 Local Workspace
+$tf/
+
+# Guidance Automation Toolkit
+*.gpState
+
+# ReSharper is a .NET coding add-in
+_ReSharper*/
+*.[Rr]e[Ss]harper
+*.DotSettings.user
+
+# TeamCity is a build add-in
+_TeamCity*
+
+# DotCover is a Code Coverage Tool
+*.dotCover
+
+# AxoCover is a Code Coverage Tool
+.axoCover/*
+!.axoCover/settings.json
+
+# Coverlet is a free, cross platform Code Coverage Tool
+coverage*.json
+coverage*.xml
+coverage*.info
+
+# Visual Studio code coverage results
+*.coverage
+*.coveragexml
+
+# NCrunch
+_NCrunch_*
+.*crunch*.local.xml
+nCrunchTemp_*
+
+# MightyMoose
+*.mm.*
+AutoTest.Net/
+
+# Web workbench (sass)
+.sass-cache/
+
+# Installshield output folder
+[Ee]xpress/
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish/
+
+# Publish Web Output
+*.[Pp]ublish.xml
+*.azurePubxml
+# Note: Comment the next line if you want to checkin your web deploy settings,
+# but database connection strings (with potential passwords) will be unencrypted
+*.pubxml
+*.publishproj
+
+# Microsoft Azure Web App publish settings. Comment the next line if you want to
+# checkin your Azure Web App publish settings, but sensitive information contained
+# in these scripts will be unencrypted
+PublishScripts/
+
+# NuGet Packages
+*.nupkg
+# NuGet Symbol Packages
+*.snupkg
+# The packages folder can be ignored because of Package Restore
+**/[Pp]ackages/*
+# except build/, which is used as an MSBuild target.
+!**/[Pp]ackages/build/
+# Uncomment if necessary however generally it will be regenerated when needed
+#!**/[Pp]ackages/repositories.config
+# NuGet v3's project.json files produces more ignorable files
+*.nuget.props
+*.nuget.targets
+
+# Microsoft Azure Build Output
+csx/
+*.build.csdef
+
+# Microsoft Azure Emulator
+ecf/
+rcf/
+
+# Windows Store app package directories and files
+AppPackages/
+BundleArtifacts/
+Package.StoreAssociation.xml
+_pkginfo.txt
+*.appx
+*.appxbundle
+*.appxupload
+
+# Visual Studio cache files
+# files ending in .cache can be ignored
+*.[Cc]ache
+# but keep track of directories ending in .cache
+!?*.[Cc]ache/
+
+# Others
+ClientBin/
+~$*
+*~
+*.dbmdl
+*.dbproj.schemaview
+*.jfm
+*.pfx
+*.publishsettings
+orleans.codegen.cs
+
+# Including strong name files can present a security risk
+# (https://github.com/github/gitignore/pull/2483#issue-259490424)
+#*.snk
+
+# Since there are multiple workflows, uncomment next line to ignore bower_components
+# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
+#bower_components/
+
+# RIA/Silverlight projects
+Generated_Code/
+
+# Backup & report files from converting an old project file
+# to a newer Visual Studio version. Backup files are not needed,
+# because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+UpgradeLog*.htm
+ServiceFabricBackup/
+*.rptproj.bak
+
+# SQL Server files
+*.mdf
+*.ldf
+*.ndf
+
+# Business Intelligence projects
+*.rdl.data
+*.bim.layout
+*.bim_*.settings
+*.rptproj.rsuser
+*- [Bb]ackup.rdl
+*- [Bb]ackup ([0-9]).rdl
+*- [Bb]ackup ([0-9][0-9]).rdl
+
+# Microsoft Fakes
+FakesAssemblies/
+
+# GhostDoc plugin setting file
+*.GhostDoc.xml
+
+# Node.js Tools for Visual Studio
+.ntvs_analysis.dat
+node_modules/
+
+# Visual Studio 6 build log
+*.plg
+
+# Visual Studio 6 workspace options file
+*.opt
+
+# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
+*.vbw
+
+# Visual Studio LightSwitch build output
+**/*.HTMLClient/GeneratedArtifacts
+**/*.DesktopClient/GeneratedArtifacts
+**/*.DesktopClient/ModelManifest.xml
+**/*.Server/GeneratedArtifacts
+**/*.Server/ModelManifest.xml
+_Pvt_Extensions
+
+# Paket dependency manager
+.paket/paket.exe
+paket-files/
+
+# FAKE - F# Make
+.fake/
+
+# CodeRush personal settings
+.cr/personal
+
+# Python Tools for Visual Studio (PTVS)
+__pycache__/
+*.pyc
+
+# Cake - Uncomment if you are using it
+# tools/**
+# !tools/packages.config
+
+# Tabs Studio
+*.tss
+
+# Telerik's JustMock configuration file
+*.jmconfig
+
+# BizTalk build output
+*.btp.cs
+*.btm.cs
+*.odx.cs
+*.xsd.cs
+
+# OpenCover UI analysis results
+OpenCover/
+
+# Azure Stream Analytics local run output
+ASALocalRun/
+
+# MSBuild Binary and Structured Log
+*.binlog
+
+# NVidia Nsight GPU debugger configuration file
+*.nvuser
+
+# MFractors (Xamarin productivity tool) working folder
+.mfractor/
+
+# Local History for Visual Studio
+.localhistory/
+
+# BeatPulse healthcheck temp database
+healthchecksdb
+
+# Backup folder for Package Reference Convert tool in Visual Studio 2017
+MigrationBackup/
+
+# Ionide (cross platform F# VS Code tools) working folder
+.ionide/
+
+# Fody - auto-generated XML schema
+FodyWeavers.xsd
+
+# JetBrains Rider
+.idea
+
+# Visual Studio Code
+.vscode
--- a/gpt4all-bindings/csharp/Directory.Build.props
+++ b/gpt4all-bindings/csharp/Directory.Build.props
@ -0,0 +1,44 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project>
+
+    <PropertyGroup>
+        <Company></Company>
+        <Copyright></Copyright>
+        <NeutralLanguage>en-US</NeutralLanguage>
+        <Version>0.6.4-alpha</Version>
+        <VersionSuffix>$(VersionSuffix)</VersionSuffix>
+        <Version Condition=" '$(VersionSuffix)' != '' ">$(Version)$(VersionSuffix)</Version> <!-- suffix is appended verbatim, no separator is inserted -->
+        <TreatWarningsAsErrors>true</TreatWarningsAsErrors> <!-- every warning fails the build -->
+        <RepositoryUrl></RepositoryUrl>
+        <RepositoryType>git</RepositoryType>
+        <IncludeSymbols>true</IncludeSymbols>
+        <IncludeSource>true</IncludeSource>
+        <AnalysisLevel>latest-minimum</AnalysisLevel>
+        <EnforceCodeStyleInBuild>true</EnforceCodeStyleInBuild> <!-- code-style (IDExxxx) rules are checked at build time -->
+    </PropertyGroup>
+
+    <ItemGroup>
+        <Using Include="System"/> <!-- global using for every project under this directory -->
+    </ItemGroup>
+
+    <PropertyGroup>
+        <LangVersion>preview</LangVersion>
+        <Features>strict</Features>
+    </PropertyGroup>
+
+    <ItemGroup> <!-- Roslynator analyzers; PrivateAssets=all keeps them out of consumers' dependencies -->
+        <PackageReference Include="Roslynator.Analyzers" Version="4.2.0">
+            <PrivateAssets>all</PrivateAssets>
+            <IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
+        </PackageReference>
+        <PackageReference Include="Roslynator.CodeAnalysis.Analyzers" Version="4.2.0">
+            <PrivateAssets>all</PrivateAssets>
+            <IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
+        </PackageReference>
+        <PackageReference Include="Roslynator.Formatting.Analyzers" Version="4.2.0">
+            <PrivateAssets>all</PrivateAssets>
+            <IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
+        </PackageReference>
+    </ItemGroup>
+
+</Project>
--- a/gpt4all-bindings/csharp/Gpt4All.Samples/Gpt4All.Samples.csproj
+++ b/gpt4all-bindings/csharp/Gpt4All.Samples/Gpt4All.Samples.csproj
@ -0,0 +1,33 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+    <PropertyGroup>
+        <OutputType>Exe</OutputType>
+        <TargetFramework>net8.0</TargetFramework>
+        <ImplicitUsings>enable</ImplicitUsings>
+        <Nullable>enable</Nullable>
+        <GenerateDocumentationFile>true</GenerateDocumentationFile>
+    </PropertyGroup>
+
+    <ItemGroup>
+        <ProjectReference Include="..\Gpt4All\Gpt4All.csproj" />
+    </ItemGroup>
+
+    <ItemGroup>
+        <!-- Windows: native DLLs, package path runtimes\win-x64\native -->
+        <None Include="..\runtimes\win-x64\native\*.dll" Pack="true" PackagePath="runtimes\win-x64\native\%(Filename)%(Extension)" />
+        <!-- Linux: native shared objects, package path runtimes\linux-x64\native -->
+        <None Include="..\runtimes\linux-x64\native\*.so" Pack="true" PackagePath="runtimes\linux-x64\native\%(Filename)%(Extension)" />
+        <!-- MacOS: native dylibs, package path runtimes\osx\native -->
+        <None Include="..\runtimes\osx\native\*.dylib" Pack="true" PackagePath="runtimes\osx\native\%(Filename)%(Extension)" />
+    </ItemGroup>
+
+    <ItemGroup>
+        <!-- Windows: copy the native DLLs to the output directory when building on Windows -->
+        <None Condition="$([MSBuild]::IsOSPlatform('Windows'))" Include="..\runtimes\win-x64\native\*.dll" Visible="False" CopyToOutputDirectory="PreserveNewest" />
+        <!-- Linux: copy the native shared objects to the output directory when building on Linux -->
+        <None Condition="$([MSBuild]::IsOSPlatform('Linux'))" Include="..\runtimes\linux-x64\native\*.so" Visible="False" CopyToOutputDirectory="PreserveNewest" />
+        <!-- MacOS: copy the native dylibs and the .metal files to the output directory when building on macOS -->
+        <None Condition="$([MSBuild]::IsOSPlatform('OSX'))" Include="..\runtimes\osx\native\*.dylib" Visible="False" CopyToOutputDirectory="PreserveNewest" />
+        <Content Condition="$([MSBuild]::IsOSPlatform('OSX'))" Include="..\runtimes\osx\native\*.metal" Visible="False" CopyToOutputDirectory="PreserveNewest" />
+    </ItemGroup>
+</Project>
--- a/gpt4all-bindings/csharp/Gpt4All.Samples/Program.cs
+++ b/gpt4all-bindings/csharp/Gpt4All.Samples/Program.cs
@ -0,0 +1,22 @@
+using Gpt4All;
+
+// Sample CLI: loads the model at <model-path> and streams the completion of <prompt> to stdout.
+if (args.Length < 2)
+{
+    Console.WriteLine("Usage: Gpt4All.Samples <model-path> <prompt>");
+    return;
+}
+
+var modelPath = args[0];
+var prompt = args[1];
+
+// The factory is only created once the arguments are known to be usable.
+var modelFactory = new Gpt4AllModelFactory();
+using var model = modelFactory.LoadModel(modelPath);
+
+// Request a streaming prediction and echo every token as soon as it is produced.
+var result = await model.GetStreamingPredictionAsync(prompt, PredictRequestOptions.Defaults);
+await foreach (var token in result.GetPredictionStreamingAsync())
+{
+    Console.Write(token);
+}
--- a/gpt4all-bindings/csharp/Gpt4All.Tests/Constants.cs
+++ b/gpt4all-bindings/csharp/Gpt4All.Tests/Constants.cs
@ -0,0 +1,9 @@
+namespace Gpt4All.Tests;
+
+/// <summary>
+/// Model file locations used by the tests in this project.
+/// </summary>
+public static class Constants
+{
+    // Relative directory the model paths below are built from.
+    // NOTE(review): presumably resolved against the test working directory — confirm.
+    public const string MODELS_BASE_DIR = "../../../models";
+    public const string LLAMA_MODEL_PATH = $"{MODELS_BASE_DIR}/ggml-gpt4all-l13b-snoozy.bin";
+    public const string GPTJ_MODEL_PATH = $"{MODELS_BASE_DIR}/ggml-gpt4all-j-v1.3-groovy.bin";
+    public const string MPT_MODEL_PATH = $"{MODELS_BASE_DIR}/ggml-mpt-7b-chat.bin";
+}
--- a/gpt4all-bindings/csharp/Gpt4All.Tests/Gpt4All.Tests.csproj
+++ b/gpt4all-bindings/csharp/Gpt4All.Tests/Gpt4All.Tests.csproj
@ -0,0 +1,60 @@
+<Project Sdk="Microsoft.NET.Sdk">
+
+    <PropertyGroup>
+        <TargetFramework>net8.0</TargetFramework>
+        <Nullable>enable</Nullable>
+
+        <IsPackable>false</IsPackable>
+        <GenerateDocumentationFile>true</GenerateDocumentationFile>
+    </PropertyGroup>
+
+    <ItemGroup>
+        <PackageReference Include="Microsoft.NET.Test.Sdk" Version="17.6.2" />
+        <PackageReference Include="xunit" Version="2.4.2" />
+        <PackageReference Include="xunit.runner.visualstudio" Version="2.4.5">
+            <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
+            <PrivateAssets>all</PrivateAssets>
+        </PackageReference>
+        <PackageReference Include="coverlet.collector" Version="6.0.0">
+            <IncludeAssets>runtime; build; native; contentfiles; analyzers; buildtransitive</IncludeAssets>
+            <PrivateAssets>all</PrivateAssets>
+        </PackageReference>
+    </ItemGroup>
+
+    <ItemGroup>
+        <ProjectReference Include="..\Gpt4All\Gpt4All.csproj" />
+    </ItemGroup>
+
+    <ItemGroup>
+        <!-- NOTE(review): IsPackable is false for this project, so the Pack metadata below looks unused; confirm. -->
+        <!-- Windows: native DLLs, package path runtimes\win-x64\native -->
+        <None Include="..\runtimes\win-x64\native\*.dll" Pack="true" PackagePath="runtimes\win-x64\native\%(Filename)%(Extension)" />
+        <!-- Linux: native shared objects, package path runtimes\linux-x64\native -->
+        <None Include="..\runtimes\linux-x64\native\*.so" Pack="true" PackagePath="runtimes\linux-x64\native\%(Filename)%(Extension)" />
+        <!-- MacOS: native dylibs, package path runtimes\osx\native -->
+        <None Include="..\runtimes\osx\native\*.dylib" Pack="true" PackagePath="runtimes\osx\native\%(Filename)%(Extension)" />
+    </ItemGroup>
+
+    <ItemGroup>
+        <!-- Windows: copy the native DLLs to the output directory when building on Windows -->
+        <None Condition="$([MSBuild]::IsOSPlatform('Windows'))" Include="..\runtimes\win-x64\native\*.dll" Visible="False" CopyToOutputDirectory="PreserveNewest" />
+        <!-- Linux: copy the native shared objects to the output directory when building on Linux -->
+        <None Condition="$([MSBuild]::IsOSPlatform('Linux'))" Include="..\runtimes\linux-x64\native\*.so" Visible="False" CopyToOutputDirectory="PreserveNewest" />
+        <!-- MacOS: copy the native dylibs to the output directory when building on macOS -->
+        <None Condition="$([MSBuild]::IsOSPlatform('OSX'))" Include="..\runtimes\osx\native\*.dylib" Visible="False" CopyToOutputDirectory="PreserveNewest" />
+    </ItemGroup>
+
+    <ItemGroup>
+      <PackageReference Update="Roslynator.Analyzers" Version="4.3.0">
+        <PrivateAssets>all</PrivateAssets>
+        <IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
+      </PackageReference>
+      <PackageReference Update="Roslynator.CodeAnalysis.Analyzers" Version="4.3.0">
+        <PrivateAssets>all</PrivateAssets>
+        <IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
+      </PackageReference>
+      <PackageReference Update="Roslynator.Formatting.Analyzers" Version="4.3.0">
+        <PrivateAssets>all</PrivateAssets>
+        <IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
+      </PackageReference>
+    </ItemGroup>
+</Project>
--- a/gpt4all-bindings/csharp/Gpt4All.Tests/ModelFactoryTests.cs
+++ b/gpt4all-bindings/csharp/Gpt4All.Tests/ModelFactoryTests.cs
@ -0,0 +1,34 @@
+using Xunit;
+
+namespace Gpt4All.Tests;
+
+/// <summary>
+/// Smoke tests checking that <see cref="Gpt4AllModelFactory"/> can load each model file
+/// listed in <see cref="Constants"/>. Every test carries the <see cref="Traits.SkipOnCI"/> trait.
+/// </summary>
+public class ModelFactoryTests
+{
+    private readonly Gpt4AllModelFactory _modelFactory = new();
+
+    [Fact]
+    [Trait(Traits.SkipOnCI, "True")]
+    public void CanLoadLlamaModel() => LoadAndDispose(Constants.LLAMA_MODEL_PATH);
+
+    [Fact]
+    [Trait(Traits.SkipOnCI, "True")]
+    public void CanLoadGptjModel() => LoadAndDispose(Constants.GPTJ_MODEL_PATH);
+
+    [Fact]
+    [Trait(Traits.SkipOnCI, "True")]
+    public void CanLoadMptModel() => LoadAndDispose(Constants.MPT_MODEL_PATH);
+
+    // Loads the model at the given path and disposes it when the scope ends;
+    // the test passes as long as nothing throws.
+    private void LoadAndDispose(string modelPath)
+    {
+        using var model = _modelFactory.LoadModel(modelPath);
+    }
+}
--- a/gpt4all-bindings/csharp/Gpt4All.Tests/NativeLibraryLoaderTests.cs
+++ b/gpt4all-bindings/csharp/Gpt4All.Tests/NativeLibraryLoaderTests.cs
@ -0,0 +1,56 @@
+using System.IO;
+using Gpt4All.LibraryLoader;
+using Xunit;
+
+namespace Gpt4All.Tests;
+
+/// <summary>
+/// Checks that the native <c>libllmodel</c> library can be opened, both through
+/// <see cref="NativeLibraryLoader"/> and through each platform-specific loader.
+/// </summary>
+public class NativeLibraryLoaderTests
+{
+    // Format string for the library file name; {0} is the platform's file extension.
+    private const string LLModelLib = "libllmodel.{0}";
+
+    [Fact]
+    public void NativeLibraryShouldLoad()
+    {
+        var loadResult = NativeLibraryLoader.LoadNativeLibrary(bypassLoading: false);
+        Assert.True(loadResult.IsSuccess);
+    }
+
+    [PlatformSpecificFact(Platforms.Windows)]
+    public void NativeLibraryShouldLoad_Windows()
+    {
+        var openResult = new WindowsLibraryLoader().OpenLibrary(GetLibraryPath("dll"));
+        Assert.True(openResult.IsSuccess);
+    }
+
+    [PlatformSpecificFact(Platforms.Linux)]
+    public void NativeLibraryShouldLoad_Linux()
+    {
+        var openResult = new LinuxLibraryLoader().OpenLibrary(GetLibraryPath("so"));
+        Assert.True(openResult.IsSuccess);
+    }
+
+    [PlatformSpecificFact(Platforms.MacOS)]
+    public void NativeLibraryShouldLoad_MacOS()
+    {
+        var openResult = new MacOsLibraryLoader().OpenLibrary(GetLibraryPath("dylib"));
+        Assert.True(openResult.IsSuccess);
+    }
+
+    // Builds the path of libllmodel.<extension> inside the current working directory.
+    private static string GetLibraryPath(string extension)
+    {
+        var fileName = string.Format(LLModelLib, extension);
+        return Path.Combine(Environment.CurrentDirectory, fileName);
+    }
+}
--- a/gpt4all-bindings/csharp/Gpt4All.Tests/PlatformSpecificFactAttribute.cs
+++ b/gpt4all-bindings/csharp/Gpt4All.Tests/PlatformSpecificFactAttribute.cs
@ -0,0 +1,27 @@
+using Xunit;
+
+namespace Gpt4All.Tests;
+
+/// <summary>
+/// Platform name constants passed to <see cref="PlatformSpecificFactAttribute"/>.
+/// </summary>
+/// <remarks>
+/// The attribute forwards the chosen value to <see cref="OperatingSystem.IsOSPlatform(string)"/>.
+/// </remarks>
+public static class Platforms
+{
+    public const string Windows = "windows";
+    public const string Linux = "linux";
+    public const string MacOS = "macOS";
+}
+
+/// <summary>
+/// A <see cref="FactAttribute"/> that marks the test as skipped unless the current
+/// operating system matches the requested platform.
+/// </summary>
+/// <remarks>
+/// The platform string is handed to <see cref="OperatingSystem.IsOSPlatform(string)"/>;
+/// see that method for info about the platform string.
+/// </remarks>
+public class PlatformSpecificFactAttribute : FactAttribute
+{
+    /// <param name="platform">Name of the only platform the test should run on.</param>
+    public PlatformSpecificFactAttribute(string platform)
+    {
+        var runsOnThisPlatform = OperatingSystem.IsOSPlatform(platform);
+        if (runsOnThisPlatform) return;
+
+        // Setting Skip makes xUnit report the test as skipped with this reason.
+        Skip = $"Test only runs on {platform}.";
+    }
+}
--- a/gpt4all-bindings/csharp/Gpt4All.Tests/Traits.cs
+++ b/gpt4all-bindings/csharp/Gpt4All.Tests/Traits.cs
@ -0,0 +1,6 @@
+namespace Gpt4All.Tests;
+
+/// <summary>
+/// Names of test traits shared by the test project.
+/// </summary>
+public static class Traits
+{
+    // NOTE(review): presumably the trait name used to filter tests out of CI runs;
+    // confirm against the test attributes and the CI test filter.
+    public const string SkipOnCI = "SKIP_ON_CI";
+}
--- a/gpt4all-bindings/csharp/Gpt4All.sln
+++ b/gpt4all-bindings/csharp/Gpt4All.sln
@ -0,0 +1,47 @@
+
+Microsoft Visual Studio Solution File, Format Version 12.00
+# Visual Studio Version 17
+VisualStudioVersion = 17.5.33516.290
+MinimumVisualStudioVersion = 10.0.40219.1
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Gpt4All.Samples", "Gpt4All.Samples\Gpt4All.Samples.csproj", "{59864AE8-E45D-42F7-A7C0-1308EF185F39}"
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{DA396C11-CEAD-4368-8234-FB12255A30D2}"
+	ProjectSection(SolutionItems) = preProject
+		.gitignore = .gitignore
+		build_linux.sh = build_linux.sh
+		build_win-mingw.ps1 = build_win-mingw.ps1
+		build_win-msvc.ps1 = build_win-msvc.ps1
+		docs\gpt4all_csharp.md = docs\gpt4all_csharp.md
+		README.md = README.md
+	EndProjectSection
+EndProject
+Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Gpt4All", "Gpt4All\Gpt4All.csproj", "{6015C62B-2008-426B-A334-740D6F1FE38B}"
+EndProject
+Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Gpt4All.Tests", "Gpt4All.Tests\Gpt4All.Tests.csproj", "{33A72341-52C1-4EAE-878B-A98BC77F686A}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Debug|Any CPU = Debug|Any CPU
+		Release|Any CPU = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{59864AE8-E45D-42F7-A7C0-1308EF185F39}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{59864AE8-E45D-42F7-A7C0-1308EF185F39}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{59864AE8-E45D-42F7-A7C0-1308EF185F39}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{59864AE8-E45D-42F7-A7C0-1308EF185F39}.Release|Any CPU.Build.0 = Release|Any CPU
+		{6015C62B-2008-426B-A334-740D6F1FE38B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{6015C62B-2008-426B-A334-740D6F1FE38B}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{6015C62B-2008-426B-A334-740D6F1FE38B}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{6015C62B-2008-426B-A334-740D6F1FE38B}.Release|Any CPU.Build.0 = Release|Any CPU
+		{33A72341-52C1-4EAE-878B-A98BC77F686A}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
+		{33A72341-52C1-4EAE-878B-A98BC77F686A}.Debug|Any CPU.Build.0 = Debug|Any CPU
+		{33A72341-52C1-4EAE-878B-A98BC77F686A}.Release|Any CPU.ActiveCfg = Release|Any CPU
+		{33A72341-52C1-4EAE-878B-A98BC77F686A}.Release|Any CPU.Build.0 = Release|Any CPU
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+	GlobalSection(ExtensibilityGlobals) = postSolution
+		SolutionGuid = {17632027-F4C2-4903-B88F-310CE3DE386B}
+	EndGlobalSection
+EndGlobal
--- a/gpt4all-bindings/csharp/Gpt4All/Bindings/ILLModel.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Bindings/ILLModel.cs
@ -0,0 +1,29 @@
+namespace Gpt4All.Bindings;
+
+/// <summary>
+/// Represents the interface exposed by the universal wrapper for GPT4All language models built around llmodel C-API.
+/// </summary>
+public interface ILLModel : IDisposable
+{
+    /// <summary>Gets the size, in bytes, of the internal state of the model.</summary>
+    ulong GetStateSizeBytes();
+
+    /// <summary>Gets the number of threads used by the model.</summary>
+    int GetThreadCount();
+
+    /// <summary>Sets the number of threads to be used by the model.</summary>
+    /// <param name="threadCount">The new thread count</param>
+    void SetThreadCount(int threadCount);
+
+    /// <summary>Checks whether the model is loaded.</summary>
+    bool IsLoaded();
+
+    /// <summary>Loads the model from a file.</summary>
+    /// <param name="modelPath">The path to the model file.</param>
+    /// <returns>true if the model was loaded successfully, false otherwise.</returns>
+    bool Load(string modelPath);
+
+    /// <summary>Generates a response for the given prompt text.</summary>
+    /// <param name="text">The input prompt</param>
+    /// <param name="context">The prompt context (sampling parameters and token state)</param>
+    /// <param name="promptCallback">Optional callback invoked per prompt token; return false to stop</param>
+    /// <param name="responseCallback">Optional callback invoked per generated token; return false to stop</param>
+    /// <param name="recalculateCallback">Optional callback invoked when the context is recalculated; return false to stop</param>
+    /// <param name="cancellationToken">Token that makes the callbacks report "stop" once cancellation is requested</param>
+    void Prompt(
+        string text,
+        LLModelPromptContext context,
+        Func<ModelPromptEventArgs, bool>? promptCallback = null,
+        Func<ModelResponseEventArgs, bool>? responseCallback = null,
+        Func<ModelRecalculatingEventArgs, bool>? recalculateCallback = null,
+        CancellationToken cancellationToken = default);
+
+    /// <summary>Restores the internal state of the model from the given buffer.</summary>
+    /// <param name="destination">
+    /// Pointer to the buffer holding the saved state. Despite the name, the implementation in
+    /// this project forwards it as the native call's <c>src</c> argument, i.e. it is read from.
+    /// </param>
+    unsafe ulong RestoreStateData(byte* destination);
+
+    /// <summary>Saves the internal state of the model into the given buffer.</summary>
+    /// <param name="source">
+    /// Pointer to the buffer receiving the state. Despite the name, the implementation in
+    /// this project forwards it as the native call's <c>dest</c> argument, i.e. it is written to.
+    /// </param>
+    unsafe ulong SaveStateData(byte* source);
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Bindings/LLModel.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Bindings/LLModel.cs
@ -0,0 +1,212 @@
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Logging.Abstractions;
+
+namespace Gpt4All.Bindings;
+
+/// <summary>
+/// Data handed to the response processing callback for each generated token.
+/// </summary>
+/// <param name="TokenId">The token id of the response</param>
+/// <param name="Response">The response string. NOTE: a token id of -1 indicates the string is an error string</param>
+/// <remarks>
+/// The callback receiving this value returns a bool indicating whether the model should keep generating.
+/// </remarks>
+public record ModelResponseEventArgs(int TokenId, string Response)
+{
+    /// <summary>True when <see cref="TokenId"/> is the error sentinel (-1).</summary>
+    public bool IsError
+    {
+        get { return TokenId == -1; }
+    }
+}
+
+/// <summary>
+/// Data handed to the prompt processing callback for each prompt token.
+/// </summary>
+/// <param name="TokenId">The token id of the prompt</param>
+/// <remarks>
+/// The callback receiving this value returns a bool indicating whether the model should keep processing.
+/// </remarks>
+public record ModelPromptEventArgs(int TokenId);
+
+/// <summary>
+/// Arguments for the recalculating callback
+/// </summary>
+/// <param name="IsRecalculating">Whether the model is recalculating the context.</param>
+/// <remarks>
+/// The callback receiving this value returns a bool indicating whether the model should keep generating.
+/// </remarks>
+public record ModelRecalculatingEventArgs(bool IsRecalculating);
+
+/// <summary>
+/// Base class and universal wrapper for GPT4All language models built around llmodel C-API.
+/// </summary>
+/// <remarks>
+/// The instance destroys the native model handle when disposed. After disposal every other
+/// member throws <see cref="ObjectDisposedException"/> rather than passing the destroyed
+/// handle to native code.
+/// </remarks>
+public class LLModel : ILLModel
+{
+    // Values forwarded to llmodel_loadModel as its n_ctx and ngl arguments.
+    private const int LoadContextSize = 2048;
+    private const int LoadNgl = 100;
+
+    protected readonly IntPtr _handle;
+    private readonly ILogger _logger;
+    private bool _disposed;
+
+    internal LLModel(IntPtr handle, ILogger? logger = null)
+    {
+        _handle = handle;
+        _logger = logger ?? NullLogger.Instance;
+    }
+
+    /// <summary>
+    /// Create a new model from a pointer
+    /// </summary>
+    /// <param name="handle">Pointer to underlying model</param>
+    /// <param name="logger">Optional logger; a no-op logger is used when omitted</param>
+    public static LLModel Create(IntPtr handle, ILogger? logger = null)
+    {
+        return new LLModel(handle, logger: logger);
+    }
+
+    /// <summary>
+    /// Generate a response using the model
+    /// </summary>
+    /// <param name="text">The input prompt</param>
+    /// <param name="context">The context</param>
+    /// <param name="promptCallback">A callback function for handling the processing of prompt</param>
+    /// <param name="responseCallback">A callback function for handling the generated response</param>
+    /// <param name="recalculateCallback">A callback function for handling recalculation requests</param>
+    /// <param name="cancellationToken">
+    /// When cancellation is requested, every native callback returns false so the model stops.
+    /// </param>
+    /// <exception cref="ArgumentNullException"><paramref name="text"/> or <paramref name="context"/> is null.</exception>
+    /// <exception cref="ObjectDisposedException">The model has been disposed.</exception>
+    public void Prompt(
+        string text,
+        LLModelPromptContext context,
+        Func<ModelPromptEventArgs, bool>? promptCallback = null,
+        Func<ModelResponseEventArgs, bool>? responseCallback = null,
+        Func<ModelRecalculatingEventArgs, bool>? recalculateCallback = null,
+        CancellationToken cancellationToken = default)
+    {
+        ArgumentNullException.ThrowIfNull(text);
+        ArgumentNullException.ThrowIfNull(context);
+        ThrowIfDisposed();
+
+        _logger.LogInformation("Prompt input='{Prompt}' ctx={Context}", text, context.Dump());
+
+        // The delegates built from these lambdas are passed directly as call arguments, so the
+        // runtime keeps them alive for the duration of the native call. The user callbacks and
+        // the token are captured by the lambdas; no GC.KeepAlive is required.
+        NativeMethods.llmodel_prompt(
+            _handle,
+            text,
+            (tokenId) =>
+            {
+                if (cancellationToken.IsCancellationRequested) return false;
+                if (promptCallback == null) return true;
+                var args = new ModelPromptEventArgs(tokenId);
+                return promptCallback(args);
+            },
+            (tokenId, response) =>
+            {
+                if (cancellationToken.IsCancellationRequested)
+                {
+                    _logger.LogDebug("ResponseCallback evt=CancellationRequested");
+                    return false;
+                }
+
+                if (responseCallback == null) return true;
+                var args = new ModelResponseEventArgs(tokenId, response);
+                return responseCallback(args);
+            },
+            (isRecalculating) =>
+            {
+                if (cancellationToken.IsCancellationRequested) return false;
+                if (recalculateCallback == null) return true;
+                var args = new ModelRecalculatingEventArgs(isRecalculating);
+                return recalculateCallback(args);
+            },
+            ref context.UnderlyingContext
+        );
+    }
+
+    /// <summary>
+    /// Set the number of threads to be used by the model.
+    /// </summary>
+    /// <param name="threadCount">The new thread count</param>
+    public void SetThreadCount(int threadCount)
+    {
+        ThrowIfDisposed();
+        NativeMethods.llmodel_setThreadCount(_handle, threadCount);
+    }
+
+    /// <summary>
+    /// Get the number of threads used by the model.
+    /// </summary>
+    /// <returns>the number of threads used by the model</returns>
+    public int GetThreadCount()
+    {
+        ThrowIfDisposed();
+        return NativeMethods.llmodel_threadCount(_handle);
+    }
+
+    /// <summary>
+    /// Get the size of the internal state of the model.
+    /// </summary>
+    /// <remarks>
+    /// This state data is specific to the type of model you have created.
+    /// </remarks>
+    /// <returns>the size in bytes of the internal state of the model</returns>
+    public ulong GetStateSizeBytes()
+    {
+        ThrowIfDisposed();
+        return NativeMethods.llmodel_get_state_size(_handle);
+    }
+
+    /// <summary>
+    /// Saves the internal state of the model to the specified address.
+    /// </summary>
+    /// <param name="source">
+    /// Pointer to the buffer that receives the state. The parameter name is historical: the
+    /// pointer is forwarded as the native function's <c>dest</c> argument.
+    /// </param>
+    /// <returns>The number of bytes copied</returns>
+    public unsafe ulong SaveStateData(byte* source)
+    {
+        ThrowIfDisposed();
+        return NativeMethods.llmodel_save_state_data(_handle, source);
+    }
+
+    /// <summary>
+    /// Restores the internal state of the model using data from the specified address.
+    /// </summary>
+    /// <param name="destination">
+    /// Pointer to the buffer holding previously saved state. The parameter name is historical:
+    /// the pointer is forwarded as the native function's <c>src</c> argument.
+    /// </param>
+    /// <returns>the number of bytes read</returns>
+    public unsafe ulong RestoreStateData(byte* destination)
+    {
+        ThrowIfDisposed();
+        return NativeMethods.llmodel_restore_state_data(_handle, destination);
+    }
+
+    /// <summary>
+    /// Check if the model is loaded.
+    /// </summary>
+    /// <returns>true if the model was loaded successfully, false otherwise.</returns>
+    public bool IsLoaded()
+    {
+        ThrowIfDisposed();
+        return NativeMethods.llmodel_isModelLoaded(_handle);
+    }
+
+    /// <summary>
+    /// Load the model from a file.
+    /// </summary>
+    /// <param name="modelPath">The path to the model file.</param>
+    /// <returns>true if the model was loaded successfully, false otherwise.</returns>
+    public bool Load(string modelPath)
+    {
+        ArgumentNullException.ThrowIfNull(modelPath);
+        ThrowIfDisposed();
+        return NativeMethods.llmodel_loadModel(_handle, modelPath, LoadContextSize, LoadNgl);
+    }
+
+    /// <summary>
+    /// Destroys the native model behind <see cref="_handle"/>.
+    /// </summary>
+    protected void Destroy()
+    {
+        NativeMethods.llmodel_model_destroy(_handle);
+    }
+
+    /// <summary>
+    /// Releases the native model exactly once; later calls are no-ops.
+    /// </summary>
+    protected virtual void Dispose(bool disposing)
+    {
+        if (_disposed) return;
+
+        if (disposing)
+        {
+            // dispose managed state
+        }
+
+        Destroy();
+
+        _disposed = true;
+    }
+
+    public void Dispose()
+    {
+        Dispose(disposing: true);
+        GC.SuppressFinalize(this);
+    }
+
+    // Guards every native call against use of a handle that Dispose already destroyed.
+    private void ThrowIfDisposed()
+    {
+        if (_disposed)
+        {
+            throw new ObjectDisposedException(GetType().FullName);
+        }
+    }
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Bindings/LLPromptContext.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Bindings/LLPromptContext.cs
@ -0,0 +1,147 @@
+namespace Gpt4All.Bindings;
+
+/// <summary>
+/// Wrapper around the llmodel_prompt_context structure for holding the prompt context.
+/// </summary>
+/// <remarks>
+/// The implementation takes care of all the memory handling of the raw logits pointer and the
+/// raw tokens pointer. Attempting to resize them or modify them in any way can lead to undefined behavior.
+/// </remarks>
+public unsafe class LLModelPromptContext
+{
+    private llmodel_prompt_context _ctx;
+
+    // Returned by reference so callers can pass the backing struct itself (not a copy) to native code.
+    internal ref llmodel_prompt_context UnderlyingContext => ref _ctx;
+
+    /// <summary>
+    /// Creates a context whose underlying structure is default-initialized.
+    /// </summary>
+    public LLModelPromptContext()
+    {
+        _ctx = new();
+    }
+
+    /// <summary>
+    /// Logits of the current context, viewed over the raw logits pointer.
+    /// </summary>
+    /// <remarks>The span length is <see cref="LogitsSize"/> cast to <c>int</c>.</remarks>
+    public Span<float> Logits => new(_ctx.logits, (int)_ctx.logits_size);
+
+    /// <summary>
+    /// The size of the raw logits vector
+    /// </summary>
+    public nuint LogitsSize
+    {
+        get => _ctx.logits_size;
+        set => _ctx.logits_size = value;
+    }
+
+    /// <summary>
+    /// Current tokens in the context window, viewed over the raw tokens pointer.
+    /// </summary>
+    /// <remarks>The span length is <see cref="TokensSize"/> cast to <c>int</c>.</remarks>
+    public Span<int> Tokens => new(_ctx.tokens, (int)_ctx.tokens_size);
+
+    /// <summary>
+    /// The size of the raw tokens vector
+    /// </summary>
+    public nuint TokensSize
+    {
+        get => _ctx.tokens_size;
+        set => _ctx.tokens_size = value;
+    }
+
+    /// <summary>
+    /// Top k logits to sample from
+    /// </summary>
+    public int TopK
+    {
+        get => _ctx.top_k;
+        set => _ctx.top_k = value;
+    }
+
+    /// <summary>
+    /// Nucleus sampling probability threshold
+    /// </summary>
+    public float TopP
+    {
+        get => _ctx.top_p;
+        set => _ctx.top_p = value;
+    }
+
+    /// <summary>
+    /// Min p sampling probability threshold
+    /// </summary>
+    public float MinP
+    {
+        get => _ctx.min_p;
+        set => _ctx.min_p = value;
+    }
+
+    /// <summary>
+    /// Temperature to adjust model's output distribution
+    /// </summary>
+    public float Temperature
+    {
+        get => _ctx.temp;
+        set => _ctx.temp = value;
+    }
+
+    /// <summary>
+    /// Number of tokens in past conversation
+    /// </summary>
+    public int PastNum
+    {
+        get => _ctx.n_past;
+        set => _ctx.n_past = value;
+    }
+
+    /// <summary>
+    /// Number of predictions to generate in parallel
+    /// </summary>
+    public int Batches
+    {
+        get => _ctx.n_batch;
+        set => _ctx.n_batch = value;
+    }
+
+    /// <summary>
+    /// Number of tokens to predict
+    /// </summary>
+    public int TokensToPredict
+    {
+        get => _ctx.n_predict;
+        set => _ctx.n_predict = value;
+    }
+
+    /// <summary>
+    /// Penalty factor for repeated tokens
+    /// </summary>
+    public float RepeatPenalty
+    {
+        get => _ctx.repeat_penalty;
+        set => _ctx.repeat_penalty = value;
+    }
+
+    /// <summary>
+    /// Last n tokens to penalize
+    /// </summary>
+    public int RepeatLastN
+    {
+        get => _ctx.repeat_last_n;
+        set => _ctx.repeat_last_n = value;
+    }
+
+    /// <summary>
+    /// Number of tokens possible in context window
+    /// </summary>
+    public int ContextSize
+    {
+        get => _ctx.n_ctx;
+        set => _ctx.n_ctx = value;
+    }
+
+    /// <summary>
+    /// Percent of context to erase if we exceed the context window
+    /// </summary>
+    public float ContextErase
+    {
+        get => _ctx.context_erase;
+        set => _ctx.context_erase = value;
+    }
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Bindings/NativeMethods.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Bindings/NativeMethods.cs
@ -0,0 +1,112 @@
+using System.Runtime.InteropServices;
+
+namespace Gpt4All.Bindings;
+
+// Managed mirror of the native llmodel_prompt_context structure that is passed by reference
+// to llmodel_prompt. Field order and types define the layout shared with native code, so
+// fields must not be reordered or retyped.
+public unsafe partial struct llmodel_prompt_context
+{
+    // logits of current context
+    public float* logits;
+
+    // the size of the raw logits vector
+    [NativeTypeName("size_t")]
+    public nuint logits_size;
+
+    // current tokens in the context window
+    [NativeTypeName("int32_t *")]
+    public int* tokens;
+
+    // the size of the raw tokens vector
+    [NativeTypeName("size_t")]
+    public nuint tokens_size;
+
+    // number of tokens in past conversation
+    [NativeTypeName("int32_t")]
+    public int n_past;
+
+    // number of tokens possible in context window
+    [NativeTypeName("int32_t")]
+    public int n_ctx;
+
+    // number of tokens to predict
+    [NativeTypeName("int32_t")]
+    public int n_predict;
+
+    // top k logits to sample from
+    [NativeTypeName("int32_t")]
+    public int top_k;
+
+    // nucleus sampling probability threshold
+    public float top_p;
+
+    // min p sampling probability threshold
+    public float min_p;
+
+    // temperature to adjust model's output distribution
+    public float temp;
+
+    // number of predictions to generate in parallel
+    [NativeTypeName("int32_t")]
+    public int n_batch;
+
+    // penalty factor for repeated tokens
+    public float repeat_penalty;
+
+    // last n tokens to penalize
+    [NativeTypeName("int32_t")]
+    public int repeat_last_n;
+
+    // percent of context to erase if we exceed the context window
+    public float context_erase;
+}
+#pragma warning disable CA2101
+/// <summary>
+/// P/Invoke declarations for the llmodel C API exported by the native "libllmodel" library.
+/// </summary>
+/// <remarks>
+/// All entry points use the cdecl calling convention. Every C <c>bool</c> crossing the boundary,
+/// whether a return value or a parameter, is marshalled as a single byte
+/// (<see cref="UnmanagedType.I1"/>), because the runtime's default for <c>bool</c> is a 4-byte BOOL.
+/// </remarks>
+internal static unsafe partial class NativeMethods
+{
+    /// <summary>Callback invoked per generated token; return false to stop generation.</summary>
+    [UnmanagedFunctionPointer(CallingConvention.Cdecl)]
+    [return: MarshalAs(UnmanagedType.I1)]
+    public delegate bool LlmodelResponseCallback(int token_id, [MarshalAs(UnmanagedType.LPUTF8Str)] string response);
+
+    /// <summary>Callback invoked per prompt token; return false to stop processing.</summary>
+    [UnmanagedFunctionPointer(CallingConvention.Cdecl)]
+    [return: MarshalAs(UnmanagedType.I1)]
+    public delegate bool LlmodelPromptCallback(int token_id);
+
+    /// <summary>Callback invoked when the context is recalculated; return false to stop.</summary>
+    /// <remarks>
+    /// The parameter is explicitly marshalled as one byte: without it the runtime would read a
+    /// 4-byte BOOL while the native caller only passes a 1-byte bool.
+    /// </remarks>
+    [UnmanagedFunctionPointer(CallingConvention.Cdecl)]
+    [return: MarshalAs(UnmanagedType.I1)]
+    public delegate bool LlmodelRecalculateCallback([MarshalAs(UnmanagedType.I1)] bool isRecalculating);
+
+    [DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true, BestFitMapping = false, ThrowOnUnmappableChar = true)]
+    [return: NativeTypeName("llmodel_model")]
+    public static extern IntPtr llmodel_model_create2(
+        [NativeTypeName("const char *")][MarshalAs(UnmanagedType.LPUTF8Str)] string model_path,
+        [NativeTypeName("const char *")][MarshalAs(UnmanagedType.LPUTF8Str)] string build_variant,
+        out IntPtr error);
+
+    [DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]
+    public static extern void llmodel_model_destroy([NativeTypeName("llmodel_model")] IntPtr model);
+
+    [DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true, BestFitMapping = false, ThrowOnUnmappableChar = true)]
+    [return: MarshalAs(UnmanagedType.I1)]
+    public static extern bool llmodel_loadModel(
+        [NativeTypeName("llmodel_model")] IntPtr model,
+        [NativeTypeName("const char *")][MarshalAs(UnmanagedType.LPUTF8Str)] string model_path,
+        [NativeTypeName("int32_t")] int n_ctx,
+        [NativeTypeName("int32_t")] int ngl);
+
+    [DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]
+    [return: MarshalAs(UnmanagedType.I1)]
+    public static extern bool llmodel_isModelLoaded([NativeTypeName("llmodel_model")] IntPtr model);
+
+    [DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]
+    [return: NativeTypeName("uint64_t")]
+    public static extern ulong llmodel_get_state_size([NativeTypeName("llmodel_model")] IntPtr model);
+
+    // Writes the model state into dest and returns a uint64_t count.
+    [DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]
+    [return: NativeTypeName("uint64_t")]
+    public static extern ulong llmodel_save_state_data([NativeTypeName("llmodel_model")] IntPtr model, [NativeTypeName("uint8_t *")] byte* dest);
+
+    // Reads the model state from src and returns a uint64_t count.
+    [DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]
+    [return: NativeTypeName("uint64_t")]
+    public static extern ulong llmodel_restore_state_data([NativeTypeName("llmodel_model")] IntPtr model, [NativeTypeName("const uint8_t *")] byte* src);
+
+    [DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true, BestFitMapping = false, ThrowOnUnmappableChar = true)]
+    public static extern void llmodel_prompt(
+        [NativeTypeName("llmodel_model")] IntPtr model,
+        [NativeTypeName("const char *")][MarshalAs(UnmanagedType.LPUTF8Str)] string prompt,
+        LlmodelPromptCallback prompt_callback,
+        LlmodelResponseCallback response_callback,
+        LlmodelRecalculateCallback recalculate_callback,
+        ref llmodel_prompt_context ctx);
+
+    [DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]
+    public static extern void llmodel_setThreadCount([NativeTypeName("llmodel_model")] IntPtr model, [NativeTypeName("int32_t")] int n_threads);
+
+    [DllImport("libllmodel", CallingConvention = CallingConvention.Cdecl, ExactSpelling = true)]
+    [return: NativeTypeName("int32_t")]
+    public static extern int llmodel_threadCount([NativeTypeName("llmodel_model")] IntPtr model);
+}
+#pragma warning restore CA2101
--- a/gpt4all-bindings/csharp/Gpt4All/Bindings/NativeTypeNameAttribute.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Bindings/NativeTypeNameAttribute.cs
@ -0,0 +1,21 @@
+using System.Diagnostics;
+
+namespace Gpt4All.Bindings;
+
+/// <summary>Records the type a member had in the original native signature.</summary>
+[AttributeUsage(AttributeTargets.Struct | AttributeTargets.Enum | AttributeTargets.Property | AttributeTargets.Field | AttributeTargets.Parameter | AttributeTargets.ReturnValue, AllowMultiple = false, Inherited = true)]
+[Conditional("DEBUG")]
+internal sealed partial class NativeTypeNameAttribute : Attribute
+{
+    private readonly string _name;
+
+    /// <summary>Creates the attribute for the given native type name.</summary>
+    /// <param name="name">The name of the type that was used in the native signature.</param>
+    public NativeTypeNameAttribute(string name) => _name = name;
+
+    /// <summary>The name of the type that was used in the native signature.</summary>
+    public string Name
+    {
+        get { return _name; }
+    }
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Extensions/LLPromptContextExtensions.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Extensions/LLPromptContextExtensions.cs
@ -0,0 +1,27 @@
+using Gpt4All.Bindings;
+
+namespace Gpt4All;
+
+/// <summary>
+/// Diagnostic helpers for <see cref="LLModelPromptContext"/>.
+/// </summary>
+internal static class LLPromptContextExtensions
+{
+    /// <summary>
+    /// Renders the scalar fields of the underlying native context as a multi-line string.
+    /// </summary>
+    /// <param name="context">The prompt context to describe</param>
+    /// <returns>A brace-delimited listing with one "name = value" entry per line</returns>
+    public static string Dump(this LLModelPromptContext context)
+    {
+        // Assigning the ref-returning property to a local copies the struct; the dump only reads it.
+        var ctx = context.UnderlyingContext;
+        return @$"
+        {{
+            logits_size = {ctx.logits_size}
+            tokens_size = {ctx.tokens_size}
+            n_past = {ctx.n_past}
+            n_ctx = {ctx.n_ctx}
+            n_predict = {ctx.n_predict}
+            top_k = {ctx.top_k}
+            top_p = {ctx.top_p}
+            min_p = {ctx.min_p}
+            temp = {ctx.temp}
+            n_batch = {ctx.n_batch}
+            repeat_penalty = {ctx.repeat_penalty}
+            repeat_last_n = {ctx.repeat_last_n}
+            context_erase = {ctx.context_erase}
+        }}";
+    }
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Extensions/PredictRequestOptionsExtensions.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Extensions/PredictRequestOptionsExtensions.cs
@ -0,0 +1,26 @@
+using Gpt4All.Bindings;
+
+namespace Gpt4All;
+
+/// <summary>
+/// Conversions from the public request options to the binding-level prompt context.
+/// </summary>
+public static class PredictRequestOptionsExtensions
+{
+    /// <summary>
+    /// Builds a new <see cref="LLModelPromptContext"/> populated from the given options.
+    /// </summary>
+    /// <param name="opts">The request options to copy values from</param>
+    /// <returns>A freshly created prompt context carrying the option values</returns>
+    public static LLModelPromptContext ToPromptContext(this PredictRequestOptions opts)
+    {
+        var promptContext = new LLModelPromptContext();
+
+        promptContext.LogitsSize = opts.LogitsSize;
+        promptContext.TokensSize = opts.TokensSize;
+        promptContext.TopK = opts.TopK;
+        promptContext.TopP = opts.TopP;
+        promptContext.MinP = opts.MinP;
+        promptContext.PastNum = opts.PastConversationTokensNum;
+        promptContext.RepeatPenalty = opts.RepeatPenalty;
+        promptContext.Temperature = opts.Temperature;
+        promptContext.RepeatLastN = opts.RepeatLastN;
+        promptContext.Batches = opts.Batches;
+        promptContext.ContextErase = opts.ContextErase;
+        promptContext.ContextSize = opts.ContextSize;
+        promptContext.TokensToPredict = opts.TokensToPredict;
+
+        return promptContext;
+    }
+}
--- a/gpt4all-bindings/csharp/Gpt4All/GenLLModelBindings.rsp
+++ b/gpt4all-bindings/csharp/Gpt4All/GenLLModelBindings.rsp
@ -0,0 +1,21 @@
+--config
+exclude-funcs-with-body
+--with-access-specifier
+*=Public
+--include-directory
+..\..\..\gpt4all-backend\
+--file
+..\..\..\gpt4all-backend\llmodel_c.h
+--libraryPath
+libllmodel
+--remap
+sbyte*=IntPtr
+void*=IntPtr
+--namespace
+Gpt4All.Bindings
+--methodClassName
+NativeMethods
+--output
+.\Bindings\NativeMethods.cs
+--output-mode
+CSharp
--- a/gpt4all-bindings/csharp/Gpt4All/Gpt4All.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Gpt4All.cs
@ -0,0 +1,135 @@
+using System.Diagnostics;
+using System.Runtime.CompilerServices;
+using Gpt4All.Bindings;
+using Microsoft.Extensions.Logging;
+using Microsoft.Extensions.Logging.Abstractions;
+
+[assembly: InternalsVisibleTo("Gpt4All.Tests")]
+
+namespace Gpt4All;
+
+/// <summary>
+/// High-level model facade: formats prompts, runs them on the wrapped <see cref="ILLModel"/>
+/// on a background task and collects the generated text into a result object.
+/// </summary>
+public class Gpt4All : IGpt4AllModel
+{
+    private readonly ILLModel _model;
+    private readonly ILogger _logger;
+
+    private const string ResponseErrorMessage =
+        "The model reported an error during token generation error={ResponseError}";
+
+    /// <inheritdoc/>
+    public IPromptFormatter? PromptFormatter { get; set; }
+
+    internal Gpt4All(ILLModel model, ILogger? logger = null)
+    {
+        _model = model;
+        _logger = logger ?? NullLogger.Instance;
+        PromptFormatter = new DefaultPromptFormatter();
+    }
+
+    // Applies the configured formatter, or returns the prompt untouched when none is set.
+    private string FormatPrompt(string prompt)
+    {
+        if (PromptFormatter == null) return prompt;
+
+        return PromptFormatter.FormatPrompt(prompt);
+    }
+
+    /// <summary>
+    /// Runs the prompt to completion on a background task and returns the whole prediction.
+    /// </summary>
+    /// <param name="text">The prompt text; must not be null</param>
+    /// <param name="opts">Sampling and context options for this request</param>
+    /// <param name="cancellationToken">Stops token generation once cancellation is requested</param>
+    /// <returns>
+    /// The prediction result. When the model reports an error or the prompt call throws, the
+    /// result has <c>Success</c> set to false and <c>ErrorMessage</c> set to the reason.
+    /// </returns>
+    public Task<ITextPredictionResult> GetPredictionAsync(string text, PredictRequestOptions opts, CancellationToken cancellationToken = default)
+    {
+        ArgumentNullException.ThrowIfNull(text);
+
+        return Task.Run(() =>
+        {
+            _logger.LogInformation("Start prediction task");
+
+            var sw = Stopwatch.StartNew();
+            var result = new TextPredictionResult();
+            var context = opts.ToPromptContext();
+            var prompt = FormatPrompt(text);
+
+            try
+            {
+                _model.Prompt(prompt, context, responseCallback: e =>
+                {
+                    if (e.IsError)
+                    {
+                        _logger.LogWarning(ResponseErrorMessage, e.Response);
+                        result.Success = false;
+                        result.ErrorMessage = e.Response;
+                        return false;
+                    }
+                    result.Append(e.Response);
+                    return true;
+                }, cancellationToken: cancellationToken);
+            }
+            catch (Exception e)
+            {
+                _logger.LogError(e, "Prompt error");
+                result.Success = false;
+                // Surface the failure reason to the caller instead of only logging it.
+                result.ErrorMessage = e.Message;
+            }
+
+            sw.Stop();
+            _logger.LogInformation("Prediction task completed elapsed={Elapsed}s", sw.Elapsed.TotalSeconds);
+
+            return (ITextPredictionResult)result;
+        }, CancellationToken.None);
+    }
+
+    /// <summary>
+    /// Starts the prompt on a background task and immediately returns a streaming result that
+    /// is appended to as tokens are generated and completed when generation ends.
+    /// </summary>
+    /// <param name="text">The prompt text; must not be null</param>
+    /// <param name="opts">Sampling and context options for this request</param>
+    /// <param name="cancellationToken">Stops token generation once cancellation is requested</param>
+    /// <returns>
+    /// The streaming result. When the model reports an error or the prompt call throws, the
+    /// result has <c>Success</c> set to false and <c>ErrorMessage</c> set to the reason.
+    /// </returns>
+    public Task<ITextPredictionStreamingResult> GetStreamingPredictionAsync(string text, PredictRequestOptions opts, CancellationToken cancellationToken = default)
+    {
+        ArgumentNullException.ThrowIfNull(text);
+
+        var result = new TextPredictionStreamingResult();
+
+        _ = Task.Run(() =>
+        {
+            _logger.LogInformation("Start streaming prediction task");
+            var sw = Stopwatch.StartNew();
+
+            try
+            {
+                var context = opts.ToPromptContext();
+                var prompt = FormatPrompt(text);
+
+                _model.Prompt(prompt, context, responseCallback: e =>
+                {
+                    if (e.IsError)
+                    {
+                        _logger.LogWarning(ResponseErrorMessage, e.Response);
+                        result.Success = false;
+                        result.ErrorMessage = e.Response;
+                        return false;
+                    }
+                    result.Append(e.Response);
+                    return true;
+                }, cancellationToken: cancellationToken);
+            }
+            catch (Exception e)
+            {
+                _logger.LogError(e, "Prompt error");
+                result.Success = false;
+                // Surface the failure reason to the caller instead of only logging it.
+                result.ErrorMessage = e.Message;
+            }
+            finally
+            {
+                // Always complete the stream so consumers are not left waiting after a failure.
+                result.Complete();
+                sw.Stop();
+                _logger.LogInformation("Prediction task completed elapsed={Elapsed}s", sw.Elapsed.TotalSeconds);
+            }
+        }, CancellationToken.None);
+
+        return Task.FromResult((ITextPredictionStreamingResult)result);
+    }
+
+    /// <summary>
+    /// Disposes the wrapped model when called from <see cref="Dispose()"/>.
+    /// </summary>
+    protected virtual void Dispose(bool disposing)
+    {
+        if (disposing)
+        {
+            _model.Dispose();
+        }
+    }
+
+    public void Dispose()
+    {
+        Dispose(true);
+        GC.SuppressFinalize(this);
+    }
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Gpt4All.csproj
+++ b/gpt4all-bindings/csharp/Gpt4All/Gpt4All.csproj
@ -0,0 +1,23 @@
+<Project Sdk="Microsoft.NET.Sdk">
+    <PropertyGroup>
+        <ImplicitUsings>enable</ImplicitUsings>
+        <Nullable>enable</Nullable>
+        <AllowUnsafeBlocks>true</AllowUnsafeBlocks>
+        <GenerateDocumentationFile>true</GenerateDocumentationFile>
+        <TargetFramework>net8.0</TargetFramework>
+    </PropertyGroup>
+    <ItemGroup>
+        <!-- Windows -->
+        <None Include="..\runtimes\win-x64\native\*.dll" Pack="true" PackagePath="runtimes\win-x64\native\%(Filename)%(Extension)" />
+        <!-- Linux -->
+        <None Include="..\runtimes\linux-x64\native\*.so" Pack="true" PackagePath="runtimes\linux-x64\native\%(Filename)%(Extension)" />
+        <!-- MacOS -->
+        <None Include="..\runtimes\osx\native\*.dylib" Pack="true" PackagePath="runtimes\osx\native\%(Filename)%(Extension)" />
+        <Content Include="..\runtimes\osx\native\*.metal" Pack="true" PackagePath="contentFiles\any\any;content">
+            <PackageCopyToOutput>true</PackageCopyToOutput>
+        </Content>
+    </ItemGroup>
+    <ItemGroup>
+        <PackageReference Include="Microsoft.Extensions.Logging.Abstractions" Version="7.0.0" />
+    </ItemGroup>
+</Project>
--- a/gpt4all-bindings/csharp/Gpt4All/LibraryLoader/ILibraryLoader.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/LibraryLoader/ILibraryLoader.cs
@ -0,0 +1,6 @@
+namespace Gpt4All.LibraryLoader;
+
+/// <summary>
+/// Opens a native library on a specific platform.
+/// </summary>
+public interface ILibraryLoader
+{
+    /// <summary>Attempts to open the native library with the given file name.</summary>
+    /// <param name="fileName">Name or path of the library file; may be null</param>
+    /// <returns>A <see cref="LoadResult"/> describing success or the failure reason</returns>
+    LoadResult OpenLibrary(string? fileName);
+}
--- a/gpt4all-bindings/csharp/Gpt4All/LibraryLoader/LinuxLibraryLoader.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/LibraryLoader/LinuxLibraryLoader.cs
@ -0,0 +1,53 @@
+using System.Runtime.InteropServices;
+
+namespace Gpt4All.LibraryLoader;
+
+/// <summary>
+/// Opens native libraries on Linux through dlopen, preferring "libdl.so.2" and falling
+/// back to "libdl.so" when the former cannot be found.
+/// </summary>
+internal class LinuxLibraryLoader : ILibraryLoader
+{
+    // dlopen flag value for lazy binding (RTLD_LAZY).
+    private const int RtldLazy = 0x00001;
+
+#pragma warning disable CA2101
+    [DllImport("libdl.so", ExactSpelling = true, CharSet = CharSet.Auto, EntryPoint = "dlopen")]
+#pragma warning restore CA2101
+    public static extern IntPtr NativeOpenLibraryLibdl(string? filename, int flags);
+
+#pragma warning disable CA2101
+    [DllImport("libdl.so.2", ExactSpelling = true, CharSet = CharSet.Auto, EntryPoint = "dlopen")]
+#pragma warning restore CA2101
+    public static extern IntPtr NativeOpenLibraryLibdl2(string? filename, int flags);
+
+    [DllImport("libdl.so", ExactSpelling = true, CharSet = CharSet.Auto, EntryPoint = "dlerror")]
+    public static extern IntPtr GetLoadError();
+
+    [DllImport("libdl.so.2", ExactSpelling = true, CharSet = CharSet.Auto, EntryPoint = "dlerror")]
+    public static extern IntPtr GetLoadError2();
+
+    /// <summary>Opens the library and reports the dlerror text when that fails.</summary>
+    public LoadResult OpenLibrary(string? fileName)
+    {
+        var handle = OpenWithFallback(fileName);
+        if (handle != IntPtr.Zero)
+        {
+            return LoadResult.Success;
+        }
+
+        return LoadResult.Failure(ReadLastError());
+    }
+
+    // Calls dlopen from libdl.so.2, or from libdl.so when libdl.so.2 is not found.
+    private static IntPtr OpenWithFallback(string? fileName)
+    {
+        try
+        {
+            return NativeOpenLibraryLibdl2(fileName, RtldLazy);
+        }
+        catch (DllNotFoundException)
+        {
+            return NativeOpenLibraryLibdl(fileName, RtldLazy);
+        }
+    }
+
+    // Fetches the dlerror text using the same libdl.so.2 -> libdl.so fallback.
+    private static string ReadLastError()
+    {
+        IntPtr messagePtr;
+        try
+        {
+            messagePtr = GetLoadError2();
+        }
+        catch (DllNotFoundException)
+        {
+            messagePtr = GetLoadError();
+        }
+
+        return Marshal.PtrToStringAnsi(messagePtr) ?? "Unknown error";
+    }
+}
--- a/gpt4all-bindings/csharp/Gpt4All/LibraryLoader/LoadResult.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/LibraryLoader/LoadResult.cs
@ -0,0 +1,20 @@
+namespace Gpt4All.LibraryLoader;
+
+/// <summary>
+/// Outcome of an attempt to load a native library: either success, or a failure
+/// that carries an error message.
+/// </summary>
+public class LoadResult
+{
+    /// <summary>Shared instance for a successful load; its <see cref="ErrorMessage"/> is null.</summary>
+    public static LoadResult Success { get; } = new LoadResult(isSuccess: true, errorMessage: null);
+
+    /// <summary>True when the library was loaded.</summary>
+    public bool IsSuccess { get; }
+
+    /// <summary>Description of the failure, or null on success.</summary>
+    public string? ErrorMessage { get; }
+
+    private LoadResult(bool isSuccess, string? errorMessage)
+    {
+        ErrorMessage = errorMessage;
+        IsSuccess = isSuccess;
+    }
+
+    /// <summary>Creates a failed result carrying <paramref name="errorMessage"/>.</summary>
+    public static LoadResult Failure(string errorMessage) => new LoadResult(false, errorMessage);
+}
--- a/gpt4all-bindings/csharp/Gpt4All/LibraryLoader/MacOsLibraryLoader.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/LibraryLoader/MacOsLibraryLoader.cs
@ -0,0 +1,28 @@
+using System.Runtime.InteropServices;
+
+namespace Gpt4All.LibraryLoader;
+
+/// <summary>
+/// Loads native libraries on macOS through <c>dlopen</c> from <c>libdl.dylib</c>.
+/// </summary>
+internal class MacOsLibraryLoader : ILibraryLoader
+{
+    // dlopen mode handed to the native call (RTLD_LAZY).
+    private const int LazyBindingFlag = 0x00001;
+
+#pragma warning disable CA2101
+    [DllImport("libdl.dylib", ExactSpelling = true, CharSet = CharSet.Auto, EntryPoint = "dlopen")]
+#pragma warning restore CA2101
+    public static extern IntPtr NativeOpenLibraryLibdl(string? filename, int flags);
+
+    [DllImport("libdl.dylib", ExactSpelling = true, CharSet = CharSet.Auto, EntryPoint = "dlerror")]
+    public static extern IntPtr GetLoadError();
+
+    /// <summary>
+    /// Opens the library at <paramref name="fileName"/>.
+    /// </summary>
+    /// <returns>
+    /// <see cref="LoadResult.Success"/> when a non-null handle is returned, otherwise a failure
+    /// carrying the <c>dlerror</c> text (or "Unknown error" when none is available).
+    /// </returns>
+    public LoadResult OpenLibrary(string? fileName)
+    {
+        if (NativeOpenLibraryLibdl(fileName, LazyBindingFlag) != IntPtr.Zero)
+        {
+            return LoadResult.Success;
+        }
+
+        var reason = Marshal.PtrToStringAnsi(GetLoadError());
+        return LoadResult.Failure(reason ?? "Unknown error");
+    }
+}
--- a/gpt4all-bindings/csharp/Gpt4All/LibraryLoader/NativeLibraryLoader.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/LibraryLoader/NativeLibraryLoader.cs
@ -0,0 +1,81 @@
+#if !IOS && !MACCATALYST && !TVOS && !ANDROID
+using System.Runtime.InteropServices;
+#endif
+
+namespace Gpt4All.LibraryLoader;
+
+/// <summary>
+/// Locates and loads the native <c>libllmodel</c> library for the current platform.
+/// </summary>
+public static class NativeLibraryLoader
+{
+    private const string LibraryName = "libllmodel";
+
+    private static ILibraryLoader? defaultLibraryLoader;
+
+    /// <summary>
+    /// Sets the library loader used to load the native libraries. Overwrite this only if you want some custom loading.
+    /// </summary>
+    /// <param name="libraryLoader">The library loader to be used.</param>
+    public static void SetLibraryLoader(ILibraryLoader libraryLoader)
+    {
+        defaultLibraryLoader = libraryLoader;
+    }
+
+    /// <summary>
+    /// Loads the native library, either from <paramref name="path"/> or from a location discovered
+    /// next to the application.
+    /// </summary>
+    /// <param name="path">Explicit path of the native library; when null or empty the library is searched for.</param>
+    /// <param name="bypassLoading">When true nothing is loaded and success is returned immediately.</param>
+    /// <returns>The result reported by the library loader.</returns>
+    /// <exception cref="PlatformNotSupportedException">The OS or CPU architecture is not supported.</exception>
+    /// <exception cref="FileNotFoundException">No custom loader is set and the resolved path does not exist.</exception>
+    internal static LoadResult LoadNativeLibrary(string? path = default, bool bypassLoading = true)
+    {
+        // If the user has handled loading the library themselves, we don't need to do anything.
+        if (bypassLoading)
+        {
+            return LoadResult.Success;
+        }
+
+        var architecture = RuntimeInformation.OSArchitecture switch
+        {
+            Architecture.X64 => "x64",
+            Architecture.X86 => "x86",
+            Architecture.Arm => "arm",
+            Architecture.Arm64 => "arm64",
+            _ => throw new PlatformNotSupportedException(
+                $"Unsupported OS platform, architecture: {RuntimeInformation.OSArchitecture}")
+        };
+
+        string platform;
+        string extension;
+        if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows))
+        {
+            (platform, extension) = ("win", "dll");
+        }
+        else if (RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
+        {
+            (platform, extension) = ("linux", "so");
+        }
+        else if (RuntimeInformation.IsOSPlatform(OSPlatform.OSX))
+        {
+            (platform, extension) = ("osx", "dylib");
+        }
+        else
+        {
+            // Report the operating system here; the architecture was already validated above.
+            throw new PlatformNotSupportedException(
+                $"Unsupported OS platform: {RuntimeInformation.OSDescription}");
+        }
+
+        // If the user hasn't set the path, we'll try to find it ourselves.
+        if (string.IsNullOrEmpty(path))
+        {
+            var fileName = $"{LibraryName}.{extension}";
+            // Search for the library within the assembly search path. If it isn't found, use the default path.
+            path = FindLibrary(fileName)
+                   ?? Path.Combine("runtimes", $"{platform}-{architecture}", fileName);
+        }
+
+        if (defaultLibraryLoader != null)
+        {
+            return defaultLibraryLoader.OpenLibrary(path);
+        }
+
+        if (!File.Exists(path))
+        {
+            throw new FileNotFoundException($"Native Library not found in path {path}. " +
+                                            $"Verify you have included the native Gpt4All library in your application.");
+        }
+
+        ILibraryLoader libraryLoader = platform switch
+        {
+            "win" => new WindowsLibraryLoader(),
+            "osx" => new MacOsLibraryLoader(),
+            "linux" => new LinuxLibraryLoader(),
+            _ => throw new PlatformNotSupportedException($"Currently {platform} platform is not supported")
+        };
+        return libraryLoader.OpenLibrary(path);
+    }
+
+    // Returns the first file named fileName found under the first non-empty candidate directory, or null.
+    private static string? FindLibrary(string fileName)
+    {
+        var searchRoot = new[]
+        {
+            AppDomain.CurrentDomain.RelativeSearchPath,
+            Path.GetDirectoryName(typeof(NativeLibraryLoader).Assembly.Location),
+            Path.GetDirectoryName(Environment.GetCommandLineArgs()[0])
+        }.FirstOrDefault(it => !string.IsNullOrEmpty(it));
+
+        // Directory.EnumerateFiles throws on an empty or missing directory, so skip the search
+        // and let the caller fall back to the default runtimes path.
+        if (string.IsNullOrEmpty(searchRoot) || !Directory.Exists(searchRoot))
+        {
+            return null;
+        }
+
+        return Directory.EnumerateFiles(searchRoot, fileName, SearchOption.AllDirectories).FirstOrDefault();
+    }
+}
--- a/gpt4all-bindings/csharp/Gpt4All/LibraryLoader/WindowsLibraryLoader.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/LibraryLoader/WindowsLibraryLoader.cs
@ -0,0 +1,24 @@
+using System.ComponentModel;
+using System.Runtime.InteropServices;
+
+namespace Gpt4All.LibraryLoader;
+
+/// <summary>
+/// Loads native libraries on Windows through <c>LoadLibrary</c> from kernel32.
+/// </summary>
+internal class WindowsLibraryLoader : ILibraryLoader
+{
+    [DllImport("kernel32", SetLastError = true, CharSet = CharSet.Auto)]
+    private static extern IntPtr LoadLibrary([MarshalAs(UnmanagedType.LPWStr)] string? lpFileName);
+
+    /// <summary>
+    /// Opens the library at <paramref name="fileName"/>.
+    /// </summary>
+    /// <returns>
+    /// <see cref="LoadResult.Success"/> when a non-null handle is returned, otherwise a failure
+    /// carrying the message of the last Win32 error.
+    /// </returns>
+    public LoadResult OpenLibrary(string? fileName)
+    {
+        if (LoadLibrary(fileName) != IntPtr.Zero)
+        {
+            return LoadResult.Success;
+        }
+
+        // Read the error code right after the failed call, before anything else can replace it.
+        var failure = new Win32Exception(Marshal.GetLastWin32Error());
+        return LoadResult.Failure(failure.Message);
+    }
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Model/DefaultPromptFormatter.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Model/DefaultPromptFormatter.cs
@ -0,0 +1,16 @@
+namespace Gpt4All;
+
+/// <summary>
+/// Prompt formatter that wraps the user prompt in a fixed instruction / prompt / response template.
+/// </summary>
+public class DefaultPromptFormatter : IPromptFormatter
+{
+    /// <summary>
+    /// Embeds <paramref name="prompt"/> into the template below.
+    /// </summary>
+    /// <param name="prompt">The raw prompt text to embed.</param>
+    /// <returns>The template text with <paramref name="prompt"/> inserted under "### Prompt:".</returns>
+    public string FormatPrompt(string prompt)
+    {
+        // Interpolated raw string literal: the template lines are emitted as written.
+        return $"""
+        ### Instruction: 
+        The prompt below is a question to answer, a task to complete, or a conversation
+        to respond to; decide which and write an appropriate response.
+        ### Prompt:
+        {prompt}
+        ### Response:
+        """;
+    }
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Model/Gpt4AllModelFactory.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Model/Gpt4AllModelFactory.cs
@ -0,0 +1,62 @@
+using System.Diagnostics;
+using Microsoft.Extensions.Logging.Abstractions;
+using Microsoft.Extensions.Logging;
+using Gpt4All.Bindings;
+using Gpt4All.LibraryLoader;
+using System.Runtime.InteropServices;
+
+namespace Gpt4All;
+
+/// <summary>
+/// Factory that loads the native gpt4all library (once per process) and creates
+/// <see cref="IGpt4AllModel"/> instances from model files.
+/// </summary>
+public class Gpt4AllModelFactory : IGpt4AllModelFactory
+{
+    private readonly ILoggerFactory _loggerFactory;
+    private readonly ILogger _logger;
+    // Static inputs for the lazy native-library load below; overwritten by every constructor call.
+    private static bool bypassLoading;
+    private static string? libraryPath;
+
+    // Loads the native library on first access of Value, using whatever the static fields hold at that moment.
+    // NOTE(review): because the result is cached, arguments passed to later constructor calls have no effect — confirm this is intended.
+    private static readonly Lazy<LoadResult> libraryLoaded = new(() =>
+    {
+        return NativeLibraryLoader.LoadNativeLibrary(Gpt4AllModelFactory.libraryPath, Gpt4AllModelFactory.bypassLoading);
+    }, true);
+
+    /// <summary>
+    /// Creates the factory and makes sure the native library has been loaded.
+    /// </summary>
+    /// <param name="libraryPath">Path forwarded to <see cref="NativeLibraryLoader"/>; may be null.</param>
+    /// <param name="bypassLoading">Forwarded to <see cref="NativeLibraryLoader"/>; defaults to true.</param>
+    /// <param name="loggerFactory">Logger factory; a null logger factory is used when omitted.</param>
+    /// <exception cref="Exception">The native library load reported a failure.</exception>
+    public Gpt4AllModelFactory(string? libraryPath = default, bool bypassLoading = true, ILoggerFactory? loggerFactory = null)
+    {
+        _loggerFactory = loggerFactory ?? NullLoggerFactory.Instance;
+        _logger = _loggerFactory.CreateLogger<Gpt4AllModelFactory>();
+        Gpt4AllModelFactory.libraryPath = libraryPath;
+        Gpt4AllModelFactory.bypassLoading = bypassLoading;
+
+        if (!libraryLoaded.Value.IsSuccess)
+        {
+            throw new Exception($"Failed to load native gpt4all library. Error: {libraryLoaded.Value.ErrorMessage}");
+        }
+    }
+
+    // Creates the native model handle, loads the model file into it and wraps it in a Gpt4All instance.
+    // Throws when the native create call reports an error or when loading returns false.
+    private Gpt4All CreateModel(string modelPath)
+    {
+        _logger.LogInformation("Creating model path={ModelPath}", modelPath);
+        IntPtr error;
+        var handle = NativeMethods.llmodel_model_create2(modelPath, "auto", out error);
+        if (error != IntPtr.Zero)
+        {
+            throw new Exception(Marshal.PtrToStringAnsi(error));
+        }
+        _logger.LogDebug("Model created handle=0x{ModelHandle:X8}", handle);
+        _logger.LogInformation("Model loading started");
+        // NOTE(review): 2048 and 100 are hard-coded native load arguments; their meaning is not visible here — confirm against the bindings.
+        var loadedSuccessfully = NativeMethods.llmodel_loadModel(handle, modelPath, 2048, 100);
+        _logger.LogInformation("Model loading completed success={ModelLoadSuccess}", loadedSuccessfully);
+        if (!loadedSuccessfully)
+        {
+            // NOTE(review): the native handle created above is not released on this path — looks like a leak; confirm.
+            throw new Exception($"Failed to load model: '{modelPath}'");
+        }
+
+        var logger = _loggerFactory.CreateLogger<LLModel>();
+        var underlyingModel = LLModel.Create(handle, logger: logger);
+
+        Debug.Assert(underlyingModel.IsLoaded());
+
+        return new Gpt4All(underlyingModel, logger: logger);
+    }
+
+    /// <summary>
+    /// Loads the model stored at <paramref name="modelPath"/>.
+    /// </summary>
+    /// <param name="modelPath">Path of the model file.</param>
+    /// <returns>The loaded model.</returns>
+    public IGpt4AllModel LoadModel(string modelPath) => CreateModel(modelPath);
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Model/IGpt4AllModel.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Model/IGpt4AllModel.cs
@ -0,0 +1,10 @@
+namespace Gpt4All;
+
+/// <summary>
+/// A loaded model: a disposable text-prediction service with an optional prompt formatter.
+/// </summary>
+public interface IGpt4AllModel : ITextPrediction, IDisposable
+{
+    /// <summary>
+    /// The prompt formatter used to format the prompt before
+    /// feeding it to the model, if null no transformation is applied
+    /// </summary>
+    IPromptFormatter? PromptFormatter { get; set; }
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Model/IGpt4AllModelFactory.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Model/IGpt4AllModelFactory.cs
@ -0,0 +1,6 @@
+namespace Gpt4All;
+
+/// <summary>
+/// Creates <see cref="IGpt4AllModel"/> instances from model files.
+/// </summary>
+public interface IGpt4AllModelFactory
+{
+    /// <summary>
+    /// Loads the model stored at <paramref name="modelPath"/>.
+    /// </summary>
+    /// <param name="modelPath">Path of the model file.</param>
+    /// <returns>The loaded model.</returns>
+    IGpt4AllModel LoadModel(string modelPath);
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Model/IPromptFormatter.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Model/IPromptFormatter.cs
@ -0,0 +1,14 @@
+namespace Gpt4All;
+
+/// <summary>
+/// Transforms a raw prompt into the text that is handed to the model.
+/// </summary>
+public interface IPromptFormatter
+{
+    /// <summary>
+    /// Formats the provided prompt.
+    /// </summary>
+    /// <param name="prompt">The input prompt.</param>
+    /// <returns>The formatted prompt.</returns>
+    string FormatPrompt(string prompt);
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Model/ModelOptions.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Model/ModelOptions.cs
@ -0,0 +1,6 @@
+namespace Gpt4All;
+
+/// <summary>
+/// Init-only options for a model.
+/// </summary>
+public record ModelOptions
+{
+    /// <summary>
+    /// Thread count; defaults to 4.
+    /// NOTE(review): presumably the number of threads used by the native model — no consumer is visible here, confirm.
+    /// </summary>
+    public int Threads { get; init; } = 4;
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Prediction/ITextPrediction.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Prediction/ITextPrediction.cs
@ -0,0 +1,31 @@
+namespace Gpt4All;
+
+/// <summary>
+/// Interface for text prediction services
+/// </summary>
+public interface ITextPrediction
+{
+    /// <summary>
+    /// Get prediction results for the prompt and provided options.
+    /// </summary>
+    /// <param name="text">The text to complete</param>
+    /// <param name="opts">The prediction settings</param>
+    /// <param name="cancellation">The <see cref="CancellationToken"/> for cancellation requests. The default is <see cref="CancellationToken.None"/>.</param>
+    /// <returns>The prediction result generated by the model</returns>
+    Task<ITextPredictionResult> GetPredictionAsync(
+        string text,
+        PredictRequestOptions opts,
+        CancellationToken cancellation = default);
+
+    /// <summary>
+    /// Get streaming prediction results for the prompt and provided options.
+    /// </summary>
+    /// <param name="text">The text to complete</param>
+    /// <param name="opts">The prediction settings</param>
+    /// <param name="cancellationToken">The <see cref="CancellationToken"/> for cancellation requests. The default is <see cref="CancellationToken.None"/>.</param>
+    /// <returns>A streaming result from which the generated tokens can be read</returns>
+    Task<ITextPredictionStreamingResult> GetStreamingPredictionAsync(
+        string text,
+        PredictRequestOptions opts,
+        CancellationToken cancellationToken = default);
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Prediction/ITextPredictionResult.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Prediction/ITextPredictionResult.cs
@ -0,0 +1,10 @@
+namespace Gpt4All;
+
+/// <summary>
+/// Result of a prediction request.
+/// </summary>
+public interface ITextPredictionResult
+{
+    /// <summary>Whether the prediction succeeded.</summary>
+    bool Success { get; }
+
+    /// <summary>Error description; may be null.</summary>
+    string? ErrorMessage { get; }
+
+    /// <summary>
+    /// Gets the predicted text as a single string.
+    /// </summary>
+    /// <param name="cancellationToken">Token for cancellation requests.</param>
+    Task<string> GetPredictionAsync(CancellationToken cancellationToken = default);
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Prediction/ITextPredictionStreamingResult.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Prediction/ITextPredictionStreamingResult.cs
@ -0,0 +1,6 @@
+namespace Gpt4All;
+
+/// <summary>
+/// Prediction result whose text can also be consumed as an asynchronous stream of strings.
+/// </summary>
+public interface ITextPredictionStreamingResult : ITextPredictionResult
+{
+    /// <summary>
+    /// Gets the predicted text as an asynchronous sequence of strings.
+    /// </summary>
+    /// <param name="cancellationToken">Token for cancellation requests.</param>
+    IAsyncEnumerable<string> GetPredictionStreamingAsync(CancellationToken cancellationToken = default);
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Prediction/PredictRequestOptions.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Prediction/PredictRequestOptions.cs
@ -0,0 +1,32 @@
+namespace Gpt4All;
+
+/// <summary>
+/// Init-only settings for a single prediction request.
+/// NOTE(review): the property names appear to mirror the fields of the native prompt context
+/// (logits_size, tokens_size, n_past, n_ctx, n_predict, top_k, top_p, min_p, temp, n_batch,
+/// repeat_penalty, repeat_last_n, context_erase) — confirm the mapping in the bindings.
+/// </summary>
+public record PredictRequestOptions
+{
+    public nuint LogitsSize { get; init; } = 0;
+
+    public nuint TokensSize { get; init; } = 0;
+
+    public int PastConversationTokensNum { get; init; } = 0;
+
+    public int ContextSize { get; init; } = 1024;
+
+    public int TokensToPredict { get; init; } = 128;
+
+    public int TopK { get; init; } = 40;
+
+    public float TopP { get; init; } = 0.9f;
+
+    public float MinP { get; init; } = 0.0f;
+
+    public float Temperature { get; init; } = 0.1f;
+
+    public int Batches { get; init; } = 8;
+
+    public float RepeatPenalty { get; init; } = 1.2f;
+
+    public int RepeatLastN { get; init; } = 10;
+
+    public float ContextErase { get; init; } = 0.5f;
+
+    /// <summary>Shared instance in which every option keeps its default value.</summary>
+    public static readonly PredictRequestOptions Defaults = new();
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Prediction/TextPredictionResult.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Prediction/TextPredictionResult.cs
@ -0,0 +1,27 @@
+using System.Text;
+
+namespace Gpt4All;
+
+/// <summary>
+/// Prediction result that accumulates the generated tokens in memory and returns them as one string.
+/// </summary>
+public record TextPredictionResult : ITextPredictionResult
+{
+    // Tokens appended so far, in arrival order.
+    private readonly StringBuilder _result = new StringBuilder();
+
+    internal TextPredictionResult()
+    {
+    }
+
+    public bool Success { get; internal set; } = true;
+
+    public string? ErrorMessage { get; internal set; }
+
+    // Adds one generated token to the accumulated text.
+    internal void Append(string token) => _result.Append(token);
+
+    /// <summary>
+    /// Returns the text accumulated so far as an already-completed task.
+    /// </summary>
+    public Task<string> GetPredictionAsync(CancellationToken cancellationToken = default)
+    {
+        var text = _result.ToString();
+        return Task.FromResult(text);
+    }
+}
--- a/gpt4all-bindings/csharp/Gpt4All/Prediction/TextPredictionStreamingResult.cs
+++ b/gpt4all-bindings/csharp/Gpt4All/Prediction/TextPredictionStreamingResult.cs
@ -0,0 +1,49 @@
+using System.Text;
+using System.Threading.Channels;
+
+namespace Gpt4All;
+
+/// <summary>
+/// Prediction result backed by an unbounded channel: tokens are written with <c>Append</c>
+/// and read back either as a stream or as one concatenated string.
+/// </summary>
+public record TextPredictionStreamingResult : ITextPredictionStreamingResult
+{
+    private readonly Channel<string> _channel = Channel.CreateUnbounded<string>();
+
+    internal TextPredictionStreamingResult()
+    {
+    }
+
+    public bool Success { get; internal set; } = true;
+
+    public string? ErrorMessage { get; internal set; }
+
+    /// <summary>Completion task of the underlying channel reader.</summary>
+    public Task Completion => _channel.Reader.Completion;
+
+    // Tries to queue one token; the return value is the channel writer's TryWrite result.
+    internal bool Append(string token) => _channel.Writer.TryWrite(token);
+
+    // Marks the channel writer as complete.
+    internal void Complete() => _channel.Writer.Complete();
+
+    /// <summary>
+    /// Reads the token stream to its end and returns the concatenated text.
+    /// </summary>
+    public async Task<string> GetPredictionAsync(CancellationToken cancellationToken = default)
+    {
+        var text = new StringBuilder();
+
+        await foreach (var token in GetPredictionStreamingAsync(cancellationToken).ConfigureAwait(false))
+        {
+            text.Append(token);
+        }
+
+        return text.ToString();
+    }
+
+    /// <summary>
+    /// Returns the tokens as an asynchronous sequence read from the channel.
+    /// </summary>
+    public IAsyncEnumerable<string> GetPredictionStreamingAsync(CancellationToken cancellationToken = default)
+        => _channel.Reader.ReadAllAsync(cancellationToken);
+}
--- a/gpt4all-bindings/csharp/Gpt4All/gen_bindings.ps1
+++ b/gpt4all-bindings/csharp/Gpt4All/gen_bindings.ps1
@ -0,0 +1 @@
+# Runs ClangSharpPInvokeGenerator, passing each line of GenLLModelBindings.rsp as an argument.
+ClangSharpPInvokeGenerator @(Get-Content .\GenLLModelBindings.rsp)
--- a/gpt4all-bindings/csharp/README.md
+++ b/gpt4all-bindings/csharp/README.md
@ -0,0 +1,124 @@
+# C# GPT4All
+
+This package contains a set of C# bindings around the `llmodel` C-API.
+
+## Documentation
+TBD
+
+## Installation
+
+Windows and Linux builds are available on NuGet: https://www.nuget.org/packages/Gpt4All
+
+macOS is WIP due to code signing issues, contributions are welcome.
+
+## Project Structure
+```
+gpt4all-bindings/
+└── csharp                
+    ├── Gpt4All               // .NET Bindings
+    ├── Gpt4All.Samples       // Sample project
+    ├── build_win-msvc.ps1    // Native build scripts
+    ├── build_win-mingw.ps1   
+    ├── build_linux.sh        
+    └── runtimes              // [POST-BUILD] Platform-specific native libraries
+          ├── win-x64
+          ├── ...
+          └── linux-x64
+```
+
+## Prerequisites
+
+On Windows and Linux, building GPT4All requires the complete Vulkan SDK. You may download it from here: https://vulkan.lunarg.com/sdk/home
+
+macOS users do not need Vulkan, as GPT4All will use Metal instead.
+
+## Local Build Instructions
+> **Note** 
+> Tested On:
+>  - Windows 11 22H + VS2022 (CE) x64
+>  - Linux Ubuntu x64
+>  - Linux Ubuntu (WSL2) x64
+
+1. Setup the repository
+2. Build the native libraries for the platform of choice (see below)
+3. Build the C# Bindings (NET6+ SDK is required)
+```
+git clone --recurse-submodules https://github.com/nomic-ai/gpt4all
+cd gpt4all/gpt4all-bindings/csharp
+```
+### Linux
+1. Setup build environment and install NET6+ SDK with the appropriate procedure for your distribution
+```
+sudo apt-get update
+sudo apt-get install -y cmake build-essential
+chmod +x ./build_linux.sh
+```
+2. `./build_linux.sh`
+3. The native libraries should be present at `./runtimes/linux-x64/native`
+
+### Windows - MinGW64
+#### Additional requirements
+  - [MinGW64](https://www.mingw-w64.org/) 
+  - CMAKE
+1. Setup
+```
+choco install mingw
+$env:Path += ";C:\ProgramData\mingw64\mingw64\bin"
+choco install -y cmake --installargs 'ADD_CMAKE_TO_PATH=System'
+```
+2. Run the `./build_win-mingw.ps1` build script
+3. The native libraries should be present at `.\runtimes\win-x64\native`
+
+### Windows - MSVC
+#### Additional requirements
+  - Visual Studio 2022
+1. Open a terminal using the  `x64 Native Tools Command Prompt for VS 2022` (`vcvars64.bat`)
+2. Run the `./build_win-msvc.ps1` build script
+3. `libllmodel.dll` and `libllama.dll` should be present at `.\runtimes\win-x64`
+
+> **Warning** 
+> If the build fails with: '**error C7555: use of designated initializers requires at least '/std:c++20'**'
+>
+> Modify `gpt4all/gpt4all-backend/CMakeLists.txt`, adding `CXX_STANDARD 20` to the `llmodel` target properties.
+> ```cmake
+> set_target_properties(llmodel PROPERTIES
+>                              VERSION ${PROJECT_VERSION}
+>                              CXX_STANDARD 20 # <---- ADD THIS -----------------------
+>                              SOVERSION ${PROJECT_VERSION_MAJOR})
+> ```
+## C# Bindings Build Instructions
+Build the `Gpt4All` (or `Gpt4All.Samples`) projects from within VisualStudio.
+### Try the bindings
+```csharp
+using Gpt4All;
+
+// load the model
+var modelFactory = new Gpt4AllModelFactory();
+
+using var model = modelFactory.LoadModel("./path/to/ggml-gpt4all-j-v1.3-groovy.bin");
+
+var input = "Name 3 Colors";
+
+// request a prediction
+var result = await model.GetStreamingPredictionAsync(
+    input, 
+    PredictRequestOptions.Defaults);
+
+// asynchronously print the tokens as soon as they are produced by the model
+await foreach(var token in result.GetPredictionStreamingAsync())
+{
+    Console.Write(token);
+}
+```
+Output:
+```
+gptj_model_load: loading model from 'ggml-gpt4all-j-v1.3-groovy.bin' - please wait ...
+gptj_model_load: n_vocab = 50400
+[...TRUNCATED...]
+gptj_model_load: ggml ctx size = 5401.45 MB
+gptj_model_load: kv self size  =  896.00 MB
+gptj_model_load: ................................... done
+gptj_model_load: model size =  3609.38 MB / num tensors = 285
+
+Black, Blue and White
+```
--- a/gpt4all-bindings/csharp/build_linux.sh
+++ b/gpt4all-bindings/csharp/build_linux.sh
@ -0,0 +1,10 @@
+#!/bin/sh
+# Builds the native backend with CMake and stages the shared libraries
+# under runtimes/linux-x64/native.
+RUNTIME_DIR=runtimes/linux-x64
+BUILD_DIR=$RUNTIME_DIR/build
+NATIVE_DIR=$RUNTIME_DIR/native
+
+# Start from an empty runtime directory.
+mkdir -p runtimes
+rm -rf "$RUNTIME_DIR"
+mkdir -p "$NATIVE_DIR"
+mkdir "$BUILD_DIR"
+
+# Configure and build the backend.
+cmake -S ../../gpt4all-backend -B "$BUILD_DIR"
+cmake --build "$BUILD_DIR" --parallel --config Release
+
+# Copy the built libraries next to each other.
+cp "$BUILD_DIR"/libllmodel.so "$NATIVE_DIR"/libllmodel.so
+cp "$BUILD_DIR"/libgptj*.so "$NATIVE_DIR"/
+cp "$BUILD_DIR"/libllama*.so "$NATIVE_DIR"/
--- a/gpt4all-bindings/csharp/build_win-mingw.ps1
+++ b/gpt4all-bindings/csharp/build_win-mingw.ps1
@ -0,0 +1,16 @@
+# Builds the native backend with MinGW Makefiles and stages the DLLs in .\runtimes\win-x64\native.
+$ROOT_DIR = '.\runtimes\win-x64'
+$BUILD_DIR = '.\runtimes\win-x64\build\mingw'
+$LIBS_DIR = '.\runtimes\win-x64\native'
+
+# Remove any previous output, then recreate the build and library directories.
+Remove-Item -Force -Recurse $ROOT_DIR -ErrorAction SilentlyContinue | Out-Null
+mkdir $BUILD_DIR | Out-Null
+mkdir $LIBS_DIR  | Out-Null
+
+# Configure and build the backend.
+cmake -G "MinGW Makefiles" -S ..\..\gpt4all-backend -B $BUILD_DIR
+cmake --build $BUILD_DIR --parallel --config Release
+
+# Copy the DLLs from the MinGW bin directory (hard-coded install path) and the freshly built ones.
+cp "C:\ProgramData\mingw64\mingw64\bin\*dll" $LIBS_DIR
+cp "$BUILD_DIR\bin\*.dll" $LIBS_DIR
--- a/gpt4all-bindings/csharp/build_win-msvc.ps1
+++ b/gpt4all-bindings/csharp/build_win-msvc.ps1
@ -0,0 +1,6 @@
+# Builds the native backend with Visual Studio 2022 (x64) and stages the DLLs in .\runtimes\win-x64.
+Remove-Item -Force -Recurse .\runtimes\win-x64\msvc -ErrorAction SilentlyContinue
+mkdir .\runtimes\win-x64\msvc\build | Out-Null
+cmake -G "Visual Studio 17 2022" -A X64 -S ..\..\gpt4all-backend -B .\runtimes\win-x64\msvc\build
+cmake --build .\runtimes\win-x64\msvc\build --parallel --config Release
+cp .\runtimes\win-x64\msvc\build\bin\Release\*.dll .\runtimes\win-x64
+# Only the msvc subdirectory is cleaned above, so a libllmodel.dll from a previous run may
+# still exist here; -Force lets the rename replace it instead of failing.
+mv -Force .\runtimes\win-x64\llmodel.dll .\runtimes\win-x64\libllmodel.dll
--- a/gpt4all-bindings/csharp/docs/gpt4all_csharp.md
+++ b/gpt4all-bindings/csharp/docs/gpt4all_csharp.md
@ -0,0 +1 @@
+# GPT4All C# API
--- a/gpt4all-bindings/golang/Makefile
+++ b/gpt4all-bindings/golang/Makefile
@ -0,0 +1,163 @@
+INCLUDE_PATH := $(abspath ./)
+LIBRARY_PATH := $(abspath ./)
+CMAKEFLAGS=
+
+ifndef UNAME_S
+UNAME_S := $(shell uname -s)
+endif
+
+ifndef UNAME_P
+UNAME_P := $(shell uname -p)
+endif
+
+ifndef UNAME_M
+UNAME_M := $(shell uname -m)
+endif
+
+CCV := $(shell $(CC) --version | head -n 1)
+CXXV := $(shell $(CXX) --version | head -n 1)
+
+# Mac OS + Arm can report x86_64
+# ref: https://github.com/ggerganov/whisper.cpp/issues/66#issuecomment-1282546789
+ifeq ($(UNAME_S),Darwin)
+	ifneq ($(UNAME_P),arm)
+		SYSCTL_M := $(shell sysctl -n hw.optional.arm64 2>/dev/null)
+		ifeq ($(SYSCTL_M),1)
+			# UNAME_P := arm
+			# UNAME_M := arm64
+			warn := $(warning Your arch is announced as x86_64, but it seems to actually be ARM64. Not fixing that can lead to bad performance. For more info see: https://github.com/ggerganov/whisper.cpp/issues/66\#issuecomment-1282546789)
+		endif
+	endif
+endif
+
+#
+# Compile flags
+#
+
+# keep standard at C11 and C++17
+# (a bare -I must not precede -O3: the compiler would take "-O3" as the include directory)
+CFLAGS   = -I. -I../../gpt4all-backend/llama.cpp -I../../gpt4all-backend -O3 -DNDEBUG -std=c11 -fPIC
+CXXFLAGS = -I. -I../../gpt4all-backend/llama.cpp -I../../gpt4all-backend -O3 -DNDEBUG -std=c++17 -fPIC
+LDFLAGS  =
+
+# warnings
+CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function
+CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
+
+# OS specific
+# TODO: support Windows
+ifeq ($(UNAME_S),Linux)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),Darwin)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),FreeBSD)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),NetBSD)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),OpenBSD)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
+endif
+ifeq ($(UNAME_S),Haiku)
+	CFLAGS   += -pthread
+	CXXFLAGS += -pthread
+endif
+
+# Architecture specific
+# TODO: probably these flags need to be tweaked on some architectures
+#       feel free to update the Makefile for your architecture and send a pull request or issue
+ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
+	# Use all CPU extensions that are available:
+	CFLAGS += -march=native -mtune=native
+	CXXFLAGS += -march=native -mtune=native
+endif
+ifneq ($(filter ppc64%,$(UNAME_M)),)
+	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
+	ifneq (,$(findstring POWER9,$(POWER9_M)))
+		CFLAGS += -mcpu=power9
+		CXXFLAGS += -mcpu=power9
+	endif
+	# Require c++23's std::byteswap for big-endian support.
+	ifeq ($(UNAME_M),ppc64)
+		CXXFLAGS += -std=c++23 -DGGML_BIG_ENDIAN
+	endif
+endif
+ifndef LLAMA_NO_ACCELERATE
+	# Mac M1 - include Accelerate framework.
+	# `-framework Accelerate` works on Mac Intel as well, with negligible performance boost (as of the predict time).
+	ifeq ($(UNAME_S),Darwin)
+		CFLAGS  += -DGGML_USE_ACCELERATE
+		LDFLAGS += -framework Accelerate
+	endif
+endif
+ifdef LLAMA_OPENBLAS
+	CFLAGS  += -DGGML_USE_OPENBLAS -I/usr/local/include/openblas
+	LDFLAGS += -lopenblas
+endif
+ifdef LLAMA_GPROF
+	CFLAGS   += -pg
+	CXXFLAGS += -pg
+endif
+ifneq ($(filter aarch64%,$(UNAME_M)),)
+	CFLAGS += -mcpu=native
+	CXXFLAGS += -mcpu=native
+endif
+ifneq ($(filter armv6%,$(UNAME_M)),)
+	# Raspberry Pi 1, 2, 3
+	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access
+endif
+ifneq ($(filter armv7%,$(UNAME_M)),)
+	# Raspberry Pi 4
+	CFLAGS += -mfpu=neon-fp-armv8 -mfp16-format=ieee -mno-unaligned-access -funsafe-math-optimizations
+endif
+ifneq ($(filter armv8%,$(UNAME_M)),)
+	# Raspberry Pi 4
+	CFLAGS += -mfp16-format=ieee -mno-unaligned-access
+endif
+
+#
+# Print build information
+#
+
+$(info I go-gpt4all build info: )
+$(info I UNAME_S:  $(UNAME_S))
+$(info I UNAME_P:  $(UNAME_P))
+$(info I UNAME_M:  $(UNAME_M))
+$(info I CFLAGS:   $(CFLAGS))
+$(info I CXXFLAGS: $(CXXFLAGS))
+$(info I LDFLAGS:  $(LDFLAGS))
+$(info I CMAKEFLAGS:  $(CMAKEFLAGS))
+$(info I CC:       $(CCV))
+$(info I CXX:      $(CXXV))
+$(info )
+
+# clean and test never produce a file of that name.
+.PHONY: clean test
+
+# Configure and build the backend with CMake, then copy the llmodel object files next to this Makefile.
+llmodel.o:
+	mkdir -p buildllm
+	cd buildllm && cmake ../../../gpt4all-backend/ $(CMAKEFLAGS) && $(MAKE)
+	cd buildllm && cp -rf CMakeFiles/llmodel.dir/llmodel_c.cpp.o ../llmodel_c.o
+	cd buildllm && cp -rf CMakeFiles/llmodel.dir/llmodel.cpp.o ../llmodel.o
+
+# Remove objects, archives, the CMake build tree and the example binary.
+clean:
+	rm -f *.o
+	rm -f *.a
+	rm -rf buildllm
+	rm -rf example/main
+
+binding.o: binding.cpp binding.h
+	$(CXX) $(CXXFLAGS) binding.cpp -o binding.o -c $(LDFLAGS)
+
+# Static archive the Go package links against.
+libgpt4all.a: binding.o llmodel.o
+	ar src libgpt4all.a llmodel.o binding.o
+
+test: libgpt4all.a
+	@C_INCLUDE_PATH=$(INCLUDE_PATH) LIBRARY_PATH=$(LIBRARY_PATH) go test -v ./...
+
+example/main: libgpt4all.a
+	C_INCLUDE_PATH=$(INCLUDE_PATH) LIBRARY_PATH=$(LIBRARY_PATH) go build -o example/main ./example/
--- a/gpt4all-bindings/golang/README.md
+++ b/gpt4all-bindings/golang/README.md
@ -0,0 +1,59 @@
+# GPT4All Golang bindings
+
+The golang bindings have been tested on:
+- MacOS
+- Linux
+
+### Usage
+
+```
+import (
+	"fmt"
+
+	gpt4all "github.com/nomic-ai/gpt4all/gpt4all-bindings/golang"
+)
+
+func main() {
+	// Load the model
+	model, err := gpt4all.New("model.bin", gpt4all.SetModelType(gpt4all.GPTJType))
+	if err != nil {
+		panic(err)
+	}
+	defer model.Free()
+
+	model.SetTokenCallback(func(s string) bool {
+		fmt.Print(s)
+		return true
+	})
+
+	_, err = model.Predict("Here are 4 steps to create a website:", "", "", gpt4all.SetTemperature(0.1))
+	if err != nil {
+		panic(err)
+	}
+}
+```
+
+## Building
+
+In order to use the bindings you will need to build `libgpt4all.a`:
+
+```
+git clone --recurse-submodules https://github.com/nomic-ai/gpt4all
+cd gpt4all/gpt4all-bindings/golang
+make libgpt4all.a
+```
+
+To use the bindings in your own software:
+
+- Import `github.com/nomic-ai/gpt4all/gpt4all-bindings/golang`;
+- Compile `libgpt4all.a` (you can use `make libgpt4all.a` in the `gpt4all-bindings/golang` directory);
+- Link your go binary by setting the environment variables `C_INCLUDE_PATH` and `LIBRARY_PATH` to point to the `binding.h` file directory and `libgpt4all.a` file directory respectively.
+- Note: the implementation's `*.so`/`*.dylib`/`*.dll` files must be located next to the binary produced with the bindings for this to work.
+
+## Testing
+
+To run tests, run `make test`:
+
+```
+git clone --recurse-submodules https://github.com/nomic-ai/gpt4all
+cd gpt4all/gpt4all-bindings/golang
+make test
+```
--- a/gpt4all-bindings/golang/binding.cpp
+++ b/gpt4all-bindings/golang/binding.cpp
@ -0,0 +1,107 @@
+#include "../../gpt4all-backend/llmodel_c.h"
+#include "../../gpt4all-backend/llmodel.h"
+#include "../../gpt4all-backend/llmodel_c.cpp"
+
+#include "binding.h"
+#include <cassert>
+#include <cmath>
+#include <cstddef>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+#include <iostream>
+#include <unistd.h>
+
+// Creates a model for the file `fname`, loads it and sets its thread count.
+// Returns the model handle, or nullptr when creation or loading fails
+// (a message is written to stderr in both cases).
+void* load_model(const char *fname, int n_threads) {
+    // Initialised so that a failure which sets no message is never printed through a wild pointer.
+    const char *new_error = nullptr;
+    auto model = llmodel_model_create2(fname, "auto", &new_error);
+    if (model == nullptr) {
+        fprintf(stderr, "%s: error '%s'\n", __func__, new_error ? new_error : "unknown error");
+        return nullptr;
+    }
+    // NOTE(review): 2048 and 100 are hard-coded load arguments — confirm their meaning in llmodel_c.h.
+    if (!llmodel_loadModel(model, fname, 2048, 100)) {
+        fprintf(stderr, "%s: failed to load model '%s'\n", __func__, fname);
+        llmodel_model_destroy(model);
+        return nullptr;
+    }
+
+    llmodel_setThreadCount(model, n_threads);
+    return model;
+}
+
+std::string res = "";
+void * mm;
+
+// Runs a prompt on the model `m` (a handle returned by load_model) and copies the
+// generated text into `result`.
+//
+// Every response token is appended to the file-level `res` buffer and forwarded to
+// getTokenCallback; generation continues while that callback returns non-zero.
+// The sampling arguments are copied into the prompt context passed to llmodel_prompt.
+//
+// NOTE: `result` is filled with strcpy, so the caller must supply a buffer large enough
+// for the whole response plus the terminating NUL. The file-level `res`/`mm` state makes
+// this function unsafe to call concurrently.
+void model_prompt(const char *prompt, const char *prompt_template, int special, const char *fake_reply,
+                    void *m, char* result, int repeat_last_n, float repeat_penalty, int n_ctx, int tokens,
+                    int top_k, float top_p, float min_p, float temp, int n_batch,float ctx_erase)
+{
+    // load_model() returns the model handle itself as void*, so convert it straight back.
+    llmodel_model model = static_cast<llmodel_model>(m);
+
+    auto lambda_prompt = [](int token_id) {
+        return true;
+    };
+
+    // The callbacks are captureless (they must convert to plain function pointers),
+    // so the handle and the output buffer travel through file-level variables.
+    mm = model;
+    res = "";
+
+    auto lambda_response = [](int token_id, const char *responsechars) {
+        res.append((char*)responsechars);
+        return !!getTokenCallback(mm, (char*)responsechars);
+    };
+
+    auto lambda_recalculate = [](bool is_recalculating) {
+        // You can handle recalculation requests here if needed
+        return is_recalculating;
+    };
+
+    // Value-initialised on the stack: logits/tokens are null and their sizes and n_past are
+    // zero. Previously the context was allocated with new and released with free().
+    llmodel_prompt_context prompt_context{};
+    prompt_context.n_ctx = n_ctx;
+    prompt_context.n_predict = tokens;
+    prompt_context.top_k = top_k;
+    prompt_context.top_p = top_p;
+    prompt_context.min_p = min_p;
+    prompt_context.temp = temp;
+    prompt_context.n_batch = n_batch;
+    prompt_context.repeat_penalty = repeat_penalty;
+    prompt_context.repeat_last_n = repeat_last_n;
+    prompt_context.context_erase = ctx_erase;
+
+    llmodel_prompt(model, prompt, prompt_template,
+                   lambda_prompt,
+                   lambda_response,
+                   lambda_recalculate,
+                   &prompt_context, special, fake_reply);
+
+    strcpy(result, res.c_str());
+}
+
+// Destroys a model previously returned by load_model. Passing nullptr is a no-op.
+void free_model(void *state_ptr) {
+    if (state_ptr == nullptr) {
+        return;
+    }
+    // load_model() returns the handle itself, not a pointer to it, so it must not be
+    // dereferenced before being handed back to the backend.
+    llmodel_model_destroy(static_cast<llmodel_model>(state_ptr));
+}
+
--- a/gpt4all-bindings/golang/binding.h
+++ b/gpt4all-bindings/golang/binding.h
@ -0,0 +1,19 @@
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <stdbool.h>
+
+/* Loads the model file fname; n_threads is the thread count requested by the
+ * caller. The Go wrapper treats a NULL return value as a load failure. */
+void *load_model(const char *fname, int n_threads);
+
+/* Runs one prompt through the model m and copies the generated text into
+ * result. The text is written with strcpy and no buffer length is passed, so
+ * result must be large enough for the whole response plus its terminator.
+ *
+ * special and fake_reply are forwarded to llmodel_prompt unchanged; tokens is
+ * the number of tokens to predict; the remaining arguments fill the prompt
+ * context fields of the same name. */
+void model_prompt(const char *prompt,
+                  const char *prompt_template,
+                  int special,
+                  const char *fake_reply,
+                  void *m,
+                  char *result,
+                  int repeat_last_n,
+                  float repeat_penalty,
+                  int n_ctx,
+                  int tokens,
+                  int top_k,
+                  float top_p,
+                  float min_p,
+                  float temp,
+                  int n_batch,
+                  float ctx_erase);
+
+/* Destroys the model referred to by state_ptr. */
+void free_model(void *state_ptr);
+
+/* Implemented on the Go side and exported through cgo. It is called for each
+ * piece of the response with the model pointer and the piece's text, and its
+ * result becomes the return value of the native response callback. */
+extern unsigned char getTokenCallback(void *, char *);
+
+#ifdef __cplusplus
+}
+#endif
--- a/gpt4all-bindings/golang/example/main.go
+++ b/gpt4all-bindings/golang/example/main.go
@ -0,0 +1,82 @@
+package main
+
+import (
+	"bufio"
+	"flag"
+	"fmt"
+	"io"
+	"os"
+	"runtime"
+	"strings"
+
+	gpt4all "github.com/nomic-ai/gpt4all/gpt4all-bindings/golang"
+)
+
+// threads is the number of threads to use during computation. main rebinds it
+// to the -t flag, whose default replaces this initial value.
+var threads = 4
+
+// tokens is the number of tokens to predict. main rebinds it to the -n flag,
+// whose default replaces this initial value.
+var tokens = 128
+
+// main parses the command-line flags, loads the requested model, prints every
+// generated token to stdout as it arrives, and then answers prompts read from
+// stdin in an endless loop.
+func main() {
+	var modelPath string
+
+	fs := flag.NewFlagSet(os.Args[0], flag.ExitOnError)
+	fs.StringVar(&modelPath, "m", "./models/7B/ggml-model-q4_0.bin", "path to q4_0.bin model file to load")
+	fs.IntVar(&threads, "t", runtime.NumCPU(), "number of threads to use during computation")
+	fs.IntVar(&tokens, "n", 512, "number of tokens to predict")
+
+	if err := fs.Parse(os.Args[1:]); err != nil {
+		fmt.Printf("Parsing program arguments failed: %s", err)
+		os.Exit(1)
+	}
+
+	llm, err := gpt4all.New(modelPath, gpt4all.SetThreads(threads))
+	if err != nil {
+		fmt.Println("Loading the model failed:", err.Error())
+		os.Exit(1)
+	}
+	fmt.Printf("Model loaded successfully.\n")
+
+	// Echo each token as soon as it is produced; the callback always
+	// answers true.
+	llm.SetTokenCallback(func(token string) bool {
+		fmt.Print(token)
+		return true
+	})
+
+	stdin := bufio.NewReader(os.Stdin)
+
+	// The flag values no longer change after parsing, so the prediction
+	// options can be built once and reused for every prompt.
+	predictOpts := []gpt4all.PredictOption{
+		gpt4all.SetTokens(tokens),
+		gpt4all.SetTopK(90),
+		gpt4all.SetTopP(0.86),
+	}
+
+	for {
+		prompt := readMultiLineInput(stdin)
+
+		if _, err := llm.Predict(prompt, "", "", predictOpts...); err != nil {
+			panic(err)
+		}
+		fmt.Printf("\n\n")
+	}
+}
+
+// readMultiLineInput prints the ">>> " prompt and reads lines from reader
+// until a blank (whitespace-only) line is entered. It returns the collected
+// lines concatenated, each still carrying its trailing newline.
+//
+// If the input ends (io.EOF) after some text has been read, that text is
+// returned, so piped input that lacks a terminating blank line is still
+// processed. The process exits with status 0 only when EOF arrives with
+// nothing collected. Any other read error is reported and the process exits
+// with status 1.
+func readMultiLineInput(reader *bufio.Reader) string {
+	var lines []string
+	fmt.Print(">>> ")
+
+	for {
+		line, err := reader.ReadString('\n')
+		if err == io.EOF {
+			// ReadString hands back whatever it read before EOF, so keep
+			// a final line that has no trailing newline.
+			if len(strings.TrimSpace(line)) != 0 {
+				lines = append(lines, line)
+			}
+			if len(lines) == 0 {
+				os.Exit(0)
+			}
+			break
+		}
+		if err != nil {
+			fmt.Printf("Reading the prompt failed: %s", err)
+			os.Exit(1)
+		}
+
+		if len(strings.TrimSpace(line)) == 0 {
+			break
+		}
+
+		lines = append(lines, line)
+	}
+
+	return strings.Join(lines, "")
+}
--- a/gpt4all-bindings/golang/go.mod
+++ b/gpt4all-bindings/golang/go.mod
@ -0,0 +1,20 @@
+module github.com/nomic-ai/gpt4all/gpt4all-bindings/golang
+
+go 1.19
+
+require (
+	github.com/onsi/ginkgo/v2 v2.9.4
+	github.com/onsi/gomega v1.27.6
+)
+
+require (
+	github.com/go-logr/logr v1.2.4 // indirect
+	github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect
+	github.com/google/go-cmp v0.5.9 // indirect
+	github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 // indirect
+	golang.org/x/net v0.9.0 // indirect
+	golang.org/x/sys v0.7.0 // indirect
+	golang.org/x/text v0.9.0 // indirect
+	golang.org/x/tools v0.8.0 // indirect
+	gopkg.in/yaml.v3 v3.0.1 // indirect
+)
--- a/gpt4all-bindings/golang/go.sum
+++ b/gpt4all-bindings/golang/go.sum
@ -0,0 +1,40 @@
+github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
+github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
+github.com/chzyer/test v0.0.0-20180213035817-a1ea475d72b1/go.mod h1:Q3SI9o4m/ZMnBNeIyt5eFwwo7qiLfzFZmjNmxjkiQlU=
+github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
+github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
+github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ=
+github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A=
+github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 h1:tfuBGBXKqDEevZMzYi5KSi8KkcZtzBcTgAUUtapy0OI=
+github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572/go.mod h1:9Pwr4B2jHnOSGXyyzV8ROjYa2ojvAY6HCGYYfMoC3Ls=
+github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg=
+github.com/google/go-cmp v0.5.9 h1:O2Tfq5qg4qc4AmwVlvv0oLiVAGB7enBSJ2x2DqQFi38=
+github.com/google/go-cmp v0.5.9/go.mod h1:17dUlkBOakJ0+DkrSSNjCkIjxS6bF9zb3elmeNGIjoY=
+github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 h1:yAJXTCF9TqKcTiHJAE8dj7HMvPfh66eeA2JYW7eFpSE=
+github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE=
+github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc=
+github.com/onsi/ginkgo/v2 v2.9.4 h1:xR7vG4IXt5RWx6FfIjyAtsoMAtnc3C/rFXBBd2AjZwE=
+github.com/onsi/ginkgo/v2 v2.9.4/go.mod h1:gCQYp2Q+kSoIj7ykSVb9nskRSsR6PUj4AiLywzIhbKM=
+github.com/onsi/gomega v1.27.6 h1:ENqfyGeS5AX/rlXDd/ETokDz93u0YufY1Pgxuy/PvWE=
+github.com/onsi/gomega v1.27.6/go.mod h1:PIQNjfQwkP3aQAH7lf7j87O/5FiNr+ZR8+ipb+qQlhg=
+github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
+github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
+github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
+github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0=
+github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
+golang.org/x/net v0.9.0 h1:aWJ/m6xSmxWBx+V0XRHTlrYrPG56jKsLdTFmsSsCzOM=
+golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns=
+golang.org/x/sys v0.0.0-20191204072324-ce4227a45e2e/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.7.0 h1:3jlCCIQZPdOYu1h8BkNvLz8Kgwtae2cagcG/VamtZRU=
+golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE=
+golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8=
+golang.org/x/tools v0.8.0 h1:vSDcovVPld282ceKgDimkRSC8kpaH1dgyc9UMzlt84Y=
+golang.org/x/tools v0.8.0/go.mod h1:JxBZ99ISMI5ViVkT1tr6tdNmXeTrcpVSD3vZ1RsRdN4=
+google.golang.org/protobuf v1.28.0 h1:w43yiav+6bVFTBQFZX0r7ipe9JQ1QsbMgHwbBziscLw=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
+gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
+gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
+gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA=
+gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
--- a/gpt4all-bindings/golang/gpt4all.go
+++ b/gpt4all-bindings/golang/gpt4all.go
@ -0,0 +1,112 @@
+package gpt4all
+
+// The cgo preamble below must stay directly attached to `import "C"`.
+// ${SRCDIR} expands to the absolute path of this directory without a trailing
+// slash, so the include paths need an explicit "/" before "../..".
+
+// #cgo CFLAGS: -I${SRCDIR}/../../gpt4all-backend/ -I${SRCDIR}/../../gpt4all-backend/llama.cpp -I./
+// #cgo CXXFLAGS: -std=c++17 -I${SRCDIR}/../../gpt4all-backend/ -I${SRCDIR}/../../gpt4all-backend/llama.cpp -I./
+// #cgo darwin LDFLAGS: -framework Accelerate
+// #cgo darwin CXXFLAGS: -std=c++17
+// #cgo LDFLAGS: -lgpt4all -lm -lstdc++ -ldl
+// void* load_model(const char *fname, int n_threads);
+// void model_prompt( const char *prompt, const char *prompt_template, int special, const char *fake_reply, void *m, char* result, int repeat_last_n, float repeat_penalty, int n_ctx, int tokens, int top_k,
+//                            float top_p, float min_p, float temp, int n_batch,float ctx_erase);
+// void free_model(void *state_ptr);
+// extern unsigned char getTokenCallback(void *, char *);
+// void llmodel_set_implementation_search_path(const char *path);
+import "C"
+import (
+	"fmt"
+	"runtime"
+	"strings"
+	"sync"
+	"unsafe"
+)
+
+// The following code is https://github.com/go-skynet/go-llama.cpp with small adaptations
+
+// Model is a loaded model. It wraps the pointer returned by the native
+// load_model call.
+type Model struct {
+	// state is the pointer passed to every native call; its address is
+	// also the key under which this model's token callback is registered.
+	state unsafe.Pointer
+}
+
+// New loads the model file named by model and returns a handle to it.
+//
+// The given options are applied on top of DefaultModelOptions. When a library
+// search path is configured it is handed to the backend before the model is
+// loaded. An error is returned if the native loader yields a nil handle.
+//
+// NOTE(review): the strings created with C.CString are never released here.
+// Whether the native side keeps referring to them is not visible from this
+// file, so confirm before adding a C.free.
+func New(model string, opts ...ModelOption) (*Model, error) {
+	cfg := NewModelOptions(opts...)
+
+	if searchPath := cfg.LibrarySearchPath; searchPath != "" {
+		C.llmodel_set_implementation_search_path(C.CString(searchPath))
+	}
+
+	handle := C.load_model(C.CString(model), C.int(cfg.Threads))
+	if handle == nil {
+		return nil, fmt.Errorf("failed loading model")
+	}
+
+	loaded := &Model{state: handle}
+	// When the garbage collector reclaims the Model, drop the token callback
+	// registered for it.
+	runtime.SetFinalizer(loaded, func(mdl *Model) {
+		setTokenCallback(mdl.state, nil)
+	})
+
+	return loaded, nil
+}
+
+// Predict runs text through the model and returns the generated response.
+//
+// template and fakeReplyText are forwarded to the native prompt call
+// unchanged. The options are applied on top of DefaultOptions; a token count
+// of zero is replaced by 99999999, and a negative one is rejected with an
+// error instead of panicking while the result buffer is allocated.
+//
+// Before it is returned, the response has one leading space, the prompt text,
+// one leading newline and a trailing "<|endoftext|>" marker stripped.
+//
+// The C strings are passed as NUL-terminated Go byte slices rather than
+// C.CString copies, so nothing is left allocated on the C heap after the call.
+// NOTE(review): this relies on the native side not keeping these pointers
+// once model_prompt has returned - confirm against llmodel_prompt.
+//
+// NOTE(review): the native side copies the complete response into the result
+// buffer with strcpy, so a response longer than po.Tokens-1 bytes overruns
+// it. Fixing that needs a length parameter on the C interface.
+func (l *Model) Predict(text, template, fakeReplyText string, opts ...PredictOption) (string, error) {
+	po := NewPredictOptions(opts...)
+	if po.Tokens < 0 {
+		return "", fmt.Errorf("invalid token count %d: must not be negative", po.Tokens)
+	}
+	if po.Tokens == 0 {
+		po.Tokens = 99999999
+	}
+
+	input := nulTerminated(text)
+	templateInput := nulTerminated(template)
+	fakeReplyInput := nulTerminated(fakeReplyText)
+	out := make([]byte, po.Tokens)
+
+	C.model_prompt(
+		(*C.char)(unsafe.Pointer(&input[0])),
+		(*C.char)(unsafe.Pointer(&templateInput[0])),
+		C.int(po.Special),
+		(*C.char)(unsafe.Pointer(&fakeReplyInput[0])),
+		l.state,
+		(*C.char)(unsafe.Pointer(&out[0])),
+		C.int(po.RepeatLastN), C.float(po.RepeatPenalty), C.int(po.ContextSize), C.int(po.Tokens),
+		C.int(po.TopK), C.float(po.TopP), C.float(po.MinP), C.float(po.Temperature), C.int(po.Batch),
+		C.float(po.ContextErase))
+
+	res := C.GoString((*C.char)(unsafe.Pointer(&out[0])))
+	res = strings.TrimPrefix(res, " ")
+	res = strings.TrimPrefix(res, text)
+	res = strings.TrimPrefix(res, "\n")
+	res = strings.TrimSuffix(res, "<|endoftext|>")
+
+	return res, nil
+}
+
+// nulTerminated returns a copy of s followed by a NUL byte, suitable for
+// passing to C as a char* for the duration of a single call. The result is
+// never empty, so taking the address of its first element is always valid.
+func nulTerminated(s string) []byte {
+	buf := make([]byte, len(s)+1)
+	copy(buf, s)
+	return buf
+}
+
+// Free destroys the native model and removes its token callback.
+//
+// The callback registry is keyed by the handle's address, so the entry is
+// removed before the handle is released; otherwise a later model that
+// happens to get the same address would inherit a stale callback. The handle
+// is cleared afterwards, which makes a second Free (or a Free on a nil Model)
+// a no-op instead of a double destroy.
+func (l *Model) Free() {
+	if l == nil || l.state == nil {
+		return
+	}
+	setTokenCallback(l.state, nil)
+	C.free_model(l.state)
+	l.state = nil
+}
+
+// SetTokenCallback registers callback to receive each piece of generated text
+// for this model. It delegates to setTokenCallback, so passing nil removes
+// the current registration.
+func (l *Model) SetTokenCallback(callback func(token string) bool) {
+	setTokenCallback(l.state, callback)
+}
+
+// Registry of token callbacks shared by all models in the process.
+var (
+	// m guards every access to callbacks.
+	m sync.Mutex
+	// callbacks maps a model's state pointer, stored as an integer, to the
+	// token callback registered for it.
+	callbacks = map[uintptr]func(string) bool{}
+)
+
+// getTokenCallback is called from the native response callback with the
+// model's state pointer and one piece of generated text. It forwards the text
+// to the callback registered for that model and returns the callback's
+// answer; when no callback is registered it returns true.
+//
+// The registry lock is held only for the lookup and released before the user
+// callback runs. sync.Mutex is not reentrant, so a callback that registers or
+// removes a callback would otherwise deadlock.
+//
+//export getTokenCallback
+func getTokenCallback(statePtr unsafe.Pointer, token *C.char) bool {
+	m.Lock()
+	callback, ok := callbacks[uintptr(statePtr)]
+	m.Unlock()
+
+	if !ok {
+		return true
+	}
+	return callback(C.GoString(token))
+}
+
+// setTokenCallback registers callback as the token callback for the model
+// identified by statePtr. Passing a nil callback removes any existing
+// registration for that model.
+func setTokenCallback(statePtr unsafe.Pointer, callback func(string) bool) {
+	key := uintptr(statePtr)
+
+	m.Lock()
+	defer m.Unlock()
+
+	if callback != nil {
+		callbacks[key] = callback
+		return
+	}
+	delete(callbacks, key)
+}
--- a/gpt4all-bindings/golang/gpt4all_suite_test.go
+++ b/gpt4all-bindings/golang/gpt4all_suite_test.go
@ -0,0 +1,13 @@
+package gpt4all_test
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// TestGPT is the go test entry point for the Ginkgo suite: it registers
+// Ginkgo's Fail as the Gomega failure handler and then runs all specs.
+func TestGPT(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "go-gpt4all-j test suite")
+}
--- a/gpt4all-bindings/golang/gpt4all_test.go
+++ b/gpt4all-bindings/golang/gpt4all_test.go
@ -0,0 +1,17 @@
+package gpt4all_test
+
+import (
+	. "github.com/nomic-ai/gpt4all/gpt4all-bindings/golang"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Specs covering construction of a model through the binding.
+var _ = Describe("LLama binding", func() {
+	Context("Declaration", func() {
+		// Loading a path that does not exist must report an error and must
+		// not hand back a model.
+		It("fails with no model", func() {
+			loaded, loadErr := New("not-existing")
+			Expect(loadErr).To(HaveOccurred())
+			Expect(loaded).To(BeNil())
+		})
+	})
+})
--- a/gpt4all-bindings/golang/options.go
+++ b/gpt4all-bindings/golang/options.go
@ -0,0 +1,138 @@
+package gpt4all
+
+// PredictOptions holds the settings used for a single prediction.
+type PredictOptions struct {
+	ContextSize   int
+	RepeatLastN   int
+	Tokens        int // number of tokens to generate
+	TopK          int // top-K sampling value
+	Batch         int // batch size
+	Special       int // 1 when special tokens in the prompt are processed, else 0
+	TopP          float64 // nucleus sampling value
+	MinP          float64 // min-p sampling value
+	Temperature   float64
+	ContextErase  float64 // context erase %
+	RepeatPenalty float64
+}
+
+// PredictOption modifies a PredictOptions value in place.
+type PredictOption func(p *PredictOptions)
+
+// ModelOptions holds the settings used when loading a model.
+type ModelOptions struct {
+	Threads           int
+	LibrarySearchPath string
+}
+
+// ModelOption modifies a ModelOptions value in place.
+type ModelOption func(p *ModelOptions)
+
+// DefaultOptions is the starting point that NewPredictOptions applies its
+// options to.
+var DefaultOptions = PredictOptions{
+	ContextSize:   1024,
+	RepeatLastN:   10,
+	Tokens:        200,
+	TopK:          10,
+	Batch:         1,
+	Special:       0,
+	TopP:          0.90,
+	MinP:          0.0,
+	Temperature:   0.96,
+	ContextErase:  0.55,
+	RepeatPenalty: 1.2,
+}
+
+// DefaultModelOptions is the starting point that NewModelOptions applies its
+// options to.
+var DefaultModelOptions = ModelOptions{
+	Threads: 4,
+}
+
+// SetTokens returns an option that sets how many tokens to generate.
+func SetTokens(tokens int) PredictOption {
+	apply := func(po *PredictOptions) {
+		po.Tokens = tokens
+	}
+	return apply
+}
+
+// SetTopK returns an option that sets the top-K sampling value.
+func SetTopK(topk int) PredictOption {
+	apply := func(po *PredictOptions) {
+		po.TopK = topk
+	}
+	return apply
+}
+
+// SetTopP returns an option that sets the nucleus (top-P) sampling value.
+func SetTopP(topp float64) PredictOption {
+	apply := func(po *PredictOptions) {
+		po.TopP = topp
+	}
+	return apply
+}
+
+// SetMinP returns an option that sets the min-p sampling value.
+func SetMinP(minp float64) PredictOption {
+	apply := func(po *PredictOptions) {
+		po.MinP = minp
+	}
+	return apply
+}
+
+// SetRepeatPenalty returns an option that sets the repeat penalty.
+func SetRepeatPenalty(ce float64) PredictOption {
+	apply := func(po *PredictOptions) {
+		po.RepeatPenalty = ce
+	}
+	return apply
+}
+
+// SetRepeatLastN returns an option that sets the RepeatLastN value.
+func SetRepeatLastN(ce int) PredictOption {
+	apply := func(po *PredictOptions) {
+		po.RepeatLastN = ce
+	}
+	return apply
+}
+
+// SetContextErase returns an option that sets the context erase %.
+func SetContextErase(ce float64) PredictOption {
+	apply := func(po *PredictOptions) {
+		po.ContextErase = ce
+	}
+	return apply
+}
+
+// SetTemperature returns an option that sets the temperature used for text
+// generation.
+func SetTemperature(temp float64) PredictOption {
+	apply := func(po *PredictOptions) {
+		po.Temperature = temp
+	}
+	return apply
+}
+
+// SetBatch returns an option that sets the batch size.
+func SetBatch(size int) PredictOption {
+	apply := func(po *PredictOptions) {
+		po.Batch = size
+	}
+	return apply
+}
+
+// SetSpecial returns an option that controls whether special tokens in the
+// prompt are processed: true stores 1 in Special, false stores 0.
+func SetSpecial(special bool) PredictOption {
+	flag := 0
+	if special {
+		flag = 1
+	}
+	return func(po *PredictOptions) {
+		po.Special = flag
+	}
+}
+
+// NewPredictOptions returns a copy of DefaultOptions with the given options
+// applied in the order they were passed.
+func NewPredictOptions(opts ...PredictOption) PredictOptions {
+	options := DefaultOptions
+	for i := range opts {
+		opts[i](&options)
+	}
+	return options
+}
+
+// SetThreads returns an option that sets the number of threads to use for
+// text generation.
+func SetThreads(c int) ModelOption {
+	apply := func(mo *ModelOptions) {
+		mo.Threads = c
+	}
+	return apply
+}
+
+// SetLibrarySearchPath returns an option that sets where gpt4all looks for
+// the dynamic libraries of the various ggml implementations.
+func SetLibrarySearchPath(t string) ModelOption {
+	apply := func(mo *ModelOptions) {
+		mo.LibrarySearchPath = t
+	}
+	return apply
+}
+
+// NewModelOptions returns a copy of DefaultModelOptions with the given
+// options applied in the order they were passed.
+func NewModelOptions(opts ...ModelOption) ModelOptions {
+	options := DefaultModelOptions
+	for i := range opts {
+		opts[i](&options)
+	}
+	return options
+}
--- a/gpt4all-bindings/java/.gitignore
+++ b/gpt4all-bindings/java/.gitignore
@ -0,0 +1,5 @@
+# Make sure the native directory never gets committed to git for the project.
+/src/main/resources/native
+
+# IntelliJ project file
+*.iml
--- a/gpt4all-bindings/java/Developer_docs.md
+++ b/gpt4all-bindings/java/Developer_docs.md
@ -0,0 +1,80 @@
+# Java Bindings Developer documents.
+
+This document is meant for anyone looking to build the Java bindings from source, test a build locally, and perform a release.
+
+## Building locally
+
+Maven is the build tool used by the project. Maven version 3.8 or higher is recommended. Make sure the **mvn**
+command is available on the command path.
+
+The project targets Java 11, so make sure that JDK 11 or newer is installed.
+
+### Setting up location of native shared libraries
+The property **native.libs.location** in pom.xml may need to be set:
+```
+    <properties>
+        ...
+        <native.libs.location>C:\Users\felix\dev\gpt4all_java_bins\release_1_1_3_Jun22_2023</native.libs.location>
+    </properties>
+```
+All the native shared libraries bundled with the Java binding jar will be copied from this location.
+The directory structure is **native/linux**, **native/macos**, **native/windows**. These directories are copied
+into the **src/main/resources** folder during the build process.
+
+For the purposes of local testing, none of these directories have to be present or just one OS type may be present.
+
+If none of the native libraries are present in **native.libs.location**, the shared libraries will be searched for
+in the location set by the **LLModel.LIBRARY_SEARCH_PATH** static variable in the Java source code that uses the bindings.
+
+Alternatively, you can copy the shared libraries into **src/main/resources/native/linux** before
+you build, but note that **src/main/resources/native** is listed in .gitignore, so it will not be committed to source control.
+
+### Building
+
+To package the bindings jar run:
+```
+mvn package
+```
+This will build two jars. One has only the Java bindings and the other is a fat jar that will have required dependencies included as well.
+
+To package and install the Java bindings to your local maven repository run:
+```
+mvn install
+```
+
+### Using in a sample application
+
+You can check out a sample project that uses the java bindings here:
+https://github.com/felix-zaslavskiy/gpt4all-java-bindings-sample.git
+
+1. First, update the dependency of java bindings to whatever you have installed in local repository such as **1.1.4-SNAPSHOT**
+2. Second, update **Main.java** and set **baseModelPath** to the correct location of model weight files.
+
+3. To make a runnable jar run:
+```
+mvn package
+```
+
+A fat jar is also created which is easy to run from command line:
+```
+java -jar target/gpt4all-java-bindings-sample-1.0-SNAPSHOT-jar-with-dependencies.jar
+```
+
+### Publish a public release.
+
+Publishing a new version to the Maven Central repository requires a password and signing keys, which F.Z. currently maintains, so
+he is responsible for making a public release.
+
+The procedure is as follows:
+
+For a snapshot release
+Run:
+```
+mvn deploy -P signing-profile
+```
+
+For a non-snapshot release
+Run:
+```
+mvn clean deploy -P signing-profile,release
+```
--- a/gpt4all-bindings/java/README.md
+++ b/gpt4all-bindings/java/README.md
@ -0,0 +1,126 @@
+# Java bindings
+
+Java bindings let you load a gpt4all library into your Java application and execute text 
+generation using an intuitive and easy to use API. No GPU is required because gpt4all executes on the CPU.
+The gpt4all models are quantized to easily fit into system RAM and use about 4 to 7GB of system RAM.
+
+## Getting Started
+You can add Java bindings into your Java project by adding the following dependency to your project:
+
+**Maven**
+```
+<dependency>
+    <groupId>com.hexadevlabs</groupId>
+    <artifactId>gpt4all-java-binding</artifactId>
+    <version>1.1.5</version>
+</dependency>
+```
+**Gradle**
+```
+implementation 'com.hexadevlabs:gpt4all-java-binding:1.1.5'
+```
+
+To add the library dependency for another build system see [Maven Central Java bindings](https://central.sonatype.com/artifact/com.hexadevlabs/gpt4all-java-binding/).
+
+To download model binary weights file use a URL such as [`https://gpt4all.io/models/gguf/gpt4all-13b-snoozy-q4_0.gguf`](https://gpt4all.io/models/gguf/gpt4all-13b-snoozy-q4_0.gguf).
+
+For information about other models available see the [model file list](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-chat#manual-download-of-models).
+
+### Sample code
+```java
+public class Example {
+    public static void main(String[] args) {
+
+        String prompt = "### Human:\nWhat is the meaning of life\n### Assistant:";
+
+        // Replace the hardcoded path with the actual path where your model file resides
+        String modelFilePath = "C:\\Users\\felix\\AppData\\Local\\nomic.ai\\GPT4All\\ggml-gpt4all-j-v1.3-groovy.bin";
+        
+        try (LLModel model = new LLModel(Path.of(modelFilePath))) {
+
+            // May generate up to 4096 tokens but generally stops early
+            LLModel.GenerationConfig config = LLModel.config()
+                    .withNPredict(4096).build();
+
+            // Will also stream to standard output
+            String fullGeneration = model.generate(prompt, config, true);
+
+        } catch (Exception e) {
+            // Exceptions generally may happen if the model file fails to load 
+            // for a number of reasons such as a file not found. 
+            // It is possible that Java may not be able to dynamically load the native shared library or 
+            // the llmodel shared library may not be able to dynamically load the backend 
+            // implementation for the model file you provided.
+            // 
+            // Once the LLModel class is successfully loaded into memory the text generation calls 
+            // generally should not throw exceptions.
+            e.printStackTrace(); // Printing here but in a production system you may want to take some action.
+        }
+    }
+
+}
+```
+
+For a Maven-based sample project that uses this library see this [sample project](https://github.com/felix-zaslavskiy/gpt4all-java-bindings-sample)
+
+### Additional considerations
+#### Logger warnings
+The Java bindings library may produce a warning if you don't have a SLF4J binding included in your project:
+```
+SLF4J: Failed to load class "org.slf4j.impl.StaticLoggerBinder".
+SLF4J: Defaulting to no-operation (NOP) logger implementation
+SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.
+```
+The Java bindings only use logging for informational 
+purposes, so a logger is not essential to correctly use the library. You can ignore this warning if you don't have SLF4J bindings
+in your project.
+
+To add a simple logger using a Maven dependency you may use:
+```
+<dependency>
+    <groupId>org.slf4j</groupId>
+    <artifactId>slf4j-simple</artifactId>
+    <version>1.7.36</version>
+</dependency>
+```
+
+#### Loading your native libraries
+1. The Java bindings package JAR comes bundled with native library files for Windows, macOS and Linux. These library files are
+copied to a temporary directory and loaded at runtime. Advanced users who want to package shared libraries into Docker containers,
+or who want to use a custom build of the shared libraries and ignore the ones bundled with the Java package, have the option
+to load libraries from a local directory by setting a static property to the location of the library files.
+There are no guarantees of compatibility if used in such a way so be careful if you really want to do it.
+
+For example:
+```java
+class Example {
+    public static void main(String[] args) {
+        // gpt4all native shared libraries location
+        LLModel.LIBRARY_SEARCH_PATH = "C:\\Users\\felix\\gpt4all\\lib\\"; 
+        // ... use the library normally
+    }
+}
+```
+2. Not every AVX-only shared library is bundled with the JAR right now to reduce size. Only libgptj-avx is included.
+If you are running into issues please let us know using the [gpt4all project issue tracker](https://github.com/nomic-ai/gpt4all/issues).
+
+3. For Windows the native library included in jar depends on specific Microsoft C and C++ (MSVC) runtime libraries which may not be installed on your system.
+If this is the case you can easily download and install the latest x64 Microsoft Visual C++ Redistributable package from https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170
+
+4. When running Java in a Docker container it is advised to use eclipse-temurin:17-jre parent image. Alpine based parent images don't work due to the native library dependencies.
+
+## Version history
+1. Version **1.1.2**:
+   - Java bindings is compatible with gpt4all version 2.4.6
+   - Initial stable release with the initial feature set
+2. Version **1.1.3**:
+   - Java bindings is compatible with gpt4all version 2.4.8
+   - Add static GPT4ALL_VERSION to signify gpt4all version of the bindings
+   - Add PromptIsTooLongException for prompts that are longer than context size.
+   - Replit model support to include Metal Mac hardware support.
+3. Version **1.1.4**:
+   - Java bindings is compatible with gpt4all version 2.4.11
+   - Falcon model support included.
+4. Version **1.1.5**:
+   - Add a check for model file readability before loading model.
+   
--- a/gpt4all-bindings/java/TODO.md
+++ b/gpt4all-bindings/java/TODO.md
@ -0,0 +1,6 @@
+## Needed
+1. Integrate with circleci build pipeline like the C# binding.
+
+## These are just ideas
+1. Better Chat completions function.
+2. Chat completion that returns result in OpenAI compatible format.
--- a/gpt4all-bindings/java/pom.xml
+++ b/gpt4all-bindings/java/pom.xml
@ -0,0 +1,216 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <groupId>com.hexadevlabs</groupId>
+    <artifactId>gpt4all-java-binding</artifactId>
+    <version>1.1.5</version>
+    <packaging>jar</packaging>
+
+    <properties>
+        <maven.compiler.source>11</maven.compiler.source>
+        <maven.compiler.target>11</maven.compiler.target>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+        <native.libs.location>C:\Users\felix\dev\gpt4all_java_bins\release_1_1_4_July8_2023</native.libs.location>
+    </properties>
+
+    <name>${project.groupId}:${project.artifactId}</name>
+    <description>Java bindings for GPT4ALL LLM</description>
+    <url>https://github.com/nomic-ai/gpt4all</url>
+    <licenses>
+        <license>
+            <name>The Apache License, Version 2.0</name>
+            <url>https://github.com/nomic-ai/gpt4all/blob/main/LICENSE.txt</url>
+        </license>
+    </licenses>
+    <developers>
+        <developer>
+            <name>Felix Zaslavskiy</name>
+            <email>felixz@hexadevlabs.com</email>
+            <organizationUrl>https://github.com/felix-zaslavskiy/</organizationUrl>
+        </developer>
+    </developers>
+    <scm>
+        <connection>scm:git:git://github.com/nomic-ai/gpt4all.git</connection>
+        <developerConnection>scm:git:ssh://github.com/nomic-ai/gpt4all.git</developerConnection>
+        <url>https://github.com/nomic-ai/gpt4all/tree/main</url>
+    </scm>
+
+    <dependencies>
+        <dependency>
+            <groupId>com.github.jnr</groupId>
+            <artifactId>jnr-ffi</artifactId>
+            <version>2.2.13</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.slf4j</groupId>
+            <artifactId>slf4j-api</artifactId>
+            <version>1.7.36</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.junit.jupiter</groupId>
+            <artifactId>junit-jupiter-api</artifactId>
+            <version>5.9.2</version>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.mockito</groupId>
+            <artifactId>mockito-junit-jupiter</artifactId>
+            <version>5.4.0</version>
+            <scope>test</scope>
+        </dependency>
+
+        <dependency>
+            <groupId>org.mockito</groupId>
+            <artifactId>mockito-core</artifactId>
+            <version>5.4.0</version>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <distributionManagement>
+        <snapshotRepository>
+            <id>ossrh</id>
+            <url>https://s01.oss.sonatype.org/content/repositories/snapshots</url>
+        </snapshotRepository>
+        <repository>
+            <id>ossrh</id>
+            <url>https://s01.oss.sonatype.org/service/local/staging/deploy/maven2/</url>
+        </repository>
+    </distributionManagement>
+
+    <build>
+        <resources>
+            <resource>
+                <directory>src/main/resources</directory>
+            </resource>
+            <resource>
+                <directory>${project.build.directory}/generated-resources</directory>
+            </resource>
+        </resources>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <version>3.0.0</version>
+                <configuration>
+                    <forkCount>0</forkCount>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-resources-plugin</artifactId>
+                <version>3.3.1</version>
+                <executions>
+                    <execution>
+                        <id>copy-resources</id>
+                        <!-- Bind to validate, the first lifecycle phase, so the native libraries are copied before resources are processed -->
+                        <phase>validate</phase>
+                        <goals>
+                            <goal>copy-resources</goal>
+                        </goals>
+                        <configuration>
+                            <outputDirectory>${project.build.directory}/generated-resources</outputDirectory>
+                            <resources>
+                                <resource>
+                                    <directory>${native.libs.location}</directory>
+                                </resource>
+                            </resources>
+                        </configuration>
+                    </execution>
+                </executions>
+            </plugin>
+
+
+            <plugin>
+                <groupId>org.sonatype.plugins</groupId>
+                <artifactId>nexus-staging-maven-plugin</artifactId>
+                <version>1.6.13</version>
+                <extensions>true</extensions>
+                <configuration>
+                    <serverId>ossrh</serverId>
+                    <nexusUrl>https://s01.oss.sonatype.org/</nexusUrl>
+                    <autoReleaseAfterClose>true</autoReleaseAfterClose>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-source-plugin</artifactId>
+                <version>2.2.1</version>
+                <executions>
+                    <execution>
+                        <id>attach-sources</id>
+                        <goals>
+                            <goal>jar-no-fork</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-javadoc-plugin</artifactId>
+                <version>3.5.0</version>
+                <executions>
+                    <execution>
+                        <id>attach-javadocs</id>
+                        <goals>
+                            <goal>jar</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-assembly-plugin</artifactId>
+                <version>3.6.0</version>
+                <configuration>
+                    <descriptorRefs>
+                        <descriptorRef>jar-with-dependencies</descriptorRef>
+                    </descriptorRefs>
+                </configuration>
+                <executions>
+                    <execution>
+                        <id>make-assembly</id>
+                        <phase>package</phase>
+                        <goals>
+                            <goal>single</goal>
+                        </goals>
+                    </execution>
+                </executions>
+            </plugin>
+         </plugins>
+
+    </build>
+
+    <profiles>
+        <profile>
+        <id>signing-profile</id>
+        <!-- activation conditions here, if any -->
+            <build>
+                <plugins>
+                    <plugin>
+                        <groupId>org.apache.maven.plugins</groupId>
+                        <artifactId>maven-gpg-plugin</artifactId>
+                        <version>3.1.0</version>
+                        <executions>
+                            <execution>
+                                <id>sign-artifacts</id>
+                                <phase>verify</phase>
+                                <goals>
+                                    <goal>sign</goal>
+                                </goals>
+
+                            </execution>
+                        </executions>
+                    </plugin>
+                </plugins>
+            </build>
+        </profile>
+    </profiles>
+</project>
--- a/Show More
+++ b/Show More
				`@ -1 +0,0 @@`
				`Subproject commit 11f734c3b0334dbae4823b4a7467764e447fc6d6`
				`@ -0,0 +1 @@`
				`Subproject commit fadf1135a54e80188d644df42ad6a53bf986e8b0`
				`@ -0,0 +1 @@`
				`ClangSharpPInvokeGenerator @(Get-Content .\GenLLModelBindings.rsp)`