Compare commits

27 Commits

Author SHA1 Message Date
Adam Treat  a69d23ecc4  Fix for windows circleci  2023-08-31 15:29:54 -04:00
Adam Treat  b9fd0c25b2  Try and fix the rest of circleci for vulkan.  2023-08-31 15:29:54 -04:00
Adam Treat  85e34598f9  more circleci  2023-08-31 15:29:54 -04:00
Adam Treat  9f1cbad4f1  more Circleci  2023-08-31 15:29:54 -04:00
Adam Treat  202805637b  More circleci  2023-08-31 15:29:54 -04:00
Adam Treat  2832fad965  More circleci  2023-08-31 15:29:54 -04:00
Adam Treat  6a309e2ac8  More circleci  2023-08-31 15:29:54 -04:00
Adam Treat  94969a4199  More circleci  2023-08-31 15:29:54 -04:00
Adam Treat  1a2a9791bd  More circleci  2023-08-31 15:29:54 -04:00
Adam Treat  8d80f7963e  More circleci  2023-08-31 15:29:54 -04:00
Adam Treat  1723f82aaa  More circleci  2023-08-31 15:29:54 -04:00
Adam Treat  3bdc87ff4a  More circleci  2023-08-31 15:29:54 -04:00
Adam Treat  5e5a235639  More circleci  2023-08-31 15:29:54 -04:00
Adam Treat  4521c71b4e  More circleci  2023-08-31 15:29:54 -04:00
Adam Treat  2f1c995739  More circleci  2023-08-31 15:29:54 -04:00
Adam Treat  84e08858a8  Fix missing run in circleci  2023-08-31 15:29:54 -04:00
Adam Treat  6fd6369ab3  Fix yaml parsing  2023-08-31 15:29:54 -04:00
Adam Treat  54bc61e280  Make it work on gpt4all-backend linux circleci too.  2023-08-31 15:29:54 -04:00
Adam Treat  320eda9685  Get VulkanSDK installed on linux circleci.  2023-08-31 15:29:54 -04:00
Adam Treat  f578fa6cdf  Fix for windows.  2023-08-31 15:29:54 -04:00
Adam Treat  17d3e4976c  Add a comment indicating future work.  2023-08-31 15:29:54 -04:00
Adam Treat  7ec522dfb0  Lower case the som.  2023-08-31 15:29:54 -04:00
Adam Treat  7ae6bfc928  Add SOM to codespell ignore list.  2023-08-31 15:29:54 -04:00
Adam Treat  987546c63b  Nomic vulkan backend licensed under the Software for Open Models License (SOM), version 1.0.  2023-08-31 15:29:54 -04:00
Adam Treat  d55cbbee32  Update to newer llama.cpp and disable older forks.  2023-08-31 15:29:54 -04:00
Aaron Miller  0bc2274869  bump llama.cpp version + needed fixes for that  2023-08-31 15:29:54 -04:00
aaron miller  33c22be2aa  starcoder: use ggml_graph_plan  2023-08-31 15:29:54 -04:00
21 changed files with 616 additions and 87 deletions

View File

@@ -41,10 +41,12 @@ jobs:
- restore_cache: # this is the new step to restore cache
keys:
- linux-qt-cache
- run:
- run:
name: Setup Linux and Dependencies
command: |
sudo apt update && sudo apt install -y libfontconfig1 libfreetype6 libx11-6 libx11-xcb1 libxext6 libxfixes3 libxi6 libxrender1 libxcb1 libxcb-cursor0 libxcb-glx0 libxcb-keysyms1 libxcb-image0 libxcb-shm0 libxcb-icccm4 libxcb-sync1 libxcb-xfixes0 libxcb-shape0 libxcb-randr0 libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xkb1 libxkbcommon0 libxkbcommon-x11-0 bison build-essential flex gperf python3 gcc g++ libgl1-mesa-dev libwayland-dev
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt update && sudo apt install -y libfontconfig1 libfreetype6 libx11-6 libx11-xcb1 libxext6 libxfixes3 libxi6 libxrender1 libxcb1 libxcb-cursor0 libxcb-glx0 libxcb-keysyms1 libxcb-image0 libxcb-shm0 libxcb-icccm4 libxcb-sync1 libxcb-xfixes0 libxcb-shape0 libxcb-randr0 libxcb-render-util0 libxcb-util1 libxcb-xinerama0 libxcb-xkb1 libxkbcommon0 libxkbcommon-x11-0 bison build-essential flex gperf python3 gcc g++ libgl1-mesa-dev libwayland-dev vulkan-sdk
- run:
name: Installing Qt
command: |
@@ -92,12 +94,18 @@ jobs:
key: windows-qt-cache
paths:
- C:\Qt
- run:
name: Install VulkanSDK
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Build
command: |
$Env:PATH = "${Env:PATH};C:\Program Files (x86)\Windows Kits\10\bin\x64"
$Env:PATH = "${Env:PATH};C:\Program Files (x86)\Windows Kits\10\bin\10.0.22000.0\x64"
$Env:PATH = "${Env:PATH};C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30133\bin\HostX64\x64"
$Env:PATH = "${Env:PATH};C:\VulkanSDK\1.3.261.1\bin"
$Env:LIB = "${Env:LIB};C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22000.0\ucrt\x64"
$Env:LIB = "${Env:LIB};C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22000.0\um\x64"
$Env:LIB = "${Env:LIB};C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\VC\Tools\MSVC\14.29.30133\lib\x64"
@@ -117,6 +125,7 @@ jobs:
"-DCMAKE_BUILD_TYPE=Release" `
"-DCMAKE_PREFIX_PATH:PATH=C:\Qt\6.5.1\msvc2019_64" `
"-DCMAKE_MAKE_PROGRAM:FILEPATH=C:\Qt\Tools\Ninja\ninja.exe" `
"-DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON" `
"-S ..\gpt4all-chat" `
"-B ."
& "C:\Qt\Tools\Ninja\ninja.exe"
@@ -205,15 +214,20 @@ jobs:
command: aws cloudfront create-invalidation --distribution-id E1STQOW63QL2OH --paths "/*"
build-py-linux:
docker:
- image: circleci/python:3.8
machine:
image: ubuntu-2204:2023.04.2
steps:
- checkout
- run:
name: Set Python Version
command: pyenv global 3.11.2
- run:
name: Install dependencies
command: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update
sudo apt-get install -y cmake build-essential
sudo apt-get install -y cmake build-essential vulkan-sdk
pip install setuptools wheel cmake
- run:
name: Build C library
@@ -277,9 +291,15 @@ jobs:
- run:
name: Add MinGW64 to PATH
command: $env:Path += ";C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin"
- run:
name: Install VulkanSDK
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install dependencies
command: choco install -y cmake --installargs 'ADD_CMAKE_TO_PATH=System'
command:
choco install -y cmake --installargs 'ADD_CMAKE_TO_PATH=System'
- run:
name: Install Python dependencies
command: pip install setuptools wheel cmake
@@ -291,7 +311,8 @@ jobs:
cd gpt4all-backend
mkdir build
cd build
cmake -G "MinGW Makefiles" ..
$env:Path += ";C:\VulkanSDK\1.3.261.1\bin"
cmake -G "MinGW Makefiles" .. -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON
cmake --build . --parallel
- run:
name: Build wheel
@@ -343,8 +364,10 @@ jobs:
- run:
name: Install dependencies
command: |
wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
sudo apt-get update
sudo apt-get install -y cmake build-essential
sudo apt-get install -y cmake build-essential vulkan-sdk
- run:
name: Build Libraries
command: |
@@ -407,6 +430,11 @@ jobs:
- run:
name: Install MinGW64
command: choco install -y mingw --force --no-progress
- run:
name: Install VulkanSDK
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install dependencies
command: |
@@ -417,10 +445,11 @@ jobs:
$MinGWBin = "C:\ProgramData\chocolatey\lib\mingw\tools\install\mingw64\bin"
$Env:Path += ";$MinGwBin"
$Env:Path += ";C:\Program Files\CMake\bin"
$Env:Path += ";C:\VulkanSDK\1.3.261.1\bin"
cd gpt4all-backend
mkdir runtimes/win-x64
cd runtimes/win-x64
cmake -G "MinGW Makefiles" ../..
cmake -G "MinGW Makefiles" -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON ../..
cmake --build . --parallel --config Release
cp "$MinGWBin\libgcc*.dll" .
cp "$MinGWBin\libstdc++*.dll" .
@@ -443,6 +472,11 @@ jobs:
command: |
git submodule sync
git submodule update --init --recursive
- run:
name: Install VulkanSDK
command: |
Invoke-WebRequest -Uri https://sdk.lunarg.com/sdk/download/1.3.261.1/windows/VulkanSDK-1.3.261.1-Installer.exe -OutFile VulkanSDK-1.3.261.1-Installer.exe
.\VulkanSDK-1.3.261.1-Installer.exe --accept-licenses --default-answer --confirm-command install
- run:
name: Install dependencies
command: |
@@ -451,10 +485,11 @@ jobs:
name: Build Libraries
command: |
$Env:Path += ";C:\Program Files\CMake\bin"
$Env:Path += ";C:\VulkanSDK\1.3.261.1\bin"
cd gpt4all-backend
mkdir runtimes/win-x64_msvc
cd runtimes/win-x64_msvc
cmake -G "Visual Studio 17 2022" -A X64 ../..
cmake -G "Visual Studio 17 2022" -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -A X64 ../..
cmake --build . --parallel --config Release
cp bin/Release/*.dll .
- persist_to_workspace:

View File

@@ -1,3 +1,3 @@
[codespell]
ignore-words-list = blong, belong, afterall
ignore-words-list = blong, belong, afterall, som
skip = .git,*.pdf,*.svg,*.lock

LICENSE_SOM.txt (new file, 30 lines)
View File

@@ -0,0 +1,30 @@
Software for Open Models License (SOM)
Version 1.0 dated August 30th, 2023
This license governs use of the accompanying Software. If you use the Software, you accept this license. If you do not accept the license, do not use the Software.
This license is intended to encourage open release of models created, modified, processed, or otherwise used via the Software under open licensing terms, and should be interpreted in light of that intent.
1. Definitions
The “Licensor” is the person or entity who is making the Software available under this license. “Software” is the software made available by Licensor under this license.
A “Model” is the output of a machine learning algorithm, and excludes the Software.
“Model Source Materials” must include the Model and model weights, and may include any input data, input data descriptions, documentation or training descriptions for the Model.
“Open Licensing Terms” means: (a) any open source license approved by the Open Source Initiative, or (b) any other terms that make the Model Source Materials publicly available free of charge, and allow recipients to use, modify and distribute the Model Source Materials. Terms described in (b) may include reasonable restrictions such as non-commercial or non-production limitations, or require use in compliance with law.
2. Grant of Rights. Subject to the conditions and limitations in section 3:
(A) Copyright Grant. Licensor grants you a non-exclusive, worldwide, royalty-free copyright license to copy, modify, and distribute the Software and any modifications of the Software you create under this license. The foregoing license includes without limitation the right to create, modify, and use Models using this Software.
(B) Patent Grant. Licensor grants you a non-exclusive, worldwide, royalty-free license, under any patents owned or controlled by Licensor, to make, have made, use, sell, offer for sale, import, or otherwise exploit the Software. No license is granted to patent rights that are not embodied in the operation of the Software in the form provided by Licensor.
3. Conditions and Limitations
(A) Model Licensing and Access. If you use the Software to create, modify, process, or otherwise use any Model, including usage to create inferences with a Model, whether or not you make the Model available to others, you must make that Model Source Materials publicly available under Open Licensing Terms.
(B) No Re-Licensing. If you redistribute the Software, or modifications to the Software made under the license granted above, you must make it available only under the terms of this license. You may offer additional terms such as warranties, maintenance and support, but You, and not Licensor, are responsible for performing such terms.
(C) No Trademark License. This license does not grant you rights to use the Licensor's name, logo, or trademarks.
(D) If you assert in writing a claim against any person or entity alleging that the use of the Software infringes any patent, all of your licenses to the Software under Section 2 end automatically as of the date you asserted the claim.
(E) If you distribute any portion of the Software, you must retain all copyright, patent, trademark, and attribution notices that are present in the Software, and you must include a copy of this license.
(F) The Software is licensed “as-is.” You bear the entire risk of using it. Licensor gives You no express warranties, guarantees or conditions. You may have additional consumer rights under your local laws that this license cannot change. To the extent permitted under your local laws, the Licensor disclaims and excludes the implied warranties of merchantability, fitness for a particular purpose and non-infringement. To the extent this disclaimer is unlawful, you, and not Licensor, are responsible for any liability.

View File

@@ -20,7 +20,7 @@ endif()
include_directories("${CMAKE_CURRENT_BINARY_DIR}")
set(LLMODEL_VERSION_MAJOR 0)
set(LLMODEL_VERSION_MINOR 3)
set(LLMODEL_VERSION_MINOR 4)
set(LLMODEL_VERSION_PATCH 0)
set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}")
project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)
@@ -39,6 +39,10 @@ else()
message(STATUS "Interprocedural optimization support detected")
endif()
if(NOT APPLE)
set(LLAMA_KOMPUTE YES)
endif()
include(llama.cpp.cmake)
set(BUILD_VARIANTS default avxonly)
@@ -69,11 +73,6 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
# Include GGML
set(LLAMA_K_QUANTS YES)
include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON)
if (NOT LLAMA_METAL)
set(LLAMA_K_QUANTS NO)
include_ggml(llama.cpp-230511 -230511-${BUILD_VARIANT} ON)
include_ggml(llama.cpp-230519 -230519-${BUILD_VARIANT} ON)
endif()
# Function for preparing individual implementations
function(prepare_target TARGET_NAME BASE_LIB)
@@ -100,38 +99,32 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
add_library(replit-mainline-${BUILD_VARIANT} SHARED
replit.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
target_compile_definitions(replit-mainline-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
prepare_target(replit-mainline llama-mainline)
if (NOT LLAMA_METAL)
add_library(llamamodel-230519-${BUILD_VARIANT} SHARED
llamamodel.cpp llmodel_shared.cpp)
target_compile_definitions(llamamodel-230519-${BUILD_VARIANT} PRIVATE
LLAMA_VERSIONS===2 LLAMA_DATE=230519)
prepare_target(llamamodel-230519 llama-230519)
add_library(llamamodel-230511-${BUILD_VARIANT} SHARED
llamamodel.cpp llmodel_shared.cpp)
target_compile_definitions(llamamodel-230511-${BUILD_VARIANT} PRIVATE
LLAMA_VERSIONS=<=1 LLAMA_DATE=230511)
prepare_target(llamamodel-230511 llama-230511)
add_library(gptj-${BUILD_VARIANT} SHARED
gptj.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
prepare_target(gptj ggml-230511)
# FIXME: These need to be forward ported to latest ggml
# add_library(gptj-${BUILD_VARIANT} SHARED
# gptj.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
# prepare_target(gptj ggml-230511)
add_library(falcon-${BUILD_VARIANT} SHARED
falcon.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
target_compile_definitions(falcon-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
prepare_target(falcon llama-mainline)
add_library(mpt-${BUILD_VARIANT} SHARED
mpt.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
prepare_target(mpt ggml-230511)
# FIXME: These need to be forward ported to latest ggml
# add_library(mpt-${BUILD_VARIANT} SHARED
# mpt.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
# prepare_target(mpt ggml-230511)
add_library(bert-${BUILD_VARIANT} SHARED
bert.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
target_compile_definitions(bert-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
prepare_target(bert llama-mainline)
add_library(starcoder-${BUILD_VARIANT} SHARED
starcoder.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
target_compile_definitions(starcoder-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
prepare_target(starcoder llama-mainline)
endif()
endforeach()

View File

@@ -1,5 +1,6 @@
#define BERT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#include "bert_impl.h"
#include "llmodel_shared.h"
#include "ggml.h"
#include <cassert>
@@ -91,22 +92,6 @@ struct bert_model
};
// Replacement for std::vector<uint8_t> that doesn't require zero-initialization.
struct bert_buffer {
uint8_t * data = NULL;
size_t size = 0;
void resize(size_t size) {
delete[] data;
data = new uint8_t[size];
this->size = size;
}
~bert_buffer() {
delete[] data;
}
};
struct bert_ctx
{
bert_model model;
@@ -115,7 +100,8 @@ struct bert_ctx
size_t mem_per_token;
int64_t mem_per_input;
int32_t max_batch_n;
bert_buffer buf_compute;
llm_buffer buf_compute;
llm_buffer work_buf;
};
int32_t bert_n_embd(bert_ctx * ctx)
@@ -328,13 +314,12 @@ void bert_eval(
struct ggml_init_params params = {
.mem_size = buf_compute.size,
.mem_buffer = buf_compute.data,
.mem_buffer = buf_compute.addr,
.no_alloc = false,
};
struct ggml_context *ctx0 = ggml_init(params);
struct ggml_cgraph gf = {};
gf.n_threads = n_threads;
// Embeddings. word_embeddings + token_type_embeddings + position_embeddings
struct ggml_tensor *token_layer = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
@@ -466,7 +451,9 @@ void bert_eval(
ggml_tensor *output = inpL;
// run the computation
ggml_build_forward_expand(&gf, output);
ggml_graph_compute(ctx0, &gf);
//ggml_graph_compute_g4a()
ggml_graph_compute_g4a(ctx->work_buf, &gf, n_threads);
//ggml_graph_compute(ctx0, &gf);
// float *dat = ggml_get_data_f32(output);
@@ -633,7 +620,7 @@ struct bert_ctx * bert_load_from_file(const char *fname)
model_mem_req += n_layer * (n_intermediate * ggml_type_sizef(GGML_TYPE_F32)); // ff_i_b
model_mem_req += n_layer * (n_embd * ggml_type_sizef(GGML_TYPE_F32)); // ff_o_b
model_mem_req += (5 + 16 * n_layer) * 256; // object overhead
model_mem_req += (5 + 16 * n_layer) * ggml_tensor_overhead(); // object overhead
#if defined(DEBUG_BERT)
printf("%s: ggml ctx size = %6.2f MB\n", __func__, model_mem_req / (1024.0 * 1024.0));
@@ -1063,4 +1050,4 @@ DLL_EXPORT bool magic_match(std::istream& f) {
DLL_EXPORT LLModel *construct() {
return new Bert;
}
}
}

View File

@@ -75,7 +75,7 @@ public:
Dlhandle() : chandle(nullptr) {}
Dlhandle(const std::string& fpath) {
chandle = LoadLibraryA(fpath.c_str());
chandle = LoadLibraryExA(fpath.c_str(), NULL, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR);
if (!chandle) {
throw Exception("dlopen(\""+fpath+"\"): Error");
}

View File

@@ -1,3 +1,4 @@
#include "ggml.h"
#define FALCON_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#include "falcon_impl.h"
#include "llama.h"
@@ -64,6 +65,7 @@ struct falcon_model {
std::map<std::string, struct ggml_tensor*> tensors;
llm_buffer eval_buf;
llm_buffer work_buf;
llm_buffer scr0_buf;
llm_buffer scr1_buf;
};
@@ -446,7 +448,7 @@ bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_voca
// - embd_w: the predicted logits for the next token
//
bool falcon_eval(
const falcon_model & model,
falcon_model & model,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
@@ -473,7 +475,6 @@ bool falcon_eval(
struct ggml_context * ctx0 = ggml_init(eval_ctx_params);
struct ggml_cgraph gf = {};
gf.n_threads = n_threads;
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@@ -546,8 +547,8 @@ bool falcon_eval(
head_dim * (n_head + n_head_kv) * sizeof_wtype);
// using mode = 2 for neox mode
Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, head_dim, 2);
Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2);
Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, head_dim, 2, n_ctx);
Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2, n_ctx);
// store key and value to memory
{
@@ -678,7 +679,8 @@ bool falcon_eval(
// run the computation
ggml_build_forward_expand(&gf, inpL);
ggml_graph_compute (ctx0, &gf);
ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);
//if (n_past%100 == 0) {
// ggml_graph_print (&gf);

@@ -1 +1 @@
Subproject commit da760ac3829a89ab9d60ec797df8a570b9b8419a
Subproject commit ced231980e0f88b9c7b454c456256c71c4f3cb75

View File

@@ -1,3 +1,11 @@
#
# Copyright (c) 2023 Nomic, Inc. All rights reserved.
#
# This software is licensed under the terms of the Software for Open Models License (SOM),
# version 1.0, as detailed in the LICENSE_SOM.txt file. A copy of this license should accompany
# this software. Except as expressly granted in the SOM license, all rights are reserved by Nomic, Inc.
#
cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
@@ -145,6 +153,129 @@ if (LLAMA_OPENBLAS)
endif()
endif()
if (LLAMA_KOMPUTE)
find_package(Vulkan COMPONENTS glslc REQUIRED)
find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
if (NOT glslc_executable)
message(FATAL_ERROR "glslc not found")
endif()
set(LLAMA_DIR ${CMAKE_CURRENT_SOURCE_DIR}/llama.cpp-mainline)
function(compile_shader)
set(options)
set(oneValueArgs)
set(multiValueArgs SOURCES)
cmake_parse_arguments(compile_shader "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
foreach(source ${compile_shader_SOURCES})
get_filename_component(OP_FILE ${source} NAME)
set(spv_file ${CMAKE_CURRENT_BINARY_DIR}/${OP_FILE}.spv)
add_custom_command(
OUTPUT ${spv_file}
DEPENDS ${LLAMA_DIR}/${source}
COMMAND ${glslc_executable} --target-env=vulkan1.2 -o ${spv_file} ${LLAMA_DIR}/${source}
COMMENT "Compiling ${source} to ${source}.spv"
)
get_filename_component(RAW_FILE_NAME ${spv_file} NAME)
set(FILE_NAME "shader${RAW_FILE_NAME}")
string(REPLACE ".comp.spv" ".h" HEADER_FILE ${FILE_NAME})
string(TOUPPER ${HEADER_FILE} HEADER_FILE_DEFINE)
string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
add_custom_command(
OUTPUT ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${spv_file} >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
DEPENDS ${spv_file} xxd
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
)
endforeach()
endfunction()
if (EXISTS "${LLAMA_DIR}/kompute/CMakeLists.txt")
message(STATUS "Kompute found")
add_subdirectory(${LLAMA_DIR}/kompute)
# Compile our shaders
compile_shader(SOURCES
kompute/op_scale.comp
kompute/op_add.comp
kompute/op_addrow.comp
kompute/op_mul.comp
kompute/op_mulrow.comp
kompute/op_silu.comp
kompute/op_relu.comp
kompute/op_gelu.comp
kompute/op_softmax.comp
kompute/op_norm.comp
kompute/op_rmsnorm.comp
kompute/op_diagmask.comp
kompute/op_mul_mat_f16.comp
kompute/op_mul_mat_q4_0.comp
kompute/op_mul_mat_q4_1.comp
kompute/op_getrows_f16.comp
kompute/op_getrows_q4_0.comp
kompute/op_getrows_q4_1.comp
kompute/op_rope.comp
kompute/op_cpy_f16_f16.comp
kompute/op_cpy_f16_f32.comp
kompute/op_cpy_f32_f16.comp
kompute/op_cpy_f32_f32.comp
)
# Create a custom target for our generated shaders
add_custom_target(generated_shaders DEPENDS
shaderop_scale.h
shaderop_add.h
shaderop_addrow.h
shaderop_mul.h
shaderop_mulrow.h
shaderop_silu.h
shaderop_relu.h
shaderop_gelu.h
shaderop_softmax.h
shaderop_norm.h
shaderop_rmsnorm.h
shaderop_diagmask.h
shaderop_mul_mat_f16.h
shaderop_mul_mat_q4_0.h
shaderop_mul_mat_q4_1.h
shaderop_getrows_f16.h
shaderop_getrows_q4_0.h
shaderop_getrows_q4_1.h
shaderop_rope.h
shaderop_cpy_f16_f16.h
shaderop_cpy_f16_f32.h
shaderop_cpy_f32_f16.h
shaderop_cpy_f32_f32.h
)
# Create a custom command that depends on the generated_shaders
add_custom_command(
OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
COMMAND ${CMAKE_COMMAND} -E touch ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp
DEPENDS generated_shaders
COMMENT "Ensuring shaders are generated before compiling ggml-vulkan.cpp"
)
# Add the stamp to the main sources to ensure dependency tracking
set(GGML_SOURCES_KOMPUTE ${LLAMA_DIR}/ggml-vulkan.cpp ${LLAMA_DIR}/ggml-vulkan.h ${CMAKE_CURRENT_BINARY_DIR}/ggml-vulkan.stamp)
add_compile_definitions(GGML_USE_KOMPUTE)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} kompute)
set(LLAMA_EXTRA_INCLUDES ${LLAMA_EXTRA_INCLUDES} ${CMAKE_BINARY_DIR})
else()
message(WARNING "Kompute not found")
endif()
endif()
if (LLAMA_ALL_WARNINGS)
if (NOT MSVC)
set(c_flags
@@ -296,10 +427,13 @@ function(include_ggml DIRECTORY SUFFIX WITH_LLAMA)
add_library(ggml${SUFFIX} OBJECT
${DIRECTORY}/ggml.c
${DIRECTORY}/ggml.h
${DIRECTORY}/ggml-alloc.c
${DIRECTORY}/ggml-alloc.h
${GGML_SOURCES_QUANT_K}
${GGML_SOURCES_CUDA}
${GGML_METAL_SOURCES}
${GGML_OPENCL_SOURCES})
${GGML_OPENCL_SOURCES}
${GGML_SOURCES_KOMPUTE})
if (LLAMA_K_QUANTS)
target_compile_definitions(ggml${SUFFIX} PUBLIC GGML_USE_K_QUANTS)

View File

@@ -28,6 +28,9 @@
#include <llama.h>
#include <ggml.h>
#ifdef GGML_USE_KOMPUTE
#include "ggml-vulkan.h"
#endif
namespace {
const char *modelType_ = "LLaMA";
@@ -155,6 +158,13 @@ bool LLamaModel::loadModel(const std::string &modelPath)
// currently
d_ptr->params.n_gpu_layers = 1;
#endif
#ifdef GGML_USE_KOMPUTE
if (ggml_vk_has_device()) {
// vulkan always runs the whole model if n_gpu_layers is not 0, at least
// currently
d_ptr->params.n_gpu_layers = 1;
}
#endif
d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);
if (!d_ptr->ctx) {
@@ -162,6 +172,12 @@ bool LLamaModel::loadModel(const std::string &modelPath)
return false;
}
#ifdef GGML_USE_KOMPUTE
if (ggml_vk_has_device()) {
std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
}
#endif
d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
d_ptr->modelLoaded = true;
fflush(stderr);
@@ -252,6 +268,75 @@ const std::vector<LLModel::Token> &LLamaModel::endTokens() const
return fres;
}
#if defined(GGML_USE_KOMPUTE)
#include "ggml-vulkan.h"
#endif
std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired)
{
#if defined(GGML_USE_KOMPUTE)
std::vector<ggml_vk_device> vkDevices = ggml_vk_available_devices(memoryRequired);
std::vector<LLModel::GPUDevice> devices;
for(const auto& vkDevice : vkDevices) {
LLModel::GPUDevice device;
device.index = vkDevice.index;
device.type = vkDevice.type;
device.heapSize = vkDevice.heapSize;
device.name = vkDevice.name;
device.vendor = vkDevice.vendor;
devices.push_back(device);
}
return devices;
#else
return std::vector<LLModel::GPUDevice>();
#endif
}
bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string& device)
{
#if defined(GGML_USE_KOMPUTE)
return ggml_vk_init_device(memoryRequired, device);
#else
return false;
#endif
}
bool LLamaModel::initializeGPUDevice(const LLModel::GPUDevice &device)
{
#if defined(GGML_USE_KOMPUTE)
ggml_vk_device vkDevice;
vkDevice.index = device.index;
vkDevice.type = device.type;
vkDevice.heapSize = device.heapSize;
vkDevice.name = device.name;
vkDevice.vendor = device.vendor;
return ggml_vk_init_device(vkDevice);
#else
return false;
#endif
}
bool LLamaModel::initializeGPUDevice(int device)
{
#if defined(GGML_USE_KOMPUTE)
return ggml_vk_init_device(device);
#else
return false;
#endif
}
bool LLamaModel::hasGPUDevice()
{
#if defined(GGML_USE_KOMPUTE)
return ggml_vk_has_device();
#else
return false;
#endif
}
#if defined(_WIN32)
#define DLL_EXPORT __declspec(dllexport)
#else

View File

@@ -25,6 +25,11 @@ public:
size_t restoreState(const uint8_t *src) override;
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) override;
bool initializeGPUDevice(size_t memoryRequired, const std::string& device) override;
bool initializeGPUDevice(const GPUDevice &device) override;
bool initializeGPUDevice(int device) override;
bool hasGPUDevice() override;
private:
LLamaPrivate *d_ptr;

View File

@@ -58,6 +58,14 @@ public:
// window
};
struct GPUDevice {
int index = 0;
int type = 0;
size_t heapSize = 0;
std::string name;
std::string vendor;
};
explicit LLModel() {}
virtual ~LLModel() {}
@@ -87,6 +95,12 @@ public:
return *m_implementation;
}
virtual std::vector<GPUDevice> availableGPUDevices(size_t /*memoryRequired*/) { return std::vector<GPUDevice>(); }
virtual bool initializeGPUDevice(size_t /*memoryRequired*/, const std::string& /*device*/) { return false; }
virtual bool initializeGPUDevice(const GPUDevice &/*device*/) { return false; }
virtual bool initializeGPUDevice(int /*device*/) { return false; }
virtual bool hasGPUDevice() { return false; }
protected:
// These are pure virtual because subclasses need to implement as the default implementation of
// 'prompt' above calls these functions

View File

@@ -5,7 +5,6 @@
#include <cerrno>
#include <utility>
struct LLModelWrapper {
LLModel *llModel = nullptr;
LLModel::PromptContext promptContext;
@@ -210,3 +209,57 @@ const char *llmodel_get_implementation_search_path()
{
return LLModel::Implementation::implementationsSearchPath().c_str();
}
struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
std::vector<LLModel::GPUDevice> devices = wrapper->llModel->availableGPUDevices(memoryRequired);
// Set the num_devices
*num_devices = devices.size();
if (*num_devices == 0) return nullptr; // Return nullptr if no devices are found
// Allocate memory for the output array
struct llmodel_gpu_device* output = (struct llmodel_gpu_device*) malloc(*num_devices * sizeof(struct llmodel_gpu_device));
for (int i = 0; i < *num_devices; i++) {
output[i].index = devices[i].index;
output[i].type = devices[i].type;
output[i].heapSize = devices[i].heapSize;
output[i].name = strdup(devices[i].name.c_str()); // Convert std::string to char* and allocate memory
output[i].vendor = strdup(devices[i].vendor.c_str()); // Convert std::string to char* and allocate memory
}
return output;
}
bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
return wrapper->llModel->initializeGPUDevice(memoryRequired, std::string(device));
}
bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device)
{
LLModel::GPUDevice d;
d.index = device->index;
d.type = device->type;
d.heapSize = device->heapSize;
d.name = device->name;
d.vendor = device->vendor;
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
return wrapper->llModel->initializeGPUDevice(d);
}
bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
return wrapper->llModel->initializeGPUDevice(device);
}
bool llmodel_has_gpu_device(llmodel_model model)
{
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
return wrapper->llModel->hasGPUDevice();
}

View File

@@ -56,8 +56,18 @@ struct llmodel_prompt_context {
int32_t repeat_last_n; // last n tokens to penalize
float context_erase; // percent of context to erase if we exceed the context window
};
struct llmodel_gpu_device {
int index = 0;
int type = 0; // same as VkPhysicalDeviceType
size_t heapSize = 0;
const char * name;
const char * vendor;
};
#ifndef __cplusplus
typedef struct llmodel_prompt_context llmodel_prompt_context;
typedef struct llmodel_gpu_device llmodel_gpu_device;
#endif
/**
@@ -218,6 +228,50 @@ void llmodel_set_implementation_search_path(const char *path);
*/
const char *llmodel_get_implementation_search_path();
/**
* Get a list of available GPU devices given the memory required.
* @return A pointer to an array of llmodel_gpu_device's whose number is given by num_devices.
*/
struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices);
/**
* Initializes a GPU device based on a specified string criterion.
*
* This function initializes a GPU device based on a string identifier provided. The function
* allows initialization based on general device type ("gpu"), vendor name ("amd", "nvidia", "intel"),
* or any specific device name.
*
* @param memoryRequired The amount of memory (in bytes) required by the application or task
* that will utilize the GPU device.
* @param device A string specifying the desired criterion for GPU device selection. It can be:
* - "gpu": To initialize the best available GPU.
* - "amd", "nvidia", or "intel": To initialize the best available GPU from that vendor.
* - A specific GPU device name: To initialize a GPU with that exact name.
*
* @return True if the GPU device is successfully initialized based on the provided string
* criterion. Returns false if the desired GPU device could not be initialized.
*/
bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device);
/**
* Initializes a GPU device by specifying a valid gpu device pointer.
* @param device A gpu device pointer.
* @return True if the GPU device is successfully initialized, false otherwise.
*/
bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device);
/**
* Initializes a GPU device by its index.
* @param device An integer representing the index of the GPU device to be initialized.
* @return True if the GPU device is successfully initialized, false otherwise.
*/
bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device);
/**
* @return True if a GPU device is successfully initialized, false otherwise.
*/
bool llmodel_has_gpu_device(llmodel_model model);
#ifdef __cplusplus
}
#endif

View File

@@ -1,8 +1,52 @@
#pragma once
#include <cstdint>
#include <cstddef>
#include <vector>
#include <ggml.h>
#if defined(GGML_USE_KOMPUTE)
#include "ggml-vulkan.h"
struct llm_buffer {
uint8_t * addr = NULL;
size_t size = 0;
ggml_vk_memory memory;
llm_buffer() = default;
void resize(size_t size) {
free();
if (!ggml_vk_has_device()) {
this->addr = new uint8_t[size];
this->size = size;
} else {
this->memory = ggml_vk_allocate(size);
this->addr = (uint8_t*)memory.data;
this->size = size;
}
}
void free() {
if (!memory.primaryMemory) {
delete[] addr;
} else if (memory.data) {
ggml_vk_free_memory(memory);
}
this->addr = NULL;
this->size = 0;
}
~llm_buffer() {
free();
}
// disable copy and move
llm_buffer(const llm_buffer&) = delete;
llm_buffer(llm_buffer&&) = delete;
llm_buffer& operator=(const llm_buffer&) = delete;
llm_buffer& operator=(llm_buffer&&) = delete;
};
#else
struct llm_buffer {
uint8_t * addr = NULL;
size_t size = 0;
@@ -17,6 +61,7 @@ struct llm_buffer {
delete[] addr;
}
};
#endif
struct llm_kv_cache {
struct ggml_tensor * k;
@@ -34,3 +79,14 @@ struct llm_kv_cache {
}
}
};
#if LLAMA_DATE >= 230519
inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads) {
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
if (plan.work_size > 0) {
buf.resize(plan.work_size);
plan.work_data = buf.addr;
}
ggml_graph_compute(graph, &plan);
}
#endif

View File

@@ -196,6 +196,7 @@ struct replit_model {
struct ggml_context * ctx;
llm_buffer eval_buf;
llm_buffer work_buf;
llm_buffer scr0_buf;
llm_buffer scr1_buf;
#ifdef GGML_USE_METAL
@@ -490,7 +491,7 @@ bool replit_model_load(const std::string & fname, std::istream &fin, replit_mode
model.scr1_buf.resize(256u * 1024 * 1024);
#ifdef GGML_USE_METAL
model.ctx_metal = ggml_metal_init();
model.ctx_metal = ggml_metal_init(1);
void* data_ptr = ggml_get_mem_buffer(model.ctx);
size_t data_size = ggml_get_mem_size(model.ctx);
const size_t max_size = ggml_get_max_tensor_size(model.ctx);
@@ -534,7 +535,7 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t
// - embd_inp: the embeddings of the tokens in the context
// - embd_w: the predicted logits for the next token
//
bool replit_eval(const replit_model & model, const int n_threads, const int n_past,
bool replit_eval(replit_model & model, const int n_threads, const int n_past,
const std::vector<gpt_vocab::id> & embd_inp, std::vector<float> & embd_w, size_t & mem_per_token) {
const int N = embd_inp.size();
@@ -552,7 +553,7 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa
.no_alloc = false,
};
struct ggml_context * ctx0 = ggml_init(eval_ctx_params);
struct ggml_cgraph gf = {.n_threads = n_threads};
struct ggml_cgraph gf = {};
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(embd->data, embd_inp.data(), N * ggml_element_size(embd));
@@ -706,10 +707,10 @@ bool replit_eval(const replit_model & model, const int n_threads, const int n_pa
ggml_metal_get_tensor(model.ctx_metal, model.kv_self.k);
ggml_metal_get_tensor(model.ctx_metal, model.kv_self.v);
ggml_graph_compute(ctx0, &gf);
ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);
}
#else
ggml_graph_compute(ctx0, &gf);
ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);
#endif
// std::cout << "Qcur" << std::endl;

View File

@@ -73,6 +73,7 @@ struct starcoder_model {
llm_buffer eval_buf;
llm_buffer scr0_buf;
llm_buffer scr1_buf;
llm_buffer work_buf;
};
static bool kv_cache_init(
@@ -452,7 +453,7 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp
// - embd_w: the predicted logits for the next token
//
bool starcoder_eval(
const starcoder_model & model,
starcoder_model & model,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
@@ -477,7 +478,6 @@ bool starcoder_eval(
struct ggml_context * ctx0 = ggml_init(eval_ctx_params);
struct ggml_cgraph gf = {};
gf.n_threads = n_threads;
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
@@ -730,7 +730,7 @@ bool starcoder_eval(
// run the computation
ggml_build_forward_expand(&gf, inpL);
ggml_graph_compute (ctx0, &gf);
ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);
//if (n_past%100 == 0) {
// ggml_graph_print (&gf);

View File

@@ -66,6 +66,7 @@ class GPT4All:
model_type: Optional[str] = None,
allow_download: bool = True,
n_threads: Optional[int] = None,
device: Optional[str] = "cpu",
):
"""
Constructor
@@ -78,11 +79,22 @@ class GPT4All:
descriptive identifier for user. Default is None.
allow_download: Allow API to download models from gpt4all.io. Default is True.
n_threads: number of CPU threads used by GPT4All. Default is None, then the number of threads are determined automatically.
device: The processing unit on which the GPT4All model will run. It can be set to:
- "cpu": Model will run on the central processing unit.
- "gpu": Model will run on the best available graphics processing unit, irrespective of its vendor.
- "amd", "nvidia", "intel": Model will run on the best available GPU from the specified vendor.
Alternatively, a specific GPU name can also be provided, and the model will run on the GPU that matches the name if it's available.
Default is "cpu".
Note: If a selected GPU device does not have sufficient RAM to accommodate the model, an error will be thrown, and the GPT4All instance will be rendered invalid. It's advised to ensure the device has enough memory before initiating the model.
"""
self.model_type = model_type
self.model = pyllmodel.LLModel()
# Retrieve model and download if allowed
self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download)
if device is not None:
if device != "cpu":
self.model.init_gpu(model_path=self.config["path"], device=device)
self.model.load_model(self.config["path"])
# Set n_threads
if n_threads is not None:

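The constructor change above is the user-facing entry point for the Vulkan work in this comparison. A minimal usage sketch of the new device argument follows; the model filename is a placeholder (not something introduced by this change), and generate() is the bindings' existing text-generation helper:

    from gpt4all import GPT4All

    MODEL = "some-model.ggmlv3.q4_0.bin"  # placeholder filename

    # device="gpu" requests the best available Vulkan device. Per the code above,
    # GPU initialization raises ValueError when no suitable device is found, so
    # fall back to the CPU explicitly rather than expecting an automatic downgrade.
    try:
        model = GPT4All(MODEL, device="gpu")
    except ValueError:
        model = GPT4All(MODEL, device="cpu")

    print(model.generate("Name three colors:"))
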
View File

@@ -70,6 +70,14 @@ class LLModelPromptContext(ctypes.Structure):
("context_erase", ctypes.c_float),
]
class LLModelGPUDevice(ctypes.Structure):
_fields_ = [
("index", ctypes.c_int32),
("type", ctypes.c_int32),
("heapSize", ctypes.c_size_t),
("name", ctypes.c_char_p),
("vendor", ctypes.c_char_p),
]
# Define C function signatures using ctypes
llmodel.llmodel_model_create.argtypes = [ctypes.c_char_p]
@@ -125,6 +133,20 @@ llmodel.llmodel_threadCount.restype = ctypes.c_int32
llmodel.llmodel_set_implementation_search_path(MODEL_LIB_PATH.encode("utf-8"))
llmodel.llmodel_available_gpu_devices.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.POINTER(ctypes.c_int32)]
llmodel.llmodel_available_gpu_devices.restype = ctypes.POINTER(LLModelGPUDevice)
llmodel.llmodel_gpu_init_gpu_device_by_string.argtypes = [ctypes.c_void_p, ctypes.c_size_t, ctypes.c_char_p]
llmodel.llmodel_gpu_init_gpu_device_by_string.restype = ctypes.c_bool
llmodel.llmodel_gpu_init_gpu_device_by_struct.argtypes = [ctypes.c_void_p, ctypes.POINTER(LLModelGPUDevice)]
llmodel.llmodel_gpu_init_gpu_device_by_struct.restype = ctypes.c_bool
llmodel.llmodel_gpu_init_gpu_device_by_int.argtypes = [ctypes.c_void_p, ctypes.c_int32]
llmodel.llmodel_gpu_init_gpu_device_by_int.restype = ctypes.c_bool
llmodel.llmodel_has_gpu_device.argtypes = [ctypes.c_void_p]
llmodel.llmodel_has_gpu_device.restype = ctypes.c_bool
ResponseCallbackType = Callable[[int, str], bool]
RawResponseCallbackType = Callable[[int, bytes], bool]
@@ -169,6 +191,60 @@ class LLModel:
else:
raise ValueError("Unable to instantiate model")
def list_gpu(self, model_path: str) -> list:
"""
Lists available GPU devices that satisfy the model's memory requirements.
Parameters
----------
model_path : str
Path to the model.
Returns
-------
list
A list of LLModelGPUDevice structures representing available GPU devices.
"""
if self.model is not None:
model_path_enc = model_path.encode("utf-8")
mem_required = llmodel.llmodel_required_mem(self.model, model_path_enc)
else:
mem_required = self.memory_needed(model_path)
num_devices = ctypes.c_int32(0)
devices_ptr = self.llmodel_lib.llmodel_available_gpu_devices(self.model, mem_required, ctypes.byref(num_devices))
if not devices_ptr:
raise ValueError("Unable to retrieve available GPU devices")
devices = [devices_ptr[i] for i in range(num_devices.value)]
return devices
def init_gpu(self, model_path: str, device: str):
if self.model is not None:
model_path_enc = model_path.encode("utf-8")
mem_required = llmodel.llmodel_required_mem(self.model, model_path_enc)
else:
mem_required = self.memory_needed(model_path)
device_enc = device.encode("utf-8")
success = self.llmodel_lib.llmodel_gpu_init_gpu_device_by_string(self.model, mem_required, device_enc)
if not success:
# Retrieve all GPUs without considering memory requirements.
num_devices = ctypes.c_int32(0)
all_devices_ptr = self.llmodel_lib.llmodel_available_gpu_devices(self.model, 0, ctypes.byref(num_devices))
if not all_devices_ptr:
raise ValueError("Unable to retrieve list of all GPU devices")
all_gpus = [all_devices_ptr[i].name.decode('utf-8') for i in range(num_devices.value)]
# Retrieve GPUs that meet the memory requirements using list_gpu
available_gpus = [device.name.decode('utf-8') for device in self.list_gpu(model_path)]
# Identify GPUs that are unavailable due to insufficient memory or features
unavailable_gpus = set(all_gpus) - set(available_gpus)
# Formulate the error message
error_msg = "Unable to initialize model on GPU: '{}'.".format(device)
error_msg += "\nAvailable GPUs: {}.".format(available_gpus)
error_msg += "\nUnavailable GPUs due to insufficient memory or features: {}.".format(unavailable_gpus)
raise ValueError(error_msg)
def load_model(self, model_path: str) -> bool:
"""
Load model from a file.

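The same flow can be driven through the lower-level helpers added in this file (list_gpu, init_gpu, load_model). A hedged sketch, assuming a local model file at a placeholder path:

    from gpt4all import pyllmodel

    MODEL_PATH = "/path/to/some-model.bin"  # placeholder path

    llm = pyllmodel.LLModel()
    try:
        # Devices with enough free memory for this model, via llmodel_available_gpu_devices.
        gpus = llm.list_gpu(MODEL_PATH)
        print("usable GPUs:", [d.name.decode("utf-8") for d in gpus])
        # Accepts "gpu", a vendor name ("amd", "nvidia", "intel"), or an exact device name.
        llm.init_gpu(model_path=MODEL_PATH, device="gpu")
    except ValueError as err:
        print("GPU init failed, staying on CPU:", err)

    llm.load_model(MODEL_PATH)
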
View File

@@ -180,18 +180,10 @@ install(TARGETS llmodel DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
# We should probably iterate through the list of the cmake for backend, but these need to be installed
# to the this component's dir for the finicky qt installer to work
install(TARGETS gptj-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS gptj-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llama-230511-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llama-230511-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llama-230519-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llama-230519-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
#install(TARGETS gptj-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
#install(TARGETS gptj-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llama-mainline-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llama-mainline-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llamamodel-230511-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llamamodel-230511-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llamamodel-230519-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llamamodel-230519-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llamamodel-mainline-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS llamamodel-mainline-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
if(APPLE)
@@ -199,8 +191,8 @@ install(TARGETS llamamodel-mainline-metal DESTINATION lib COMPONENT ${COMPONENT_
endif()
install(TARGETS falcon-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS falcon-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS mpt-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS mpt-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
#install(TARGETS mpt-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
#install(TARGETS mpt-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS replit-mainline-avxonly DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
install(TARGETS replit-mainline-default DESTINATION lib COMPONENT ${COMPONENT_NAME_MAIN})
if(APPLE)

View File

@@ -401,7 +401,7 @@ bool ChatLLM::handlePrompt(int32_t token)
#endif
++m_promptTokens;
++m_promptResponseTokens;
m_timer->inc();
m_timer->start();
return !m_stopGenerating;
}