Compare commits


55 Commits

Author SHA1 Message Date
Jacob Nguyen  3dde1d977c  fix typings and vulkan build works on win  2023-09-16 13:59:16 -05:00
Jacob Nguyen  af28bd0579  Merge branch 'main' into feat(ts)/gpu  2023-09-16 13:45:25 -05:00
Adam Treat  1e5d52fd3e  Release notes for v2.4.19 and bump the version.  2023-09-16 13:44:08 -05:00
Adam Treat  6ab97c4487  Fix for crashes on systems where vulkan is not installed properly.  2023-09-16 13:44:08 -05:00
Adam Treat  7f46228bf5  Release notes for v2.4.18 and bump the version.  2023-09-16 13:44:08 -05:00
Adam Treat  ce51f82cb3  Actually bump the version.  2023-09-16 13:44:08 -05:00
Adam Treat  d713c4c655  Send actual and requested device info for those who have opt-in.  2023-09-16 13:44:08 -05:00
Adam Treat  3c9acadcf3  Link against ggml in bin so we can get the available devices without loading a model.  2023-09-16 13:44:08 -05:00
Adam Treat  45706602e2  Bump the Python version to python-v1.0.12 to restrict the quants that vulkan recognizes.  2023-09-16 13:44:08 -05:00
Adam Treat  81bdcc7c91  Release notes for v2.4.17 and bump the version.  2023-09-16 13:44:07 -05:00
Adam Treat  635b40d832  Fallback to CPU more robustly.  2023-09-16 13:44:07 -05:00
Adam Treat  b63c162c25  Release notes for v2.4.16 and bump the version.  2023-09-16 13:44:07 -05:00
Adam Treat  780da62cc0  Bump to new llama with new bugfix.  2023-09-16 13:44:07 -05:00
Adam Treat  6eb6f23929  Only show GPU when we're actually using it.  2023-09-16 13:44:07 -05:00
Adam Treat  3c5b5f05ef  Report the actual device we're using.  2023-09-16 13:44:07 -05:00
Adam Treat  11e459ecef  Sync to a newer version of llama.cpp with bugfix for vulkan.  2023-09-16 13:44:07 -05:00
Adam Treat  0a19cef006  Fix a bug where we're not properly falling back to CPU.  2023-09-16 13:44:07 -05:00
Adam Treat  ce9f64e8bc  Add version 2.4.15 and bump the version number.  2023-09-16 13:44:07 -05:00
Adam Treat  2a913ca301  Update the submodule.  2023-09-16 13:44:07 -05:00
Aaron Miller  299dabe7cd  init at most one vulkan device, submodule update (fixes issues w/ multiple of the same gpu)  2023-09-16 13:44:07 -05:00
Adam Treat  ea41e60745  Fix up the name and formatting.  2023-09-16 13:44:07 -05:00
Adam Treat  74b48005a5  Show the device we're currently using.  2023-09-16 13:44:07 -05:00
Adam Treat  246ba226f2  When device is Auto (the default) then we will only consider discrete GPU's otherwise fallback to CPU.  2023-09-16 13:44:07 -05:00
Adam Treat  be830350b0  Bring the vulkan backend to the GUI.  2023-09-16 13:44:07 -05:00
Aaron Miller  e5b0d2de51  vulkan python bindings on windows fixes  2023-09-16 13:44:06 -05:00
Adam Treat  71e2000552  Don't link against libvulkan.  2023-09-16 13:44:06 -05:00
Aaron Miller  73ff1c417b  bump python version (library linking fix)  2023-09-16 13:44:06 -05:00
Aaron Miller  463d9cb258  remove extra dynamic linker deps when building with vulkan  2023-09-16 13:44:06 -05:00
Jacob Nguyen  e86c63750d  Update llama.cpp.cmake (Signed-off-by: Jacob Nguyen <76754747+jacoobes@users.noreply.github.com>)  2023-09-16 11:42:56 -07:00
Adam Treat  f47e698193  Release notes for v2.4.19 and bump the version.  2023-09-16 12:35:08 -04:00
Adam Treat  84905aa281  Fix for crashes on systems where vulkan is not installed properly.  2023-09-16 12:19:46 -04:00
Adam Treat  ecf014f03b  Release notes for v2.4.18 and bump the version.  2023-09-16 10:21:50 -04:00
Adam Treat  e6e724d2dc  Actually bump the version.  2023-09-16 10:07:20 -04:00
Adam Treat  06a833e652  Send actual and requested device info for those who have opt-in.  2023-09-16 09:42:22 -04:00
Adam Treat  045f6e6cdc  Link against ggml in bin so we can get the available devices without loading a model.  2023-09-15 14:45:25 -04:00
Adam Treat  0f046cf905  Bump the Python version to python-v1.0.12 to restrict the quants that vulkan recognizes.  2023-09-15 09:12:20 -04:00
Adam Treat  655372dbfa  Release notes for v2.4.17 and bump the version.  2023-09-14 17:11:04 -04:00
Adam Treat  aa33419c6e  Fallback to CPU more robustly.  2023-09-14 16:53:11 -04:00
Adam Treat  79843c269e  Release notes for v2.4.16 and bump the version.  2023-09-14 11:24:25 -04:00
Adam Treat  9013a089bd  Bump to new llama with new bugfix.  2023-09-14 10:02:11 -04:00
Adam Treat  3076e0bf26  Only show GPU when we're actually using it.  2023-09-14 09:59:19 -04:00
Adam Treat  1fa67a585c  Report the actual device we're using.  2023-09-14 08:25:37 -04:00
Adam Treat  cf4eb530ce  Sync to a newer version of llama.cpp with bugfix for vulkan.  2023-09-13 21:01:44 -04:00
Adam Treat  21a3244645  Fix a bug where we're not properly falling back to CPU.  2023-09-13 19:30:27 -04:00
Adam Treat  0458c9b4e6  Add version 2.4.15 and bump the version number.  2023-09-13 17:55:50 -04:00
Adam Treat  4b9a345aee  Update the submodule.  2023-09-13 17:05:46 -04:00
Aaron Miller  6f038c136b  init at most one vulkan device, submodule update (fixes issues w/ multiple of the same gpu)  2023-09-13 12:49:53 -07:00
Adam Treat  86e862df7e  Fix up the name and formatting.  2023-09-13 15:48:55 -04:00
Adam Treat  358ff2a477  Show the device we're currently using.  2023-09-13 15:24:33 -04:00
Adam Treat  891ddafc33  When device is Auto (the default) then we will only consider discrete GPU's otherwise fallback to CPU.  2023-09-13 11:59:36 -04:00
Adam Treat  8f99dca70f  Bring the vulkan backend to the GUI.  2023-09-13 11:26:10 -04:00
Aaron Miller  f0735efa7d  vulkan python bindings on windows fixes  2023-09-12 14:16:02 -07:00
Adam Treat  c953b321b7  Don't link against libvulkan.  2023-09-12 14:26:56 -04:00
Aaron Miller  0ad1472b62  bump python version (library linking fix)  2023-09-11 09:42:06 -07:00
Aaron Miller  c4d23512e4  remove extra dynamic linker deps when building with vulkan  2023-09-11 08:44:39 -07:00
24 changed files with 338 additions and 49 deletions


@@ -312,7 +312,7 @@ jobs:
mkdir build
cd build
$env:Path += ";C:\VulkanSDK\1.3.261.1\bin"
cmake -G "MinGW Makefiles" .. -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON
cmake -G "MinGW Makefiles" .. -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DKOMPUTE_OPT_USE_BUILT_IN_VULKAN_HEADER=OFF
cmake --build . --parallel
- run:
name: Build wheel


@@ -134,6 +134,8 @@ add_library(llmodel
llmodel_c.h llmodel_c.cpp
dlhandle.h
)
target_link_libraries(llmodel PRIVATE ggml-mainline-default)
target_compile_definitions(llmodel PRIVATE GGML_BUILD_VARIANT="default")
target_compile_definitions(llmodel PRIVATE LIB_FILE_EXT="${CMAKE_SHARED_LIBRARY_SUFFIX}")
set_target_properties(llmodel PROPERTIES

@@ -1 +1 @@
Subproject commit ced231980e0f88b9c7b454c456256c71c4f3cb75
Subproject commit 0631ea363c14335969095976bbe17bf20503bc6d


@@ -154,6 +154,7 @@ if (LLAMA_OPENBLAS)
endif()
if (LLAMA_KOMPUTE)
add_compile_definitions(VULKAN_HPP_DISPATCH_LOADER_DYNAMIC=1)
find_package(Vulkan COMPONENTS glslc REQUIRED)
find_program(glslc_executable NAMES glslc HINTS Vulkan::glslc)
if (NOT glslc_executable)
@@ -184,19 +185,35 @@ if (LLAMA_KOMPUTE)
string(REPLACE "." "_" HEADER_FILE_DEFINE "${HEADER_FILE_DEFINE}")
set(OUTPUT_HEADER_FILE "${HEADER_FILE}")
message(STATUS "${HEADER_FILE} generating ${HEADER_FILE_DEFINE}")
add_custom_command(
OUTPUT ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${spv_file} >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
DEPENDS ${spv_file} xxd
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
)
if(CMAKE_GENERATOR MATCHES "Visual Studio")
add_custom_command(
OUTPUT ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_BINARY_DIR}/bin/${CMAKE_BUILD_TYPE}/xxd -i ${spv_file} >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
DEPENDS ${spv_file} xxd
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/${CMAKE_BUILD_TYPE}/xxd"
)
else()
add_custom_command(
OUTPUT ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "/*THIS FILE HAS BEEN AUTOMATICALLY GENERATED - DO NOT EDIT*/" > ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#ifndef ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "namespace kp {" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "namespace shader_data {" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_BINARY_DIR}/bin/xxd -i ${spv_file} >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo "}}" >> ${OUTPUT_HEADER_FILE}
COMMAND ${CMAKE_COMMAND} -E echo \"\#endif // define ${HEADER_FILE_DEFINE}\" >> ${OUTPUT_HEADER_FILE}
DEPENDS ${spv_file} xxd
COMMENT "Converting to hpp: ${FILE_NAME} ${CMAKE_BINARY_DIR}/bin/xxd"
)
endif()
endforeach()
endfunction()


@@ -168,6 +168,10 @@ bool LLamaModel::loadModel(const std::string &modelPath)
d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);
if (!d_ptr->ctx) {
#ifdef GGML_USE_KOMPUTE
// Explicitly free the device so next load it doesn't use it
ggml_vk_free_device();
#endif
std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
return false;
}
@@ -194,7 +198,7 @@ int32_t LLamaModel::threadCount() const {
LLamaModel::~LLamaModel()
{
if(d_ptr->ctx) {
if (d_ptr->ctx) {
llama_free(d_ptr->ctx);
}
}
@@ -337,6 +341,16 @@ bool LLamaModel::hasGPUDevice()
#endif
}
bool LLamaModel::usingGPUDevice()
{
#if defined(GGML_USE_KOMPUTE)
return ggml_vk_using_vulkan();
#elif defined(GGML_USE_METAL)
return true;
#endif
return false;
}
#if defined(_WIN32)
#define DLL_EXPORT __declspec(dllexport)
#else


@@ -30,6 +30,7 @@ public:
bool initializeGPUDevice(const GPUDevice &device) override;
bool initializeGPUDevice(int device) override;
bool hasGPUDevice() override;
bool usingGPUDevice() override;
private:
LLamaPrivate *d_ptr;


@@ -100,6 +100,8 @@ public:
virtual bool initializeGPUDevice(const GPUDevice &/*device*/) { return false; }
virtual bool initializeGPUDevice(int /*device*/) { return false; }
virtual bool hasGPUDevice() { return false; }
virtual bool usingGPUDevice() { return false; }
static std::vector<GPUDevice> availableGPUDevices();
protected:
// These are pure virtual because subclasses need to implement as the default implementation of


@@ -4,6 +4,10 @@
#include <iostream>
#include <unordered_set>
#ifdef GGML_USE_KOMPUTE
#include "ggml-vulkan.h"
#endif
void LLModel::recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate) {
size_t i = 0;
promptCtx.n_past = 0;
@@ -174,3 +178,26 @@ std::vector<float> LLModel::embedding(const std::string &/*text*/)
}
return std::vector<float>();
}
std::vector<LLModel::GPUDevice> LLModel::availableGPUDevices()
{
#if defined(GGML_USE_KOMPUTE)
std::vector<ggml_vk_device> vkDevices = ggml_vk_available_devices(0);
std::vector<LLModel::GPUDevice> devices;
for(const auto& vkDevice : vkDevices) {
LLModel::GPUDevice device;
device.index = vkDevice.index;
device.type = vkDevice.type;
device.heapSize = vkDevice.heapSize;
device.name = vkDevice.name;
device.vendor = vkDevice.vendor;
devices.push_back(device);
}
return devices;
#else
return std::vector<LLModel::GPUDevice>();
#endif
}
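The hunk above adds the backend-level GPU enumeration API that the chat client and bindings build on. As a minimal sketch (not part of this change set), the intended call sequence mirrors the ChatLLM::loadModel() hunk further down: query the devices that can hold the model, initialize one, load, and then check usingGPUDevice() to detect a post-load CPU fallback. The helper name below is hypothetical; the LLModel calls are the ones shown in these diffs.

```cpp
// Illustrative sketch only -- assumes the gpt4all-backend LLModel interface
// shown in the surrounding hunks (requiredMem, availableGPUDevices,
// initializeGPUDevice, loadModel, usingGPUDevice); include llmodel.h to build.
#include <iostream>
#include <string>
#include <vector>

bool loadPreferringDiscreteGpu(LLModel *model, const std::string &modelPath)
{
    const size_t required = model->requiredMem(modelPath);
    std::vector<LLModel::GPUDevice> devices = model->availableGPUDevices(required);

    // "Auto" behavior from the ChatLLM hunk: pick the first discrete GPU
    // (type == 2); otherwise stay on the CPU backend.
    if (!devices.empty() && devices.front().type == 2 /* discrete gpu */)
        model->initializeGPUDevice(devices.front());

    if (!model->loadModel(modelPath))
        return false;

    // The backend may still fall back during load, e.g. when the quantization
    // is not yet supported on Vulkan, so report the device actually in use.
    if (!model->usingGPUDevice())
        std::cerr << "model loaded, but running on CPU" << std::endl;
    return true;
}
```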


@@ -163,7 +163,7 @@ struct mpt_hparams {
int32_t n_embd = 0; //max_seq_len
int32_t n_head = 0; // n_heads
int32_t n_layer = 0; //n_layers
int32_t ftype = 0;
int32_t ftype = 0;
};
struct replit_layer {
@@ -220,7 +220,7 @@ static bool kv_cache_init(
params.mem_size = cache.buf.size;
params.mem_buffer = cache.buf.addr;
params.no_alloc = false;
cache.ctx = ggml_init(params);
if (!cache.ctx) {
fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
@@ -503,7 +503,7 @@ bool replit_model_load(const std::string & fname, std::istream &fin, replit_mode
}
GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "data", data_ptr, data_size, max_size));
GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "kv", ggml_get_mem_buffer(model.kv_self.ctx),
GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "kv", ggml_get_mem_buffer(model.kv_self.ctx),
ggml_get_mem_size(model.kv_self.ctx), 0));
GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "eval", model.eval_buf.addr, model.eval_buf.size, 0));
GGML_CHECK_BUF(ggml_metal_add_buffer(model.ctx_metal, "scr0", model.scr0_buf.addr, model.scr0_buf.size, 0));
@@ -975,6 +975,14 @@ const std::vector<LLModel::Token> &Replit::endTokens() const
return fres;
}
bool Replit::usingGPUDevice()
{
#if defined(GGML_USE_METAL)
return true;
#endif
return false;
}
#if defined(_WIN32)
#define DLL_EXPORT __declspec(dllexport)
#else


@@ -27,6 +27,7 @@ public:
size_t restoreState(const uint8_t *src) override;
void setThreadCount(int32_t n_threads) override;
int32_t threadCount() const override;
bool usingGPUDevice() override;
private:
ReplitPrivate *d_ptr;


@@ -61,7 +61,7 @@ copy_prebuilt_C_lib(SRC_CLIB_DIRECtORY,
setup(
name=package_name,
version="1.0.9",
version="1.0.12",
description="Python bindings for GPT4All",
author="Nomic and the Open Source Community",
author_email="support@nomic.ai",


@@ -58,6 +58,7 @@ const fltArray = createEmbedding(model, "Pain is inevitable, suffering optional"
* (win) msvc version 143
* Can be obtained with visual studio 2022 build tools
* python 3
* Vulkan SDK. Should be installable via `pkg manager of choice` or [here](https://vulkan.lunarg.com/#new_tab)
### Build (from source)
@@ -73,15 +74,12 @@ cd gpt4all-bindings/typescript
```sh
yarn
```
* llama.cpp git submodule for gpt4all can be possibly absent. If this is the case, make sure to run in llama.cpp parent directory
```sh
git submodule update --init --depth 1 --recursive
```
**AS OF NEW BACKEND** to build the backend,
```sh
yarn build:backend
```


@@ -162,7 +162,7 @@ declare class LLModel {
* GPUs that are usable for this LLModel
* @returns
*/
availableGpus() : GpuDevice[]
listGpu() : GpuDevice[]
}
/**
* an object that contains gpu data on this machine.
@@ -223,7 +223,7 @@ declare function loadModel(
declare function loadModel(
modelName: string,
options?: EmbeddingOptions | InferenceOptions
options?: EmbeddingModelOptions | InferenceModelOptions
): Promise<InferenceModel | EmbeddingModel>;
/**
@@ -440,7 +440,7 @@ declare const DEFAULT_MODEL_CONFIG: ModelConfig;
/**
* Default prompt context.
*/
declare const DEFAULT_PROMT_CONTEXT: LLModelPromptContext;
declare const DEFAULT_PROMPT_CONTEXT: LLModelPromptContext;
/**
* Default model list url.
@@ -541,7 +541,7 @@ export {
DEFAULT_DIRECTORY,
DEFAULT_LIBRARIES_DIRECTORY,
DEFAULT_MODEL_CONFIG,
DEFAULT_PROMT_CONTEXT,
DEFAULT_PROMPT_CONTEXT,
DEFAULT_MODEL_LIST_URL,
downloadModel,
retrieveModel,


@@ -18,7 +18,7 @@ endif()
set(APP_VERSION_MAJOR 2)
set(APP_VERSION_MINOR 4)
set(APP_VERSION_PATCH 15)
set(APP_VERSION_PATCH 20)
set(APP_VERSION "${APP_VERSION_MAJOR}.${APP_VERSION_MINOR}.${APP_VERSION_PATCH}")
# Include the binary directory for the generated header file


@@ -56,6 +56,7 @@ void Chat::connectLLM()
connect(m_llmodel, &ChatLLM::recalcChanged, this, &Chat::handleRecalculating, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::generatedNameChanged, this, &Chat::generatedNameChanged, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::reportSpeed, this, &Chat::handleTokenSpeedChanged, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::reportDevice, this, &Chat::handleDeviceChanged, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::databaseResultsChanged, this, &Chat::handleDatabaseResultsChanged, Qt::QueuedConnection);
connect(m_llmodel, &ChatLLM::modelInfoChanged, this, &Chat::handleModelInfoChanged, Qt::QueuedConnection);
@@ -345,6 +346,12 @@ void Chat::handleTokenSpeedChanged(const QString &tokenSpeed)
emit tokenSpeedChanged();
}
void Chat::handleDeviceChanged(const QString &device)
{
m_device = device;
emit deviceChanged();
}
void Chat::handleDatabaseResultsChanged(const QList<ResultInfo> &results)
{
m_databaseResults = results;


@@ -25,6 +25,7 @@ class Chat : public QObject
Q_PROPERTY(QList<QString> collectionList READ collectionList NOTIFY collectionListChanged)
Q_PROPERTY(QString modelLoadingError READ modelLoadingError NOTIFY modelLoadingErrorChanged)
Q_PROPERTY(QString tokenSpeed READ tokenSpeed NOTIFY tokenSpeedChanged);
Q_PROPERTY(QString device READ device NOTIFY deviceChanged);
QML_ELEMENT
QML_UNCREATABLE("Only creatable from c++!")
@@ -88,6 +89,7 @@ public:
QString modelLoadingError() const { return m_modelLoadingError; }
QString tokenSpeed() const { return m_tokenSpeed; }
QString device() const { return m_device; }
public Q_SLOTS:
void serverNewPromptResponsePair(const QString &prompt);
@@ -115,6 +117,7 @@ Q_SIGNALS:
void isServerChanged();
void collectionListChanged(const QList<QString> &collectionList);
void tokenSpeedChanged();
void deviceChanged();
private Q_SLOTS:
void handleResponseChanged(const QString &response);
@@ -125,6 +128,7 @@ private Q_SLOTS:
void handleRecalculating();
void handleModelLoadingError(const QString &error);
void handleTokenSpeedChanged(const QString &tokenSpeed);
void handleDeviceChanged(const QString &device);
void handleDatabaseResultsChanged(const QList<ResultInfo> &results);
void handleModelInfoChanged(const ModelInfo &modelInfo);
void handleModelInstalled();
@@ -137,6 +141,7 @@ private:
ModelInfo m_modelInfo;
QString m_modelLoadingError;
QString m_tokenSpeed;
QString m_device;
QString m_response;
QList<QString> m_collections;
ChatModel *m_chatModel;


@@ -81,6 +81,7 @@ ChatLLM::ChatLLM(Chat *parent, bool isServer)
connect(parent, &Chat::idChanged, this, &ChatLLM::handleChatIdChanged);
connect(&m_llmThread, &QThread::started, this, &ChatLLM::handleThreadStarted);
connect(MySettings::globalInstance(), &MySettings::forceMetalChanged, this, &ChatLLM::handleForceMetalChanged);
connect(MySettings::globalInstance(), &MySettings::deviceChanged, this, &ChatLLM::handleDeviceChanged);
// The following are blocking operations and will block the llm thread
connect(this, &ChatLLM::requestRetrieveFromDB, LocalDocs::globalInstance()->database(), &Database::retrieveFromDB,
@@ -124,6 +125,16 @@ void ChatLLM::handleForceMetalChanged(bool forceMetal)
#endif
}
void ChatLLM::handleDeviceChanged()
{
if (isModelLoaded() && m_shouldBeLoaded) {
m_reloadingToChangeVariant = true;
unloadModel();
reloadModel();
m_reloadingToChangeVariant = false;
}
}
bool ChatLLM::loadDefaultModel()
{
ModelInfo defaultModel = ModelList::globalInstance()->defaultModelInfo();
@@ -250,16 +261,52 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
#endif
if (m_llModelInfo.model) {
// Update the settings that a model is being loaded and update the device list
MySettings::globalInstance()->setAttemptModelLoad(filePath);
// Pick the best match for the device
QString actualDevice = m_llModelInfo.model->implementation().buildVariant() == "metal" ? "Metal" : "CPU";
const QString requestedDevice = MySettings::globalInstance()->device();
if (requestedDevice != "CPU") {
const size_t requiredMemory = m_llModelInfo.model->requiredMem(filePath.toStdString());
std::vector<LLModel::GPUDevice> availableDevices = m_llModelInfo.model->availableGPUDevices(requiredMemory);
if (!availableDevices.empty() && requestedDevice == "Auto" && availableDevices.front().type == 2 /*a discrete gpu*/) {
m_llModelInfo.model->initializeGPUDevice(availableDevices.front());
actualDevice = QString::fromStdString(availableDevices.front().name);
} else {
for (LLModel::GPUDevice &d : availableDevices) {
if (QString::fromStdString(d.name) == requestedDevice) {
m_llModelInfo.model->initializeGPUDevice(d);
actualDevice = QString::fromStdString(d.name);
break;
}
}
}
}
// Report which device we're actually using
emit reportDevice(actualDevice);
bool success = m_llModelInfo.model->loadModel(filePath.toStdString());
if (!success && actualDevice != "CPU") {
emit reportDevice("CPU");
success = m_llModelInfo.model->loadModel(filePath.toStdString());
}
MySettings::globalInstance()->setAttemptModelLoad(QString());
if (!success) {
delete std::exchange(m_llModelInfo.model, nullptr);
delete m_llModelInfo.model;
m_llModelInfo.model = nullptr;
if (!m_isServer)
LLModelStore::globalInstance()->releaseModel(m_llModelInfo); // release back into the store
m_llModelInfo = LLModelInfo();
emit modelLoadingError(QString("Could not load model due to invalid model file for %1").arg(modelInfo.filename()));
} else {
// We might have had to fallback to CPU after load if the model is not possible to accelerate
// for instance if the quantization method is not supported on Vulkan yet
if (actualDevice != "CPU" && !m_llModelInfo.model->usingGPUDevice())
emit reportDevice("CPU");
switch (m_llModelInfo.model->implementation().modelType()[0]) {
case 'L': m_llModelType = LLModelType::LLAMA_; break;
case 'G': m_llModelType = LLModelType::GPTJ_; break;
@@ -270,7 +317,8 @@ bool ChatLLM::loadModel(const ModelInfo &modelInfo)
case 'S': m_llModelType = LLModelType::STARCODER_; break;
default:
{
delete std::exchange(m_llModelInfo.model, nullptr);
delete m_llModelInfo.model;
m_llModelInfo.model = nullptr;
if (!m_isServer)
LLModelStore::globalInstance()->releaseModel(m_llModelInfo); // release back into the store
m_llModelInfo = LLModelInfo();


@@ -111,6 +111,7 @@ public Q_SLOTS:
void handleShouldBeLoadedChanged();
void handleThreadStarted();
void handleForceMetalChanged(bool forceMetal);
void handleDeviceChanged();
void processSystemPrompt();
Q_SIGNALS:
@@ -128,6 +129,7 @@ Q_SIGNALS:
void shouldBeLoadedChanged();
void requestRetrieveFromDB(const QList<QString> &collections, const QString &text, int retrievalSize, QList<ResultInfo> *results);
void reportSpeed(const QString &speed);
void reportDevice(const QString &device);
void databaseResultsChanged(const QList<ResultInfo>&);
void modelInfoChanged(const ModelInfo &modelInfo);


@@ -1006,13 +1006,14 @@ Window {
}
Text {
id: speed
id: device
anchors.bottom: textInputView.top
anchors.bottomMargin: 20
anchors.right: parent.right
anchors.rightMargin: 30
color: theme.mutedTextColor
text: currentChat.tokenSpeed
visible: currentChat.tokenSpeed !== ""
text: qsTr("Speed: ") + currentChat.tokenSpeed + "<br>" + qsTr("Device: ") + currentChat.device
font.pixelSize: theme.fontSizeLarge
}


@@ -464,6 +464,71 @@
"
* Lakshay Kansal (Nomic AI)
* Adam Treat (Nomic AI)
"
},
{
"version": "2.4.15",
"notes":
"
* Add Vulkan GPU backend which allows inference on AMD, Intel and NVIDIA GPUs
* Add ability to switch font sizes
* Various bug fixes
",
"contributors":
"
* Adam Treat (Nomic AI)
* Aaron Miller (Nomic AI)
* Nils Sauer (Nomic AI)
* Lakshay Kansal (Nomic AI)
"
},
{
"version": "2.4.16",
"notes":
"
* Bugfix for properly falling back to CPU when GPU can't be used
* Report the actual device we're using
* Fix context bugs for GPU accelerated models
",
"contributors":
"
* Adam Treat (Nomic AI)
* Aaron Miller (Nomic AI)
"
},
{
"version": "2.4.17",
"notes":
"
* Bugfix for properly falling back to CPU when GPU is out of memory
",
"contributors":
"
* Adam Treat (Nomic AI)
* Aaron Miller (Nomic AI)
"
},
{
"version": "2.4.18",
"notes":
"
* Bugfix for devices to show up in the settings combobox on application start and not just on model load
* Send information on requested device and actual device on model load to help assess which model/gpu/os combos are working
",
"contributors":
"
* Adam Treat (Nomic AI)
"
},
{
"version": "2.4.19",
"notes":
"
* Fix a crasher on systems with corrupted vulkan drivers or corrupted vulkan dlls
",
"contributors":
"
* Adam Treat (Nomic AI)
"
}
]


@@ -1,5 +1,6 @@
#include "mysettings.h"
#include "modellist.h"
#include "../gpt4all-backend/llmodel.h"
#include <QDir>
#include <QFile>
@@ -23,6 +24,7 @@ static bool default_localDocsShowReferences = true;
static QString default_networkAttribution = "";
static bool default_networkIsActive = false;
static bool default_networkUsageStatsActive = false;
static QString default_device = "Auto";
static QString defaultLocalModelsPath()
{
@@ -62,6 +64,24 @@ MySettings::MySettings()
: QObject{nullptr}
{
QSettings::setDefaultFormat(QSettings::IniFormat);
std::vector<LLModel::GPUDevice> devices = LLModel::availableGPUDevices();
QVector<QString> deviceList{ "Auto" };
for (LLModel::GPUDevice &d : devices)
deviceList << QString::fromStdString(d.name);
deviceList << "CPU";
setDeviceList(deviceList);
}
Q_INVOKABLE QVector<QString> MySettings::deviceList() const
{
return m_deviceList;
}
void MySettings::setDeviceList(const QVector<QString> &deviceList)
{
m_deviceList = deviceList;
emit deviceListChanged();
}
void MySettings::restoreModelDefaults(const ModelInfo &model)
@@ -79,6 +99,9 @@ void MySettings::restoreModelDefaults(const ModelInfo &model)
void MySettings::restoreApplicationDefaults()
{
setChatTheme(default_chatTheme);
setFontSize(default_fontSize);
setDevice(default_device);
setThreadCount(default_threadCount);
setSaveChats(default_saveChats);
setSaveChatGPTChats(default_saveChatGPTChats);
@@ -485,7 +508,7 @@ QString MySettings::chatTheme() const
void MySettings::setChatTheme(const QString &u)
{
if(chatTheme() == u)
if (chatTheme() == u)
return;
QSettings setting;
@@ -503,7 +526,7 @@ QString MySettings::fontSize() const
void MySettings::setFontSize(const QString &u)
{
if(fontSize() == u)
if (fontSize() == u)
return;
QSettings setting;
@@ -512,6 +535,24 @@ void MySettings::setFontSize(const QString &u)
emit fontSizeChanged();
}
QString MySettings::device() const
{
QSettings setting;
setting.sync();
return setting.value("device", default_device).toString();
}
void MySettings::setDevice(const QString &u)
{
if (device() == u)
return;
QSettings setting;
setting.setValue("device", u);
setting.sync();
emit deviceChanged();
}
bool MySettings::forceMetal() const
{
return m_forceMetal;


@@ -25,6 +25,8 @@ class MySettings : public QObject
Q_PROPERTY(QString networkAttribution READ networkAttribution WRITE setNetworkAttribution NOTIFY networkAttributionChanged)
Q_PROPERTY(bool networkIsActive READ networkIsActive WRITE setNetworkIsActive NOTIFY networkIsActiveChanged)
Q_PROPERTY(bool networkUsageStatsActive READ networkUsageStatsActive WRITE setNetworkUsageStatsActive NOTIFY networkUsageStatsActiveChanged)
Q_PROPERTY(QString device READ device WRITE setDevice NOTIFY deviceChanged)
Q_PROPERTY(QVector<QString> deviceList READ deviceList NOTIFY deviceListChanged)
public:
static MySettings *globalInstance();
@@ -78,6 +80,8 @@ public:
void setFontSize(const QString &u);
bool forceMetal() const;
void setForceMetal(bool b);
QString device() const;
void setDevice(const QString &u);
// Release/Download settings
QString lastVersionStarted() const;
@@ -102,6 +106,9 @@ public:
QString attemptModelLoad() const;
void setAttemptModelLoad(const QString &modelFile);
QVector<QString> deviceList() const;
void setDeviceList(const QVector<QString> &deviceList);
Q_SIGNALS:
void nameChanged(const ModelInfo &model);
void filenameChanged(const ModelInfo &model);
@@ -131,9 +138,12 @@ Q_SIGNALS:
void networkIsActiveChanged();
void networkUsageStatsActiveChanged();
void attemptModelLoadChanged();
void deviceChanged();
void deviceListChanged();
private:
bool m_forceMetal;
QVector<QString> m_deviceList;
private:
explicit MySettings();


@@ -393,6 +393,8 @@ void Network::sendMixpanelEvent(const QString &ev, const QVector<KeyValue> &valu
properties.insert("name", QCoreApplication::applicationName() + " v"
+ QCoreApplication::applicationVersion());
properties.insert("model", ChatListModel::globalInstance()->currentChat()->modelInfo().filename());
properties.insert("requestedDevice", MySettings::globalInstance()->device());
properties.insert("actualDevice", ChatListModel::globalInstance()->currentChat()->device());
// Some additional startup information
if (ev == "startup") {


@@ -89,17 +89,55 @@ MySettingsTab {
}
}
Label {
id: defaultModelLabel
text: qsTr("Default model:")
id: deviceLabel
text: qsTr("Device:")
color: theme.textColor
font.pixelSize: theme.fontSizeLarge
Layout.row: 3
Layout.column: 0
}
MyComboBox {
id: comboBox
id: deviceBox
Layout.row: 3
Layout.column: 1
Layout.columnSpan: 1
Layout.minimumWidth: 350
Layout.fillWidth: false
model: MySettings.deviceList
Accessible.role: Accessible.ComboBox
Accessible.name: qsTr("ComboBox for displaying/picking the device")
Accessible.description: qsTr("Use this for picking the device of the chat client")
function updateModel() {
deviceBox.currentIndex = deviceBox.indexOfValue(MySettings.device);
}
Component.onCompleted: {
deviceBox.updateModel()
}
Connections {
target: MySettings
function onDeviceChanged() {
deviceBox.updateModel()
}
function onDeviceListChanged() {
deviceBox.updateModel()
}
}
onActivated: {
MySettings.device = deviceBox.currentText
}
}
Label {
id: defaultModelLabel
text: qsTr("Default model:")
color: theme.textColor
font.pixelSize: theme.fontSizeLarge
Layout.row: 4
Layout.column: 0
}
MyComboBox {
id: comboBox
Layout.row: 4
Layout.column: 1
Layout.columnSpan: 2
Layout.minimumWidth: 350
Layout.fillWidth: true
@@ -128,7 +166,7 @@
text: qsTr("Download path:")
color: theme.textColor
font.pixelSize: theme.fontSizeLarge
Layout.row: 4
Layout.row: 5
Layout.column: 0
}
MyDirectoryField {
@@ -136,7 +174,7 @@
text: MySettings.modelPath
font.pixelSize: theme.fontSizeLarge
implicitWidth: 300
Layout.row: 4
Layout.row: 5
Layout.column: 1
Layout.fillWidth: true
ToolTip.text: qsTr("Path where model files will be downloaded to")
@@ -153,7 +191,7 @@
}
}
MyButton {
Layout.row: 4
Layout.row: 5
Layout.column: 2
text: qsTr("Browse")
Accessible.description: qsTr("Opens a folder picker dialog to choose where to save model files")
@@ -168,7 +206,7 @@
text: qsTr("CPU Threads:")
color: theme.textColor
font.pixelSize: theme.fontSizeLarge
Layout.row: 5
Layout.row: 6
Layout.column: 0
}
MyTextField {
@@ -177,7 +215,7 @@
font.pixelSize: theme.fontSizeLarge
ToolTip.text: qsTr("Amount of processing threads to use bounded by 1 and number of logical processors")
ToolTip.visible: hovered
Layout.row: 5
Layout.row: 6
Layout.column: 1
validator: IntValidator {
bottom: 1
@@ -200,12 +238,12 @@
text: qsTr("Save chats to disk:")
color: theme.textColor
font.pixelSize: theme.fontSizeLarge
Layout.row: 6
Layout.row: 7
Layout.column: 0
}
MyCheckBox {
id: saveChatsBox
Layout.row: 6
Layout.row: 7
Layout.column: 1
checked: MySettings.saveChats
onClicked: {
@@ -220,12 +258,12 @@
text: qsTr("Save ChatGPT chats to disk:")
color: theme.textColor
font.pixelSize: theme.fontSizeLarge
Layout.row: 7
Layout.row: 8
Layout.column: 0
}
MyCheckBox {
id: saveChatGPTChatsBox
Layout.row: 7
Layout.row: 8
Layout.column: 1
checked: MySettings.saveChatGPTChats
onClicked: {
@@ -237,12 +275,12 @@
text: qsTr("Enable API server:")
color: theme.textColor
font.pixelSize: theme.fontSizeLarge
Layout.row: 8
Layout.row: 9
Layout.column: 0
}
MyCheckBox {
id: serverChatBox
Layout.row: 8
Layout.row: 9
Layout.column: 1
checked: MySettings.serverChat
onClicked: {
@@ -252,7 +290,7 @@
ToolTip.visible: hovered
}
Rectangle {
Layout.row: 9
Layout.row: 10
Layout.column: 0
Layout.columnSpan: 3
Layout.fillWidth: true