Compare commits


No commits in common. "aed206834224701d7ba0b311c54c5f3d3fffa38b" and "9fb135e020dc4a714b0e1868e6beb77f8e6f287e" have entirely different histories.

11 changed files with 45 additions and 240 deletions

View File

@@ -1,3 +1,3 @@
 [codespell]
-ignore-words-list = blong, belong, afterall, som, assistent
+ignore-words-list = blong, belong, afterall, som
 skip = .git,*.pdf,*.svg,*.lock

@@ -1 +1 @@
-Subproject commit 3742085b0429cbe0ede49bcb9f891e4a5e25a724
+Subproject commit 7b8f00f5ccf4fc3cc67fe1ced792b3aec1ae6c1c

View File

@@ -238,10 +238,6 @@ if (LLAMA_KOMPUTE)
         kompute/op_norm.comp
         kompute/op_rmsnorm.comp
         kompute/op_diagmask.comp
-        kompute/op_mul_mat_mat_f32.comp
-        kompute/op_mul_mat_mat_f16.comp
-        kompute/op_mul_mat_mat_q8_0.comp
-        kompute/op_mul_mat_mat_q4_0.comp
         kompute/op_mul_mat_f16.comp
         kompute/op_mul_mat_q8_0.comp
         kompute/op_mul_mat_q4_0.comp
@@ -272,10 +268,6 @@ if (LLAMA_KOMPUTE)
         shaderop_norm.h
         shaderop_rmsnorm.h
         shaderop_diagmask.h
-        shaderop_mul_mat_mat_f32.h
-        shaderop_mul_mat_mat_f16.h
-        shaderop_mul_mat_mat_q8_0.h
-        shaderop_mul_mat_mat_q4_0.h
         shaderop_mul_mat_f16.h
         shaderop_mul_mat_q8_0.h
         shaderop_mul_mat_q4_0.h

View File

@@ -36,17 +36,6 @@ namespace {
 const char *modelType_ = "LLaMA";
 }
 
-static void null_log_callback(enum ggml_log_level level, const char* text, void* userdata) {
-    (void)level;
-    (void)text;
-    (void)userdata;
-}
-
-static bool llama_verbose() {
-    const char* var = getenv("GPT4ALL_VERBOSE_LLAMACPP");
-    return var && *var;
-}
-
 struct gpt_params {
     int32_t seed = -1; // RNG seed
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
@@ -155,9 +144,7 @@ bool LLamaModel::loadModel(const std::string &modelPath)
     d_ptr->params.use_mlock = params.use_mlock;
 #endif
 #ifdef GGML_USE_METAL
-    if (llama_verbose()) {
-        std::cerr << "llama.cpp: using Metal" << std::endl;
-    }
+    std::cerr << "llama.cpp: using Metal" << std::endl;
     // metal always runs the whole model if n_gpu_layers is not 0, at least
     // currently
     d_ptr->params.n_gpu_layers = 1;
@@ -403,9 +390,6 @@ DLL_EXPORT bool magic_match(const char * fname) {
 }
 
 DLL_EXPORT LLModel *construct() {
-    if (!llama_verbose()) {
-        llama_log_set(null_log_callback, nullptr);
-    }
     return new LLamaModel;
 }
 }

View File

@@ -113,18 +113,17 @@ const std::vector<LLModel::Implementation> &LLModel::Implementation::implementat
 const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant) {
     for (const auto& i : implementationList()) {
-        if (buildVariant != i.m_buildVariant) continue;
         if (!i.m_magicMatch(fname)) continue;
+        if (buildVariant != i.m_buildVariant) continue;
         return &i;
     }
     return nullptr;
 }
 
 LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::string buildVariant) {
-    if (!has_at_least_minimal_hardware()) {
-        std::cerr << "LLModel ERROR: CPU does not support AVX\n";
+    if (!has_at_least_minimal_hardware())
         return nullptr;
-    }
 
     // Get correct implementation
     const Implementation* impl = nullptr;

View File

@@ -1,8 +1,6 @@
 """
 Python only API for running all GPT4All models.
 """
-from __future__ import annotations
-
 import os
 import sys
 import time
@@ -62,12 +60,11 @@ class GPT4All:
     def __init__(
         self,
         model_name: str,
-        model_path: Optional[Union[str, os.PathLike[str]]] = None,
+        model_path: Optional[str] = None,
         model_type: Optional[str] = None,
         allow_download: bool = True,
         n_threads: Optional[int] = None,
         device: Optional[str] = "cpu",
-        verbose: bool = False,
     ):
         """
         Constructor
@@ -92,7 +89,7 @@ class GPT4All:
         self.model_type = model_type
         self.model = pyllmodel.LLModel()
         # Retrieve model and download if allowed
-        self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download, verbose=verbose)
+        self.config: ConfigType = self.retrieve_model(model_name, model_path=model_path, allow_download=allow_download)
         if device is not None:
             if device != "cpu":
                 self.model.init_gpu(model_path=self.config["path"], device=device)
@@ -113,17 +110,14 @@ class GPT4All:
         Returns:
             Model list in JSON format.
         """
-        resp = requests.get("https://gpt4all.io/models/models2.json")
-        if resp.status_code != 200:
-            raise ValueError(f'Request failed: HTTP {resp.status_code} {resp.reason}')
-        return resp.json()
+        return requests.get("https://gpt4all.io/models/models2.json").json()
 
     @staticmethod
     def retrieve_model(
         model_name: str,
-        model_path: Optional[Union[str, os.PathLike[str]]] = None,
+        model_path: Optional[str] = None,
         allow_download: bool = True,
-        verbose: bool = False,
+        verbose: bool = True,
     ) -> ConfigType:
         """
         Find model file, and if it doesn't exist, download the model.
@@ -166,7 +160,7 @@ class GPT4All:
             )
             model_path = DEFAULT_MODEL_DIRECTORY
         else:
-            model_path = str(model_path).replace("\\", "\\\\")
+            model_path = model_path.replace("\\", "\\\\")
 
         if not os.path.exists(model_path):
             raise ValueError(f"Invalid model directory: {model_path}")
@@ -191,7 +185,7 @@ class GPT4All:
     @staticmethod
     def download_model(
         model_filename: str,
-        model_path: Union[str, os.PathLike[str]],
+        model_path: str,
         verbose: bool = True,
         url: Optional[str] = None,
    ) -> str:
@@ -218,9 +212,6 @@ class GPT4All:
         download_url = get_download_url(model_filename)
 
         response = requests.get(download_url, stream=True)
-        if response.status_code != 200:
-            raise ValueError(f'Request failed: HTTP {response.status_code} {response.reason}')
-
         total_size_in_bytes = int(response.headers.get("content-length", 0))
         block_size = 2**20  # 1 MB

View File

@@ -385,7 +385,7 @@ bool Chat::serialize(QDataStream &stream, int version) const
     stream << m_modelInfo.filename();
     if (version > 2)
         stream << m_collections;
-    if (!m_llmodel->serialize(stream, version, true /*serializeKV*/))
+    if (!m_llmodel->serialize(stream, version))
         return false;
     if (!m_chatModel->serialize(stream, version))
         return false;
@@ -404,36 +404,29 @@ bool Chat::deserialize(QDataStream &stream, int version)
     QString modelId;
     stream >> modelId;
     if (version > 4) {
-        if (ModelList::globalInstance()->contains(modelId))
-            m_modelInfo = ModelList::globalInstance()->modelInfo(modelId);
+        if (!ModelList::globalInstance()->contains(modelId))
+            return false;
+        m_modelInfo = ModelList::globalInstance()->modelInfo(modelId);
     } else {
-        if (ModelList::globalInstance()->containsByFilename(modelId))
-            m_modelInfo = ModelList::globalInstance()->modelInfoByFilename(modelId);
+        if (!ModelList::globalInstance()->containsByFilename(modelId))
+            return false;
+        m_modelInfo = ModelList::globalInstance()->modelInfoByFilename(modelId);
     }
-    if (!m_modelInfo.id().isEmpty())
-        emit modelInfoChanged();
-
-    bool deserializeKV = true; // make this a setting
-    bool discardKV = m_modelInfo.id().isEmpty();
-
+    emit modelInfoChanged();
     // Prior to version 2 gptj models had a bug that fixed the kv_cache to F32 instead of F16 so
     // unfortunately, we cannot deserialize these
     if (version < 2 && m_modelInfo.filename().contains("gpt4all-j"))
-        discardKV = true;
+        return false;
     if (version > 2) {
         stream >> m_collections;
         emit collectionListChanged(m_collections);
     }
     m_llmodel->setModelInfo(m_modelInfo);
-    if (!m_llmodel->deserialize(stream, version, deserializeKV, discardKV))
+    if (!m_llmodel->deserialize(stream, version))
         return false;
     if (!m_chatModel->deserialize(stream, version))
         return false;
-    if (!deserializeKV || discardKV)
-        m_llmodel->setStateFromText(m_chatModel->text());
     emit chatModelChanged();
     return stream.status() == QDataStream::Ok;
 }

View File

@@ -84,16 +84,13 @@ void ChatSaver::saveChats(const QVector<Chat *> &chats)
     const QString savePath = MySettings::globalInstance()->modelPath();
     for (Chat *chat : chats) {
         QString fileName = "gpt4all-" + chat->id() + ".chat";
-        QString filePath = savePath + "/" + fileName;
-        QFile originalFile(filePath);
-        QFile tempFile(filePath + ".tmp"); // Temporary file
-        bool success = tempFile.open(QIODevice::WriteOnly);
+        QFile file(savePath + "/" + fileName);
+        bool success = file.open(QIODevice::WriteOnly);
         if (!success) {
-            qWarning() << "ERROR: Couldn't save chat to temporary file:" << tempFile.fileName();
+            qWarning() << "ERROR: Couldn't save chat to file:" << file.fileName();
             continue;
         }
-        QDataStream out(&tempFile);
+        QDataStream out(&file);
         out << (quint32)CHAT_FORMAT_MAGIC;
         out << (qint32)CHAT_FORMAT_VERSION;
@@ -101,16 +98,11 @@ void ChatSaver::saveChats(const QVector<Chat *> &chats)
         qDebug() << "serializing chat" << fileName;
         if (!chat->serialize(out, CHAT_FORMAT_VERSION)) {
-            qWarning() << "ERROR: Couldn't serialize chat to file:" << tempFile.fileName();
-            tempFile.remove();
-            continue;
+            qWarning() << "ERROR: Couldn't serialize chat to file:" << file.fileName();
+            file.remove();
         }
-
-        if (originalFile.exists())
-            originalFile.remove();
-        tempFile.rename(filePath);
+        file.close();
     }
     qint64 elapsedTime = timer.elapsed();
     qDebug() << "serializing chats took:" << elapsedTime << "ms";
     emit saveChatsFinished();
@@ -232,6 +224,7 @@ void ChatsRestoreThread::run()
             chat->moveToThread(qApp->thread());
             if (!chat->deserialize(in, version)) {
                 qWarning() << "ERROR: Couldn't deserialize chat from file:" << file.fileName();
+                file.remove();
             } else {
                 emit chatRestored(chat);
             }

View File

@@ -69,7 +69,6 @@ ChatLLM::ChatLLM(Chat *parent, bool isServer)
     , m_forceMetal(MySettings::globalInstance()->forceMetal())
     , m_reloadingToChangeVariant(false)
     , m_processedSystemPrompt(false)
-    , m_restoreStateFromText(false)
 {
     moveToThread(&m_llmThread);
     connect(this, &ChatLLM::sendStartup, Network::globalInstance(), &Network::sendStartup);
@@ -727,35 +726,7 @@ bool ChatLLM::handleSystemRecalculate(bool isRecalc)
     return false;
 }
 
-bool ChatLLM::handleRestoreStateFromTextPrompt(int32_t token)
-{
-#if defined(DEBUG)
-    qDebug() << "restore state from text prompt" << m_llmThread.objectName() << token << m_stopGenerating;
-#endif
-    Q_UNUSED(token);
-    return !m_stopGenerating;
-}
-
-bool ChatLLM::handleRestoreStateFromTextResponse(int32_t token, const std::string &response)
-{
-#if defined(DEBUG)
-    qDebug() << "restore state from text response" << m_llmThread.objectName() << token << response << m_stopGenerating;
-#endif
-    Q_UNUSED(token);
-    Q_UNUSED(response);
-    return false;
-}
-
-bool ChatLLM::handleRestoreStateFromTextRecalculate(bool isRecalc)
-{
-#if defined(DEBUG)
-    qDebug() << "restore state from text recalc" << m_llmThread.objectName() << isRecalc;
-#endif
-    Q_UNUSED(isRecalc);
-    return false;
-}
-
-bool ChatLLM::serialize(QDataStream &stream, int version, bool serializeKV)
+bool ChatLLM::serialize(QDataStream &stream, int version)
 {
     if (version > 1) {
         stream << m_llModelType;
@@ -770,16 +741,8 @@ bool ChatLLM::serialize(QDataStream &stream, int version, bool serializeKV)
     stream << response();
     stream << generatedName();
     stream << m_promptResponseTokens;
-
-    if (!serializeKV) {
-#if defined(DEBUG)
-        qDebug() << "serialize" << m_llmThread.objectName() << m_state.size();
-#endif
-        return stream.status() == QDataStream::Ok;
-    }
-
     if (version <= 3) {
-        int responseLogits = 0;
+        int responseLogits;
         stream << responseLogits;
     }
     stream << m_ctx.n_past;
@@ -796,7 +759,7 @@ bool ChatLLM::serialize(QDataStream &stream, int version, bool serializeKV)
     return stream.status() == QDataStream::Ok;
 }
 
-bool ChatLLM::deserialize(QDataStream &stream, int version, bool deserializeKV, bool discardKV)
+bool ChatLLM::deserialize(QDataStream &stream, int version)
 {
     if (version > 1) {
         int internalStateVersion;
@@ -810,60 +773,26 @@ bool ChatLLM::deserialize(QDataStream &stream, int version, bool deserializeKV,
     stream >> nameResponse;
     m_nameResponse = nameResponse.toStdString();
     stream >> m_promptResponseTokens;
-
-    // If we do not deserialize the KV or it is discarded, then we need to restore the state from the
-    // text only. This will be a costly operation, but the chat has to be restored from the text archive
-    // alone.
-    m_restoreStateFromText = !deserializeKV || discardKV;
-
-    if (!deserializeKV) {
-#if defined(DEBUG)
-        qDebug() << "deserialize" << m_llmThread.objectName();
-#endif
-        return stream.status() == QDataStream::Ok;
-    }
-
     if (version <= 3) {
         int responseLogits;
         stream >> responseLogits;
     }
-
-    int32_t n_past;
-    stream >> n_past;
-    if (!discardKV) m_ctx.n_past = n_past;
-
+    stream >> m_ctx.n_past;
     quint64 logitsSize;
     stream >> logitsSize;
-    if (!discardKV) {
-        m_ctx.logits.resize(logitsSize);
-        stream.readRawData(reinterpret_cast<char*>(m_ctx.logits.data()), logitsSize * sizeof(float));
-    } else {
-        stream.skipRawData(logitsSize * sizeof(float));
-    }
-
+    m_ctx.logits.resize(logitsSize);
+    stream.readRawData(reinterpret_cast<char*>(m_ctx.logits.data()), logitsSize * sizeof(float));
     quint64 tokensSize;
     stream >> tokensSize;
-    if (!discardKV) {
-        m_ctx.tokens.resize(tokensSize);
-        stream.readRawData(reinterpret_cast<char*>(m_ctx.tokens.data()), tokensSize * sizeof(int));
-    } else {
-        stream.skipRawData(tokensSize * sizeof(int));
-    }
-
+    m_ctx.tokens.resize(tokensSize);
+    stream.readRawData(reinterpret_cast<char*>(m_ctx.tokens.data()), tokensSize * sizeof(int));
     if (version > 0) {
         QByteArray compressed;
         stream >> compressed;
-        if (!discardKV)
-            m_state = qUncompress(compressed);
+        m_state = qUncompress(compressed);
     } else {
-        if (!discardKV)
-            stream >> m_state;
-        else {
-            QByteArray state;
-            stream >> m_state;
-        }
+        stream >> m_state;
     }
 #if defined(DEBUG)
     qDebug() << "deserialize" << m_llmThread.objectName();
 #endif
@@ -894,7 +823,7 @@ void ChatLLM::saveState()
 void ChatLLM::restoreState()
 {
-    if (!isModelLoaded())
+    if (!isModelLoaded() || m_state.isEmpty())
         return;
 
     if (m_llModelType == LLModelType::CHATGPT_) {
@@ -909,19 +838,10 @@ void ChatLLM::restoreState()
         return;
     }
 
-    if (m_restoreStateFromText) {
-        Q_ASSERT(m_state.isEmpty());
-        processRestoreStateFromText();
-    }
-
 #if defined(DEBUG)
     qDebug() << "restoreState" << m_llmThread.objectName() << "size:" << m_state.size();
 #endif
 
     m_processedSystemPrompt = true;
-    if (m_state.isEmpty())
-        return;
-
     m_llModelInfo.model->restoreState(static_cast<const uint8_t*>(reinterpret_cast<void*>(m_state.data())));
     m_state.clear();
     m_state.resize(0);
@@ -939,10 +859,7 @@ void ChatLLM::processSystemPrompt()
         return;
     }
 
-    // Start with a whole new context
     m_stopGenerating = false;
-    m_ctx = LLModel::PromptContext();
-
     auto promptFunc = std::bind(&ChatLLM::handleSystemPrompt, this, std::placeholders::_1);
     auto responseFunc = std::bind(&ChatLLM::handleSystemResponse, this, std::placeholders::_1,
         std::placeholders::_2);
@@ -973,54 +890,5 @@ void ChatLLM::processSystemPrompt()
     printf("\n");
     fflush(stdout);
 #endif
-
-    m_processedSystemPrompt = !m_stopGenerating;
-}
-
-void ChatLLM::processRestoreStateFromText()
-{
-    Q_ASSERT(isModelLoaded());
-    if (!isModelLoaded() || !m_restoreStateFromText || m_isServer)
-        return;
-
-    m_isRecalc = true;
-    emit recalcChanged();
-
-    m_stopGenerating = false;
-    m_ctx = LLModel::PromptContext();
-
-    auto promptFunc = std::bind(&ChatLLM::handleRestoreStateFromTextPrompt, this, std::placeholders::_1);
-    auto responseFunc = std::bind(&ChatLLM::handleRestoreStateFromTextResponse, this, std::placeholders::_1,
-        std::placeholders::_2);
-    auto recalcFunc = std::bind(&ChatLLM::handleRestoreStateFromTextRecalculate, this, std::placeholders::_1);
-
-    const QString promptTemplate = MySettings::globalInstance()->modelPromptTemplate(m_modelInfo);
-    const int32_t n_predict = MySettings::globalInstance()->modelMaxLength(m_modelInfo);
-    const int32_t top_k = MySettings::globalInstance()->modelTopK(m_modelInfo);
-    const float top_p = MySettings::globalInstance()->modelTopP(m_modelInfo);
-    const float temp = MySettings::globalInstance()->modelTemperature(m_modelInfo);
-    const int32_t n_batch = MySettings::globalInstance()->modelPromptBatchSize(m_modelInfo);
-    const float repeat_penalty = MySettings::globalInstance()->modelRepeatPenalty(m_modelInfo);
-    const int32_t repeat_penalty_tokens = MySettings::globalInstance()->modelRepeatPenaltyTokens(m_modelInfo);
-    int n_threads = MySettings::globalInstance()->threadCount();
-    m_ctx.n_predict = n_predict;
-    m_ctx.top_k = top_k;
-    m_ctx.top_p = top_p;
-    m_ctx.temp = temp;
-    m_ctx.n_batch = n_batch;
-    m_ctx.repeat_penalty = repeat_penalty;
-    m_ctx.repeat_last_n = repeat_penalty_tokens;
-    m_llModelInfo.model->setThreadCount(n_threads);
-
-    for (auto pair : m_stateFromText) {
-        const QString str = pair.first == "Prompt: " ? promptTemplate.arg(pair.second) : pair.second;
-        m_llModelInfo.model->prompt(str.toStdString(), promptFunc, responseFunc, recalcFunc, m_ctx);
-    }
-
-    if (!m_stopGenerating) {
-        m_restoreStateFromText = false;
-        m_stateFromText.clear();
-    }
-
-    m_isRecalc = false;
-    emit recalcChanged();
+    m_processedSystemPrompt = true;
 }

View File

@@ -92,9 +92,8 @@ public:
     QString generatedName() const { return QString::fromStdString(m_nameResponse); }
 
-    bool serialize(QDataStream &stream, int version, bool serializeKV);
-    bool deserialize(QDataStream &stream, int version, bool deserializeKV, bool discardKV);
-    void setStateFromText(const QVector<QPair<QString, QString>> &stateFromText) { m_stateFromText = stateFromText; }
+    bool serialize(QDataStream &stream, int version);
+    bool deserialize(QDataStream &stream, int version);
 
 public Q_SLOTS:
     bool prompt(const QList<QString> &collectionList, const QString &prompt);
@@ -111,7 +110,6 @@ public Q_SLOTS:
     void handleForceMetalChanged(bool forceMetal);
     void handleDeviceChanged();
     void processSystemPrompt();
-    void processRestoreStateFromText();
 
 Q_SIGNALS:
     void recalcChanged();
@@ -146,9 +144,6 @@ protected:
     bool handleSystemPrompt(int32_t token);
     bool handleSystemResponse(int32_t token, const std::string &response);
     bool handleSystemRecalculate(bool isRecalc);
-    bool handleRestoreStateFromTextPrompt(int32_t token);
-    bool handleRestoreStateFromTextResponse(int32_t token, const std::string &response);
-    bool handleRestoreStateFromTextRecalculate(bool isRecalc);
     void saveState();
     void restoreState();
@@ -173,8 +168,6 @@ private:
     bool m_forceMetal;
     bool m_reloadingToChangeVariant;
     bool m_processedSystemPrompt;
-    bool m_restoreStateFromText;
-    QVector<QPair<QString, QString>> m_stateFromText;
 };
 
 #endif // CHATLLM_H

View File

@@ -285,14 +285,6 @@ public:
         return stream.status() == QDataStream::Ok;
     }
 
-    QVector<QPair<QString, QString>> text() const
-    {
-        QVector<QPair<QString, QString>> result;
-        for (const auto &c : m_chatItems)
-            result << qMakePair(c.name, c.value);
-        return result;
-    }
-
 Q_SIGNALS:
     void countChanged();