Mirror of https://github.com/nomic-ai/gpt4all.git (synced 2025-06-23 00:02:10 -04:00)

Compare commits: main...python-v1.0.11
No commits in common. "main" and "python-v1.0.11" have entirely different histories.
@@ -1,22 +1,19 @@
 version: 2.1
 setup: true
 orbs:
-  path-filtering: circleci/path-filtering@1.3.0
+  path-filtering: circleci/path-filtering@0.0.1
 
 workflows:
   version: 2.1
   generate-config:
     jobs:
       - path-filtering/filter:
-          filters:
-            tags:
-              only:
-                - /.*/
           base-revision: main
           config-path: .circleci/continue_config.yml
           mapping: |
-            .circleci/.* run-all-workflows true
-            gpt4all-backend/.* run-all-workflows true
             gpt4all-bindings/python/.* run-python-workflow true
             gpt4all-bindings/typescript/.* run-ts-workflow true
+            gpt4all-bindings/csharp/.* run-csharp-workflow true
+            gpt4all-backend/.* run-chat-workflow true
             gpt4all-chat/.* run-chat-workflow true
+            .* run-default-workflow true
File diff suppressed because it is too large.
@@ -1,17 +0,0 @@
-import re
-import sys
-
-ID_REG = r"id: (.*)"
-
-def main() -> None:
-    notary_log = sys.argv[1]
-    with open(notary_log, "r") as f:
-        notary_output = f.read()
-    id_m = re.search(ID_REG, notary_output)
-    if id_m:
-        print(id_m.group(1))
-    else:
-        raise RuntimeError("Unable to parse ID from notarization logs")
-
-if __name__ == "__main__":
-    main()
@@ -1,3 +1,3 @@
 [codespell]
-ignore-words-list = blong, afterall, assistent, crasher, requestor
-skip = ./.git,./gpt4all-chat/translations,*.pdf,*.svg,*.lock
+ignore-words-list = blong, belong, afterall, som
+skip = .git,*.pdf,*.svg,*.lock
.github/ISSUE_TEMPLATE/bindings-bug.md (vendored, 35 lines)
@@ -1,35 +0,0 @@
----
-name: "\U0001F6E0 Bindings Bug Report"
-about: A bug report for the GPT4All Bindings
-labels: ["bindings", "bug-unconfirmed"]
----
-
-<!-- Before creating a new issue, please make sure to take a few moments to check the issue tracker for existing issues about the bug. -->
-
-### Bug Report
-
-<!-- A clear and concise description of what the bug is. -->
-
-### Example Code
-
-<!-- Please provide a minimal code example that can be used to experience this issue. Delete this section if it does not apply. -->
-
-### Steps to Reproduce
-
-<!-- List the steps that should be taken to experience this issue. -->
-
-1.
-2.
-3.
-
-### Expected Behavior
-
-<!-- In a few words, what did you expect to happen? -->
-
-### Your Environment
-
-- Bindings version (e.g. "Version" from `pip show gpt4all`):
-- Operating System:
-- Chat model used (if applicable):
-
-<!-- You can freely edit this text, please remove all the lines you believe are unnecessary. -->
.github/ISSUE_TEMPLATE/bug-report.yml (vendored, new file, 70 lines)
@@ -0,0 +1,70 @@
+name: "\U0001F41B Bug Report"
+description: Submit a bug report to help us improve GPT4All
+labels: ["02 Bug Report"]
+body:
+  - type: markdown
+    attributes:
+      value: >
+        Thank you for taking the time to file a bug report. Before creating a new
+        issue, please make sure to take a few moments to check the issue tracker
+        for existing issues about the bug.
+
+  - type: textarea
+    id: system-info
+    attributes:
+      label: System Info
+      description: Please share your system info with us.
+      placeholder: GPT4All version, platform, python version, etc...
+    validations:
+      required: true
+
+  - type: checkboxes
+    id: information-scripts-examples
+    attributes:
+      label: Information
+      description: "The problem arises when using:"
+      options:
+        - label: "The official example notebooks/scripts"
+        - label: "My own modified scripts"
+
+  - type: checkboxes
+    id: related-components
+    attributes:
+      label: Related Components
+      description: "Select the components related to the issue (if applicable):"
+      options:
+        - label: "backend"
+        - label: "bindings"
+        - label: "python-bindings"
+        - label: "chat-ui"
+        - label: "models"
+        - label: "circleci"
+        - label: "docker"
+        - label: "api"
+
+  - type: textarea
+    id: reproduction
+    validations:
+      required: true
+    attributes:
+      label: Reproduction
+      description: |
+        Please provide a [code sample](https://stackoverflow.com/help/minimal-reproducible-example) that reproduces the problem you ran into. It can be a Colab link or just a code snippet.
+        If you have code snippets, error messages, stack traces please provide them here as well.
+        Important! Use code tags to correctly format your code. See https://help.github.com/en/github/writing-on-github/creating-and-highlighting-code-blocks#syntax-highlighting
+        Avoid screenshots when possible, as they are hard to read and (more importantly) don't allow others to copy-and-paste your code.
+
+      placeholder: |
+        Steps to reproduce the behavior:
+
+          1.
+          2.
+          3.
+
+  - type: textarea
+    id: expected-behavior
+    validations:
+      required: true
+    attributes:
+      label: Expected behavior
+      description: "A clear and concise description of what you would expect to happen."
.github/ISSUE_TEMPLATE/chat-bug.md (vendored, 31 lines)
@@ -1,31 +0,0 @@
----
-name: "\U0001F4AC GPT4All Bug Report"
-about: A bug report for GPT4All Chat
-labels: ["chat", "bug-unconfirmed"]
----
-
-<!-- Before creating a new issue, please make sure to take a few moments to check the issue tracker for existing issues about the bug. -->
-
-### Bug Report
-
-<!-- A clear and concise description of what the bug is. -->
-
-### Steps to Reproduce
-
-<!-- List the steps that should be taken to experience this issue. Provide any relevant information about your configuration, and describe anything that was unexpected. -->
-
-1.
-2.
-3.
-
-### Expected Behavior
-
-<!-- In a few words, what did you expect to happen? -->
-
-### Your Environment
-
-- GPT4All version:
-- Operating System:
-- Chat model used (if applicable):
-
-<!-- You can freely edit this text, please remove all the lines you believe are unnecessary. -->
.github/ISSUE_TEMPLATE/config.yml (vendored, 3 lines)
@@ -1 +1,2 @@
-version: 2.1
+blank_issues_enabled: false
+version: 2.1
.github/ISSUE_TEMPLATE/documentation.md (vendored, 9 lines)
@@ -1,9 +0,0 @@
----
-name: "\U0001F4C4 Documentation"
-about: An issue related to the GPT4All documentation
-labels: ["documentation"]
----
-
-### Documentation
-
-<!-- Please describe the issue with the documentation as clearly as possible. -->
.github/ISSUE_TEMPLATE/documentation.yml
vendored
Normal file
19
.github/ISSUE_TEMPLATE/documentation.yml
vendored
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
name: Documentation
|
||||||
|
description: Report an issue related to the GPT4All documentation.
|
||||||
|
title: "DOC: <Please write a comprehensive title after the 'DOC: ' prefix>"
|
||||||
|
labels: [03 - Documentation]
|
||||||
|
|
||||||
|
body:
|
||||||
|
- type: textarea
|
||||||
|
attributes:
|
||||||
|
label: "Issue with current documentation:"
|
||||||
|
description: >
|
||||||
|
Please make sure to leave a reference to the document/code you're
|
||||||
|
referring to.
|
||||||
|
|
||||||
|
- type: textarea
|
||||||
|
attributes:
|
||||||
|
label: "Idea or request for content:"
|
||||||
|
description: >
|
||||||
|
Please describe as clearly as possible what topics you think are missing
|
||||||
|
from the current documentation.
|
.github/ISSUE_TEMPLATE/feature-request.md (vendored, 10 lines)
@@ -1,10 +0,0 @@
----
-name: "\U0001F680 Feature Request"
-about: Submit a proposal/request for a new GPT4All feature
-title: "[Feature] Feature request title..."
-labels: ["enhancement"]
----
-
-### Feature Request
-
-<!-- A clear and concise description of the feature proposal. -->
.github/ISSUE_TEMPLATE/feature-request.yml (vendored, new file, 30 lines)
@@ -0,0 +1,30 @@
+name: "\U0001F680 Feature Request"
+description: Submit a proposal/request for a new GPT4All feature
+labels: ["02 Feature Request"]
+body:
+  - type: textarea
+    id: feature-request
+    validations:
+      required: true
+    attributes:
+      label: Feature request
+      description: |
+        A clear and concise description of the feature proposal. Please provide links to any relevant GitHub repos, papers, or other resources if relevant.
+
+  - type: textarea
+    id: motivation
+    validations:
+      required: true
+    attributes:
+      label: Motivation
+      description: |
+        Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too.
+
+  - type: textarea
+    id: contribution
+    validations:
+      required: true
+    attributes:
+      label: Your contribution
+      description: |
+        Is there any way that you could help, e.g. by submitting a PR? Make sure to read the CONTRIBUTING.MD [readme](https://github.com/nomic-ai/gpt4all/blob/main/CONTRIBUTING.md)
.github/ISSUE_TEMPLATE/other-bug.md (vendored, 32 lines)
@@ -1,32 +0,0 @@
----
-name: "\U0001F41B Other Bug Report"
-about: A bug in another component of GPT4All
-labels: ["bug-unconfirmed"]
----
-
-<!-- Before creating a new issue, please make sure to take a few moments to check the issue tracker for existing issues about the bug. -->
-
-### Bug Report
-
-<!-- A clear and concise description of what the bug is. -->
-
-### Steps to Reproduce
-
-<!-- List the steps that should be taken to experience this issue. Provide any relevant information about your configuration, and describe anything that was unexpected. If this bug involves original code, please provide a minimal version that can reproduce the issue. -->
-
-1.
-2.
-3.
-
-### Expected Behavior
-
-<!-- In a few words, what did you expect to happen? -->
-
-### Your Environment
-
-- GPT4All version (if applicable):
-- Operating System:
-- Chat model used (if applicable):
-
-<!-- You can freely edit this text, please remove all the lines you believe are unnecessary. -->
-
.github/ISSUE_TEMPLATE/other.yml (vendored, new file, 18 lines)
@@ -0,0 +1,18 @@
+name: Other Issue
+description: Raise an issue that wouldn't be covered by the other templates.
+title: "Issue: <Please write a comprehensive title after the 'Issue: ' prefix>"
+labels: [04 - Other]
+
+body:
+  - type: textarea
+    attributes:
+      label: "Issue you'd like to raise."
+      description: >
+        Please describe the issue you'd like to raise as clearly as possible.
+        Make sure to include any relevant links or references.
+
+  - type: textarea
+    attributes:
+      label: "Suggestion:"
+      description: >
+        Please outline a suggestion to improve the issue here.
.github/workflows/codespell.yml (vendored, 2 lines changed)
@@ -14,6 +14,6 @@ jobs:
 
     steps:
       - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
      - name: Codespell
        uses: codespell-project/actions-codespell@v2
.gitignore (vendored, 7 lines changed)
@@ -181,11 +181,6 @@ CMakeLists.txt.user
 gpt4all-chat/models/*
 build_*
 build-*
-cmake-build-*
-/gpt4all-chat/tests/python/config.py
 
 # IntelliJ
 .idea/
-
-# LLM models
-*.gguf
.gitmodules (vendored, 30 lines changed)
@@ -1,25 +1,9 @@
+[submodule "llama.cpp-230519"]
+  path = gpt4all-backend/llama.cpp-230519
+  url = https://github.com/ggerganov/llama.cpp.git
+[submodule "llama.cpp-230511"]
+  path = gpt4all-backend/llama.cpp-230511
+  url = https://github.com/nomic-ai/llama.cpp
 [submodule "llama.cpp-mainline"]
-  path = gpt4all-backend/deps/llama.cpp-mainline
+  path = gpt4all-backend/llama.cpp-mainline
   url = https://github.com/nomic-ai/llama.cpp.git
-  branch = master
-[submodule "gpt4all-chat/usearch"]
-  path = gpt4all-chat/deps/usearch
-  url = https://github.com/nomic-ai/usearch.git
-[submodule "gpt4all-chat/deps/SingleApplication"]
-  path = gpt4all-chat/deps/SingleApplication
-  url = https://github.com/nomic-ai/SingleApplication.git
-[submodule "gpt4all-chat/deps/fmt"]
-  path = gpt4all-chat/deps/fmt
-  url = https://github.com/fmtlib/fmt.git
-[submodule "gpt4all-chat/deps/DuckX"]
-  path = gpt4all-chat/deps/DuckX
-  url = https://github.com/nomic-ai/DuckX.git
-[submodule "gpt4all-chat/deps/QXlsx"]
-  path = gpt4all-chat/deps/QXlsx
-  url = https://github.com/nomic-ai/QXlsx.git
-[submodule "gpt4all-chat/deps/minja"]
-  path = gpt4all-chat/deps/minja
-  url = https://github.com/nomic-ai/minja.git
-[submodule "gpt4all-chat/deps/json"]
-  path = gpt4all-chat/deps/json
-  url = https://github.com/nlohmann/json.git
LICENSE_SOM.txt (new file, 30 lines)
@@ -0,0 +1,30 @@
+Software for Open Models License (SOM)
+Version 1.0 dated August 30th, 2023
+
+This license governs use of the accompanying Software. If you use the Software, you accept this license. If you do not accept the license, do not use the Software.
+
+This license is intended to encourage open release of models created, modified, processed, or otherwise used via the Software under open licensing terms, and should be interpreted in light of that intent.
+
+1. Definitions
+The “Licensor” is the person or entity who is making the Software available under this license. “Software” is the software made available by Licensor under this license.
+A “Model” is the output of a machine learning algorithm, and excludes the Software.
+“Model Source Materials” must include the Model and model weights, and may include any input data, input data descriptions, documentation or training descriptions for the Model.
+“Open Licensing Terms” means: (a) any open source license approved by the Open Source Initiative, or (b) any other terms that make the Model Source Materials publicly available free of charge, and allow recipients to use, modify and distribute the Model Source Materials. Terms described in (b) may include reasonable restrictions such as non-commercial or non-production limitations, or require use in compliance with law.
+
+2. Grant of Rights. Subject to the conditions and limitations in section 3:
+(A) Copyright Grant. Licensor grants you a non-exclusive, worldwide, royalty-free copyright license to copy, modify, and distribute the Software and any modifications of the Software you create under this license. The foregoing license includes without limitation the right to create, modify, and use Models using this Software.
+
+(B) Patent Grant. Licensor grants you a non-exclusive, worldwide, royalty-free license, under any patents owned or controlled by Licensor, to make, have made, use, sell, offer for sale, import, or otherwise exploit the Software. No license is granted to patent rights that are not embodied in the operation of the Software in the form provided by Licensor.
+
+3. Conditions and Limitations
+(A) Model Licensing and Access. If you use the Software to create, modify, process, or otherwise use any Model, including usage to create inferences with a Model, whether or not you make the Model available to others, you must make that Model Source Materials publicly available under Open Licensing Terms.
+
+(B) No Re-Licensing. If you redistribute the Software, or modifications to the Software made under the license granted above, you must make it available only under the terms of this license. You may offer additional terms such as warranties, maintenance and support, but You, and not Licensor, are responsible for performing such terms.
+
+(C) No Trademark License. This license does not grant you rights to use the Licensor’s name, logo, or trademarks.
+
+(D) If you assert in writing a claim against any person or entity alleging that the use of the Software infringes any patent, all of your licenses to the Software under Section 2 end automatically as of the date you asserted the claim.
+
+(E) If you distribute any portion of the Software, you must retain all copyright, patent, trademark, and attribution notices that are present in the Software, and you must include a copy of this license.
+
+(F) The Software is licensed “as-is.” You bear the entire risk of using it. Licensor gives You no express warranties, guarantees or conditions. You may have additional consumer rights under your local laws that this license cannot change. To the extent permitted under your local laws, the Licensor disclaims and excludes the implied warranties of merchantability, fitness for a particular purpose and non-infringement. To the extent this disclaimer is unlawful, you, and not Licensor, are responsible for any liability.
@@ -1,77 +0,0 @@
-# MAINTAINERS
-
-## Rules
-
-* All content inside GPT4All shall have a documented maintainer
-* If a maintainer decides to retire or resign a call for volunteers will go
-  out
-* If no further maintainer can be found in a reasonable time frame, then the
-  content will be marked deprecated and removed in time
-
-## Job
-
-Maintainers will be...
-
-1. Responsible for overseeing content under their stewardship
-2. Responsible for triaging new issues, reviewing PRs, assigning priority
-   to tasks
-3. Responsible for keeping content in sufficient quality in a timely fashion
-
-## List
-
-Adam Treat ([@manyoso](https://github.com/manyoso))<br/>
-E-mail: adam@nomic.ai<br/>
-Discord: `@gonzochess75`
-- Overall project maintainer
-- Chat UI
-
-Jared Van Bortel ([@cebtenzzre](https://github.com/cebtenzzre))<br/>
-E-mail: jared@nomic.ai<br/>
-Discord: `@cebtenzzre`
-- gpt4all-backend
-- Python binding
-- Python CLI app
-
-Jacob Nguyen ([@jacoobes](https://github.com/jacoobes))<br/>
-Discord: `@jacoobes`<br/>
-E-mail: `jacoobes@sern.dev`
-- TypeScript binding
-
-Dominik ([@cosmic-snow](https://github.com/cosmic-snow))<br/>
-E-mail: cosmic-snow@mailfence.com<br/>
-Discord: `@cosmic__snow`
-- Community documentation (GitHub Wiki)
-
-Max Cembalest ([@mcembalest](https://github.com/mcembalest))<br/>
-E-mail: max@nomic.ai<br/>
-Discord: `@maxcembalest.`
-- Official documentation (gpt4all-bindings/python/docs -> https://docs.gpt4all.io/)
-
-Thiago Ramos ([@thiagojramos](https://github.com/thiagojramos))<br/>
-E-mail: thiagojramos@outlook.com<br/>
-- pt\_BR translation
-
-不知火 Shiranui ([@supersonictw](https://github.com/supersonictw))<br/>
-E-mail: supersonic@livemail.tw<br/>
-Discord: `@supersonictw`
-- zh\_TW translation
-
-Jeremy Tayco ([@jstayco](https://github.com/jstayco))<br/>
-E-mail: jstayco@protonmail.ch<br/>
-Discord: `@vertana`
-- es\_MX translation
-
-Riccardo Giovanetti ([@Harvester62](https://github.com/Harvester62))<br/>
-E-mail: riccardo.giovanetti@gmail.com<br/>
-Discord: `@harvester62`
-- it\_IT translation
-
-Tim ([@Tim453](https://github.com/Tim453))<br/>
-E-mail: tim453@mailbox.org<br/>
-Discord: `@Tim453`
-- Flatpak
-
-Jack ([@wuodoo](https://github.com/wuodoo))<br/>
-E-mail: 2296103047@qq.com<br/>
-Discord: `@mikage`
-- zh\_CN translation
README.md (139 lines changed)
@@ -1,117 +1,72 @@
 <h1 align="center">GPT4All</h1>
 
+<p align="center">Open-source assistant-style large language models that run locally on your CPU</p>
+
 <p align="center">
-Now with support for DeepSeek R1 Distillations
+<a href="https://gpt4all.io">GPT4All Website</a>
 </p>
 
 <p align="center">
-<a href="https://www.nomic.ai/gpt4all">Website</a> • <a href="https://docs.gpt4all.io">Documentation</a> • <a href="https://discord.gg/mGZE39AS3e">Discord</a> • <a href="https://www.youtube.com/watch?v=gQcZDXRVJok">YouTube Tutorial</a>
+<a href="https://docs.gpt4all.io">GPT4All Documentation</a>
 </p>
 
 <p align="center">
-GPT4All runs large language models (LLMs) privately on everyday desktops & laptops.
-</p>
-<p align="center">
-No API calls or GPUs required - you can just download the application and <a href="https://docs.gpt4all.io/gpt4all_desktop/quickstart.html#quickstart">get started</a>.
+<a href="https://discord.gg/mGZE39AS3e">Discord</a>
 </p>
 
 <p align="center">
-Read about what's new in <a href="https://www.nomic.ai/blog/tag/gpt4all">our blog</a>.
+<a href="https://python.langchain.com/en/latest/modules/models/llms/integrations/gpt4all.html">🦜️🔗 Official Langchain Backend</a>
 </p>
-<p align="center">
-<a href="https://nomic.ai/gpt4all/#newsletter-form">Subscribe to the newsletter</a>
-</p>
-
-https://github.com/nomic-ai/gpt4all/assets/70534565/513a0f15-4964-4109-89e4-4f9a9011f311
 
 <p align="center">
 GPT4All is made possible by our compute partner <a href="https://www.paperspace.com/">Paperspace</a>.
 </p>
 
-## Download Links
-
-<p>
-— <a href="https://gpt4all.io/installers/gpt4all-installer-win64.exe">
-<img src="gpt4all-bindings/python/docs/assets/windows.png" style="height: 1em; width: auto" /> Windows Installer
-</a> —
-</p>
-<p>
-— <a href="https://gpt4all.io/installers/gpt4all-installer-win64-arm.exe">
-<img src="gpt4all-bindings/python/docs/assets/windows.png" style="height: 1em; width: auto" /> Windows ARM Installer
-</a> —
-</p>
-<p>
-— <a href="https://gpt4all.io/installers/gpt4all-installer-darwin.dmg">
-<img src="gpt4all-bindings/python/docs/assets/mac.png" style="height: 1em; width: auto" /> macOS Installer
-</a> —
-</p>
-<p>
-— <a href="https://gpt4all.io/installers/gpt4all-installer-linux.run">
-<img src="gpt4all-bindings/python/docs/assets/ubuntu.svg" style="height: 1em; width: auto" /> Ubuntu Installer
-</a> —
-</p>
-<p>
-The Windows and Linux builds require Intel Core i3 2nd Gen / AMD Bulldozer, or better.
-</p>
-<p>
-The Windows ARM build supports Qualcomm Snapdragon and Microsoft SQ1/SQ2 processors.
-</p>
-<p>
-The Linux build is x86-64 only (no ARM).
-</p>
-<p>
-The macOS build requires Monterey 12.6 or newer. Best results with Apple Silicon M-series processors.
-</p>
+<p align="center">
+<img width="600" height="365" src="https://user-images.githubusercontent.com/13879686/231876409-e3de1934-93bb-4b4b-9013-b491a969ebbc.gif">
+</p>
+<p align="center">
+Run on an M1 macOS Device (not sped up!)
+</p>
 
-See the full [System Requirements](gpt4all-chat/system_requirements.md) for more details.
+## GPT4All: An ecosystem of open-source on-edge large language models.
+GPT4All is an ecosystem to train and deploy **powerful** and **customized** large language models that run locally on consumer grade CPUs. Note that your CPU needs to support [AVX or AVX2 instructions](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions).
 
-<br/>
-<br/>
-<p>
-<a href='https://flathub.org/apps/io.gpt4all.gpt4all'>
-<img style="height: 2em; width: auto" alt='Get it on Flathub' src='https://flathub.org/api/badge'><br/>
-Flathub (community maintained)
-</a>
-</p>
+Learn more in the [documentation](https://docs.gpt4all.io).
 
-## Install GPT4All Python
+The goal is simple - be the best instruction tuned assistant-style language model that any person or enterprise can freely use, distribute and build on.
 
-`gpt4all` gives you access to LLMs with our Python client around [`llama.cpp`](https://github.com/ggerganov/llama.cpp) implementations.
+A GPT4All model is a 3GB - 8GB file that you can download and plug into the GPT4All open-source ecosystem software. **Nomic AI** supports and maintains this software ecosystem to enforce quality and security alongside spearheading the effort to allow any person or enterprise to easily train and deploy their own on-edge large language models.
 
-Nomic contributes to open source software like [`llama.cpp`](https://github.com/ggerganov/llama.cpp) to make LLMs accessible and efficient **for all**.
-
-```bash
-pip install gpt4all
-```
-
-```python
-from gpt4all import GPT4All
-model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf") # downloads / loads a 4.66GB LLM
-with model.chat_session():
-    print(model.generate("How can I run LLMs efficiently on my laptop?", max_tokens=1024))
-```
-
-## Integrations
+### Chat Client
+Run any GPT4All model natively on your home desktop with the auto-updating desktop chat client. See <a href="https://gpt4all.io">GPT4All Website</a> for a full list of open-source models you can run with this powerful desktop application.
 
-:parrot::link: [Langchain](https://python.langchain.com/v0.2/docs/integrations/providers/gpt4all/)
-:card_file_box: [Weaviate Vector Database](https://github.com/weaviate/weaviate) - [module docs](https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules/text2vec-gpt4all)
-:telescope: [OpenLIT (OTel-native Monitoring)](https://github.com/openlit/openlit) - [Docs](https://docs.openlit.io/latest/integrations/gpt4all)
+Direct Installer Links:
 
-## Release History
-- **July 2nd, 2024**: V3.0.0 Release
-  - Fresh redesign of the chat application UI
-  - Improved user workflow for LocalDocs
-  - Expanded access to more model architectures
-- **October 19th, 2023**: GGUF Support Launches with Support for:
-  - Mistral 7b base model, an updated model gallery on our website, several new local code models including Rift Coder v1.5
-  - [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) support for Q4\_0 and Q4\_1 quantizations in GGUF.
-  - Offline build support for running old versions of the GPT4All Local LLM Chat Client.
-- **September 18th, 2023**: [Nomic Vulkan](https://blog.nomic.ai/posts/gpt4all-gpu-inference-with-vulkan) launches supporting local LLM inference on NVIDIA and AMD GPUs.
-- **July 2023**: Stable support for LocalDocs, a feature that allows you to privately and locally chat with your data.
-- **June 28th, 2023**: [Docker-based API server] launches allowing inference of local LLMs from an OpenAI-compatible HTTP endpoint.
-
-[Docker-based API server]: https://github.com/nomic-ai/gpt4all/tree/cef74c2be20f5b697055d5b8b506861c7b997fab/gpt4all-api
+* [macOS](https://gpt4all.io/installers/gpt4all-installer-darwin.dmg)
+* [Windows](https://gpt4all.io/installers/gpt4all-installer-win64.exe)
+* [Ubuntu](https://gpt4all.io/installers/gpt4all-installer-linux.run)
+
+Find the most up-to-date information on the [GPT4All Website](https://gpt4all.io/)
+
+### Chat Client building and running
+
+* Follow the visual instructions on the chat client [build_and_run](gpt4all-chat/build_and_run.md) page
+
+### Bindings
+
+* <a href="https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python/README.md">:snake: Official Python Bindings</a> [](https://pepy.tech/project/gpt4all)
+* <a href="https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/typescript">:computer: Official Typescript Bindings</a>
+* <a href="https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/golang">:computer: Official GoLang Bindings</a>
+* <a href="https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/csharp">:computer: Official C# Bindings</a>
+* <a href="https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/java">:computer: Official Java Bindings</a>
+
+### Integrations
+
+* 🗃️ [Weaviate Vector Database](https://github.com/weaviate/weaviate) - [module docs](https://weaviate.io/developers/weaviate/modules/retriever-vectorizer-modules/text2vec-gpt4all)
+
 ## Contributing
 GPT4All welcomes contributions, involvement, and discussion from the open source community!
@@ -121,6 +76,20 @@ Check project discord, with project owners, or through existing issues/PRs to av
 Please make sure to tag all of the above with relevant project identifiers or your contribution could potentially get lost.
 Example tags: `backend`, `bindings`, `python-bindings`, `documentation`, etc.
 
+## Technical Reports
+
+<p align="center">
+<a href="https://gpt4all.io/reports/GPT4All_Technical_Report_3.pdf">:green_book: Technical Report 3: GPT4All Snoozy and Groovy </a>
+</p>
+
+<p align="center">
+<a href="https://static.nomic.ai/gpt4all/2023_GPT4All-J_Technical_Report_2.pdf">:green_book: Technical Report 2: GPT4All-J </a>
+</p>
+
+<p align="center">
+<a href="https://s3.amazonaws.com/static.nomic.ai/gpt4all/2023_GPT4All_Technical_Report.pdf">:green_book: Technical Report 1: GPT4All</a>
+</p>
+
 ## Citation
 
 If you utilize this repository, models or data in a downstream project, please consider citing it with:
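The main-branch README quickstart above loads a model and generates inside a `chat_session()`. The same Python client can also stream tokens as they are produced rather than returning one string; this is a minimal sketch under the assumption that the `gpt4all` package and the model file from the quickstart are available (the `streaming=True` iterator usage mirrors how `routes/completions.py` later in this diff consumes `GPT4All.generate`):

```python
from gpt4all import GPT4All

# Same model file as the README quickstart; downloaded on first use (~4.66 GB).
model = GPT4All("Meta-Llama-3-8B-Instruct.Q4_0.gguf")

with model.chat_session():
    # With streaming=True, generate() yields text chunks as they are produced
    # instead of returning a single string.
    for chunk in model.generate("How can I run LLMs efficiently on my laptop?",
                                max_tokens=256, streaming=True):
        print(chunk, end="", flush=True)
print()
```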
@@ -1,41 +0,0 @@
-function(gpt4all_add_warning_options target)
-    if (MSVC)
-        return()
-    endif()
-    target_compile_options("${target}" PRIVATE
-        # base options
-        -Wall
-        -Wextra
-        # extra options
-        -Wcast-align
-        -Wextra-semi
-        -Wformat=2
-        -Wmissing-include-dirs
-        -Wsuggest-override
-        -Wvla
-        # errors
-        -Werror=format-security
-        -Werror=init-self
-        -Werror=pointer-arith
-        -Werror=undef
-        # disabled warnings
-        -Wno-sign-compare
-        -Wno-unused-parameter
-    )
-    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
-        target_compile_options("${target}" PRIVATE
-            -Wduplicated-branches
-            -Wduplicated-cond
-            -Wlogical-op
-            -Wno-reorder
-            -Wno-null-dereference
-        )
-    elseif (CMAKE_CXX_COMPILER_ID MATCHES "^(Apple)?Clang$")
-        target_compile_options("${target}" PRIVATE
-            -Wunreachable-code-break
-            -Wunreachable-code-return
-            -Werror=pointer-integer-compare
-            -Wno-reorder-ctor
-        )
-    endif()
-endfunction()
gpt4all-api/.gitignore (vendored, new file, 112 lines)
@@ -0,0 +1,112 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+app/__pycache__/
+gpt4all_api/__pycache__/
+gpt4all_api/app/api_v1/__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# VS Code
+.vscode/
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+*.lock
+*.cache
gpt4all-api/.isort.cfg (new file, 7 lines)
@@ -0,0 +1,7 @@
+[settings]
+known_third_party=geopy,nltk,np,numpy,pandas,pysbd,fire,torch
+
+line_length=120
+include_trailing_comma=True
+multi_line_output=3
+use_parentheses=True
gpt4all-api/LICENSE (new file, 13 lines)
@@ -0,0 +1,13 @@
+Copyright 2023 Nomic, Inc.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
gpt4all-api/README.md (new file, 87 lines)
@@ -0,0 +1,87 @@
+# GPT4All REST API
+This directory contains the source code to run and build docker images that run a FastAPI app
+for serving inference from GPT4All models. The API matches the OpenAI API spec.
+
+## Tutorial
+
+The following tutorial assumes that you have checked out this repo and cd'd into it.
+
+### Starting the app
+
+First change your working directory to `gpt4all/gpt4all-api`.
+
+Now you can build the FastAPI docker image. You only have to do this on initial build or when you add new dependencies to the requirements.txt file:
+```bash
+DOCKER_BUILDKIT=1 docker build -t gpt4all_api --progress plain -f gpt4all_api/Dockerfile.buildkit .
+```
+
+Then, start the backend with:
+
+```bash
+docker compose up --build
+```
+
+This will run both the API and locally hosted GPU inference server. If you want to run the API without the GPU inference server, you can run:
+
+```bash
+docker compose up --build gpt4all_api
+```
+
+To run the API with the GPU inference server, you will need to include environment variables (like the `MODEL_ID`). Edit the `.env` file and run
+```bash
+docker compose --env-file .env up --build
+```
+
+
+#### Spinning up your app
+Run `docker compose up` to spin up the backend. Monitor the logs for errors in-case you forgot to set an environment variable above.
+
+
+#### Development
+Run
+
+```bash
+docker compose up --build
+```
+and edit files in the `api` directory. The api will hot-reload on changes.
+
+You can run the unit tests with
+
+```bash
+make test
+```
+
+#### Viewing API documentation
+
+Once the FastAPI ap is started you can access its documentation and test the search endpoint by going to:
+```
+localhost:80/docs
+```
+
+This documentation should match the OpenAI OpenAPI spec located at https://github.com/openai/openai-openapi/blob/master/openapi.yaml
+
+
+#### Running inference
+```python
+import openai
+openai.api_base = "http://localhost:4891/v1"
+
+openai.api_key = "not needed for a local LLM"
+
+
+def test_completion():
+    model = "gpt4all-j-v1.3-groovy"
+    prompt = "Who is Michael Jordan?"
+    response = openai.Completion.create(
+        model=model,
+        prompt=prompt,
+        max_tokens=50,
+        temperature=0.28,
+        top_p=0.95,
+        n=1,
+        echo=True,
+        stream=False
+    )
+    assert len(response['choices'][0]['text']) > len(prompt)
+    print(response)
+```
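The `test_completion` example above issues a blocking request. The completions route in this API also supports streamed responses (see `stream_completion` in `gpt4all_api/app/api_v1/routes/completions.py` further down). A hedged sketch of consuming that stream with the same legacy `openai` client configuration, assuming the server is running on port 4891 and was booted with the default CPU model from `docker-compose.yaml`:

```python
import openai

# Same local endpoint configuration as the README example above.
openai.api_base = "http://localhost:4891/v1"
openai.api_key = "not needed for a local LLM"

# stream=True makes the legacy client yield chunks as they arrive; each chunk's
# choices[0]['text'] carries the next piece of generated text, matching the
# CompletionStreamResponse objects emitted by routes/completions.py.
for chunk in openai.Completion.create(
    model="ggml-mpt-7b-chat.bin",  # assumption: the model the server was started with
    prompt="Who is Michael Jordan?",
    max_tokens=50,
    stream=True,
):
    print(chunk["choices"][0]["text"], end="", flush=True)
print()
```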
gpt4all-api/docker-compose.gpu.yaml (new file, 24 lines)
@@ -0,0 +1,24 @@
+version: "3.8"
+
+services:
+  gpt4all_gpu:
+    image: ghcr.io/huggingface/text-generation-inference:0.9.3
+    container_name: gpt4all_gpu
+    restart: always #restart on error (usually code compilation from save during bad state)
+    environment:
+      - HUGGING_FACE_HUB_TOKEN=token
+      - USE_FLASH_ATTENTION=false
+      - MODEL_ID=''
+      - NUM_SHARD=1
+    command: --model-id $MODEL_ID --num-shard $NUM_SHARD
+    volumes:
+      - ./:/data
+    ports:
+      - "8080:80"
+    shm_size: 1g
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              capabilities: [gpu]
gpt4all-api/docker-compose.yaml (new file, 19 lines)
@@ -0,0 +1,19 @@
+version: "3.8"
+
+services:
+  gpt4all_api:
+    image: gpt4all_api
+    container_name: gpt4all_api
+    restart: always #restart on error (usually code compilation from save during bad state)
+    ports:
+      - "4891:4891"
+    environment:
+      - APP_ENVIRONMENT=dev
+      - WEB_CONCURRENCY=2
+      - LOGLEVEL=debug
+      - PORT=4891
+      - model=ggml-mpt-7b-chat.bin
+      - inference_mode=cpu
+    volumes:
+      - './gpt4all_api/app:/app'
+    command: ["/start-reload.sh"]
gpt4all-api/gpt4all_api/Dockerfile.buildkit (new file, 23 lines)
@@ -0,0 +1,23 @@
+# syntax=docker/dockerfile:1.0.0-experimental
+FROM tiangolo/uvicorn-gunicorn:python3.11
+
+ARG MODEL_BIN=ggml-mpt-7b-chat.bin
+
+# Put first so anytime this file changes other cached layers are invalidated.
+COPY gpt4all_api/requirements.txt /requirements.txt
+
+RUN pip install --upgrade pip
+
+# Run various pip install commands with ssh keys from host machine.
+RUN --mount=type=ssh pip install -r /requirements.txt && \
+    rm -Rf /root/.cache && rm -Rf /tmp/pip-install*
+
+# Finally, copy app and client.
+COPY gpt4all_api/app /app
+
+RUN mkdir -p /models
+
+# Include the following line to bake a model into the image and not have to download it on API start.
+RUN wget -q --show-progress=off https://gpt4all.io/models/${MODEL_BIN} -P /models \
+    && md5sum /models/${MODEL_BIN}
+
gpt4all-api/gpt4all_api/README.md (new file, 1 line)
@@ -0,0 +1 @@
+# FastAPI app for serving GPT4All models
gpt4all-api/gpt4all_api/app/api_v1/__init__.py (new empty file, 0 lines)
gpt4all-api/gpt4all_api/app/api_v1/api.py (new file, 9 lines)
@@ -0,0 +1,9 @@
+from api_v1.routes import chat, completions, engines, health
+from fastapi import APIRouter
+
+router = APIRouter()
+
+router.include_router(chat.router)
+router.include_router(completions.router)
+router.include_router(engines.router)
+router.include_router(health.router)
gpt4all-api/gpt4all_api/app/api_v1/events.py (new file, 29 lines)
@@ -0,0 +1,29 @@
+import logging
+
+from api_v1.settings import settings
+from fastapi import HTTPException
+from fastapi.responses import JSONResponse
+from starlette.requests import Request
+
+log = logging.getLogger(__name__)
+
+
+startup_msg_fmt = """
+Starting up GPT4All API
+"""
+
+
+async def on_http_error(request: Request, exc: HTTPException):
+    return JSONResponse({'detail': exc.detail}, status_code=exc.status_code)
+
+
+async def on_startup(app):
+    startup_msg = startup_msg_fmt.format(settings=settings)
+    log.info(startup_msg)
+
+
+def startup_event_handler(app):
+    async def start_app() -> None:
+        await on_startup(app)
+
+    return start_app
gpt4all-api/gpt4all_api/app/api_v1/routes/chat.py (new file, 61 lines)
@@ -0,0 +1,61 @@
+import logging
+import time
+from typing import Dict, List
+
+from api_v1.settings import settings
+from fastapi import APIRouter, Depends, Response, Security, status
+from pydantic import BaseModel, Field
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.DEBUG)
+
+### This should follow https://github.com/openai/openai-openapi/blob/master/openapi.yaml
+
+
+class ChatCompletionMessage(BaseModel):
+    role: str
+    content: str
+
+
+class ChatCompletionRequest(BaseModel):
+    model: str = Field(..., description='The model to generate a completion from.')
+    messages: List[ChatCompletionMessage] = Field(..., description='The model to generate a completion from.')
+
+
+class ChatCompletionChoice(BaseModel):
+    message: ChatCompletionMessage
+    index: int
+    finish_reason: str
+
+
+class ChatCompletionUsage(BaseModel):
+    prompt_tokens: int
+    completion_tokens: int
+    total_tokens: int
+
+
+class ChatCompletionResponse(BaseModel):
+    id: str
+    object: str = 'text_completion'
+    created: int
+    model: str
+    choices: List[ChatCompletionChoice]
+    usage: ChatCompletionUsage
+
+
+router = APIRouter(prefix="/chat", tags=["Completions Endpoints"])
+
+
+@router.post("/completions", response_model=ChatCompletionResponse)
+async def chat_completion(request: ChatCompletionRequest):
+    '''
+    Completes a GPT4All model response.
+    '''
+
+    return ChatCompletionResponse(
+        id='asdf',
+        created=time.time(),
+        model=request.model,
+        choices=[{}],
+        usage={'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0},
+    )
215
gpt4all-api/gpt4all_api/app/api_v1/routes/completions.py
Normal file
215
gpt4all-api/gpt4all_api/app/api_v1/routes/completions.py
Normal file
@ -0,0 +1,215 @@
|
|||||||
|
import json
|
||||||
|
from typing import List, Dict, Iterable, AsyncIterable
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from typing import Dict, List, Union, Optional
|
||||||
|
from uuid import uuid4
|
||||||
|
import aiohttp
|
||||||
|
import asyncio
|
||||||
|
from api_v1.settings import settings
|
||||||
|
from fastapi import APIRouter, Depends, Response, Security, status, HTTPException
|
||||||
|
from fastapi.responses import StreamingResponse
|
||||||
|
from gpt4all import GPT4All
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.setLevel(logging.DEBUG)
|
||||||
|
|
||||||
|
|
||||||
|
### This should follow https://github.com/openai/openai-openapi/blob/master/openapi.yaml
|
||||||
|
|
||||||
|
|
||||||
|
class CompletionRequest(BaseModel):
|
||||||
|
model: str = Field(settings.model, description='The model to generate a completion from.')
|
||||||
|
prompt: Union[List[str], str] = Field(..., description='The prompt to begin completing from.')
|
||||||
|
max_tokens: int = Field(None, description='Max tokens to generate')
|
||||||
|
temperature: float = Field(settings.temp, description='Model temperature')
|
||||||
|
top_p: Optional[float] = Field(settings.top_p, description='top_p')
|
||||||
|
top_k: Optional[int] = Field(settings.top_k, description='top_k')
|
||||||
|
n: int = Field(1, description='How many completions to generate for each prompt')
|
||||||
|
stream: bool = Field(False, description='Stream responses')
|
||||||
|
repeat_penalty: float = Field(settings.repeat_penalty, description='Repeat penalty')
|
||||||
|
|
||||||
|
|
||||||
|
class CompletionChoice(BaseModel):
|
||||||
|
text: str
|
||||||
|
index: int
|
||||||
|
logprobs: float
|
||||||
|
finish_reason: str
|
||||||
|
|
||||||
|
|
||||||
|
class CompletionUsage(BaseModel):
|
||||||
|
prompt_tokens: int
|
||||||
|
completion_tokens: int
|
||||||
|
total_tokens: int
|
||||||
|
|
||||||
|
|
||||||
|
class CompletionResponse(BaseModel):
|
||||||
|
id: str
|
||||||
|
object: str = 'text_completion'
|
||||||
|
created: int
|
||||||
|
model: str
|
||||||
|
choices: List[CompletionChoice]
|
||||||
|
usage: CompletionUsage
|
||||||
|
|
||||||
|
|
||||||
|
class CompletionStreamResponse(BaseModel):
|
||||||
|
id: str
|
||||||
|
object: str = 'text_completion'
|
||||||
|
created: int
|
||||||
|
model: str
|
||||||
|
choices: List[CompletionChoice]
|
||||||
|
|
||||||
|
|
||||||
|
router = APIRouter(prefix="/completions", tags=["Completion Endpoints"])
|
||||||
|
|
||||||
|
def stream_completion(output: Iterable, base_response: CompletionStreamResponse):
|
||||||
|
"""
|
||||||
|
Streams a GPT4All output to the client.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
output: The output of GPT4All.generate(), which is an iterable of tokens.
|
||||||
|
base_response: The base response object, which is cloned and modified for each token.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A Generator of CompletionStreamResponse objects, which are serialized to JSON Event Stream format.
|
||||||
|
"""
|
||||||
|
for token in output:
|
||||||
|
chunk = base_response.copy()
|
||||||
|
chunk.choices = [dict(CompletionChoice(
|
||||||
|
text=token,
|
||||||
|
index=0,
|
||||||
|
logprobs=-1,
|
||||||
|
finish_reason=''
|
||||||
|
))]
|
||||||
|
yield f"data: {json.dumps(dict(chunk))}\n\n"
|
||||||
|
|
||||||
|
async def gpu_infer(payload, header):
|
||||||
|
async with aiohttp.ClientSession() as session:
|
||||||
|
try:
|
||||||
|
async with session.post(
|
||||||
|
settings.hf_inference_server_host, headers=header, data=json.dumps(payload)
|
||||||
|
) as response:
|
||||||
|
resp = await response.json()
|
||||||
|
return resp
|
||||||
|
|
||||||
|
except aiohttp.ClientError as e:
|
||||||
|
# Handle client-side errors (e.g., connection error, invalid URL)
|
||||||
|
logger.error(f"Client error: {e}")
|
||||||
|
except aiohttp.ServerError as e:
|
||||||
|
# Handle server-side errors (e.g., internal server error)
|
||||||
|
logger.error(f"Server error: {e}")
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
# Handle JSON decoding errors
|
||||||
|
logger.error(f"JSON decoding error: {e}")
|
||||||
|
except Exception as e:
|
||||||
|
# Handle other unexpected exceptions
|
||||||
|
logger.error(f"Unexpected error: {e}")
|
||||||
|
|
||||||
|
@router.post("/", response_model=CompletionResponse)
async def completions(request: CompletionRequest):
    '''
    Completes a GPT4All model response.
    '''
    if settings.inference_mode == "gpu":
        params = request.dict(exclude={'model', 'prompt', 'max_tokens', 'n'})
        params["max_new_tokens"] = request.max_tokens
        params["num_return_sequences"] = request.n

        header = {"Content-Type": "application/json"}
        if isinstance(request.prompt, list):
            tasks = []
            for prompt in request.prompt:
                payload = {"parameters": params}
                payload["inputs"] = prompt
                task = gpu_infer(payload, header)
                tasks.append(task)
            results = await asyncio.gather(*tasks)

            choices = []
            for response in results:
                scores = response["scores"] if "scores" in response else -1.0
                choices.append(
                    dict(
                        CompletionChoice(
                            text=response["generated_text"], index=0, logprobs=scores, finish_reason='stop'
                        )
                    )
                )

            return CompletionResponse(
                id=str(uuid4()),
                created=time.time(),
                model=request.model,
                choices=choices,
                usage={'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0},
            )

        else:
            payload = {"parameters": params}
            # If streaming, we need to return a StreamingResponse
            payload["inputs"] = request.prompt

            resp = await gpu_infer(payload, header)

            output = resp["generated_text"]
            # this returns all logprobs
            scores = resp["scores"] if "scores" in resp else -1.0

            return CompletionResponse(
                id=str(uuid4()),
                created=time.time(),
                model=request.model,
                choices=[dict(CompletionChoice(text=output, index=0, logprobs=scores, finish_reason='stop'))],
                usage={'prompt_tokens': 0, 'completion_tokens': 0, 'total_tokens': 0},
            )

    else:

        if request.model != settings.model:
            raise HTTPException(status_code=400,
                                detail=f"The GPT4All inference server is booted to only infer: `{settings.model}`")

        if isinstance(request.prompt, list):
            if len(request.prompt) > 1:
                raise HTTPException(status_code=400, detail="Can only infer one inference per request in CPU mode.")
            else:
                request.prompt = request.prompt[0]

        model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)

        output = model.generate(prompt=request.prompt,
                                max_tokens=request.max_tokens,
                                streaming=request.stream,
                                top_k=request.top_k,
                                top_p=request.top_p,
                                temp=request.temperature,
                                )

        # If streaming, we need to return a StreamingResponse
        if request.stream:
            base_chunk = CompletionStreamResponse(
                id=str(uuid4()),
                created=time.time(),
                model=request.model,
                choices=[]
            )
            return StreamingResponse((response for response in stream_completion(output, base_chunk)),
                                     media_type="text/event-stream")
        else:
            return CompletionResponse(
                id=str(uuid4()),
                created=time.time(),
                model=request.model,
                choices=[dict(CompletionChoice(
                    text=output,
                    index=0,
                    logprobs=-1,
                    finish_reason='stop'
                ))],
                usage={
                    'prompt_tokens': 0,  # TODO how to compute this?
                    'completion_tokens': 0,
                    'total_tokens': 0
                }
            )
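For orientation, here is a minimal client-side sketch of how the CPU branch above can be exercised once the service is running. It assumes the server listens on localhost:4891 (the address used by the test suite later in this diff) and that the JSON fields simply mirror the attributes the handler reads from CompletionRequest; adjust names and values to your deployment.

import requests  # requests is pinned in gpt4all_api/requirements.txt

API_URL = "http://localhost:4891/v1/completions"  # assumed host/port

payload = {
    "model": "ggml-mpt-7b-chat.bin",   # must match settings.model in CPU mode
    "prompt": "Who is Michael Jordan?",
    "max_tokens": 50,
    "temperature": 0.28,
    "top_p": 0.95,
    "top_k": 50,
    "stream": False,
}

resp = requests.post(API_URL, json=payload, timeout=600)
resp.raise_for_status()
print(resp.json()["choices"][0]["text"])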
gpt4all-api/gpt4all_api/app/api_v1/routes/embeddings.py (new file, 65 lines)
@@ -0,0 +1,65 @@
from typing import List, Union
from fastapi import APIRouter
from api_v1.settings import settings
from gpt4all import Embed4All
from pydantic import BaseModel, Field

### This should follow https://github.com/openai/openai-openapi/blob/master/openapi.yaml


class EmbeddingRequest(BaseModel):
    model: str = Field(
        settings.model, description="The model to generate an embedding from."
    )
    input: Union[str, List[str], List[int], List[List[int]]] = Field(
        ..., description="Input text to embed, encoded as a string or array of tokens."
    )


class EmbeddingUsage(BaseModel):
    prompt_tokens: int = 0
    total_tokens: int = 0


class Embedding(BaseModel):
    index: int = 0
    object: str = "embedding"
    embedding: List[float]


class EmbeddingResponse(BaseModel):
    object: str = "list"
    model: str
    data: List[Embedding]
    usage: EmbeddingUsage


router = APIRouter(prefix="/embeddings", tags=["Embedding Endpoints"])

embedder = Embed4All()


def get_embedding(data: EmbeddingRequest) -> EmbeddingResponse:
    """
    Calculates the embedding for the given input using a specified model.

    Args:
        data (EmbeddingRequest): An EmbeddingRequest object containing the input data
        and model name.

    Returns:
        EmbeddingResponse: An EmbeddingResponse object encapsulating the calculated embedding,
        usage info, and the model name.
    """
    embedding = embedder.embed(data.input)
    return EmbeddingResponse(
        data=[Embedding(embedding=embedding)], usage=EmbeddingUsage(), model=data.model
    )


@router.post("/", response_model=EmbeddingResponse)
def embeddings(data: EmbeddingRequest):
    """
    Creates a GPT4All embedding
    """
    return get_embedding(data)
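As a quick sanity check of what this route wraps, Embed4All from the gpt4all Python bindings can also be called directly; the snippet below is an illustration only, and the 384-value output size reflects the default MiniLM embedder the bindings download rather than anything defined in this file.

from gpt4all import Embed4All

embedder = Embed4All()  # downloads the default embedding model on first use
vector = embedder.embed("Who is Michael Jordan?")
print(len(vector))      # a plain list of floats, e.g. 384 values for the default model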
gpt4all-api/gpt4all_api/app/api_v1/routes/engines.py (new file, 40 lines)
@@ -0,0 +1,40 @@
import logging
from typing import Dict, List

from api_v1.settings import settings
from fastapi import APIRouter, Depends, Response, Security, status
from pydantic import BaseModel, Field

logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

### This should follow https://github.com/openai/openai-openapi/blob/master/openapi.yaml


class ListEnginesResponse(BaseModel):
    data: List[Dict] = Field(..., description="All available models.")


class EngineResponse(BaseModel):
    data: List[Dict] = Field(..., description="All available models.")


router = APIRouter(prefix="/engines", tags=["Search Endpoints"])


@router.get("/", response_model=ListEnginesResponse)
async def list_engines():
    '''
    List all available GPT4All models from
    https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-chat/metadata/models.json
    '''
    raise NotImplementedError()
    return ListEnginesResponse(data=[])


@router.get("/{engine_id}", response_model=EngineResponse)
async def retrieve_engine(engine_id: str):
    ''' '''

    raise NotImplementedError()
    return EngineResponse()
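Both routes above are still stubs (they raise NotImplementedError before their return statements). Purely as an illustration of the lookup the list_engines docstring describes, a helper along these lines could populate ListEnginesResponse.data; the models.json layout is assumed from that URL, not defined here.

from typing import Dict, List

import requests

MODELS_JSON = "https://raw.githubusercontent.com/nomic-ai/gpt4all/main/gpt4all-chat/metadata/models.json"


def fetch_available_engines() -> List[Dict]:
    # Hypothetical helper: returns the raw model metadata entries from models.json.
    resp = requests.get(MODELS_JSON, timeout=30)
    resp.raise_for_status()
    return resp.json()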
gpt4all-api/gpt4all_api/app/api_v1/routes/health.py (new file, 13 lines)
@@ -0,0 +1,13 @@
import logging
from fastapi import APIRouter
from fastapi.responses import JSONResponse

log = logging.getLogger(__name__)

router = APIRouter(prefix="/health", tags=["Health"])


@router.get('/', response_class=JSONResponse)
async def health_check():
    """Runs a health check on this instance of the API."""
    return JSONResponse({'status': 'ok'}, headers={'Access-Control-Allow-Origin': '*'})
gpt4all-api/gpt4all_api/app/api_v1/settings.py (new file, 19 lines)
@@ -0,0 +1,19 @@
from pydantic import BaseSettings


class Settings(BaseSettings):
    app_environment = 'dev'
    model: str = 'ggml-mpt-7b-chat.bin'
    gpt4all_path: str = '/models'
    inference_mode: str = "cpu"
    hf_inference_server_host: str = "http://gpt4all_gpu:80/generate"
    sentry_dns: str = None

    temp: float = 0.18
    top_p: float = 1.0
    top_k: int = 50
    repeat_penalty: float = 1.18



settings = Settings()
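Because Settings extends pydantic's BaseSettings (v1, per the <2.0 pin in requirements.txt further down), every field can be overridden with an environment variable of the same, case-insensitive name, which makes the container easy to configure from docker-compose or the shell. A minimal sketch, with illustrative values only:

import os

os.environ["MODEL"] = "ggml-gpt4all-j-v1.3-groovy.bin"  # hypothetical override
os.environ["INFERENCE_MODE"] = "cpu"

from api_v1.settings import Settings

print(Settings().model)  # -> ggml-gpt4all-j-v1.3-groovy.bin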
gpt4all-api/gpt4all_api/app/docs.py (new file, 3 lines)
@@ -0,0 +1,3 @@
desc = 'GPT4All API'

endpoint_paths = {'health': '/health'}
gpt4all-api/gpt4all_api/app/main.py (new file, 84 lines)
@@ -0,0 +1,84 @@
import logging
import os

import docs
from api_v1 import events
from api_v1.api import router as v1_router
from api_v1.settings import settings
from fastapi import FastAPI, HTTPException, Request
from fastapi.logger import logger as fastapi_logger
from starlette.middleware.cors import CORSMiddleware

logger = logging.getLogger(__name__)

app = FastAPI(title='GPT4All API', description=docs.desc)

# CORS Configuration (in-case you want to deploy)
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["GET", "POST", "OPTIONS"],
    allow_headers=["*"],
)

logger.info('Adding v1 endpoints..')

# add v1
app.include_router(v1_router, prefix='/v1')
app.add_event_handler('startup', events.startup_event_handler(app))
app.add_exception_handler(HTTPException, events.on_http_error)


@app.on_event("startup")
async def startup():
    global model
    if settings.inference_mode == "cpu":
        logger.info(f"Downloading/fetching model: {os.path.join(settings.gpt4all_path, settings.model)}")
        from gpt4all import GPT4All

        model = GPT4All(model_name=settings.model, model_path=settings.gpt4all_path)

        logger.info(f"GPT4All API is ready to infer from {settings.model} on CPU.")

    else:
        # is it possible to do this once the server is up?
        ## TODO block until HF inference server is up.
        logger.info(f"GPT4All API is ready to infer from {settings.model} on GPU.")




@app.on_event("shutdown")
async def shutdown():
    logger.info("Shutting down API")


if settings.sentry_dns is not None:
    import sentry_sdk

    def traces_sampler(sampling_context):
        if 'health' in sampling_context['transaction_context']['name']:
            return False

    sentry_sdk.init(
        dsn=settings.sentry_dns, traces_sample_rate=0.1, traces_sampler=traces_sampler, send_default_pii=False
    )

# This is needed to get logs to show up in the app
if "gunicorn" in os.environ.get("SERVER_SOFTWARE", ""):
    gunicorn_error_logger = logging.getLogger("gunicorn.error")
    gunicorn_logger = logging.getLogger("gunicorn")

    root_logger = logging.getLogger()
    fastapi_logger.setLevel(gunicorn_logger.level)
    fastapi_logger.handlers = gunicorn_error_logger.handlers
    root_logger.setLevel(gunicorn_logger.level)

    uvicorn_logger = logging.getLogger("uvicorn.access")
    uvicorn_logger.handlers = gunicorn_error_logger.handlers
else:
    # https://github.com/tiangolo/fastapi/issues/2019
    LOG_FORMAT2 = (
        "[%(asctime)s %(process)d:%(threadName)s] %(name)s - %(levelname)s - %(message)s | %(filename)s:%(lineno)d"
    )
    logging.basicConfig(level=logging.INFO, format=LOG_FORMAT2)
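For local experimentation outside the Docker setup, the app object above can be served directly with uvicorn. The module path, port, and the uvicorn dependency itself are assumptions here (uvicorn is not pinned in requirements.txt), so treat this as a sketch rather than the project's supported entry point.

# run_dev.py -- hypothetical helper, run from gpt4all-api/gpt4all_api/app
import uvicorn

if __name__ == "__main__":
    # "main:app" refers to the FastAPI instance created in main.py above.
    uvicorn.run("main:app", host="0.0.0.0", port=4891, reload=True)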
gpt4all-api/gpt4all_api/app/tests/test_endpoints.py (new file, 59 lines)
@@ -0,0 +1,59 @@
"""
Use the OpenAI python API to test gpt4all models.
"""
from typing import List, get_args

import openai

openai.api_base = "http://localhost:4891/v1"

openai.api_key = "not needed for a local LLM"


def test_completion():
    model = "ggml-mpt-7b-chat.bin"
    prompt = "Who is Michael Jordan?"
    response = openai.Completion.create(
        model=model, prompt=prompt, max_tokens=50, temperature=0.28, top_p=0.95, n=1, echo=True, stream=False
    )
    assert len(response['choices'][0]['text']) > len(prompt)

def test_streaming_completion():
    model = "ggml-mpt-7b-chat.bin"
    prompt = "Who is Michael Jordan?"
    tokens = []
    for resp in openai.Completion.create(
            model=model,
            prompt=prompt,
            max_tokens=50,
            temperature=0.28,
            top_p=0.95,
            n=1,
            echo=True,
            stream=True):
        tokens.append(resp.choices[0].text)

    assert (len(tokens) > 0)
    assert (len("".join(tokens)) > len(prompt))


def test_batched_completion():
    model = "ggml-mpt-7b-chat.bin"
    prompt = "Who is Michael Jordan?"
    response = openai.Completion.create(
        model=model, prompt=[prompt] * 3, max_tokens=50, temperature=0.28, top_p=0.95, n=1, echo=True, stream=False
    )
    assert len(response['choices'][0]['text']) > len(prompt)
    assert len(response['choices']) == 3


def test_embedding():
    model = "ggml-all-MiniLM-L6-v2-f16.bin"
    prompt = "Who is Michael Jordan?"
    response = openai.Embedding.create(model=model, input=prompt)
    output = response["data"][0]["embedding"]
    args = get_args(List[float])

    assert response["model"] == model
    assert isinstance(output, list)
    assert all(isinstance(x, args) for x in output)
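These tests assume an API instance is already listening on localhost:4891 and that the pre-1.0 openai client interface (openai.Completion / openai.Embedding) is installed. In the containerized workflow the makefile's test target below runs them for you; to point them at an already-running instance from a local checkout, a runner along these lines should work (the path is hypothetical, adjust it to your layout).

# Hypothetical local runner for the suite above.
import pytest

raise SystemExit(pytest.main(["-svv", "--disable-warnings", "gpt4all_api/app/tests/test_endpoints.py"]))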
gpt4all-api/gpt4all_api/requirements.txt (new file, 12 lines)
@@ -0,0 +1,12 @@
aiohttp>=3.6.2
aiofiles
pydantic>=1.4.0,<2.0.0
requests>=2.24.0
ujson>=2.0.2
fastapi>=0.95.0
Jinja2>=3.0
gpt4all>=1.0.0
pytest
openai
black
isort
gpt4all-api/makefile (new file, 46 lines)
@@ -0,0 +1,46 @@
ROOT_DIR:=$(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
APP_NAME:=gpt4all_api
PYTHON:=python3.8
SHELL := /bin/bash

all: dependencies

fresh: clean dependencies

testenv: clean_testenv test_build
	docker compose -f docker-compose.yaml up --build

testenv_gpu: clean_testenv test_build
	docker compose -f docker-compose.yaml -f docker-compose.gpu.yaml up --build

testenv_d: clean_testenv test_build
	docker compose up --build -d

test:
	docker compose exec $(APP_NAME) pytest -svv --disable-warnings -p no:cacheprovider /app/tests

test_build:
	DOCKER_BUILDKIT=1 docker build -t $(APP_NAME) --progress plain -f $(APP_NAME)/Dockerfile.buildkit .

clean_testenv:
	docker compose down -v

fresh_testenv: clean_testenv testenv

venv:
	if [ ! -d $(ROOT_DIR)/env ]; then $(PYTHON) -m venv $(ROOT_DIR)/env; fi

dependencies: venv
	source $(ROOT_DIR)/env/bin/activate; $(PYTHON) -m pip install -r $(ROOT_DIR)/$(APP_NAME)/requirements.txt

clean: clean_testenv
	# Remove existing environment
	rm -rf $(ROOT_DIR)/env;
	rm -rf $(ROOT_DIR)/$(APP_NAME)/*.pyc;


black:
	source $(ROOT_DIR)/env/bin/activate; black -l 120 -S --target-version py38 $(APP_NAME)

isort:
	source $(ROOT_DIR)/env/bin/activate; isort --ignore-whitespace --atomic -w 120 $(APP_NAME)
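As the targets above suggest, the intended flow is make testenv to build the image and bring up the docker-compose stack, then make test in a second shell to run the pytest suite inside the gpt4all_api container; testenv_gpu layers the GPU compose file on top, and clean_testenv (or clean) tears the environment back down.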
@ -1,27 +1,16 @@
|
|||||||
cmake_minimum_required(VERSION 3.23) # for FILE_SET
|
cmake_minimum_required(VERSION 3.16)
|
||||||
|
|
||||||
include(../common/common.cmake)
|
|
||||||
|
|
||||||
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
|
||||||
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
|
||||||
|
|
||||||
if (APPLE)
|
if(APPLE)
|
||||||
option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
|
option(BUILD_UNIVERSAL "Build a Universal binary on macOS" ON)
|
||||||
else()
|
if(BUILD_UNIVERSAL)
|
||||||
option(LLMODEL_KOMPUTE "llmodel: use Kompute" ON)
|
|
||||||
option(LLMODEL_VULKAN "llmodel: use Vulkan" OFF)
|
|
||||||
option(LLMODEL_CUDA "llmodel: use CUDA" ON)
|
|
||||||
option(LLMODEL_ROCM "llmodel: use ROCm" OFF)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
if (APPLE)
|
|
||||||
if (BUILD_UNIVERSAL)
|
|
||||||
# Build a Universal binary on macOS
|
# Build a Universal binary on macOS
|
||||||
# This requires that the found Qt library is compiled as Universal binaries.
|
# This requires that the found Qt library is compiled as Universal binaries.
|
||||||
set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "" FORCE)
|
set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "" FORCE)
|
||||||
else()
|
else()
|
||||||
# Build for the host architecture on macOS
|
# Build for the host architecture on macOS
|
||||||
if (NOT CMAKE_OSX_ARCHITECTURES)
|
if(NOT CMAKE_OSX_ARCHITECTURES)
|
||||||
set(CMAKE_OSX_ARCHITECTURES "${CMAKE_HOST_SYSTEM_PROCESSOR}" CACHE STRING "" FORCE)
|
set(CMAKE_OSX_ARCHITECTURES "${CMAKE_HOST_SYSTEM_PROCESSOR}" CACHE STRING "" FORCE)
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
@ -31,12 +20,12 @@ endif()
|
|||||||
include_directories("${CMAKE_CURRENT_BINARY_DIR}")
|
include_directories("${CMAKE_CURRENT_BINARY_DIR}")
|
||||||
|
|
||||||
set(LLMODEL_VERSION_MAJOR 0)
|
set(LLMODEL_VERSION_MAJOR 0)
|
||||||
set(LLMODEL_VERSION_MINOR 5)
|
set(LLMODEL_VERSION_MINOR 4)
|
||||||
set(LLMODEL_VERSION_PATCH 0)
|
set(LLMODEL_VERSION_PATCH 0)
|
||||||
set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}")
|
set(LLMODEL_VERSION "${LLMODEL_VERSION_MAJOR}.${LLMODEL_VERSION_MINOR}.${LLMODEL_VERSION_PATCH}")
|
||||||
project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)
|
project(llmodel VERSION ${LLMODEL_VERSION} LANGUAGES CXX C)
|
||||||
|
|
||||||
set(CMAKE_CXX_STANDARD 23)
|
set(CMAKE_CXX_STANDARD 20)
|
||||||
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
set(CMAKE_CXX_STANDARD_REQUIRED ON)
|
||||||
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_RUNTIME_OUTPUT_DIRECTORY})
|
||||||
set(BUILD_SHARED_LIBS ON)
|
set(BUILD_SHARED_LIBS ON)
|
||||||
@ -50,88 +39,40 @@ else()
|
|||||||
message(STATUS "Interprocedural optimization support detected")
|
message(STATUS "Interprocedural optimization support detected")
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
set(DIRECTORY deps/llama.cpp-mainline)
|
if(NOT APPLE)
|
||||||
|
set(LLAMA_KOMPUTE YES)
|
||||||
|
endif()
|
||||||
|
|
||||||
include(llama.cpp.cmake)
|
include(llama.cpp.cmake)
|
||||||
|
|
||||||
set(BUILD_VARIANTS)
|
set(BUILD_VARIANTS default avxonly)
|
||||||
if (APPLE)
|
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
|
||||||
list(APPEND BUILD_VARIANTS metal)
|
set(BUILD_VARIANTS ${BUILD_VARIANTS} metal)
|
||||||
endif()
|
endif()
|
||||||
if (LLMODEL_KOMPUTE)
|
|
||||||
list(APPEND BUILD_VARIANTS kompute kompute-avxonly)
|
|
||||||
else()
|
|
||||||
list(PREPEND BUILD_VARIANTS cpu cpu-avxonly)
|
|
||||||
endif()
|
|
||||||
if (LLMODEL_VULKAN)
|
|
||||||
list(APPEND BUILD_VARIANTS vulkan vulkan-avxonly)
|
|
||||||
endif()
|
|
||||||
if (LLMODEL_CUDA)
|
|
||||||
cmake_minimum_required(VERSION 3.18) # for CMAKE_CUDA_ARCHITECTURES
|
|
||||||
|
|
||||||
# Defaults must be set before enable_language(CUDA).
|
set(CMAKE_VERBOSE_MAKEFILE ON)
|
||||||
# Keep this in sync with the arch list in ggml/src/CMakeLists.txt (plus 5.0 for non-F16 branch).
|
|
||||||
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
|
|
||||||
# 52 == lowest CUDA 12 standard
|
|
||||||
# 60 == f16 CUDA intrinsics
|
|
||||||
# 61 == integer CUDA intrinsics
|
|
||||||
# 70 == compute capability at which unrolling a loop in mul_mat_q kernels is faster
|
|
||||||
if (GGML_CUDA_F16 OR GGML_CUDA_DMMV_F16)
|
|
||||||
set(CMAKE_CUDA_ARCHITECTURES "60;61;70;75") # needed for f16 CUDA intrinsics
|
|
||||||
else()
|
|
||||||
set(CMAKE_CUDA_ARCHITECTURES "50;52;61;70;75") # lowest CUDA 12 standard + lowest for integer intrinsics
|
|
||||||
#set(CMAKE_CUDA_ARCHITECTURES "OFF") # use this to compile much faster, but only F16 models work
|
|
||||||
endif()
|
|
||||||
endif()
|
|
||||||
message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
|
|
||||||
|
|
||||||
include(CheckLanguage)
|
|
||||||
check_language(CUDA)
|
|
||||||
if (NOT CMAKE_CUDA_COMPILER)
|
|
||||||
message(WARNING "CUDA Toolkit not found. To build without CUDA, use -DLLMODEL_CUDA=OFF.")
|
|
||||||
endif()
|
|
||||||
enable_language(CUDA)
|
|
||||||
list(APPEND BUILD_VARIANTS cuda cuda-avxonly)
|
|
||||||
endif()
|
|
||||||
if (LLMODEL_ROCM)
|
|
||||||
enable_language(HIP)
|
|
||||||
list(APPEND BUILD_VARIANTS rocm rocm-avxonly)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# Go through each build variant
|
# Go through each build variant
|
||||||
foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
|
foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
|
||||||
# Determine flags
|
# Determine flags
|
||||||
if (BUILD_VARIANT MATCHES avxonly)
|
if (BUILD_VARIANT STREQUAL avxonly)
|
||||||
set(GPT4ALL_ALLOW_NON_AVX OFF)
|
set(GPT4ALL_ALLOW_NON_AVX NO)
|
||||||
else()
|
else()
|
||||||
set(GPT4ALL_ALLOW_NON_AVX ON)
|
set(GPT4ALL_ALLOW_NON_AVX YES)
|
||||||
endif()
|
endif()
|
||||||
set(GGML_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
|
set(LLAMA_AVX2 ${GPT4ALL_ALLOW_NON_AVX})
|
||||||
set(GGML_F16C ${GPT4ALL_ALLOW_NON_AVX})
|
set(LLAMA_F16C ${GPT4ALL_ALLOW_NON_AVX})
|
||||||
set(GGML_FMA ${GPT4ALL_ALLOW_NON_AVX})
|
set(LLAMA_FMA ${GPT4ALL_ALLOW_NON_AVX})
|
||||||
|
|
||||||
set(GGML_METAL OFF)
|
if (BUILD_VARIANT STREQUAL metal)
|
||||||
set(GGML_KOMPUTE OFF)
|
set(LLAMA_METAL YES)
|
||||||
set(GGML_VULKAN OFF)
|
else()
|
||||||
set(GGML_CUDA OFF)
|
set(LLAMA_METAL NO)
|
||||||
set(GGML_ROCM OFF)
|
|
||||||
if (BUILD_VARIANT MATCHES metal)
|
|
||||||
set(GGML_METAL ON)
|
|
||||||
elseif (BUILD_VARIANT MATCHES kompute)
|
|
||||||
set(GGML_KOMPUTE ON)
|
|
||||||
elseif (BUILD_VARIANT MATCHES vulkan)
|
|
||||||
set(GGML_VULKAN ON)
|
|
||||||
elseif (BUILD_VARIANT MATCHES cuda)
|
|
||||||
set(GGML_CUDA ON)
|
|
||||||
elseif (BUILD_VARIANT MATCHES rocm)
|
|
||||||
set(GGML_HIPBLAS ON)
|
|
||||||
endif()
|
endif()
|
||||||
|
|
||||||
# Include GGML
|
# Include GGML
|
||||||
include_ggml(-mainline-${BUILD_VARIANT})
|
set(LLAMA_K_QUANTS YES)
|
||||||
|
include_ggml(llama.cpp-mainline -mainline-${BUILD_VARIANT} ON)
|
||||||
if (BUILD_VARIANT MATCHES metal)
|
|
||||||
set(GGML_METALLIB "${GGML_METALLIB}" PARENT_SCOPE)
|
|
||||||
endif()
|
|
||||||
|
|
||||||
# Function for preparing individual implementations
|
# Function for preparing individual implementations
|
||||||
function(prepare_target TARGET_NAME BASE_LIB)
|
function(prepare_target TARGET_NAME BASE_LIB)
|
||||||
@ -151,35 +92,49 @@ foreach(BUILD_VARIANT IN LISTS BUILD_VARIANTS)
|
|||||||
|
|
||||||
# Add each individual implementations
|
# Add each individual implementations
|
||||||
add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED
|
add_library(llamamodel-mainline-${BUILD_VARIANT} SHARED
|
||||||
src/llamamodel.cpp src/llmodel_shared.cpp)
|
llamamodel.cpp llmodel_shared.cpp)
|
||||||
gpt4all_add_warning_options(llamamodel-mainline-${BUILD_VARIANT})
|
|
||||||
target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
|
target_compile_definitions(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
|
||||||
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
|
LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
|
||||||
target_include_directories(llamamodel-mainline-${BUILD_VARIANT} PRIVATE
|
|
||||||
src include/gpt4all-backend
|
|
||||||
)
|
|
||||||
prepare_target(llamamodel-mainline llama-mainline)
|
prepare_target(llamamodel-mainline llama-mainline)
|
||||||
|
|
||||||
if (NOT PROJECT_IS_TOP_LEVEL AND BUILD_VARIANT STREQUAL cuda)
|
add_library(replit-mainline-${BUILD_VARIANT} SHARED
|
||||||
set(CUDAToolkit_BIN_DIR ${CUDAToolkit_BIN_DIR} PARENT_SCOPE)
|
replit.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
|
||||||
|
target_compile_definitions(replit-mainline-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
|
||||||
|
prepare_target(replit-mainline llama-mainline)
|
||||||
|
|
||||||
|
if (NOT LLAMA_METAL)
|
||||||
|
# FIXME: These need to be forward ported to latest ggml
|
||||||
|
# add_library(gptj-${BUILD_VARIANT} SHARED
|
||||||
|
# gptj.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
|
||||||
|
# prepare_target(gptj ggml-230511)
|
||||||
|
|
||||||
|
add_library(falcon-${BUILD_VARIANT} SHARED
|
||||||
|
falcon.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
|
||||||
|
target_compile_definitions(falcon-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
|
||||||
|
prepare_target(falcon llama-mainline)
|
||||||
|
# FIXME: These need to be forward ported to latest ggml
|
||||||
|
# add_library(mpt-${BUILD_VARIANT} SHARED
|
||||||
|
# mpt.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
|
||||||
|
# prepare_target(mpt ggml-230511)
|
||||||
|
|
||||||
|
add_library(bert-${BUILD_VARIANT} SHARED
|
||||||
|
bert.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
|
||||||
|
target_compile_definitions(bert-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
|
||||||
|
prepare_target(bert llama-mainline)
|
||||||
|
|
||||||
|
add_library(starcoder-${BUILD_VARIANT} SHARED
|
||||||
|
starcoder.cpp utils.h utils.cpp llmodel_shared.cpp llmodel_shared.h)
|
||||||
|
target_compile_definitions(starcoder-${BUILD_VARIANT} PRIVATE LLAMA_VERSIONS=>=3 LLAMA_DATE=999999)
|
||||||
|
prepare_target(starcoder llama-mainline)
|
||||||
endif()
|
endif()
|
||||||
endforeach()
|
endforeach()
|
||||||
|
|
||||||
add_library(llmodel
|
add_library(llmodel
|
||||||
src/dlhandle.cpp
|
llmodel.h llmodel.cpp llmodel_shared.cpp
|
||||||
src/llmodel.cpp
|
llmodel_c.h llmodel_c.cpp
|
||||||
src/llmodel_c.cpp
|
dlhandle.h
|
||||||
src/llmodel_shared.cpp
|
|
||||||
)
|
|
||||||
gpt4all_add_warning_options(llmodel)
|
|
||||||
target_sources(llmodel PUBLIC
|
|
||||||
FILE_SET public_headers TYPE HEADERS BASE_DIRS include
|
|
||||||
FILES include/gpt4all-backend/llmodel.h
|
|
||||||
include/gpt4all-backend/llmodel_c.h
|
|
||||||
include/gpt4all-backend/sysinfo.h
|
|
||||||
)
|
)
|
||||||
target_compile_definitions(llmodel PRIVATE LIB_FILE_EXT="${CMAKE_SHARED_LIBRARY_SUFFIX}")
|
target_compile_definitions(llmodel PRIVATE LIB_FILE_EXT="${CMAKE_SHARED_LIBRARY_SUFFIX}")
|
||||||
target_include_directories(llmodel PRIVATE src include/gpt4all-backend)
|
|
||||||
|
|
||||||
set_target_properties(llmodel PROPERTIES
|
set_target_properties(llmodel PROPERTIES
|
||||||
VERSION ${PROJECT_VERSION}
|
VERSION ${PROJECT_VERSION}
|
||||||
|
@ -27,7 +27,7 @@ Unfortunately, no for three reasons:
|
|||||||
|
|
||||||
# What is being done to make them more compatible?
|
# What is being done to make them more compatible?
|
||||||
|
|
||||||
A few things. Number one, we are maintaining compatibility with our current model zoo by way of the submodule pinning. However, we are also exploring how we can update to newer versions of llama.cpp without breaking our current models. This might involve an additional magic header check or it could possibly involve keeping the currently pinned submodule and also adding a new submodule with later changes and differentiating them with namespaces or some other manner. Investigations continue.
|
A few things. Number one, we are maintaining compatibility with our current model zoo by way of the submodule pinning. However, we are also exploring how we can update to newer versions of llama.cpp without breaking our current models. This might involve an additional magic header check or it could possibly involve keeping the currently pinned submodule and also adding a new submodule with later changes and differienting them with namespaces or some other manner. Investigations continue.
|
||||||
|
|
||||||
# What about GPU inference?
|
# What about GPU inference?
|
||||||
|
|
||||||
|
1053
gpt4all-backend/bert.cpp
Normal file
1053
gpt4all-backend/bert.cpp
Normal file
File diff suppressed because it is too large
Load Diff
44
gpt4all-backend/bert_impl.h
Normal file
44
gpt4all-backend/bert_impl.h
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
#ifndef BERT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
|
||||||
|
#error This file is NOT meant to be included outside of bert.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define BERT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
|
||||||
|
#endif
|
||||||
|
#ifndef BERT_H
|
||||||
|
#define BERT_H
|
||||||
|
|
||||||
|
#include <string>
|
||||||
|
#include <functional>
|
||||||
|
#include <vector>
|
||||||
|
#include <memory>
|
||||||
|
#include "llmodel.h"
|
||||||
|
|
||||||
|
struct BertPrivate;
|
||||||
|
class Bert : public LLModel {
|
||||||
|
public:
|
||||||
|
Bert();
|
||||||
|
~Bert();
|
||||||
|
|
||||||
|
bool supportsEmbedding() const override { return true; }
|
||||||
|
bool supportsCompletion() const override { return true; }
|
||||||
|
bool loadModel(const std::string &modelPath) override;
|
||||||
|
bool isModelLoaded() const override;
|
||||||
|
size_t requiredMem(const std::string &modelPath) override;
|
||||||
|
size_t stateSize() const override;
|
||||||
|
size_t saveState(uint8_t *dest) const override;
|
||||||
|
size_t restoreState(const uint8_t *src) override;
|
||||||
|
void setThreadCount(int32_t n_threads) override;
|
||||||
|
int32_t threadCount() const override;
|
||||||
|
|
||||||
|
std::vector<float> embedding(const std::string &text) override;
|
||||||
|
|
||||||
|
private:
|
||||||
|
std::unique_ptr<BertPrivate> d_ptr;
|
||||||
|
|
||||||
|
protected:
|
||||||
|
std::vector<Token> tokenize(PromptContext &, const std::string&) const override;
|
||||||
|
Token sampleToken(PromptContext &ctx) const override;
|
||||||
|
std::string tokenToString(Token) const override;
|
||||||
|
bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
|
||||||
|
int32_t contextLength() const override;
|
||||||
|
const std::vector<Token>& endTokens() const override;
|
||||||
|
};
|
||||||
|
|
||||||
|
#endif // BERT_H
|
@ -1 +0,0 @@
|
|||||||
Subproject commit 11f734c3b0334dbae4823b4a7467764e447fc6d6
|
|
104
gpt4all-backend/dlhandle.h
Normal file
104
gpt4all-backend/dlhandle.h
Normal file
@ -0,0 +1,104 @@
|
|||||||
|
#ifndef DLHANDLE_H
|
||||||
|
#define DLHANDLE_H
|
||||||
|
#ifndef _WIN32
|
||||||
|
#include <string>
|
||||||
|
#include <stdexcept>
|
||||||
|
#include <utility>
|
||||||
|
#include <dlfcn.h>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Dlhandle {
|
||||||
|
void *chandle;
|
||||||
|
|
||||||
|
public:
|
||||||
|
class Exception : public std::runtime_error {
|
||||||
|
public:
|
||||||
|
using std::runtime_error::runtime_error;
|
||||||
|
};
|
||||||
|
|
||||||
|
Dlhandle() : chandle(nullptr) {}
|
||||||
|
Dlhandle(const std::string& fpath, int flags = RTLD_LAZY | RTLD_LOCAL) {
|
||||||
|
chandle = dlopen(fpath.c_str(), flags);
|
||||||
|
if (!chandle) {
|
||||||
|
throw Exception("dlopen(\""+fpath+"\"): "+dlerror());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Dlhandle(const Dlhandle& o) = delete;
|
||||||
|
Dlhandle(Dlhandle&& o) : chandle(o.chandle) {
|
||||||
|
o.chandle = nullptr;
|
||||||
|
}
|
||||||
|
~Dlhandle() {
|
||||||
|
if (chandle) dlclose(chandle);
|
||||||
|
}
|
||||||
|
|
||||||
|
auto operator =(Dlhandle&& o) {
|
||||||
|
chandle = std::exchange(o.chandle, nullptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool is_valid() const {
|
||||||
|
return chandle != nullptr;
|
||||||
|
}
|
||||||
|
operator bool() const {
|
||||||
|
return is_valid();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
T* get(const std::string& fname) const {
|
||||||
|
auto fres = reinterpret_cast<T*>(dlsym(chandle, fname.c_str()));
|
||||||
|
return (dlerror()==NULL)?fres:nullptr;
|
||||||
|
}
|
||||||
|
auto get_fnc(const std::string& fname) const {
|
||||||
|
return get<void*(...)>(fname);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
#else
|
||||||
|
#include <string>
|
||||||
|
#include <exception>
|
||||||
|
#include <stdexcept>
|
||||||
|
#ifndef NOMINMAX
|
||||||
|
#define NOMINMAX
|
||||||
|
#endif
|
||||||
|
#include <windows.h>
|
||||||
|
#include <libloaderapi.h>
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class Dlhandle {
|
||||||
|
HMODULE chandle;
|
||||||
|
|
||||||
|
public:
|
||||||
|
class Exception : public std::runtime_error {
|
||||||
|
public:
|
||||||
|
using std::runtime_error::runtime_error;
|
||||||
|
};
|
||||||
|
|
||||||
|
Dlhandle() : chandle(nullptr) {}
|
||||||
|
Dlhandle(const std::string& fpath) {
|
||||||
|
chandle = LoadLibraryExA(fpath.c_str(), NULL, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR);
|
||||||
|
if (!chandle) {
|
||||||
|
throw Exception("dlopen(\""+fpath+"\"): Error");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Dlhandle(const Dlhandle& o) = delete;
|
||||||
|
Dlhandle(Dlhandle&& o) : chandle(o.chandle) {
|
||||||
|
o.chandle = nullptr;
|
||||||
|
}
|
||||||
|
~Dlhandle() {
|
||||||
|
if (chandle) FreeLibrary(chandle);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool is_valid() const {
|
||||||
|
return chandle != nullptr;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename T>
|
||||||
|
T* get(const std::string& fname) const {
|
||||||
|
return reinterpret_cast<T*>(GetProcAddress(chandle, fname.c_str()));
|
||||||
|
}
|
||||||
|
auto get_fnc(const std::string& fname) const {
|
||||||
|
return get<void*(...)>(fname);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
#endif // DLHANDLE_H
|
985
gpt4all-backend/falcon.cpp
Normal file
985
gpt4all-backend/falcon.cpp
Normal file
@ -0,0 +1,985 @@
|
|||||||
|
#include "ggml.h"
|
||||||
|
#define FALCON_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
|
||||||
|
#include "falcon_impl.h"
|
||||||
|
#include "llama.h"
|
||||||
|
#include "llama-util.h"
|
||||||
|
#include "utils.h"
|
||||||
|
#include "llmodel_shared.h"
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <cinttypes>
|
||||||
|
#include <iostream>
|
||||||
|
#include <sstream>
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
const char *modelType_ = "Falcon";
|
||||||
|
}
|
||||||
|
|
||||||
|
// commented out 40B support as it presently would require forking ggml/llama.cpp
|
||||||
|
// can re-add once mainline ggml supports it
|
||||||
|
|
||||||
|
#define FALCON_MAGIC 0x67676a74
|
||||||
|
|
||||||
|
// default hparams (Falcon 7B)
|
||||||
|
struct falcon_hparams {
|
||||||
|
int32_t n_vocab = 65024;
|
||||||
|
int32_t n_embd = 4544;
|
||||||
|
int32_t n_head = 71;
|
||||||
|
int32_t n_head_kv = 1;
|
||||||
|
int32_t n_layer = 32;
|
||||||
|
int32_t falcon_version = 7; // 7 for Falcon-7B, 40 for Falcon-40B
|
||||||
|
int32_t ftype = 1;
|
||||||
|
int32_t n_ctx = 2048;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct falcon_layer {
|
||||||
|
// normalization
|
||||||
|
struct ggml_tensor* input_layernorm;
|
||||||
|
struct ggml_tensor* input_layernorm_b;
|
||||||
|
//struct ggml_tensor* attention_norm; // Falcon-40B only
|
||||||
|
//struct ggml_tensor* attention_norm_b; // Falcon-40B only
|
||||||
|
|
||||||
|
// attention
|
||||||
|
struct ggml_tensor* query_key_value;
|
||||||
|
struct ggml_tensor* wo;
|
||||||
|
|
||||||
|
// ff
|
||||||
|
struct ggml_tensor* ffn_up;
|
||||||
|
struct ggml_tensor* ffn_down;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct falcon_model {
|
||||||
|
falcon_hparams hparams;
|
||||||
|
|
||||||
|
struct ggml_tensor* tok_embeddings;
|
||||||
|
struct ggml_tensor* output_norm;
|
||||||
|
struct ggml_tensor* output_norm_b;
|
||||||
|
struct ggml_tensor* lm_head;
|
||||||
|
|
||||||
|
std::vector<falcon_layer> layers;
|
||||||
|
|
||||||
|
// key + value memory
|
||||||
|
llm_kv_cache kv_self;
|
||||||
|
|
||||||
|
struct ggml_context* ctx;
|
||||||
|
std::map<std::string, struct ggml_tensor*> tensors;
|
||||||
|
|
||||||
|
llm_buffer eval_buf;
|
||||||
|
llm_buffer work_buf;
|
||||||
|
llm_buffer scr0_buf;
|
||||||
|
llm_buffer scr1_buf;
|
||||||
|
};
|
||||||
|
|
||||||
|
static bool kv_cache_init(
|
||||||
|
const struct falcon_hparams & hparams,
|
||||||
|
struct llm_kv_cache & cache,
|
||||||
|
ggml_type wtype,
|
||||||
|
int n_ctx) {
|
||||||
|
const int n_embd = hparams.n_embd;
|
||||||
|
const int dim_head = n_embd / hparams.n_head;
|
||||||
|
const int dim_kv = dim_head * hparams.n_head_kv;
|
||||||
|
const int n_layer = hparams.n_layer;
|
||||||
|
|
||||||
|
const int64_t n_mem = (int64_t)n_layer*n_ctx;
|
||||||
|
const int64_t n_elements = dim_kv * n_mem;
|
||||||
|
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2_MiB);
|
||||||
|
struct ggml_init_params params;
|
||||||
|
params.mem_size = cache.buf.size;
|
||||||
|
params.mem_buffer = cache.buf.addr;
|
||||||
|
params.no_alloc = false;
|
||||||
|
|
||||||
|
cache.ctx = ggml_init(params);
|
||||||
|
if (!cache.ctx) {
|
||||||
|
fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
||||||
|
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// load the model's weights from a file
|
||||||
|
bool falcon_model_load(const std::string & fname, falcon_model & model, gpt_vocab & vocab, size_t *mem_req) {
|
||||||
|
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||||
|
if (mem_req) {
|
||||||
|
*mem_req = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto fin = std::ifstream(fname, std::ios::binary);
|
||||||
|
if (!fin) {
|
||||||
|
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// verify magic
|
||||||
|
{
|
||||||
|
uint32_t magic;
|
||||||
|
fin.read((char *) &magic, sizeof(magic));
|
||||||
|
if (magic != FALCON_MAGIC) {
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t format_version;
|
||||||
|
fin.read((char *) &format_version, sizeof(format_version));
|
||||||
|
|
||||||
|
// load hparams
|
||||||
|
{
|
||||||
|
auto & hparams = model.hparams;
|
||||||
|
|
||||||
|
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||||
|
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
||||||
|
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
||||||
|
fin.read((char *) &hparams.n_head_kv, sizeof(hparams.n_head_kv));
|
||||||
|
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
||||||
|
fin.read((char *) &hparams.falcon_version, sizeof(hparams.falcon_version));
|
||||||
|
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
|
||||||
|
|
||||||
|
if (hparams.falcon_version != 7) { // && hparams.falcon_version != 40) {
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (bad Falcon version: %d)\n", __func__, fname.c_str(), hparams.falcon_version);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
|
||||||
|
|
||||||
|
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||||
|
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
||||||
|
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
||||||
|
printf("%s: n_head_kv = %d\n", __func__, hparams.n_head_kv);
|
||||||
|
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
||||||
|
printf("%s: ftype = %d\n", __func__, hparams.ftype);
|
||||||
|
printf("%s: qntvr = %d\n", __func__, qntvr);
|
||||||
|
|
||||||
|
hparams.ftype %= GGML_QNT_VERSION_FACTOR;
|
||||||
|
}
|
||||||
|
|
||||||
|
// load vocab
|
||||||
|
{
|
||||||
|
const int32_t n_vocab = model.hparams.n_vocab;
|
||||||
|
|
||||||
|
std::string word;
|
||||||
|
std::vector<char> buf(128);
|
||||||
|
|
||||||
|
for (int i = 0; i < n_vocab; i++) {
|
||||||
|
uint32_t len;
|
||||||
|
fin.read((char *) &len, sizeof(len));
|
||||||
|
|
||||||
|
buf.resize(len);
|
||||||
|
fin.read((char *) buf.data(), len);
|
||||||
|
word.assign(buf.data(), len);
|
||||||
|
|
||||||
|
uint32_t dummy;
|
||||||
|
fin.read((char *) &dummy, sizeof(dummy));
|
||||||
|
|
||||||
|
vocab.token_to_id[word] = i;
|
||||||
|
vocab.id_to_token[i] = word;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
||||||
|
// in order to save memory and also to speed up the computation
|
||||||
|
ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
|
||||||
|
if (wtype == GGML_TYPE_COUNT) {
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
|
||||||
|
__func__, fname.c_str(), model.hparams.ftype);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto & ctx = model.ctx;
|
||||||
|
|
||||||
|
size_t ctx_size = 0;
|
||||||
|
|
||||||
|
{
|
||||||
|
const auto& hparams = model.hparams;
|
||||||
|
|
||||||
|
const int n_embd = hparams.n_embd;
|
||||||
|
const int n_head = hparams.n_head;
|
||||||
|
const int n_head_kv = hparams.n_head_kv;
|
||||||
|
const int n_layer = hparams.n_layer;
|
||||||
|
const int n_ctx = hparams.n_ctx;
|
||||||
|
const int n_ff = 4 * model.hparams.n_embd;
|
||||||
|
const int n_vocab = hparams.n_vocab;
|
||||||
|
const int head_dim = hparams.n_embd / hparams.n_head;
|
||||||
|
|
||||||
|
ctx_size += ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // tok_embeddings
|
||||||
|
ctx_size += ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm
|
||||||
|
ctx_size += ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd; // output_norm_b
|
||||||
|
ctx_size += ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_vocab; // lm_head
|
||||||
|
|
||||||
|
// if (hparams.version == 40) { // Falcon-40B
|
||||||
|
// ctx_size += n_layer * ggml_sizeof_tensor_1d(GGML_TYPE_F32, n_embd); // attention_norm
|
||||||
|
// ctx_size += n_layer * ggml_sizeof_tensor_1d(GGML_TYPE_F32, n_embd); // attention_norm_b
|
||||||
|
// }
|
||||||
|
ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm
|
||||||
|
ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(GGML_TYPE_F32) * n_embd); // input_layernorm_b
|
||||||
|
ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * (n_head_kv * 2 + n_head) * head_dim); // query_key_value
|
||||||
|
ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_embd); // wo
|
||||||
|
ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_embd * n_ff); // ffn_up
|
||||||
|
ctx_size += n_layer * (ggml_tensor_overhead() + ggml_type_sizef(wtype) * n_ff * n_embd); // ffn_down
|
||||||
|
|
||||||
|
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (mem_req) {
|
||||||
|
const int n_embd = model.hparams.n_embd;
|
||||||
|
const int dim_head = n_embd / model.hparams.n_head;
|
||||||
|
const int dim_kv = dim_head * model.hparams.n_head_kv;
|
||||||
|
const int n_layer = model.hparams.n_layer;
|
||||||
|
|
||||||
|
const int64_t n_mem = (int64_t)n_layer*model.hparams.n_ctx;
|
||||||
|
const int64_t n_elements = dim_kv * n_mem;
|
||||||
|
size_t kv_cache_size = 2u*n_elements*ggml_type_size(wtype) + 2_MiB;
|
||||||
|
*mem_req = ctx_size + kv_cache_size;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// create the ggml context
|
||||||
|
{
|
||||||
|
struct ggml_init_params params = {
|
||||||
|
.mem_size = ctx_size,
|
||||||
|
.mem_buffer = NULL,
|
||||||
|
.no_alloc = false,
|
||||||
|
};
|
||||||
|
|
||||||
|
model.ctx = ggml_init(params);
|
||||||
|
if (!model.ctx) {
|
||||||
|
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// prepare memory for the weights
|
||||||
|
{
|
||||||
|
const auto& hparams = model.hparams;
|
||||||
|
|
||||||
|
const int n_embd = hparams.n_embd;
|
||||||
|
const int n_head = hparams.n_head;
|
||||||
|
const int n_head_kv = hparams.n_head_kv;
|
||||||
|
const int n_layer = hparams.n_layer;
|
||||||
|
const int n_ff = 4 * model.hparams.n_embd;
|
||||||
|
const int n_vocab = hparams.n_vocab;
|
||||||
|
const int head_dim = hparams.n_embd / hparams.n_head;
|
||||||
|
|
||||||
|
model.layers.resize(n_layer);
|
||||||
|
|
||||||
|
model.tok_embeddings = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||||
|
|
||||||
|
model.output_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
model.output_norm_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||||
|
|
||||||
|
// map by name
|
||||||
|
model.tensors["transformer.word_embeddings.weight"] =
|
||||||
|
model.tok_embeddings;
|
||||||
|
|
||||||
|
model.tensors["transformer.ln_f.weight"] = model.output_norm;
|
||||||
|
model.tensors["transformer.ln_f.bias"] = model.output_norm_b;
|
||||||
|
model.tensors["lm_head.weight"] = model.lm_head;
|
||||||
|
|
||||||
|
for (int i = 0; i < n_layer; ++i) {
|
||||||
|
auto& layer = model.layers[i];
|
||||||
|
|
||||||
|
layer.input_layernorm =
|
||||||
|
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
layer.input_layernorm_b =
|
||||||
|
ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
|
||||||
|
// if (hparams.version == 40) { // for Falcon-40B only
|
||||||
|
// layer.attention_norm =
|
||||||
|
// ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
// layer.attention_norm_b =
|
||||||
|
// ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
// }
|
||||||
|
|
||||||
|
// query_key_value shape for config.multi_query == True:
|
||||||
|
layer.query_key_value = ggml_new_tensor_2d(
|
||||||
|
ctx, wtype, n_embd, (n_head_kv * 2 + n_head) * head_dim);
|
||||||
|
layer.wo = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||||
|
|
||||||
|
layer.ffn_up = ggml_new_tensor_2d(ctx, wtype, n_embd, n_ff);
|
||||||
|
layer.ffn_down = ggml_new_tensor_2d(ctx, wtype, n_ff, n_embd);
|
||||||
|
|
||||||
|
// map by name
|
||||||
|
// if (hparams.version == 40) {
|
||||||
|
// // Falcon-40B:
|
||||||
|
// model.tensors["transformer.h." + std::to_string(i) +
|
||||||
|
// ".ln_mlp.weight"] = layer.input_layernorm;
|
||||||
|
// model.tensors["transformer.h." + std::to_string(i) +
|
||||||
|
// ".ln_mlp.bias"] = layer.input_layernorm_b;
|
||||||
|
// model.tensors["transformer.h." + std::to_string(i) +
|
||||||
|
// ".ln_attn.weight"] = layer.attention_norm;
|
||||||
|
// model.tensors["transformer.h." + std::to_string(i) +
|
||||||
|
// ".ln_attn.bias"] = layer.attention_norm_b;
|
||||||
|
// } else {
|
||||||
|
// Falcon-7B:
|
||||||
|
model.tensors["transformer.h." + std::to_string(i) +
|
||||||
|
".input_layernorm.weight"] = layer.input_layernorm;
|
||||||
|
model.tensors["transformer.h." + std::to_string(i) +
|
||||||
|
".input_layernorm.bias"] = layer.input_layernorm_b;
|
||||||
|
//}
|
||||||
|
|
||||||
|
model.tensors["transformer.h." + std::to_string(i) +
|
||||||
|
".self_attention.query_key_value.weight"] =
|
||||||
|
layer.query_key_value;
|
||||||
|
model.tensors["transformer.h." + std::to_string(i) +
|
||||||
|
".self_attention.dense.weight"] = layer.wo;
|
||||||
|
|
||||||
|
model.tensors["transformer.h." + std::to_string(i) +
|
||||||
|
".mlp.dense_h_to_4h.weight"] = layer.ffn_up;
|
||||||
|
model.tensors["transformer.h." + std::to_string(i) +
|
||||||
|
".mlp.dense_4h_to_h.weight"] = layer.ffn_down;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// key + value memory
|
||||||
|
{
|
||||||
|
const auto & hparams = model.hparams;
|
||||||
|
|
||||||
|
const int n_layer = hparams.n_layer;
|
||||||
|
const int n_ctx = hparams.n_ctx;
|
||||||
|
const int n_head_kv = hparams.n_head_kv;
|
||||||
|
const int head_dim = hparams.n_embd / hparams.n_head;
|
||||||
|
|
||||||
|
const int64_t n_mem = n_layer*n_ctx;
|
||||||
|
const int64_t n_elements = head_dim*n_mem;
|
||||||
|
|
||||||
|
if (!kv_cache_init(hparams, model.kv_self, GGML_TYPE_F32, model.hparams.n_ctx)) {
|
||||||
|
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
|
||||||
|
ggml_free(ctx);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
const size_t memory_size = ggml_nbytes(model.kv_self.k) + ggml_nbytes(model.kv_self.v);
|
||||||
|
|
||||||
|
printf("%s: memory_size = %8.2f MB, n_mem = %" PRId64 "\n", __func__, memory_size/1024.0/1024.0, n_mem);
|
||||||
|
}
|
||||||
|
|
||||||
|
// load weights
|
||||||
|
{
|
||||||
|
int n_tensors = 0;
|
||||||
|
size_t total_size = 0;
|
||||||
|
|
||||||
|
printf("%s: ", __func__);
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
int32_t n_dims;
|
||||||
|
int32_t length;
|
||||||
|
int32_t ttype;
|
||||||
|
|
||||||
|
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||||
|
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||||
|
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
||||||
|
|
||||||
|
if (fin.eof()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t nelements = 1;
|
||||||
|
int32_t ne[2] = { 1, 1 };
|
||||||
|
for (int i = 0; i < n_dims; ++i) {
|
||||||
|
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||||
|
nelements *= ne[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string name(length, 0);
|
||||||
|
fin.read(&name[0], length);
|
||||||
|
fin.seekg(-static_cast<ptrdiff_t>(fin.tellg()) & 31, std::ios_base::cur);
|
||||||
|
|
||||||
|
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||||
|
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto tensor = model.tensors[name.data()];
|
||||||
|
if (ggml_nelements(tensor) != nelements) {
|
||||||
|
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||||
|
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%5d, %5d], expected [%5d, %5d]\n",
|
||||||
|
__func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// for debugging
|
||||||
|
if (0) {
|
||||||
|
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t bpe = ggml_type_size(ggml_type(ttype));
|
||||||
|
|
||||||
|
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
||||||
|
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||||
|
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
||||||
|
|
||||||
|
total_size += ggml_nbytes(tensor);
|
||||||
|
if (++n_tensors % 8 == 0) {
|
||||||
|
printf(".");
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
printf(" done\n");
|
||||||
|
|
||||||
|
printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
|
||||||
|
}
|
||||||
|
|
||||||
|
fin.close();
|
||||||
|
|
||||||
|
model.eval_buf.resize(1280u * 1024 * 1024);
|
||||||
|
model.scr0_buf.resize(256u * 1024 * 1024);
|
||||||
|
model.scr1_buf.resize(256u * 1024 * 1024);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// evaluate the transformer
|
||||||
|
//
|
||||||
|
// - model: the model
|
||||||
|
// - n_threads: number of threads to use
|
||||||
|
// - n_past: the context size so far
|
||||||
|
// - embd_inp: the embeddings of the tokens in the context
|
||||||
|
// - embd_w: the predicted logits for the next token
|
||||||
|
//
|
||||||
|
bool falcon_eval(
|
||||||
|
falcon_model & model,
|
||||||
|
const int n_threads,
|
||||||
|
const int n_past,
|
||||||
|
const std::vector<gpt_vocab::id> & embd_inp,
|
||||||
|
std::vector<float> & embd_w,
|
||||||
|
size_t & mem_per_token) {
|
||||||
|
const int N = embd_inp.size();
|
||||||
|
|
||||||
|
const auto & hparams = model.hparams;
|
||||||
|
|
||||||
|
const int n_embd = hparams.n_embd;
|
||||||
|
const int n_layer = hparams.n_layer;
|
||||||
|
const int n_ctx = hparams.n_ctx;
|
||||||
|
const int n_head = hparams.n_head;
|
||||||
|
const int n_head_kv = hparams.n_head_kv;
|
||||||
|
const int n_vocab = hparams.n_vocab;
|
||||||
|
const int version = hparams.falcon_version;
|
||||||
|
const size_t head_dim = n_embd / n_head;
|
||||||
|
|
||||||
|
struct ggml_init_params eval_ctx_params = {
|
||||||
|
.mem_size = model.eval_buf.size,
|
||||||
|
.mem_buffer = model.eval_buf.addr,
|
||||||
|
.no_alloc = false,
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ggml_context * ctx0 = ggml_init(eval_ctx_params);
|
||||||
|
struct ggml_cgraph gf = {};
|
||||||
|
|
||||||
|
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||||
|
    memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));

    // wte
    struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);

    struct ggml_tensor* repeat_dummy = ggml_new_tensor_3d(ctx0, inpL->type, head_dim, N + n_past, n_head);

    ggml_type wtype = GGML_TYPE_F32;
    const int sizeof_wtype = ggml_type_sizef(wtype);

    for (int il = 0; il < n_layer; ++il) {
        struct ggml_tensor * cur;
        struct ggml_tensor * layernorm_output;

        ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });

        // self-attention
        {
            layernorm_output = ggml_norm(ctx0, inpL);

            layernorm_output = ggml_add(ctx0,
                    ggml_mul(ctx0,
                        ggml_repeat(ctx0, model.layers[il].input_layernorm, layernorm_output),
                        layernorm_output),
                    ggml_repeat(ctx0, model.layers[il].input_layernorm_b, layernorm_output));

            // if (version == 40) { // Falcon-40B only
            //     cur = ggml_norm(ctx0, inpL);
            //
            //     cur = ggml_add(ctx0,
            //             ggml_mul(ctx0,
            //                 ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
            //                 cur),
            //             ggml_repeat(ctx0, model.layers[il].attention_norm_b, cur));
            // }
            // else {
            cur = layernorm_output;
            // }

            // compute QKV
            cur = ggml_mul_mat(ctx0, model.layers[il].query_key_value, cur);

            // Note that the strides for Kcur, Vcur are set up so that the
            // resulting views are misaligned with the tensor's storage
            // (by applying the K/V offset we shift the tensor's original
            // view to stick out behind the viewed QKV tensor's allocated
            // memory, so to say). This is ok because no actual accesses
            // happen to that out-of-range memory, but it can require some
            // trickery when trying to accurately dump these views for
            // debugging.

            struct ggml_tensor * Qcur = ggml_view_3d(
                ctx0, cur, head_dim, n_head, N,
                head_dim * sizeof_wtype,
                head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype,
                0);

            struct ggml_tensor * Kcur = ggml_view_3d(
                ctx0, cur, head_dim, n_head_kv, N,
                head_dim * sizeof_wtype,
                head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype,
                head_dim * n_head * sizeof_wtype);

            struct ggml_tensor * Vcur = ggml_view_3d(
                ctx0, cur, head_dim, n_head_kv, N,
                head_dim * sizeof_wtype,
                head_dim * (n_head + 2 * n_head_kv) * sizeof_wtype,
                head_dim * (n_head + n_head_kv) * sizeof_wtype);

            // using mode = 2 for neox mode
            Qcur = ggml_rope_inplace(ctx0, Qcur, n_past, head_dim, 2, n_ctx);
            Kcur = ggml_rope_inplace(ctx0, Kcur, n_past, head_dim, 2, n_ctx);

            // store key and value to memory
            {
                struct ggml_tensor* k = ggml_view_1d(
                    ctx0, model.kv_self.k, N * n_head_kv * head_dim,
                    (ggml_element_size(model.kv_self.k) * n_head_kv * head_dim) *
                        (il * n_ctx + n_past));
                struct ggml_tensor* v = ggml_view_1d(
                    ctx0, model.kv_self.v, N * n_head_kv * head_dim,
                    (ggml_element_size(model.kv_self.v) * n_head_kv * head_dim) *
                        (il * n_ctx + n_past));

                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
                ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
            }

            struct ggml_tensor * K = ggml_permute(
                ctx0,
                ggml_view_3d(
                    ctx0,
                    model.kv_self.k,
                    head_dim, n_head_kv, n_past + N,
                    head_dim * sizeof_wtype,
                    head_dim * n_head_kv * sizeof_wtype,
                    il * n_ctx * ggml_element_size(model.kv_self.k) * n_head_kv * head_dim),
                0, 2, 1, 3);

            // K * Q

            // changed from repeat2 back to repeat, will not support 40B!
            K = ggml_cont(ctx0, ggml_repeat(ctx0, K, repeat_dummy));

            struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);

            // KQ_scaled = KQ / sqrt(n_embd/n_head)
            struct ggml_tensor * KQ_scaled =
                ggml_scale_inplace(ctx0,
                    KQ,
                    ggml_new_f32(ctx0, 1.0f/sqrt(float(head_dim)))
                );

            // KQ_masked = mask_past(KQ_scaled)
            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);

            // KQ = soft_max(KQ_masked)
            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);

            // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
            struct ggml_tensor* V = ggml_permute(
                ctx0,
                ggml_view_3d(
                    ctx0,
                    model.kv_self.v,
                    head_dim, n_head_kv, n_past + N,
                    head_dim * sizeof_wtype,
                    head_dim * n_head_kv * sizeof_wtype,
                    il * n_ctx * ggml_element_size(model.kv_self.v) * n_head_kv * head_dim),
                0, 2, 1, 3);

            // changed from repeat2 back to repeat, will not support 40B!
            V = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_repeat(ctx0, V, repeat_dummy)));

            // KQV = transpose(V) * KQ_soft_max
            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);

            // KQV_merged = KQV.permute(0, 2, 1, 3)
            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);

            // cur = KQV_merged.contiguous().view(n_embd, N)
            cur = ggml_cpy(ctx0,
                KQV_merged,
                ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

            // projection
            {
                cur = ggml_mul_mat(ctx0,
                    model.layers[il].wo,
                    cur);
            }
        }

        ggml_set_scratch(ctx0, {0, model.scr1_buf.size, model.scr1_buf.addr, });

        struct ggml_tensor* inpFF = layernorm_output;
        struct ggml_tensor* attn_out = ggml_cpy(
            ctx0, cur, ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));

        {
            cur = ggml_mul_mat(ctx0, model.layers[il].ffn_up, inpFF);
            cur = ggml_gelu(ctx0, cur);
            cur = ggml_mul_mat(ctx0, model.layers[il].ffn_down, cur);
        }

        cur = ggml_add(ctx0, cur, attn_out);
        cur = ggml_add(ctx0, cur, inpL);
        // input for next layer
        inpL = cur;
    }

    ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });

    // norm
    {
        inpL = ggml_norm(ctx0, inpL);

        // inpL = ln_f_g*inpL + ln_f_b
        inpL = ggml_add(ctx0,
            ggml_mul(ctx0,
                ggml_repeat(ctx0, model.output_norm, inpL),
                inpL),
            ggml_repeat(ctx0, model.output_norm_b, inpL));
    }

    ggml_set_scratch(ctx0, { 0, 0, nullptr, });

    // lm_head
    {
        inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);

        //inpL = ggml_add(ctx0,
        //        ggml_repeat(ctx0, model.lmh_b, inpL),
        //        inpL);
    }

    // logits -> probs
    //inpL = ggml_soft_max_inplace(ctx0, inpL);

    // run the computation
    ggml_build_forward_expand(&gf, inpL);
    ggml_graph_compute_g4a(model.work_buf, &gf, n_threads);

    //if (n_past%100 == 0) {
    //    ggml_graph_print   (&gf);
    //    ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
    //}

    //embd_w.resize(n_vocab*N);
    //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);

    // return result for just the last token
    embd_w.resize(n_vocab);
    memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);

    if (mem_per_token == 0) {
        mem_per_token = ggml_used_mem(ctx0)/N;
    }
    //printf("used_mem = %zu\n", ggml_used_mem(ctx0));

    ggml_free(ctx0);

    return true;
}

#define MAX_RNG_STATE 64*1024
size_t falcon_get_state_size(const falcon_model &model) {
    const size_t s_rng_size = sizeof(size_t);
    const size_t s_rng      = MAX_RNG_STATE;
    const size_t s_kv_size  = sizeof(size_t);
    const size_t s_kv_ntok  = sizeof(int);
    const size_t s_kv       = model.kv_self.buf.size;
    const size_t s_total = (
        + s_rng_size
        + s_rng
        + s_kv_size
        + s_kv_ntok
        + s_kv
    );
    return s_total;
}

size_t falcon_copy_state_data(const falcon_model &model, const std::mt19937 &rng, uint8_t *dest)
{
    uint8_t * out = dest;
    // copy rng
    {
        std::stringstream rng_ss;
        rng_ss << rng;

        const size_t rng_size = rng_ss.str().size();
        char rng_buf[MAX_RNG_STATE];

        memset(&rng_buf[0], 0, MAX_RNG_STATE);
        memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());

        memcpy(out, &rng_size,   sizeof(rng_size)); out += sizeof(rng_size);
        memcpy(out, &rng_buf[0], MAX_RNG_STATE);    out += MAX_RNG_STATE;
    }

    // copy kv cache
    {
        const size_t kv_size = model.kv_self.buf.size;
        const int    kv_ntok = model.kv_self.n;

        memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
        memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);

        if (kv_size) {
            memcpy(out, model.kv_self.buf.addr, kv_size); out += kv_size;
        }
    }

    const size_t written = out - dest;
    assert(written == falcon_get_state_size(model));
    fflush(stdout);
    return written;
}

size_t falcon_set_state_data(falcon_model *model, std::mt19937 *rng, const uint8_t *src)
{
    const uint8_t * in = src;

    // set rng
    {
        size_t rng_size;
        char   rng_buf[MAX_RNG_STATE];

        memcpy(&rng_size,   in, sizeof(rng_size)); in += sizeof(rng_size);
        memcpy(&rng_buf[0], in, MAX_RNG_STATE);    in += MAX_RNG_STATE;

        std::stringstream rng_ss;
        rng_ss.str(std::string(&rng_buf[0], rng_size));
        rng_ss >> *rng;

        assert(rng_ss.fail() == false);
    }

    // set kv cache
    {
        size_t kv_size;
        int    kv_ntok;

        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);

        if (kv_size) {
            assert(model->kv_self.buf.size == kv_size);

            void * k_data = model->kv_self.k->data; // remember data pointers
            void * v_data = model->kv_self.v->data; // because their value is stored in buf and overwritten by memcpy

            memcpy(model->kv_self.buf.addr, in, kv_size); in += kv_size;

            model->kv_self.k->data = k_data; // restore correct data pointers
            model->kv_self.v->data = v_data;
        }

        model->kv_self.n = kv_ntok;
    }

    const size_t nread = in - src;
    assert(nread == falcon_get_state_size(*model));
    fflush(stdout);
    return nread;
}

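Editor's note: the three functions above serialize the RNG state and the KV cache into one flat buffer. The following is only a minimal illustrative sketch (not part of the diff) of how a caller could round-trip that buffer; it assumes a loaded falcon_model and its std::mt19937 exist elsewhere, and the forward declarations simply mirror the signatures defined above.

// round_trip_sketch.cpp -- hypothetical caller-side snapshot/restore helpers
#include <cassert>
#include <cstdint>
#include <random>
#include <vector>

// Forward declarations matching the definitions above (normally private to falcon.cpp).
struct falcon_model;
size_t falcon_get_state_size(const falcon_model &model);
size_t falcon_copy_state_data(const falcon_model &model, const std::mt19937 &rng, uint8_t *dest);
size_t falcon_set_state_data(falcon_model *model, std::mt19937 *rng, const uint8_t *src);

// Snapshot the KV cache + RNG of an already-loaded model into a byte buffer.
std::vector<uint8_t> snapshot(const falcon_model &model, const std::mt19937 &rng) {
    std::vector<uint8_t> buf(falcon_get_state_size(model));
    const size_t written = falcon_copy_state_data(model, rng, buf.data());
    assert(written == buf.size()); // the writer must fill exactly the reported size
    return buf;
}

// Restore a previously taken snapshot, e.g. to rewind a conversation.
void restore(falcon_model &model, std::mt19937 &rng, const std::vector<uint8_t> &buf) {
    const size_t read = falcon_set_state_data(&model, &rng, buf.data());
    assert(read == buf.size());
}
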
struct FalconPrivate {
    const std::string modelPath;
    bool modelLoaded;
    gpt_vocab vocab;
    falcon_model *model = nullptr;
    int64_t n_threads = 0;
    size_t mem_per_token = 0;
    std::mt19937 rng;
};

Falcon::Falcon() : d_ptr(new FalconPrivate) {
    d_ptr->model = new falcon_model;
    d_ptr->model->ctx = nullptr;
    d_ptr->modelLoaded = false;
}

Falcon::~Falcon() {
    if(d_ptr->model->ctx) {
        ggml_free(d_ptr->model->ctx);
        d_ptr->model->ctx = nullptr;
    }
    delete d_ptr->model;
}

bool Falcon::loadModel(const std::string &modelPath)
{
    std::mt19937 rng(time(NULL));
    d_ptr->rng = rng;

    // load the model
    if (!falcon_model_load(modelPath, *d_ptr->model, d_ptr->vocab, nullptr)) {
        std::cerr << "FALCON ERROR: failed to load model from " << modelPath;
        return false;
    }

    d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
    d_ptr->modelLoaded = true;
    fflush(stdout);
    return true;
}

bool Falcon::isModelLoaded() const
{
    return d_ptr->modelLoaded;
}

size_t Falcon::requiredMem(const std::string &modelPath)
{
    falcon_model dummy_model;
    gpt_vocab dummy_vocab;
    size_t mem_req;
    auto fin = std::ifstream(modelPath, std::ios::binary);
    falcon_model_load(modelPath, dummy_model, dummy_vocab, &mem_req);
    return mem_req;
}

size_t Falcon::stateSize() const
{
    return falcon_get_state_size(*d_ptr->model);
}

size_t Falcon::saveState(uint8_t *dest) const
{
    return falcon_copy_state_data(*d_ptr->model, d_ptr->rng, dest);
}

size_t Falcon::restoreState(const uint8_t *src)
{
    return falcon_set_state_data(d_ptr->model, &d_ptr->rng, src);
}

void Falcon::setThreadCount(int32_t n_threads)
{
    d_ptr->n_threads = n_threads;
}

int32_t Falcon::threadCount() const
{
    return d_ptr->n_threads;
}

std::vector<LLModel::Token> Falcon::tokenize(PromptContext &, const std::string &str) const
{
    return ::gpt_tokenize(d_ptr->vocab, str);
}

LLModel::Token Falcon::sampleToken(PromptContext &promptCtx) const
{
    const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
    return gpt_sample_top_k_top_p(d_ptr->model->hparams.n_vocab,
        promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
        n_prev_toks,
        promptCtx.logits,
        promptCtx.top_k, promptCtx.top_p, promptCtx.temp,
        promptCtx.repeat_penalty,
        d_ptr->rng);
}

std::string Falcon::tokenToString(Token id) const
{
    return d_ptr->vocab.id_to_token[id];
}

bool Falcon::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
{
    // determine the required inference memory per token:
    static bool initialized = false;
    if (!initialized) {
        falcon_eval(*d_ptr->model, d_ptr->n_threads, 0, { 0, 1, 2, 3 }, ctx.logits,
            d_ptr->mem_per_token);
        initialized = true;
    }

    return falcon_eval(*d_ptr->model, d_ptr->n_threads, ctx.n_past, tokens, ctx.logits, d_ptr->mem_per_token);
}

int32_t Falcon::contextLength() const
{
    return d_ptr->model->hparams.n_ctx;
}

const std::vector<LLModel::Token> &Falcon::endTokens() const
{
    static const std::vector<LLModel::Token> out = { 11 };
    return out;
}

#if defined(_WIN32)
#define DLL_EXPORT __declspec(dllexport)
#else
#define DLL_EXPORT __attribute__ ((visibility ("default")))
#endif

extern "C" {
DLL_EXPORT bool is_g4a_backend_model_implementation() {
    return true;
}

DLL_EXPORT const char *get_model_type() {
    return modelType_;
}

DLL_EXPORT const char *get_build_variant() {
    return GGML_BUILD_VARIANT;
}

DLL_EXPORT bool magic_match(std::istream& f) {
    uint32_t magic = 0;
    f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
    uint32_t version = 0;
    f.read(reinterpret_cast<char*>(&version), sizeof(version));
    if (magic != FALCON_MAGIC) {
        return false;
    }
    falcon_hparams hparams;
    f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));
    // we're matching the file format of existing pre-converted models,
    // compatible with the ctransformers llama.cpp-based format, which also
    // unfortunately shares its magic number with what llama uses, so we now
    // differentiate by n_vocab
    // give some wiggle room over the max to allow for finetunes that expand the
    // vocabulary
    if (!(hparams.n_vocab >= 65024 && hparams.n_vocab <= 65100)) {
        return false;
    }
    if (hparams.falcon_version != 7) {
        return false;
    }
    return true;
}

DLL_EXPORT LLModel *construct() {
    return new Falcon;
}
}
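Editor's note: the extern "C" block above is how the host discovers and instantiates this backend. As an illustration only, here is a hedged, hypothetical host-side sketch of exercising those entry points when the library is linked directly; in the real project the symbols are resolved from the built shared library, and the forward declarations below simply mirror the signatures above.

// host_sketch.cpp -- hypothetical direct use of the exported entry points
#include <fstream>
#include <iostream>

class LLModel; // opaque here; the full interface is declared in llmodel.h

extern "C" {
bool is_g4a_backend_model_implementation();
const char *get_model_type();
const char *get_build_variant();
bool magic_match(std::istream &f);
LLModel *construct();
}

int main(int argc, char **argv) {
    if (argc < 2) return 1;
    std::ifstream f(argv[1], std::ios::binary);
    if (!f || !magic_match(f)) {           // header check: magic, n_vocab range, falcon_version
        std::cerr << "not a supported ggml Falcon-7B file\n";
        return 1;
    }
    std::cout << get_model_type() << " (" << get_build_variant() << ")\n";
    LLModel *model = construct();          // caller owns the instance
    // ... model->loadModel(argv[1]), prompting, etc. would follow via the LLModel interface
    (void) model;
    return 0;
}
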
42  gpt4all-backend/falcon_impl.h  Normal file
@ -0,0 +1,42 @@
#ifndef FALCON_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#error This file is NOT meant to be included outside of falcon.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define FALCON_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#endif
#ifndef FALCON_H
#define FALCON_H

#include <string>
#include <functional>
#include <vector>
#include <memory>
#include "llmodel.h"

struct FalconPrivate;
class Falcon : public LLModel {
public:
    Falcon();
    ~Falcon();

    bool supportsEmbedding() const override { return false; }
    bool supportsCompletion() const override { return true; }
    bool loadModel(const std::string &modelPath) override;
    bool isModelLoaded() const override;
    size_t requiredMem(const std::string &modelPath) override;
    size_t stateSize() const override;
    size_t saveState(uint8_t *dest) const override;
    size_t restoreState(const uint8_t *src) override;
    void setThreadCount(int32_t n_threads) override;
    int32_t threadCount() const override;

private:
    std::unique_ptr<FalconPrivate> d_ptr;

protected:
    std::vector<Token> tokenize(PromptContext &, const std::string&) const override;
    Token sampleToken(PromptContext &ctx) const override;
    std::string tokenToString(Token) const override;
    bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
    int32_t contextLength() const override;
    const std::vector<Token>& endTokens() const override;
};

#endif // Falcon_H
975  gpt4all-backend/gptj.cpp  Normal file
@ -0,0 +1,975 @@
#define GPTJ_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#include "gptj_impl.h"

#include "utils.h"
#include "llmodel_shared.h"

#include <cassert>
#include <cinttypes>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>
#include <iostream>
#if defined(_WIN32) && defined(_MSC_VER)
    #define WIN32_LEAN_AND_MEAN
    #ifndef NOMINMAX
        #define NOMINMAX
    #endif
    #include <windows.h>
    #include <io.h>
    #include <stdio.h>
#else
    #include <unistd.h>
#endif
#include <sstream>
#include <unordered_set>
#include <ggml.h>


namespace {
const char *modelType_ = "GPT-J";
}

// default hparams (GPT-J 6B)
struct gptj_hparams {
    int32_t n_vocab = 50400;
    int32_t n_ctx   = 2048;
    int32_t n_embd  = 4096;
    int32_t n_head  = 16;
    int32_t n_layer = 28;
    int32_t n_rot   = 64;
    int32_t f16     = 1;
};
|
||||||
|
|
||||||
|
struct gptj_layer {
|
||||||
|
// normalization
|
||||||
|
struct ggml_tensor * ln_1_g;
|
||||||
|
struct ggml_tensor * ln_1_b;
|
||||||
|
|
||||||
|
// attention
|
||||||
|
struct ggml_tensor * c_attn_q_proj_w;
|
||||||
|
struct ggml_tensor * c_attn_k_proj_w;
|
||||||
|
struct ggml_tensor * c_attn_v_proj_w;
|
||||||
|
|
||||||
|
struct ggml_tensor * c_attn_proj_w;
|
||||||
|
|
||||||
|
// ff
|
||||||
|
struct ggml_tensor * c_mlp_fc_w;
|
||||||
|
struct ggml_tensor * c_mlp_fc_b;
|
||||||
|
|
||||||
|
struct ggml_tensor * c_mlp_proj_w;
|
||||||
|
struct ggml_tensor * c_mlp_proj_b;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct gptj_model {
|
||||||
|
gptj_hparams hparams;
|
||||||
|
|
||||||
|
// normalization
|
||||||
|
struct ggml_tensor * ln_f_g;
|
||||||
|
struct ggml_tensor * ln_f_b;
|
||||||
|
|
||||||
|
struct ggml_tensor * wte; // position embedding
|
||||||
|
|
||||||
|
struct ggml_tensor * lmh_g; // language model head
|
||||||
|
struct ggml_tensor * lmh_b; // language model bias
|
||||||
|
|
||||||
|
std::vector<gptj_layer> layers;
|
||||||
|
|
||||||
|
// key + value memory
|
||||||
|
struct llm_kv_cache kv_self;
|
||||||
|
|
||||||
|
//
|
||||||
|
struct ggml_context * ctx;
|
||||||
|
std::map<std::string, struct ggml_tensor *> tensors;
|
||||||
|
|
||||||
|
llm_buffer eval_buf;
|
||||||
|
llm_buffer scr0_buf;
|
||||||
|
llm_buffer scr1_buf;
|
||||||
|
|
||||||
|
~gptj_model() {
|
||||||
|
if (ctx) {
|
||||||
|
ggml_free(ctx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static bool kv_cache_init(
|
||||||
|
const struct gptj_hparams & hparams,
|
||||||
|
struct llm_kv_cache & cache,
|
||||||
|
ggml_type wtype,
|
||||||
|
int n_ctx) {
|
||||||
|
const int n_embd = hparams.n_embd;
|
||||||
|
const int n_layer = hparams.n_layer;
|
||||||
|
|
||||||
|
const int64_t n_mem = (int64_t)n_layer*n_ctx;
|
||||||
|
const int64_t n_elements = n_embd*n_mem;
|
||||||
|
|
||||||
|
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2_MiB);
|
||||||
|
|
||||||
|
struct ggml_init_params params;
|
||||||
|
params.mem_size = cache.buf.size;
|
||||||
|
params.mem_buffer = cache.buf.addr;
|
||||||
|
params.no_alloc = false;
|
||||||
|
|
||||||
|
cache.ctx = ggml_init(params);
|
||||||
|
|
||||||
|
if (!cache.ctx) {
|
||||||
|
fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
||||||
|
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// load the model's weights from a stream
|
||||||
|
bool gptj_model_load(const std::string &fname, std::istream &fin, gptj_model & model, gpt_vocab & vocab, size_t * mem_req = nullptr) {
|
||||||
|
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||||
|
if(mem_req != nullptr) {
|
||||||
|
*mem_req = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// verify magic
|
||||||
|
{
|
||||||
|
uint32_t magic;
|
||||||
|
fin.read((char *) &magic, sizeof(magic));
|
||||||
|
if (magic != 0x67676d6c) {
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// load hparams
|
||||||
|
{
|
||||||
|
auto & hparams = model.hparams;
|
||||||
|
|
||||||
|
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||||
|
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
||||||
|
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
||||||
|
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
||||||
|
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
||||||
|
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
|
||||||
|
fin.read((char *) &hparams.f16, sizeof(hparams.f16));
|
||||||
|
|
||||||
|
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||||
|
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
||||||
|
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
||||||
|
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
||||||
|
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
||||||
|
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
|
||||||
|
printf("%s: f16 = %d\n", __func__, hparams.f16);
|
||||||
|
}
|
||||||
|
|
||||||
|
// load vocab
|
||||||
|
{
|
||||||
|
int32_t n_vocab = 0;
|
||||||
|
fin.read((char *) &n_vocab, sizeof(n_vocab));
|
||||||
|
|
||||||
|
if (n_vocab != model.hparams.n_vocab) {
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
||||||
|
__func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string word;
|
||||||
|
for (int i = 0; i < n_vocab; i++) {
|
||||||
|
uint32_t len;
|
||||||
|
fin.read((char *) &len, sizeof(len));
|
||||||
|
|
||||||
|
word.resize(len);
|
||||||
|
fin.read((char *) word.data(), len);
|
||||||
|
|
||||||
|
vocab.token_to_id[word] = i;
|
||||||
|
vocab.id_to_token[i] = word;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
||||||
|
// in order to save memory and also to speed up the computation
|
||||||
|
ggml_type wtype = GGML_TYPE_COUNT;
|
||||||
|
switch (model.hparams.f16) {
|
||||||
|
case 0: wtype = GGML_TYPE_F32; break;
|
||||||
|
case 1: wtype = GGML_TYPE_F16; break;
|
||||||
|
case 2: wtype = GGML_TYPE_Q4_0; break;
|
||||||
|
case 3: wtype = GGML_TYPE_Q4_1; break;
|
||||||
|
case 5: wtype = GGML_TYPE_Q4_2; break;
|
||||||
|
default:
|
||||||
|
{
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
|
||||||
|
__func__, fname.c_str(), model.hparams.f16);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto & ctx = model.ctx;
|
||||||
|
|
||||||
|
size_t ctx_size = 0;
|
||||||
|
|
||||||
|
{
|
||||||
|
const auto & hparams = model.hparams;
|
||||||
|
|
||||||
|
const int n_embd = hparams.n_embd;
|
||||||
|
const int n_layer = hparams.n_layer;
|
||||||
|
const int n_ctx = hparams.n_ctx;
|
||||||
|
const int n_vocab = hparams.n_vocab;
|
||||||
|
|
||||||
|
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
|
||||||
|
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
|
||||||
|
|
||||||
|
ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // wte
|
||||||
|
|
||||||
|
ctx_size += n_embd*n_vocab*ggml_type_sizef(wtype); // lmh_g
|
||||||
|
ctx_size += n_vocab*ggml_type_sizef(GGML_TYPE_F32); // lmh_b
|
||||||
|
|
||||||
|
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
|
||||||
|
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
|
||||||
|
|
||||||
|
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_q_proj_w
|
||||||
|
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_k_proj_w
|
||||||
|
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_v_proj_w
|
||||||
|
|
||||||
|
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
|
||||||
|
|
||||||
|
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
|
||||||
|
ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
|
||||||
|
|
||||||
|
ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
|
||||||
|
ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
|
||||||
|
|
||||||
|
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
|
||||||
|
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
|
||||||
|
|
||||||
|
ctx_size += (5 + 10*n_layer)*256; // object overhead
|
||||||
|
|
||||||
|
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (mem_req != nullptr) {
|
||||||
|
*mem_req += ctx_size;
|
||||||
|
const int n_embd = model.hparams.n_embd;
|
||||||
|
const int n_layer = model.hparams.n_layer;
|
||||||
|
|
||||||
|
const int64_t n_mem = (int64_t)n_layer*model.hparams.n_ctx;
|
||||||
|
const int64_t n_elements = n_embd*n_mem;
|
||||||
|
|
||||||
|
*mem_req += (2u*n_elements*ggml_type_size(wtype) + 2_MiB);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// create the ggml context
|
||||||
|
{
|
||||||
|
struct ggml_init_params params = {
|
||||||
|
.mem_size = ctx_size,
|
||||||
|
.mem_buffer = NULL,
|
||||||
|
.no_alloc = false
|
||||||
|
};
|
||||||
|
|
||||||
|
model.ctx = ggml_init(params);
|
||||||
|
if (!model.ctx) {
|
||||||
|
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// prepare memory for the weights
|
||||||
|
{
|
||||||
|
const auto & hparams = model.hparams;
|
||||||
|
|
||||||
|
const int n_embd = hparams.n_embd;
|
||||||
|
const int n_layer = hparams.n_layer;
|
||||||
|
const int n_vocab = hparams.n_vocab;
|
||||||
|
|
||||||
|
model.layers.resize(n_layer);
|
||||||
|
|
||||||
|
model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||||
|
|
||||||
|
model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
|
||||||
|
model.lmh_g = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
|
||||||
|
model.lmh_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);
|
||||||
|
|
||||||
|
// map by name
|
||||||
|
model.tensors["transformer.wte.weight"] = model.wte;
|
||||||
|
|
||||||
|
model.tensors["transformer.ln_f.weight"] = model.ln_f_g;
|
||||||
|
model.tensors["transformer.ln_f.bias"] = model.ln_f_b;
|
||||||
|
|
||||||
|
model.tensors["lm_head.weight"] = model.lmh_g;
|
||||||
|
model.tensors["lm_head.bias"] = model.lmh_b;
|
||||||
|
|
||||||
|
for (int i = 0; i < n_layer; ++i) {
|
||||||
|
auto & layer = model.layers[i];
|
||||||
|
|
||||||
|
layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
|
||||||
|
layer.c_attn_q_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||||
|
layer.c_attn_k_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||||
|
layer.c_attn_v_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||||
|
|
||||||
|
layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||||
|
|
||||||
|
layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
|
||||||
|
layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
|
||||||
|
|
||||||
|
layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
|
||||||
|
layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
|
||||||
|
// map by name
|
||||||
|
model.tensors["transformer.h." + std::to_string(i) + ".ln_1.weight"] = layer.ln_1_g;
|
||||||
|
model.tensors["transformer.h." + std::to_string(i) + ".ln_1.bias"] = layer.ln_1_b;
|
||||||
|
|
||||||
|
model.tensors["transformer.h." + std::to_string(i) + ".attn.q_proj.weight"] = layer.c_attn_q_proj_w;
|
||||||
|
model.tensors["transformer.h." + std::to_string(i) + ".attn.k_proj.weight"] = layer.c_attn_k_proj_w;
|
||||||
|
model.tensors["transformer.h." + std::to_string(i) + ".attn.v_proj.weight"] = layer.c_attn_v_proj_w;
|
||||||
|
|
||||||
|
model.tensors["transformer.h." + std::to_string(i) + ".attn.out_proj.weight"] = layer.c_attn_proj_w;
|
||||||
|
|
||||||
|
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.weight"] = layer.c_mlp_fc_w;
|
||||||
|
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_in.bias"] = layer.c_mlp_fc_b;
|
||||||
|
|
||||||
|
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.weight"] = layer.c_mlp_proj_w;
|
||||||
|
model.tensors["transformer.h." + std::to_string(i) + ".mlp.fc_out.bias"] = layer.c_mlp_proj_b;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// key + value memory
|
||||||
|
{
|
||||||
|
const auto & hparams = model.hparams;
|
||||||
|
if (!kv_cache_init(hparams, model.kv_self, GGML_TYPE_F16, model.hparams.n_ctx)) {
|
||||||
|
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
|
||||||
|
ggml_free(ctx);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t memory_size = ggml_nbytes(model.kv_self.k) + ggml_nbytes(model.kv_self.v);
|
||||||
|
printf("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// load weights
|
||||||
|
{
|
||||||
|
int n_tensors = 0;
|
||||||
|
size_t total_size = 0;
|
||||||
|
|
||||||
|
printf("%s: ", __func__);
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
int32_t n_dims;
|
||||||
|
int32_t length;
|
||||||
|
int32_t ftype;
|
||||||
|
|
||||||
|
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||||
|
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||||
|
fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
|
||||||
|
|
||||||
|
if (fin.eof()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t nelements = 1;
|
||||||
|
int32_t ne[2] = { 1, 1 };
|
||||||
|
for (int i = 0; i < n_dims; ++i) {
|
||||||
|
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||||
|
nelements *= ne[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string name(length, 0);
|
||||||
|
fin.read(&name[0], length);
|
||||||
|
|
||||||
|
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||||
|
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto tensor = model.tensors[name.data()];
|
||||||
|
if (ggml_nelements(tensor) != nelements) {
|
||||||
|
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||||
|
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
|
||||||
|
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (0) {
|
||||||
|
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
|
||||||
|
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t bpe = 0;
|
||||||
|
|
||||||
|
switch (ftype) {
|
||||||
|
case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
|
||||||
|
case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
|
||||||
|
case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
|
||||||
|
case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
|
||||||
|
default:
|
||||||
|
{
|
||||||
|
fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
||||||
|
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||||
|
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
||||||
|
|
||||||
|
//printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||||
|
total_size += ggml_nbytes(tensor);
|
||||||
|
if (++n_tensors % 8 == 0) {
|
||||||
|
printf(".");
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
printf(" done\n");
|
||||||
|
|
||||||
|
printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
|
||||||
|
}
|
||||||
|
|
||||||
|
model.scr0_buf.resize(256u * 1024 * 1024);
|
||||||
|
model.scr1_buf.resize(256u * 1024 * 1024);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// load the model's weights from a file path
|
||||||
|
bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab) {
|
||||||
|
|
||||||
|
auto fin = std::ifstream(fname, std::ios::binary);
|
||||||
|
if (!fin) {
|
||||||
|
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool loaded = gptj_model_load(fname, fin, model, vocab);
|
||||||
|
fin.close();
|
||||||
|
return loaded;
|
||||||
|
}
|
||||||
|
|
||||||
|
// evaluate the transformer
|
||||||
|
//
|
||||||
|
// - model: the model
|
||||||
|
// - n_threads: number of threads to use
|
||||||
|
// - n_past: the context size so far
|
||||||
|
// - embd_inp: the embeddings of the tokens in the context
|
||||||
|
// - embd_w: the predicted logits for the next token
|
||||||
|
//
|
||||||
|
// The GPT-J model requires about 16MB of memory per input token.
|
||||||
|
//
|
||||||
|
bool gptj_eval(
|
||||||
|
gptj_model & model,
|
||||||
|
const int n_threads,
|
||||||
|
const int n_past,
|
||||||
|
const std::vector<gpt_vocab::id> & embd_inp,
|
||||||
|
std::vector<float> & embd_w,
|
||||||
|
size_t & mem_per_token) {
|
||||||
|
const int N = embd_inp.size();
|
||||||
|
|
||||||
|
const auto & hparams = model.hparams;
|
||||||
|
|
||||||
|
const int n_embd = hparams.n_embd;
|
||||||
|
const int n_layer = hparams.n_layer;
|
||||||
|
const int n_ctx = hparams.n_ctx;
|
||||||
|
const int n_head = hparams.n_head;
|
||||||
|
const int n_vocab = hparams.n_vocab;
|
||||||
|
const int n_rot = hparams.n_rot;
|
||||||
|
|
||||||
|
const size_t init_buf_size = 1024_MiB;
|
||||||
|
if (!model.eval_buf.addr || model.eval_buf.size < init_buf_size)
|
||||||
|
model.eval_buf.resize(init_buf_size);
|
||||||
|
|
||||||
|
if (mem_per_token > 0 && mem_per_token*N > model.eval_buf.size) {
|
||||||
|
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
|
||||||
|
printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, model.eval_buf.size, buf_size_new);
|
||||||
|
|
||||||
|
// reallocate
|
||||||
|
model.eval_buf.resize(buf_size_new);
|
||||||
|
if (model.eval_buf.addr == nullptr) {
|
||||||
|
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, model.eval_buf.size);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_init_params params = {
|
||||||
|
.mem_size = model.eval_buf.size,
|
||||||
|
.mem_buffer = model.eval_buf.addr,
|
||||||
|
.no_alloc = false
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ggml_context * ctx0 = ggml_init(params);
|
||||||
|
struct ggml_cgraph gf = {};
|
||||||
|
gf.n_threads = n_threads;
|
||||||
|
|
||||||
|
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||||
|
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
|
||||||
|
|
||||||
|
// wte
|
||||||
|
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
|
||||||
|
|
||||||
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
struct ggml_tensor * cur;
|
||||||
|
ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
|
||||||
|
// norm
|
||||||
|
{
|
||||||
|
cur = ggml_norm(ctx0, inpL);
|
||||||
|
|
||||||
|
// cur = ln_1_g*cur + ln_1_b
|
||||||
|
cur = ggml_add(ctx0,
|
||||||
|
ggml_mul(ctx0,
|
||||||
|
ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
|
||||||
|
cur),
|
||||||
|
ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * inpSA = cur;
|
||||||
|
|
||||||
|
// self-attention
|
||||||
|
{
|
||||||
|
struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_q_proj_w, cur);
|
||||||
|
struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_k_proj_w, cur);
|
||||||
|
struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].c_attn_v_proj_w, cur);
|
||||||
|
|
||||||
|
// store key and value to memory
|
||||||
|
{
|
||||||
|
struct ggml_tensor * k = ggml_view_1d(ctx0, model.kv_self.k, N*n_embd, (ggml_element_size(model.kv_self.k)*n_embd)*(il*n_ctx + n_past));
|
||||||
|
struct ggml_tensor * v = ggml_view_1d(ctx0, model.kv_self.v, N*n_embd, (ggml_element_size(model.kv_self.v)*n_embd)*(il*n_ctx + n_past));
|
||||||
|
|
||||||
|
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
|
||||||
|
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
|
||||||
|
struct ggml_tensor * Q =
|
||||||
|
ggml_permute(ctx0,
|
||||||
|
ggml_rope(ctx0,
|
||||||
|
ggml_cpy(ctx0,
|
||||||
|
Qcur,
|
||||||
|
ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
|
||||||
|
n_past, n_rot, 0),
|
||||||
|
0, 2, 1, 3);
|
||||||
|
|
||||||
|
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
|
||||||
|
struct ggml_tensor * K =
|
||||||
|
ggml_permute(ctx0,
|
||||||
|
ggml_rope(ctx0,
|
||||||
|
ggml_reshape_3d(ctx0,
|
||||||
|
ggml_view_1d(ctx0, model.kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.kv_self.k)*n_embd),
|
||||||
|
n_embd/n_head, n_head, n_past + N),
|
||||||
|
n_past, n_rot, 1),
|
||||||
|
0, 2, 1, 3);
|
||||||
|
|
||||||
|
// K * Q
|
||||||
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||||
|
|
||||||
|
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
||||||
|
struct ggml_tensor * KQ_scaled =
|
||||||
|
ggml_scale(ctx0,
|
||||||
|
KQ,
|
||||||
|
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
|
||||||
|
);
|
||||||
|
|
||||||
|
// KQ_masked = mask_past(KQ_scaled)
|
||||||
|
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
|
||||||
|
|
||||||
|
// KQ = soft_max(KQ_masked)
|
||||||
|
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
||||||
|
|
||||||
|
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
||||||
|
struct ggml_tensor * V_trans =
|
||||||
|
ggml_cpy(ctx0,
|
||||||
|
ggml_permute(ctx0,
|
||||||
|
ggml_reshape_3d(ctx0,
|
||||||
|
ggml_view_1d(ctx0, model.kv_self.v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.kv_self.v)*n_embd),
|
||||||
|
n_embd/n_head, n_head, n_past + N),
|
||||||
|
1, 2, 0, 3),
|
||||||
|
ggml_new_tensor_3d(ctx0, model.kv_self.v->type, n_past + N, n_embd/n_head, n_head));
|
||||||
|
|
||||||
|
// KQV = transpose(V) * KQ_soft_max
|
||||||
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
|
||||||
|
|
||||||
|
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||||
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||||
|
|
||||||
|
// cur = KQV_merged.contiguous().view(n_embd, N)
|
||||||
|
cur = ggml_cpy(ctx0,
|
||||||
|
KQV_merged,
|
||||||
|
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
||||||
|
|
||||||
|
// projection (no bias)
|
||||||
|
cur = ggml_mul_mat(ctx0,
|
||||||
|
model.layers[il].c_attn_proj_w,
|
||||||
|
cur);
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_tensor * inpFF = cur;
|
||||||
|
|
||||||
|
ggml_set_scratch(ctx0, {0, model.scr1_buf.size, model.scr1_buf.addr, });
|
||||||
|
// feed-forward network
|
||||||
|
// this is independent of the self-attention result, so it could be done in parallel to the self-attention
|
||||||
|
{
|
||||||
|
// note here we pass inpSA instead of cur
|
||||||
|
cur = ggml_mul_mat(ctx0,
|
||||||
|
model.layers[il].c_mlp_fc_w,
|
||||||
|
inpSA);
|
||||||
|
|
||||||
|
cur = ggml_add(ctx0,
|
||||||
|
ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
|
||||||
|
cur);
|
||||||
|
|
||||||
|
// GELU activation
|
||||||
|
cur = ggml_gelu(ctx0, cur);
|
||||||
|
|
||||||
|
// projection
|
||||||
|
// cur = proj_w*cur + proj_b
|
||||||
|
cur = ggml_mul_mat(ctx0,
|
||||||
|
model.layers[il].c_mlp_proj_w,
|
||||||
|
cur);
|
||||||
|
|
||||||
|
cur = ggml_add(ctx0,
|
||||||
|
ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
|
||||||
|
cur);
|
||||||
|
}
|
||||||
|
|
||||||
|
// self-attention + FF
|
||||||
|
cur = ggml_add(ctx0, cur, inpFF);
|
||||||
|
|
||||||
|
// input for next layer
|
||||||
|
inpL = ggml_add(ctx0, cur, inpL);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
|
||||||
|
|
||||||
|
// norm
|
||||||
|
{
|
||||||
|
inpL = ggml_norm(ctx0, inpL);
|
||||||
|
|
||||||
|
// inpL = ln_f_g*inpL + ln_f_b
|
||||||
|
inpL = ggml_add(ctx0,
|
||||||
|
ggml_mul(ctx0,
|
||||||
|
ggml_repeat(ctx0, model.ln_f_g, inpL),
|
||||||
|
inpL),
|
||||||
|
ggml_repeat(ctx0, model.ln_f_b, inpL));
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_set_scratch(ctx0, { 0, 0, nullptr, });
|
||||||
|
|
||||||
|
// lm_head
|
||||||
|
{
|
||||||
|
inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
|
||||||
|
|
||||||
|
inpL = ggml_add(ctx0,
|
||||||
|
ggml_repeat(ctx0, model.lmh_b, inpL),
|
||||||
|
inpL);
|
||||||
|
}
|
||||||
|
|
||||||
|
// logits -> probs
|
||||||
|
//inpL = ggml_soft_max(ctx0, inpL);
|
||||||
|
|
||||||
|
// run the computation
|
||||||
|
ggml_build_forward_expand(&gf, inpL);
|
||||||
|
ggml_graph_compute (ctx0, &gf);
|
||||||
|
|
||||||
|
//if (n_past%100 == 0) {
|
||||||
|
// ggml_graph_print (&gf);
|
||||||
|
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
|
||||||
|
//}
|
||||||
|
|
||||||
|
//embd_w.resize(n_vocab*N);
|
||||||
|
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
|
||||||
|
|
||||||
|
// return result for just the last token
|
||||||
|
embd_w.resize(n_vocab);
|
||||||
|
memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
||||||
|
|
||||||
|
if (mem_per_token == 0) {
|
||||||
|
mem_per_token = ggml_used_mem(ctx0)/N;
|
||||||
|
}
|
||||||
|
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
|
||||||
|
|
||||||
|
ggml_free(ctx0);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
#define GPTJ_MAX_RNG_STATE 64*1024
|
||||||
|
|
||||||
|
size_t gptj_get_state_size(const gptj_model &model)
|
||||||
|
{
|
||||||
|
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
||||||
|
// for reference, std::mt19937(1337) serializes to 6701 bytes.
|
||||||
|
const size_t s_rng_size = sizeof(size_t);
|
||||||
|
const size_t s_rng = GPTJ_MAX_RNG_STATE;
|
||||||
|
const size_t s_kv_size = sizeof(size_t);
|
||||||
|
const size_t s_kv_ntok = sizeof(int);
|
||||||
|
const size_t s_kv = model.kv_self.buf.size;
|
||||||
|
const size_t s_total = (
|
||||||
|
+ s_rng_size
|
||||||
|
+ s_rng
|
||||||
|
+ s_kv_size
|
||||||
|
+ s_kv_ntok
|
||||||
|
+ s_kv
|
||||||
|
);
|
||||||
|
fflush(stdout);
|
||||||
|
return s_total;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t gptj_copy_state_data(const gptj_model &model, const std::mt19937 &rng, uint8_t *dest)
|
||||||
|
{
|
||||||
|
uint8_t * out = dest;
|
||||||
|
fflush(stdout);
|
||||||
|
// copy rng
|
||||||
|
{
|
||||||
|
std::stringstream rng_ss;
|
||||||
|
rng_ss << rng;
|
||||||
|
|
||||||
|
const size_t rng_size = rng_ss.str().size();
|
||||||
|
char rng_buf[GPTJ_MAX_RNG_STATE];
|
||||||
|
|
||||||
|
memset(&rng_buf[0], 0, GPTJ_MAX_RNG_STATE);
|
||||||
|
memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
|
||||||
|
|
||||||
|
memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
|
||||||
|
memcpy(out, &rng_buf[0], GPTJ_MAX_RNG_STATE); out += GPTJ_MAX_RNG_STATE;
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy kv cache
|
||||||
|
{
|
||||||
|
const size_t kv_size = model.kv_self.buf.size;
|
||||||
|
const int kv_ntok = model.kv_self.n;
|
||||||
|
|
||||||
|
memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
|
||||||
|
memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
|
||||||
|
|
||||||
|
if (kv_size) {
|
||||||
|
memcpy(out, model.kv_self.buf.addr, kv_size); out += kv_size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t written = out - dest;
|
||||||
|
assert(written == gptj_get_state_size(model));
|
||||||
|
fflush(stdout);
|
||||||
|
return written;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t gptj_set_state_data(gptj_model *model, std::mt19937 *rng, const uint8_t *src)
|
||||||
|
{
|
||||||
|
const uint8_t * in = src;
|
||||||
|
|
||||||
|
// set rng
|
||||||
|
{
|
||||||
|
size_t rng_size;
|
||||||
|
char rng_buf[GPTJ_MAX_RNG_STATE];
|
||||||
|
|
||||||
|
memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
|
||||||
|
memcpy(&rng_buf[0], in, GPTJ_MAX_RNG_STATE); in += GPTJ_MAX_RNG_STATE;
|
||||||
|
|
||||||
|
std::stringstream rng_ss;
|
||||||
|
rng_ss.str(std::string(&rng_buf[0], rng_size));
|
||||||
|
rng_ss >> *rng;
|
||||||
|
|
||||||
|
assert(rng_ss.fail() == false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// set kv cache
|
||||||
|
{
|
||||||
|
size_t kv_size;
|
||||||
|
int kv_ntok;
|
||||||
|
|
||||||
|
memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
|
||||||
|
memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
|
||||||
|
|
||||||
|
if (kv_size) {
|
||||||
|
assert(model->kv_self.buf.size == kv_size);
|
||||||
|
|
||||||
|
void * k_data = model->kv_self.k->data; // remember data pointers
|
||||||
|
void * v_data = model->kv_self.v->data; // because their value is stored in buf and overwritten by memcpy
|
||||||
|
|
||||||
|
memcpy(model->kv_self.buf.addr, in, kv_size); in += kv_size;
|
||||||
|
|
||||||
|
model->kv_self.k->data = k_data; // restore correct data pointers
|
||||||
|
model->kv_self.v->data = v_data;
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
model->kv_self.n = kv_ntok;
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t nread = in - src;
|
||||||
|
assert(nread == gptj_get_state_size(*model));
|
||||||
|
fflush(stdout);
|
||||||
|
return nread;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct GPTJPrivate {
|
||||||
|
const std::string modelPath;
|
||||||
|
bool modelLoaded;
|
||||||
|
gpt_vocab vocab;
|
||||||
|
gptj_model *model = nullptr;
|
||||||
|
int64_t n_threads = 0;
|
||||||
|
size_t mem_per_token = 0;
|
||||||
|
std::mt19937 rng;
|
||||||
|
};
|
||||||
|
|
||||||
|
GPTJ::GPTJ()
|
||||||
|
: d_ptr(new GPTJPrivate) {
|
||||||
|
d_ptr->model = new gptj_model;
|
||||||
|
d_ptr->model->ctx = nullptr;
|
||||||
|
d_ptr->modelLoaded = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t GPTJ::requiredMem(const std::string &modelPath) {
|
||||||
|
gptj_model dummy_model;
|
||||||
|
gpt_vocab dummy_vocab;
|
||||||
|
size_t mem_req;
|
||||||
|
auto fin = std::ifstream(modelPath, std::ios::binary);
|
||||||
|
gptj_model_load(modelPath, fin, dummy_model, dummy_vocab, &mem_req);
|
||||||
|
return mem_req;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool GPTJ::loadModel(const std::string &modelPath) {
|
||||||
|
std::mt19937 rng(time(NULL));
|
||||||
|
d_ptr->rng = rng;
|
||||||
|
|
||||||
|
auto fin = std::ifstream(modelPath, std::ios::binary);
|
||||||
|
|
||||||
|
// load the model
|
||||||
|
if (!gptj_model_load(modelPath, fin, *d_ptr->model, d_ptr->vocab)) {
|
||||||
|
std::cerr << "GPT-J ERROR: failed to load model from " << modelPath;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||||
|
d_ptr->modelLoaded = true;
|
||||||
|
fflush(stdout);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void GPTJ::setThreadCount(int32_t n_threads) {
|
||||||
|
d_ptr->n_threads = n_threads;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t GPTJ::threadCount() const
|
||||||
|
{
|
||||||
|
return d_ptr->n_threads;
|
||||||
|
}
|
||||||
|
|
||||||
|
GPTJ::~GPTJ()
|
||||||
|
{
|
||||||
|
delete d_ptr->model;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool GPTJ::isModelLoaded() const
|
||||||
|
{
|
||||||
|
return d_ptr->modelLoaded;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t GPTJ::stateSize() const
|
||||||
|
{
|
||||||
|
return gptj_get_state_size(*d_ptr->model);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t GPTJ::saveState(uint8_t *dest) const
|
||||||
|
{
|
||||||
|
return gptj_copy_state_data(*d_ptr->model, d_ptr->rng, dest);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t GPTJ::restoreState(const uint8_t *src)
|
||||||
|
{
|
||||||
|
return gptj_set_state_data(d_ptr->model, &d_ptr->rng, src);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<LLModel::Token> GPTJ::tokenize(PromptContext &, const std::string &str) const
|
||||||
|
{
|
||||||
|
return ::gpt_tokenize(d_ptr->vocab, str);
|
||||||
|
}
|
||||||
|
|
||||||
|
LLModel::Token GPTJ::sampleToken(PromptContext &promptCtx) const
|
||||||
|
{
|
||||||
|
const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
|
||||||
|
return gpt_sample_top_k_top_p(d_ptr->model->hparams.n_vocab,
|
||||||
|
promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
|
||||||
|
n_prev_toks,
|
||||||
|
promptCtx.logits,
|
||||||
|
promptCtx.top_k, promptCtx.top_p, promptCtx.temp,
|
||||||
|
promptCtx.repeat_penalty,
|
||||||
|
d_ptr->rng);
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string GPTJ::tokenToString(Token id) const
|
||||||
|
{
|
||||||
|
return d_ptr->vocab.id_to_token[id];
|
||||||
|
}
|
||||||
|
|
||||||
|
bool GPTJ::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
|
||||||
|
{
|
||||||
|
// determine the required inference memory per token:
|
||||||
|
static bool initialized = false;
|
||||||
|
if (!initialized) {
|
||||||
|
gptj_eval(*d_ptr->model, d_ptr->n_threads, 0, { 0, 1, 2, 3 }, ctx.logits,
|
||||||
|
d_ptr->mem_per_token);
|
||||||
|
initialized = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return gptj_eval(*d_ptr->model, d_ptr->n_threads, ctx.n_past, tokens, ctx.logits, d_ptr->mem_per_token);
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t GPTJ::contextLength() const
|
||||||
|
{
|
||||||
|
return d_ptr->model->hparams.n_ctx;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::vector<LLModel::Token> &GPTJ::endTokens() const
|
||||||
|
{
|
||||||
|
static const std::vector<LLModel::Token> fres = {50256};
|
||||||
|
return fres;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(_WIN32)
#define DLL_EXPORT __declspec(dllexport)
#else
#define DLL_EXPORT __attribute__ ((visibility ("default")))
#endif

extern "C" {
DLL_EXPORT bool is_g4a_backend_model_implementation() {
    return true;
}

DLL_EXPORT const char *get_model_type() {
    return modelType_;
}

DLL_EXPORT const char *get_build_variant() {
    return GGML_BUILD_VARIANT;
}

DLL_EXPORT bool magic_match(std::istream& f) {
    uint32_t magic = 0;
    f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
    gptj_hparams hparams;
    f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));
    if (!(hparams.n_vocab >= 50300 && hparams.n_vocab <= 50400)) {
        return false; // not a gptj.
    }
    return magic == 0x67676d6c;
}

DLL_EXPORT LLModel *construct() {
    return new GPTJ;
}
}
0   gpt4all-backend/gptj/placeholder  Normal file
41  gpt4all-backend/gptj_impl.h       Normal file
@ -0,0 +1,41 @@
#ifndef GPTJ_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#error This file is NOT meant to be included outside of gptj.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define GPTJ_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#endif
#ifndef GPTJ_H
#define GPTJ_H

#include <string>
#include <functional>
#include <vector>
#include "llmodel.h"

struct GPTJPrivate;
class GPTJ : public LLModel {
public:
    GPTJ();
    ~GPTJ();

    bool supportsEmbedding() const override { return false; }
    bool supportsCompletion() const override { return true; }
    bool loadModel(const std::string &modelPath) override;
    bool isModelLoaded() const override;
    size_t requiredMem(const std::string &modelPath) override;
    size_t stateSize() const override;
    size_t saveState(uint8_t *dest) const override;
    size_t restoreState(const uint8_t *src) override;
    void setThreadCount(int32_t n_threads) override;
    int32_t threadCount() const override;

private:
    GPTJPrivate *d_ptr;

protected:
    std::vector<Token> tokenize(PromptContext &, const std::string&) const override;
    Token sampleToken(PromptContext &ctx) const override;
    std::string tokenToString(Token) const override;
    bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
    int32_t contextLength() const override;
    const std::vector<Token>& endTokens() const override;
};

#endif // GPTJ_H
@ -1,273 +0,0 @@
#ifndef LLMODEL_H
#define LLMODEL_H

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <expected>
#include <functional>
#include <optional>
#include <span>
#include <stdexcept>
#include <string>
#include <string_view>
#include <unordered_map>
#include <utility>
#include <vector>

class Dlhandle;

using namespace std::string_literals;

#define LLMODEL_MAX_PROMPT_BATCH 128

class LLModel {
public:
    using Token = int32_t;
    using PromptCallback = std::function<bool(std::span<const Token> batch, bool cached)>;
    using ResponseCallback = std::function<bool(Token token, std::string_view piece)>;
    using EmbedCancelCallback = bool(unsigned *batchSizes, unsigned nBatch, const char *backend);
    using ProgressCallback = std::function<bool(float progress)>;

    class BadArchError: public std::runtime_error {
    public:
        BadArchError(std::string arch)
            : runtime_error("Unsupported model architecture: " + arch)
            , m_arch(std::move(arch))
        {}

        const std::string &arch() const noexcept { return m_arch; }

    private:
        std::string m_arch;
    };

    class MissingImplementationError: public std::runtime_error {
    public:
        using std::runtime_error::runtime_error;
    };

    class UnsupportedModelError: public std::runtime_error {
    public:
        using std::runtime_error::runtime_error;
    };

    struct GPUDevice {
        const char *backend;
        int index;
        int type;
        size_t heapSize;
        std::string name;
        std::string vendor;

        GPUDevice(const char *backend, int index, int type, size_t heapSize, std::string name, std::string vendor):
            backend(backend), index(index), type(type), heapSize(heapSize), name(std::move(name)),
            vendor(std::move(vendor)) {}

        std::string selectionName() const
        {
            assert(backend == "cuda"s || backend == "kompute"s);
            return backendName() + ": " + name;
        }

        std::string backendName() const { return backendIdToName(backend); }

        static std::string backendIdToName(const std::string &backend) { return s_backendNames.at(backend); }

        static std::string updateSelectionName(const std::string &name) {
            if (name == "Auto" || name == "CPU" || name == "Metal")
                return name;
            auto it = std::find_if(s_backendNames.begin(), s_backendNames.end(), [&name](const auto &entry) {
                return name.starts_with(entry.second + ": ");
            });
            if (it != s_backendNames.end())
                return name;
            return "Vulkan: " + name; // previously, there were only Vulkan devices
        }

    private:
        static inline const std::unordered_map<std::string, std::string> s_backendNames {
            {"cpu", "CPU"}, {"metal", "Metal"}, {"cuda", "CUDA"}, {"kompute", "Vulkan"},
        };
    };

    class Implementation {
    public:
        Implementation(const Implementation &) = delete;
        Implementation(Implementation &&);
        ~Implementation();

        std::string_view modelType() const { return m_modelType; }
        std::string_view buildVariant() const { return m_buildVariant; }

        static LLModel *construct(const std::string &modelPath, const std::string &backend = "auto", int n_ctx = 2048);
        static std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0);
        static int32_t maxContextLength(const std::string &modelPath);
        static int32_t layerCount(const std::string &modelPath);
        static bool isEmbeddingModel(const std::string &modelPath);
        static auto chatTemplate(const char *modelPath) -> std::expected<std::string, std::string>;
        static void setImplementationsSearchPath(const std::string &path);
        static const std::string &implementationsSearchPath();
        static bool hasSupportedCPU();
        // 0 for no, 1 for yes, -1 for non-x86_64
        static int cpuSupportsAVX2();

    private:
        Implementation(Dlhandle &&);

        static const std::vector<Implementation> &implementationList();
        static const Implementation *implementation(const char *fname, const std::string &buildVariant);
|
|
||||||
static LLModel *constructGlobalLlama(const std::optional<std::string> &backend = std::nullopt);
|
|
||||||
|
|
||||||
char *(*m_getFileArch)(const char *fname);
|
|
||||||
bool (*m_isArchSupported)(const char *arch);
|
|
||||||
LLModel *(*m_construct)();
|
|
||||||
|
|
||||||
std::string_view m_modelType;
|
|
||||||
std::string_view m_buildVariant;
|
|
||||||
Dlhandle *m_dlhandle;
|
|
||||||
};
|
|
||||||
|
|
||||||
struct PromptContext {
|
|
||||||
int32_t n_predict = 200;
|
|
||||||
int32_t top_k = 40;
|
|
||||||
float top_p = 0.9f;
|
|
||||||
float min_p = 0.0f;
|
|
||||||
float temp = 0.9f;
|
|
||||||
int32_t n_batch = 9;
|
|
||||||
float repeat_penalty = 1.10f;
|
|
||||||
int32_t repeat_last_n = 64; // last n tokens to penalize
|
|
||||||
float contextErase = 0.5f; // percent of context to erase if we exceed the context window
|
|
||||||
};
|
|
||||||
|
|
||||||
explicit LLModel() {}
|
|
||||||
virtual ~LLModel() {}
|
|
||||||
|
|
||||||
virtual bool supportsEmbedding() const = 0;
|
|
||||||
virtual bool supportsCompletion() const = 0;
|
|
||||||
virtual bool loadModel(const std::string &modelPath, int n_ctx, int ngl) = 0;
|
|
||||||
virtual bool isModelBlacklisted(const std::string &modelPath) const { (void)modelPath; return false; }
|
|
||||||
virtual bool isEmbeddingModel(const std::string &modelPath) const { (void)modelPath; return false; }
|
|
||||||
virtual bool isModelLoaded() const = 0;
|
|
||||||
virtual size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) = 0;
|
|
||||||
virtual size_t stateSize() const = 0;
|
|
||||||
virtual size_t saveState(std::span<uint8_t> stateOut, std::vector<Token> &inputTokensOut) const = 0;
|
|
||||||
virtual size_t restoreState(std::span<const uint8_t> state, std::span<const Token> inputTokens) = 0;
|
|
||||||
|
|
||||||
// This method requires the model to return true from supportsCompletion otherwise it will throw
|
|
||||||
// an error
|
|
||||||
virtual void prompt(std::string_view prompt,
|
|
||||||
const PromptCallback &promptCallback,
|
|
||||||
const ResponseCallback &responseCallback,
|
|
||||||
const PromptContext &ctx);
|
|
||||||
|
|
||||||
virtual int32_t countPromptTokens(std::string_view prompt) const;
|
|
||||||
|
|
||||||
virtual size_t embeddingSize() const {
|
|
||||||
throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
|
|
||||||
}
|
|
||||||
// user-specified prefix
|
|
||||||
virtual void embed(const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix,
|
|
||||||
int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false,
|
|
||||||
EmbedCancelCallback *cancelCb = nullptr);
|
|
||||||
// automatic prefix
|
|
||||||
virtual void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval,
|
|
||||||
int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false);
|
|
||||||
|
|
||||||
virtual void setThreadCount(int32_t n_threads) { (void)n_threads; }
|
|
||||||
virtual int32_t threadCount() const { return 1; }
|
|
||||||
|
|
||||||
const Implementation &implementation() const {
|
|
||||||
return *m_implementation;
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) const {
|
|
||||||
(void)memoryRequired;
|
|
||||||
return {};
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const {
|
|
||||||
(void)memoryRequired;
|
|
||||||
(void)name;
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const {
|
|
||||||
(void)device;
|
|
||||||
if (unavail_reason) {
|
|
||||||
*unavail_reason = "model has no GPU support";
|
|
||||||
}
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual bool usingGPUDevice() const { return false; }
|
|
||||||
virtual const char *backendName() const { return "cpu"; }
|
|
||||||
virtual const char *gpuDeviceName() const { return nullptr; }
|
|
||||||
|
|
||||||
void setProgressCallback(ProgressCallback callback) { m_progressCallback = callback; }
|
|
||||||
|
|
||||||
virtual int32_t contextLength() const = 0;
|
|
||||||
virtual auto specialTokens() -> std::unordered_map<std::string, std::string> const = 0;
|
|
||||||
|
|
||||||
protected:
|
|
||||||
// These are pure virtual because subclasses need to implement as the default implementation of
|
|
||||||
// 'prompt' above calls these functions
|
|
||||||
virtual std::vector<Token> tokenize(std::string_view str) const = 0;
|
|
||||||
virtual bool isSpecialToken(Token id) const = 0;
|
|
||||||
virtual std::string tokenToString(Token id) const = 0;
|
|
||||||
virtual void initSampler(const PromptContext &ctx) = 0;
|
|
||||||
virtual Token sampleToken() const = 0;
|
|
||||||
virtual bool evalTokens(int32_t nPast, std::span<const Token> tokens) const = 0;
|
|
||||||
virtual void shiftContext(const PromptContext &promptCtx, int32_t *nPast) = 0;
|
|
||||||
virtual int32_t inputLength() const = 0;
|
|
||||||
virtual int32_t computeModelInputPosition(std::span<const Token> input) const = 0;
|
|
||||||
virtual void setModelInputPosition(int32_t pos) = 0;
|
|
||||||
virtual void appendInputToken(Token tok) = 0;
|
|
||||||
virtual std::span<const Token> inputTokens() const = 0;
|
|
||||||
virtual const std::vector<Token> &endTokens() const = 0;
|
|
||||||
virtual bool shouldAddBOS() const = 0;
|
|
||||||
|
|
||||||
virtual int32_t maxContextLength(std::string const &modelPath) const
|
|
||||||
{
|
|
||||||
(void)modelPath;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual int32_t layerCount(std::string const &modelPath) const
|
|
||||||
{
|
|
||||||
(void)modelPath;
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
|
|
||||||
virtual auto chatTemplate(const char *modelPath) const -> std::expected<std::string, std::string>
|
|
||||||
{
|
|
||||||
(void)modelPath;
|
|
||||||
return std::unexpected("not implemented");
|
|
||||||
}
|
|
||||||
|
|
||||||
const Implementation *m_implementation = nullptr;
|
|
||||||
|
|
||||||
ProgressCallback m_progressCallback;
|
|
||||||
static bool staticProgressCallback(float progress, void* ctx)
|
|
||||||
{
|
|
||||||
LLModel* model = static_cast<LLModel*>(ctx);
|
|
||||||
if (model && model->m_progressCallback)
|
|
||||||
return model->m_progressCallback(progress);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// prefill context with prompt
|
|
||||||
auto decodePrompt(const PromptCallback &promptCallback,
|
|
||||||
const PromptContext &promptCtx,
|
|
||||||
std::vector<Token> embd_inp)
|
|
||||||
-> std::optional<int32_t>;
|
|
||||||
// generate a response
|
|
||||||
void generateResponse(const ResponseCallback &responseCallback,
|
|
||||||
const PromptContext &promptCtx,
|
|
||||||
int32_t nPast);
|
|
||||||
|
|
||||||
friend class LLMImplementation;
|
|
||||||
};
|
|
||||||
|
|
||||||
#endif // LLMODEL_H
|
|
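The interface deleted in the hunk above (the newer main-branch LLModel) drives generation entirely through std::function callbacks: the prompt callback receives batches of prompt tokens, and the response callback receives each sampled token together with its decoded text piece. A hedged sketch of a caller follows; it assumes `model` already refers to a loaded concrete LLModel subclass, and collecting the output into a std::string is purely illustrative.

#include "llmodel.h"   // the (now removed) main-branch header shown above

#include <iostream>
#include <span>
#include <string>
#include <string_view>

// Hypothetical driver: stream a completion for `text` out of an already-loaded model.
void run_prompt(LLModel &model, const std::string &text) {
    LLModel::PromptContext ctx;   // defaults from the header: n_predict = 200, temp = 0.9f, ...
    std::string output;

    LLModel::PromptCallback onPrompt =
        [](std::span<const LLModel::Token> /*batch*/, bool /*cached*/) { return true; };
    LLModel::ResponseCallback onResponse =
        [&output](LLModel::Token /*tok*/, std::string_view piece) {
            output += piece;      // collect the generated text
            return true;          // returning false would stop generation early
        };

    model.prompt(text, onPrompt, onResponse, ctx);
    std::cout << output << '\n';
}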
1    gpt4all-backend/llama.cpp-230511    Submodule
@ -0,0 +1 @@
Subproject commit f826aac617e1c5847ecb5115f75433aff82f759a
1    gpt4all-backend/llama.cpp-230519    Submodule
@ -0,0 +1 @@
Subproject commit 5ea43392731040b454c293123839b90e159cbb99
1    gpt4all-backend/llama.cpp-mainline    Submodule
@ -0,0 +1 @@
Subproject commit 9bee309a7c8db77ca53fab49c2d896d486905617
File diff suppressed because it is too large    Load Diff
0    gpt4all-backend/llama/placeholder    Normal file
397  gpt4all-backend/llamamodel.cpp    Normal file
@ -0,0 +1,397 @@
|
|||||||
|
#define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
|
||||||
|
#include "llamamodel_impl.h"
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
#include <fstream>
|
||||||
|
#include <map>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <iostream>
|
||||||
|
#if defined(_WIN32) && defined(_MSC_VER)
|
||||||
|
#define WIN32_LEAN_AND_MEAN
|
||||||
|
#ifndef NOMINMAX
|
||||||
|
#define NOMINMAX
|
||||||
|
#endif
|
||||||
|
#include <windows.h>
|
||||||
|
#include <io.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#else
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
#include <random>
|
||||||
|
#include <thread>
|
||||||
|
#include <unordered_set>
|
||||||
|
|
||||||
|
#include <llama.h>
|
||||||
|
#include <ggml.h>
|
||||||
|
|
||||||
|
#ifdef GGML_USE_KOMPUTE
|
||||||
|
#include "ggml-vulkan.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
const char *modelType_ = "LLaMA";
|
||||||
|
}
|
||||||
|
|
||||||
|
struct gpt_params {
|
||||||
|
int32_t seed = -1; // RNG seed
|
||||||
|
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||||
|
#if LLAMA_DATE <= 230511
|
||||||
|
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if LLAMA_DATE >= 230519
|
||||||
|
// sampling parameters
|
||||||
|
float tfs_z = 1.0f; // 1.0 = disabled
|
||||||
|
float typical_p = 1.0f; // 1.0 = disabled
|
||||||
|
#endif
|
||||||
|
|
||||||
|
std::string prompt = "";
|
||||||
|
|
||||||
|
bool memory_f16 = true; // use f16 instead of f32 for memory kv
|
||||||
|
|
||||||
|
bool use_mmap = true; // use mmap for faster loads
|
||||||
|
bool use_mlock = false; // use mlock to keep model in memory
|
||||||
|
};
|
||||||
|
|
||||||
|
#if LLAMA_DATE >= 230519
|
||||||
|
static int llama_sample_top_p_top_k(
|
||||||
|
llama_context *ctx,
|
||||||
|
const llama_token *last_n_tokens_data,
|
||||||
|
int last_n_tokens_size,
|
||||||
|
int top_k,
|
||||||
|
float top_p,
|
||||||
|
float temp,
|
||||||
|
float repeat_penalty) {
|
||||||
|
auto logits = llama_get_logits(ctx);
|
||||||
|
auto n_vocab = llama_n_vocab(ctx);
|
||||||
|
// Populate initial list of all candidates
|
||||||
|
std::vector<llama_token_data> candidates;
|
||||||
|
candidates.reserve(n_vocab);
|
||||||
|
for (int token_id = 0; token_id < n_vocab; token_id++) {
|
||||||
|
candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f});
|
||||||
|
}
|
||||||
|
llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false};
|
||||||
|
// Sample repeat penalty
|
||||||
|
llama_sample_repetition_penalty(nullptr, &candidates_p, last_n_tokens_data, last_n_tokens_size, repeat_penalty);
|
||||||
|
// Temperature sampling
|
||||||
|
llama_sample_top_k(ctx, &candidates_p, top_k, 1);
|
||||||
|
llama_sample_tail_free(ctx, &candidates_p, 1.0f, 1);
|
||||||
|
llama_sample_typical(ctx, &candidates_p, 1.0f, 1);
|
||||||
|
llama_sample_top_p(ctx, &candidates_p, top_p, 1);
|
||||||
|
llama_sample_temperature(ctx, &candidates_p, temp);
|
||||||
|
return llama_sample_token(ctx, &candidates_p);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct LLamaPrivate {
|
||||||
|
const std::string modelPath;
|
||||||
|
bool modelLoaded;
|
||||||
|
llama_context *ctx = nullptr;
|
||||||
|
llama_context_params params;
|
||||||
|
int64_t n_threads = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
LLamaModel::LLamaModel()
|
||||||
|
: d_ptr(new LLamaPrivate) {
|
||||||
|
d_ptr->modelLoaded = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// default hparams (LLaMA 7B)
|
||||||
|
struct llama_file_hparams {
|
||||||
|
uint32_t n_vocab = 32000;
|
||||||
|
uint32_t n_embd = 4096;
|
||||||
|
uint32_t n_mult = 256;
|
||||||
|
uint32_t n_head = 32;
|
||||||
|
uint32_t n_layer = 32;
|
||||||
|
uint32_t n_rot = 64;
|
||||||
|
enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16;
|
||||||
|
};
|
||||||
|
|
||||||
|
size_t LLamaModel::requiredMem(const std::string &modelPath) {
|
||||||
|
auto fin = std::ifstream(modelPath, std::ios::binary);
|
||||||
|
fin.seekg(0, std::ios_base::end);
|
||||||
|
size_t filesize = fin.tellg();
|
||||||
|
fin.seekg(0, std::ios_base::beg);
|
||||||
|
uint32_t magic = 0;
|
||||||
|
fin.read(reinterpret_cast<char*>(&magic), sizeof(magic));
|
||||||
|
if (magic != 0x67676a74) return 0;
|
||||||
|
uint32_t version = 0;
|
||||||
|
fin.read(reinterpret_cast<char*>(&version), sizeof(version));
|
||||||
|
llama_file_hparams hparams;
|
||||||
|
fin.read(reinterpret_cast<char*>(&hparams.n_vocab), sizeof(hparams.n_vocab));
|
||||||
|
fin.read(reinterpret_cast<char*>(&hparams.n_embd), sizeof(hparams.n_embd));
|
||||||
|
fin.read(reinterpret_cast<char*>(&hparams.n_head), sizeof(hparams.n_head));
|
||||||
|
fin.read(reinterpret_cast<char*>(&hparams.n_layer), sizeof(hparams.n_layer));
|
||||||
|
fin.read(reinterpret_cast<char*>(&hparams.n_rot), sizeof(hparams.n_rot));
|
||||||
|
fin.read(reinterpret_cast<char*>(&hparams.ftype), sizeof(hparams.ftype));
|
||||||
|
const size_t n_ctx = 2048;
|
||||||
|
const size_t kvcache_element_size = 2; // fp16
|
||||||
|
const size_t est_kvcache_size = hparams.n_embd * hparams.n_layer * 2u * n_ctx * kvcache_element_size;
|
||||||
|
return filesize + est_kvcache_size;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool LLamaModel::loadModel(const std::string &modelPath)
|
||||||
|
{
|
||||||
|
// load the model
|
||||||
|
d_ptr->params = llama_context_default_params();
|
||||||
|
|
||||||
|
gpt_params params;
|
||||||
|
d_ptr->params.n_ctx = 2048;
|
||||||
|
d_ptr->params.seed = params.seed;
|
||||||
|
d_ptr->params.f16_kv = params.memory_f16;
|
||||||
|
d_ptr->params.use_mmap = params.use_mmap;
|
||||||
|
#if defined (__APPLE__)
|
||||||
|
d_ptr->params.use_mlock = true;
|
||||||
|
#else
|
||||||
|
d_ptr->params.use_mlock = params.use_mlock;
|
||||||
|
#endif
|
||||||
|
#if LLAMA_DATE <= 230511
|
||||||
|
d_ptr->params.n_parts = params.n_parts;
|
||||||
|
#endif
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
std::cerr << "llama.cpp: using Metal" << std::endl;
|
||||||
|
// metal always runs the whole model if n_gpu_layers is not 0, at least
|
||||||
|
// currently
|
||||||
|
d_ptr->params.n_gpu_layers = 1;
|
||||||
|
#endif
|
||||||
|
#ifdef GGML_USE_KOMPUTE
|
||||||
|
if (ggml_vk_has_device()) {
|
||||||
|
// vulkan always runs the whole model if n_gpu_layers is not 0, at least
|
||||||
|
// currently
|
||||||
|
d_ptr->params.n_gpu_layers = 1;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
d_ptr->ctx = llama_init_from_file(modelPath.c_str(), d_ptr->params);
|
||||||
|
if (!d_ptr->ctx) {
|
||||||
|
std::cerr << "LLAMA ERROR: failed to load model from " << modelPath << std::endl;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef GGML_USE_KOMPUTE
|
||||||
|
if (ggml_vk_has_device()) {
|
||||||
|
std::cerr << "llama.cpp: using Vulkan on " << ggml_vk_current_device().name << std::endl;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
|
||||||
|
d_ptr->modelLoaded = true;
|
||||||
|
fflush(stderr);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void LLamaModel::setThreadCount(int32_t n_threads) {
|
||||||
|
d_ptr->n_threads = n_threads;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t LLamaModel::threadCount() const {
|
||||||
|
return d_ptr->n_threads;
|
||||||
|
}
|
||||||
|
|
||||||
|
LLamaModel::~LLamaModel()
|
||||||
|
{
|
||||||
|
if(d_ptr->ctx) {
|
||||||
|
llama_free(d_ptr->ctx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
bool LLamaModel::isModelLoaded() const
|
||||||
|
{
|
||||||
|
return d_ptr->modelLoaded;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t LLamaModel::stateSize() const
|
||||||
|
{
|
||||||
|
return llama_get_state_size(d_ptr->ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t LLamaModel::saveState(uint8_t *dest) const
|
||||||
|
{
|
||||||
|
return llama_copy_state_data(d_ptr->ctx, dest);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t LLamaModel::restoreState(const uint8_t *src)
|
||||||
|
{
|
||||||
|
// const_cast is required, see: https://github.com/ggerganov/llama.cpp/pull/1540
|
||||||
|
return llama_set_state_data(d_ptr->ctx, const_cast<uint8_t*>(src));
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<LLModel::Token> LLamaModel::tokenize(PromptContext &ctx, const std::string &str) const
|
||||||
|
{
|
||||||
|
const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos());
|
||||||
|
std::vector<LLModel::Token> fres(str.size()+4);
|
||||||
|
auto fres_len = llama_tokenize(d_ptr->ctx, str.c_str(), fres.data(), fres.size(), useBOS);
|
||||||
|
fres.resize(fres_len);
|
||||||
|
return fres;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string LLamaModel::tokenToString(Token id) const
|
||||||
|
{
|
||||||
|
return llama_token_to_str(d_ptr->ctx, id);
|
||||||
|
}
|
||||||
|
|
||||||
|
LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
|
||||||
|
{
|
||||||
|
const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
|
||||||
|
return llama_sample_top_p_top_k(d_ptr->ctx,
|
||||||
|
promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
|
||||||
|
n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.temp,
|
||||||
|
promptCtx.repeat_penalty);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
|
||||||
|
{
|
||||||
|
// When we recalculate context we could have erased the original BOS token... we need to replace it
|
||||||
|
const bool useBOS = ctx.n_past == 0 && (ctx.tokens.empty() || ctx.tokens.front() != llama_token_bos());
|
||||||
|
if (useBOS) {
|
||||||
|
std::vector<int32_t> myTokens;
|
||||||
|
myTokens.push_back(llama_token_bos());
|
||||||
|
myTokens.insert(myTokens.end(), tokens.begin(), tokens.end());
|
||||||
|
ctx.n_past += 1;
|
||||||
|
return llama_eval(d_ptr->ctx, myTokens.data(), myTokens.size(), ctx.n_past, d_ptr->n_threads) == 0;
|
||||||
|
} else
|
||||||
|
return llama_eval(d_ptr->ctx, tokens.data(), tokens.size(), ctx.n_past, d_ptr->n_threads) == 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t LLamaModel::contextLength() const
|
||||||
|
{
|
||||||
|
return llama_n_ctx(d_ptr->ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::vector<LLModel::Token> &LLamaModel::endTokens() const
|
||||||
|
{
|
||||||
|
static const std::vector<LLModel::Token> fres = {llama_token_eos()};
|
||||||
|
return fres;
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(GGML_USE_KOMPUTE)
|
||||||
|
#include "ggml-vulkan.h"
|
||||||
|
#endif
|
||||||
|
|
||||||
|
std::vector<LLModel::GPUDevice> LLamaModel::availableGPUDevices(size_t memoryRequired)
|
||||||
|
{
|
||||||
|
#if defined(GGML_USE_KOMPUTE)
|
||||||
|
std::vector<ggml_vk_device> vkDevices = ggml_vk_available_devices(memoryRequired);
|
||||||
|
|
||||||
|
std::vector<LLModel::GPUDevice> devices;
|
||||||
|
for(const auto& vkDevice : vkDevices) {
|
||||||
|
LLModel::GPUDevice device;
|
||||||
|
device.index = vkDevice.index;
|
||||||
|
device.type = vkDevice.type;
|
||||||
|
device.heapSize = vkDevice.heapSize;
|
||||||
|
device.name = vkDevice.name;
|
||||||
|
device.vendor = vkDevice.vendor;
|
||||||
|
|
||||||
|
devices.push_back(device);
|
||||||
|
}
|
||||||
|
|
||||||
|
return devices;
|
||||||
|
#else
|
||||||
|
return std::vector<LLModel::GPUDevice>();
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
bool LLamaModel::initializeGPUDevice(size_t memoryRequired, const std::string& device)
|
||||||
|
{
|
||||||
|
#if defined(GGML_USE_KOMPUTE)
|
||||||
|
return ggml_vk_init_device(memoryRequired, device);
|
||||||
|
#else
|
||||||
|
return false;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
bool LLamaModel::initializeGPUDevice(const LLModel::GPUDevice &device)
|
||||||
|
{
|
||||||
|
#if defined(GGML_USE_KOMPUTE)
|
||||||
|
ggml_vk_device vkDevice;
|
||||||
|
vkDevice.index = device.index;
|
||||||
|
vkDevice.type = device.type;
|
||||||
|
vkDevice.heapSize = device.heapSize;
|
||||||
|
vkDevice.name = device.name;
|
||||||
|
vkDevice.vendor = device.vendor;
|
||||||
|
return ggml_vk_init_device(vkDevice);
|
||||||
|
#else
|
||||||
|
return false;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
bool LLamaModel::initializeGPUDevice(int device)
|
||||||
|
{
|
||||||
|
#if defined(GGML_USE_KOMPUTE)
|
||||||
|
return ggml_vk_init_device(device);
|
||||||
|
#else
|
||||||
|
return false;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
bool LLamaModel::hasGPUDevice()
|
||||||
|
{
|
||||||
|
#if defined(GGML_USE_KOMPUTE)
|
||||||
|
return ggml_vk_has_device();
|
||||||
|
#else
|
||||||
|
return false;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
#if defined(_WIN32)
|
||||||
|
#define DLL_EXPORT __declspec(dllexport)
|
||||||
|
#else
|
||||||
|
#define DLL_EXPORT __attribute__ ((visibility ("default")))
|
||||||
|
#endif
|
||||||
|
|
||||||
|
extern "C" {
|
||||||
|
DLL_EXPORT bool is_g4a_backend_model_implementation() {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
DLL_EXPORT const char *get_model_type() {
|
||||||
|
return modelType_;
|
||||||
|
}
|
||||||
|
|
||||||
|
DLL_EXPORT const char *get_build_variant() {
|
||||||
|
return GGML_BUILD_VARIANT;
|
||||||
|
}
|
||||||
|
|
||||||
|
DLL_EXPORT bool magic_match(std::istream& f) {
|
||||||
|
// Check magic
|
||||||
|
uint32_t magic = 0;
|
||||||
|
f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
|
||||||
|
if (magic != 0x67676a74) return false;
|
||||||
|
// Check version
|
||||||
|
uint32_t version = 0;
|
||||||
|
f.read(reinterpret_cast<char*>(&version), sizeof(version));
|
||||||
|
if (!(version LLAMA_VERSIONS)) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
llama_file_hparams hparams;
|
||||||
|
f.read(reinterpret_cast<char*>(&hparams), sizeof(hparams));
|
||||||
|
if (!(hparams.n_vocab >= 32000 && hparams.n_vocab <= 32100)) {
|
||||||
|
return false; // not a llama.
|
||||||
|
}
|
||||||
|
#ifdef GGML_USE_METAL
|
||||||
|
// Check quant supported on metal
|
||||||
|
// skip fields
|
||||||
|
switch(hparams.ftype) {
|
||||||
|
// currently supported on Metal https://github.com/ggerganov/llama.cpp/blob/ae9663f1887513e152839e91f61c513075a19422/ggml-metal.m#L51-L55
|
||||||
|
case LLAMA_FTYPE_MOSTLY_F16:
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q2_K:
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q4_0:
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q6_K:
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q4_K_S:
|
||||||
|
case LLAMA_FTYPE_MOSTLY_Q4_K_M:
|
||||||
|
return true;
|
||||||
|
default: // unsupported quant-type for Metal
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
DLL_EXPORT LLModel *construct() {
|
||||||
|
return new LLamaModel;
|
||||||
|
}
|
||||||
|
}
|
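The magic_match() exported at the end of llamamodel.cpp above decides whether a file is a llama.cpp-style model by reading a 32-bit magic value, a version, and the header fields before any weights. The standalone probe below repeats just the magic step; the function name looks_like_ggjt is an illustrative assumption, and the full check in the diff also gates on the version macro and on n_vocab falling in the 32000-32100 range (plus the Metal quantization whitelist).

#include <cstdint>
#include <fstream>
#include <string>

// Hypothetical helper: true if the file starts with the same magic value magic_match() accepts.
bool looks_like_ggjt(const std::string &path) {
    std::ifstream f(path, std::ios::binary);
    if (!f) return false;

    uint32_t magic = 0;
    f.read(reinterpret_cast<char *>(&magic), sizeof(magic));
    if (!f || magic != 0x67676a74) return false;   // same constant checked in the diff above

    uint32_t version = 0;
    f.read(reinterpret_cast<char *>(&version), sizeof(version));
    return f.good();   // a real check would also validate `version` and the hparams that follow
}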
46   gpt4all-backend/llamamodel_impl.h    Normal file
@ -0,0 +1,46 @@
#ifndef LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#error This file is NOT meant to be included outside of llamamodel.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#endif
#ifndef LLAMAMODEL_H
#define LLAMAMODEL_H

#include <string>
#include <functional>
#include <vector>
#include "llmodel.h"

struct LLamaPrivate;
class LLamaModel : public LLModel {
public:
    LLamaModel();
    ~LLamaModel();

    bool supportsEmbedding() const override { return false; }
    bool supportsCompletion() const override { return true; }
    bool loadModel(const std::string &modelPath) override;
    bool isModelLoaded() const override;
    size_t requiredMem(const std::string &modelPath) override;
    size_t stateSize() const override;
    size_t saveState(uint8_t *dest) const override;
    size_t restoreState(const uint8_t *src) override;
    void setThreadCount(int32_t n_threads) override;
    int32_t threadCount() const override;
    std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired) override;
    bool initializeGPUDevice(size_t memoryRequired, const std::string& device) override;
    bool initializeGPUDevice(const GPUDevice &device) override;
    bool initializeGPUDevice(int device) override;
    bool hasGPUDevice() override;

private:
    LLamaPrivate *d_ptr;

protected:
    std::vector<Token> tokenize(PromptContext &, const std::string&) const override;
    std::string tokenToString(Token) const override;
    Token sampleToken(PromptContext& ctx) const override;
    bool evalTokens(PromptContext& ctx, const std::vector<int32_t> &tokens) const override;
    int32_t contextLength() const override;
    const std::vector<Token>& endTokens() const override;
};

#endif // LLAMAMODEL_H
181  gpt4all-backend/llmodel.cpp    Normal file
@ -0,0 +1,181 @@
#include "llmodel.h"
#include "dlhandle.h"
#include "sysinfo.h"

#include <iostream>
#include <string>
#include <vector>
#include <fstream>
#include <filesystem>
#include <cassert>
#include <cstdlib>
#include <sstream>
#ifdef _MSC_VER
#include <intrin.h>
#endif

std::string s_implementations_search_path = ".";

static bool has_at_least_minimal_hardware() {
#if defined(__x86_64__) || defined(_M_X64)
    #ifndef _MSC_VER
    return __builtin_cpu_supports("avx");
    #else
    int cpuInfo[4];
    __cpuid(cpuInfo, 1);
    return cpuInfo[2] & (1 << 28);
    #endif
#else
    return true; // Don't know how to handle non-x86_64
#endif
}

static bool requires_avxonly() {
#if defined(__x86_64__) || defined(_M_X64)
    #ifndef _MSC_VER
    return !__builtin_cpu_supports("avx2");
    #else
    int cpuInfo[4];
    __cpuidex(cpuInfo, 7, 0);
    return !(cpuInfo[1] & (1 << 5));
    #endif
#else
    return false; // Don't know how to handle non-x86_64
#endif
}

LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
    : m_dlhandle(new Dlhandle(std::move(dlhandle_))) {
    auto get_model_type = m_dlhandle->get<const char *()>("get_model_type");
    assert(get_model_type);
    m_modelType = get_model_type();
    auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
    assert(get_build_variant);
    m_buildVariant = get_build_variant();
    m_magicMatch = m_dlhandle->get<bool(std::ifstream&)>("magic_match");
    assert(m_magicMatch);
    m_construct = m_dlhandle->get<LLModel *()>("construct");
    assert(m_construct);
}

LLModel::Implementation::Implementation(Implementation &&o)
    : m_magicMatch(o.m_magicMatch)
    , m_construct(o.m_construct)
    , m_modelType(o.m_modelType)
    , m_buildVariant(o.m_buildVariant)
    , m_dlhandle(o.m_dlhandle) {
    o.m_dlhandle = nullptr;
}

LLModel::Implementation::~Implementation() {
    if (m_dlhandle) delete m_dlhandle;
}

bool LLModel::Implementation::isImplementation(const Dlhandle &dl) {
    return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
}

const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList() {
    // NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the
    // individual models without the cleanup of the static list interfering
    static auto* libs = new std::vector<Implementation>([] () {
        std::vector<Implementation> fres;

        auto search_in_directory = [&](const std::string& paths) {
            std::stringstream ss(paths);
            std::string path;
            // Split the paths string by the delimiter and process each path.
            while (std::getline(ss, path, ';')) {
                std::filesystem::path fs_path(path);
                // Iterate over all libraries
                for (const auto& f : std::filesystem::directory_iterator(fs_path)) {
                    const std::filesystem::path& p = f.path();
                    if (p.extension() != LIB_FILE_EXT) continue;
                    // Add to list if model implementation
                    try {
                        Dlhandle dl(p.string());
                        if (!Implementation::isImplementation(dl)) {
                            continue;
                        }
                        fres.emplace_back(Implementation(std::move(dl)));
                    } catch (...) {}
                }
            }
        };

        search_in_directory(s_implementations_search_path);

        return fres;
    }());
    // Return static result
    return *libs;
}

const LLModel::Implementation* LLModel::Implementation::implementation(std::ifstream& f, const std::string& buildVariant) {
    for (const auto& i : implementationList()) {
        f.seekg(0);
        if (!i.m_magicMatch(f)) continue;
        if (buildVariant != i.m_buildVariant) continue;
        return &i;
    }
    return nullptr;
}

LLModel *LLModel::Implementation::construct(const std::string &modelPath, std::string buildVariant) {

    if (!has_at_least_minimal_hardware())
        return nullptr;

    // Read magic
    std::ifstream f(modelPath, std::ios::binary);
    if (!f) return nullptr;
    // Get correct implementation
    const Implementation* impl = nullptr;

#if defined(__APPLE__) && defined(__arm64__) // FIXME: See if metal works for intel macs
    if (buildVariant == "auto") {
        size_t total_mem = getSystemTotalRAMInBytes();
        impl = implementation(f, "metal");
        if(impl) {
            LLModel* metalimpl = impl->m_construct();
            metalimpl->m_implementation = impl;
            size_t req_mem = metalimpl->requiredMem(modelPath);
            float req_to_total = (float) req_mem / (float) total_mem;
            // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
            if (req_to_total >= 0.53) {
                delete metalimpl;
                impl = nullptr;
            } else {
                return metalimpl;
            }
        }
    }
#endif

    if (!impl) {
        //TODO: Auto-detect CUDA/OpenCL
        if (buildVariant == "auto") {
            if (requires_avxonly()) {
                buildVariant = "avxonly";
            } else {
                buildVariant = "default";
            }
        }
        impl = implementation(f, buildVariant);
        if (!impl) return nullptr;
    }
    f.close();

    // Construct and return llmodel implementation
    auto fres = impl->m_construct();
    fres->m_implementation = impl;
    return fres;
}

void LLModel::Implementation::setImplementationsSearchPath(const std::string& path) {
    s_implementations_search_path = path;
}

const std::string& LLModel::Implementation::implementationsSearchPath() {
    return s_implementations_search_path;
}
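Putting llmodel.cpp together: a host application first points the loader at the directory holding the per-variant backend libraries, then asks construct() to pick one by magic and build variant. The sketch below assumes a hypothetical ./implementations directory and model path; error handling is reduced to null checks, and wrapping the raw pointer in std::unique_ptr is a choice made here, not something the API requires.

#include "llmodel.h"

#include <iostream>
#include <memory>

int main() {
    // Hypothetical locations; adjust to wherever the backend libraries and model actually live.
    LLModel::Implementation::setImplementationsSearchPath("./implementations");

    std::unique_ptr<LLModel> model(
        LLModel::Implementation::construct("./models/ggml-model-q4_0.bin", "auto"));
    if (!model) {
        std::cerr << "no matching implementation or unsupported CPU\n";
        return 1;
    }

    if (!model->loadModel("./models/ggml-model-q4_0.bin") || !model->isModelLoaded()) {
        std::cerr << "failed to load model weights\n";
        return 1;
    }
    std::cout << "loaded a " << model->implementation().modelType() << " model ("
              << model->implementation().buildVariant() << " build)\n";
    return 0;
}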
124  gpt4all-backend/llmodel.h    Normal file
@ -0,0 +1,124 @@
#ifndef LLMODEL_H
#define LLMODEL_H

#include <string>
#include <functional>
#include <vector>
#include <string_view>
#include <fstream>
#include <cstdint>
#include <limits>

#define LLMODEL_MAX_PROMPT_BATCH 128

class Dlhandle;
class LLModel {
public:
    using Token = int32_t;
    class Implementation {
    public:
        Implementation(Dlhandle&&);
        Implementation(const Implementation&) = delete;
        Implementation(Implementation&&);
        ~Implementation();

        std::string_view modelType() const { return m_modelType; }
        std::string_view buildVariant() const { return m_buildVariant; }

        static bool isImplementation(const Dlhandle&);
        static const std::vector<Implementation>& implementationList();
        static const Implementation *implementation(std::ifstream& f, const std::string& buildVariant);
        static LLModel *construct(const std::string &modelPath, std::string buildVariant = "auto");
        static void setImplementationsSearchPath(const std::string& path);
        static const std::string& implementationsSearchPath();

    private:
        bool (*m_magicMatch)(std::ifstream& f);
        LLModel *(*m_construct)();

    private:
        std::string_view m_modelType;
        std::string_view m_buildVariant;
        Dlhandle *m_dlhandle;
    };

    struct PromptContext {
        std::vector<float> logits;      // logits of current context
        std::vector<int32_t> tokens;    // current tokens in the context window
        int32_t n_past = 0;             // number of tokens in past conversation
        int32_t n_ctx = 0;              // number of tokens possible in context window
        int32_t n_predict = 200;
        int32_t top_k = 40;
        float top_p = 0.9f;
        float temp = 0.9f;
        int32_t n_batch = 9;
        float repeat_penalty = 1.10f;
        int32_t repeat_last_n = 64;     // last n tokens to penalize
        float contextErase = 0.75f;     // percent of context to erase if we exceed the context
                                        // window
    };

    struct GPUDevice {
        int index = 0;
        int type = 0;
        size_t heapSize = 0;
        std::string name;
        std::string vendor;
    };

    explicit LLModel() {}
    virtual ~LLModel() {}

    virtual bool supportsEmbedding() const = 0;
    virtual bool supportsCompletion() const = 0;
    virtual bool loadModel(const std::string &modelPath) = 0;
    virtual bool isModelLoaded() const = 0;
    virtual size_t requiredMem(const std::string &modelPath) = 0;
    virtual size_t stateSize() const { return 0; }
    virtual size_t saveState(uint8_t */*dest*/) const { return 0; }
    virtual size_t restoreState(const uint8_t */*src*/) { return 0; }

    // This method requires the model to return true from supportsCompletion otherwise it will throw
    // an error
    virtual void prompt(const std::string &prompt,
                        std::function<bool(int32_t)> promptCallback,
                        std::function<bool(int32_t, const std::string&)> responseCallback,
                        std::function<bool(bool)> recalculateCallback,
                        PromptContext &ctx);

    virtual std::vector<float> embedding(const std::string &text);

    virtual void setThreadCount(int32_t /*n_threads*/) {}
    virtual int32_t threadCount() const { return 1; }

    const Implementation& implementation() const {
        return *m_implementation;
    }

    virtual std::vector<GPUDevice> availableGPUDevices(size_t /*memoryRequired*/) { return std::vector<GPUDevice>(); }
    virtual bool initializeGPUDevice(size_t /*memoryRequired*/, const std::string& /*device*/) { return false; }
    virtual bool initializeGPUDevice(const GPUDevice &/*device*/) { return false; }
    virtual bool initializeGPUDevice(int /*device*/) { return false; }
    virtual bool hasGPUDevice() { return false; }

protected:
    // These are pure virtual because subclasses need to implement as the default implementation of
    // 'prompt' above calls these functions
    virtual std::vector<Token> tokenize(PromptContext &, const std::string&) const = 0;
    virtual std::string tokenToString(Token) const = 0;
    virtual Token sampleToken(PromptContext &ctx) const = 0;
    virtual bool evalTokens(PromptContext &/*ctx*/, const std::vector<int32_t>& /*tokens*/) const = 0;
    virtual int32_t contextLength() const = 0;
    virtual const std::vector<Token>& endTokens() const = 0;

    // This is a helper function called from the default implementation of 'prompt' but it can be
    // shared by all base classes so it isn't virtual
    void recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate);

    const Implementation *m_implementation = nullptr;

private:
    friend class LLMImplementation;
};

#endif // LLMODEL_H
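Unlike the main-branch interface removed earlier, the prompt() declared in this python-v1.0.11 header takes three separate std::function callbacks (prompt token, response token plus decoded string, and context recalculation) and mutates the PromptContext in place so a conversation can continue across calls. A hedged sketch of one generation turn follows; `model` is assumed to be already loaded, and the helper name one_turn is illustrative.

#include "llmodel.h"   // the python-v1.0.11 header reproduced above

#include <iostream>
#include <string>

// Hypothetical helper: run one turn and keep `ctx` so a follow-up call continues the chat.
void one_turn(LLModel &model, LLModel::PromptContext &ctx, const std::string &userText) {
    auto onPrompt   = [](int32_t /*tokenId*/) { return true; };
    auto onResponse = [](int32_t /*tokenId*/, const std::string &piece) {
        std::cout << piece << std::flush;   // stream tokens as they are produced
        return true;
    };
    auto onRecalc   = [](bool /*isRecalculating*/) { return true; };

    model.prompt(userText, onPrompt, onResponse, onRecalc, ctx);
    std::cout << '\n';   // ctx.n_past and ctx.tokens now reflect the whole exchange
}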
265  gpt4all-backend/llmodel_c.cpp    Normal file
@ -0,0 +1,265 @@
|
|||||||
|
#include "llmodel_c.h"
|
||||||
|
#include "llmodel.h"
|
||||||
|
|
||||||
|
#include <cstring>
|
||||||
|
#include <cerrno>
|
||||||
|
#include <utility>
|
||||||
|
|
||||||
|
struct LLModelWrapper {
|
||||||
|
LLModel *llModel = nullptr;
|
||||||
|
LLModel::PromptContext promptContext;
|
||||||
|
~LLModelWrapper() { delete llModel; }
|
||||||
|
};
|
||||||
|
|
||||||
|
|
||||||
|
thread_local static std::string last_error_message;
|
||||||
|
|
||||||
|
|
||||||
|
llmodel_model llmodel_model_create(const char *model_path) {
|
||||||
|
auto fres = llmodel_model_create2(model_path, "auto", nullptr);
|
||||||
|
if (!fres) {
|
||||||
|
fprintf(stderr, "Invalid model file\n");
|
||||||
|
}
|
||||||
|
return fres;
|
||||||
|
}
|
||||||
|
|
||||||
|
llmodel_model llmodel_model_create2(const char *model_path, const char *build_variant, llmodel_error *error) {
|
||||||
|
auto wrapper = new LLModelWrapper;
|
||||||
|
int error_code = 0;
|
||||||
|
|
||||||
|
try {
|
||||||
|
wrapper->llModel = LLModel::Implementation::construct(model_path, build_variant);
|
||||||
|
} catch (const std::exception& e) {
|
||||||
|
error_code = EINVAL;
|
||||||
|
last_error_message = e.what();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!wrapper->llModel) {
|
||||||
|
delete std::exchange(wrapper, nullptr);
|
||||||
|
// Get errno and error message if none
|
||||||
|
if (error_code == 0) {
|
||||||
|
if (errno != 0) {
|
||||||
|
error_code = errno;
|
||||||
|
last_error_message = std::strerror(error_code);
|
||||||
|
} else {
|
||||||
|
error_code = ENOTSUP;
|
||||||
|
last_error_message = "Model format not supported (no matching implementation found)";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Set error argument
|
||||||
|
if (error) {
|
||||||
|
error->message = last_error_message.c_str();
|
||||||
|
error->code = error_code;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return reinterpret_cast<llmodel_model*>(wrapper);
|
||||||
|
}
|
||||||
|
|
||||||
|
void llmodel_model_destroy(llmodel_model model) {
|
||||||
|
delete reinterpret_cast<LLModelWrapper*>(model);
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t llmodel_required_mem(llmodel_model model, const char *model_path)
|
||||||
|
{
|
||||||
|
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
|
||||||
|
return wrapper->llModel->requiredMem(model_path);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool llmodel_loadModel(llmodel_model model, const char *model_path)
|
||||||
|
{
|
||||||
|
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
|
||||||
|
return wrapper->llModel->loadModel(model_path);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool llmodel_isModelLoaded(llmodel_model model)
|
||||||
|
{
|
||||||
|
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
|
||||||
|
return wrapper->llModel->isModelLoaded();
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t llmodel_get_state_size(llmodel_model model)
|
||||||
|
{
|
||||||
|
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
|
||||||
|
return wrapper->llModel->stateSize();
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest)
|
||||||
|
{
|
||||||
|
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
|
||||||
|
return wrapper->llModel->saveState(dest);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src)
|
||||||
|
{
|
||||||
|
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
|
||||||
|
return wrapper->llModel->restoreState(src);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wrapper functions for the C callbacks
|
||||||
|
bool prompt_wrapper(int32_t token_id, void *user_data) {
|
||||||
|
llmodel_prompt_callback callback = reinterpret_cast<llmodel_prompt_callback>(user_data);
|
||||||
|
return callback(token_id);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool response_wrapper(int32_t token_id, const std::string &response, void *user_data) {
|
||||||
|
llmodel_response_callback callback = reinterpret_cast<llmodel_response_callback>(user_data);
|
||||||
|
return callback(token_id, response.c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
bool recalculate_wrapper(bool is_recalculating, void *user_data) {
|
||||||
|
llmodel_recalculate_callback callback = reinterpret_cast<llmodel_recalculate_callback>(user_data);
|
||||||
|
return callback(is_recalculating);
|
||||||
|
}
|
||||||
|
|
||||||
|
void llmodel_prompt(llmodel_model model, const char *prompt,
|
||||||
|
llmodel_prompt_callback prompt_callback,
|
||||||
|
llmodel_response_callback response_callback,
|
||||||
|
llmodel_recalculate_callback recalculate_callback,
|
||||||
|
llmodel_prompt_context *ctx)
|
||||||
|
{
|
||||||
|
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
|
||||||
|
|
||||||
|
// Create std::function wrappers that call the C function pointers
|
||||||
|
std::function<bool(int32_t)> prompt_func =
|
||||||
|
std::bind(&prompt_wrapper, std::placeholders::_1, reinterpret_cast<void*>(prompt_callback));
|
||||||
|
std::function<bool(int32_t, const std::string&)> response_func =
|
||||||
|
std::bind(&response_wrapper, std::placeholders::_1, std::placeholders::_2, reinterpret_cast<void*>(response_callback));
|
||||||
|
std::function<bool(bool)> recalc_func =
|
||||||
|
std::bind(&recalculate_wrapper, std::placeholders::_1, reinterpret_cast<void*>(recalculate_callback));
|
||||||
|
|
||||||
|
if (size_t(ctx->n_past) < wrapper->promptContext.tokens.size())
|
||||||
|
wrapper->promptContext.tokens.resize(ctx->n_past);
|
||||||
|
|
||||||
|
// Copy the C prompt context
|
||||||
|
wrapper->promptContext.n_past = ctx->n_past;
|
||||||
|
wrapper->promptContext.n_ctx = ctx->n_ctx;
|
||||||
|
wrapper->promptContext.n_predict = ctx->n_predict;
|
||||||
|
wrapper->promptContext.top_k = ctx->top_k;
|
||||||
|
wrapper->promptContext.top_p = ctx->top_p;
|
||||||
|
wrapper->promptContext.temp = ctx->temp;
|
||||||
|
wrapper->promptContext.n_batch = ctx->n_batch;
|
||||||
|
wrapper->promptContext.repeat_penalty = ctx->repeat_penalty;
|
||||||
|
wrapper->promptContext.repeat_last_n = ctx->repeat_last_n;
|
||||||
|
wrapper->promptContext.contextErase = ctx->context_erase;
|
||||||
|
|
||||||
|
// Call the C++ prompt method
|
||||||
|
wrapper->llModel->prompt(prompt, prompt_func, response_func, recalc_func, wrapper->promptContext);
|
||||||
|
|
||||||
|
// Update the C context by giving access to the wrappers raw pointers to std::vector data
|
||||||
|
// which involves no copies
|
||||||
|
ctx->logits = wrapper->promptContext.logits.data();
|
||||||
|
ctx->logits_size = wrapper->promptContext.logits.size();
|
||||||
|
ctx->tokens = wrapper->promptContext.tokens.data();
|
||||||
|
ctx->tokens_size = wrapper->promptContext.tokens.size();
|
||||||
|
|
||||||
|
// Update the rest of the C prompt context
|
||||||
|
ctx->n_past = wrapper->promptContext.n_past;
|
||||||
|
ctx->n_ctx = wrapper->promptContext.n_ctx;
|
||||||
|
ctx->n_predict = wrapper->promptContext.n_predict;
|
||||||
|
ctx->top_k = wrapper->promptContext.top_k;
|
||||||
|
ctx->top_p = wrapper->promptContext.top_p;
|
||||||
|
ctx->temp = wrapper->promptContext.temp;
|
||||||
|
ctx->n_batch = wrapper->promptContext.n_batch;
|
||||||
|
ctx->repeat_penalty = wrapper->promptContext.repeat_penalty;
|
||||||
|
ctx->repeat_last_n = wrapper->promptContext.repeat_last_n;
|
||||||
|
ctx->context_erase = wrapper->promptContext.contextErase;
|
||||||
|
}
|
||||||
|
|
||||||
|
float *llmodel_embedding(llmodel_model model, const char *text, size_t *embedding_size)
|
||||||
|
{
|
||||||
|
if (model == nullptr || text == nullptr || !strlen(text)) {
|
||||||
|
*embedding_size = 0;
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
|
||||||
|
std::vector<float> embeddingVector = wrapper->llModel->embedding(text);
|
||||||
|
float *embedding = (float *)malloc(embeddingVector.size() * sizeof(float));
|
||||||
|
if (embedding == nullptr) {
|
||||||
|
*embedding_size = 0;
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
std::copy(embeddingVector.begin(), embeddingVector.end(), embedding);
|
||||||
|
*embedding_size = embeddingVector.size();
|
||||||
|
return embedding;
|
||||||
|
}
|
||||||
|
|
||||||
|
void llmodel_free_embedding(float *ptr)
|
||||||
|
{
|
||||||
|
free(ptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
void llmodel_setThreadCount(llmodel_model model, int32_t n_threads)
|
||||||
|
{
|
||||||
|
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
|
||||||
|
wrapper->llModel->setThreadCount(n_threads);
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t llmodel_threadCount(llmodel_model model)
|
||||||
|
{
|
||||||
|
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
|
||||||
|
return wrapper->llModel->threadCount();
|
||||||
|
}
|
||||||
|
|
||||||
|
void llmodel_set_implementation_search_path(const char *path)
|
||||||
|
{
|
||||||
|
LLModel::Implementation::setImplementationsSearchPath(path);
|
||||||
|
}
|
||||||
|
|
||||||
|
const char *llmodel_get_implementation_search_path()
|
||||||
|
{
|
||||||
|
return LLModel::Implementation::implementationsSearchPath().c_str();
|
||||||
|
}
|
||||||
|
|
||||||
|
struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices)
|
||||||
|
{
|
||||||
|
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
|
||||||
|
std::vector<LLModel::GPUDevice> devices = wrapper->llModel->availableGPUDevices(memoryRequired);
|
||||||
|
|
||||||
|
// Set the num_devices
|
||||||
|
*num_devices = devices.size();
|
||||||
|
|
||||||
|
if (*num_devices == 0) return nullptr; // Return nullptr if no devices are found
|
||||||
|
|
||||||
|
// Allocate memory for the output array
|
||||||
|
struct llmodel_gpu_device* output = (struct llmodel_gpu_device*) malloc(*num_devices * sizeof(struct llmodel_gpu_device));
|
||||||
|
|
||||||
|
for (int i = 0; i < *num_devices; i++) {
|
||||||
|
output[i].index = devices[i].index;
|
||||||
|
output[i].type = devices[i].type;
|
||||||
|
output[i].heapSize = devices[i].heapSize;
|
||||||
|
output[i].name = strdup(devices[i].name.c_str()); // Convert std::string to char* and allocate memory
|
||||||
|
output[i].vendor = strdup(devices[i].vendor.c_str()); // Convert std::string to char* and allocate memory
|
||||||
|
}
|
||||||
|
|
||||||
|
return output;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device)
|
||||||
|
{
|
||||||
|
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
|
||||||
|
return wrapper->llModel->initializeGPUDevice(memoryRequired, std::string(device));
|
||||||
|
}
|
||||||
|
|
||||||
|
bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device)
|
||||||
|
{
|
||||||
|
LLModel::GPUDevice d;
|
||||||
|
d.index = device->index;
|
||||||
|
d.type = device->type;
|
||||||
|
d.heapSize = device->heapSize;
|
||||||
|
d.name = device->name;
|
||||||
|
d.vendor = device->vendor;
|
||||||
|
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
|
||||||
|
return wrapper->llModel->initializeGPUDevice(d);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)
|
||||||
|
{
|
||||||
|
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
|
||||||
|
return wrapper->llModel->initializeGPUDevice(device);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool llmodel_has_gpu_device(llmodel_model model)
|
||||||
|
{
|
||||||
|
LLModelWrapper *wrapper = reinterpret_cast<LLModelWrapper*>(model);
|
||||||
|
return wrapper->llModel->hasGPUDevice();
|
||||||
|
}
|
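The C wrapper above exists so the language bindings can talk to the backend without any C++ ABI concerns: every call goes through an opaque llmodel_model handle and plain function pointers. A hedged sketch of the calling sequence, written from C++, is shown below; the model path is a placeholder, and the prompt-context values simply mirror the C++ PromptContext defaults rather than anything the C header mandates.

#include "llmodel_c.h"   // the python-v1.0.11 C header whose diff follows below

#include <cstdio>

static bool on_prompt(int32_t /*token_id*/) { return true; }
static bool on_response(int32_t /*token_id*/, const char *piece) { std::fputs(piece, stdout); return true; }
static bool on_recalculate(bool /*is_recalculating*/) { return true; }

int main() {
    llmodel_error err = {nullptr, 0};
    llmodel_model model = llmodel_model_create2("./models/ggml-model-q4_0.bin", "auto", &err);
    if (!model) {
        std::fprintf(stderr, "create failed: %s (errno %d)\n",
                     err.message ? err.message : "unknown", err.code);
        return 1;
    }
    if (!llmodel_loadModel(model, "./models/ggml-model-q4_0.bin")) {
        llmodel_model_destroy(model);
        return 1;
    }

    llmodel_prompt_context ctx = {};          // zero-initialize pointers and counters
    ctx.n_ctx = 2048;                         // illustrative context size
    ctx.n_predict = 200; ctx.top_k = 40; ctx.top_p = 0.9f; ctx.temp = 0.9f;
    ctx.n_batch = 9; ctx.repeat_penalty = 1.1f; ctx.repeat_last_n = 64; ctx.context_erase = 0.75f;

    llmodel_prompt(model, "Why is the sky blue?", on_prompt, on_response, on_recalculate, &ctx);
    llmodel_model_destroy(model);
    return 0;
}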
@ -1,9 +1,9 @@
|
|||||||
#ifndef LLMODEL_C_H
|
#ifndef LLMODEL_C_H
|
||||||
#define LLMODEL_C_H
|
#define LLMODEL_C_H
|
||||||
|
|
||||||
#include <stdbool.h>
|
|
||||||
#include <stddef.h>
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
#include <stddef.h>
|
||||||
|
#include <stdbool.h>
|
||||||
|
|
||||||
#ifdef __GNUC__
|
#ifdef __GNUC__
|
||||||
#define DEPRECATED __attribute__ ((deprecated))
|
#define DEPRECATED __attribute__ ((deprecated))
|
||||||
@ -24,9 +24,15 @@ extern "C" {
|
|||||||
typedef void *llmodel_model;
|
typedef void *llmodel_model;
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A token.
|
* Structure containing any errors that may eventually occur
|
||||||
*/
|
*/
|
||||||
typedef int32_t token_t;
|
struct llmodel_error {
|
||||||
|
const char *message; // Human readable error description; Thread-local; guaranteed to survive until next llmodel C API call
|
||||||
|
int code; // errno; 0 if none
|
||||||
|
};
|
||||||
|
#ifndef __cplusplus
|
||||||
|
typedef struct llmodel_error llmodel_error;
|
||||||
|
#endif
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* llmodel_prompt_context structure for holding the prompt context.
|
* llmodel_prompt_context structure for holding the prompt context.
|
||||||
@ -35,22 +41,26 @@ typedef int32_t token_t;
|
|||||||
* behavior.
|
* behavior.
|
||||||
*/
|
*/
|
||||||
struct llmodel_prompt_context {
|
struct llmodel_prompt_context {
|
||||||
|
float *logits; // logits of current context
|
||||||
|
size_t logits_size; // the size of the raw logits vector
|
||||||
|
int32_t *tokens; // current tokens in the context window
|
||||||
|
size_t tokens_size; // the size of the raw tokens vector
|
||||||
|
int32_t n_past; // number of tokens in past conversation
|
||||||
|
int32_t n_ctx; // number of tokens possible in context window
|
||||||
int32_t n_predict; // number of tokens to predict
|
int32_t n_predict; // number of tokens to predict
|
||||||
int32_t top_k; // top k logits to sample from
|
int32_t top_k; // top k logits to sample from
|
||||||
float top_p; // nucleus sampling probability threshold
|
float top_p; // nucleus sampling probability threshold
|
||||||
float min_p; // Min P sampling
|
float temp; // temperature to adjust model's output distribution
|
||||||
float temp; // temperature to adjust model's output distribution
|
|
||||||
int32_t n_batch; // number of predictions to generate in parallel
|
int32_t n_batch; // number of predictions to generate in parallel
|
||||||
float repeat_penalty; // penalty factor for repeated tokens
|
float repeat_penalty; // penalty factor for repeated tokens
|
||||||
int32_t repeat_last_n; // last n tokens to penalize
|
int32_t repeat_last_n; // last n tokens to penalize
|
||||||
float context_erase; // percent of context to erase if we exceed the context window
|
float context_erase; // percent of context to erase if we exceed the context window
|
||||||
};
|
};
|
||||||
|
|
||||||
struct llmodel_gpu_device {
|
struct llmodel_gpu_device {
|
||||||
const char * backend;
|
int index = 0;
|
||||||
int index;
|
int type = 0; // same as VkPhysicalDeviceType
|
||||||
int type; // same as VkPhysicalDeviceType
|
size_t heapSize = 0;
|
||||||
size_t heapSize;
|
|
||||||
const char * name;
|
const char * name;
|
||||||
const char * vendor;
|
const char * vendor;
|
||||||
};
|
};
|
||||||
@ -62,12 +72,10 @@ typedef struct llmodel_gpu_device llmodel_gpu_device;
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Callback type for prompt processing.
|
* Callback type for prompt processing.
|
||||||
* @param token_ids An array of token ids of the prompt.
|
* @param token_id The token id of the prompt.
|
||||||
* @param n_token_ids The number of tokens in the array.
|
|
||||||
* @param cached Whether the tokens were already in cache.
|
|
||||||
* @return a bool indicating whether the model should keep processing.
|
* @return a bool indicating whether the model should keep processing.
|
||||||
*/
|
*/
|
||||||
typedef bool (*llmodel_prompt_callback)(const token_t *token_ids, size_t n_token_ids, bool cached);
|
typedef bool (*llmodel_prompt_callback)(int32_t token_id);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Callback type for response.
|
* Callback type for response.
|
||||||
@ -75,18 +83,14 @@ typedef bool (*llmodel_prompt_callback)(const token_t *token_ids, size_t n_token
|
|||||||
* @param response The response string. NOTE: a token_id of -1 indicates the string is an error string.
|
* @param response The response string. NOTE: a token_id of -1 indicates the string is an error string.
|
||||||
* @return a bool indicating whether the model should keep generating.
|
* @return a bool indicating whether the model should keep generating.
|
||||||
*/
|
*/
|
||||||
typedef bool (*llmodel_response_callback)(token_t token_id, const char *response);
|
typedef bool (*llmodel_response_callback)(int32_t token_id, const char *response);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Embedding cancellation callback for use with llmodel_embed.
|
* Callback type for recalculation of context.
|
||||||
* @param batch_sizes The number of tokens in each batch that will be embedded.
|
* @param whether the model is recalculating the context.
|
||||||
* @param n_batch The number of batches that will be embedded.
|
* @return a bool indicating whether the model should keep generating.
|
||||||
* @param backend The backend that will be used for embedding. One of "cpu", "kompute", "cuda", or "metal".
|
|
||||||
* @return True to cancel llmodel_embed, false to continue.
|
|
||||||
*/
|
*/
|
||||||
typedef bool (*llmodel_emb_cancel_callback)(unsigned *batch_sizes, unsigned n_batch, const char *backend);
|
typedef bool (*llmodel_recalculate_callback)(bool is_recalculating);
|
||||||
|
|
||||||
typedef void (*llmodel_special_token_callback)(const char *name, const char *token);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Create a llmodel instance.
|
* Create a llmodel instance.
|
||||||
@ -100,11 +104,11 @@ DEPRECATED llmodel_model llmodel_model_create(const char *model_path);
|
|||||||
* Create a llmodel instance.
|
* Create a llmodel instance.
|
||||||
* Recognises correct model type from file at model_path
|
* Recognises correct model type from file at model_path
|
||||||
* @param model_path A string representing the path to the model file; will only be used to detect model type.
|
* @param model_path A string representing the path to the model file; will only be used to detect model type.
|
||||||
* @param backend A string representing the implementation to use. One of 'auto', 'cpu', 'metal', 'kompute', or 'cuda'.
|
* @param build_variant A string representing the implementation to use (auto, default, avxonly, ...),
|
||||||
* @param error A pointer to a string; will only be set on error.
|
* @param error A pointer to a llmodel_error; will only be set on error.
|
||||||
* @return A pointer to the llmodel_model instance; NULL on error.
|
* @return A pointer to the llmodel_model instance; NULL on error.
|
||||||
*/
|
*/
|
||||||
llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error);
|
llmodel_model llmodel_model_create2(const char *model_path, const char *build_variant, llmodel_error *error);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Destroy a llmodel instance.
|
* Destroy a llmodel instance.
|
||||||
@ -117,21 +121,17 @@ void llmodel_model_destroy(llmodel_model model);
|
|||||||
* Estimate RAM requirement for a model file
|
* Estimate RAM requirement for a model file
|
||||||
* @param model A pointer to the llmodel_model instance.
|
* @param model A pointer to the llmodel_model instance.
|
||||||
* @param model_path A string representing the path to the model file.
|
* @param model_path A string representing the path to the model file.
|
||||||
* @param n_ctx Maximum size of context window
|
|
||||||
* @param ngl Number of GPU layers to use (Vulkan)
|
|
||||||
* @return size greater than 0 if the model was parsed successfully, 0 if file could not be parsed.
|
* @return size greater than 0 if the model was parsed successfully, 0 if file could not be parsed.
|
||||||
*/
|
*/
|
||||||
size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx, int ngl);
|
size_t llmodel_required_mem(llmodel_model model, const char *model_path);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Load a model from a file.
|
* Load a model from a file.
|
||||||
* @param model A pointer to the llmodel_model instance.
|
* @param model A pointer to the llmodel_model instance.
|
||||||
* @param model_path A string representing the path to the model file.
|
* @param model_path A string representing the path to the model file.
|
||||||
* @param n_ctx Maximum size of context window
|
|
||||||
* @param ngl Number of GPU layers to use (Vulkan)
|
|
||||||
* @return true if the model was loaded successfully, false otherwise.
|
* @return true if the model was loaded successfully, false otherwise.
|
||||||
*/
|
*/
|
||||||
bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, int ngl);
|
bool llmodel_loadModel(llmodel_model model, const char *model_path);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Check if a model is loaded.
|
* Check if a model is loaded.
|
||||||
@ -146,41 +146,25 @@ bool llmodel_isModelLoaded(llmodel_model model);
|
|||||||
* @param model A pointer to the llmodel_model instance.
|
* @param model A pointer to the llmodel_model instance.
|
||||||
* @return the size in bytes of the internal state of the model
|
* @return the size in bytes of the internal state of the model
|
||||||
*/
|
*/
|
||||||
uint64_t llmodel_state_get_size(llmodel_model model);
|
uint64_t llmodel_get_state_size(llmodel_model model);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Saves the internal state of the model.
|
* Saves the internal state of the model to the specified destination address.
|
||||||
* NOTE: This state data is specific to the type of model you have created.
|
* NOTE: This state data is specific to the type of model you have created.
|
||||||
* @param model A pointer to the llmodel_model instance.
|
* @param model A pointer to the llmodel_model instance.
|
||||||
* @param state Where to store the state. This must be a buffer of at least llmodel_state_get_size() bytes.
|
* @param dest A pointer to the destination.
|
||||||
* @param state_size The size of the destination for the state.
|
* @return the number of bytes copied
|
||||||
* @param input_tokens_out Where to store the address of the token cache state. This is dynamically allocated and must
|
|
||||||
* be freed with llmodel_state_free_input_tokens.
|
|
||||||
* @param n_input_tokens Where to store the size of the token cache state.
|
|
||||||
* @return The number of bytes copied. On error, zero is returned, the token cache is set to NULL, and the token cache
|
|
||||||
* size is set to zero.
|
|
||||||
*/
|
*/
|
||||||
uint64_t llmodel_state_get_data(llmodel_model model, uint8_t *state_out, uint64_t state_size,
|
uint64_t llmodel_save_state_data(llmodel_model model, uint8_t *dest);
|
||||||
token_t **input_tokens_out, uint64_t *n_input_tokens);
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Frees the temporary token cache buffer created by a call to llmodel_state_get_data().
|
|
||||||
* @param input_tokens The token cache buffer.
|
|
||||||
*/
|
|
||||||
void llmodel_state_free_input_tokens(token_t *input_tokens);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Restores the internal state of the model using data from the specified address.
|
* Restores the internal state of the model using data from the specified address.
|
||||||
* NOTE: This state data is specific to the type of model you have created.
|
* NOTE: This state data is specific to the type of model you have created.
|
||||||
* @param model A pointer to the llmodel_model instance.
|
* @param model A pointer to the llmodel_model instance.
|
||||||
* @param state A pointer to the state data.
|
* @param src A pointer to the src.
|
||||||
* @param state_size The size of the state data.
|
* @return the number of bytes read
|
||||||
* @param input_tokens The token cache associated with the saved state.
|
|
||||||
* @param n_input_tokens The number of tokens in input_tokens.
|
|
||||||
* @return The number of bytes read, or zero on error.
|
|
||||||
*/
|
*/
|
||||||
uint64_t llmodel_state_set_data(llmodel_model model, const uint8_t *state, uint64_t state_size,
|
uint64_t llmodel_restore_state_data(llmodel_model model, const uint8_t *src);
|
||||||
const token_t *input_tokens, uint64_t n_input_tokens);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate a response using the model.
|
* Generate a response using the model.
|
||||||
@ -188,41 +172,27 @@ uint64_t llmodel_state_set_data(llmodel_model model, const uint8_t *state, uint6
|
|||||||
* @param prompt A string representing the input prompt.
|
* @param prompt A string representing the input prompt.
|
||||||
* @param prompt_callback A callback function for handling the processing of prompt.
|
* @param prompt_callback A callback function for handling the processing of prompt.
|
||||||
* @param response_callback A callback function for handling the generated response.
|
* @param response_callback A callback function for handling the generated response.
|
||||||
|
* @param recalculate_callback A callback function for handling recalculation requests.
|
||||||
* @param ctx A pointer to the llmodel_prompt_context structure.
|
* @param ctx A pointer to the llmodel_prompt_context structure.
|
||||||
* @param error A pointer to a string; will only be set on error.
|
|
||||||
*/
|
*/
|
||||||
bool llmodel_prompt(llmodel_model model,
|
void llmodel_prompt(llmodel_model model, const char *prompt,
|
||||||
const char *prompt,
|
llmodel_prompt_callback prompt_callback,
|
||||||
llmodel_prompt_callback prompt_callback,
|
llmodel_response_callback response_callback,
|
||||||
llmodel_response_callback response_callback,
|
llmodel_recalculate_callback recalculate_callback,
|
||||||
llmodel_prompt_context *ctx,
|
llmodel_prompt_context *ctx);
|
||||||
const char **error);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Generate an embedding using the model.
|
* Generate an embedding using the model.
|
||||||
* NOTE: If given NULL pointers for the model or text, or an empty text, a NULL pointer will be
|
* NOTE: If given NULL pointers for the model or text, or an empty text, a NULL pointer will be
|
||||||
* returned. Bindings should signal an error when NULL is the return value.
|
* returned. Bindings should signal an error when NULL is the return value.
|
||||||
* @param model A pointer to the llmodel_model instance.
|
* @param model A pointer to the llmodel_model instance.
|
||||||
* @param texts A pointer to a NULL-terminated array of strings representing the texts to generate an
|
* @param text A string representing the text to generate an embedding for.
|
||||||
* embedding for.
|
|
||||||
* @param embedding_size A pointer to a size_t type that will be set by the call indicating the length
|
* @param embedding_size A pointer to a size_t type that will be set by the call indicating the length
|
||||||
* of the returned floating point array.
|
* of the returned floating point array.
|
||||||
* @param prefix The model-specific prefix representing the embedding task, without the trailing colon. NULL for no
|
|
||||||
* prefix.
|
|
||||||
* @param dimensionality The embedding dimension, for use with Matryoshka-capable models. Set to -1 to for full-size.
|
|
||||||
* @param token_count Return location for the number of prompt tokens processed, or NULL.
|
|
||||||
* @param do_mean True to average multiple embeddings if the text is longer than the model can accept, False to
|
|
||||||
* truncate.
|
|
||||||
* @param atlas Try to be fully compatible with the Atlas API. Currently, this means texts longer than 8192 tokens with
|
|
||||||
* long_text_mode="mean" will raise an error. Disabled by default.
|
|
||||||
* @param cancel_cb Cancellation callback, or NULL. See the documentation of llmodel_emb_cancel_callback.
|
|
||||||
* @param error Return location for a malloc()ed string that will be set on error, or NULL.
|
|
||||||
* @return A pointer to an array of floating point values passed to the calling method which then will
|
* @return A pointer to an array of floating point values passed to the calling method which then will
|
||||||
* be responsible for lifetime of this memory. NULL if an error occurred.
|
* be responsible for lifetime of this memory.
|
||||||
*/
|
*/
|
||||||
float *llmodel_embed(llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix,
|
float *llmodel_embedding(llmodel_model model, const char *text, size_t *embedding_size);
|
||||||
int dimensionality, size_t *token_count, bool do_mean, bool atlas,
|
|
||||||
llmodel_emb_cancel_callback cancel_cb, const char **error);
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Frees the memory allocated by the llmodel_embedding function.
|
* Frees the memory allocated by the llmodel_embedding function.
|
||||||
@ -260,10 +230,9 @@ const char *llmodel_get_implementation_search_path();
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* Get a list of available GPU devices given the memory required.
|
* Get a list of available GPU devices given the memory required.
|
||||||
* @param memoryRequired The minimum amount of VRAM, in bytes
|
|
||||||
* @return A pointer to an array of llmodel_gpu_device's whose number is given by num_devices.
|
* @return A pointer to an array of llmodel_gpu_device's whose number is given by num_devices.
|
||||||
*/
|
*/
|
||||||
struct llmodel_gpu_device* llmodel_available_gpu_devices(size_t memoryRequired, int* num_devices);
|
struct llmodel_gpu_device* llmodel_available_gpu_devices(llmodel_model model, size_t memoryRequired, int* num_devices);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Initializes a GPU device based on a specified string criterion.
|
* Initializes a GPU device based on a specified string criterion.
|
||||||
@ -299,18 +268,9 @@ bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gp
|
|||||||
bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device);
|
bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device);
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* @return The name of the llama.cpp backend currently in use. One of "cpu", "kompute", or "metal".
|
* @return True if a GPU device is successfully initialized, false otherwise.
|
||||||
*/
|
*/
|
||||||
const char *llmodel_model_backend_name(llmodel_model model);
|
bool llmodel_has_gpu_device(llmodel_model model);
|
||||||
|
|
||||||
/**
|
|
||||||
* @return The name of the GPU device currently in use, or NULL for backends other than Kompute.
|
|
||||||
*/
|
|
||||||
const char *llmodel_model_gpu_device_name(llmodel_model model);
|
|
||||||
|
|
||||||
int32_t llmodel_count_prompt_tokens(llmodel_model model, const char *prompt, const char **error);
|
|
||||||
|
|
||||||
void llmodel_model_foreach_special_token(llmodel_model model, llmodel_special_token_callback callback);
|
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
}
|
}
|
176
gpt4all-backend/llmodel_shared.cpp
Normal file
176
gpt4all-backend/llmodel_shared.cpp
Normal file
@ -0,0 +1,176 @@
|
|||||||
|
#include "llmodel.h"
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <iostream>
|
||||||
|
#include <unordered_set>
|
||||||
|
|
||||||
|
void LLModel::recalculateContext(PromptContext &promptCtx, std::function<bool(bool)> recalculate) {
|
||||||
|
size_t i = 0;
|
||||||
|
promptCtx.n_past = 0;
|
||||||
|
while (i < promptCtx.tokens.size()) {
|
||||||
|
size_t batch_end = std::min(i + promptCtx.n_batch, promptCtx.tokens.size());
|
||||||
|
std::vector<int32_t> batch(promptCtx.tokens.begin() + i, promptCtx.tokens.begin() + batch_end);
|
||||||
|
assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx);
|
||||||
|
if (!evalTokens(promptCtx, batch)) {
|
||||||
|
std::cerr << "LLModel ERROR: Failed to process prompt\n";
|
||||||
|
goto stop_generating;
|
||||||
|
}
|
||||||
|
promptCtx.n_past += batch.size();
|
||||||
|
if (!recalculate(true))
|
||||||
|
goto stop_generating;
|
||||||
|
i = batch_end;
|
||||||
|
}
|
||||||
|
assert(promptCtx.n_past == int32_t(promptCtx.tokens.size()));
|
||||||
|
|
||||||
|
stop_generating:
|
||||||
|
recalculate(false);
|
||||||
|
}
|
||||||
|
|
||||||
|
void LLModel::prompt(const std::string &prompt,
|
||||||
|
std::function<bool(int32_t)> promptCallback,
|
||||||
|
std::function<bool(int32_t, const std::string&)> responseCallback,
|
||||||
|
std::function<bool(bool)> recalculateCallback,
|
||||||
|
PromptContext &promptCtx)
|
||||||
|
{
|
||||||
|
if (!isModelLoaded()) {
|
||||||
|
std::cerr << implementation().modelType() << " ERROR: prompt won't work with an unloaded model!\n";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!supportsCompletion()) {
|
||||||
|
std::string errorMessage = "ERROR: this model does not support text completion or chat!\n";
|
||||||
|
responseCallback(-1, errorMessage);
|
||||||
|
std::cerr << implementation().modelType() << errorMessage;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// tokenize the prompt
|
||||||
|
std::vector<Token> embd_inp = tokenize(promptCtx, prompt);
|
||||||
|
|
||||||
|
// save the context size
|
||||||
|
promptCtx.n_ctx = contextLength();
|
||||||
|
|
||||||
|
if ((int) embd_inp.size() > promptCtx.n_ctx - 4) {
|
||||||
|
responseCallback(-1, "ERROR: The prompt size exceeds the context window size and cannot be processed.");
|
||||||
|
std::cerr << implementation().modelType() << " ERROR: The prompt is " << embd_inp.size() <<
|
||||||
|
" tokens and the context window is " << promptCtx.n_ctx << "!\n";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
promptCtx.n_predict = std::min(promptCtx.n_predict, promptCtx.n_ctx - (int) embd_inp.size());
|
||||||
|
promptCtx.n_past = std::min(promptCtx.n_past, promptCtx.n_ctx);
|
||||||
|
promptCtx.n_batch = std::min(promptCtx.n_batch, LLMODEL_MAX_PROMPT_BATCH);
|
||||||
|
|
||||||
|
// process the prompt in batches
|
||||||
|
size_t i = 0;
|
||||||
|
while (i < embd_inp.size()) {
|
||||||
|
size_t batch_end = std::min(i + promptCtx.n_batch, embd_inp.size());
|
||||||
|
std::vector<Token> batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);
|
||||||
|
|
||||||
|
// Check if the context has run out...
|
||||||
|
if (promptCtx.n_past + int32_t(batch.size()) > promptCtx.n_ctx) {
|
||||||
|
const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase;
|
||||||
|
// Erase the first percentage of context from the tokens...
|
||||||
|
std::cerr << implementation().modelType() << ": reached the end of the context window so resizing\n";
|
||||||
|
promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint);
|
||||||
|
promptCtx.n_past = promptCtx.tokens.size();
|
||||||
|
recalculateContext(promptCtx, recalculateCallback);
|
||||||
|
assert(promptCtx.n_past + int32_t(batch.size()) <= promptCtx.n_ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!evalTokens(promptCtx, batch)) {
|
||||||
|
std::cerr << implementation().modelType() << " ERROR: Failed to process prompt\n";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t tokens = batch_end - i;
|
||||||
|
for (size_t t = 0; t < tokens; ++t) {
|
||||||
|
if (int32_t(promptCtx.tokens.size()) == promptCtx.n_ctx)
|
||||||
|
promptCtx.tokens.erase(promptCtx.tokens.begin());
|
||||||
|
promptCtx.tokens.push_back(batch.at(t));
|
||||||
|
if (!promptCallback(batch.at(t)))
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
promptCtx.n_past += batch.size();
|
||||||
|
i = batch_end;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string cachedResponse;
|
||||||
|
std::vector<Token> cachedTokens;
|
||||||
|
std::unordered_set<std::string> reversePrompts
|
||||||
|
= { "### Instruction", "### Prompt", "### Response", "### Human", "### Assistant", "### Context" };
|
||||||
|
|
||||||
|
// predict next tokens
|
||||||
|
for (int i = 0; i < promptCtx.n_predict; i++) {
|
||||||
|
|
||||||
|
// sample next token
|
||||||
|
auto id = sampleToken(promptCtx);
|
||||||
|
|
||||||
|
// Check if the context has run out...
|
||||||
|
if (promptCtx.n_past + 1 > promptCtx.n_ctx) {
|
||||||
|
const int32_t erasePoint = promptCtx.n_ctx * promptCtx.contextErase;
|
||||||
|
// Erase the first percentage of context from the tokens...
|
||||||
|
std::cerr << implementation().modelType() << ": reached the end of the context window so resizing\n";
|
||||||
|
promptCtx.tokens.erase(promptCtx.tokens.begin(), promptCtx.tokens.begin() + erasePoint);
|
||||||
|
promptCtx.n_past = promptCtx.tokens.size();
|
||||||
|
recalculateContext(promptCtx, recalculateCallback);
|
||||||
|
assert(promptCtx.n_past + 1 <= promptCtx.n_ctx);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!evalTokens(promptCtx, { id })) {
|
||||||
|
std::cerr << implementation().modelType() << " ERROR: Failed to predict next token\n";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
promptCtx.n_past += 1;
|
||||||
|
|
||||||
|
// display text
|
||||||
|
for (const auto token : endTokens()) {
|
||||||
|
if (id == token) return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const std::string str = tokenToString(id);
|
||||||
|
|
||||||
|
// Check if the provided str is part of our reverse prompts
|
||||||
|
bool foundPartialReversePrompt = false;
|
||||||
|
const std::string completed = cachedResponse + std::string(str);
|
||||||
|
if (reversePrompts.find(completed) != reversePrompts.end())
|
||||||
|
return;
|
||||||
|
|
||||||
|
// Check if it partially matches our reverse prompts and if so, cache
|
||||||
|
for (const auto& s : reversePrompts) {
|
||||||
|
if (s.compare(0, completed.size(), completed) == 0) {
|
||||||
|
foundPartialReversePrompt = true;
|
||||||
|
cachedResponse = completed;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Regardless the token gets added to our cache
|
||||||
|
cachedTokens.push_back(id);
|
||||||
|
|
||||||
|
// Continue if we have found a partial match
|
||||||
|
if (foundPartialReversePrompt)
|
||||||
|
continue;
|
||||||
|
|
||||||
|
// Empty the cache
|
||||||
|
for (auto t : cachedTokens) {
|
||||||
|
if (int32_t(promptCtx.tokens.size()) == promptCtx.n_ctx)
|
||||||
|
promptCtx.tokens.erase(promptCtx.tokens.begin());
|
||||||
|
promptCtx.tokens.push_back(t);
|
||||||
|
//TODO: Conversion to std::string can be avoided here...
|
||||||
|
if (!responseCallback(t, std::string(tokenToString(t))))
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
cachedTokens.clear();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::vector<float> LLModel::embedding(const std::string &/*text*/)
|
||||||
|
{
|
||||||
|
if (!supportsCompletion()) {
|
||||||
|
std::string errorMessage = "ERROR: this model does not support generating embeddings!\n";
|
||||||
|
std::cerr << implementation().modelType() << errorMessage;
|
||||||
|
}
|
||||||
|
return std::vector<float>();
|
||||||
|
}
|
92
gpt4all-backend/llmodel_shared.h
Normal file
92
gpt4all-backend/llmodel_shared.h
Normal file
@ -0,0 +1,92 @@
|
|||||||
|
#pragma once
|
||||||
|
#include <cstdint>
|
||||||
|
#include <cstddef>
|
||||||
|
#include <vector>
|
||||||
|
#include <ggml.h>
|
||||||
|
|
||||||
|
#if defined(GGML_USE_KOMPUTE)
|
||||||
|
#include "ggml-vulkan.h"
|
||||||
|
struct llm_buffer {
|
||||||
|
uint8_t * addr = NULL;
|
||||||
|
size_t size = 0;
|
||||||
|
ggml_vk_memory memory;
|
||||||
|
|
||||||
|
llm_buffer() = default;
|
||||||
|
|
||||||
|
void resize(size_t size) {
|
||||||
|
free();
|
||||||
|
|
||||||
|
if (!ggml_vk_has_device()) {
|
||||||
|
this->addr = new uint8_t[size];
|
||||||
|
this->size = size;
|
||||||
|
} else {
|
||||||
|
this->memory = ggml_vk_allocate(size);
|
||||||
|
this->addr = (uint8_t*)memory.data;
|
||||||
|
this->size = size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void free() {
|
||||||
|
if (!memory.primaryMemory) {
|
||||||
|
delete[] addr;
|
||||||
|
} else if (memory.data) {
|
||||||
|
ggml_vk_free_memory(memory);
|
||||||
|
}
|
||||||
|
this->addr = NULL;
|
||||||
|
this->size = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
~llm_buffer() {
|
||||||
|
free();
|
||||||
|
}
|
||||||
|
|
||||||
|
// disable copy and move
|
||||||
|
llm_buffer(const llm_buffer&) = delete;
|
||||||
|
llm_buffer(llm_buffer&&) = delete;
|
||||||
|
llm_buffer& operator=(const llm_buffer&) = delete;
|
||||||
|
llm_buffer& operator=(llm_buffer&&) = delete;
|
||||||
|
};
|
||||||
|
#else
|
||||||
|
struct llm_buffer {
|
||||||
|
uint8_t * addr = NULL;
|
||||||
|
size_t size = 0;
|
||||||
|
|
||||||
|
void resize(size_t size) {
|
||||||
|
delete[] addr;
|
||||||
|
addr = new uint8_t[size];
|
||||||
|
this->size = size;
|
||||||
|
}
|
||||||
|
|
||||||
|
~llm_buffer() {
|
||||||
|
delete[] addr;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
|
struct llm_kv_cache {
|
||||||
|
struct ggml_tensor * k;
|
||||||
|
struct ggml_tensor * v;
|
||||||
|
|
||||||
|
struct ggml_context * ctx = NULL;
|
||||||
|
|
||||||
|
llm_buffer buf;
|
||||||
|
|
||||||
|
int n; // number of tokens currently in the cache
|
||||||
|
|
||||||
|
~llm_kv_cache() {
|
||||||
|
if (ctx) {
|
||||||
|
ggml_free(ctx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
#if LLAMA_DATE >= 230519
|
||||||
|
inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads) {
|
||||||
|
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||||
|
if (plan.work_size > 0) {
|
||||||
|
buf.resize(plan.work_size);
|
||||||
|
plan.work_data = buf.addr;
|
||||||
|
}
|
||||||
|
ggml_graph_compute(graph, &plan);
|
||||||
|
}
|
||||||
|
#endif
|
893
gpt4all-backend/mpt.cpp
Normal file
893
gpt4all-backend/mpt.cpp
Normal file
@ -0,0 +1,893 @@
|
|||||||
|
#define MPT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
|
||||||
|
#include "mpt_impl.h"
|
||||||
|
|
||||||
|
#include "utils.h"
|
||||||
|
#include "llmodel_shared.h"
|
||||||
|
|
||||||
|
#include <cassert>
|
||||||
|
#include <cinttypes>
|
||||||
|
#include <cmath>
|
||||||
|
#include <cstdio>
|
||||||
|
#include <cstring>
|
||||||
|
#include <fstream>
|
||||||
|
#include <map>
|
||||||
|
#include <random>
|
||||||
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
#include <iostream>
|
||||||
|
#if defined(_WIN32) && defined(_MSC_VER)
|
||||||
|
#define WIN32_LEAN_AND_MEAN
|
||||||
|
#ifndef NOMINMAX
|
||||||
|
#define NOMINMAX
|
||||||
|
#endif
|
||||||
|
#include <windows.h>
|
||||||
|
#include <io.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#else
|
||||||
|
#include <unistd.h>
|
||||||
|
#endif
|
||||||
|
#include <sstream>
|
||||||
|
#include <thread>
|
||||||
|
#include <unordered_set>
|
||||||
|
#include <regex>
|
||||||
|
#include <ggml.h>
|
||||||
|
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
const char *modelType_ = "MPT";
|
||||||
|
}
|
||||||
|
|
||||||
|
// default hparams (MPT 7B)
|
||||||
|
struct mpt_hparams {
|
||||||
|
int32_t n_vocab = 50432;
|
||||||
|
int32_t n_ctx = 2048;
|
||||||
|
int32_t n_embd = 4096;
|
||||||
|
int32_t n_head = 32;
|
||||||
|
int32_t n_layer = 32;
|
||||||
|
float alibi_bias_max = 8;
|
||||||
|
float clip_qkv = 0;
|
||||||
|
int32_t expand = 4;
|
||||||
|
int32_t f16 = 1;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mpt_layer {
|
||||||
|
// normalization
|
||||||
|
struct ggml_tensor * norm_1_w;
|
||||||
|
struct ggml_tensor * norm_2_w;
|
||||||
|
|
||||||
|
// attention
|
||||||
|
struct ggml_tensor * attn_Wqkv_w;
|
||||||
|
struct ggml_tensor * attn_out_proj_w;
|
||||||
|
|
||||||
|
// ff
|
||||||
|
struct ggml_tensor * ffn_up_proj_w;
|
||||||
|
struct ggml_tensor * ffn_down_proj_w;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct mpt_model {
|
||||||
|
mpt_hparams hparams;
|
||||||
|
|
||||||
|
// normalization
|
||||||
|
struct ggml_tensor * norm_f_w;
|
||||||
|
|
||||||
|
struct ggml_tensor * wte; // position embedding
|
||||||
|
|
||||||
|
// mpt does weight tying
|
||||||
|
|
||||||
|
std::vector<mpt_layer> layers;
|
||||||
|
|
||||||
|
struct llm_kv_cache kv_self;
|
||||||
|
struct ggml_context * ctx;
|
||||||
|
std::map<std::string, struct ggml_tensor *> tensors;
|
||||||
|
|
||||||
|
|
||||||
|
llm_buffer eval_buf;
|
||||||
|
llm_buffer scr0_buf;
|
||||||
|
llm_buffer scr1_buf;
|
||||||
|
|
||||||
|
~mpt_model() {
|
||||||
|
if (ctx) {
|
||||||
|
ggml_free(ctx);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
static bool kv_cache_init(
|
||||||
|
const struct mpt_hparams & hparams,
|
||||||
|
struct llm_kv_cache & cache,
|
||||||
|
ggml_type wtype,
|
||||||
|
int n_ctx) {
|
||||||
|
const int n_embd = hparams.n_embd;
|
||||||
|
const int n_layer = hparams.n_layer;
|
||||||
|
|
||||||
|
const int64_t n_mem = (int64_t)n_layer*n_ctx;
|
||||||
|
const int64_t n_elements = n_embd*n_mem;
|
||||||
|
|
||||||
|
cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2_MiB);
|
||||||
|
|
||||||
|
struct ggml_init_params params;
|
||||||
|
params.mem_size = cache.buf.size;
|
||||||
|
params.mem_buffer = cache.buf.addr;
|
||||||
|
params.no_alloc = false;
|
||||||
|
|
||||||
|
cache.ctx = ggml_init(params);
|
||||||
|
|
||||||
|
if (!cache.ctx) {
|
||||||
|
fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
||||||
|
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// load the model's weights from a stream. if mem_req ptr is passed the model is
|
||||||
|
// only partially parsed to estimate required memory
|
||||||
|
bool mpt_model_load(const std::string &fname, std::istream &fin, mpt_model & model, gpt_vocab & vocab, size_t * mem_req) {
|
||||||
|
printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
|
||||||
|
if (mem_req != nullptr) {
|
||||||
|
*mem_req = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// verify magic
|
||||||
|
{
|
||||||
|
uint32_t magic;
|
||||||
|
fin.read((char *) &magic, sizeof(magic));
|
||||||
|
if (magic != 0x67676d6d) {
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// load hparams
|
||||||
|
{
|
||||||
|
auto & hparams = model.hparams;
|
||||||
|
|
||||||
|
fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
|
||||||
|
fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
|
||||||
|
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
|
||||||
|
fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
|
||||||
|
fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
|
||||||
|
fin.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
|
||||||
|
fin.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv));
|
||||||
|
fin.read((char *) &hparams.f16, sizeof(hparams.f16));
|
||||||
|
|
||||||
|
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
|
||||||
|
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
|
||||||
|
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
|
||||||
|
printf("%s: n_head = %d\n", __func__, hparams.n_head);
|
||||||
|
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
|
||||||
|
printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max);
|
||||||
|
printf("%s: clip_qkv = %f\n", __func__, hparams.clip_qkv);
|
||||||
|
printf("%s: ftype = %d\n", __func__, hparams.f16);
|
||||||
|
}
|
||||||
|
|
||||||
|
// load vocab
|
||||||
|
{
|
||||||
|
int32_t n_vocab = model.hparams.n_vocab;
|
||||||
|
fin.read((char *) &n_vocab, sizeof(n_vocab));
|
||||||
|
|
||||||
|
if (n_vocab != model.hparams.n_vocab) {
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
|
||||||
|
__func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string word;
|
||||||
|
for (int i = 0; i < n_vocab; i++) {
|
||||||
|
uint32_t len;
|
||||||
|
fin.read((char *) &len, sizeof(len));
|
||||||
|
bool special = false;
|
||||||
|
if (len & (1<<31)) {
|
||||||
|
len = len &~ (1<<31);
|
||||||
|
special = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (len > 0) {
|
||||||
|
word.resize(len);
|
||||||
|
fin.read((char *) word.data(), len);
|
||||||
|
vocab.token_to_id[word] = i;
|
||||||
|
vocab.id_to_token[i] = word;
|
||||||
|
}
|
||||||
|
|
||||||
|
if(special) {
|
||||||
|
vocab.add_special_token(word);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
|
||||||
|
// in order to save memory and also to speed up the computation
|
||||||
|
ggml_type wtype = GGML_TYPE_COUNT;
|
||||||
|
switch (model.hparams.f16) {
|
||||||
|
case 0: wtype = GGML_TYPE_F32; break;
|
||||||
|
case 1: wtype = GGML_TYPE_F16; break;
|
||||||
|
case 2: wtype = GGML_TYPE_Q4_0; break;
|
||||||
|
case 3: wtype = GGML_TYPE_Q4_1; break;
|
||||||
|
case 5: wtype = GGML_TYPE_Q4_2; break;
|
||||||
|
default:
|
||||||
|
{
|
||||||
|
fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
|
||||||
|
__func__, fname.c_str(), model.hparams.f16);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
auto & ctx = model.ctx;
|
||||||
|
|
||||||
|
size_t ctx_size = 0;
|
||||||
|
|
||||||
|
{
|
||||||
|
const auto & hparams = model.hparams;
|
||||||
|
|
||||||
|
const int n_embd = hparams.n_embd;
|
||||||
|
const int n_layer = hparams.n_layer;
|
||||||
|
const int n_ctx = hparams.n_ctx;
|
||||||
|
const int n_vocab = hparams.n_vocab;
|
||||||
|
const int expand = hparams.expand;
|
||||||
|
|
||||||
|
|
||||||
|
ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_w
|
||||||
|
|
||||||
|
ctx_size += n_embd*n_vocab*ggml_type_sizef(GGML_TYPE_F32); // wte
|
||||||
|
|
||||||
|
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // norm_1_w
|
||||||
|
ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // norm_2_w
|
||||||
|
|
||||||
|
ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // attn_Wqkv_w
|
||||||
|
ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // attn_out_proj_w
|
||||||
|
|
||||||
|
ctx_size += n_layer*(expand*n_embd*n_embd*ggml_type_sizef(wtype)); // ffn_up_proj_w
|
||||||
|
ctx_size += n_layer*(expand*n_embd*n_embd*ggml_type_sizef(wtype)); // ffn_down_proj_w
|
||||||
|
|
||||||
|
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_k
|
||||||
|
ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F16); // memory_v
|
||||||
|
|
||||||
|
// TODO probably less now?
|
||||||
|
ctx_size += (5 + 10*n_layer)*256; // object overhead
|
||||||
|
|
||||||
|
printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
|
||||||
|
}
|
||||||
|
|
||||||
|
if (mem_req != nullptr) {
|
||||||
|
*mem_req += ctx_size;
|
||||||
|
const int n_embd = model.hparams.n_embd;
|
||||||
|
const int n_layer = model.hparams.n_layer;
|
||||||
|
|
||||||
|
const int64_t n_mem = (int64_t)n_layer*model.hparams.n_ctx;
|
||||||
|
const int64_t n_elements = n_embd*n_mem;
|
||||||
|
|
||||||
|
*mem_req += (2u*n_elements*ggml_type_size(wtype) + 2_MiB);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// create the ggml context
|
||||||
|
{
|
||||||
|
struct ggml_init_params params = {
|
||||||
|
.mem_size = ctx_size,
|
||||||
|
.mem_buffer = NULL,
|
||||||
|
.no_alloc = false,
|
||||||
|
};
|
||||||
|
|
||||||
|
model.ctx = ggml_init(params);
|
||||||
|
if (!model.ctx) {
|
||||||
|
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// prepare memory for the weights
|
||||||
|
{
|
||||||
|
const auto & hparams = model.hparams;
|
||||||
|
|
||||||
|
const int n_embd = hparams.n_embd;
|
||||||
|
const int n_layer = hparams.n_layer;
|
||||||
|
const int n_vocab = hparams.n_vocab;
|
||||||
|
const int expand = hparams.expand;
|
||||||
|
|
||||||
|
model.layers.resize(n_layer);
|
||||||
|
|
||||||
|
model.wte = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
|
||||||
|
model.norm_f_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
|
||||||
|
// map by name
|
||||||
|
model.tensors["transformer.wte.weight"] = model.wte;
|
||||||
|
model.tensors["transformer.norm_f.weight"] = model.norm_f_w;
|
||||||
|
|
||||||
|
for (int i = 0; i < n_layer; ++i) {
|
||||||
|
auto & layer = model.layers[i];
|
||||||
|
|
||||||
|
layer.norm_1_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
layer.norm_2_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
|
||||||
|
layer.attn_Wqkv_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd * 3);
|
||||||
|
layer.attn_out_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
|
||||||
|
layer.ffn_up_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, expand*n_embd);
|
||||||
|
layer.ffn_down_proj_w = ggml_new_tensor_2d(ctx, wtype, expand*n_embd, n_embd);
|
||||||
|
|
||||||
|
// map by name
|
||||||
|
model.tensors["transformer.blocks." + std::to_string(i) + ".norm_1.weight"] = layer.norm_1_w;
|
||||||
|
model.tensors["transformer.blocks." + std::to_string(i) + ".norm_2.weight"] = layer.norm_2_w;
|
||||||
|
model.tensors["transformer.blocks." + std::to_string(i) + ".attn.Wqkv.weight"] = layer.attn_Wqkv_w;
|
||||||
|
model.tensors["transformer.blocks." + std::to_string(i) + ".attn.out_proj.weight"] = layer.attn_out_proj_w;
|
||||||
|
|
||||||
|
model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.up_proj.weight"] = layer.ffn_up_proj_w;
|
||||||
|
model.tensors["transformer.blocks." + std::to_string(i) + ".ffn.down_proj.weight"] = layer.ffn_down_proj_w;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// key + value memory
|
||||||
|
{
|
||||||
|
const auto & hparams = model.hparams;
|
||||||
|
if (!kv_cache_init(hparams, model.kv_self, GGML_TYPE_F16, model.hparams.n_ctx)) {
|
||||||
|
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
|
||||||
|
ggml_free(ctx);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t memory_size = ggml_nbytes(model.kv_self.k) + ggml_nbytes(model.kv_self.v);
|
||||||
|
printf("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// load weights
|
||||||
|
{
|
||||||
|
int n_tensors = 0;
|
||||||
|
size_t total_size = 0;
|
||||||
|
|
||||||
|
printf("%s: ", __func__);
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
int32_t n_dims;
|
||||||
|
int32_t length;
|
||||||
|
int32_t ttype;
|
||||||
|
|
||||||
|
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
|
||||||
|
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
|
||||||
|
fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
|
||||||
|
|
||||||
|
if (fin.eof()) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
int32_t nelements = 1;
|
||||||
|
int32_t ne[2] = { 1, 1 };
|
||||||
|
for (int i = 0; i < n_dims; ++i) {
|
||||||
|
fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
|
||||||
|
nelements *= ne[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
std::string name(length, 0);
|
||||||
|
fin.read(&name[0], length);
|
||||||
|
|
||||||
|
if (model.tensors.find(name.data()) == model.tensors.end()) {
|
||||||
|
fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
auto tensor = model.tensors[name.data()];
|
||||||
|
if (ggml_nelements(tensor) != nelements) {
|
||||||
|
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
|
||||||
|
fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%" PRId64 ", %" PRId64 "], expected [%d, %d]\n",
|
||||||
|
__func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// for debugging
|
||||||
|
if (0) {
|
||||||
|
printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t bpe = ggml_type_size(ggml_type(ttype));
|
||||||
|
|
||||||
|
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
|
||||||
|
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
|
||||||
|
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
|
||||||
|
|
||||||
|
//printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ttype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
|
||||||
|
total_size += ggml_nbytes(tensor);
|
||||||
|
if (++n_tensors % 8 == 0) {
|
||||||
|
printf(".");
|
||||||
|
fflush(stdout);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
printf(" done\n");
|
||||||
|
|
||||||
|
printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
|
||||||
|
}
|
||||||
|
|
||||||
|
model.scr0_buf.resize(256u * 1024 * 1024);
|
||||||
|
model.scr1_buf.resize(256u * 1024 * 1024);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// load the model's weights from a file path
|
||||||
|
bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) {
|
||||||
|
|
||||||
|
auto fin = std::ifstream(fname, std::ios::binary);
|
||||||
|
if (!fin) {
|
||||||
|
fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool loaded = mpt_model_load(fname, fin, model, vocab, nullptr);
|
||||||
|
fin.close();
|
||||||
|
return loaded;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool mpt_eval(
|
||||||
|
mpt_model & model,
|
||||||
|
const int n_threads,
|
||||||
|
const int n_past,
|
||||||
|
const std::vector<int> & embd_inp,
|
||||||
|
std::vector<float> & embd_w,
|
||||||
|
size_t & mem_per_token) {
|
||||||
|
const int N = embd_inp.size();
|
||||||
|
|
||||||
|
const auto & hparams = model.hparams;
|
||||||
|
|
||||||
|
const int n_embd = hparams.n_embd;
|
||||||
|
const int n_layer = hparams.n_layer;
|
||||||
|
const int n_ctx = hparams.n_ctx;
|
||||||
|
const int n_head = hparams.n_head;
|
||||||
|
const int n_vocab = hparams.n_vocab;
|
||||||
|
|
||||||
|
const size_t init_buf_size = 1024_MiB;
|
||||||
|
if (!model.eval_buf.addr || model.eval_buf.size < init_buf_size)
|
||||||
|
model.eval_buf.resize(init_buf_size);
|
||||||
|
|
||||||
|
if (mem_per_token > 0 && mem_per_token*N > model.eval_buf.size) {
|
||||||
|
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
|
||||||
|
// printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, model.buf.size, buf_size_new);
|
||||||
|
|
||||||
|
// reallocate
|
||||||
|
model.eval_buf.resize(buf_size_new);
|
||||||
|
if (model.eval_buf.addr == nullptr) {
|
||||||
|
fprintf(stderr, "%s: failed to allocate %zu bytes\n", __func__, model.eval_buf.size);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
struct ggml_init_params params = {
|
||||||
|
.mem_size = model.eval_buf.size,
|
||||||
|
.mem_buffer = model.eval_buf.addr,
|
||||||
|
.no_alloc = false
|
||||||
|
};
|
||||||
|
|
||||||
|
struct ggml_context * ctx0 = ggml_init(params);
|
||||||
|
struct ggml_cgraph gf = {};
|
||||||
|
gf.n_threads = n_threads;
|
||||||
|
|
||||||
|
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
|
||||||
|
memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
|
||||||
|
|
||||||
|
// wte
|
||||||
|
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.wte, embd);
|
||||||
|
|
||||||
|
for (int il = 0; il < n_layer; ++il) {
|
||||||
|
ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
|
||||||
|
|
||||||
|
struct ggml_tensor * inpSA = inpL;
|
||||||
|
struct ggml_tensor * cur = inpSA;
|
||||||
|
// self-attention
|
||||||
|
{
|
||||||
|
|
||||||
|
// norm1
|
||||||
|
cur = ggml_norm(ctx0, cur);
|
||||||
|
cur = ggml_mul(ctx0,
|
||||||
|
ggml_repeat(ctx0, model.layers[il].norm_1_w, cur),
|
||||||
|
cur);
|
||||||
|
// compute QKV
|
||||||
|
cur = ggml_mul_mat(ctx0,
|
||||||
|
model.layers[il].attn_Wqkv_w,
|
||||||
|
cur);
|
||||||
|
|
||||||
|
// TODO: clip_qkv
|
||||||
|
struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*ggml_element_size(cur)*n_embd));
|
||||||
|
struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*ggml_element_size(cur)*n_embd));
|
||||||
|
struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*ggml_element_size(cur)*n_embd));
|
||||||
|
|
||||||
|
// TODO: qk_ln? (seems to be False in MPT-7B configs)
|
||||||
|
{
|
||||||
|
Vcur = ggml_transpose(ctx0, Vcur);
|
||||||
|
|
||||||
|
struct ggml_tensor * k = ggml_view_1d(ctx0, model.kv_self.k, N*n_embd, (ggml_element_size(model.kv_self.k)*n_embd)*(il*n_ctx + n_past));
|
||||||
|
struct ggml_tensor * v = ggml_view_2d(ctx0, model.kv_self.v, N, n_embd,
|
||||||
|
( n_ctx)*ggml_element_size(model.kv_self.v),
|
||||||
|
(il*n_ctx)*ggml_element_size(model.kv_self.v)*n_embd + n_past*ggml_element_size(model.kv_self.v));
|
||||||
|
|
||||||
|
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
|
||||||
|
ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
|
||||||
|
}
|
||||||
|
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
|
||||||
|
struct ggml_tensor * Q =
|
||||||
|
ggml_permute(ctx0,
|
||||||
|
ggml_reshape_3d(ctx0, Qcur, n_embd/n_head, n_head, N),
|
||||||
|
0, 2, 1, 3);
|
||||||
|
|
||||||
|
struct ggml_tensor * K =
|
||||||
|
ggml_permute(ctx0,
|
||||||
|
ggml_reshape_3d(ctx0,
|
||||||
|
ggml_view_1d(ctx0, model.kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.kv_self.k)*n_embd),
|
||||||
|
n_embd/n_head, n_head, n_past + N),
|
||||||
|
0, 2, 1, 3);
|
||||||
|
|
||||||
|
// K * Q
|
||||||
|
struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
|
||||||
|
|
||||||
|
// KQ_scaled = KQ / sqrt(n_embd/n_head)
|
||||||
|
struct ggml_tensor * KQ_scaled =
|
||||||
|
ggml_scale(ctx0,
|
||||||
|
KQ,
|
||||||
|
ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))
|
||||||
|
);
|
||||||
|
|
||||||
|
|
||||||
|
// Alibi
|
||||||
|
struct ggml_tensor * KQ_scaled_biased = ggml_alibi(ctx0, ggml_cont(ctx0, KQ_scaled), n_past, n_head);
|
||||||
|
|
||||||
|
// KQ_masked = mask_past(KQ_scaled)
|
||||||
|
struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_biased, n_past);
|
||||||
|
|
||||||
|
// KQ = soft_max(KQ_masked)
|
||||||
|
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
|
||||||
|
|
||||||
|
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
|
||||||
|
struct ggml_tensor * V =
|
||||||
|
ggml_view_3d(ctx0, model.kv_self.v,
|
||||||
|
n_past + N, n_embd/n_head, n_head,
|
||||||
|
n_ctx*ggml_element_size(model.kv_self.v),
|
||||||
|
n_ctx*ggml_element_size(model.kv_self.v)*n_embd/n_head,
|
||||||
|
il*n_ctx*ggml_element_size(model.kv_self.v)*n_embd);
|
||||||
|
|
||||||
|
// KQV = transpose(V) * KQ_soft_max
|
||||||
|
struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
|
||||||
|
|
||||||
|
// KQV_merged = KQV.permute(0, 2, 1, 3)
|
||||||
|
struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
|
||||||
|
|
||||||
|
// cur = KQV_merged.contiguous().view(n_embd, N)
|
||||||
|
cur = ggml_cpy(ctx0,
|
||||||
|
KQV_merged,
|
||||||
|
ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
|
||||||
|
|
||||||
|
// projection (no bias)
|
||||||
|
cur = ggml_mul_mat(ctx0,
|
||||||
|
model.layers[il].attn_out_proj_w,
|
||||||
|
cur);
|
||||||
|
}
|
||||||
|
|
||||||
|
ggml_set_scratch(ctx0, {0, model.scr1_buf.size, model.scr1_buf.addr, });
|
||||||
|
// residual
|
||||||
|
struct ggml_tensor * resSA = ggml_add(ctx0, cur, inpSA);
|
||||||
|
// feed-forward network
|
||||||
|
{
|
||||||
|
cur = resSA;
|
||||||
|
// norm2
|
||||||
|
cur = ggml_norm(ctx0, cur);
|
||||||
|
cur = ggml_mul(ctx0,
|
||||||
|
ggml_repeat(ctx0, model.layers[il].norm_2_w, cur),
|
||||||
|
cur);
|
||||||
|
// ffn
|
||||||
|
cur = ggml_mul_mat(ctx0,
|
||||||
|
model.layers[il].ffn_up_proj_w,
|
||||||
|
cur);
|
||||||
|
cur = ggml_gelu(ctx0, cur);
|
||||||
|
cur = ggml_mul_mat(ctx0,
|
||||||
|
model.layers[il].ffn_down_proj_w,
|
||||||
|
cur);
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// self-attention + FF
|
||||||
|
inpL = ggml_add(ctx0, cur, resSA);
|
||||||
|
}
|
||||||
|
ggml_set_scratch(ctx0, {0, model.scr0_buf.size, model.scr0_buf.addr, });
|
||||||
|
|
||||||
|
struct ggml_tensor * out = inpL;
|
||||||
|
// -> logits
|
||||||
|
{
|
||||||
|
out = ggml_norm(ctx0, out);
|
||||||
|
out = ggml_mul(ctx0,
|
||||||
|
ggml_repeat(ctx0, model.norm_f_w, out),
|
||||||
|
out);
|
||||||
|
ggml_set_scratch(ctx0, { 0, 0, nullptr, });
|
||||||
|
out = ggml_mul_mat(ctx0, model.wte, out);
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// run the computation
|
||||||
|
ggml_build_forward_expand(&gf, out);
|
||||||
|
ggml_graph_compute (ctx0, &gf);
|
||||||
|
|
||||||
|
|
||||||
|
// return result for just the last token
|
||||||
|
embd_w.resize(n_vocab);
|
||||||
|
memcpy(embd_w.data(), (float *) ggml_get_data(out) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
|
||||||
|
|
||||||
|
if (mem_per_token == 0) {
|
||||||
|
mem_per_token = ggml_used_mem(ctx0)/N;
|
||||||
|
}
|
||||||
|
//printf("used_mem = %zu\n", ggml_used_mem(ctx0));
|
||||||
|
|
||||||
|
ggml_free(ctx0);
|
||||||
|
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
#define MPT_MAX_RNG_STATE 64*1024
|
||||||
|
|
||||||
|
size_t mpt_get_state_size(const mpt_model &model)
|
||||||
|
{
|
||||||
|
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
||||||
|
// for reference, std::mt19937(1337) serializes to 6701 bytes.
|
||||||
|
const size_t s_rng_size = sizeof(size_t);
|
||||||
|
const size_t s_rng = MPT_MAX_RNG_STATE;
|
||||||
|
const size_t s_kv_size = sizeof(size_t);
|
||||||
|
const size_t s_kv_ntok = sizeof(int);
|
||||||
|
const size_t s_kv = model.kv_self.buf.size;
|
||||||
|
const size_t s_total = (
|
||||||
|
+ s_rng_size
|
||||||
|
+ s_rng
|
||||||
|
+ s_kv_size
|
||||||
|
+ s_kv_ntok
|
||||||
|
+ s_kv
|
||||||
|
);
|
||||||
|
fflush(stdout);
|
||||||
|
return s_total;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t mpt_copy_state_data(const mpt_model &model, const std::mt19937 &rng, uint8_t *dest)
|
||||||
|
{
|
||||||
|
uint8_t * out = dest;
|
||||||
|
fflush(stdout);
|
||||||
|
// copy rng
|
||||||
|
{
|
||||||
|
std::stringstream rng_ss;
|
||||||
|
rng_ss << rng;
|
||||||
|
|
||||||
|
const size_t rng_size = rng_ss.str().size();
|
||||||
|
char rng_buf[MPT_MAX_RNG_STATE];
|
||||||
|
|
||||||
|
memset(&rng_buf[0], 0, MPT_MAX_RNG_STATE);
|
||||||
|
memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
|
||||||
|
|
||||||
|
memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
|
||||||
|
memcpy(out, &rng_buf[0], MPT_MAX_RNG_STATE); out += MPT_MAX_RNG_STATE;
|
||||||
|
}
|
||||||
|
|
||||||
|
// copy kv cache
|
||||||
|
{
|
||||||
|
const size_t kv_size = model.kv_self.buf.size;
|
||||||
|
const int kv_ntok = model.kv_self.n;
|
||||||
|
|
||||||
|
memcpy(out, &kv_size, sizeof(kv_size)); out += sizeof(kv_size);
|
||||||
|
memcpy(out, &kv_ntok, sizeof(kv_ntok)); out += sizeof(kv_ntok);
|
||||||
|
|
||||||
|
if (kv_size) {
|
||||||
|
memcpy(out, model.kv_self.buf.addr, kv_size); out += kv_size;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const size_t written = out - dest;
|
||||||
|
assert(written == mpt_get_state_size(model));
|
||||||
|
fflush(stdout);
|
||||||
|
return written;
|
||||||
|
}
|
||||||
|
|
||||||
|
size_t mpt_set_state_data(mpt_model *model, std::mt19937 *rng, const uint8_t *src)
|
||||||
|
{
|
||||||
|
const uint8_t * in = src;
|
||||||
|
|
||||||
|
// set rng
|
||||||
|
{
|
||||||
|
size_t rng_size;
|
||||||
|
char rng_buf[MPT_MAX_RNG_STATE];
|
||||||
|
|
||||||
|
memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
|
||||||
|
memcpy(&rng_buf[0], in, MPT_MAX_RNG_STATE); in += MPT_MAX_RNG_STATE;
|
||||||
|
|
||||||
|
std::stringstream rng_ss;
|
||||||
|
rng_ss.str(std::string(&rng_buf[0], rng_size));
|
||||||
|
rng_ss >> *rng;
|
||||||
|
|
||||||
|
assert(rng_ss.fail() == false);
|
||||||
|
}
|
||||||
|
|
||||||
|
// set kv cache
|
||||||
|
{
|
||||||
|
size_t kv_size;
|
||||||
|
int kv_ntok;
|
||||||
|
|
||||||
|
memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
|
||||||
|
memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
|
||||||
|
|
||||||
|
if (kv_size) {
|
||||||
|
            assert(model->kv_self.buf.size == kv_size);

            void * k_data = model->kv_self.k->data; // remember data pointers
            void * v_data = model->kv_self.v->data; // because their value is stored in buf and overwritten by memcpy

            memcpy(model->kv_self.buf.addr, in, kv_size); in += kv_size;

            model->kv_self.k->data = k_data; // restore correct data pointers
            model->kv_self.v->data = v_data;

        }

        model->kv_self.n = kv_ntok;
    }

    const size_t nread = in - src;

    assert(nread == mpt_get_state_size(*model));
    fflush(stdout);
    return nread;
}

struct MPTPrivate {
    const std::string modelPath;
    bool modelLoaded;
    gpt_vocab vocab;
    mpt_model *model = nullptr;
    int64_t n_threads = 0;
    size_t mem_per_token = 0;
    std::mt19937 rng;
    bool has_im_end = false;
};

MPT::MPT()
    : d_ptr(new MPTPrivate) {
    d_ptr->model = new mpt_model;
    d_ptr->model->ctx = nullptr;
    d_ptr->modelLoaded = false;
}

size_t MPT::requiredMem(const std::string &modelPath) {
    mpt_model dummy_model;
    gpt_vocab dummy_vocab;
    size_t mem_req;
    auto fin = std::ifstream(modelPath, std::ios::binary);
    mpt_model_load(modelPath, fin, dummy_model, dummy_vocab, &mem_req);
    return mem_req;
}

bool MPT::loadModel(const std::string &modelPath) {
    std::mt19937 rng(time(NULL));
    d_ptr->rng = rng;

    auto fin = std::ifstream(modelPath, std::ios::binary);

    // load the model
    if (!mpt_model_load(modelPath, fin, *d_ptr->model, d_ptr->vocab, nullptr)) {
        std::cerr << "MPT ERROR: failed to load model from " << modelPath;
        return false;
    }

    d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
    d_ptr->modelLoaded = true;
    d_ptr->has_im_end = d_ptr->vocab.token_to_id.find("<|im_end|>") != d_ptr->vocab.token_to_id.end();
    fflush(stdout);
    return true;
}

void MPT::setThreadCount(int32_t n_threads) {
    d_ptr->n_threads = n_threads;
}

int32_t MPT::threadCount() const
{
    return d_ptr->n_threads;
}

MPT::~MPT()
{
    delete d_ptr->model;
}

bool MPT::isModelLoaded() const
{
    return d_ptr->modelLoaded;
}

size_t MPT::stateSize() const
{
    return mpt_get_state_size(*d_ptr->model);
}

size_t MPT::saveState(uint8_t *dest) const
{
    return mpt_copy_state_data(*d_ptr->model, d_ptr->rng, dest);
}

size_t MPT::restoreState(const uint8_t *src)
{
    return mpt_set_state_data(d_ptr->model, &d_ptr->rng, src);
}

std::vector<LLModel::Token> MPT::tokenize(PromptContext &, const std::string &str) const
{
    return ::gpt_tokenize(d_ptr->vocab, str);
}

std::string MPT::tokenToString(Token id) const
{
    return d_ptr->vocab.id_to_token[id];
}

LLModel::Token MPT::sampleToken(PromptContext &promptCtx) const
{
    const size_t n_prev_toks = std::min((size_t) promptCtx.repeat_last_n, promptCtx.tokens.size());
    return gpt_sample_top_k_top_p(d_ptr->model->hparams.n_vocab,
        promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
        n_prev_toks,
        promptCtx.logits,
        promptCtx.top_k, promptCtx.top_p, promptCtx.temp,
        promptCtx.repeat_penalty,
        d_ptr->rng);
}

bool MPT::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
{
    // determine the required inference memory per token:
    static bool initialized = false;
    if (!initialized) {
        mpt_eval(*d_ptr->model, d_ptr->n_threads, 0, { 0, 1, 2, 3 }, ctx.logits,
            d_ptr->mem_per_token);
        initialized = true;
    }

    return mpt_eval(*d_ptr->model, d_ptr->n_threads, ctx.n_past, tokens, ctx.logits, d_ptr->mem_per_token);
}

int32_t MPT::contextLength() const
{
    return d_ptr->model->hparams.n_ctx;
}

const std::vector<LLModel::Token> &MPT::endTokens() const
{
    static const std::vector<LLModel::Token> fres = {0, d_ptr->vocab.token_to_id["<|im_end|>"]};
    return fres;
}

#if defined(_WIN32)
#define DLL_EXPORT __declspec(dllexport)
#else
#define DLL_EXPORT __attribute__ ((visibility ("default")))
#endif

extern "C" {
DLL_EXPORT bool is_g4a_backend_model_implementation() {
    return true;
}

DLL_EXPORT const char *get_model_type() {
    return modelType_;
}

DLL_EXPORT const char *get_build_variant() {
    return GGML_BUILD_VARIANT;
}

DLL_EXPORT bool magic_match(std::istream& f) {
    uint32_t magic = 0;
    f.read(reinterpret_cast<char*>(&magic), sizeof(magic));
    return magic == 0x67676d6d;
}

DLL_EXPORT LLModel *construct() {
    return new MPT;
}
}
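The `magic_match` hook above only reads the first four bytes of the stream and compares them against `0x67676d6d`, the value the MPT converter script writes as its header magic. As a quick sanity check outside the C++ code, a minimal Python sketch (assumptions: a little-endian/native-endian platform, and a hypothetical local file path used only as an example) can inspect the same field:

```python
import struct

def has_mpt_magic(path: str) -> bool:
    # Read the first 4 bytes and interpret them as a native-order uint32,
    # mirroring the raw f.read() into a uint32_t done by magic_match() above.
    with open(path, "rb") as f:
        (magic,) = struct.unpack("=I", f.read(4))
    return magic == 0x67676D6D

# Hypothetical usage; the filename is only an example.
print(has_mpt_magic("ggml-mpt-7b-chat.bin"))
```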
41 gpt4all-backend/mpt_impl.h Normal file
@@ -0,0 +1,41 @@
#ifndef MPT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#error This file is NOT meant to be included outside of mpt.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define MPT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#endif
#ifndef MPT_H
#define MPT_H

#include <string>
#include <functional>
#include <vector>
#include "llmodel.h"

struct MPTPrivate;
class MPT : public LLModel {
public:
    MPT();
    ~MPT();

    bool supportsEmbedding() const override { return false; }
    bool supportsCompletion() const override { return true; }
    bool loadModel(const std::string &modelPath) override;
    bool isModelLoaded() const override;
    size_t requiredMem(const std::string &modelPath) override;
    size_t stateSize() const override;
    size_t saveState(uint8_t *dest) const override;
    size_t restoreState(const uint8_t *src) override;
    void setThreadCount(int32_t n_threads) override;
    int32_t threadCount() const override;

private:
    MPTPrivate *d_ptr;

protected:
    std::vector<Token> tokenize(PromptContext &, const std::string&) const override;
    std::string tokenToString(Token) const override;
    Token sampleToken(PromptContext &ctx) const override;
    bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
    int32_t contextLength() const override;
    const std::vector<Token>& endTokens() const override;
};

#endif // MPT_H
1027 gpt4all-backend/replit.cpp Normal file
File diff suppressed because it is too large
Load Diff
43 gpt4all-backend/replit_impl.h Normal file
@@ -0,0 +1,43 @@
#ifndef REPLIT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#error This file is NOT meant to be included outside of replit.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define REPLIT_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#endif
#ifndef REPLIT_H
#define REPLIT_H

#include <string>
#include <functional>
#include <vector>
#include "llmodel.h"

#define GGML_QNT_VERSION_FACTOR 1000 // do not change this

struct ReplitPrivate;
class Replit : public LLModel {
public:
    Replit();
    ~Replit();

    bool supportsEmbedding() const override { return false; }
    bool supportsCompletion() const override { return true; }
    bool loadModel(const std::string &modelPath) override;
    bool isModelLoaded() const override;
    size_t requiredMem(const std::string & modelPath) override;
    size_t stateSize() const override;
    size_t saveState(uint8_t *dest) const override;
    size_t restoreState(const uint8_t *src) override;
    void setThreadCount(int32_t n_threads) override;
    int32_t threadCount() const override;

private:
    ReplitPrivate *d_ptr;

protected:
    std::vector<Token> tokenize(PromptContext &, const std::string&) const override;
    std::string tokenToString(Token) const override;
    Token sampleToken(PromptContext &ctx) const override;
    bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
    int32_t contextLength() const override;
    const std::vector<Token>& endTokens() const override;
};

#endif // REPLIT_H
102 gpt4all-backend/scripts/convert_bert_hf_to_ggml.py Normal file
@@ -0,0 +1,102 @@
import sys
import struct
import json
import torch
import numpy as np

from transformers import AutoModel, AutoTokenizer

if len(sys.argv) < 3:
    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
    print("  ftype == 0 -> float32")
    print("  ftype == 1 -> float16")
    sys.exit(1)

# output in the same directory as the model
dir_model = sys.argv[1]
fname_out = sys.argv[1] + "/ggml-model.bin"

with open(dir_model + "/tokenizer.json", "r", encoding="utf-8") as f:
    encoder = json.load(f)

with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

with open(dir_model + "/vocab.txt", "r", encoding="utf-8") as f:
    vocab = f.readlines()

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = 1
if len(sys.argv) > 2:
    ftype = int(sys.argv[2])
    if ftype < 0 or ftype > 1:
        print("Invalid ftype: " + str(ftype))
        sys.exit(1)
    fname_out = sys.argv[1] + "/ggml-model-" + ftype_str[ftype] + ".bin"


tokenizer = AutoTokenizer.from_pretrained(dir_model)
model = AutoModel.from_pretrained(dir_model, low_cpu_mem_usage=True)
print(model)

print(tokenizer.encode('I believe the meaning of life is'))

list_vars = model.state_dict()
for name in list_vars.keys():
    print(name, list_vars[name].shape, list_vars[name].dtype)

fout = open(fname_out, "wb")

print(hparams)

fout.write(struct.pack("i", 0x62657274)) # magic: "bert" in hex
fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["max_position_embeddings"]))
fout.write(struct.pack("i", hparams["hidden_size"]))
fout.write(struct.pack("i", hparams["intermediate_size"]))
fout.write(struct.pack("i", hparams["num_attention_heads"]))
fout.write(struct.pack("i", hparams["num_hidden_layers"]))
fout.write(struct.pack("i", ftype))

for i in range(hparams["vocab_size"]):
    text = vocab[i][:-1] # strips newline at the end
    #print(f"{i}:{text}")
    data = bytes(text, 'utf-8')
    fout.write(struct.pack("i", len(data)))
    fout.write(data)

for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()
    if name in ['embeddings.position_ids', 'pooler.dense.weight', 'pooler.dense.bias']:
        continue
    print("Processing variable: " + name + " with shape: ", data.shape)

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    if ftype == 1 and name[-7:] == ".weight" and n_dims == 2:
        print("  Converting to float16")
        data = data.astype(np.float16)
        l_type = 1
    else:
        l_type = 0

    # header
    str = name.encode('utf-8')
    fout.write(struct.pack("iii", n_dims, len(str), l_type))
    for i in range(n_dims):
        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
    fout.write(str)

    # data
    data.tofile(fout)

fout.close()

print("Done. Output file: " + fname_out)
print("")
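As a rough cross-check of the header layout the script above writes, a small sketch (assumptions: the converted file exists locally and was produced by this exact script; the path is only an example) can read the fixed-size header fields back in the same order with `struct.unpack`:

```python
import struct

def read_bert_ggml_header(path: str) -> dict:
    # These names mirror the struct.pack("i", ...) calls in the converter,
    # in the exact order they are written.
    names = ["magic", "vocab_size", "max_position_embeddings", "hidden_size",
             "intermediate_size", "num_attention_heads", "num_hidden_layers", "ftype"]
    with open(path, "rb") as f:
        values = struct.unpack("8i", f.read(8 * 4))
    return dict(zip(names, values))

# Hypothetical usage; adjust the path to your converted model.
hdr = read_bert_ggml_header("ggml-model-f16.bin")
assert hdr["magic"] == 0x62657274  # the bytes "bert" packed as a native int
print(hdr)
```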
143 gpt4all-backend/scripts/convert_falcon_hf_to_ggml.py Normal file
@@ -0,0 +1,143 @@
# Based on: https://github.com/KerfuffleV2/ggml-falcon/blob/feat-improve-falcon-convert-hf/examples/falcon/convert-hf-to-ggml.py
# Convert Hugging Face fine-tuned bloom-like models to ggml format
#
# Usage:
#
#   python3 convert_falcon_hf_to_ggml.py model_directory output_directory [use-f32]
#
# This script is similar to "convert-pt-to-ggml.py"
#

import io
import os
import sys
import struct
import json
import code
import torch
import numpy as np
import gc

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig

# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

if len(sys.argv) < 3:
    print("INFO: GGML V1 files produced are meant to be finalized through examples/falcon_quantize which will bring them to latest version and precision of choice")
    print("Usage: python convert_falcon_hf_to_ggml.py model_directory output_directory [use-f32]")
    print("  model_directory: name of the directory and model you convert (it should be a subdirectory)")
    print("  output-directory: directory where the output file will be written")
    print("  use-f32: if present, use float32 instead of float16 (f32 is recommended)")
    sys.exit(1)

# num_parts = int(sys.argv[1])
dir_model = sys.argv[1] # name and dir of model
dir_out = sys.argv[2]   # output directory

# make sure the output directory exists
os.makedirs(dir_out, exist_ok=True)

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1
if len(sys.argv) > 3:
    ftype = 0

tokenizer = AutoTokenizer.from_pretrained(dir_model)
# print(tokenizer)
config = AutoConfig.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(dir_model, trust_remote_code=True, torch_dtype=torch.bfloat16, low_cpu_mem_usage=True)
hparams = config.to_dict()

n_head = hparams["n_head"]
n_head_kv = hparams["n_head_kv"] if "n_head_kv" in hparams else 1
head_dim = hparams["hidden_size"] // n_head
print("* Loading model from: ", dir_model)

fname_out = dir_out + f"/ggml-model-{dir_model.split('/')[-1]}-{ftype_str[ftype]}.bin"
fout = open(fname_out, "wb")
fout.write(struct.pack("i", 0x67676a74)) # magic: ggmf in hex (version 1) - possibly change to ggfc ?
fout.write(struct.pack("i", 1)) # version
fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["hidden_size"]))
fout.write(struct.pack("i", n_head))
fout.write(struct.pack("i", n_head_kv))
fout.write(struct.pack("i", hparams["n_layer"]))
fout.write(struct.pack("i", 40 if "n_head_kv" in hparams else 7)) # obsolete field that breaks ggml compatibility - todo again remove one day
fout.write(struct.pack("i", ftype))

reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
byte_encoder = bytes_to_unicode()
byte_decoder = {v:k for k, v in byte_encoder.items()}

for i in range(hparams["vocab_size"]):
    text = bytearray([byte_decoder[c] for c in reverse_vocab[i]])
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
    fout.write(struct.pack("f", 0.0)) # falcon uses bpe on RefinedWeb - no probability scores used

model = model.state_dict()
for name in model.keys():
    src = name
    # The original query_key_value tensor contains n_head_kv "kv groups",
    # each consisting of n_head/n_head_kv query weights followed by one key
    # and one value weight (shared by all query heads in the kv group).
    # This layout makes it a big pain to work with in GGML.
    # So we rearrange them here, so that we have n_head query weights
    # followed by n_head_kv key weights followed by n_head_kv value weights,
    # in contiguous fashion.

    if "query_key_value" in src:
        qkv = model[src].view(
            n_head_kv, n_head // n_head_kv + 2, head_dim, head_dim * n_head)

        q = qkv[:, :-2 ].reshape(n_head * head_dim, head_dim * n_head)
        k = qkv[:, [-2]].reshape(n_head_kv * head_dim, head_dim * n_head)
        v = qkv[:, [-1]].reshape(n_head_kv * head_dim, head_dim * n_head)

        model[src] = torch.cat((q,k,v)).reshape_as(model[src])
    data = model[src].squeeze()
    n_dims = len(data.shape)
    # default type is fp32
    ftype_cur = 1 if ftype == 1 and n_dims > 1 else 0
    data = data.to(dtype = torch.float16 if ftype_cur == 1 else torch.float32).numpy()
    print(f'  |', name, data.shape, '->', data.dtype)
    # header
    str = name.encode('utf-8')
    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
    for i in range(n_dims):
        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
    fout.write(str)

    # data
    data.tofile(fout)

fout.close()

print("Done. Output file: " + fname_out)
print("")
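The comment block inside the conversion loop above describes regrouping the fused `query_key_value` weight into contiguous query, key, and value blocks. The following toy sketch (made-up dimensions, not tied to any real Falcon checkpoint) exercises the same slicing to confirm the shapes work out:

```python
# Toy shape check of the query_key_value rearrangement, with invented sizes:
# n_head=8 query heads, n_head_kv=2 kv groups, head_dim=4.
import torch

n_head, n_head_kv, head_dim = 8, 2, 4
hidden = n_head * head_dim

w = torch.arange((n_head + 2 * n_head_kv) * head_dim * hidden, dtype=torch.float32)
w = w.reshape((n_head + 2 * n_head_kv) * head_dim, hidden)

qkv = w.view(n_head_kv, n_head // n_head_kv + 2, head_dim, hidden)
q = qkv[:, :-2 ].reshape(n_head * head_dim, hidden)
k = qkv[:, [-2]].reshape(n_head_kv * head_dim, hidden)
v = qkv[:, [-1]].reshape(n_head_kv * head_dim, hidden)
out = torch.cat((q, k, v)).reshape_as(w)

# Same rows, just regrouped: all query rows first, then key rows, then value rows.
assert out.shape == w.shape
assert torch.equal(out[:n_head * head_dim], q)
print(out.shape)
```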
145 gpt4all-backend/scripts/convert_mpt_hf_to_ggml.py Normal file
@@ -0,0 +1,145 @@
# Convert Hugging Face fine-tuned bloom-like models to ggml format
#
# Usage:
#
#   python3 models/convert-h5-to-ggml.py
#
# This script is similar to "convert-pt-to-ggml.py"
#

import io
import os
import sys
import struct
import json
import code
import torch
import numpy as np

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BloomForCausalLM

# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
def bytes_to_unicode():
    """
    Returns list of utf-8 byte and a corresponding list of unicode strings.
    The reversible bpe codes work on unicode strings.
    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
    This is a significant percentage of your normal, say, 32K bpe vocab.
    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
    And avoids mapping to whitespace/control characters the bpe code barfs on.
    """
    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
    cs = bs[:]
    n = 0
    for b in range(2**8):
        if b not in bs:
            bs.append(b)
            cs.append(2**8+n)
            n += 1
    cs = [chr(n) for n in cs]
    return dict(zip(bs, cs))

if len(sys.argv) < 3:
    print("Usage: python convert-hf-to-ggml.py model_name dir-output [use-f32]")
    print("  model_name: name of the model to convert. Example: 'bigscience/bloomz-560m'")
    print("  dir-output: directory where the output file will be written")
    print("  use-f32: if present, use float32 instead of float16")
    sys.exit(1)

model_name = sys.argv[1]
dir_out = sys.argv[2]

# make sure the output directory exists
os.makedirs(dir_out, exist_ok=True)

# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]
ftype = 1
if len(sys.argv) > 3:
    ftype = 0

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
hparams = config.to_dict()
print("Loading model: ", model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, config=config, torch_dtype=torch.float16 if ftype == 1 else torch.float32, low_cpu_mem_usage=True)
print("Model loaded: ", model_name)


fname_out = dir_out + f"/ggml-model-{model_name.split('/')[-1]}-{ftype_str[ftype]}.bin"
fout = open(fname_out, "wb")
vocab = tokenizer.vocab

hparams["multiple_of"] = 1
fout.write(struct.pack("I", 0x67676d6d)) # magic: ggml in hex
fout.write(struct.pack("I", model.config.vocab_size))
fout.write(struct.pack("I", model.config.max_seq_len))
fout.write(struct.pack("I", model.config.n_layers))
fout.write(struct.pack("I", model.config.n_heads))
fout.write(struct.pack("I", model.config.d_model))
fout.write(struct.pack("f", model.config.attn_config['alibi_bias_max']))
clip_qkv = model.config.attn_config['clip_qkv']
fout.write(struct.pack("f", clip_qkv if clip_qkv is not None else 0))
fout.write(struct.pack("I", ftype))

# # Is this correct??
# dot_token = tokenizer.encode(".")[0]
# write tokens to ggml file
dot_token = tokenizer.encode('.')[0]
fout.write(struct.pack("I", model.config.vocab_size))

for i in range(model.config.vocab_size):
    text = tokenizer.decode([dot_token, i]).encode('utf-8')
    # remove the first byte (it's always '.')
    text = text[1:]
    enclen = len(text)
    if i in tokenizer.all_special_ids:
        print(f"special token: {text}")
        enclen = enclen | 1<<31
    fout.write(struct.pack("I", enclen))
    fout.write(text)

list_vars = model.state_dict()
for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()
    print("Processing variable: " + name + " with shape: ", data.shape)

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if ftype != 0:
        # Keep token embeddings in fp32
        if name[-7:] == ".weight" and n_dims == 2 and ".wte" not in name:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    # header
    str = name.encode('utf-8')
    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
    for i in range(n_dims):
        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
    fout.write(str)

    # data
    data.tofile(fout)

fout.close()

print("Done. Output file: " + fname_out)
print("")
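The MPT converter above flags special tokens by setting the high bit of the stored length (`enclen | 1 << 31`). A tiny sketch (hypothetical values only, not part of the loader) of how a reader might split that field back apart:

```python
# Decode a token-length field written as `enclen | 1 << 31` for special tokens.
def decode_token_length(enclen: int) -> tuple[int, bool]:
    is_special = bool(enclen & (1 << 31))
    return enclen & ~(1 << 31), is_special

print(decode_token_length(5))             # (5, False)
print(decode_token_length(5 | 1 << 31))   # (5, True)
```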
113 gpt4all-backend/scripts/convert_replit_hf_to_ggml.py Normal file
@@ -0,0 +1,113 @@
from pathlib import Path
import sys
import struct
import json
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
import sentencepiece.sentencepiece_model_pb2 as model

if len(sys.argv) < 3:
    print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
    print("  ftype == 0 -> float32")
    print("  ftype == 1 -> float16")
    sys.exit(1)


# output in the same directory as the model
dir_model = sys.argv[1]
fname_out = sys.argv[1] + "/ggml-replit-code-v1-3b.bin"


with open(dir_model + "/config.json", "r", encoding="utf-8") as f:
    hparams = json.load(f)

sp_proto = model.ModelProto()
sp_proto.ParseFromString(open(Path(sys.argv[1]) / "spiece.model", "rb").read())


# possible data types
#   ftype == 0 -> float32
#   ftype == 1 -> float16
#
# map from ftype to string
ftype_str = ["f32", "f16"]

ftype = 1
if len(sys.argv) > 2:
    ftype = int(sys.argv[2])
    if ftype < 0 or ftype > 1:
        print("Invalid ftype: " + str(ftype))
        sys.exit(1)
    fname_out = sys.argv[1] + "/ggml-replit-code-v1-3b-" + ftype_str[ftype] + ".bin"


tokenizer = AutoTokenizer.from_pretrained(dir_model, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    dir_model, low_cpu_mem_usage=True, trust_remote_code=True
)
# print (model)

# print(tokenizer.encode('I believe the meaning of life is'))

list_vars = model.state_dict()
for name in list_vars.keys():
    print(name, list_vars[name].shape, list_vars[name].dtype)

fout = open(fname_out, "wb")

print(hparams)

fout.write(struct.pack("i", 0x7265706c)) # magic: repl in hex
fout.write(struct.pack("i", hparams["vocab_size"]))
fout.write(struct.pack("i", hparams["max_seq_len"]))
fout.write(struct.pack("i", hparams["d_model"]))
fout.write(struct.pack("i", hparams["n_heads"]))
fout.write(struct.pack("i", hparams["n_layers"]))
fout.write(struct.pack("i", ftype))


# TODO: temporary hack to not deal with implementing the tokenizer
for piece in sp_proto.pieces:
    encoded_piece = piece.piece.encode("utf-8")
    fout.write(struct.pack("i", len(encoded_piece)))
    fout.write(encoded_piece)
    fout.write(struct.pack("f", piece.score))


for name in list_vars.keys():
    data = list_vars[name].squeeze().numpy()
    print("Processing variable: " + name + " with shape: ", data.shape)

    n_dims = len(data.shape)

    # ftype == 0 -> float32, ftype == 1 -> float16
    ftype_cur = 0
    if ftype != 0:
        if name[-7:] == ".weight" and n_dims == 2:
            print("  Converting to float16")
            data = data.astype(np.float16)
            ftype_cur = 1
        else:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0
    else:
        if data.dtype != np.float32:
            print("  Converting to float32")
            data = data.astype(np.float32)
            ftype_cur = 0

    # header
    str = name.encode("utf-8")
    fout.write(struct.pack("iii", n_dims, len(str), ftype_cur))
    for i in range(n_dims):
        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
    fout.write(str)

    # data
    data.tofile(fout)

fout.close()

print("Done. Output file: " + fname_out)
print("")
@@ -1,73 +0,0 @@
#include "dlhandle.h"

#include <string>

#ifndef _WIN32
#   include <dlfcn.h>
#else
#   include <cassert>
#   include <sstream>
#   define WIN32_LEAN_AND_MEAN
#   ifndef NOMINMAX
#       define NOMINMAX
#   endif
#   include <windows.h>
#endif

using namespace std::string_literals;
namespace fs = std::filesystem;


#ifndef _WIN32

Dlhandle::Dlhandle(const fs::path &fpath)
{
    chandle = dlopen(fpath.c_str(), RTLD_LAZY | RTLD_LOCAL);
    if (!chandle) {
        throw Exception("dlopen: "s + dlerror());
    }
}

Dlhandle::~Dlhandle()
{
    if (chandle) dlclose(chandle);
}

void *Dlhandle::get_internal(const char *symbol) const
{
    return dlsym(chandle, symbol);
}

#else // defined(_WIN32)

Dlhandle::Dlhandle(const fs::path &fpath)
{
    fs::path afpath = fs::absolute(fpath);

    // Suppress the "Entry Point Not Found" dialog, caused by outdated nvcuda.dll from the GPU driver
    UINT lastErrorMode = GetErrorMode();
    SetErrorMode(lastErrorMode | SEM_FAILCRITICALERRORS);

    chandle = LoadLibraryExW(afpath.c_str(), NULL, LOAD_LIBRARY_SEARCH_DEFAULT_DIRS | LOAD_LIBRARY_SEARCH_DLL_LOAD_DIR);

    SetErrorMode(lastErrorMode);

    if (!chandle) {
        DWORD err = GetLastError();
        std::ostringstream ss;
        ss << "LoadLibraryExW failed with error 0x" << std::hex << err;
        throw Exception(ss.str());
    }
}

Dlhandle::~Dlhandle()
{
    if (chandle) FreeLibrary(HMODULE(chandle));
}

void *Dlhandle::get_internal(const char *symbol) const
{
    return GetProcAddress(HMODULE(chandle), symbol);
}

#endif // defined(_WIN32)
@@ -1,47 +0,0 @@
#pragma once

#include <filesystem>
#include <stdexcept>
#include <string>
#include <utility>

namespace fs = std::filesystem;


class Dlhandle {
    void *chandle = nullptr;

public:
    class Exception : public std::runtime_error {
    public:
        using std::runtime_error::runtime_error;
    };

    Dlhandle() = default;
    Dlhandle(const fs::path &fpath);
    Dlhandle(const Dlhandle &o) = delete;
    Dlhandle(Dlhandle &&o)
        : chandle(o.chandle)
    {
        o.chandle = nullptr;
    }

    ~Dlhandle();

    Dlhandle &operator=(Dlhandle &&o) {
        chandle = std::exchange(o.chandle, nullptr);
        return *this;
    }

    template <typename T>
    T *get(const std::string &symbol) const {
        return reinterpret_cast<T *>(get_internal(symbol.c_str()));
    }

    auto get_fnc(const std::string &symbol) const {
        return get<void*(...)>(symbol);
    }

private:
    void *get_internal(const char *symbol) const;
};
File diff suppressed because it is too large
Load Diff
@@ -1,84 +0,0 @@
#ifndef LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#error This file is NOT meant to be included outside of llamamodel.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define LLAMAMODEL_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#endif
#ifndef LLAMAMODEL_H
#define LLAMAMODEL_H

#include "llmodel.h"

#include <memory>
#include <span>
#include <string>
#include <string_view>
#include <vector>
#include <unordered_map>

struct LLamaPrivate;
struct EmbModelSpec;

class LLamaModel : public LLModel {
public:
    LLamaModel();
    ~LLamaModel();

    bool supportsEmbedding() const override { return m_supportsEmbedding; }
    bool supportsCompletion() const override { return m_supportsCompletion; }
    bool loadModel(const std::string &modelPath, int n_ctx, int ngl) override;
    bool isModelBlacklisted(const std::string &modelPath) const override;
    bool isEmbeddingModel(const std::string &modelPath) const override;
    bool isModelLoaded() const override;
    size_t requiredMem(const std::string &modelPath, int n_ctx, int ngl) override;
    size_t stateSize() const override;
    size_t saveState(std::span<uint8_t> stateOut, std::vector<Token> &inputTokensOut) const override;
    size_t restoreState(std::span<const uint8_t> state, std::span<const Token> inputTokens) override;
    void setThreadCount(int32_t n_threads) override;
    int32_t threadCount() const override;
    std::vector<GPUDevice> availableGPUDevices(size_t memoryRequired = 0) const override;
    bool initializeGPUDevice(size_t memoryRequired, const std::string &name) const override;
    bool initializeGPUDevice(int device, std::string *unavail_reason = nullptr) const override;
    bool usingGPUDevice() const override;
    const char *backendName() const override;
    const char *gpuDeviceName() const override;

    size_t embeddingSize() const override;
    // user-specified prefix
    void embed(const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix,
               int dimensionality = -1, size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false,
               EmbedCancelCallback *cancelCb = nullptr) override;
    // automatic prefix
    void embed(const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality = -1,
               size_t *tokenCount = nullptr, bool doMean = true, bool atlas = false) override;

    int32_t contextLength() const override;
    auto specialTokens() -> std::unordered_map<std::string, std::string> const override;

protected:
    std::vector<Token> tokenize(std::string_view str) const override;
    bool isSpecialToken(Token id) const override;
    std::string tokenToString(Token id) const override;
    void initSampler(const PromptContext &ctx) override;
    Token sampleToken() const override;
    bool evalTokens(int32_t nPast, std::span<const Token> tokens) const override;
    void shiftContext(const PromptContext &promptCtx, int32_t *nPast) override;
    int32_t inputLength() const override;
    int32_t computeModelInputPosition(std::span<const Token> input) const override;
    void setModelInputPosition(int32_t pos) override;
    void appendInputToken(Token tok) override;
    std::span<const Token> inputTokens() const override;
    const std::vector<Token> &endTokens() const override;
    bool shouldAddBOS() const override;
    int32_t maxContextLength(std::string const &modelPath) const override;
    int32_t layerCount(std::string const &modelPath) const override;
    auto chatTemplate(const char *modelPath) const -> std::expected<std::string, std::string> override;

    void embedInternal(const std::vector<std::string> &texts, float *embeddings, std::string prefix, int dimensionality,
                       size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb,
                       const EmbModelSpec *spec);

private:
    std::unique_ptr<LLamaPrivate> d_ptr;
    bool m_supportsEmbedding = false;
    bool m_supportsCompletion = false;
};

#endif // LLAMAMODEL_H
@@ -1,358 +0,0 @@
#include "llmodel.h"

#include "dlhandle.h"

#include <cassert>
#include <cstdlib>
#include <filesystem>
#include <fstream>
#include <iostream>
#include <iterator>
#include <memory>
#include <optional>
#include <regex>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

#ifdef _WIN32
#   define WIN32_LEAN_AND_MEAN
#   ifndef NOMINMAX
#       define NOMINMAX
#   endif
#   include <windows.h>
#endif

#ifdef _MSC_VER
#   include <intrin.h>
#endif

#if defined(__APPLE__) && defined(__aarch64__)
#   include "sysinfo.h" // for getSystemTotalRAMInBytes
#endif

namespace fs = std::filesystem;

#ifndef __APPLE__
static const std::string DEFAULT_BACKENDS[] = {"kompute", "cpu"};
#elif defined(__aarch64__)
static const std::string DEFAULT_BACKENDS[] = {"metal", "cpu"};
#else
static const std::string DEFAULT_BACKENDS[] = {"cpu"};
#endif

std::string s_implementations_search_path = ".";

#if !(defined(__x86_64__) || defined(_M_X64))
    // irrelevant on non-x86_64
    #define cpu_supports_avx()  -1
    #define cpu_supports_avx2() -1
#elif defined(_MSC_VER)
    // MSVC
    static int get_cpu_info(int func_id, int reg_id) {
        int info[4];
        __cpuid(info, func_id);
        return info[reg_id];
    }

    // AVX via EAX=1: Processor Info and Feature Bits, bit 28 of ECX
    #define cpu_supports_avx()  !!(get_cpu_info(1, 2) & (1 << 28))
    // AVX2 via EAX=7, ECX=0: Extended Features, bit 5 of EBX
    #define cpu_supports_avx2() !!(get_cpu_info(7, 1) & (1 << 5))
#else
    // gcc/clang
    #define cpu_supports_avx()  !!__builtin_cpu_supports("avx")
    #define cpu_supports_avx2() !!__builtin_cpu_supports("avx2")
#endif

LLModel::Implementation::Implementation(Dlhandle &&dlhandle_)
    : m_dlhandle(new Dlhandle(std::move(dlhandle_))) {
    auto get_model_type = m_dlhandle->get<const char *()>("get_model_type");
    assert(get_model_type);
    m_modelType = get_model_type();
    auto get_build_variant = m_dlhandle->get<const char *()>("get_build_variant");
    assert(get_build_variant);
    m_buildVariant = get_build_variant();
    m_getFileArch = m_dlhandle->get<char *(const char *)>("get_file_arch");
    assert(m_getFileArch);
    m_isArchSupported = m_dlhandle->get<bool(const char *)>("is_arch_supported");
    assert(m_isArchSupported);
    m_construct = m_dlhandle->get<LLModel *()>("construct");
    assert(m_construct);
}

LLModel::Implementation::Implementation(Implementation &&o)
    : m_getFileArch(o.m_getFileArch)
    , m_isArchSupported(o.m_isArchSupported)
    , m_construct(o.m_construct)
    , m_modelType(o.m_modelType)
    , m_buildVariant(o.m_buildVariant)
    , m_dlhandle(o.m_dlhandle) {
    o.m_dlhandle = nullptr;
}

LLModel::Implementation::~Implementation()
{
    delete m_dlhandle;
}

static bool isImplementation(const Dlhandle &dl)
{
    return dl.get<bool(uint32_t)>("is_g4a_backend_model_implementation");
}

// Add the CUDA Toolkit to the DLL search path on Windows.
// This is necessary for chat.exe to find CUDA when started from Qt Creator.
static void addCudaSearchPath()
{
#ifdef _WIN32
    if (const auto *cudaPath = _wgetenv(L"CUDA_PATH")) {
        auto libDir = std::wstring(cudaPath) + L"\\bin";
        if (!AddDllDirectory(libDir.c_str())) {
            auto err = GetLastError();
            std::wcerr << L"AddDllDirectory(\"" << libDir << L"\") failed with error 0x" << std::hex << err << L"\n";
        }
    }
#endif
}

const std::vector<LLModel::Implementation> &LLModel::Implementation::implementationList()
{
    if (cpu_supports_avx() == 0) {
        throw std::runtime_error("CPU does not support AVX");
    }

    // NOTE: allocated on heap so we leak intentionally on exit so we have a chance to clean up the
    // individual models without the cleanup of the static list interfering
    static auto* libs = new std::vector<Implementation>([] () {
        std::vector<Implementation> fres;

        addCudaSearchPath();

        std::string impl_name_re = "llamamodel-mainline-(cpu|metal|kompute|vulkan|cuda)";
        if (cpu_supports_avx2() == 0) {
            impl_name_re += "-avxonly";
        }
        std::regex re(impl_name_re);
        auto search_in_directory = [&](const std::string& paths) {
            std::stringstream ss(paths);
            std::string path;
            // Split the paths string by the delimiter and process each path.
            while (std::getline(ss, path, ';')) {
                fs::directory_iterator iter;
                try {
                    iter = fs::directory_iterator(std::u8string(path.begin(), path.end()));
                } catch (const fs::filesystem_error &) {
                    continue; // skip nonexistent path
                }
                // Iterate over all libraries
                for (const auto &f : iter) {
                    const fs::path &p = f.path();

                    if (p.extension() != LIB_FILE_EXT) continue;
                    if (!std::regex_search(p.stem().string(), re)) continue;

                    // Add to list if model implementation
                    Dlhandle dl;
                    try {
                        dl = Dlhandle(p);
                    } catch (const Dlhandle::Exception &e) {
                        std::cerr << "Failed to load " << p.filename().string() << ": " << e.what() << "\n";
                        continue;
                    }
                    if (!isImplementation(dl)) {
                        std::cerr << "Not an implementation: " << p.filename().string() << "\n";
                        continue;
                    }
                    fres.emplace_back(Implementation(std::move(dl)));
                }
            }
        };

        search_in_directory(s_implementations_search_path);

        return fres;
    }());
    // Return static result
    return *libs;
}

static std::string applyCPUVariant(const std::string &buildVariant)
{
    if (buildVariant != "metal" && cpu_supports_avx2() == 0) {
        return buildVariant + "-avxonly";
    }
    return buildVariant;
}

const LLModel::Implementation* LLModel::Implementation::implementation(const char *fname, const std::string& buildVariant)
{
    bool buildVariantMatched = false;
    std::optional<std::string> archName;
    for (const auto& i : implementationList()) {
        if (buildVariant != i.m_buildVariant) continue;
        buildVariantMatched = true;

        char *arch = i.m_getFileArch(fname);
        if (!arch) continue;
        archName = arch;

        bool archSupported = i.m_isArchSupported(arch);
        free(arch);
        if (archSupported) return &i;
    }

    if (!buildVariantMatched)
        return nullptr;
    if (!archName)
        throw UnsupportedModelError("Unsupported file format");

    throw BadArchError(std::move(*archName));
}

LLModel *LLModel::Implementation::construct(const std::string &modelPath, const std::string &backend, int n_ctx)
{
    std::vector<std::string> desiredBackends;
    if (backend != "auto") {
        desiredBackends.push_back(backend);
    } else {
        desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
    }

    for (const auto &desiredBackend: desiredBackends) {
        const auto *impl = implementation(modelPath.c_str(), applyCPUVariant(desiredBackend));

        if (impl) {
            // Construct llmodel implementation
            auto *fres = impl->m_construct();
            fres->m_implementation = impl;

#if defined(__APPLE__) && defined(__aarch64__) // FIXME: See if metal works for intel macs
            /* TODO(cebtenzzre): after we fix requiredMem, we should change this to happen at
             * load time, not construct time. right now n_ctx is incorrectly hardcoded 2048 in
             * most (all?) places where this is called, causing underestimation of required
             * memory. */
            if (backend == "auto" && desiredBackend == "metal") {
                // on a 16GB M2 Mac a 13B q4_0 (0.52) works for me but a 13B q4_K_M (0.55) does not
                size_t req_mem = fres->requiredMem(modelPath, n_ctx, 100);
                if (req_mem >= size_t(0.53f * getSystemTotalRAMInBytes())) {
                    delete fres;
                    continue;
                }
            }
#else
            (void)n_ctx;
#endif

            return fres;
        }
    }

    throw MissingImplementationError("Could not find any implementations for backend: " + backend);
}

LLModel *LLModel::Implementation::constructGlobalLlama(const std::optional<std::string> &backend)
{
    static std::unordered_map<std::string, std::unique_ptr<LLModel>> implCache;

    const std::vector<Implementation> *impls;
    try {
        impls = &implementationList();
    } catch (const std::runtime_error &e) {
        std::cerr << __func__ << ": implementationList failed: " << e.what() << "\n";
        return nullptr;
    }

    std::vector<std::string> desiredBackends;
    if (backend) {
        desiredBackends.push_back(backend.value());
    } else {
        desiredBackends.insert(desiredBackends.end(), DEFAULT_BACKENDS, std::end(DEFAULT_BACKENDS));
    }

    const Implementation *impl = nullptr;

    for (const auto &desiredBackend: desiredBackends) {
        auto cacheIt = implCache.find(desiredBackend);
        if (cacheIt != implCache.end())
            return cacheIt->second.get(); // cached

        for (const auto &i: *impls) {
            if (i.m_modelType == "LLaMA" && i.m_buildVariant == applyCPUVariant(desiredBackend)) {
                impl = &i;
                break;
            }
        }

        if (impl) {
            auto *fres = impl->m_construct();
            fres->m_implementation = impl;
            implCache[desiredBackend] = std::unique_ptr<LLModel>(fres);
            return fres;
        }
    }

    std::cerr << __func__ << ": could not find Llama implementation for backend: " << backend.value_or("default") << "\n";
    return nullptr;
}

std::vector<LLModel::GPUDevice> LLModel::Implementation::availableGPUDevices(size_t memoryRequired)
{
    std::vector<LLModel::GPUDevice> devices;
#ifndef __APPLE__
    static const std::string backends[] = {"kompute", "cuda"};
    for (const auto &backend: backends) {
        auto *llama = constructGlobalLlama(backend);
        if (llama) {
            auto backendDevs = llama->availableGPUDevices(memoryRequired);
            devices.insert(devices.end(), backendDevs.begin(), backendDevs.end());
        }
    }
#endif
    return devices;
}

int32_t LLModel::Implementation::maxContextLength(const std::string &modelPath)
{
    auto *llama = constructGlobalLlama();
    return llama ? llama->maxContextLength(modelPath) : -1;
}

int32_t LLModel::Implementation::layerCount(const std::string &modelPath)
{
    auto *llama = constructGlobalLlama();
    return llama ? llama->layerCount(modelPath) : -1;
}

bool LLModel::Implementation::isEmbeddingModel(const std::string &modelPath)
{
    auto *llama = constructGlobalLlama();
    return llama && llama->isEmbeddingModel(modelPath);
}

auto LLModel::Implementation::chatTemplate(const char *modelPath) -> std::expected<std::string, std::string>
{
    auto *llama = constructGlobalLlama();
    return llama ? llama->chatTemplate(modelPath) : std::unexpected("backend not available");
}

void LLModel::Implementation::setImplementationsSearchPath(const std::string& path)
{
    s_implementations_search_path = path;
}

const std::string& LLModel::Implementation::implementationsSearchPath()
{
    return s_implementations_search_path;
}

bool LLModel::Implementation::hasSupportedCPU()
{
    return cpu_supports_avx() != 0;
}

int LLModel::Implementation::cpuSupportsAVX2()
{
    return cpu_supports_avx2();
}
@@ -1,320 +0,0 @@
#include "llmodel_c.h"

#include "llmodel.h"

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <exception>
#include <iostream>
#include <memory>
#include <optional>
#include <string>
#include <string_view>
#include <vector>
#include <span>

namespace ranges = std::ranges;

static_assert(sizeof(token_t) == sizeof(LLModel::Token));

struct LLModelWrapper {
    LLModel *llModel = nullptr;
    ~LLModelWrapper() { delete llModel; }
};

llmodel_model llmodel_model_create(const char *model_path)
{
    const char *error;
    auto fres = llmodel_model_create2(model_path, "auto", &error);
    if (!fres) {
        fprintf(stderr, "Unable to instantiate model: %s\n", error);
    }
    return fres;
}

static void llmodel_set_error(const char **errptr, const char *message)
{
    thread_local static std::string last_error_message;
    if (errptr) {
        last_error_message = message;
        *errptr = last_error_message.c_str();
    }
}

llmodel_model llmodel_model_create2(const char *model_path, const char *backend, const char **error)
{
    LLModel *llModel;
    try {
        llModel = LLModel::Implementation::construct(model_path, backend);
    } catch (const std::exception& e) {
        llmodel_set_error(error, e.what());
        return nullptr;
    }

    auto wrapper = new LLModelWrapper;
    wrapper->llModel = llModel;
    return wrapper;
}

void llmodel_model_destroy(llmodel_model model)
{
    delete static_cast<LLModelWrapper *>(model);
}

size_t llmodel_required_mem(llmodel_model model, const char *model_path, int n_ctx, int ngl)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->requiredMem(model_path, n_ctx, ngl);
}

bool llmodel_loadModel(llmodel_model model, const char *model_path, int n_ctx, int ngl)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);

    std::string modelPath(model_path);
    if (wrapper->llModel->isModelBlacklisted(modelPath)) {
        size_t slash = modelPath.find_last_of("/\\");
        auto basename = slash == std::string::npos ? modelPath : modelPath.substr(slash + 1);
        std::cerr << "warning: model '" << basename << "' is out-of-date, please check for an updated version\n";
    }
    return wrapper->llModel->loadModel(modelPath, n_ctx, ngl);
}

bool llmodel_isModelLoaded(llmodel_model model)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->isModelLoaded();
}

uint64_t llmodel_state_get_size(llmodel_model model)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->stateSize();
}

uint64_t llmodel_state_get_data(llmodel_model model, uint8_t *state_out, uint64_t state_size,
                                token_t **input_tokens_out, uint64_t *n_input_tokens)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    std::vector<LLModel::Token> inputTokens;
    auto bytesWritten = wrapper->llModel->saveState({state_out, size_t(state_size)}, inputTokens);
    if (bytesWritten) {
        auto *buf = new LLModel::Token[inputTokens.size()];
        ranges::copy(inputTokens, buf);
        *input_tokens_out = buf;
        *n_input_tokens = uint64_t(inputTokens.size());
    } else {
        *input_tokens_out = nullptr;
        *n_input_tokens = 0;
    }
    return bytesWritten;
}

void llmodel_state_free_input_tokens(LLModel::Token *input_tokens)
{
    delete[] input_tokens;
}

uint64_t llmodel_state_set_data(llmodel_model model, const uint8_t *state, uint64_t state_size,
                                const token_t *input_tokens, uint64_t n_input_tokens)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->restoreState({state, size_t(state_size)}, {input_tokens, size_t(n_input_tokens)});
}

bool llmodel_prompt(llmodel_model model,
                    const char *prompt,
                    llmodel_prompt_callback prompt_callback,
                    llmodel_response_callback response_callback,
                    llmodel_prompt_context *ctx,
                    const char **error)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);

    // Copy the C prompt context
    LLModel::PromptContext promptContext {
        .n_predict      = ctx->n_predict,
        .top_k          = ctx->top_k,
        .top_p          = ctx->top_p,
        .min_p          = ctx->min_p,
        .temp           = ctx->temp,
        .n_batch        = ctx->n_batch,
        .repeat_penalty = ctx->repeat_penalty,
        .repeat_last_n  = ctx->repeat_last_n,
        .contextErase   = ctx->context_erase,
    };

    auto prompt_func = [prompt_callback](std::span<const LLModel::Token> token_ids, bool cached) {
        return prompt_callback(token_ids.data(), token_ids.size(), cached);
    };
    auto response_func = [response_callback](LLModel::Token token_id, std::string_view piece) {
        return response_callback(token_id, piece.data());
    };

    // Call the C++ prompt method
    try {
        wrapper->llModel->prompt(prompt, prompt_func, response_func, promptContext);
    } catch (std::exception const &e) {
        llmodel_set_error(error, e.what());
        return false;
    }

    return true;
}

float *llmodel_embed(
    llmodel_model model, const char **texts, size_t *embedding_size, const char *prefix, int dimensionality,
    size_t *token_count, bool do_mean, bool atlas, llmodel_emb_cancel_callback cancel_cb, const char **error
) {
    auto *wrapper = static_cast<LLModelWrapper *>(model);

    if (!texts || !*texts) {
        llmodel_set_error(error, "'texts' is NULL or empty");
        return nullptr;
    }

    std::vector<std::string> textsVec;
    while (*texts) { textsVec.emplace_back(*texts++); }

    size_t embd_size;
    float *embedding;

    try {
        embd_size = wrapper->llModel->embeddingSize();
        if (dimensionality > 0 && dimensionality < int(embd_size))
            embd_size = dimensionality;

        embd_size *= textsVec.size();

        std::optional<std::string> prefixStr;
        if (prefix) { prefixStr = prefix; }

        embedding = new float[embd_size];
        wrapper->llModel->embed(textsVec, embedding, prefixStr, dimensionality, token_count, do_mean, atlas, cancel_cb);
    } catch (std::exception const &e) {
        llmodel_set_error(error, e.what());
        return nullptr;
    }

    *embedding_size = embd_size;
    return embedding;
}

void llmodel_free_embedding(float *ptr)
{
    delete[] ptr;
}

void llmodel_setThreadCount(llmodel_model model, int32_t n_threads)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    wrapper->llModel->setThreadCount(n_threads);
}

int32_t llmodel_threadCount(llmodel_model model)
{
    auto *wrapper = static_cast<LLModelWrapper *>(model);
    return wrapper->llModel->threadCount();
}

void llmodel_set_implementation_search_path(const char *path)
{
    LLModel::Implementation::setImplementationsSearchPath(path);
}

const char *llmodel_get_implementation_search_path()
{
    return LLModel::Implementation::implementationsSearchPath().c_str();
}

// RAII wrapper around a C-style struct
struct llmodel_gpu_device_cpp: llmodel_gpu_device {
    llmodel_gpu_device_cpp() = default;

    llmodel_gpu_device_cpp(const llmodel_gpu_device_cpp &) = delete;
    llmodel_gpu_device_cpp(      llmodel_gpu_device_cpp &&) = delete;

    const llmodel_gpu_device_cpp &operator=(const llmodel_gpu_device_cpp &) = delete;
          llmodel_gpu_device_cpp &operator=(      llmodel_gpu_device_cpp &&) = delete;

    ~llmodel_gpu_device_cpp() {
        free(const_cast<char *>(name));
        free(const_cast<char *>(vendor));
    }
};

static_assert(sizeof(llmodel_gpu_device_cpp) == sizeof(llmodel_gpu_device));

struct llmodel_gpu_device *llmodel_available_gpu_devices(size_t memoryRequired, int *num_devices)
{
    static thread_local std::unique_ptr<llmodel_gpu_device_cpp[]> c_devices;

    auto devices = LLModel::Implementation::availableGPUDevices(memoryRequired);
    *num_devices = devices.size();

    if (devices.empty()) { return nullptr; /* no devices */ }

    c_devices = std::make_unique<llmodel_gpu_device_cpp[]>(devices.size());
    for (unsigned i = 0; i < devices.size(); i++) {
        const auto &dev = devices[i];
        auto &cdev = c_devices[i];
|
|
||||||
cdev.backend = dev.backend;
|
|
||||||
cdev.index = dev.index;
|
|
||||||
cdev.type = dev.type;
|
|
||||||
cdev.heapSize = dev.heapSize;
|
|
||||||
cdev.name = strdup(dev.name.c_str());
|
|
||||||
cdev.vendor = strdup(dev.vendor.c_str());
|
|
||||||
}
|
|
||||||
|
|
||||||
return c_devices.get();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool llmodel_gpu_init_gpu_device_by_string(llmodel_model model, size_t memoryRequired, const char *device)
|
|
||||||
{
|
|
||||||
auto *wrapper = static_cast<LLModelWrapper *>(model);
|
|
||||||
return wrapper->llModel->initializeGPUDevice(memoryRequired, std::string(device));
|
|
||||||
}
|
|
||||||
|
|
||||||
bool llmodel_gpu_init_gpu_device_by_struct(llmodel_model model, const llmodel_gpu_device *device)
|
|
||||||
{
|
|
||||||
auto *wrapper = static_cast<LLModelWrapper *>(model);
|
|
||||||
return wrapper->llModel->initializeGPUDevice(device->index);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool llmodel_gpu_init_gpu_device_by_int(llmodel_model model, int device)
|
|
||||||
{
|
|
||||||
auto *wrapper = static_cast<LLModelWrapper *>(model);
|
|
||||||
return wrapper->llModel->initializeGPUDevice(device);
|
|
||||||
}
|
|
||||||
|
|
||||||
const char *llmodel_model_backend_name(llmodel_model model)
|
|
||||||
{
|
|
||||||
const auto *wrapper = static_cast<LLModelWrapper *>(model);
|
|
||||||
return wrapper->llModel->backendName();
|
|
||||||
}
|
|
||||||
|
|
||||||
const char *llmodel_model_gpu_device_name(llmodel_model model)
|
|
||||||
{
|
|
||||||
const auto *wrapper = static_cast<LLModelWrapper *>(model);
|
|
||||||
return wrapper->llModel->gpuDeviceName();
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t llmodel_count_prompt_tokens(llmodel_model model, const char *prompt, const char **error)
|
|
||||||
{
|
|
||||||
auto *wrapper = static_cast<const LLModelWrapper *>(model);
|
|
||||||
try {
|
|
||||||
return wrapper->llModel->countPromptTokens(prompt);
|
|
||||||
} catch (const std::exception& e) {
|
|
||||||
llmodel_set_error(error, e.what());
|
|
||||||
return -1;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
void llmodel_model_foreach_special_token(llmodel_model model, llmodel_special_token_callback callback)
|
|
||||||
{
|
|
||||||
auto *wrapper = static_cast<const LLModelWrapper *>(model);
|
|
||||||
for (auto &[name, token] : wrapper->llModel->specialTokens())
|
|
||||||
callback(name.c_str(), token.c_str());
|
|
||||||
}
|
|
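For orientation, the sketch below shows how a caller might wire these C entry points together. It is illustrative only and not part of the diff: it assumes a `llmodel_model` handle that has already been created and loaded elsewhere, that the callback typedefs have the shapes implied by the lambdas inside `llmodel_prompt()` above, and that the declarations live in a `llmodel_c.h` header.

```cpp
// Hedged usage sketch for the C API above (not part of the diff).
#include <cstdio>

#include "llmodel_c.h"   // assumed header providing the llmodel_* declarations used here

static bool on_prompt(const token_t *token_ids, size_t n_tokens, bool cached) {
    (void)token_ids; (void)n_tokens; (void)cached;
    return true;                      // returning false would cancel prompt processing
}

static bool on_response(token_t token_id, const char *piece) {
    (void)token_id;
    std::fputs(piece, stdout);        // stream each generated piece to stdout
    return true;                      // returning false stops generation early
}

bool run_prompt(llmodel_model model, const char *prompt) {
    if (!llmodel_isModelLoaded(model))
        return false;

    llmodel_prompt_context ctx {};    // field names taken from the copy code above
    ctx.n_predict      = 128;
    ctx.top_k          = 40;
    ctx.top_p          = 0.9f;
    ctx.min_p          = 0.0f;
    ctx.temp           = 0.7f;
    ctx.n_batch        = 8;
    ctx.repeat_penalty = 1.1f;
    ctx.repeat_last_n  = 64;
    ctx.context_erase  = 0.5f;

    const char *error = nullptr;
    if (!llmodel_prompt(model, prompt, on_prompt, on_response, &ctx, &error)) {
        std::fprintf(stderr, "prompt failed: %s\n", error ? error : "unknown error");
        return false;
    }
    return true;
}
```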
@@ -1,298 +0,0 @@
#include "llmodel.h"

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <optional>
#include <ranges>
#include <stdexcept>
#include <string>
#include <string_view>
#include <vector>

namespace ranges = std::ranges;
namespace views  = std::ranges::views;

void LLModel::prompt(
    std::string_view        prompt,
    const PromptCallback   &promptCallback,
    const ResponseCallback &responseCallback,
    const PromptContext    &promptCtx
) {
    if (!isModelLoaded())
        throw std::invalid_argument("Attempted to prompt an unloaded model.");
    if (!supportsCompletion())
        throw std::invalid_argument("Not a text completion model.");
    if (!promptCtx.n_batch)
        throw std::invalid_argument("Batch size cannot be zero.");
    if (!promptCtx.n_predict)
        return; // nothing requested

    auto embd_inp = tokenize(prompt);
    if (embd_inp.empty())
        throw std::invalid_argument("Prompt tokenized to zero tokens.");

    if (auto res = decodePrompt(promptCallback, promptCtx, std::move(embd_inp)))
        generateResponse(responseCallback, promptCtx, /*n_past*/ *res);
}

int32_t LLModel::countPromptTokens(std::string_view prompt) const
{
    if (!isModelLoaded())
        throw std::invalid_argument("Attempted to tokenize with an unloaded model.");
    return int32_t(tokenize(prompt).size());
}

auto LLModel::decodePrompt(
    const PromptCallback &promptCallback,
    const PromptContext  &promptCtx,
    std::vector<Token>    embd_inp
) -> std::optional<int32_t>
{
    assert(!embd_inp.empty());

    int32_t nCtx = contextLength();
    int32_t n_batch = std::min(promptCtx.n_batch, LLMODEL_MAX_PROMPT_BATCH);

    // Find the greatest n_past where the beginning of embd_inp matches the end of the token cache, starting at the
    // requested n_past.
    // This is used to skip unnecessary work when the prompt shares a common prefix with the previous result.
    int32_t nPast = computeModelInputPosition(embd_inp);

    // always decode up to a full batch before generating, even if cached
    nPast -= std::min(n_batch, nPast);

    // TODO(jared): generalize this to find the smallest new_embd_inp.size() - nPast given the cache
    if (!nPast && int32_t(embd_inp.size()) > nCtx) {
        // no cache hit -> shift the input before even processing

        int32_t nKeep = shouldAddBOS();
        auto newLength = int32_t(nCtx * (1.f - promptCtx.contextErase));
        int32_t nDiscard = int32_t(embd_inp.size()) - std::max(1, std::min(nCtx, newLength));

        // execute the callback even for skipped tokens. this misrepresents the position of BOS but we don't care
        auto discardedTokens = embd_inp | views::drop(nKeep) | views::take(nDiscard);
        if (!promptCallback(discardedTokens, true))
            return std::nullopt;

        // erase nDiscard tokens
        embd_inp.erase(discardedTokens.begin(), discardedTokens.end());
        assert(int32_t(embd_inp.size()) <= nCtx);

        // check the cache again, just in case
        nPast = computeModelInputPosition(embd_inp);
        nPast -= std::min(n_batch, nPast);
    }

    setModelInputPosition(nPast);

    // execute the callback even for skipped tokens
    if (!promptCallback(embd_inp | views::take(nPast), true))
        return std::nullopt;

    // process the prompt in batches
    for (int32_t i = nPast; i < embd_inp.size();) {
        auto batch_end = std::min(i + n_batch, int32_t(embd_inp.size()));
        std::span batch(embd_inp.begin() + i, embd_inp.begin() + batch_end);

        // Check if the context has run out...
        if (nPast + int32_t(batch.size()) > nCtx) {
            shiftContext(promptCtx, &nPast);
            assert(nPast + int32_t(batch.size()) <= nCtx);
        }

        // FIXME(Adam): We should find a way to bubble these strings to the UI level to allow for translation
        if (!evalTokens(nPast, batch))
            throw std::runtime_error("An internal error was encountered during prompt processing.");

        for (auto &tok : batch) {
            appendInputToken(tok);
            nPast++;
            if (!promptCallback({ &tok, 1 }, false))
                return std::nullopt;
        }
        i = batch_end;
    }

    return nPast;
}

/*
 * If string s overlaps with the string key such that some prefix of the key is at the end
 * of the string, return the position in s where the first match starts. Otherwise, return
 * std::string::npos. Examples:
 * s = "bfo", key = "foo" -> 1
 * s = "fooa", key = "foo" -> npos
 */
static std::string::size_type stringsOverlap(const std::string &s, const std::string &key)
{
    if (s.empty() || key.empty())
        throw std::invalid_argument("arguments to stringsOverlap must not be empty");

    for (int start = std::max(0, int(s.size()) - int(key.size())); start < s.size(); start++) {
        if (s.compare(start, s.size(), key, 0, s.size() - start) == 0)
            return start;
    }
    return std::string::npos;
}

void LLModel::generateResponse(
    const ResponseCallback &responseCallback,
    const PromptContext    &promptCtx,
    int32_t                 nPast
) {
    static const char *stopSequences[] {
        "### System", "### Instruction", "### Human", "### User", "### Response", "### Assistant", "### Context",
        "<|im_start|>", "<|im_end|>", "<|endoftext|>",
    };

    initSampler(promptCtx);

    std::string cachedResponse;
    std::vector<Token> cachedTokens;
    int n_predicted = 0;

    // Predict next tokens
    for (bool stop = false; !stop;) {
        // Sample next token
        std::optional<Token> new_tok = sampleToken();
        std::string new_piece = tokenToString(new_tok.value());
        cachedTokens.push_back(new_tok.value());
        cachedResponse += new_piece;

        auto accept = [this, &promptCtx, &new_tok, &nPast] {
            // Shift context if out of space
            if (nPast >= contextLength()) {
                shiftContext(promptCtx, &nPast);
                assert(nPast < contextLength());
            }

            // Accept the token
            Token tok = std::exchange(new_tok, std::nullopt).value();
            if (!evalTokens(nPast, { &tok, 1 }))
                throw std::runtime_error("An internal error was encountered during response generation.");

            appendInputToken(tok);
            nPast++;
        };

        // Check for EOS
        auto lengthLimit = std::string::npos;
        for (const auto token : endTokens()) {
            if (new_tok == token) {
                stop = true;
                lengthLimit = cachedResponse.size() - new_piece.size();
            }
        }

        if (lengthLimit != std::string::npos) {
            // EOS matched
        } else if (!isSpecialToken(new_tok.value())) {
            // Check if the response contains a stop sequence
            for (const auto &p : stopSequences) {
                auto match = cachedResponse.find(p);
                if (match != std::string::npos) stop = true;
                lengthLimit = std::min(lengthLimit, match);
                if (match == 0) break;
            }

            // Check if the response matches the start of a stop sequence
            if (lengthLimit == std::string::npos) {
                for (const auto &p : stopSequences) {
                    auto match = stringsOverlap(cachedResponse, p);
                    lengthLimit = std::min(lengthLimit, match);
                    if (match == 0) break;
                }
            }
        } else if (ranges::find(stopSequences, new_piece) < std::end(stopSequences)) {
            // Special tokens must exactly match a stop sequence
            stop = true;
            lengthLimit = cachedResponse.size() - new_piece.size();
        }

        // Empty the cache, up to the length limit
        std::string::size_type responseLength = 0;
        while (!cachedTokens.empty()) {
            Token tok = cachedTokens.front();
            std::string piece = tokenToString(tok);

            // Stop if the piece (or part of it) does not fit within the length limit
            if (responseLength + (stop ? 1 : piece.size()) > lengthLimit)
                break;

            // Remove token from cache
            assert(cachedResponse.starts_with(piece));
            cachedTokens.erase(cachedTokens.begin(), cachedTokens.begin() + 1);
            cachedResponse.erase(cachedResponse.begin(), cachedResponse.begin() + piece.size());

            // Accept the token, if needed (not cached)
            if (cachedTokens.empty() && new_tok)
                accept();

            // Send the token
            if (!responseCallback(tok, piece) || ++n_predicted >= promptCtx.n_predict) {
                stop = true;
                break;
            }

            // FIXME(jared): we could avoid printing partial stop sequences if we didn't have to
            // output token IDs and could cache a partial token for the next prompt call
            responseLength += piece.size();
        }
        assert(cachedTokens.empty() == cachedResponse.empty());

        // Accept the token, if needed (in cache)
        if (new_tok) {
            assert(!cachedTokens.empty() && cachedTokens.back() == new_tok);
            if (stop) {
                cachedTokens.pop_back();
            } else {
                accept();
            }
        }
    }

    if (inputLength() < cachedTokens.size()) {
        /* This is theoretically possible if the longest stop sequence is greater than
         * n_ctx * contextErase tokens. */
        throw std::runtime_error("shifted too much context, can't go back");
    }

#ifndef NDEBUG
    auto inp = inputTokens();
    auto discard_start = inp.end() - cachedTokens.size();
    assert(std::equal(discard_start, inp.end(), cachedTokens.begin()));
#endif
}

void LLModel::embed(
    const std::vector<std::string> &texts, float *embeddings, std::optional<std::string> prefix, int dimensionality,
    size_t *tokenCount, bool doMean, bool atlas, EmbedCancelCallback *cancelCb
) {
    (void)texts;
    (void)embeddings;
    (void)prefix;
    (void)dimensionality;
    (void)tokenCount;
    (void)doMean;
    (void)atlas;
    (void)cancelCb;
    throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}

void LLModel::embed(
    const std::vector<std::string> &texts, float *embeddings, bool isRetrieval, int dimensionality, size_t *tokenCount,
    bool doMean, bool atlas
) {
    (void)texts;
    (void)embeddings;
    (void)isRetrieval;
    (void)dimensionality;
    (void)tokenCount;
    (void)doMean;
    (void)atlas;
    throw std::logic_error(std::string(implementation().modelType()) + " does not support embeddings");
}
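The comment block above documents `stringsOverlap()` with two worked examples; the generation loop uses it to hold back a partially matched stop sequence instead of streaming it to the user. The standalone sketch below, which is illustrative only and not part of the diff, reimplements that overlap check and exercises both documented cases.

```cpp
// Standalone illustration of the stringsOverlap() behavior described above:
// find where a prefix of `key` begins at the end of `s`.
#include <cassert>
#include <string>

static std::string::size_type overlap(const std::string &s, const std::string &key) {
    size_t start = s.size() > key.size() ? s.size() - key.size() : 0;
    for (; start < s.size(); start++) {
        // compare the tail s[start..] against the same-length prefix of key
        if (key.compare(0, s.size() - start, s, start) == 0)
            return start;
    }
    return std::string::npos;
}

int main() {
    assert(overlap("bfo", "foo") == 1);                  // "fo" is a prefix of "foo"
    assert(overlap("fooa", "foo") == std::string::npos); // no prefix of "foo" ends the string
    // A generation loop can withhold s.substr(overlap(s, key)) until more tokens arrive,
    // so a partially emitted stop sequence is never shown to the user.
}
```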
@@ -1,17 +0,0 @@
#pragma once

#include <cassert>

#ifdef NDEBUG
#  ifdef __has_builtin
#    if __has_builtin(__builtin_unreachable)
#      define UNREACHABLE() __builtin_unreachable()
#    else
#      define UNREACHABLE() do {} while (0)
#    endif
#  else
#    define UNREACHABLE() do {} while (0)
#  endif
#else
#  define UNREACHABLE() assert(!"Unreachable statement was reached")
#endif
gpt4all-backend/starcoder.cpp (new file, 1023 lines): file diff suppressed because it is too large.

gpt4all-backend/starcoder_impl.h (new file, 42 lines)
@@ -0,0 +1,42 @@
#ifndef STARCODER_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#error This file is NOT meant to be included outside of starcoder.cpp. Doing so is DANGEROUS. Be sure to know what you are doing before proceeding to #define STARCODER_H_I_KNOW_WHAT_I_AM_DOING_WHEN_INCLUDING_THIS_FILE
#endif
#ifndef STARCODER_H
#define STARCODER_H

#include <string>
#include <functional>
#include <vector>
#include <memory>
#include "llmodel.h"

struct StarcoderPrivate;
class Starcoder : public LLModel {
public:
    Starcoder();
    ~Starcoder();

    bool supportsEmbedding() const override { return false; }
    bool supportsCompletion() const override { return true; }
    bool loadModel(const std::string &modelPath) override;
    bool isModelLoaded() const override;
    size_t requiredMem(const std::string &modelPath) override;
    size_t stateSize() const override;
    size_t saveState(uint8_t *dest) const override;
    size_t restoreState(const uint8_t *src) override;
    void setThreadCount(int32_t n_threads) override;
    int32_t threadCount() const override;

private:
    std::unique_ptr<StarcoderPrivate> d_ptr;

protected:
    std::vector<Token> tokenize(PromptContext &, const std::string&) const override;
    Token sampleToken(PromptContext &ctx) const override;
    std::string tokenToString(Token) const override;
    bool evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const override;
    int32_t contextLength() const override;
    const std::vector<Token>& endTokens() const override;
};

#endif // STARCODER_H
@@ -2,21 +2,17 @@
 #define SYSINFO_H
 
 #include <fstream>
-#include <iomanip>
-#include <sstream>
 #include <string>
+#include <sstream>
+#include <iomanip>
 
 #if defined(__linux__)
-#   include <unistd.h>
+#include <unistd.h>
 #elif defined(__APPLE__)
-#   include <sys/types.h>
-#   include <sys/sysctl.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
 #elif defined(_WIN32)
-#   define WIN32_LEAN_AND_MEAN
-#   ifndef NOMINMAX
-#       define NOMINMAX
-#   endif
-#   include <windows.h>
+#include <windows.h>
 #endif
 
 static long long getSystemTotalRAMInBytes()
gpt4all-backend/utils.cpp (new file, 328 lines)
@@ -0,0 +1,328 @@
#include "utils.h"

#include <fstream>
#include <regex>

void replace(std::string & str, const std::string & needle, const std::string & replacement) {
    size_t pos = 0;
    while ((pos = str.find(needle, pos)) != std::string::npos) {
        str.replace(pos, needle.length(), replacement);
        pos += replacement.length();
    }
}

std::map<std::string, int32_t> json_parse(const std::string & fname) {
    std::map<std::string, int32_t> result;

    // read file into string
    std::string json;
    {
        std::ifstream ifs(fname);
        if (!ifs) {
            fprintf(stderr, "Failed to open %s\n", fname.c_str());
            exit(1);
        }

        json = std::string((std::istreambuf_iterator<char>(ifs)),
                           (std::istreambuf_iterator<char>()));
    }

    if (json[0] != '{') {
        return result;
    }

    // parse json
    {
        bool has_key  = false;
        bool in_token = false;

        std::string str_key = "";
        std::string str_val = "";

        int n = json.size();
        for (int i = 1; i < n; ++i) {
            if (!in_token) {
                if (json[i] == ' ') continue;
                if (json[i] == '"') {
                    in_token = true;
                    continue;
                }
            } else {
                if (json[i] == '\\' && i+1 < n) {
                    if (has_key == false) {
                        str_key += json[i];
                    } else {
                        str_val += json[i];
                    }
                    ++i;
                } else if (json[i] == '"') {
                    if (has_key == false) {
                        has_key = true;
                        ++i;
                        while (json[i] == ' ') ++i;
                        ++i; // :
                        while (json[i] == ' ') ++i;
                        if (json[i] != '\"') {
                            while (json[i] != ',' && json[i] != '}') {
                                str_val += json[i++];
                            }
                            has_key = false;
                        } else {
                            in_token = true;
                            continue;
                        }
                    } else {
                        has_key = false;
                    }

                    ::replace(str_key, "\\u0120", " " ); // \u0120 -> space
                    ::replace(str_key, "\\u010a", "\n"); // \u010a -> new line
                    ::replace(str_key, "\\\"", "\"");    // \\\"   -> "

                    try {
                        result[str_key] = std::stoi(str_val);
                    } catch (...) {
                        //fprintf(stderr, "%s: ignoring key '%s' with value '%s'\n", fname.c_str(), str_key.c_str(), str_val.c_str());
                    }
                    str_key = "";
                    str_val = "";
                    in_token = false;
                    continue;
                }
                if (has_key == false) {
                    str_key += json[i];
                } else {
                    str_val += json[i];
                }
            }
        }
    }

    return result;
}

std::vector<gpt_vocab::id> gpt_tokenize_inner(const gpt_vocab & vocab, const std::string & text) {
    std::vector<std::string> words;

    // first split the text into words
    {
        std::string str = text;
        std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";

        std::regex re(pat);
        std::smatch m;

        while (std::regex_search(str, m, re)) {
            for (auto x : m) {
                words.push_back(x);
            }
            str = m.suffix();
        }
    }

    // find the longest tokens that form the words:
    std::vector<gpt_vocab::id> tokens;
    for (const auto & word : words) {
        if (word.size() == 0) continue;

        int i = 0;
        int n = word.size();
        while (i < n) {
            int j = n;
            while (j > i) {
                auto it = vocab.token_to_id.find(word.substr(i, j-i));
                if (it != vocab.token_to_id.end()) {
                    tokens.push_back(it->second);
                    i = j;
                    break;
                }
                --j;
            }
            if (i == n) {
                break;
            }
            if (j == i) {
                auto sub = word.substr(i, 1);
                if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
                    tokens.push_back(vocab.token_to_id.at(sub));
                } else {
                    fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
                }
                ++i;
            }
        }
    }

    return tokens;
}

std::string regex_escape(const std::string &s) {
    static const std::regex metacharacters(R"([\.\^\$\-\+\(\)\[\]\{\}\|\?\*])");
    return std::regex_replace(s, metacharacters, "\\$&");
}

std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
    // Generate the subpattern from the special_tokens vector if it's not empty
    if (!vocab.special_tokens.empty()) {
        std::vector<gpt_vocab::id> out;
        std::vector<std::string> chunks;
        std::string str = text;
        std::string special_tokens_subpattern;
        for (const auto &token : vocab.special_tokens) {
            if (!special_tokens_subpattern.empty()) {
                special_tokens_subpattern += "|";
            }
            special_tokens_subpattern += regex_escape(token);
        }
        std::regex re(special_tokens_subpattern);
        std::smatch m;
        while (std::regex_search(str, m, re)) {
            auto tok = vocab.token_to_id.find(m.str());
            if (tok != vocab.token_to_id.end()) {
                auto tokid = tok->second;
                auto pfxtoks = gpt_tokenize_inner(vocab, m.prefix());
                out.insert(out.end(), pfxtoks.begin(), pfxtoks.end());
                out.push_back(tokid);
                str = m.suffix();
            }
        }
        if (!str.empty()) {
            auto tokrest = gpt_tokenize_inner(vocab, str);
            out.insert(out.end(), tokrest.begin(), tokrest.end());
        }
        return out;
    } else {
        return gpt_tokenize_inner(vocab, text);
    }
}

bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab) {
    printf("%s: loading vocab from '%s'\n", __func__, fname.c_str());

    vocab.token_to_id = ::json_parse(fname);

    for (const auto & kv : vocab.token_to_id) {
        vocab.id_to_token[kv.second] = kv.first;
    }

    printf("%s: vocab size = %d\n", __func__, (int) vocab.token_to_id.size());

    // print the vocabulary
    //for (auto kv : vocab.token_to_id) {
    //    printf("'%s' -> %d\n", kv.first.data(), kv.second);
    //}

    return true;
}

gpt_vocab::id gpt_sample_top_k_top_p(
        const size_t actualVocabSize,
        const int32_t * last_n_tokens_data,
        int   last_n_tokens_size,
        const std::vector<float> logits,
        int    top_k,
        double top_p,
        double temp,
        float repeat_penalty,
        std::mt19937 & rng) {
    int n_logits = actualVocabSize;

    const auto last_n_tokens = std::vector<int32_t>(last_n_tokens_data, last_n_tokens_data + last_n_tokens_size);
    const auto * plogits = logits.data();

    if (temp <= 0) {
        // select the token with the highest logit directly
        float max_logit = plogits[0];
        gpt_vocab::id max_id = 0;

        for (int i = 1; i < n_logits; ++i) {
            if (plogits[i] > max_logit) {
                max_logit = plogits[i];
                max_id = i;
            }
        }
        return max_id;
    }
    std::vector<std::pair<double, gpt_vocab::id>> logits_id;
    logits_id.reserve(n_logits);

    {
        const float scale = 1.0f/temp;
        for (int i = 0; i < n_logits; ++i) {
            // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858)
            // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main
            if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) {
                // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability
                if (plogits[i] < 0.0f) {
                    logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i));
                } else {
                    logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i));
                }
            } else {
                logits_id.push_back(std::make_pair(plogits[i]*scale, i));
            }
        }
    }

    // find the top K tokens
    std::partial_sort(
            logits_id.begin(),
            logits_id.begin() + top_k, logits_id.end(),
            [](const std::pair<double, gpt_vocab::id> & a, const std::pair<double, gpt_vocab::id> & b) {
        return a.first > b.first;
    });

    logits_id.resize(top_k);

    double maxl = -INFINITY;
    for (const auto & kv : logits_id) {
        maxl = std::max(maxl, kv.first);
    }

    // compute probs for the top K tokens
    std::vector<double> probs;
    probs.reserve(logits_id.size());

    double sum = 0.0;
    for (const auto & kv : logits_id) {
        double p = exp(kv.first - maxl);
        probs.push_back(p);
        sum += p;
    }

    // normalize the probs
    for (auto & p : probs) {
        p /= sum;
    }

    if (top_p < 1.0f) {
        double cumsum = 0.0f;
        for (int i = 0; i < top_k; i++) {
            cumsum += probs[i];
            if (cumsum >= top_p) {
                top_k = i + 1;
                probs.resize(top_k);
                logits_id.resize(top_k);
                break;
            }
        }

        cumsum = 1.0/cumsum;
        for (int i = 0; i < (int) probs.size(); i++) {
            probs[i] *= cumsum;
        }
    }

    //printf("\n");
    //for (int i = 0; i < (int) probs.size(); i++) {
    //    printf("%d: '%s' %f\n", i, vocab.id_to_token.at(logits_id[i].second).c_str(), probs[i]);
    //}
    //exit(0);

    std::discrete_distribution<> dist(probs.begin(), probs.end());
    int idx = dist(rng);

    return logits_id[idx].second;
}
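The core of `gpt_tokenize_inner()` above is a greedy longest-match scan over the vocabulary: at each position it tries the longest candidate substring first and only falls back to shorter ones. The following toy sketch is illustrative only; the vocabulary and word are made up and it is not part of the diff.

```cpp
// Toy illustration of greedy longest-match tokenization (not from the diff).
#include <cstdint>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
    // hypothetical vocabulary for demonstration purposes
    std::map<std::string, int32_t> token_to_id {
        {"un", 1}, {"believ", 2}, {"able", 3}, {"a", 4}, {"b", 5}, {"l", 6}, {"e", 7},
    };

    std::string word = "unbelievable";
    std::vector<int32_t> tokens;

    size_t i = 0;
    while (i < word.size()) {
        size_t j = word.size();
        bool matched = false;
        while (j > i) {
            auto it = token_to_id.find(word.substr(i, j - i));
            if (it != token_to_id.end()) {      // longest match at position i wins
                tokens.push_back(it->second);
                i = j;
                matched = true;
                break;
            }
            --j;
        }
        if (!matched) ++i;                      // unknown character: skip it
    }

    for (int32_t id : tokens) std::cout << id << ' ';   // prints: 1 2 3
    std::cout << '\n';
}
```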
gpt4all-backend/utils.h (new file, 97 lines)
@@ -0,0 +1,97 @@
// Various helper functions and utilities

#pragma once

#include <string>
#include <map>
#include <vector>
#include <random>
#include <thread>

//
// General purpose inline functions
//
constexpr inline unsigned long long operator ""_MiB(unsigned long long bytes) {
    return bytes*1024*1024;
}

//
// CLI argument parsing
//

struct gpt_params {
    int32_t seed      = -1;  // RNG seed
    int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
    int32_t n_predict = 200; // new tokens to predict

    // sampling parameters
    int32_t top_k = 40;
    float   top_p = 0.9f;
    float   temp  = 0.9f;

    int32_t n_batch = 8; // batch size for prompt processing

    std::string model = "models/gpt-2-117M/ggml-model.bin"; // model path
    std::string prompt;
};

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

void gpt_print_usage(int argc, char ** argv, const gpt_params & params);

std::string gpt_random_prompt(std::mt19937 & rng);

//
// Vocab utils
//

struct gpt_vocab {
    using id    = int32_t;
    using token = std::string;

    std::map<token, id> token_to_id;
    std::map<id, token> id_to_token;
    std::vector<std::string> special_tokens;

    void add_special_token(const std::string &token) {
        special_tokens.push_back(token);
    }
};

void replace(std::string & str, const std::string & needle, const std::string & replacement);

// poor-man's JSON parsing
std::map<std::string, int32_t> json_parse(const std::string & fname);

// split text into tokens
//
// ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
//
// Regex (Python):
// r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
//
// Regex (C++):
// R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)"
//
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text);

// load the tokens from encoder.json
bool gpt_vocab_init(const std::string & fname, gpt_vocab & vocab);

// sample next token given probabilities for each embedding
//
//   - consider only the top K tokens
//   - from them, consider only the top tokens with cumulative probability > P
//
// TODO: not sure if this implementation is correct
//
gpt_vocab::id gpt_sample_top_k_top_p(
        const size_t actualVocabSize,
        const int32_t * last_n_tokens_data,
        int   last_n_tokens_size,
        const std::vector<float> logits,
        int    top_k,
        double top_p,
        double temp,
        float repeat_penalty,
        std::mt19937 & rng);
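The header comment above summarizes `gpt_sample_top_k_top_p()` as: keep only the top K tokens, then keep only those within cumulative probability P. The toy sketch below walks through that filtering on made-up logits; it is an illustration of the idea, not the backend's implementation.

```cpp
// Toy numeric illustration of top-k / top-p filtering (not from the diff).
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <random>
#include <utility>
#include <vector>

int main() {
    std::vector<float> logits = {2.0f, 1.0f, 0.5f, 0.1f, -1.0f};   // made-up scores
    int top_k = 3;
    double top_p = 0.9;

    // pair each logit with its token id and keep the top_k highest
    std::vector<std::pair<float, int>> scored;
    for (int i = 0; i < (int) logits.size(); ++i) scored.emplace_back(logits[i], i);
    std::partial_sort(scored.begin(), scored.begin() + top_k, scored.end(),
                      [](const auto &a, const auto &b) { return a.first > b.first; });
    scored.resize(top_k);

    // softmax over the surviving logits
    double maxl = scored.front().first, sum = 0.0;
    std::vector<double> probs;
    for (auto &s : scored) { probs.push_back(std::exp(s.first - maxl)); sum += probs.back(); }
    for (auto &p : probs) p /= sum;

    // nucleus (top-p) cut: stop once the cumulative probability reaches top_p
    double cumsum = 0.0;
    size_t keep = probs.size();
    for (size_t i = 0; i < probs.size(); ++i) {
        cumsum += probs[i];
        if (cumsum >= top_p) { keep = i + 1; break; }
    }
    probs.resize(keep);
    scored.resize(keep);

    // renormalize the remaining probabilities and sample one token id
    for (auto &p : probs) p /= cumsum;
    std::mt19937 rng(0);
    std::discrete_distribution<> dist(probs.begin(), probs.end());
    std::printf("sampled token id: %d\n", scored[dist(rng)].second);
}
```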
@@ -1,21 +1,3 @@
-# GPT4All Language Bindings
-
-These are the language bindings for the GPT4All backend. They provide functionality to load GPT4All models (and other llama.cpp models), generate text, and (in the case of the Python bindings) embed text as a vector representation.
-
-See their respective folders for language-specific documentation.
-
-### Languages
-- [Python](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/python) (Nomic official, maintained by [@cebtenzzre](https://github.com/cebtenzzre))
-- [Node.js/Typescript](https://github.com/nomic-ai/gpt4all/tree/main/gpt4all-bindings/typescript) (community, maintained by [@jacoobes](https://github.com/jacoobes) and [@iimez](https://github.com/iimez))
-
-<br/>
-<br/>
-
-<details><summary><b>Archived Bindings</b></summary>
-<br/>
-
-The following bindings have been removed from this repository due to lack of maintenance. If adopted, they can be brought back; feel free to message a developer on Discord if you are interested in maintaining one of them. Below are links to their last available version (not necessarily the last working version).
-- C#: [41c9013f](https://github.com/nomic-ai/gpt4all/tree/41c9013fa46a194b3e4fee6ced1b9d1b65e177ac/gpt4all-bindings/csharp)
-- Java: [41c9013f](https://github.com/nomic-ai/gpt4all/tree/41c9013fa46a194b3e4fee6ced1b9d1b65e177ac/gpt4all-bindings/java)
-- Go: [41c9013f](https://github.com/nomic-ai/gpt4all/tree/41c9013fa46a194b3e4fee6ced1b9d1b65e177ac/gpt4all-bindings/golang)
-</details>
+# GPT4All Bindings
+This directory will contain language specific bindings on top of the C/C++ model backends.
+We will have one directory per language binding (e.g. Python, Typescript, Golang, etc.).
@@ -2,7 +2,8 @@
 
 GPT4All on the command-line.
 
-More details on the [wiki](https://github.com/nomic-ai/gpt4all/wiki/Python-CLI).
+## Documentation
+<https://docs.gpt4all.io/gpt4all_cli.html>
 
 ## Quickstart
 
@@ -33,11 +34,11 @@ python -m pip install --user --upgrade gpt4all typer
 # run the CLI
 python app.py repl
 ```
-By default, it will automatically download the `Mistral Instruct` model to `.cache/gpt4all/` in your
-user directory, if necessary.
+By default, it will automatically download the `groovy` model to `.cache/gpt4all/` in your user
+directory, if necessary.
 
 If you have already saved a model beforehand, specify its path with the `-m`/`--model` argument,
 for example:
 ```shell
-python app.py repl --model /home/user/my-gpt4all-models/mistral-7b-instruct-v0.1.Q4_0.gguf
+python app.py repl --model /home/user/my-gpt4all-models/GPT4All-13B-snoozy.ggmlv3.q4_0.bin
 ```
gpt4all-bindings/cli/app.py (22 changed lines, mode changed: executable file → normal file)
@@ -1,17 +1,16 @@
-#!/usr/bin/env python3
 """GPT4All CLI
 
 The GPT4All CLI is a self-contained script based on the `gpt4all` and `typer` packages. It offers a
 REPL to communicate with a language model similar to the chat GUI application, but more basic.
 """
 
-import importlib.metadata
 import io
+import pkg_resources  # should be present as a dependency of gpt4all
 import sys
+import typer
 
 from collections import namedtuple
 from typing_extensions import Annotated
 
-import typer
 from gpt4all import GPT4All
 
 
@@ -54,18 +53,14 @@ def repl(
     model: Annotated[
         str,
         typer.Option("--model", "-m", help="Model to use for chatbot"),
-    ] = "mistral-7b-instruct-v0.1.Q4_0.gguf",
+    ] = "ggml-gpt4all-j-v1.3-groovy",
     n_threads: Annotated[
         int,
         typer.Option("--n-threads", "-t", help="Number of threads to use for chatbot"),
     ] = None,
-    device: Annotated[
-        str,
-        typer.Option("--device", "-d", help="Device to use for chatbot, e.g. gpu, amd, nvidia, intel. Defaults to CPU."),
-    ] = None,
 ):
     """The CLI read-eval-print loop."""
-    gpt4all_instance = GPT4All(model, device=device)
+    gpt4all_instance = GPT4All(model)
 
     # if threads are passed, set them
     if n_threads is not None:
@@ -84,7 +79,7 @@ def repl(
 
     use_new_loop = False
     try:
-        version = importlib.metadata.version('gpt4all')
+        version = pkg_resources.Environment()['gpt4all'][0].version
         version_major = int(version.split('.')[0])
         if version_major >= 1:
             use_new_loop = True
@@ -113,11 +108,13 @@ def _old_loop(gpt4all_instance):
     full_response = gpt4all_instance.chat_completion(
         MESSAGES,
         # preferential kwargs for chat ux
+        logits_size=0,
+        tokens_size=0,
         n_past=0,
+        n_ctx=0,
         n_predict=200,
         top_k=40,
         top_p=0.9,
-        min_p=0.0,
         temp=0.9,
         n_batch=9,
         repeat_penalty=1.1,
@@ -154,7 +151,6 @@ def _new_loop(gpt4all_instance):
         temp=0.9,
         top_k=40,
         top_p=0.9,
-        min_p=0.0,
         repeat_penalty=1.1,
         repeat_last_n=64,
         n_batch=9,
gpt4all-bindings/csharp/.editorconfig (new file, 346 lines)
@@ -0,0 +1,346 @@
|
|||||||
|
# EditorConfig is awesome: https://EditorConfig.org
|
||||||
|
|
||||||
|
# top-most EditorConfig file
|
||||||
|
root = true
|
||||||
|
|
||||||
|
# Don't use tabs for indentation.
|
||||||
|
[*]
|
||||||
|
indent_style = space
|
||||||
|
# (Please don't specify an indent_size here; that has too many unintended consequences.)
|
||||||
|
|
||||||
|
# Code files
|
||||||
|
[*.{cs,csx,vb,vbx}]
|
||||||
|
indent_size = 4
|
||||||
|
insert_final_newline = true
|
||||||
|
charset = utf-8-bom
|
||||||
|
|
||||||
|
# XML project files
|
||||||
|
[*.{csproj,vbproj,vcxproj,vcxproj.filters,proj,projitems,shproj}]
|
||||||
|
indent_size = 4
|
||||||
|
|
||||||
|
# XML config files
|
||||||
|
[*.{props,targets,ruleset,config,nuspec,resx,vsixmanifest,vsct}]
|
||||||
|
indent_size = 2
|
||||||
|
|
||||||
|
# JSON files
|
||||||
|
[*.json]
|
||||||
|
indent_size = 2
|
||||||
|
|
||||||
|
# Powershell files
|
||||||
|
[*.ps1]
|
||||||
|
indent_size = 2
|
||||||
|
|
||||||
|
# Shell script files
|
||||||
|
[*.sh]
|
||||||
|
end_of_line = lf
|
||||||
|
indent_size = 2
|
||||||
|
insert_final_newline = true
|
||||||
|
|
||||||
|
# Dotnet code style settings:
|
||||||
|
[*.{cs,vb}]
|
||||||
|
|
||||||
|
# IDE0055: Fix formatting
|
||||||
|
dotnet_diagnostic.IDE0055.severity = error
|
||||||
|
|
||||||
|
# Sort using and Import directives with System.* appearing first
|
||||||
|
dotnet_sort_system_directives_first = true
|
||||||
|
dotnet_separate_import_directive_groups = false
|
||||||
|
|
||||||
|
# Avoid "this." and "Me." if not necessary
|
||||||
|
dotnet_style_qualification_for_field = false:suggestion
|
||||||
|
dotnet_style_qualification_for_property = false:suggestion
|
||||||
|
dotnet_style_qualification_for_method = false:suggestion
|
||||||
|
dotnet_style_qualification_for_event = false:suggestion
|
||||||
|
|
||||||
|
# Use language keywords instead of framework type names for type references
|
||||||
|
dotnet_style_predefined_type_for_locals_parameters_members = true:warning
|
||||||
|
dotnet_style_predefined_type_for_member_access = true:warning
|
||||||
|
|
||||||
|
# Suggest more modern language features when available
|
||||||
|
dotnet_style_object_initializer = true:suggestion
|
||||||
|
dotnet_style_collection_initializer = true:suggestion
|
||||||
|
dotnet_style_coalesce_expression = true:suggestion
|
||||||
|
dotnet_style_null_propagation = true:suggestion
|
||||||
|
dotnet_style_explicit_tuple_names = true:suggestion
|
||||||
|
|
||||||
|
# Whitespace options
|
||||||
|
dotnet_style_allow_multiple_blank_lines_experimental = false
|
||||||
|
|
||||||
|
# Private fields are camelCase with '_' prefix
|
||||||
|
dotnet_naming_rule.private_members_with_underscore.symbols = private_fields
|
||||||
|
dotnet_naming_rule.private_members_with_underscore.style = prefix_underscore
|
||||||
|
dotnet_naming_rule.private_members_with_underscore.severity = error
|
||||||
|
dotnet_naming_symbols.private_fields.applicable_kinds = field
|
||||||
|
dotnet_naming_symbols.private_fields.applicable_accessibilities = private
|
||||||
|
dotnet_naming_style.prefix_underscore.capitalization = camel_case
|
||||||
|
dotnet_naming_style.prefix_underscore.required_prefix = _
|
||||||
|
|
||||||
|
# Non-private static fields are PascalCase
|
||||||
|
dotnet_naming_rule.non_private_static_fields_should_be_pascal_case.severity = suggestion
|
||||||
|
dotnet_naming_rule.non_private_static_fields_should_be_pascal_case.symbols = non_private_static_fields
|
||||||
|
dotnet_naming_rule.non_private_static_fields_should_be_pascal_case.style = non_private_static_field_style
|
||||||
|
|
||||||
|
dotnet_naming_symbols.non_private_static_fields.applicable_kinds = field
|
||||||
|
dotnet_naming_symbols.non_private_static_fields.applicable_accessibilities = public, protected, internal, protected_internal, private_protected
|
||||||
|
dotnet_naming_symbols.non_private_static_fields.required_modifiers = static
|
||||||
|
|
||||||
|
dotnet_naming_style.non_private_static_field_style.capitalization = pascal_case
|
||||||
|
|
||||||
|
# Non-private readonly fields are PascalCase
|
||||||
|
dotnet_naming_rule.non_private_readonly_fields_should_be_pascal_case.severity = suggestion
|
||||||
|
dotnet_naming_rule.non_private_readonly_fields_should_be_pascal_case.symbols = non_private_readonly_fields
|
||||||
|
dotnet_naming_rule.non_private_readonly_fields_should_be_pascal_case.style = non_private_static_field_style
|
||||||
|
|
||||||
|
dotnet_naming_symbols.non_private_readonly_fields.applicable_kinds = field
|
||||||
|
dotnet_naming_symbols.non_private_readonly_fields.applicable_accessibilities = public, protected, internal, protected_internal, private_protected
|
||||||
|
dotnet_naming_symbols.non_private_readonly_fields.required_modifiers = readonly
|
||||||
|
|
||||||
|
dotnet_naming_style.non_private_readonly_field_style.capitalization = pascal_case
|
||||||
|
|
||||||
|
# Constants are PascalCase
|
||||||
|
dotnet_naming_rule.constants_should_be_pascal_case.severity = suggestion
|
||||||
|
dotnet_naming_rule.constants_should_be_pascal_case.symbols = constants
|
||||||
|
dotnet_naming_rule.constants_should_be_pascal_case.style = non_private_static_field_style
|
||||||
|
|
||||||
|
dotnet_naming_symbols.constants.applicable_kinds = field, local
|
||||||
|
dotnet_naming_symbols.constants.required_modifiers = const
|
||||||
|
|
||||||
|
dotnet_naming_style.constant_style.capitalization = pascal_case
|
||||||
|
|
||||||
|
# Static fields are camelCase and start with s_
|
||||||
|
dotnet_naming_rule.static_fields_should_be_camel_case.severity = none
|
||||||
|
dotnet_naming_rule.static_fields_should_be_camel_case.symbols = static_fields
|
||||||
|
dotnet_naming_rule.static_fields_should_be_camel_case.style = static_field_style
|
||||||
|
|
||||||
|
dotnet_naming_symbols.static_fields.applicable_kinds = field
|
||||||
|
dotnet_naming_symbols.static_fields.required_modifiers = static
|
||||||
|
|
||||||
|
dotnet_naming_style.static_field_style.capitalization = camel_case
|
||||||
|
dotnet_naming_style.static_field_style.required_prefix = s_
|
||||||
|
|
||||||
|
# Instance fields are camelCase and start with _
|
||||||
|
dotnet_naming_rule.instance_fields_should_be_camel_case.severity = none
|
||||||
|
dotnet_naming_rule.instance_fields_should_be_camel_case.symbols = instance_fields
|
||||||
|
dotnet_naming_rule.instance_fields_should_be_camel_case.style = instance_field_style
|
||||||
|
|
||||||
|
dotnet_naming_symbols.instance_fields.applicable_kinds = field
|
||||||
|
|
||||||
|
dotnet_naming_style.instance_field_style.capitalization = camel_case
|
||||||
|
dotnet_naming_style.instance_field_style.required_prefix = _
|
||||||
|
|
||||||
|
# Locals and parameters are camelCase
|
||||||
|
dotnet_naming_rule.locals_should_be_camel_case.severity = suggestion
|
||||||
|
dotnet_naming_rule.locals_should_be_camel_case.symbols = locals_and_parameters
|
||||||
|
dotnet_naming_rule.locals_should_be_camel_case.style = camel_case_style
|
||||||
|
|
||||||
|
dotnet_naming_symbols.locals_and_parameters.applicable_kinds = parameter, local
|
||||||
|
|
||||||
|
dotnet_naming_style.camel_case_style.capitalization = camel_case
|
||||||
|
|
||||||
|
# Local functions are PascalCase
|
||||||
|
dotnet_naming_rule.local_functions_should_be_pascal_case.severity = suggestion
|
||||||
|
dotnet_naming_rule.local_functions_should_be_pascal_case.symbols = local_functions
|
||||||
|
dotnet_naming_rule.local_functions_should_be_pascal_case.style = non_private_static_field_style
|
||||||
|
|
||||||
|
dotnet_naming_symbols.local_functions.applicable_kinds = local_function
|
||||||
|
|
||||||
|
dotnet_naming_style.local_function_style.capitalization = pascal_case
|
||||||
|
|
||||||
|
# By default, name items with PascalCase
|
||||||
|
dotnet_naming_rule.members_should_be_pascal_case.severity = suggestion
|
||||||
|
dotnet_naming_rule.members_should_be_pascal_case.symbols = all_members
|
||||||
|
dotnet_naming_rule.members_should_be_pascal_case.style = non_private_static_field_style
|
||||||
|
|
||||||
|
dotnet_naming_symbols.all_members.applicable_kinds = *
|
||||||
|
|
||||||
|
dotnet_naming_style.pascal_case_style.capitalization = pascal_case
|
||||||
|
|
||||||
|
# error RS2008: Enable analyzer release tracking for the analyzer project containing rule '{0}'
|
||||||
|
dotnet_diagnostic.RS2008.severity = none
|
||||||
|
|
||||||
|
# IDE0073: File header
|
||||||
|
dotnet_diagnostic.IDE0073.severity = none
|
||||||
|
#file_header_template = Licensed to the .NET Foundation under one or more agreements.\nThe .NET Foundation licenses this file to you under the MIT license.\nSee the LICENSE file in the project root for more information.
|
||||||
|
|
||||||
|
# IDE0035: Remove unreachable code
|
||||||
|
dotnet_diagnostic.IDE0035.severity = warning
|
||||||
|
|
||||||
|
# IDE0036: Order modifiers
|
||||||
|
dotnet_diagnostic.IDE0036.severity = warning
|
||||||
|
|
||||||
|
# IDE0043: Format string contains invalid placeholder
|
||||||
|
dotnet_diagnostic.IDE0043.severity = warning
|
||||||
|
|
||||||
|
# IDE0044: Make field readonly
|
||||||
|
dotnet_diagnostic.IDE0044.severity = warning
|
||||||
|
|
||||||
|
# IDE1006: Naming rule violation
|
||||||
|
#dotnet_diagnostic.IDE1006.severity = none
|
||||||
|
|
||||||
|
# RS0016: Only enable if API files are present
|
||||||
|
dotnet_public_api_analyzer.require_api_files = true
|
||||||
|
dotnet_style_operator_placement_when_wrapping = beginning_of_line
|
||||||
|
tab_width = 4
|
||||||
|
end_of_line = crlf
|
||||||
|
dotnet_style_prefer_is_null_check_over_reference_equality_method = true:suggestion
|
||||||
|
dotnet_style_prefer_auto_properties = true:silent
dotnet_style_prefer_simplified_boolean_expressions = true:suggestion
dotnet_style_prefer_conditional_expression_over_assignment = true:silent
dotnet_style_prefer_conditional_expression_over_return = true:silent
dotnet_style_prefer_inferred_tuple_names = true:suggestion
dotnet_style_prefer_inferred_anonymous_type_member_names = true:suggestion
dotnet_style_prefer_compound_assignment = true:suggestion
dotnet_style_prefer_simplified_interpolation = true:suggestion
dotnet_style_namespace_match_folder = true:suggestion

# CSharp code style settings:
[*.cs]
# Newline settings
csharp_new_line_before_open_brace = all
csharp_new_line_before_else = true
csharp_new_line_before_catch = true
csharp_new_line_before_finally = true
csharp_new_line_before_members_in_object_initializers = true
csharp_new_line_before_members_in_anonymous_types = true
csharp_new_line_between_query_expression_clauses = true

# Indentation preferences
csharp_indent_block_contents = true
csharp_indent_braces = false
csharp_indent_case_contents = true
csharp_indent_case_contents_when_block = true
csharp_indent_switch_labels = true
csharp_indent_labels = flush_left

# Whitespace options
csharp_style_allow_embedded_statements_on_same_line_experimental = false
csharp_style_allow_blank_lines_between_consecutive_braces_experimental = false
csharp_style_allow_blank_line_after_colon_in_constructor_initializer_experimental = false

# Prefer "var" everywhere
csharp_style_var_for_built_in_types = true:suggestion
csharp_style_var_when_type_is_apparent = true:suggestion
csharp_style_var_elsewhere = true:suggestion

# Prefer method-like constructs to have a block body
csharp_style_expression_bodied_methods = false:none
csharp_style_expression_bodied_constructors = false:none
csharp_style_expression_bodied_operators = false:none

# Prefer property-like constructs to have an expression-body
csharp_style_expression_bodied_properties = true:none
csharp_style_expression_bodied_indexers = true:none
csharp_style_expression_bodied_accessors = true:none

# Suggest more modern language features when available
csharp_style_pattern_matching_over_is_with_cast_check = true:suggestion
csharp_style_pattern_matching_over_as_with_null_check = true:suggestion
csharp_style_inlined_variable_declaration = true:suggestion
csharp_style_throw_expression = true:suggestion
csharp_style_conditional_delegate_call = true:suggestion

# Space preferences
csharp_space_after_cast = false
csharp_space_after_colon_in_inheritance_clause = true
csharp_space_after_comma = true
csharp_space_after_dot = false
csharp_space_after_keywords_in_control_flow_statements = true
csharp_space_after_semicolon_in_for_statement = true
csharp_space_around_binary_operators = before_and_after
csharp_space_around_declaration_statements = do_not_ignore
csharp_space_before_colon_in_inheritance_clause = true
csharp_space_before_comma = false
csharp_space_before_dot = false
csharp_space_before_open_square_brackets = false
csharp_space_before_semicolon_in_for_statement = false
csharp_space_between_empty_square_brackets = false
csharp_space_between_method_call_empty_parameter_list_parentheses = false
csharp_space_between_method_call_name_and_opening_parenthesis = false
csharp_space_between_method_call_parameter_list_parentheses = false
csharp_space_between_method_declaration_empty_parameter_list_parentheses = false
csharp_space_between_method_declaration_name_and_open_parenthesis = false
csharp_space_between_method_declaration_parameter_list_parentheses = false
csharp_space_between_parentheses = false
csharp_space_between_square_brackets = false

# Blocks are allowed
csharp_prefer_braces = true:silent
csharp_preserve_single_line_blocks = true
csharp_preserve_single_line_statements = true

# Target-typed new expressions
csharp_style_implicit_object_creation_when_type_is_apparent = true:suggestion

# Currently only enabled for C# due to crash in VB analyzer. VB can be enabled once
# https://github.com/dotnet/roslyn/pull/54259 has been published.
dotnet_style_allow_statement_immediately_after_block_experimental = false
dotnet_diagnostic.RCS0003.severity=warning
dotnet_diagnostic.RCS1036.severity=error
dotnet_diagnostic.IDE0005.severity=warning
dotnet_diagnostic.IDE0007.severity=error
csharp_using_directive_placement = outside_namespace:silent
csharp_prefer_simple_using_statement = true:suggestion
csharp_style_namespace_declarations = block_scoped:silent
csharp_style_expression_bodied_lambdas = true:silent
csharp_style_expression_bodied_local_functions = false:silent
csharp_style_prefer_null_check_over_type_check = true:suggestion
dotnet_diagnostic.RCS1075.severity = suggestion

[src/CodeStyle/**.{cs,vb}]
# warning RS0005: Do not use generic CodeAction.Create to create CodeAction
dotnet_diagnostic.RS0005.severity = none

[src/{Analyzers,CodeStyle,Features,Workspaces,EditorFeatures,VisualStudio}/**/*.{cs,vb}]

# IDE0011: Add braces
csharp_prefer_braces = when_multiline:warning
# NOTE: We need the below severity entry for Add Braces due to https://github.com/dotnet/roslyn/issues/44201
dotnet_diagnostic.IDE0011.severity = warning

# IDE0040: Add accessibility modifiers
dotnet_diagnostic.IDE0040.severity = warning

# CONSIDER: Are IDE0051 and IDE0052 too noisy to be warnings for IDE editing scenarios? Should they be made build-only warnings?
# IDE0051: Remove unused private member
dotnet_diagnostic.IDE0051.severity = warning

# IDE0052: Remove unread private member
dotnet_diagnostic.IDE0052.severity = warning

# IDE0059: Unnecessary assignment to a value
dotnet_diagnostic.IDE0059.severity = warning

# IDE0060: Remove unused parameter
dotnet_diagnostic.IDE0060.severity = warning

# CA1012: Abstract types should not have public constructors
dotnet_diagnostic.CA1012.severity = warning

# CA1822: Make member static
dotnet_diagnostic.CA1822.severity = warning

# Prefer "var" everywhere
dotnet_diagnostic.IDE0007.severity = warning
csharp_style_var_for_built_in_types = true:warning
csharp_style_var_when_type_is_apparent = true:warning
csharp_style_var_elsewhere = true:warning

# dotnet_style_allow_multiple_blank_lines_experimental
dotnet_diagnostic.IDE2000.severity = warning

# csharp_style_allow_embedded_statements_on_same_line_experimental
dotnet_diagnostic.IDE2001.severity = warning

# csharp_style_allow_blank_lines_between_consecutive_braces_experimental
dotnet_diagnostic.IDE2002.severity = warning

# dotnet_style_allow_statement_immediately_after_block_experimental
dotnet_diagnostic.IDE2003.severity = warning

# csharp_style_allow_blank_line_after_colon_in_constructor_initializer_experimental
dotnet_diagnostic.IDE2004.severity = warning

[src/{VisualStudio}/**/*.{cs,vb}]
# CA1822: Make member static
# There is a risk of accidentally breaking an internal API that partners rely on through IVT.
dotnet_code_quality.CA1822.api_surface = private
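For orientation only (this note and the snippet below are not part of the diff): a minimal, hypothetical C# sketch of the style these settings encourage, such as "var" everywhere, expression-bodied properties, and pattern matching instead of an "as" cast plus null check. The type and member names are invented for illustration.

using System.Collections.Generic;

public class TokenBuffer
{
    private readonly List<string> _tokens = new();

    // Expression-bodied property (csharp_style_expression_bodied_properties = true).
    public int Count => _tokens.Count;

    public void Add(object item)
    {
        // Pattern matching rather than 'as' + null check
        // (csharp_style_pattern_matching_over_as_with_null_check = true).
        if (item is string token)
        {
            // 'var' preferred for built-in and apparent types.
            var trimmed = token.Trim();
            _tokens.Add(trimmed);
        }
    }
}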
379 gpt4all-bindings/csharp/.gitignore vendored Normal file
@@ -0,0 +1,379 @@
## Ignore Visual Studio temporary files, build results, and
## files generated by popular Visual Studio add-ons.
##
## Get latest from https://github.com/github/gitignore/blob/master/VisualStudio.gitignore

runtimes
**/*nuget

*.zip
include/
*.exp
*.lib
*.dll

# User-specific files
*.rsuser
*.suo
*.user
*.userosscache
*.sln.docstates

# User-specific files (MonoDevelop/Xamarin Studio)
*.userprefs

# Mono auto generated files
mono_crash.*
Tests/**/launchSettings.json

# Build results
[Dd]ebug/
[Dd]ebugPublic/
[Rr]elease/
[Rr]eleases/
x64/
x86/
[Ww][Ii][Nn]32/
[Aa][Rr][Mm]/
[Aa][Rr][Mm]64/
bld/
[Bb]in/
[Oo]bj/
[Oo]ut/
[Ll]og/
[Ll]ogs/

# Visual Studio 2015/2017 cache/options directory
.vs/
# Uncomment if you have tasks that create the project's static files in wwwroot
#wwwroot/

# Visual Studio 2017 auto generated files
Generated\ Files/

# MSTest test Results
[Tt]est[Rr]esult*/
[Bb]uild[Ll]og.*

# NUnit
*.VisualState.xml
TestResult.xml
nunit-*.xml

# Build Results of an ATL Project
[Dd]ebugPS/
[Rr]eleasePS/
dlldata.c

# Benchmark Results
BenchmarkDotNet.Artifacts/

# .NET Core
project.lock.json
project.fragment.lock.json
artifacts/

# ASP.NET Scaffolding
ScaffoldingReadMe.txt

# StyleCop
StyleCopReport.xml

# Files built by Visual Studio
*_i.c
*_p.c
*_h.h
*.ilk
*.meta
*.obj
*.iobj
*.pch
*.pdb
*.ipdb
*.pgc
*.pgd
*.rsp
*.sbr
*.tlb
*.tli
*.tlh
*.tmp
*.tmp_proj
*_wpftmp.csproj
*.log
*.vspscc
*.vssscc
.builds
*.pidb
*.svclog
*.scc

# Chutzpah Test files
_Chutzpah*

# Visual C++ cache files
ipch/
*.aps
*.ncb
*.opendb
*.opensdf
*.sdf
*.cachefile
*.VC.db
*.VC.VC.opendb

# Visual Studio profiler
*.psess
*.vsp
*.vspx
*.sap

# Visual Studio Trace Files
*.e2e

# TFS 2012 Local Workspace
$tf/

# Guidance Automation Toolkit
*.gpState

# ReSharper is a .NET coding add-in
_ReSharper*/
*.[Rr]e[Ss]harper
*.DotSettings.user

# TeamCity is a build add-in
_TeamCity*

# DotCover is a Code Coverage Tool
*.dotCover

# AxoCover is a Code Coverage Tool
.axoCover/*
!.axoCover/settings.json

# Coverlet is a free, cross platform Code Coverage Tool
coverage*.json
coverage*.xml
coverage*.info

# Visual Studio code coverage results
*.coverage
*.coveragexml

# NCrunch
_NCrunch_*
.*crunch*.local.xml
nCrunchTemp_*

# MightyMoose
*.mm.*
AutoTest.Net/

# Web workbench (sass)
.sass-cache/

# Installshield output folder
[Ee]xpress/

# DocProject is a documentation generator add-in
DocProject/buildhelp/
DocProject/Help/*.HxT
DocProject/Help/*.HxC
DocProject/Help/*.hhc
DocProject/Help/*.hhk
DocProject/Help/*.hhp
DocProject/Help/Html2
DocProject/Help/html

# Click-Once directory
publish/

# Publish Web Output
*.[Pp]ublish.xml
*.azurePubxml
# Note: Comment the next line if you want to checkin your web deploy settings,
# but database connection strings (with potential passwords) will be unencrypted
*.pubxml
*.publishproj

# Microsoft Azure Web App publish settings. Comment the next line if you want to
# checkin your Azure Web App publish settings, but sensitive information contained
# in these scripts will be unencrypted
PublishScripts/

# NuGet Packages
*.nupkg
# NuGet Symbol Packages
*.snupkg
# The packages folder can be ignored because of Package Restore
**/[Pp]ackages/*
# except build/, which is used as an MSBuild target.
!**/[Pp]ackages/build/
# Uncomment if necessary however generally it will be regenerated when needed
#!**/[Pp]ackages/repositories.config
# NuGet v3's project.json files produces more ignorable files
*.nuget.props
*.nuget.targets

# Microsoft Azure Build Output
csx/
*.build.csdef

# Microsoft Azure Emulator
ecf/
rcf/

# Windows Store app package directories and files
AppPackages/
BundleArtifacts/
Package.StoreAssociation.xml
_pkginfo.txt
*.appx
*.appxbundle
*.appxupload

# Visual Studio cache files
# files ending in .cache can be ignored
*.[Cc]ache
# but keep track of directories ending in .cache
!?*.[Cc]ache/

# Others
ClientBin/
~$*
*~
*.dbmdl
*.dbproj.schemaview
*.jfm
*.pfx
*.publishsettings
orleans.codegen.cs

# Including strong name files can present a security risk
# (https://github.com/github/gitignore/pull/2483#issue-259490424)
#*.snk

# Since there are multiple workflows, uncomment next line to ignore bower_components
# (https://github.com/github/gitignore/pull/1529#issuecomment-104372622)
#bower_components/

# RIA/Silverlight projects
Generated_Code/

# Backup & report files from converting an old project file
# to a newer Visual Studio version. Backup files are not needed,
# because we have git ;-)
_UpgradeReport_Files/
Backup*/
UpgradeLog*.XML
UpgradeLog*.htm
ServiceFabricBackup/
*.rptproj.bak

# SQL Server files
*.mdf
*.ldf
*.ndf

# Business Intelligence projects
*.rdl.data
*.bim.layout
*.bim_*.settings
*.rptproj.rsuser
*- [Bb]ackup.rdl
*- [Bb]ackup ([0-9]).rdl
*- [Bb]ackup ([0-9][0-9]).rdl

# Microsoft Fakes
FakesAssemblies/

# GhostDoc plugin setting file
*.GhostDoc.xml

# Node.js Tools for Visual Studio
.ntvs_analysis.dat
node_modules/

# Visual Studio 6 build log
*.plg

# Visual Studio 6 workspace options file
*.opt

# Visual Studio 6 auto-generated workspace file (contains which files were open etc.)
*.vbw

# Visual Studio LightSwitch build output
**/*.HTMLClient/GeneratedArtifacts
**/*.DesktopClient/GeneratedArtifacts
**/*.DesktopClient/ModelManifest.xml
**/*.Server/GeneratedArtifacts
**/*.Server/ModelManifest.xml
_Pvt_Extensions

# Paket dependency manager
.paket/paket.exe
paket-files/

# FAKE - F# Make
.fake/

# CodeRush personal settings
.cr/personal

# Python Tools for Visual Studio (PTVS)
__pycache__/
*.pyc

# Cake - Uncomment if you are using it
# tools/**
# !tools/packages.config

# Tabs Studio
*.tss

# Telerik's JustMock configuration file
*.jmconfig

# BizTalk build output
*.btp.cs
*.btm.cs
*.odx.cs
*.xsd.cs

# OpenCover UI analysis results
OpenCover/

# Azure Stream Analytics local run output
ASALocalRun/

# MSBuild Binary and Structured Log
*.binlog

# NVidia Nsight GPU debugger configuration file
*.nvuser

# MFractors (Xamarin productivity tool) working folder
.mfractor/

# Local History for Visual Studio
.localhistory/

# BeatPulse healthcheck temp database
healthchecksdb

# Backup folder for Package Reference Convert tool in Visual Studio 2017
MigrationBackup/

# Ionide (cross platform F# VS Code tools) working folder
.ionide/

# Fody - auto-generated XML schema
FodyWeavers.xsd

# JetBrains Rider
.idea

# Visual Studio Code
.vscode
44 gpt4all-bindings/csharp/Directory.Build.props Normal file
@@ -0,0 +1,44 @@
<?xml version="1.0" encoding="utf-8"?>
<Project>

  <PropertyGroup>
    <Company></Company>
    <Copyright></Copyright>
    <NeutralLanguage>en-US</NeutralLanguage>
    <Version>0.6.3-alpha</Version>
    <VersionSuffix>$(VersionSuffix)</VersionSuffix>
    <Version Condition=" '$(VersionSuffix)' != '' ">$(Version)$(VersionSuffix)</Version>
    <TreatWarningsAsErrors>true</TreatWarningsAsErrors>
    <RepositoryUrl></RepositoryUrl>
    <RepositoryType>git</RepositoryType>
    <IncludeSymbols>true</IncludeSymbols>
    <IncludeSource>true</IncludeSource>
    <AnalysisLevel>latest-minimum</AnalysisLevel>
    <EnforceCodeStyleInBuild>true</EnforceCodeStyleInBuild>
  </PropertyGroup>

  <ItemGroup>
    <Using Include="System"/>
  </ItemGroup>

  <PropertyGroup>
    <LangVersion>preview</LangVersion>
    <Features>strict</Features>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="Roslynator.Analyzers" Version="4.2.0">
      <PrivateAssets>all</PrivateAssets>
      <IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
    </PackageReference>
    <PackageReference Include="Roslynator.CodeAnalysis.Analyzers" Version="4.2.0">
      <PrivateAssets>all</PrivateAssets>
      <IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
    </PackageReference>
    <PackageReference Include="Roslynator.Formatting.Analyzers" Version="4.2.0">
      <PrivateAssets>all</PrivateAssets>
      <IncludeAssets>runtime; build; native; contentfiles; analyzers</IncludeAssets>
    </PackageReference>
  </ItemGroup>

</Project>
@@ -0,0 +1,32 @@
<Project Sdk="Microsoft.NET.Sdk">

  <PropertyGroup>
    <OutputType>Exe</OutputType>
    <TargetFramework>net7.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <ItemGroup>
    <ProjectReference Include="..\Gpt4All\Gpt4All.csproj" />
  </ItemGroup>

  <ItemGroup>
    <!-- Windows -->
    <None Include="..\runtimes\win-x64\native\*.dll" Pack="true" PackagePath="runtimes\win-x64\native\%(Filename)%(Extension)" />
    <!-- Linux -->
    <None Include="..\runtimes\linux-x64\native\*.so" Pack="true" PackagePath="runtimes\linux-x64\native\%(Filename)%(Extension)" />
    <!-- MacOS -->
    <None Include="..\runtimes\osx\native\*.dylib" Pack="true" PackagePath="runtimes\osx\native\%(Filename)%(Extension)" />
  </ItemGroup>

  <ItemGroup>
    <!-- Windows -->
    <None Condition="$([MSBuild]::IsOSPlatform('Windows'))" Include="..\runtimes\win-x64\native\*.dll" Visible="False" CopyToOutputDirectory="PreserveNewest" />
    <!-- Linux -->
    <None Condition="$([MSBuild]::IsOSPlatform('Linux'))" Include="..\runtimes\linux-x64\native\*.so" Visible="False" CopyToOutputDirectory="PreserveNewest" />
    <!-- MacOS -->
    <None Condition="$([MSBuild]::IsOSPlatform('OSX'))" Include="..\runtimes\osx\native\*.dylib" Visible="False" CopyToOutputDirectory="PreserveNewest" />
    <Content Condition="$([MSBuild]::IsOSPlatform('OSX'))" Include="..\runtimes\osx\native\*.metal" Visible="False" CopyToOutputDirectory="PreserveNewest" />
  </ItemGroup>
</Project>
22 gpt4all-bindings/csharp/Gpt4All.Samples/Program.cs Normal file
@@ -0,0 +1,22 @@
using Gpt4All;

var modelFactory = new Gpt4AllModelFactory();
if (args.Length < 2)
{
    Console.WriteLine($"Usage: Gpt4All.Samples <model-path> <prompt>");
    return;
}

var modelPath = args[0];
var prompt = args[1];

using var model = modelFactory.LoadModel(modelPath);

var result = await model.GetStreamingPredictionAsync(
    prompt,
    PredictRequestOptions.Defaults);

await foreach (var token in result.GetPredictionStreamingAsync())
{
    Console.Write(token);
}
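For reference (not part of the diff): a sketch of using the same streaming API that Program.cs above demonstrates, but accumulating the streamed tokens into a single string instead of printing them as they arrive. It assumes only the members already shown in the sample (Gpt4AllModelFactory, LoadModel, GetStreamingPredictionAsync, PredictRequestOptions.Defaults, GetPredictionStreamingAsync); the model path and prompt are placeholders.

using System;
using System.Text;
using Gpt4All;

var factory = new Gpt4AllModelFactory();

// Placeholder path; substitute a real model file on disk.
using var model = factory.LoadModel("path/to/model.bin");

var response = new StringBuilder();
var result = await model.GetStreamingPredictionAsync(
    "Name three colors.",
    PredictRequestOptions.Defaults);

// Accumulate streamed tokens instead of writing them to the console one by one.
await foreach (var token in result.GetPredictionStreamingAsync())
{
    response.Append(token);
}

Console.WriteLine(response.ToString());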
9 gpt4all-bindings/csharp/Gpt4All.Tests/Constants.cs Normal file
@@ -0,0 +1,9 @@
namespace Gpt4All.Tests;

public static class Constants
{
    public const string MODELS_BASE_DIR = "../../../models";
    public const string LLAMA_MODEL_PATH = $"{MODELS_BASE_DIR}/ggml-gpt4all-l13b-snoozy.bin";
    public const string GPTJ_MODEL_PATH = $"{MODELS_BASE_DIR}/ggml-gpt4all-j-v1.3-groovy.bin";
    public const string MPT_MODEL_PATH = $"{MODELS_BASE_DIR}/ggml-mpt-7b-chat.bin";
}
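Constants.cs only defines model paths for the test project. As a hedged sketch (it is an assumption that the tests use xUnit; the class and test names here are invented), a smoke test built on these constants and the factory API from the sample might look like:

using Xunit;

namespace Gpt4All.Tests;

public class ModelLoadSmokeTests
{
    [Fact]
    public void CanLoadLlamaModel()
    {
        // Gpt4AllModelFactory and LoadModel are the members shown in the sample above.
        var factory = new Gpt4AllModelFactory();
        using var model = factory.LoadModel(Constants.LLAMA_MODEL_PATH);
        Assert.NotNull(model);
    }
}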
Some files were not shown because too many files have changed in this diff.