mirror of
				https://github.com/nomic-ai/gpt4all.git
				synced 2025-10-31 00:04:39 -04:00 
			
		
		
		
	Signed-off-by: Adam Treat <treat.adam@gmail.com> Signed-off-by: Jared Van Bortel <jared@nomic.ai> Co-authored-by: Jared Van Bortel <jared@nomic.ai>
		
			
				
	
	
		
			50 lines
		
	
	
		
			918 B
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			50 lines
		
	
	
		
			918 B
		
	
	
	
		
			C++
		
	
	
	
	
	
| #pragma once
 | |
| 
 | |
| #include <ggml.h>
 | |
| 
 | |
| #include <cstddef>
 | |
| #include <cstdint>
 | |
| #include <vector>
 | |
| 
 | |
// Owning, heap-allocated byte buffer.
//
// `addr` is either NULL or points at `size` bytes obtained from new[]; the
// destructor releases it.
// NOTE(review): no copy operations are declared, so the implicit memberwise
// copy would leave two objects deleting the same allocation. Pass this type
// by reference or pointer.
struct llm_buffer {
    uint8_t * addr = NULL; // start of the allocation, NULL when nothing is allocated
    size_t size = 0;       // number of bytes `addr` points at

    // Replaces the current allocation with `size` fresh, uninitialized bytes.
    // The previous contents are discarded, not copied.
    void resize(size_t size) {
        // Free the old block first so old and new allocations are never live
        // at the same time, and reset the members before calling new[]: if the
        // allocation throws, the buffer is left empty instead of holding a
        // freed pointer that the destructor would delete a second time.
        delete[] addr;
        addr = NULL;
        this->size = 0;

        addr = new uint8_t[size];
        this->size = size;
    }

    ~llm_buffer() {
        delete[] addr; // delete[] on NULL is a no-op
    }
};
 | |
| 
 | |
| struct llm_kv_cache {
 | |
|     struct ggml_tensor * k;
 | |
|     struct ggml_tensor * v;
 | |
| 
 | |
|     struct ggml_context * ctx = NULL;
 | |
| 
 | |
|     llm_buffer buf;
 | |
| 
 | |
|     int n; // number of tokens currently in the cache
 | |
| 
 | |
|     ~llm_kv_cache() {
 | |
|         if (ctx) {
 | |
|             ggml_free(ctx);
 | |
|         }
 | |
|     }
 | |
| };
 | |
| 
 | |
| inline void ggml_graph_compute_g4a(llm_buffer& buf, ggml_cgraph * graph, int n_threads)
 | |
| {
 | |
|     struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
 | |
|     if (plan.work_size > 0) {
 | |
|         buf.resize(plan.work_size);
 | |
|         plan.work_data = buf.addr;
 | |
|     }
 | |
|     ggml_graph_compute(graph, &plan);
 | |
| }
 |