Using llama.cpp with Rust

Author: Silas Marvin
Date: 2024-02-22 17:23:46 -10:00
parent 2f2ff81043
commit 418ccb81ff
7 changed files with 72 additions and 63 deletions

Cargo.lock (generated)

@@ -98,7 +98,7 @@ dependencies = [
"bitflags 2.4.2",
"cexpr",
"clang-sys",
"itertools 0.12.1",
"itertools",
"lazy_static",
"lazycell",
"log",
@@ -566,15 +566,6 @@ dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.10"
@@ -628,9 +619,8 @@ checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
[[package]]
name = "llama-cpp-2"
version = "0.1.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "747243ba163eb361f5d6d483a177450240ce6ca70cefcb7c489f6333e9fe4300"
version = "0.1.25"
source = "git+https://github.com/SilasMarvin/llama-cpp-rs?branch=silas-8-metal-on-mac#8c61f584e7aa200581b711147e685821190aa025"
dependencies = [
"llama-cpp-sys-2",
"thiserror",
@@ -639,9 +629,8 @@ dependencies = [
[[package]]
name = "llama-cpp-sys-2"
version = "0.1.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3844a3f833eca795309fec9223a316ced1cebeff9c5cfce5ee760825040d281f"
version = "0.1.25"
source = "git+https://github.com/SilasMarvin/llama-cpp-rs?branch=silas-8-metal-on-mac#8c61f584e7aa200581b711147e685821190aa025"
dependencies = [
"bindgen",
"cc",
@@ -685,8 +674,6 @@ dependencies = [
[[package]]
name = "lsp-server"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "248f65b78f6db5d8e1b1604b4098a28b43d21a8eb1deeca22b1c421b276c7095"
dependencies = [
"crossbeam-channel",
"log",
@@ -1011,7 +998,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
dependencies = [
"either",
"itertools 0.11.0",
"itertools",
"rayon",
]
@@ -1387,7 +1374,7 @@ dependencies = [
"esaxx-rs",
"getrandom",
"indicatif",
"itertools 0.11.0",
"itertools",
"lazy_static",
"log",
"macro_rules_attribute",


@@ -7,7 +7,8 @@ edition = "2021"
[dependencies]
anyhow = "1.0.75"
lsp-server = "0.7.4"
# lsp-server = "0.7.4"
lsp-server = { path = "../rust-analyzer/lib/lsp-server" }
lsp-types = "0.94.1"
ropey = "1.6.1"
serde = "1.0.190"
@@ -18,7 +19,8 @@ tokenizers = "0.14.1"
parking_lot = "0.12.1"
once_cell = "1.19.0"
directories = "5.0.1"
llama-cpp-2 = "0.1.27"
# llama-cpp-2 = "0.1.27"
llama-cpp-2 = { path = "../llama-cpp-rs/llama-cpp-2" }
[features]
default = []


@@ -1,6 +1,6 @@
use anyhow::{Context, Result};
use serde::Deserialize;
use serde_json::Value;
use serde_json::{json, Value};
use std::collections::HashMap;
#[cfg(target_os = "macos")]
@@ -21,7 +21,6 @@ pub enum ValidTransformerBackend {
PostgresML,
}
// TODO: Review this for real lol
#[derive(Clone, Deserialize)]
pub struct FIM {
pub start: String,
@@ -49,6 +48,14 @@ struct ValidMemoryConfiguration {
file_store: Option<Value>,
}
impl Default for ValidMemoryConfiguration {
fn default() -> Self {
Self {
file_store: Some(json!({})),
}
}
}
#[derive(Clone, Deserialize)]
struct ChatMessages {
role: String,
@@ -84,17 +91,52 @@ struct ModelGGUF {
kwargs: Kwargs,
}
impl Default for ModelGGUF {
fn default() -> Self {
Self {
model: Model {
repository: "stabilityai/stable-code-3b".to_string(),
name: Some("stable-code-3b-Q5_K_M.gguf".to_string()),
},
fim: Some(FIM {
start: "<fim_prefix>".to_string(),
middle: "<fim_suffix>".to_string(),
end: "<fim_middle>".to_string(),
}),
max_new_tokens: MaxNewTokens::default(),
chat: None,
kwargs: Kwargs::default(),
}
}
}
#[derive(Clone, Deserialize)]
struct ValidMacTransformerConfiguration {
model_gguf: Option<ModelGGUF>,
}
impl Default for ValidMacTransformerConfiguration {
fn default() -> Self {
Self {
model_gguf: Some(ModelGGUF::default()),
}
}
}
#[derive(Clone, Deserialize)]
struct ValidLinuxTransformerConfiguration {
model_gguf: Option<ModelGGUF>,
}
#[derive(Clone, Deserialize)]
impl Default for ValidLinuxTransformerConfiguration {
fn default() -> Self {
Self {
model_gguf: Some(ModelGGUF::default()),
}
}
}
#[derive(Clone, Deserialize, Default)]
struct ValidConfiguration {
memory: ValidMemoryConfiguration,
#[cfg(target_os = "macos")]
@@ -115,10 +157,11 @@ impl Configuration {
let configuration_args = args
.as_object_mut()
.context("Server configuration must be a JSON object")?
.remove("initializationOptions")
.unwrap_or_default();
let valid_args: ValidConfiguration = serde_json::from_value(configuration_args)?;
// TODO: Make sure they only specified one model or something ya know
.remove("initializationOptions");
let valid_args = match configuration_args {
Some(configuration_args) => serde_json::from_value(configuration_args)?,
None => ValidConfiguration::default(),
};
Ok(Self {
valid_config: valid_args,
})
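The hunk above makes initializationOptions optional: when the editor sends no options, the configuration falls back to ValidConfiguration::default() instead of trying to deserialize an empty value. A minimal, standalone sketch of that fallback pattern, assuming the serde and serde_json crates; SketchConfig and its field are hypothetical stand-ins for the crate's real types:

use serde::Deserialize;
use serde_json::{json, Value};

// Hypothetical stand-in for the real configuration struct.
#[derive(Deserialize, Default, Debug)]
struct SketchConfig {
    #[serde(default)]
    memory: Option<Value>,
}

// Deserialize the options when they are present, otherwise fall back to the defaults.
fn parse(init_options: Option<Value>) -> serde_json::Result<SketchConfig> {
    match init_options {
        Some(args) => serde_json::from_value(args),
        None => Ok(SketchConfig::default()),
    }
}

fn main() {
    let with_options = parse(Some(json!({ "memory": { "file_store": {} } })));
    let without_options = parse(None);
    println!("{:?} / {:?}", with_options, without_options);
}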
@@ -192,7 +235,6 @@ impl Configuration {
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
#[test]
fn custom_mac_gguf_model() {
@@ -239,7 +281,6 @@ mod tests {
]
},
"n_ctx": 2048,
"n_threads": 8,
"n_gpu_layers": 35,
}
},


@@ -47,12 +47,14 @@ impl TransformerBackend for LlamaCPP {
fn do_generate(&self, prompt: &str) -> anyhow::Result<DoGenerateResponse> {
let max_new_tokens = self.configuration.get_max_new_tokens().generation;
unimplemented!()
self.model
.complete(prompt, max_new_tokens)
.map(|generated_text| DoGenerateResponse { generated_text })
}
fn do_generate_stream(
&self,
request: &GenerateStreamRequest,
_request: &GenerateStreamRequest,
) -> anyhow::Result<DoGenerateStreamResponse> {
unimplemented!()
}


@@ -7,21 +7,20 @@ use llama_cpp_2::{
model::{params::LlamaModelParams, AddBos, LlamaModel},
token::data_array::LlamaTokenDataArray,
};
use once_cell::sync::Lazy;
use std::{num::NonZeroU32, path::PathBuf, time::Duration};
use crate::configuration::Kwargs;
static BACKEND: Lazy<LlamaBackend> = Lazy::new(|| LlamaBackend::init().unwrap());
pub struct Model {
backend: LlamaBackend,
model: LlamaModel,
n_ctx: NonZeroU32,
}
impl Model {
pub fn new(model_path: PathBuf, kwargs: &Kwargs) -> anyhow::Result<Self> {
// Init the backend
let backend = LlamaBackend::init()?;
// Get n_gpu_layers if set in kwargs
// As a default we set it to 1000, which should put all layers on the GPU
let n_gpu_layers = kwargs
@@ -43,7 +42,7 @@ impl Model {
// Load the model
eprintln!("SETTING MODEL AT PATH: {:?}", model_path);
let model = LlamaModel::load_from_file(&backend, model_path, &model_params)?;
let model = LlamaModel::load_from_file(&BACKEND, model_path, &model_params)?;
eprintln!("\nMODEL SET\n");
// Get n_ctx if set in kwargs
@@ -58,11 +57,7 @@ impl Model {
.unwrap_or_else(|| Ok(NonZeroU32::new(2048)))?
.context("n_ctx must not be zero")?;
Ok(Model {
backend,
model,
n_ctx,
})
Ok(Model { model, n_ctx })
}
pub fn complete(&self, prompt: &str, max_new_tokens: usize) -> anyhow::Result<String> {
@@ -71,7 +66,7 @@ impl Model {
let mut ctx = self
.model
.new_context(&self.backend, ctx_params)
.new_context(&BACKEND, ctx_params)
.with_context(|| "unable to create the llama_context")?;
let tokens_list = self
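
The hunks above drop the per-Model LlamaBackend field and instead initialize the backend once, process-wide, through once_cell::sync::Lazy. A minimal sketch of that pattern, with a hypothetical ExpensiveBackend standing in for LlamaBackend:

use once_cell::sync::Lazy;

// Stand-in for LlamaBackend: assume construction is expensive and should happen at most once.
struct ExpensiveBackend;

impl ExpensiveBackend {
    fn init() -> Self {
        println!("backend initialized");
        ExpensiveBackend
    }
}

// Created lazily on first access, then shared for the lifetime of the process.
static BACKEND: Lazy<ExpensiveBackend> = Lazy::new(ExpensiveBackend::init);

fn main() {
    // `init` runs only once; both accesses see the same instance.
    let _first = &*BACKEND;
    let _second = &*BACKEND;
}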


@@ -142,7 +142,7 @@ impl Worker {
.memory_backend
.lock()
.build_prompt(&request.params.text_document_position)?;
eprintln!("\n\n****************{}***************\n\n", prompt);
eprintln!("\nPROMPT*************\n{}\n************\n", prompt);
let response = self.transformer_backend.do_generate(&prompt)?;
let result = GenerateResult {
generated_text: response.generated_text,


@@ -1,18 +0,0 @@
{
"macos": {
"model_gguf": {
"repository": "deepseek-coder-6.7b-base",
"name": "Q4_K_M.gguf",
"fim": false,
"n_ctx": 2048,
"n_threads": 8,
"n_gpu_layers": 35
}
},
"linux": {
"model_gptq": {
"repository": "theblokesomething",
"name": "some q5 or something"
}
}
}