Using llama.cpp with Rust

Author: Silas Marvin
Date: 2024-02-22 17:23:46 -10:00
parent 2f2ff81043
commit 418ccb81ff
7 changed files with 72 additions and 63 deletions

Cargo.lock (generated)

@@ -98,7 +98,7 @@ dependencies = [
"bitflags 2.4.2",
"cexpr",
"clang-sys",
"itertools 0.12.1",
"itertools",
"lazy_static",
"lazycell",
"log",
@@ -566,15 +566,6 @@ dependencies = [
"either",
]
[[package]]
name = "itertools"
version = "0.12.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ba291022dbbd398a455acf126c1e341954079855bc60dfdda641363bd6922569"
dependencies = [
"either",
]
[[package]]
name = "itoa"
version = "1.0.10"
@@ -628,9 +619,8 @@ checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
[[package]]
name = "llama-cpp-2"
version = "0.1.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "747243ba163eb361f5d6d483a177450240ce6ca70cefcb7c489f6333e9fe4300"
version = "0.1.25"
source = "git+https://github.com/SilasMarvin/llama-cpp-rs?branch=silas-8-metal-on-mac#8c61f584e7aa200581b711147e685821190aa025"
dependencies = [
"llama-cpp-sys-2",
"thiserror",
@@ -639,9 +629,8 @@ dependencies = [
[[package]]
name = "llama-cpp-sys-2"
version = "0.1.27"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3844a3f833eca795309fec9223a316ced1cebeff9c5cfce5ee760825040d281f"
version = "0.1.25"
source = "git+https://github.com/SilasMarvin/llama-cpp-rs?branch=silas-8-metal-on-mac#8c61f584e7aa200581b711147e685821190aa025"
dependencies = [
"bindgen",
"cc",
@@ -685,8 +674,6 @@ dependencies = [
[[package]]
name = "lsp-server"
version = "0.7.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "248f65b78f6db5d8e1b1604b4098a28b43d21a8eb1deeca22b1c421b276c7095"
dependencies = [
"crossbeam-channel",
"log",
@@ -1011,7 +998,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "059f538b55efd2309c9794130bc149c6a553db90e9d99c2030785c82f0bd7df9"
dependencies = [
"either",
"itertools 0.11.0",
"itertools",
"rayon",
]
@@ -1387,7 +1374,7 @@ dependencies = [
"esaxx-rs",
"getrandom",
"indicatif",
"itertools 0.11.0",
"itertools",
"lazy_static",
"log",
"macro_rules_attribute",


@@ -7,7 +7,8 @@ edition = "2021"
[dependencies]
anyhow = "1.0.75"
lsp-server = "0.7.4"
# lsp-server = "0.7.4"
lsp-server = { path = "../rust-analyzer/lib/lsp-server" }
lsp-types = "0.94.1"
ropey = "1.6.1"
serde = "1.0.190"
@@ -18,7 +19,8 @@ tokenizers = "0.14.1"
parking_lot = "0.12.1"
once_cell = "1.19.0"
directories = "5.0.1"
llama-cpp-2 = "0.1.27"
# llama-cpp-2 = "0.1.27"
llama-cpp-2 = { path = "../llama-cpp-rs/llama-cpp-2" }
[features]
default = []


@@ -1,6 +1,6 @@
use anyhow::{Context, Result};
use serde::Deserialize;
use serde_json::Value;
use serde_json::{json, Value};
use std::collections::HashMap;
#[cfg(target_os = "macos")]
@@ -21,7 +21,6 @@ pub enum ValidTransformerBackend {
PostgresML,
}
// TODO: Review this for real lol
#[derive(Clone, Deserialize)]
pub struct FIM {
pub start: String,
@@ -49,6 +48,14 @@ struct ValidMemoryConfiguration {
file_store: Option<Value>,
}
impl Default for ValidMemoryConfiguration {
fn default() -> Self {
Self {
file_store: Some(json!({})),
}
}
}
#[derive(Clone, Deserialize)]
struct ChatMessages {
role: String,
@@ -84,17 +91,52 @@ struct ModelGGUF {
kwargs: Kwargs,
}
impl Default for ModelGGUF {
fn default() -> Self {
Self {
model: Model {
repository: "stabilityai/stable-code-3b".to_string(),
name: Some("stable-code-3b-Q5_K_M.gguf".to_string()),
},
fim: Some(FIM {
start: "<fim_prefix>".to_string(),
middle: "<fim_suffix>".to_string(),
end: "<fim_middle>".to_string(),
}),
max_new_tokens: MaxNewTokens::default(),
chat: None,
kwargs: Kwargs::default(),
}
}
}
#[derive(Clone, Deserialize)]
struct ValidMacTransformerConfiguration {
model_gguf: Option<ModelGGUF>,
}
impl Default for ValidMacTransformerConfiguration {
fn default() -> Self {
Self {
model_gguf: Some(ModelGGUF::default()),
}
}
}
#[derive(Clone, Deserialize)]
struct ValidLinuxTransformerConfiguration {
model_gguf: Option<ModelGGUF>,
}
#[derive(Clone, Deserialize)]
impl Default for ValidLinuxTransformerConfiguration {
fn default() -> Self {
Self {
model_gguf: Some(ModelGGUF::default()),
}
}
}
#[derive(Clone, Deserialize, Default)]
struct ValidConfiguration {
memory: ValidMemoryConfiguration,
#[cfg(target_os = "macos")]
@@ -115,10 +157,11 @@ impl Configuration {
let configuration_args = args
.as_object_mut()
.context("Server configuration must be a JSON object")?
.remove("initializationOptions")
.unwrap_or_default();
let valid_args: ValidConfiguration = serde_json::from_value(configuration_args)?;
// TODO: Make sure they only specified one model or something ya know
.remove("initializationOptions");
let valid_args = match configuration_args {
Some(configuration_args) => serde_json::from_value(configuration_args)?,
None => ValidConfiguration::default(),
};
Ok(Self {
valid_config: valid_args,
})
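The hunk above makes initializationOptions optional: when the editor sends no options, the configuration falls back to ValidConfiguration::default() instead of trying to deserialize an empty value. A minimal, standalone sketch of that fallback pattern, assuming the serde and serde_json crates; SketchConfig and its field are hypothetical stand-ins for the crate's real types:

use serde::Deserialize;
use serde_json::{json, Value};

// Hypothetical stand-in for the real configuration struct.
#[derive(Deserialize, Default, Debug)]
struct SketchConfig {
    #[serde(default)]
    memory: Option<Value>,
}

// Deserialize the options when they are present, otherwise fall back to the defaults.
fn parse(init_options: Option<Value>) -> serde_json::Result<SketchConfig> {
    match init_options {
        Some(args) => serde_json::from_value(args),
        None => Ok(SketchConfig::default()),
    }
}

fn main() {
    let with_options = parse(Some(json!({ "memory": { "file_store": {} } })));
    let without_options = parse(None);
    println!("{:?} / {:?}", with_options, without_options);
}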
@@ -192,7 +235,6 @@ impl Configuration {
#[cfg(test)]
mod tests {
use super::*;
use serde_json::json;
#[test]
fn custom_mac_gguf_model() {
@@ -239,7 +281,6 @@ mod tests {
]
},
"n_ctx": 2048,
"n_threads": 8,
"n_gpu_layers": 35,
}
},


@@ -47,12 +47,14 @@ impl TransformerBackend for LlamaCPP {
fn do_generate(&self, prompt: &str) -> anyhow::Result<DoGenerateResponse> {
let max_new_tokens = self.configuration.get_max_new_tokens().generation;
unimplemented!()
self.model
.complete(prompt, max_new_tokens)
.map(|generated_text| DoGenerateResponse { generated_text })
}
fn do_generate_stream(
&self,
request: &GenerateStreamRequest,
_request: &GenerateStreamRequest,
) -> anyhow::Result<DoGenerateStreamResponse> {
unimplemented!()
}


@@ -7,21 +7,20 @@ use llama_cpp_2::{
model::{params::LlamaModelParams, AddBos, LlamaModel},
token::data_array::LlamaTokenDataArray,
};
use once_cell::sync::Lazy;
use std::{num::NonZeroU32, path::PathBuf, time::Duration};
use crate::configuration::Kwargs;
static BACKEND: Lazy<LlamaBackend> = Lazy::new(|| LlamaBackend::init().unwrap());
pub struct Model {
backend: LlamaBackend,
model: LlamaModel,
n_ctx: NonZeroU32,
}
impl Model {
pub fn new(model_path: PathBuf, kwargs: &Kwargs) -> anyhow::Result<Self> {
// Init the backend
let backend = LlamaBackend::init()?;
// Get n_gpu_layers if set in kwargs
// As a default we set it to 1000, which should put all layers on the GPU
let n_gpu_layers = kwargs
@@ -43,7 +42,7 @@ impl Model {
// Load the model
eprintln!("SETTING MODEL AT PATH: {:?}", model_path);
let model = LlamaModel::load_from_file(&backend, model_path, &model_params)?;
let model = LlamaModel::load_from_file(&BACKEND, model_path, &model_params)?;
eprintln!("\nMODEL SET\n");
// Get n_ctx if set in kwargs
@@ -58,11 +57,7 @@ impl Model {
.unwrap_or_else(|| Ok(NonZeroU32::new(2048)))?
.context("n_ctx must not be zero")?;
Ok(Model {
backend,
model,
n_ctx,
})
Ok(Model { model, n_ctx })
}
pub fn complete(&self, prompt: &str, max_new_tokens: usize) -> anyhow::Result<String> {
@@ -71,7 +66,7 @@ impl Model {
let mut ctx = self
.model
.new_context(&self.backend, ctx_params)
.new_context(&BACKEND, ctx_params)
.with_context(|| "unable to create the llama_context")?;
let tokens_list = self
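
The hunks above drop the per-Model LlamaBackend field and instead initialize the backend once, process-wide, through once_cell::sync::Lazy. A minimal sketch of that pattern, with a hypothetical ExpensiveBackend standing in for LlamaBackend:

use once_cell::sync::Lazy;

// Stand-in for LlamaBackend: assume construction is expensive and should happen at most once.
struct ExpensiveBackend;

impl ExpensiveBackend {
    fn init() -> Self {
        println!("backend initialized");
        ExpensiveBackend
    }
}

// Created lazily on first access, then shared for the lifetime of the process.
static BACKEND: Lazy<ExpensiveBackend> = Lazy::new(ExpensiveBackend::init);

fn main() {
    // `init` runs only once; both accesses see the same instance.
    let _first = &*BACKEND;
    let _second = &*BACKEND;
}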


@@ -142,7 +142,7 @@ impl Worker {
.memory_backend
.lock()
.build_prompt(&request.params.text_document_position)?;
eprintln!("\n\n****************{}***************\n\n", prompt);
eprintln!("\nPROMPT*************\n{}\n************\n", prompt);
let response = self.transformer_backend.do_generate(&prompt)?;
let result = GenerateResult {
generated_text: response.generated_text,


@@ -1,18 +0,0 @@
{
"macos": {
"model_gguf": {
"repository": "deepseek-coder-6.7b-base",
"name": "Q4_K_M.gguf",
"fim": false,
"n_ctx": 2048,
"n_threads": 8,
"n_gpu_layers": 35
}
},
"linux": {
"model_gptq": {
"repository": "theblokesomething",
"name": "some q5 or something"
}
}
}