mirror of
https://github.com/SilasMarvin/lsp-ai.git
synced 2025-12-22 08:54:25 +01:00
Started the work for crawling and added better code grabbing for the FileStore
This commit is contained in:
1
Cargo.lock
generated
1
Cargo.lock
generated
@@ -1468,6 +1468,7 @@ dependencies = [
|
|||||||
"directories",
|
"directories",
|
||||||
"hf-hub",
|
"hf-hub",
|
||||||
"ignore",
|
"ignore",
|
||||||
|
"indexmap",
|
||||||
"llama-cpp-2",
|
"llama-cpp-2",
|
||||||
"lsp-server",
|
"lsp-server",
|
||||||
"lsp-types",
|
"lsp-types",
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ reqwest = { version = "0.11.25", features = ["blocking", "json"] }
|
|||||||
ignore = "0.4.22"
|
ignore = "0.4.22"
|
||||||
pgml = { path = "submodules/postgresml/pgml-sdks/pgml" }
|
pgml = { path = "submodules/postgresml/pgml-sdks/pgml" }
|
||||||
tokio = { version = "1.36.0", features = ["rt-multi-thread", "time"] }
|
tokio = { version = "1.36.0", features = ["rt-multi-thread", "time"] }
|
||||||
|
indexmap = "2.2.5"
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
default = []
|
default = []
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ const DEFAULT_MAX_GENERATION_TOKENS: usize = 256;
|
|||||||
pub type Kwargs = HashMap<String, Value>;
|
pub type Kwargs = HashMap<String, Value>;
|
||||||
|
|
||||||
pub enum ValidMemoryBackend {
|
pub enum ValidMemoryBackend {
|
||||||
FileStore,
|
FileStore(FileStore),
|
||||||
PostgresML(PostgresML),
|
PostgresML(PostgresML),
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -60,18 +60,24 @@ impl Default for MaxTokens {
|
|||||||
#[derive(Clone, Debug, Deserialize)]
|
#[derive(Clone, Debug, Deserialize)]
|
||||||
pub struct PostgresML {
|
pub struct PostgresML {
|
||||||
pub database_url: Option<String>,
|
pub database_url: Option<String>,
|
||||||
|
pub crawl: bool,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Deserialize, Default)]
|
||||||
|
pub struct FileStore {
|
||||||
|
pub crawl: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Clone, Debug, Deserialize)]
|
#[derive(Clone, Debug, Deserialize)]
|
||||||
struct ValidMemoryConfiguration {
|
struct ValidMemoryConfiguration {
|
||||||
file_store: Option<Value>,
|
file_store: Option<FileStore>,
|
||||||
postgresml: Option<PostgresML>,
|
postgresml: Option<PostgresML>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for ValidMemoryConfiguration {
|
impl Default for ValidMemoryConfiguration {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self {
|
||||||
file_store: Some(json!({})),
|
file_store: Some(FileStore::default()),
|
||||||
postgresml: None,
|
postgresml: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -227,8 +233,9 @@ impl Configuration {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_memory_backend(&self) -> Result<ValidMemoryBackend> {
|
pub fn get_memory_backend(&self) -> Result<ValidMemoryBackend> {
|
||||||
if self.valid_config.memory.file_store.is_some() {
|
// if self.valid_config.memory.file_store.is_some() {
|
||||||
Ok(ValidMemoryBackend::FileStore)
|
if let Some(file_store) = &self.valid_config.memory.file_store {
|
||||||
|
Ok(ValidMemoryBackend::FileStore(file_store.to_owned()))
|
||||||
} else if let Some(postgresml) = &self.valid_config.memory.postgresml {
|
} else if let Some(postgresml) = &self.valid_config.memory.postgresml {
|
||||||
Ok(ValidMemoryBackend::PostgresML(postgresml.to_owned()))
|
Ok(ValidMemoryBackend::PostgresML(postgresml.to_owned()))
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -1,30 +1,92 @@
|
|||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
|
use indexmap::IndexSet;
|
||||||
use lsp_types::TextDocumentPositionParams;
|
use lsp_types::TextDocumentPositionParams;
|
||||||
use ropey::Rope;
|
use ropey::Rope;
|
||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use tracing::instrument;
|
use tracing::instrument;
|
||||||
|
|
||||||
use crate::{configuration::Configuration, utils::tokens_to_estimated_characters};
|
use crate::{
|
||||||
|
configuration::{self, Configuration},
|
||||||
|
utils::tokens_to_estimated_characters,
|
||||||
|
};
|
||||||
|
|
||||||
use super::{MemoryBackend, Prompt, PromptForType};
|
use super::{MemoryBackend, Prompt, PromptForType};
|
||||||
|
|
||||||
pub struct FileStore {
|
pub struct FileStore {
|
||||||
|
crawl: bool,
|
||||||
configuration: Configuration,
|
configuration: Configuration,
|
||||||
file_map: HashMap<String, Rope>,
|
file_map: HashMap<String, Rope>,
|
||||||
|
accessed_files: IndexSet<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO: Put some thought into the crawling here. Do we want to have a crawl option where it tries to crawl through all relevant
|
// TODO: Put some thought into the crawling here. Do we want to have a crawl option where it tries to crawl through all relevant
|
||||||
// files and then when asked for context it loads them in by the most recently accessed? That seems kind of silly honestly, but I could see
|
// files and then when asked for context it loads them in by the most recently accessed? That seems kind of silly honestly, but I could see
|
||||||
// how users who want to use models with massive context lengths would just want their entire project as context for generation tasks
|
// how users who want to use models with massive context lengths would just want their entire project as context for generation tasks
|
||||||
// I'm not sure yet, this is something I need to think through more
|
// I'm not sure yet, this is something I need to think through more
|
||||||
|
|
||||||
|
// Ok here are some more ideas
|
||||||
|
// We take a crawl arg which is a bool of true or false for file_store
|
||||||
|
// If true we crawl until we get to the max_context_length and then we stop crawling
|
||||||
|
// We keep track of the last opened / changed files, and prioritize those when building the context for our llms
|
||||||
|
|
||||||
|
// For memory backends like PostgresML, they will need to take some kind of max_context_length to crawl or something.
|
||||||
|
// In other words, there needs to be some specification for how much they should be crawling because the limiting happens in the vector_recall
|
||||||
impl FileStore {
|
impl FileStore {
|
||||||
pub fn new(configuration: Configuration) -> Self {
|
pub fn new(file_store_config: configuration::FileStore, configuration: Configuration) -> Self {
|
||||||
|
// TODO: maybe crawl
|
||||||
Self {
|
Self {
|
||||||
|
crawl: file_store_config.crawl,
|
||||||
configuration,
|
configuration,
|
||||||
file_map: HashMap::new(),
|
file_map: HashMap::new(),
|
||||||
|
accessed_files: IndexSet::new(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn new_without_crawl(configuration: Configuration) -> Self {
|
||||||
|
Self {
|
||||||
|
crawl: false,
|
||||||
|
configuration,
|
||||||
|
file_map: HashMap::new(),
|
||||||
|
accessed_files: IndexSet::new(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_rope_for_position(
|
||||||
|
&self,
|
||||||
|
position: &TextDocumentPositionParams,
|
||||||
|
characters: usize,
|
||||||
|
) -> anyhow::Result<(Rope, usize)> {
|
||||||
|
// Get the rope and set our initial cursor index
|
||||||
|
let current_document_uri = position.text_document.uri.to_string();
|
||||||
|
let mut rope = self
|
||||||
|
.file_map
|
||||||
|
.get(&current_document_uri)
|
||||||
|
.context("Error file not found")?
|
||||||
|
.clone();
|
||||||
|
let mut cursor_index = rope.line_to_char(position.position.line as usize)
|
||||||
|
+ position.position.character as usize;
|
||||||
|
// Add to our rope if we need to
|
||||||
|
for file in self
|
||||||
|
.accessed_files
|
||||||
|
.iter()
|
||||||
|
.filter(|f| **f != current_document_uri)
|
||||||
|
{
|
||||||
|
let needed = characters.checked_sub(rope.len_chars()).unwrap_or(0);
|
||||||
|
if needed == 0 {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
let r = self.file_map.get(file).context("Error file not found")?;
|
||||||
|
let slice_max = needed.min(r.len_chars());
|
||||||
|
let rope_str_slice = r
|
||||||
|
.get_slice(0..slice_max)
|
||||||
|
.context("Error getting slice")?
|
||||||
|
.to_string();
|
||||||
|
rope.insert(0, &rope_str_slice);
|
||||||
|
cursor_index += slice_max;
|
||||||
|
}
|
||||||
|
Ok((rope, cursor_index))
|
||||||
|
}
|
||||||
|
|
||||||
pub fn get_characters_around_position(
|
pub fn get_characters_around_position(
|
||||||
&self,
|
&self,
|
||||||
position: &TextDocumentPositionParams,
|
position: &TextDocumentPositionParams,
|
||||||
@@ -53,14 +115,7 @@ impl FileStore {
|
|||||||
prompt_for_type: PromptForType,
|
prompt_for_type: PromptForType,
|
||||||
max_context_length: usize,
|
max_context_length: usize,
|
||||||
) -> anyhow::Result<String> {
|
) -> anyhow::Result<String> {
|
||||||
let mut rope = self
|
let (mut rope, cursor_index) = self.get_rope_for_position(position, max_context_length)?;
|
||||||
.file_map
|
|
||||||
.get(position.text_document.uri.as_str())
|
|
||||||
.context("Error file not found")?
|
|
||||||
.clone();
|
|
||||||
|
|
||||||
let cursor_index = rope.line_to_char(position.position.line as usize)
|
|
||||||
+ position.position.character as usize;
|
|
||||||
|
|
||||||
let is_chat_enabled = match prompt_for_type {
|
let is_chat_enabled = match prompt_for_type {
|
||||||
PromptForType::Completion => self
|
PromptForType::Completion => self
|
||||||
@@ -157,8 +212,9 @@ impl MemoryBackend for FileStore {
|
|||||||
params: lsp_types::DidOpenTextDocumentParams,
|
params: lsp_types::DidOpenTextDocumentParams,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let rope = Rope::from_str(&params.text_document.text);
|
let rope = Rope::from_str(&params.text_document.text);
|
||||||
self.file_map
|
let uri = params.text_document.uri.to_string();
|
||||||
.insert(params.text_document.uri.to_string(), rope);
|
self.file_map.insert(uri.clone(), rope);
|
||||||
|
self.accessed_files.shift_insert(0, uri);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -167,9 +223,10 @@ impl MemoryBackend for FileStore {
|
|||||||
&mut self,
|
&mut self,
|
||||||
params: lsp_types::DidChangeTextDocumentParams,
|
params: lsp_types::DidChangeTextDocumentParams,
|
||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
|
let uri = params.text_document.uri.to_string();
|
||||||
let rope = self
|
let rope = self
|
||||||
.file_map
|
.file_map
|
||||||
.get_mut(params.text_document.uri.as_str())
|
.get_mut(&uri)
|
||||||
.context("Error trying to get file that does not exist")?;
|
.context("Error trying to get file that does not exist")?;
|
||||||
for change in params.content_changes {
|
for change in params.content_changes {
|
||||||
// If range is omitted, text is the new text of the document
|
// If range is omitted, text is the new text of the document
|
||||||
@@ -184,6 +241,7 @@ impl MemoryBackend for FileStore {
|
|||||||
*rope = Rope::from_str(&change.text);
|
*rope = Rope::from_str(&change.text);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
self.accessed_files.shift_insert(0, uri);
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -46,9 +46,9 @@ impl TryFrom<Configuration> for Box<dyn MemoryBackend + Send> {
|
|||||||
|
|
||||||
fn try_from(configuration: Configuration) -> Result<Self, Self::Error> {
|
fn try_from(configuration: Configuration) -> Result<Self, Self::Error> {
|
||||||
match configuration.get_memory_backend()? {
|
match configuration.get_memory_backend()? {
|
||||||
ValidMemoryBackend::FileStore => {
|
ValidMemoryBackend::FileStore(file_store_config) => Ok(Box::new(
|
||||||
Ok(Box::new(file_store::FileStore::new(configuration)))
|
file_store::FileStore::new(file_store_config, configuration),
|
||||||
}
|
)),
|
||||||
ValidMemoryBackend::PostgresML(postgresml_config) => Ok(Box::new(
|
ValidMemoryBackend::PostgresML(postgresml_config) => Ok(Box::new(
|
||||||
postgresml::PostgresML::new(postgresml_config, configuration)?,
|
postgresml::PostgresML::new(postgresml_config, configuration)?,
|
||||||
)),
|
)),
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
use std::{
|
use std::{
|
||||||
sync::mpsc::{self, Sender, TryRecvError},
|
sync::mpsc::{self, Sender},
|
||||||
time::Duration,
|
time::Duration,
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -24,6 +24,7 @@ pub struct PostgresML {
|
|||||||
pipeline: Pipeline,
|
pipeline: Pipeline,
|
||||||
runtime: Runtime,
|
runtime: Runtime,
|
||||||
debounce_tx: Sender<String>,
|
debounce_tx: Sender<String>,
|
||||||
|
added_pipeline: bool,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl PostgresML {
|
impl PostgresML {
|
||||||
@@ -31,7 +32,7 @@ impl PostgresML {
|
|||||||
postgresml_config: configuration::PostgresML,
|
postgresml_config: configuration::PostgresML,
|
||||||
configuration: Configuration,
|
configuration: Configuration,
|
||||||
) -> anyhow::Result<Self> {
|
) -> anyhow::Result<Self> {
|
||||||
let file_store = FileStore::new(configuration.clone());
|
let file_store = FileStore::new_without_crawl(configuration.clone());
|
||||||
let database_url = if let Some(database_url) = postgresml_config.database_url {
|
let database_url = if let Some(database_url) = postgresml_config.database_url {
|
||||||
database_url
|
database_url
|
||||||
} else {
|
} else {
|
||||||
@@ -39,7 +40,7 @@ impl PostgresML {
|
|||||||
};
|
};
|
||||||
// TODO: Think on the naming of the collection
|
// TODO: Think on the naming of the collection
|
||||||
// Maybe filter on metadata or I'm not sure
|
// Maybe filter on metadata or I'm not sure
|
||||||
let collection = Collection::new("test-lsp-ai", Some(database_url))?;
|
let collection = Collection::new("test-lsp-ai-2", Some(database_url))?;
|
||||||
// TODO: Review the pipeline
|
// TODO: Review the pipeline
|
||||||
let pipeline = Pipeline::new(
|
let pipeline = Pipeline::new(
|
||||||
"v1",
|
"v1",
|
||||||
@@ -66,15 +67,6 @@ impl PostgresML {
|
|||||||
.worker_threads(2)
|
.worker_threads(2)
|
||||||
.enable_all()
|
.enable_all()
|
||||||
.build()?;
|
.build()?;
|
||||||
// Add the collection to the pipeline
|
|
||||||
let mut task_collection = collection.clone();
|
|
||||||
let mut task_pipeline = pipeline.clone();
|
|
||||||
runtime.spawn(async move {
|
|
||||||
task_collection
|
|
||||||
.add_pipeline(&mut task_pipeline)
|
|
||||||
.await
|
|
||||||
.expect("PGML - Error adding pipeline to collection");
|
|
||||||
});
|
|
||||||
// Set up a debouncer for changed text documents
|
// Set up a debouncer for changed text documents
|
||||||
let mut task_collection = collection.clone();
|
let mut task_collection = collection.clone();
|
||||||
let (debounce_tx, debounce_rx) = mpsc::channel::<String>();
|
let (debounce_tx, debounce_rx) = mpsc::channel::<String>();
|
||||||
@@ -124,6 +116,7 @@ impl PostgresML {
|
|||||||
pipeline,
|
pipeline,
|
||||||
runtime,
|
runtime,
|
||||||
debounce_tx,
|
debounce_tx,
|
||||||
|
added_pipeline: false,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -140,7 +133,7 @@ impl MemoryBackend for PostgresML {
|
|||||||
position: &TextDocumentPositionParams,
|
position: &TextDocumentPositionParams,
|
||||||
prompt_for_type: PromptForType,
|
prompt_for_type: PromptForType,
|
||||||
) -> anyhow::Result<Prompt> {
|
) -> anyhow::Result<Prompt> {
|
||||||
// This is blocking, but this is ok as we only query for it from the worker when we are actually doing a transform
|
// This is blocking, but that is ok as we only query for it from the worker when we are actually doing a transform
|
||||||
let query = self
|
let query = self
|
||||||
.file_store
|
.file_store
|
||||||
.get_characters_around_position(position, 512)?;
|
.get_characters_around_position(position, 512)?;
|
||||||
@@ -189,8 +182,16 @@ impl MemoryBackend for PostgresML {
|
|||||||
) -> anyhow::Result<()> {
|
) -> anyhow::Result<()> {
|
||||||
let text = params.text_document.text.clone();
|
let text = params.text_document.text.clone();
|
||||||
let path = params.text_document.uri.path().to_owned();
|
let path = params.text_document.uri.path().to_owned();
|
||||||
|
let task_added_pipeline = self.added_pipeline;
|
||||||
let mut task_collection = self.collection.clone();
|
let mut task_collection = self.collection.clone();
|
||||||
|
let mut task_pipeline = self.pipeline.clone();
|
||||||
self.runtime.spawn(async move {
|
self.runtime.spawn(async move {
|
||||||
|
if !task_added_pipeline {
|
||||||
|
task_collection
|
||||||
|
.add_pipeline(&mut task_pipeline)
|
||||||
|
.await
|
||||||
|
.expect("PGML - Error adding pipeline to collection");
|
||||||
|
}
|
||||||
task_collection
|
task_collection
|
||||||
.upsert_documents(
|
.upsert_documents(
|
||||||
vec![json!({
|
vec![json!({
|
||||||
@@ -201,7 +202,7 @@ impl MemoryBackend for PostgresML {
|
|||||||
None,
|
None,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
.expect("PGML - Error adding pipeline to collection");
|
.expect("PGML - Error upserting documents");
|
||||||
});
|
});
|
||||||
self.file_store.opened_text_document(params)
|
self.file_store.opened_text_document(params)
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user