Started the work for crawling and added better code grabbing for the FileStore

This commit is contained in:
Silas Marvin
2024-03-12 20:27:25 -07:00
parent 4822c763de
commit 217933c0c7
6 changed files with 103 additions and 35 deletions

1
Cargo.lock generated
View File

@@ -1468,6 +1468,7 @@ dependencies = [
"directories", "directories",
"hf-hub", "hf-hub",
"ignore", "ignore",
"indexmap",
"llama-cpp-2", "llama-cpp-2",
"lsp-server", "lsp-server",
"lsp-types", "lsp-types",

View File

@@ -29,6 +29,7 @@ reqwest = { version = "0.11.25", features = ["blocking", "json"] }
ignore = "0.4.22" ignore = "0.4.22"
pgml = { path = "submodules/postgresml/pgml-sdks/pgml" } pgml = { path = "submodules/postgresml/pgml-sdks/pgml" }
tokio = { version = "1.36.0", features = ["rt-multi-thread", "time"] } tokio = { version = "1.36.0", features = ["rt-multi-thread", "time"] }
indexmap = "2.2.5"
[features] [features]
default = [] default = []

View File

@@ -12,7 +12,7 @@ const DEFAULT_MAX_GENERATION_TOKENS: usize = 256;
pub type Kwargs = HashMap<String, Value>; pub type Kwargs = HashMap<String, Value>;
pub enum ValidMemoryBackend { pub enum ValidMemoryBackend {
FileStore, FileStore(FileStore),
PostgresML(PostgresML), PostgresML(PostgresML),
} }
@@ -60,18 +60,24 @@ impl Default for MaxTokens {
#[derive(Clone, Debug, Deserialize)] #[derive(Clone, Debug, Deserialize)]
pub struct PostgresML { pub struct PostgresML {
pub database_url: Option<String>, pub database_url: Option<String>,
pub crawl: bool,
}
#[derive(Clone, Debug, Deserialize, Default)]
pub struct FileStore {
pub crawl: bool,
} }
#[derive(Clone, Debug, Deserialize)] #[derive(Clone, Debug, Deserialize)]
struct ValidMemoryConfiguration { struct ValidMemoryConfiguration {
file_store: Option<Value>, file_store: Option<FileStore>,
postgresml: Option<PostgresML>, postgresml: Option<PostgresML>,
} }
impl Default for ValidMemoryConfiguration { impl Default for ValidMemoryConfiguration {
fn default() -> Self { fn default() -> Self {
Self { Self {
file_store: Some(json!({})), file_store: Some(FileStore::default()),
postgresml: None, postgresml: None,
} }
} }
@@ -227,8 +233,9 @@ impl Configuration {
} }
pub fn get_memory_backend(&self) -> Result<ValidMemoryBackend> { pub fn get_memory_backend(&self) -> Result<ValidMemoryBackend> {
if self.valid_config.memory.file_store.is_some() { // if self.valid_config.memory.file_store.is_some() {
Ok(ValidMemoryBackend::FileStore) if let Some(file_store) = &self.valid_config.memory.file_store {
Ok(ValidMemoryBackend::FileStore(file_store.to_owned()))
} else if let Some(postgresml) = &self.valid_config.memory.postgresml { } else if let Some(postgresml) = &self.valid_config.memory.postgresml {
Ok(ValidMemoryBackend::PostgresML(postgresml.to_owned())) Ok(ValidMemoryBackend::PostgresML(postgresml.to_owned()))
} else { } else {

View File

@@ -1,30 +1,92 @@
use anyhow::Context; use anyhow::Context;
use indexmap::IndexSet;
use lsp_types::TextDocumentPositionParams; use lsp_types::TextDocumentPositionParams;
use ropey::Rope; use ropey::Rope;
use std::collections::HashMap; use std::collections::HashMap;
use tracing::instrument; use tracing::instrument;
use crate::{configuration::Configuration, utils::tokens_to_estimated_characters}; use crate::{
configuration::{self, Configuration},
utils::tokens_to_estimated_characters,
};
use super::{MemoryBackend, Prompt, PromptForType}; use super::{MemoryBackend, Prompt, PromptForType};
pub struct FileStore { pub struct FileStore {
crawl: bool,
configuration: Configuration, configuration: Configuration,
file_map: HashMap<String, Rope>, file_map: HashMap<String, Rope>,
accessed_files: IndexSet<String>,
} }
// TODO: Put some thought into the crawling here. Do we want to have a crawl option where it tries to crawl through all relevant // TODO: Put some thought into the crawling here. Do we want to have a crawl option where it tries to crawl through all relevant
// files and then when asked for context it loads them in by the most recently accessed? That seems kind of silly honestly, but I could see // files and then when asked for context it loads them in by the most recently accessed? That seems kind of silly honestly, but I could see
// how users who want to use models with massive context lengths would just want their entire project as context for generation tasks // how users who want to use models with massive context lengths would just want their entire project as context for generation tasks
// I'm not sure yet, this is something I need to think through more // I'm not sure yet, this is something I need to think through more
// Ok, here are some more ideas:
// We take a `crawl` arg (a bool) for file_store.
// If true, we crawl until we reach the max_context_length and then stop crawling.
// We keep track of the last opened / changed files, and prioritize those when building the context for our LLMs.
// Memory backends like PostgresML will need to take some kind of max_context_length to crawl, or similar.
// In other words, there needs to be some specification for how much they should crawl, because the limiting happens in the vector_recall.
impl FileStore { impl FileStore {
pub fn new(configuration: Configuration) -> Self { pub fn new(file_store_config: configuration::FileStore, configuration: Configuration) -> Self {
// TODO: maybe crawl
Self { Self {
crawl: file_store_config.crawl,
configuration, configuration,
file_map: HashMap::new(), file_map: HashMap::new(),
accessed_files: IndexSet::new(),
} }
} }
/// Builds a `FileStore` with crawling disabled.
///
/// Used by other memory backends (e.g. PostgresML) that embed a
/// `FileStore` purely to track opened/changed documents and never want
/// it to crawl the workspace on its own.
pub fn new_without_crawl(configuration: Configuration) -> Self {
    Self {
        // Crawling is explicitly off for embedded use
        crawl: false,
        configuration,
        file_map: HashMap::new(),
        accessed_files: IndexSet::new(),
    }
}
/// Builds the rope used for context around `position`, plus the cursor's
/// character index into it.
///
/// Starts from the document the cursor is in; if that document holds fewer
/// than `characters` characters, slices of other accessed files (iterated in
/// `accessed_files` order, skipping the current document) are prepended until
/// the budget is met or the files run out. The cursor index is shifted
/// forward by every character prepended so it still points at the same spot.
///
/// Errors if the current document or any accessed file is missing from
/// `file_map`, or if a slice cannot be taken.
fn get_rope_for_position(
    &self,
    position: &TextDocumentPositionParams,
    characters: usize,
) -> anyhow::Result<(Rope, usize)> {
    // Get the rope for the current document and set our initial cursor index
    let current_document_uri = position.text_document.uri.to_string();
    let mut rope = self
        .file_map
        .get(&current_document_uri)
        .context("Error file not found")?
        .clone();
    let mut cursor_index = rope.line_to_char(position.position.line as usize)
        + position.position.character as usize;
    // Prepend other accessed files until we hit the character budget
    for file in self
        .accessed_files
        .iter()
        .filter(|f| **f != current_document_uri)
    {
        // saturating_sub: zero once the budget is already satisfied
        let needed = characters.saturating_sub(rope.len_chars());
        if needed == 0 {
            break;
        }
        let r = self.file_map.get(file).context("Error file not found")?;
        let slice_max = needed.min(r.len_chars());
        let rope_str_slice = r
            .get_slice(0..slice_max)
            .context("Error getting slice")?
            .to_string();
        rope.insert(0, &rope_str_slice);
        // Everything prepended pushes the cursor forward by that many chars
        cursor_index += slice_max;
    }
    Ok((rope, cursor_index))
}
pub fn get_characters_around_position( pub fn get_characters_around_position(
&self, &self,
position: &TextDocumentPositionParams, position: &TextDocumentPositionParams,
@@ -53,14 +115,7 @@ impl FileStore {
prompt_for_type: PromptForType, prompt_for_type: PromptForType,
max_context_length: usize, max_context_length: usize,
) -> anyhow::Result<String> { ) -> anyhow::Result<String> {
let mut rope = self let (mut rope, cursor_index) = self.get_rope_for_position(position, max_context_length)?;
.file_map
.get(position.text_document.uri.as_str())
.context("Error file not found")?
.clone();
let cursor_index = rope.line_to_char(position.position.line as usize)
+ position.position.character as usize;
let is_chat_enabled = match prompt_for_type { let is_chat_enabled = match prompt_for_type {
PromptForType::Completion => self PromptForType::Completion => self
@@ -157,8 +212,9 @@ impl MemoryBackend for FileStore {
params: lsp_types::DidOpenTextDocumentParams, params: lsp_types::DidOpenTextDocumentParams,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let rope = Rope::from_str(&params.text_document.text); let rope = Rope::from_str(&params.text_document.text);
self.file_map let uri = params.text_document.uri.to_string();
.insert(params.text_document.uri.to_string(), rope); self.file_map.insert(uri.clone(), rope);
self.accessed_files.shift_insert(0, uri);
Ok(()) Ok(())
} }
@@ -167,9 +223,10 @@ impl MemoryBackend for FileStore {
&mut self, &mut self,
params: lsp_types::DidChangeTextDocumentParams, params: lsp_types::DidChangeTextDocumentParams,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let uri = params.text_document.uri.to_string();
let rope = self let rope = self
.file_map .file_map
.get_mut(params.text_document.uri.as_str()) .get_mut(&uri)
.context("Error trying to get file that does not exist")?; .context("Error trying to get file that does not exist")?;
for change in params.content_changes { for change in params.content_changes {
// If range is omitted, text is the new text of the document // If range is omitted, text is the new text of the document
@@ -184,6 +241,7 @@ impl MemoryBackend for FileStore {
*rope = Rope::from_str(&change.text); *rope = Rope::from_str(&change.text);
} }
} }
self.accessed_files.shift_insert(0, uri);
Ok(()) Ok(())
} }

View File

@@ -46,9 +46,9 @@ impl TryFrom<Configuration> for Box<dyn MemoryBackend + Send> {
fn try_from(configuration: Configuration) -> Result<Self, Self::Error> { fn try_from(configuration: Configuration) -> Result<Self, Self::Error> {
match configuration.get_memory_backend()? { match configuration.get_memory_backend()? {
ValidMemoryBackend::FileStore => { ValidMemoryBackend::FileStore(file_store_config) => Ok(Box::new(
Ok(Box::new(file_store::FileStore::new(configuration))) file_store::FileStore::new(file_store_config, configuration),
} )),
ValidMemoryBackend::PostgresML(postgresml_config) => Ok(Box::new( ValidMemoryBackend::PostgresML(postgresml_config) => Ok(Box::new(
postgresml::PostgresML::new(postgresml_config, configuration)?, postgresml::PostgresML::new(postgresml_config, configuration)?,
)), )),

View File

@@ -1,5 +1,5 @@
use std::{ use std::{
sync::mpsc::{self, Sender, TryRecvError}, sync::mpsc::{self, Sender},
time::Duration, time::Duration,
}; };
@@ -24,6 +24,7 @@ pub struct PostgresML {
pipeline: Pipeline, pipeline: Pipeline,
runtime: Runtime, runtime: Runtime,
debounce_tx: Sender<String>, debounce_tx: Sender<String>,
added_pipeline: bool,
} }
impl PostgresML { impl PostgresML {
@@ -31,7 +32,7 @@ impl PostgresML {
postgresml_config: configuration::PostgresML, postgresml_config: configuration::PostgresML,
configuration: Configuration, configuration: Configuration,
) -> anyhow::Result<Self> { ) -> anyhow::Result<Self> {
let file_store = FileStore::new(configuration.clone()); let file_store = FileStore::new_without_crawl(configuration.clone());
let database_url = if let Some(database_url) = postgresml_config.database_url { let database_url = if let Some(database_url) = postgresml_config.database_url {
database_url database_url
} else { } else {
@@ -39,7 +40,7 @@ impl PostgresML {
}; };
// TODO: Think on the naming of the collection // TODO: Think on the naming of the collection
// Maybe filter on metadata or I'm not sure // Maybe filter on metadata or I'm not sure
let collection = Collection::new("test-lsp-ai", Some(database_url))?; let collection = Collection::new("test-lsp-ai-2", Some(database_url))?;
// TODO: Review the pipeline // TODO: Review the pipeline
let pipeline = Pipeline::new( let pipeline = Pipeline::new(
"v1", "v1",
@@ -66,15 +67,6 @@ impl PostgresML {
.worker_threads(2) .worker_threads(2)
.enable_all() .enable_all()
.build()?; .build()?;
// Add the collection to the pipeline
let mut task_collection = collection.clone();
let mut task_pipeline = pipeline.clone();
runtime.spawn(async move {
task_collection
.add_pipeline(&mut task_pipeline)
.await
.expect("PGML - Error adding pipeline to collection");
});
// Setup up a debouncer for changed text documents // Setup up a debouncer for changed text documents
let mut task_collection = collection.clone(); let mut task_collection = collection.clone();
let (debounce_tx, debounce_rx) = mpsc::channel::<String>(); let (debounce_tx, debounce_rx) = mpsc::channel::<String>();
@@ -124,6 +116,7 @@ impl PostgresML {
pipeline, pipeline,
runtime, runtime,
debounce_tx, debounce_tx,
added_pipeline: false,
}) })
} }
} }
@@ -140,7 +133,7 @@ impl MemoryBackend for PostgresML {
position: &TextDocumentPositionParams, position: &TextDocumentPositionParams,
prompt_for_type: PromptForType, prompt_for_type: PromptForType,
) -> anyhow::Result<Prompt> { ) -> anyhow::Result<Prompt> {
// This is blocking, but this is ok as we only query for it from the worker when we are actually doing a transform // This is blocking, but that is ok as we only query for it from the worker when we are actually doing a transform
let query = self let query = self
.file_store .file_store
.get_characters_around_position(position, 512)?; .get_characters_around_position(position, 512)?;
@@ -189,8 +182,16 @@ impl MemoryBackend for PostgresML {
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let text = params.text_document.text.clone(); let text = params.text_document.text.clone();
let path = params.text_document.uri.path().to_owned(); let path = params.text_document.uri.path().to_owned();
let task_added_pipeline = self.added_pipeline;
let mut task_collection = self.collection.clone(); let mut task_collection = self.collection.clone();
let mut task_pipeline = self.pipeline.clone();
self.runtime.spawn(async move { self.runtime.spawn(async move {
if !task_added_pipeline {
task_collection
.add_pipeline(&mut task_pipeline)
.await
.expect("PGML - Error adding pipeline to collection");
}
task_collection task_collection
.upsert_documents( .upsert_documents(
vec![json!({ vec![json!({
@@ -201,7 +202,7 @@ impl MemoryBackend for PostgresML {
None, None,
) )
.await .await
.expect("PGML - Error adding pipeline to collection"); .expect("PGML - Error upserting documents");
}); });
self.file_store.opened_text_document(params) self.file_store.opened_text_document(params)
} }