Working PostgresML backend with resyncing

Silas Marvin
2024-06-18 20:03:10 -07:00
parent cbe487ca3a
commit 3e8c99b237
4 changed files with 118 additions and 13 deletions
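
In short: the PostgresML backend stops using the hard-coded "test-lsp-ai-5" collection and instead derives a per-workspace collection name by md5-hashing the client's root_uri, falling back to a random 21-character name (with a warning) when no root_uri is provided, and it now spawns a background task on startup that resyncs the collection against the files currently on disk. A minimal sketch of the naming scheme, assuming the md5 0.7 and rand crates the diff pulls in (collection_name is a hypothetical helper for illustration, not a function in this commit):

    use rand::{distributions::Alphanumeric, Rng};

    /// Derive the pgml collection name for a workspace: the md5 digest of the
    /// root URI when one exists, otherwise a random 21-character name.
    fn collection_name(root_uri: Option<&str>) -> String {
        match root_uri {
            Some(root_uri) => format!("{:x}", md5::compute(root_uri.as_bytes())),
            None => rand::thread_rng()
                .sample_iter(&Alphanumeric)
                .take(21)
                .map(char::from)
                .collect(),
        }
    }

Hashing the root_uri keeps the collection name stable across restarts of the same workspace, so documents upserted in one session can be resynced in the next; the random fallback only needs to avoid collisions between ad-hoc sessions.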

Cargo.lock (generated)

@@ -1569,6 +1569,7 @@ dependencies = [
  "llama-cpp-2",
  "lsp-server",
  "lsp-types",
+ "md5",
  "minijinja",
  "once_cell",
  "parking_lot",

Cargo.toml

@@ -36,6 +36,7 @@ tree-sitter = "0.22"
 utils-tree-sitter = { workspace = true, features = ["all"] }
 splitter-tree-sitter = { workspace = true }
 text-splitter = { version = "0.13.3" }
+md5 = "0.7.0"
 
 [build-dependencies]
 cc="*"


@@ -57,7 +57,6 @@ impl Crawl {
         for result in WalkBuilder::new(&root_uri[7..]).build() {
             let result = result?;
             let path = result.path();
-            eprintln!("CRAWLING: {}", path.display());
             if !path.is_dir() {
                 if let Some(path_str) = path.to_str() {
                     if self.crawl_config.all_files {


@@ -2,6 +2,7 @@ use anyhow::Context;
 use lsp_types::TextDocumentPositionParams;
 use parking_lot::Mutex;
 use pgml::{Collection, Pipeline};
+use rand::{distributions::Alphanumeric, Rng};
 use serde_json::{json, Value};
 use std::{
     io::Read,
@@ -26,6 +27,8 @@ use super::{
     ContextAndCodePrompt, FIMPrompt, MemoryBackend, MemoryRunParams, Prompt, PromptType,
 };
 
+const RESYNC_MAX_FILE_SIZE: u64 = 10_000_000;
+
 fn chunk_to_document(uri: &str, chunk: Chunk) -> Value {
     json!({
         "id": chunk_to_id(uri, &chunk),
@@ -94,11 +97,21 @@ impl PostgresML {
         let database_url = if let Some(database_url) = postgresml_config.database_url {
             database_url
         } else {
-            std::env::var("PGML_DATABASE_URL")?
+            std::env::var("PGML_DATABASE_URL").context("please provide either the `database_url` in the `postgresml` config, or set the `PGML_DATABASE_URL` environment variable")?
         };
-        // TODO: Think through Collections and Pipelines
-        let mut collection = Collection::new("test-lsp-ai-5", Some(database_url))?;
+        let collection_name = match configuration.client_params.root_uri.clone() {
+            Some(root_uri) => format!("{:x}", md5::compute(root_uri.as_bytes())),
+            None => {
+                warn!("no root_uri provided in server configuration - generating random string for collection name");
+                rand::thread_rng()
+                    .sample_iter(&Alphanumeric)
+                    .take(21)
+                    .map(char::from)
+                    .collect()
+            }
+        };
+        let mut collection = Collection::new(&collection_name, Some(database_url))?;
         let mut pipeline = Pipeline::new(
             "v1",
             Some(
@@ -145,7 +158,6 @@ impl PostgresML {
             if file_uris.is_empty() {
                 continue;
             }
-
             // Build the chunks for our changed files
             let chunks: Vec<Vec<Chunk>> = match file_uris
                 .iter()
@@ -160,11 +172,10 @@ impl PostgresML {
             {
                 Ok(chunks) => chunks,
                 Err(e) => {
-                    error!("{e}");
+                    error!("{e:?}");
                     continue;
                 }
             };
-
             // Delete old chunks that no longer exist after the latest file changes
             let delete_or_statements: Vec<Value> = file_uris
                 .iter()
@@ -196,10 +207,10 @@ impl PostgresML {
                     .into(),
                 )
                 .await
+                .context("PGML - error deleting documents")
             {
-                error!("PGML - Error deleting file: {e:?}");
+                error!("{e:?}");
             }
-
             // Prepare and upsert our new chunks
             let documents: Vec<pgml::types::Json> = chunks
                 .into_iter()
@@ -218,7 +229,7 @@ impl PostgresML {
                 .await
                 .context("PGML - Error upserting changed files")
             {
-                error!("{e}");
+                error!("{e:?}");
                 continue;
             }
@@ -237,12 +248,105 @@ impl PostgresML {
             splitter,
         };
+
+        // Resync our Collection
+        let task_s = s.clone();
+        TOKIO_RUNTIME.spawn(async move {
+            if let Err(e) = task_s.resync().await {
+                error!("{e:?}")
+            }
+        });
+
         if let Err(e) = s.maybe_do_crawl(None) {
-            error!("{e}")
+            error!("{e:?}")
         }
         Ok(s)
     }
+
+    async fn resync(&self) -> anyhow::Result<()> {
+        let mut collection = self.collection.clone();
+        let documents = collection
+            .get_documents(Some(
+                json!({
+                    "limit": 100_000_000,
+                    "keys": ["uri"]
+                })
+                .into(),
+            ))
+            .await?;
+        let try_get_file_contents = |path: &std::path::Path| {
+            // Open the file and see if it is small enough to read
+            let mut f = std::fs::File::open(path)?;
+            let metadata = f.metadata()?;
+            if metadata.len() > RESYNC_MAX_FILE_SIZE {
+                anyhow::bail!("file size is greater than: {RESYNC_MAX_FILE_SIZE}")
+            }
+            // Read the file contents
+            let mut contents = vec![];
+            f.read_to_end(&mut contents)?;
+            anyhow::Ok(String::from_utf8(contents)?)
+        };
+        let mut documents_to_delete = vec![];
+        let mut chunks_to_upsert = vec![];
+        let mut current_chunks_bytes = 0;
+        for document in documents.into_iter() {
+            let uri = match document["document"]["uri"].as_str() {
+                // This should never happen, but it would be really bad: we would be
+                // left with a document we have essentially no way to delete
+                Some(uri) => uri,
+                None => continue,
+            };
+            let path = uri.replace("file://", "");
+            let path = std::path::Path::new(&path);
+            if !path.exists() {
+                documents_to_delete.push(uri.to_string());
+            } else {
+                // Try to read the file; if we fail, delete its document
+                let contents = match try_get_file_contents(path) {
+                    Ok(contents) => contents,
+                    Err(e) => {
+                        error!("{e:?}");
+                        documents_to_delete.push(uri.to_string());
+                        continue;
+                    }
+                };
+                // Split the file into chunks
+                current_chunks_bytes += contents.len();
+                let chunks: Vec<pgml::types::Json> = self
+                    .splitter
+                    .split_file_contents(&uri, &contents)
+                    .into_iter()
+                    .map(|chunk| chunk_to_document(&uri, chunk).into())
+                    .collect();
+                chunks_to_upsert.extend(chunks);
+                // If we have over 10 megabytes of chunks, do the upsert and reset the batch
+                if current_chunks_bytes > 10_000_000 {
+                    collection
+                        .upsert_documents(chunks_to_upsert, None)
+                        .await
+                        .context("PGML - error upserting documents during resync")?;
+                    chunks_to_upsert = vec![];
+                    current_chunks_bytes = 0;
+                }
+            }
+        }
+        // Upsert any chunks left over from the final partial batch
+        if !chunks_to_upsert.is_empty() {
+            collection
+                .upsert_documents(chunks_to_upsert, None)
+                .await
+                .context("PGML - error upserting documents during resync")?;
+        }
+        // Delete documents
+        if !documents_to_delete.is_empty() {
+            collection
+                .delete_documents(
+                    json!({
+                        "uri": {
+                            "$in": documents_to_delete
+                        }
+                    })
+                    .into(),
+                )
+                .await
+                .context("PGML - error deleting documents during resync")?;
+        }
+        Ok(())
+    }
+
     fn maybe_do_crawl(&self, triggered_file: Option<String>) -> anyhow::Result<()> {
         if let Some(crawl) = &self.crawl {
             let mut documents = vec![];
@@ -281,8 +385,8 @@ impl PostgresML {
                     .map(|chunk| chunk_to_document(&uri, chunk).into())
                     .collect();
                 documents.extend(chunks);
-                // If we have over 100 mega bytes of data do the upsert
-                if current_bytes >= 100_000_000 || total_bytes as u64 >= config.max_crawl_memory
+                // If we have over 10 mega bytes of data do the upsert
+                if current_bytes >= 10_000_000 || total_bytes as u64 >= config.max_crawl_memory
                 {
                     // Upsert the documents
                     let mut collection = self.collection.clone();
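
Both the crawl path above and the new resync loop rely on the same batching pattern: accumulate chunk documents, flush them to Collection::upsert_documents once roughly 10 MB has built up, reset the counter, and flush whatever partial batch remains at the end. A standalone sketch of that invariant (batch_by_bytes is a hypothetical helper for illustration, not part of this commit):

    /// Group (byte_size, document) pairs into batches of at most max_bytes so
    /// each upsert stays bounded in memory; the trailing partial batch that
    /// never crossed the threshold is emitted too.
    fn batch_by_bytes<D>(docs: Vec<(usize, D)>, max_bytes: usize) -> Vec<Vec<D>> {
        let mut batches = vec![];
        let mut current = vec![];
        let mut current_bytes = 0;
        for (bytes, doc) in docs {
            current_bytes += bytes;
            current.push(doc);
            if current_bytes > max_bytes {
                batches.push(std::mem::take(&mut current));
                current_bytes = 0;
            }
        }
        if !current.is_empty() {
            batches.push(current);
        }
        batches
    }

The two details that are easy to drop are resetting the byte counter on every flush and emitting that final partial batch; the resync function above does the equivalent inline around its upsert_documents calls.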