Mirror of https://github.com/SilasMarvin/lsp-ai.git, synced 2025-12-17 22:44:24 +01:00
Working PostgresML backend with resyncing
Cargo.lock (generated) | 1 +
Cargo.lock
@@ -1569,6 +1569,7 @@ dependencies = [
  "llama-cpp-2",
  "lsp-server",
  "lsp-types",
+ "md5",
  "minijinja",
  "once_cell",
  "parking_lot",
@@ -36,6 +36,7 @@ tree-sitter = "0.22"
 utils-tree-sitter = { workspace = true, features = ["all"] }
 splitter-tree-sitter = { workspace = true }
 text-splitter = { version = "0.13.3" }
+md5 = "0.7.0"
 
 [build-dependencies]
 cc="*"
@@ -57,7 +57,6 @@ impl Crawl {
         for result in WalkBuilder::new(&root_uri[7..]).build() {
             let result = result?;
             let path = result.path();
-            eprintln!("CRAWLING: {}", path.display());
             if !path.is_dir() {
                 if let Some(path_str) = path.to_str() {
                     if self.crawl_config.all_files {
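For context on the hunk above: `WalkBuilder` is the `ignore` crate's gitignore-aware directory walker, and `&root_uri[7..]` strips the `file://` scheme from the LSP root URI. A minimal standalone sketch of the same traversal, not lsp-ai code (the printed output is illustrative):

```rust
use ignore::WalkBuilder;

fn walk_workspace(root_uri: &str) -> anyhow::Result<()> {
    // root_uri arrives as "file:///home/user/project"; drop the 7-byte "file://" prefix.
    for result in WalkBuilder::new(&root_uri[7..]).build() {
        let entry = result?; // the walk respects .gitignore and hidden-file rules
        if !entry.path().is_dir() {
            println!("{}", entry.path().display());
        }
    }
    Ok(())
}
```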
@@ -2,6 +2,7 @@ use anyhow::Context;
 use lsp_types::TextDocumentPositionParams;
 use parking_lot::Mutex;
 use pgml::{Collection, Pipeline};
+use rand::{distributions::Alphanumeric, Rng};
 use serde_json::{json, Value};
 use std::{
     io::Read,
@@ -26,6 +27,8 @@ use super::{
     ContextAndCodePrompt, FIMPrompt, MemoryBackend, MemoryRunParams, Prompt, PromptType,
 };
 
+const RESYNC_MAX_FILE_SIZE: u64 = 10_000_000;
+
 fn chunk_to_document(uri: &str, chunk: Chunk) -> Value {
     json!({
         "id": chunk_to_id(uri, &chunk),
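The document `chunk_to_document` builds is only partially visible in this diff: an `"id"` from `chunk_to_id(uri, &chunk)`, plus at least a `uri` key, since the resync code below reads `document["document"]["uri"]` and the delete path filters on `"uri"`. A hypothetical sketch of the shape, with the unseen fields assumed:

```rust
use serde_json::{json, Value};

// Hypothetical: only "id" and "uri" are confirmed by this diff; "text" is assumed.
fn example_chunk_document() -> Value {
    json!({
        "id": "file:///home/user/project/src/main.rs#0", // chunk_to_id output; exact format assumed
        "uri": "file:///home/user/project/src/main.rs",
        "text": "fn main() {}"
    })
}
```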
@@ -94,11 +97,21 @@ impl PostgresML {
         let database_url = if let Some(database_url) = postgresml_config.database_url {
             database_url
         } else {
-            std::env::var("PGML_DATABASE_URL")?
+            std::env::var("PGML_DATABASE_URL").context("please provide either the `database_url` in the `postgresml` config, or set the `PGML_DATABASE_URL` environment variable")?
         };
 
-        // TODO: Think through Collections and Pipelines
-        let mut collection = Collection::new("test-lsp-ai-5", Some(database_url))?;
+        let collection_name = match configuration.client_params.root_uri.clone() {
+            Some(root_uri) => format!("{:x}", md5::compute(root_uri.as_bytes())),
+            None => {
+                warn!("no root_uri provided in server configuration - generating random string for collection name");
+                rand::thread_rng()
+                    .sample_iter(&Alphanumeric)
+                    .take(21)
+                    .map(char::from)
+                    .collect()
+            }
+        };
+        let mut collection = Collection::new(&collection_name, Some(database_url))?;
         let mut pipeline = Pipeline::new(
             "v1",
             Some(
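This hunk replaces the hardcoded "test-lsp-ai-5" collection with a name derived from the workspace: the hex MD5 digest of `root_uri`, falling back to a random 21-character alphanumeric string when no root is available. A standalone sketch of the same derivation (the example URI is illustrative):

```rust
use rand::{distributions::Alphanumeric, Rng};

fn collection_name(root_uri: Option<&str>) -> String {
    match root_uri {
        // Deterministic: the same workspace always maps to the same collection.
        Some(root_uri) => format!("{:x}", md5::compute(root_uri.as_bytes())),
        // Fallback: a throwaway 21-character random name.
        None => rand::thread_rng()
            .sample_iter(&Alphanumeric)
            .take(21)
            .map(char::from)
            .collect(),
    }
}

fn main() {
    // Prints a 32-character hex digest; the value depends only on the URI.
    println!("{}", collection_name(Some("file:///home/user/project")));
}
```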
@@ -145,7 +158,6 @@ impl PostgresML {
                 if file_uris.is_empty() {
                     continue;
                 }
-
                 // Build the chunks for our changed files
                 let chunks: Vec<Vec<Chunk>> = match file_uris
                     .iter()
@@ -160,11 +172,10 @@ impl PostgresML {
                 {
                     Ok(chunks) => chunks,
                     Err(e) => {
-                        error!("{e}");
+                        error!("{e:?}");
                         continue;
                     }
                 };
-
                 // Delete old chunks that no longer exist after the latest file changes
                 let delete_or_statements: Vec<Value> = file_uris
                     .iter()
@@ -196,10 +207,10 @@ impl PostgresML {
                         .into(),
                     )
                     .await
+                    .context("PGML - error deleting documents")
                 {
-                    error!("PGML - Error deleting file: {e:?}");
+                    error!("{e:?}");
                 }
-
                 // Prepare and upsert our new chunks
                 let documents: Vec<pgml::types::Json> = chunks
                     .into_iter()
@@ -218,7 +229,7 @@ impl PostgresML {
                 .await
                 .context("PGML - Error upserting changed files")
                 {
-                    error!("{e}");
+                    error!("{e:?}");
                     continue;
                 }
 
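Several hunks in this commit change `error!("{e}")` to `error!("{e:?}")`. For `anyhow::Error`, `Display` shows only the outermost message while `Debug` renders the whole chain added with `.context(...)`, so the log lines now carry the underlying cause. A minimal illustration (the file path is illustrative):

```rust
use anyhow::Context;

fn load() -> anyhow::Result<String> {
    std::fs::read_to_string("/no/such/file").context("PGML - Error upserting changed files")
}

fn main() {
    if let Err(e) = load() {
        println!("{e}");   // only: PGML - Error upserting changed files
        println!("{e:?}"); // the context message plus the underlying io::Error
    }
}
```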
@@ -237,12 +248,105 @@ impl PostgresML {
             splitter,
         };
 
+        // Resync our Collection
+        let task_s = s.clone();
+        TOKIO_RUNTIME.spawn(async move {
+            if let Err(e) = task_s.resync().await {
+                error!("{e:?}")
+            }
+        });
+
         if let Err(e) = s.maybe_do_crawl(None) {
-            error!("{e}")
+            error!("{e:?}")
         }
         Ok(s)
     }
 
+    async fn resync(&self) -> anyhow::Result<()> {
+        let mut collection = self.collection.clone();
+
+        let documents = collection
+            .get_documents(Some(
+                json!({
+                    "limit": 100_000_000,
+                    "keys": ["uri"]
+                })
+                .into(),
+            ))
+            .await?;
+
+        let try_get_file_contents = |path: &std::path::Path| {
+            // Open the file and see if it is small enough to read
+            let mut f = std::fs::File::open(path)?;
+            let metadata = f.metadata()?;
+            if metadata.len() > RESYNC_MAX_FILE_SIZE {
+                anyhow::bail!("file size is greater than: {RESYNC_MAX_FILE_SIZE}")
+            }
+            // Read the file contents
+            let mut contents = vec![];
+            f.read_to_end(&mut contents)?;
+            anyhow::Ok(String::from_utf8(contents)?)
+        };
+
+        let mut documents_to_delete = vec![];
+        let mut chunks_to_upsert = vec![];
+        let mut current_chunks_bytes = 0;
+        for document in documents.into_iter() {
+            let uri = match document["document"]["uri"].as_str() {
+                Some(uri) => uri,
+                None => continue, // This should never happen, but is really bad as we now have a document with essentially no way to delete it
+            };
+
+            let path = uri.replace("file://", "");
+            let path = std::path::Path::new(&path);
+            if !path.exists() {
+                documents_to_delete.push(uri.to_string());
+            } else {
+                // Try to read the file. If we fail delete it
+                let contents = match try_get_file_contents(path) {
+                    Ok(contents) => contents,
+                    Err(e) => {
+                        error!("{e:?}");
+                        documents_to_delete.push(uri.to_string());
+                        continue;
+                    }
+                };
+                // Split the file into chunks
+                current_chunks_bytes += contents.len();
+                let chunks: Vec<pgml::types::Json> = self
+                    .splitter
+                    .split_file_contents(&uri, &contents)
+                    .into_iter()
+                    .map(|chunk| chunk_to_document(&uri, chunk).into())
+                    .collect();
+                chunks_to_upsert.extend(chunks);
+                // If we have over 10 mega bytes of chunks do the upsert
+                if current_chunks_bytes > 10_000_000 {
+                    collection
+                        .upsert_documents(chunks_to_upsert, None)
+                        .await
+                        .context("PGML - error upserting documents during resync")?;
+                }
+                chunks_to_upsert = vec![];
+            }
+        }
+        // Delete documents
+        if !documents_to_delete.is_empty() {
+            collection
+                .delete_documents(
+                    json!({
+                        "uri": {
+                            "$in": documents_to_delete
+                        }
+                    })
+                    .into(),
+                )
+                .await
+                .context("PGML - error deleting documents during resync")?;
+        }
+        Ok(())
+    }
+
     fn maybe_do_crawl(&self, triggered_file: Option<String>) -> anyhow::Result<()> {
         if let Some(crawl) = &self.crawl {
             let mut documents = vec![];
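The `try_get_file_contents` closure added above consults file metadata before reading, so oversized files are never pulled into memory, and rejects non-UTF-8 content. The same guard as a free function, with a small usage example (the path is illustrative):

```rust
use std::io::Read;

const RESYNC_MAX_FILE_SIZE: u64 = 10_000_000;

fn try_get_file_contents(path: &std::path::Path) -> anyhow::Result<String> {
    // Check the size from metadata before reading anything.
    let mut f = std::fs::File::open(path)?;
    if f.metadata()?.len() > RESYNC_MAX_FILE_SIZE {
        anyhow::bail!("file size is greater than: {RESYNC_MAX_FILE_SIZE}")
    }
    let mut contents = vec![];
    f.read_to_end(&mut contents)?;
    // from_utf8 fails on binary files, which the resync loop treats as "delete this document".
    Ok(String::from_utf8(contents)?)
}

fn main() {
    match try_get_file_contents(std::path::Path::new("src/main.rs")) {
        Ok(contents) => println!("read {} bytes", contents.len()),
        Err(e) => eprintln!("{e:?}"),
    }
}
```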
@@ -281,8 +385,8 @@ impl PostgresML {
                     .map(|chunk| chunk_to_document(&uri, chunk).into())
                     .collect();
                 documents.extend(chunks);
-                // If we have over 100 mega bytes of data do the upsert
-                if current_bytes >= 100_000_000 || total_bytes as u64 >= config.max_crawl_memory
+                // If we have over 10 mega bytes of data do the upsert
+                if current_bytes >= 10_000_000 || total_bytes as u64 >= config.max_crawl_memory
                 {
                     // Upsert the documents
                     let mut collection = self.collection.clone();
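Both the crawl path above and the resync loop accumulate documents and flush once roughly 10 MB of chunk text has been gathered, rather than upserting per file. A generic sketch of that flush-on-threshold pattern (the `Batcher` type is hypothetical, not lsp-ai code):

```rust
/// Hypothetical helper showing the flush-on-threshold pattern used above.
struct Batcher {
    pending: Vec<String>,
    pending_bytes: usize,
    threshold: usize,
}

impl Batcher {
    fn new(threshold: usize) -> Self {
        Self { pending: Vec::new(), pending_bytes: 0, threshold }
    }

    /// Queue a document; returns a full batch to upsert once the byte budget is hit.
    fn push(&mut self, doc: String) -> Option<Vec<String>> {
        self.pending_bytes += doc.len();
        self.pending.push(doc);
        if self.pending_bytes >= self.threshold {
            self.pending_bytes = 0;
            Some(std::mem::take(&mut self.pending))
        } else {
            None
        }
    }
}

fn main() {
    let mut batcher = Batcher::new(10_000_000);
    if let Some(batch) = batcher.push("chunk text".to_string()) {
        println!("would upsert {} documents", batch.len());
    }
}
```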