Seedling of our engineer is here

This commit is contained in:
Anton Osika
2023-04-29 14:53:21 +02:00
parent f8f992098a
commit 026ac206c1
4 changed files with 187 additions and 0 deletions

33
chat_to_files.py Normal file
View File

@@ -0,0 +1,33 @@
from ast import List, Tuple
import os
import re
def parse_chat(chat: str) -> list[tuple[str, str]]:
    """Extract (path, code) pairs from triple-backtick blocks in a chat transcript.

    Each fenced block is expected to start with the target file path on its
    first line, followed by the file contents, e.g.::

        ```main.py
        print("hello")
        ```

    Args:
        chat: Full chat/completion text possibly containing fenced blocks.

    Returns:
        A list of (path, code) tuples in order of appearance; empty if the
        text contains no fenced blocks.
    """
    # Non-greedy match so consecutive fenced blocks are captured separately;
    # DOTALL lets `.` span the newlines inside a block.
    matches = re.finditer(r"```(.*?)```", chat, re.DOTALL)

    files = []
    for match in matches:
        lines = match.group(1).split("\n")
        # First line inside the fence is the file path; strip stray
        # whitespace (e.g. a trailing '\r') so it is a usable filename.
        path = lines[0].strip()
        code = "\n".join(lines[1:])
        files.append((path, code))
    return files
def to_files(chat: str, path: str) -> None:
    """Persist a chat transcript and the code files embedded in it.

    Writes the raw chat to ``<path>/all_output.txt`` (kept for
    debugging/auditing), then extracts every fenced code block via
    `parse_chat` and writes each one to its own file under `path`.

    Args:
        chat: Full chat transcript containing ```-fenced file blocks.
        path: Output directory; created if it does not exist.
    """
    os.makedirs(path, exist_ok=True)

    with open(os.path.join(path, "all_output.txt"), "w") as f:
        f.write(chat)

    files = parse_chat(chat)
    for file_name, file_content in files:
        target = os.path.join(path, file_name)
        # File paths inside the chat may contain subdirectories
        # (e.g. "src/app.py"); create them so open() does not fail.
        parent = os.path.dirname(target)
        if parent:
            os.makedirs(parent, exist_ok=True)
        with open(target, "w") as f:
            f.write(file_content)

64
main.py Normal file
View File

@@ -0,0 +1,64 @@
import os
import pathlib
from typing import Optional
import openai
from chat_to_files import to_files
import typer
# Typer application object; the `chat` function below is registered on it as a CLI command.
app = typer.Typer()
@app.command()
def chat(
    engine: str = "gpt-4",
    temperature: float = 0.0,
    max_tokens: int = 4096,
    n: int = 1,
    stream: bool = True,
    system_prompt: str = typer.Argument("system", help="System prompt file"),
    user_prompt: str = typer.Argument("user", help="User prompt file"),
    code_to_file_path: Optional[str] = typer.Option(
        None, "--out", "-c", help="Code to file path"
    ),
):
    """Stream a chat completion from OpenAI and optionally write code files.

    `system_prompt` and `user_prompt` are file paths; the bare defaults
    "system" and "user" resolve to files located next to this script.
    When ``--out`` is given, the full answer is parsed for fenced code
    blocks and written out as files via `to_files`.
    """
    script_dir = pathlib.Path(__file__).parent

    # The default prompt names refer to files shipped alongside this script.
    if system_prompt == "system":
        system_prompt = script_dir / system_prompt
    if user_prompt == "user":
        user_prompt = script_dir / user_prompt

    # Replace the path variables with the prompt text they point to.
    with open(system_prompt, "r") as f:
        system_prompt = f.read()
    with open(user_prompt, "r") as f:
        user_prompt = f.read()

    response = openai.ChatCompletion.create(
        model=engine,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        temperature=temperature,
        max_tokens=max_tokens,
        n=n,
        stream=stream,
        stop=None,
    )

    # Echo streamed tokens as they arrive while accumulating the full answer.
    pieces = []
    for chunk in response:
        delta = chunk["choices"][0]["delta"]
        token = delta.get("content", "")
        print(token, end="")
        pieces.append(token)

    if code_to_file_path is not None:
        to_files("".join(pieces), code_to_file_path)
# Allow running directly as a script: `python main.py ...`.
if __name__ == "__main__":
    app()

15
system Normal file
View File

@@ -0,0 +1,15 @@
You will get instructions for code to write.
You will write a very long answer. Make sure that every detail of the architecture is, in the end, implemented as code.
You will first lay out the names of the core classes, functions, and methods that will be necessary, as well as a quick comment on their purpose.
Then you will output the content of each file, with syntax below.
(You will start with the "entrypoint" file, then go to the ones that are imported by that file, and so on.)
Make sure that files contain all imports, types etc. Make sure that code in different files is compatible with the rest.
Ensure to implement all code, if you are unsure, write a plausible implementation.
Before you finish, double check that all parts of the architecture are present in the files.
File syntax:
```main_file.py
[ADD YOUR CODE HERE]
```

75
user Normal file
View File

@@ -0,0 +1,75 @@
Instructions:
We are writing a feature computation framework.
It will mainly consist of FeatureBuilder classes.
Each Feature Builder will have the methods:
- get(key, context, cache): To first check cache, and then go on to call dependencies to compute the feature. Returns value and hash of value.
- dry_run(key, context): To check that "type" of key will match input requirements of features
- input_type(context): That explains what dimensions key is applying to
- output_type(context): That explains what type the output is
It will have the class attr:
- deps: list of FeatureBuilder classes
Where it is unclear, please make assumptions and add a comment in the code about it
Here is an example of Builders we want:
ProductEmbeddingString: takes product_id, queries the product_db and gets the title as a string
ProductEmbedding: takes a string and returns an embedding
ProductEmbeddingDB: takes just `merchant` name, uses all product_ids and returns the blob that is a database of embeddings
ProductEmbeddingSearcher: takes a string, constructs embeddingDB feature (note: all features are cached), embeds the string and searches the db
LLMProductPrompt: queries the ProductEmbeddingString, and formats a template that says "get recommendations for {title}"
LLMSuggestions: Takes product_id, looks up prompts and gets list of suggestions of product descriptions
LLMLogic: Takes the product_id, gets the LLM suggestions, embeds the suggestions, does a search, and returns a list of product_ids
The LLMLogic is the logic_builder in a file such as this one:
```
def main(merchant, market):
cache = get_cache()
interaction_data_db = get_interaction_data_db()
product_db = get_product_db()
merchant_config = get_merchant_config(merchant)[merchant]
context = Context(
interaction_data_db=interaction_data_db,
product_db=product_db,
merchant_config=merchant_config,
)
product_ids = cache(ProductIds.get)(
key=(merchant, market),
context=context,
cache=cache,
)
for logic_builder in merchant_config['logic_builders']:
for product_id in product_ids:
key = (merchant, market, product_id)
p2p_recs = cache(logic_builder.get)(key, cache, context)
redis.set(key, p2p_recs)
```
API to product_db:
```
async def get_product_attribute_dimensions(
self,
) -> dict[AttributeId, Dimension]:
return await self.repository.get_product_attribute_dimensions(self.merchant)
async def get_products(
self,
attribute_ids: set[AttributeId],
product_ids: set[ProductId] | None = None,
) -> dict[ProductId, dict[AttributeId, dict[IngestionDimensionKey, Any]]]:
return await self.repository.get_products_dict(
self.merchant, attribute_ids, product_ids
)
```
(note, dimensions are not so important. They relate to information that varies by: locale, warehouse, pricelist etc)
Remember to read the Instructions carefully.