diff --git a/src/prompt_system.py b/src/prompt_system.py index a50df41..a44b1e7 100644 --- a/src/prompt_system.py +++ b/src/prompt_system.py @@ -8,6 +8,7 @@ Here is an example of how an executor can be defined. It always starts with a co ```python # this executor binary files as input and returns the length of each binary file as output from jina import Executor, requests, DocumentArray, Document +import json class MyInfoExecutor(Executor): def __init__(self, **kwargs): super().__init__() @@ -15,8 +16,9 @@ class MyInfoExecutor(Executor): @requests() # each executor must have exactly this decorator without parameters def foo(self, docs: DocumentArray, **kwargs) => DocumentArray: for d in docs: - d.load_uri_to_blob() - d.blob = None + content = json.loads(d.text) + ... + d.text = json.dumps(modified_content) return docs ``` @@ -29,64 +31,29 @@ A Document is a python class that represents a single document. Here is the protobuf definition of a Document: message DocumentProto {{ - // A hexdigest that represents a unique document ID - string id = 1; - - oneof content {{ - // the raw binary content of this document, which often represents the original document when comes into jina - bytes blob = 2; - - // the ndarray of the image/audio/video document - NdArrayProto tensor = 3; - - // a text document - string text = 4; - }} - - // a uri of the document is a remote url starts with http or https or data URI scheme - string uri = 5; - - // list of the sub-documents of this document (recursive structure) - repeated DocumentProto chunks = 6; - - // the matched documents on the same level (recursive structure) - repeated DocumentProto matches = 7; - - // the embedding of this document - NdArrayProto embedding = 8; + // used to store json data the executor gets and returns + string text = 1; }} -Here is an example of how a DocumentArray can be defined: +Here are examples of how a DocumentArray can be defined: from jina import DocumentArray, Document +import json -d1 = 
Document(text='hello') +d1 = Document(text=json.dumps({{'he_says': 'hello'}})) # you can load binary data into a document url = 'https://...' response = requests.get(url) obj_data = response.content -d2 = Document(blob=obj_data) # blob is bytes like b'\\x89PNG\\r\\n\\x1a\\n...' +base64_data = base64.b64encode(obj_data).decode('utf-8') +d2 = Document(text=json.dumps({{'image': base64_data}})) -d3 = Document(tensor=numpy.array([1, 2, 3]), chunks=[Document(uri=/local/path/to/file)] -d4 = Document( - uri='https://docs.docarray.org/img/logo.png', -) -d5 = Document() -d5.tensor = np.ones((2,4)) -d5.uri = 'https://audio.com/audio.mp3' -d6 = Document() -d6.blob # like b'RIFF\\x00\\x00\\x00\\x00WAVEfmt \\x10\\x00...' -docs = DocumentArray([ - d1, d2, d3, d4 -]) -d7 = Document() -d7.text = 'test string' -d8 = Document() -d8.text = json.dumps([{{"id": "1", "text": ["hello", 'test']}}, {{"id": "2", "text": "world"}}]) -# the document has a helper function load_uri_to_blob: -# For instance, d4.load_uri_to_blob() downloads the file from d4.uri and stores it in d4.blob. -# If d4.uri was something like 'https://website.web/img.jpg', then d4.blob would be something like b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x01\\x01... +array = numpy.array([1, 2, 3]) +array_list = array.tolist() +d3 = Document(text=json.dumps(array_list)) +d4 = Document() +d4.text = '{{"uri": "https://.../logo.png"}}' ''' diff --git a/src/prompt_tasks.py b/src/prompt_tasks.py index 5f3d7dd..bc80b62 100644 --- a/src/prompt_tasks.py +++ b/src/prompt_tasks.py @@ -134,6 +134,5 @@ The executor must not access external apis except unless it is explicitly mentio The executor must not load data from the local file system unless it was created by the executor itself. The executor must not use a pre-trained model unless it is explicitly mentioned in the description. The executor must not train a model. -The executor must not use Document.tags. -The executor must only use Document.uri, Document.blob and Document.text. 
+The executor must not use any attribute of Document except Document.text. ''' \ No newline at end of file