fix: json only

This commit is contained in:
Florian Hönicke
2023-04-12 01:32:02 +02:00
parent 6a1f042aa1
commit 3373e2ee50
2 changed files with 17 additions and 51 deletions

View File

@@ -8,6 +8,7 @@ Here is an example of how an executor can be defined. It always starts with a co
```python
# this executor takes binary files as input and returns the length of each binary file as output
from jina import Executor, requests, DocumentArray, Document
import json
class MyInfoExecutor(Executor):
def __init__(self, **kwargs):
super().__init__()
@@ -15,8 +16,9 @@ class MyInfoExecutor(Executor):
@requests() # each executor must have exactly this decorator without parameters
def foo(self, docs: DocumentArray, **kwargs) -> DocumentArray:
for d in docs:
d.load_uri_to_blob()
d.blob = None
content = json.loads(d.text)
...
d.text = json.dumps(modified_content)
return docs
```
@@ -29,64 +31,29 @@ A Document is a python class that represents a single document.
Here is the protobuf definition of a Document:
message DocumentProto {{
// A hexdigest that represents a unique document ID
string id = 1;
oneof content {{
// the raw binary content of this document, which often represents the original document when it comes into jina
bytes blob = 2;
// the ndarray of the image/audio/video document
NdArrayProto tensor = 3;
// a text document
string text = 4;
}}
// the uri of the document: a remote url starting with http or https, or a data URI
string uri = 5;
// list of the sub-documents of this document (recursive structure)
repeated DocumentProto chunks = 6;
// the matched documents on the same level (recursive structure)
repeated DocumentProto matches = 7;
// the embedding of this document
NdArrayProto embedding = 8;
// used to store json data the executor gets and returns
string text = 1;
}}
Here is an example of how a DocumentArray can be defined:
Here are examples of how a DocumentArray can be defined:
from jina import DocumentArray, Document
import json
d1 = Document(text='hello')
d1 = Document(text=json.dumps({{'he_says': 'hello'}}))
# you can load binary data into a document
url = 'https://...'
response = requests.get(url)
obj_data = response.content
d2 = Document(blob=obj_data) # blob is bytes like b'\\x89PNG\\r\\n\\x1a\\n...'
base64_data = base64.b64encode(png_data).decode('utf-8')
d2 = Document(text=json.dumps({{'image': base64_data}}))
d3 = Document(tensor=numpy.array([1, 2, 3]), chunks=[Document(uri='/local/path/to/file')])
d4 = Document(
uri='https://docs.docarray.org/img/logo.png',
)
d5 = Document()
d5.tensor = np.ones((2,4))
d5.uri = 'https://audio.com/audio.mp3'
d6 = Document()
d6.blob # like b'RIFF\\x00\\x00\\x00\\x00WAVEfmt \\x10\\x00...'
docs = DocumentArray([
d1, d2, d3, d4
])
d7 = Document()
d7.text = 'test string'
d8 = Document()
d8.text = json.dumps([{{"id": "1", "text": ["hello", 'test']}}, {{"id": "2", "text": "world"}}])
# the document has a helper function load_uri_to_blob:
# For instance, d4.load_uri_to_blob() downloads the file from d4.uri and stores it in d4.blob.
# If d4.uri was something like 'https://website.web/img.jpg', then d4.blob would be something like b'\\xff\\xd8\\xff\\xe0\\x00\\x10JFIF\\x00\\x01\\x01...
array = numpy.array([1, 2, 3])
array_list = array.tolist()
d3 = Document(text=json.dumps(array_list))
d4 = Document()
d4.text = '{{"uri": "https://.../logo.png"}}'
'''

View File

@@ -134,6 +134,5 @@ The executor must not access external apis except unless it is explicitly mentio
The executor must not load data from the local file system unless it was created by the executor itself.
The executor must not use a pre-trained model unless it is explicitly mentioned in the description.
The executor must not train a model.
The executor must not use Document.tags.
The executor must only use Document.uri, Document.blob and Document.text.
The executor must not use any attribute of Document except Document.text.
'''