-
Notifications
You must be signed in to change notification settings - Fork 8
/
seed.py
71 lines (51 loc) · 1.58 KB
/
seed.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
import uuid
import requests
from dotenv import load_dotenv
from haystack import Document
from weaviate.util import generate_uuid5
from pipeline import MaxPipeline, split_markdown_sections
# Pull settings from a local .env file into the process environment before
# anything below reads them.
load_dotenv()

# Module-level pipeline shared by embed_docs_directly; the OpenAI token comes
# from the OPENAI_TOKEN environment variable (None when it is unset).
pipeline = MaxPipeline(openai_token=os.environ.get("OPENAI_TOKEN"))
def get_uuid(content):
    """Return a deterministic UUID string derived from *content*.

    The value is a version-5 UUID computed in the ``uuid.NAMESPACE_URL``
    namespace, so the same input always yields the same identifier.
    """
    derived = uuid.uuid5(uuid.NAMESPACE_URL, content)
    return str(derived)
def get_sample_doc():
    """Build a request body holding one hard-coded sample entry.

    Returns:
        A dict with a single ``"entries"`` list. Its one entry carries an
        ``"id"`` derived from the sample text via ``get_uuid``, the text
        itself under ``"content"``, and a ``"meta"`` dict with the entry's
        ``"slug"`` and ``"type"``.
    """
    text = "sample text"
    entry = {
        "id": get_uuid(text),
        "content": text,
        "meta": {
            "slug": "sample-slug",
            "type": "tutorials",
        },
    }
    return {"entries": [entry]}
def embed_docs_with_api(docs):
    """POST *docs* as JSON to the ``/entries`` endpoint of the configured host.

    The base URL is read from the ``MAX_URL`` environment variable and falls
    back to ``http://localhost:8000``. A response whose status code is not
    200 is reported by printing the submitted payload followed by the
    response text; no exception is raised for it.

    Args:
        docs: JSON-serialisable payload sent as the request body.
    """
    host = os.environ.get("MAX_URL", "http://localhost:8000")
    # Use the session as a context manager so its pooled connections are
    # released even when the request raises.
    with requests.Session() as client:
        r = client.post(url=f"{host}/entries", json=docs)
        if r.status_code != 200:
            print(docs)
            print(r.text)
def embed_docs_directly(docs):
    """Split every entry into sections and embed them with the shared pipeline.

    Args:
        docs: Mapping with an ``"entries"`` list; each entry supplies
            ``"content"`` (passed to ``split_markdown_sections``) and
            ``"meta"`` (attached to every document built from that entry).

    Returns:
        Always an empty list.
    """
    for entry in docs["entries"]:
        documents = []
        for section in split_markdown_sections(entry["content"]):
            # Falsy sections (e.g. empty strings) are not embedded.
            if not section:
                continue
            documents.append(
                Document(
                    id=generate_uuid5(section),
                    content=section,
                    content_type="text",
                    meta=entry["meta"],
                )
            )
        pipeline.embed_documents(documents)
    pipeline.update_embeddings()
    return []
def seed_sample_doc():
    """Embed the hard-coded sample document via ``embed_docs_directly``."""
    embed_docs_directly(get_sample_doc())
# Seed the sample document only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    seed_sample_doc()