-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathorch.py
133 lines (95 loc) · 3.07 KB
/
orch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
from duckduckgo_search import DDGS
import requests
from bs4 import BeautifulSoup
import html2text
from llama_index.core.tools import FunctionTool
#from serp_api import search
from image_generator import generate_image
from llama_index.tools.code_interpreter import CodeInterpreterToolSpec
from llama_index.tools.arxiv import ArxivToolSpec
from llama_index.core.tools.ondemand_loader_tool import OnDemandLoaderTool
from llama_index.readers.wikipedia import WikipediaReader
from llama_index.agent.openai import OpenAIAgent
from llama_index.tools.exa import ExaToolSpec
from copy import copy
import os
from markitdown import MarkItDown
from openai import OpenAI
# Module-level tool singletons shared by get_tools().
# Exa semantic-search tool; key is read from the environment.
exa_tool = ExaToolSpec(
    api_key = os.getenv("EXA_API_KEY"),  # NOTE(review): None if EXA_API_KEY is unset — confirm the spec tolerates that
)
# Code-interpreter tool spec (currently not added to the list in get_tools()).
code_spec = CodeInterpreterToolSpec()
# Initialize DuckDuckGo Search
ddgs = DDGS()
# Wikipedia reader wrapped by the OnDemandLoaderTool in get_tools().
reader = WikipediaReader()
def html_to_markdown(html_content: str) -> str:
    """
    Convert an HTML document into Markdown text.

    Args:
        html_content (str): Raw HTML markup.

    Returns:
        str: Markdown rendering of the document, with script/style
        content stripped out.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    # Drop <script> and <style> nodes so JS/CSS text doesn't leak into the output.
    for tag in soup(['script', 'style']):
        tag.extract()
    # Re-serialize the cleaned tree and let html2text do the conversion.
    return html2text.html2text(str(soup))
def search(query: str) -> list:
    """
    Run a DuckDuckGo text search and return the results.

    Args:
        query (str): Search query string.

    Returns:
        list: Up to 10 search-result records from DuckDuckGo.
    """
    # ddgs.text returns an iterable; materialize it for the caller.
    return list(ddgs.text(query, max_results=10))
def crawl_site(link: str) -> str:
    """
    Fetch a web page and convert its HTML content to Markdown.

    Args:
        link (str): URL of the page to fetch.

    Returns:
        str: Markdown rendering of the page.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        requests.Timeout: If the request does not complete within the timeout.
    """
    # requests.get has NO default timeout — without one, an unresponsive
    # host hangs this agent tool forever.
    response = requests.get(link, timeout=30)
    # Fail loudly on error responses instead of silently converting an
    # error page to Markdown and handing it to the agent as content.
    response.raise_for_status()
    return html_to_markdown(response.text)
def get_tools():
    """
    Assemble the list of tools handed to the agent.

    Returns:
        list: Tool instances — the site crawler, a Wikipedia loader,
        the arXiv tool spec, and the Exa search tools.
    """
    wikipedia_tool = OnDemandLoaderTool.from_defaults(
        reader,
        name="WikipediaTool",
        description="A tool for loading and querying articles from Wikipedia",
    )
    tools = [
        # Disabled tools kept for reference:
        # FunctionTool.from_defaults(fn=search),
        # FunctionTool.from_defaults(fn=generate_image),
        FunctionTool.from_defaults(fn=crawl_site),
        wikipedia_tool,
    ]
    # code_spec.to_tool_list() intentionally not included.
    tools.extend(ArxivToolSpec().to_tool_list())
    tools.extend(exa_tool.to_tool_list())
    # Hand back a shallow copy so callers can't mutate our working list.
    return copy(tools)
def extract_text(file_path: str) -> str:
    """
    Extract text content from a file using MarkItDown.

    MarkItDown handles many formats (PDF, Office documents, images, ...);
    GPT-4o is supplied as the LLM backend for conversions that need one.

    Args:
        file_path (str): Path to the file to convert (any format
            MarkItDown supports, not just PDF).

    Returns:
        str: Extracted text content.
    """
    # NOTE(review): OpenAI() presumably reads OPENAI_API_KEY from the
    # environment — confirm it is set wherever this runs.
    client = OpenAI()
    md = MarkItDown(llm_client=client, llm_model="gpt-4o")
    result = md.convert(file_path)
    # (Dropped the dead `text_content = ""` pre-initialization.)
    return result.text_content