From bf0158f4a48ec5a0b23a50dcc5aca0508785f69c Mon Sep 17 00:00:00 2001 From: Pierre-Loic Doulcet Date: Mon, 4 Nov 2024 20:06:40 +0100 Subject: [PATCH 1/4] add input url and http_proxy --- llama_parse/base.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/llama_parse/base.py b/llama_parse/base.py index 4f7fa59..41fdd62 100644 --- a/llama_parse/base.py +++ b/llama_parse/base.py @@ -190,6 +190,13 @@ class LlamaParse(BasePydanticReader): azure_openai_key: Optional[str] = Field( default=None, description="Azure Openai Key" ) + input_url: Optional[str] = Field( + default=None, description="An url to a document that need to be parsed" + ) + http_proxy: Optional[str] = Field( + default=None, + description="(optional) If set with input_url will use the specified http proxy to download the file.", + ) @field_validator("api_key", mode="before", check_fields=True) @classmethod @@ -255,6 +262,8 @@ async def _create_job( fs = fs or get_default_fs() file_handle = fs.open(file_input, "rb") files = {"file": (os.path.basename(file_path), file_handle, mime_type)} + elif self.input_url is not None: + files = None else: raise ValueError( "file_input must be either a file path string, file bytes, or buffer object" @@ -316,6 +325,13 @@ async def _create_job( if self.azure_openai_key is not None: data["azure_openai_key"] = self.azure_openai_key + if self.input_url is not None: + files = None + data["input_url"] = self.input_url + + if self.http_proxy is not None: + data["http_proxy"] = self.http_proxy + try: async with self.client_context() as client: response = await client.post( From 72d4cb1f30c2713b6e62f2fa9b35e5a16e971384 Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Tue, 12 Nov 2024 12:50:20 -0600 Subject: [PATCH 2/4] refactor into input url as direct input --- llama_parse/base.py | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/llama_parse/base.py b/llama_parse/base.py index 41fdd62..451f50f 100644 --- a/llama_parse/base.py +++ b/llama_parse/base.py @@ -1,6 +1,6 @@ import os import asyncio -from io import TextIOWrapper +from urllib.parse import urlparse import httpx import mimetypes @@ -11,7 +11,6 @@ from io import BufferedIOBase from fsspec import AbstractFileSystem -from fsspec.spec import AbstractBufferedFile from llama_index.core.async_utils import asyncio_run, run_jobs from llama_index.core.bridge.pydantic import Field, field_validator from llama_index.core.constants import DEFAULT_BASE_URL @@ -190,9 +189,6 @@ class LlamaParse(BasePydanticReader): azure_openai_key: Optional[str] = Field( default=None, description="Azure Openai Key" ) - input_url: Optional[str] = Field( - default=None, description="An url to a document that need to be parsed" - ) http_proxy: Optional[str] = Field( default=None, description="(optional) If set with input_url will use the specified http proxy to download the file.", @@ -228,6 +224,28 @@ async def client_context(self) -> AsyncGenerator[httpx.AsyncClient, None]: async with httpx.AsyncClient(timeout=self.max_timeout) as client: yield client + def _is_input_url(self, file_path: FileInput) -> bool: + """Check if the input is a valid URL. 
+ + This method checks for: + - Proper URL scheme (http/https) + - Valid URL structure + - Network location (domain) + """ + if not isinstance(file_path, str): + return False + try: + result = urlparse(file_path) + return all( + [ + result.scheme in ("http", "https"), + result.netloc, # Has domain + result.scheme, # Has scheme + ] + ) + except Exception: + return False + # upload a document and get back a job_id async def _create_job( self, @@ -239,6 +257,7 @@ async def _create_job( url = f"{self.base_url}/api/parsing/upload" files = None file_handle = None + input_url = file_input if self._is_input_url(file_input) else None if isinstance(file_input, (bytes, BufferedIOBase)): if not extra_info or "file_name" not in extra_info: @@ -248,6 +267,8 @@ async def _create_job( file_name = extra_info["file_name"] mime_type = mimetypes.guess_type(file_name)[0] files = {"file": (file_name, file_input, mime_type)} + elif input_url is not None: + files = None elif isinstance(file_input, (str, Path, PurePosixPath, PurePath)): file_path = str(file_input) file_ext = os.path.splitext(file_path)[1].lower() @@ -262,8 +283,6 @@ async def _create_job( fs = fs or get_default_fs() file_handle = fs.open(file_input, "rb") files = {"file": (os.path.basename(file_path), file_handle, mime_type)} - elif self.input_url is not None: - files = None else: raise ValueError( "file_input must be either a file path string, file bytes, or buffer object" @@ -325,9 +344,9 @@ async def _create_job( if self.azure_openai_key is not None: data["azure_openai_key"] = self.azure_openai_key - if self.input_url is not None: + if input_url is not None: files = None - data["input_url"] = self.input_url + data["input_url"] = str(input_url) if self.http_proxy is not None: data["http_proxy"] = self.http_proxy @@ -348,12 +367,6 @@ async def _create_job( if file_handle is not None: file_handle.close() - @staticmethod - def __get_filename(f: Union[TextIOWrapper, AbstractBufferedFile]) -> str: - if isinstance(f, TextIOWrapper): - return f.name - return f.full_name - async def _get_job_result( self, job_id: str, result_type: str, verbose: bool = False ) -> Dict[str, Any]: From 39488438e5a7a10fd76b373d3b5737d07f242024 Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Tue, 12 Nov 2024 12:50:51 -0600 Subject: [PATCH 3/4] add tests --- tests/test_reader.py | 63 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index 70da8aa..2c8e9ba 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -76,13 +76,14 @@ def test_simple_page_markdown_buffer(markdown_parser: LlamaParse) -> None: os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", reason="LLAMA_CLOUD_API_KEY not set", ) -def test_simple_page_with_custom_fs() -> None: +@pytest.mark.asyncio +async def test_simple_page_with_custom_fs() -> None: parser = LlamaParse(result_type="markdown") fs = LocalFileSystem() filepath = os.path.join( os.path.dirname(__file__), "test_files/attention_is_all_you_need.pdf" ) - result = parser.load_data(filepath, fs=fs) + result = await parser.aload_data(filepath, fs=fs) assert len(result) == 1 @@ -90,13 +91,14 @@ def test_simple_page_with_custom_fs() -> None: os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", reason="LLAMA_CLOUD_API_KEY not set", ) -def test_simple_page_progress_workers() -> None: +@pytest.mark.asyncio +async def test_simple_page_progress_workers() -> None: parser = LlamaParse(result_type="markdown", show_progress=True, verbose=True) filepath = 
os.path.join( os.path.dirname(__file__), "test_files/attention_is_all_you_need.pdf" ) - result = parser.load_data([filepath, filepath]) + result = await parser.aload_data([filepath, filepath]) assert len(result) == 2 assert len(result[0].text) > 0 @@ -107,7 +109,7 @@ def test_simple_page_progress_workers() -> None: filepath = os.path.join( os.path.dirname(__file__), "test_files/attention_is_all_you_need.pdf" ) - result = parser.load_data([filepath, filepath]) + result = await parser.aload_data([filepath, filepath]) assert len(result) == 2 assert len(result[0].text) > 0 @@ -116,12 +118,59 @@ def test_simple_page_progress_workers() -> None: os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", reason="LLAMA_CLOUD_API_KEY not set", ) -def test_custom_client() -> None: +@pytest.mark.asyncio +async def test_custom_client() -> None: custom_client = AsyncClient(verify=False, timeout=10) parser = LlamaParse(result_type="markdown", custom_client=custom_client) filepath = os.path.join( os.path.dirname(__file__), "test_files/attention_is_all_you_need.pdf" ) - result = parser.load_data(filepath) + result = await parser.aload_data(filepath) assert len(result) == 1 assert len(result[0].text) > 0 + + +@pytest.mark.skipif( + os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", + reason="LLAMA_CLOUD_API_KEY not set", +) +@pytest.mark.asyncio +async def test_input_url() -> None: + parser = LlamaParse(result_type="markdown") + + # links to a resume example + input_url = "https://cdn-blog.novoresume.com/articles/google-docs-resume-templates/basic-google-docs-resume.png" + result = await parser.aload_data(input_url) + assert len(result) == 1 + assert "your name" in result[0].text.lower() + + +@pytest.mark.skipif( + os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", + reason="LLAMA_CLOUD_API_KEY not set", +) +@pytest.mark.asyncio +async def test_input_url_with_website_input() -> None: + parser = LlamaParse(result_type="markdown") + input_url = "https://www.google.com" + result = await parser.aload_data(input_url) + assert len(result) == 1 + assert "google" in result[0].text.lower() + + +@pytest.mark.skipif( + os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", + reason="LLAMA_CLOUD_API_KEY not set", +) +@pytest.mark.asyncio +async def test_mixing_input_types() -> None: + parser = LlamaParse(result_type="markdown") + filepath = os.path.join( + os.path.dirname(__file__), "test_files/attention_is_all_you_need.pdf" + ) + input_url = "https://www.google.com" + result = await parser.aload_data([filepath, input_url]) + + assert len(result) == 2 + assert "table 2" in result[0].text.lower() + assert "google" in result[1].text.lower() From 9161fb3e2460524483ebf5fc6b58579456cb8836 Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Tue, 12 Nov 2024 12:52:07 -0600 Subject: [PATCH 4/4] vbump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 323af8f..012937e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "llama-parse" -version = "0.5.13" +version = "0.5.14" description = "Parse files into RAG-Optimized formats." authors = ["Logan Markewich "] license = "MIT"
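
Usage note (reviewer sketch, not part of the patches): the example below mirrors how the new URL input is exercised in the tests above — after patch 2, http/https inputs are detected by _is_input_url() and sent to the API as data["input_url"] rather than uploaded as a file, and the parser-level http_proxy is used when downloading such URLs. It assumes LLAMA_CLOUD_API_KEY is exported, as the tests require; the document URL and proxy address are placeholder values, not ones taken from the patches.

import asyncio

from llama_parse import LlamaParse

# http_proxy is only consulted when the input is a URL (see _create_job above).
# The proxy address here is a placeholder, not a value from the patches.
parser = LlamaParse(
    result_type="markdown",
    http_proxy="http://proxy.example.com:8080",
)


async def main() -> None:
    # URLs are now passed straight to aload_data(); patch 2 routes them to
    # data["input_url"] in the upload request instead of a multipart file.
    docs = await parser.aload_data("https://www.example.com/report.pdf")
    print(docs[0].text[:200])

    # Local paths and URLs can be mixed in a single call, as
    # test_mixing_input_types in patch 3 exercises.
    mixed = await parser.aload_data(
        ["tests/test_files/attention_is_all_you_need.pdf", "https://www.example.com"]
    )
    print(len(mixed))


asyncio.run(main())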