From bf0158f4a48ec5a0b23a50dcc5aca0508785f69c Mon Sep 17 00:00:00 2001 From: Pierre-Loic Doulcet Date: Mon, 4 Nov 2024 20:06:40 +0100 Subject: [PATCH 1/4] add input url and http_proxy --- llama_parse/base.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/llama_parse/base.py b/llama_parse/base.py index 4f7fa59..41fdd62 100644 --- a/llama_parse/base.py +++ b/llama_parse/base.py @@ -190,6 +190,13 @@ class LlamaParse(BasePydanticReader): azure_openai_key: Optional[str] = Field( default=None, description="Azure Openai Key" ) + input_url: Optional[str] = Field( + default=None, description="An url to a document that need to be parsed" + ) + http_proxy: Optional[str] = Field( + default=None, + description="(optional) If set with input_url will use the specified http proxy to download the file.", + ) @field_validator("api_key", mode="before", check_fields=True) @classmethod @@ -255,6 +262,8 @@ async def _create_job( fs = fs or get_default_fs() file_handle = fs.open(file_input, "rb") files = {"file": (os.path.basename(file_path), file_handle, mime_type)} + elif self.input_url is not None: + files = None else: raise ValueError( "file_input must be either a file path string, file bytes, or buffer object" @@ -316,6 +325,13 @@ async def _create_job( if self.azure_openai_key is not None: data["azure_openai_key"] = self.azure_openai_key + if self.input_url is not None: + files = None + data["input_url"] = self.input_url + + if self.http_proxy is not None: + data["http_proxy"] = self.http_proxy + try: async with self.client_context() as client: response = await client.post( From 72d4cb1f30c2713b6e62f2fa9b35e5a16e971384 Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Tue, 12 Nov 2024 12:50:20 -0600 Subject: [PATCH 2/4] refactor into input url as direct input --- llama_parse/base.py | 43 ++++++++++++++++++++++++++++--------------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/llama_parse/base.py b/llama_parse/base.py index 41fdd62..451f50f 100644 --- a/llama_parse/base.py +++ b/llama_parse/base.py @@ -1,6 +1,6 @@ import os import asyncio -from io import TextIOWrapper +from urllib.parse import urlparse import httpx import mimetypes @@ -11,7 +11,6 @@ from io import BufferedIOBase from fsspec import AbstractFileSystem -from fsspec.spec import AbstractBufferedFile from llama_index.core.async_utils import asyncio_run, run_jobs from llama_index.core.bridge.pydantic import Field, field_validator from llama_index.core.constants import DEFAULT_BASE_URL @@ -190,9 +189,6 @@ class LlamaParse(BasePydanticReader): azure_openai_key: Optional[str] = Field( default=None, description="Azure Openai Key" ) - input_url: Optional[str] = Field( - default=None, description="An url to a document that need to be parsed" - ) http_proxy: Optional[str] = Field( default=None, description="(optional) If set with input_url will use the specified http proxy to download the file.", @@ -228,6 +224,28 @@ async def client_context(self) -> AsyncGenerator[httpx.AsyncClient, None]: async with httpx.AsyncClient(timeout=self.max_timeout) as client: yield client + def _is_input_url(self, file_path: FileInput) -> bool: + """Check if the input is a valid URL. 
+ + This method checks for: + - Proper URL scheme (http/https) + - Valid URL structure + - Network location (domain) + """ + if not isinstance(file_path, str): + return False + try: + result = urlparse(file_path) + return all( + [ + result.scheme in ("http", "https"), + result.netloc, # Has domain + result.scheme, # Has scheme + ] + ) + except Exception: + return False + # upload a document and get back a job_id async def _create_job( self, @@ -239,6 +257,7 @@ async def _create_job( url = f"{self.base_url}/api/parsing/upload" files = None file_handle = None + input_url = file_input if self._is_input_url(file_input) else None if isinstance(file_input, (bytes, BufferedIOBase)): if not extra_info or "file_name" not in extra_info: @@ -248,6 +267,8 @@ async def _create_job( file_name = extra_info["file_name"] mime_type = mimetypes.guess_type(file_name)[0] files = {"file": (file_name, file_input, mime_type)} + elif input_url is not None: + files = None elif isinstance(file_input, (str, Path, PurePosixPath, PurePath)): file_path = str(file_input) file_ext = os.path.splitext(file_path)[1].lower() @@ -262,8 +283,6 @@ async def _create_job( fs = fs or get_default_fs() file_handle = fs.open(file_input, "rb") files = {"file": (os.path.basename(file_path), file_handle, mime_type)} - elif self.input_url is not None: - files = None else: raise ValueError( "file_input must be either a file path string, file bytes, or buffer object" @@ -325,9 +344,9 @@ async def _create_job( if self.azure_openai_key is not None: data["azure_openai_key"] = self.azure_openai_key - if self.input_url is not None: + if input_url is not None: files = None - data["input_url"] = self.input_url + data["input_url"] = str(input_url) if self.http_proxy is not None: data["http_proxy"] = self.http_proxy @@ -348,12 +367,6 @@ async def _create_job( if file_handle is not None: file_handle.close() - @staticmethod - def __get_filename(f: Union[TextIOWrapper, AbstractBufferedFile]) -> str: - if isinstance(f, TextIOWrapper): - return f.name - return f.full_name - async def _get_job_result( self, job_id: str, result_type: str, verbose: bool = False ) -> Dict[str, Any]: From 39488438e5a7a10fd76b373d3b5737d07f242024 Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Tue, 12 Nov 2024 12:50:51 -0600 Subject: [PATCH 3/4] add tests --- tests/test_reader.py | 63 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/tests/test_reader.py b/tests/test_reader.py index 70da8aa..2c8e9ba 100644 --- a/tests/test_reader.py +++ b/tests/test_reader.py @@ -76,13 +76,14 @@ def test_simple_page_markdown_buffer(markdown_parser: LlamaParse) -> None: os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", reason="LLAMA_CLOUD_API_KEY not set", ) -def test_simple_page_with_custom_fs() -> None: +@pytest.mark.asyncio +async def test_simple_page_with_custom_fs() -> None: parser = LlamaParse(result_type="markdown") fs = LocalFileSystem() filepath = os.path.join( os.path.dirname(__file__), "test_files/attention_is_all_you_need.pdf" ) - result = parser.load_data(filepath, fs=fs) + result = await parser.aload_data(filepath, fs=fs) assert len(result) == 1 @@ -90,13 +91,14 @@ def test_simple_page_with_custom_fs() -> None: os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", reason="LLAMA_CLOUD_API_KEY not set", ) -def test_simple_page_progress_workers() -> None: +@pytest.mark.asyncio +async def test_simple_page_progress_workers() -> None: parser = LlamaParse(result_type="markdown", show_progress=True, verbose=True) filepath = 
os.path.join( os.path.dirname(__file__), "test_files/attention_is_all_you_need.pdf" ) - result = parser.load_data([filepath, filepath]) + result = await parser.aload_data([filepath, filepath]) assert len(result) == 2 assert len(result[0].text) > 0 @@ -107,7 +109,7 @@ def test_simple_page_progress_workers() -> None: filepath = os.path.join( os.path.dirname(__file__), "test_files/attention_is_all_you_need.pdf" ) - result = parser.load_data([filepath, filepath]) + result = await parser.aload_data([filepath, filepath]) assert len(result) == 2 assert len(result[0].text) > 0 @@ -116,12 +118,59 @@ def test_simple_page_progress_workers() -> None: os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", reason="LLAMA_CLOUD_API_KEY not set", ) -def test_custom_client() -> None: +@pytest.mark.asyncio +async def test_custom_client() -> None: custom_client = AsyncClient(verify=False, timeout=10) parser = LlamaParse(result_type="markdown", custom_client=custom_client) filepath = os.path.join( os.path.dirname(__file__), "test_files/attention_is_all_you_need.pdf" ) - result = parser.load_data(filepath) + result = await parser.aload_data(filepath) assert len(result) == 1 assert len(result[0].text) > 0 + + +@pytest.mark.skipif( + os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", + reason="LLAMA_CLOUD_API_KEY not set", +) +@pytest.mark.asyncio +async def test_input_url() -> None: + parser = LlamaParse(result_type="markdown") + + # links to a resume example + input_url = "https://cdn-blog.novoresume.com/articles/google-docs-resume-templates/basic-google-docs-resume.png" + result = await parser.aload_data(input_url) + assert len(result) == 1 + assert "your name" in result[0].text.lower() + + +@pytest.mark.skipif( + os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", + reason="LLAMA_CLOUD_API_KEY not set", +) +@pytest.mark.asyncio +async def test_input_url_with_website_input() -> None: + parser = LlamaParse(result_type="markdown") + input_url = "https://www.google.com" + result = await parser.aload_data(input_url) + assert len(result) == 1 + assert "google" in result[0].text.lower() + + +@pytest.mark.skipif( + os.environ.get("LLAMA_CLOUD_API_KEY", "") == "", + reason="LLAMA_CLOUD_API_KEY not set", +) +@pytest.mark.asyncio +async def test_mixing_input_types() -> None: + parser = LlamaParse(result_type="markdown") + filepath = os.path.join( + os.path.dirname(__file__), "test_files/attention_is_all_you_need.pdf" + ) + input_url = "https://www.google.com" + result = await parser.aload_data([filepath, input_url]) + + assert len(result) == 2 + assert "table 2" in result[0].text.lower() + assert "google" in result[1].text.lower() From 9161fb3e2460524483ebf5fc6b58579456cb8836 Mon Sep 17 00:00:00 2001 From: Logan Markewich Date: Tue, 12 Nov 2024 12:52:07 -0600 Subject: [PATCH 4/4] vbump --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 323af8f..012937e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "llama-parse" -version = "0.5.13" +version = "0.5.14" description = "Parse files into RAG-Optimized formats." authors = ["Logan Markewich "] license = "MIT"
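
Usage note (reviewer sketch, not part of the patches): the example below mirrors how the new URL input is exercised in the tests above — after patch 2, http/https inputs are detected by _is_input_url() and sent to the API as data["input_url"] rather than uploaded as a file, and the parser-level http_proxy is used when downloading such URLs. It assumes LLAMA_CLOUD_API_KEY is exported, as the tests require; the document URL and proxy address are placeholder values, not ones taken from the patches.

import asyncio

from llama_parse import LlamaParse

# http_proxy is only consulted when the input is a URL (see _create_job above).
# The proxy address here is a placeholder, not a value from the patches.
parser = LlamaParse(
    result_type="markdown",
    http_proxy="http://proxy.example.com:8080",
)


async def main() -> None:
    # URLs are now passed straight to aload_data(); patch 2 routes them to
    # data["input_url"] in the upload request instead of a multipart file.
    docs = await parser.aload_data("https://www.example.com/report.pdf")
    print(docs[0].text[:200])

    # Local paths and URLs can be mixed in a single call, as
    # test_mixing_input_types in patch 3 exercises.
    mixed = await parser.aload_data(
        ["tests/test_files/attention_is_all_you_need.pdf", "https://www.example.com"]
    )
    print(len(mixed))


asyncio.run(main())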