Skip to content

Commit

Permalink
laxydl: Support resuming retries on truncated downloads.
Browse files Browse the repository at this point in the history
  • Loading branch information
pansapiens committed Oct 24, 2024
1 parent 79e96c0 commit c207277
Show file tree
Hide file tree
Showing 2 changed files with 267 additions and 25 deletions.
82 changes: 57 additions & 25 deletions laxy_downloader/laxy_downloader/downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,7 @@ def download_url(
check_existing_size (bool): Whether to check the size of an existing file before downloading.
remove_existing (bool): Whether to remove an existing file if its size doesn't match the expected size.
chunk_size (int): The size of chunks to use when streaming the download.
max_retries (int): The maximum number of retry attempts for the download.
Returns:
str: The path to the downloaded file.
Expand All @@ -212,15 +213,12 @@ def download_url(
if username is not None and password is not None:
auth = HTTPBasicAuth(username, password)

if headers is None:
headers = {}

headers = headers or {}
filepath = Path(filepath)
directory = tmp_directory or str(filepath.parent)
filename = filepath.name
filepath = str(filepath)

scheme = urlparse(url).scheme
scheme = urlparse(url).scheme.lower()

lock_path = f"{filepath}.lock"
lock = FileLock(lock_path, timeout=60 * 60 * 12) # 12 hours timeout
Expand All @@ -247,46 +245,80 @@ def download_url(
logger.info(
f"Starting download (attempt {attempt + 1}/{max_retries}): {url} ({url_to_cache_key(url)})"
)

# Get size of partial download if it exists
partial_size = (
os.path.getsize(filepath) if os.path.exists(filepath) else None
)

with temporary_file(
mode="wb", dir=directory, prefix=f"{filename}.", suffix=".tmp"
) as tmpfile:
if partial_size is not None:
# Copy existing partial download
shutil.copy2(filepath, tmpfile.name)

if scheme == "ftp":
urllib.request.urlretrieve(url, filename=tmpfile.name)
else:
request_headers = headers.copy()
if partial_size is not None and partial_size > 0:
request_headers["Range"] = f"bytes={partial_size}-"

with closing(
request_with_retries(
"GET", url, stream=True, headers=headers, auth=auth
"GET",
url,
stream=True,
headers=request_headers,
auth=auth,
)
) as download:
download.raise_for_status()
for chunk in download.iter_content(chunk_size=chunk_size):
tmpfile.write(chunk)

tmpfile.flush()
os.fsync(tmpfile.fileno())

# If server ignored our range request and sent full file
if partial_size and download.status_code == 200:
mode = "wb" # Start fresh since we got the full file
else:
mode = "ab" if partial_size else "wb"

with open(tmpfile.name, mode) as f:
for chunk in download.iter_content(
chunk_size=chunk_size
):
f.write(chunk)
f.flush()
os.fsync(f.fileno())

# Check if download is complete
file_size = os.path.getsize(tmpfile.name)
if content_length is not None and file_size == int(content_length):
if content_length is None or file_size == int(content_length):
shutil.move(tmpfile.name, filepath)
return filepath
elif attempt < max_retries - 1:
logger.warning(
f"Downloaded file size ({file_size}) does not match Content-Length ({content_length}). Retrying..."
)
else:
raise Exception(
f"Downloaded file size ({file_size}) does not match Content-Length ({content_length}) after {max_retries} attempts."
)
# Save partial download for next attempt
shutil.move(tmpfile.name, filepath)
if attempt < max_retries - 1:
logger.warning(
f"Downloaded file size ({file_size}) does not match Content-Length ({content_length}). Retrying..."
)
else:
raise Exception(
f"Downloaded file size ({file_size}) does not match Content-Length ({content_length}) after {max_retries} attempts."
)

except Exception as e:
handle_download_exception(e, None, url, cleanup_on_exception, None)
handle_download_exception(
e, getattr(e, "status_code", None), url, cleanup_on_exception, None
)
raise # Re-raise the exception after cleanup
finally:
# Clean up the lock file if it exists
try:
os.remove(lock_path)
except OSError:
pass

return filepath
if os.path.exists(lock_path):
os.unlink(lock_path)
except OSError as e:
logger.warning(f"Failed to remove lock file {lock_path}: {e}")


def get_content_length(url, headers, auth, scheme):
Expand Down
210 changes: 210 additions & 0 deletions laxy_downloader/laxy_downloader/tests/test_downloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def create_mock_response(content, status_code=200, headers=None):
mock_response.iter_content.return_value = [
content[i : i + 1024] for i in range(0, len(content), 1024)
]
mock_response.raise_for_status = mock.Mock()
return mock_response


Expand Down Expand Up @@ -137,5 +138,214 @@ def test_download_url_existing_file(mock_request, temp_dir):
mock_request.assert_called_once_with("HEAD", url, headers={}, auth=None)


@mock.patch("laxy_downloader.downloader.request_with_retries")
def test_download_url_resume_partial_download(mock_request, temp_dir):
    """A truncated first GET is completed by a second, ranged GET.

    The mocked transport answers, in order: a HEAD advertising the full
    length, a GET that yields only the first 20 bytes, and a 206 reply
    carrying the remainder. The test then checks the assembled file and
    that the final request asked for ``bytes=20-``.
    """
    url = "http://example.com/partial_file.txt"
    content = b"This is a complete file that will be downloaded in parts."
    file_path = os.path.join(temp_dir, "partial_file.txt")

    # HEAD reply: only the advertised length matters here.
    head_reply = mock.Mock()
    head_reply.headers = {"content-length": str(len(content))}

    # Initial GET claims the full length but delivers just 20 bytes.
    truncated_reply = create_mock_response(
        content[:20],
        headers={"content-length": str(len(content))},
    )

    # Follow-up GET: 206 Partial Content with everything after byte 19.
    remainder = content[20:]
    ranged_reply = create_mock_response(
        remainder,
        status_code=206,
        headers={
            "content-length": str(len(remainder)),
            "content-range": f"bytes 20-{len(content)-1}/{len(content)}",
        },
    )

    mock_request.side_effect = [head_reply, truncated_reply, ranged_reply]

    result = download_url(url, file_path)

    assert result == file_path
    assert os.path.exists(file_path)
    with open(file_path, "rb") as fh:
        on_disk = fh.read()
    assert on_disk == content

    # One HEAD followed by two GETs, the last of which carries the Range.
    issued = mock_request.call_args_list
    assert len(issued) == 3
    expected_args = [("HEAD", url), ("GET", url), ("GET", url)]
    for recorded, expected in zip(issued, expected_args):
        assert recorded[0] == expected
    assert issued[2][1]["headers"]["Range"] == "bytes=20-"


@mock.patch("laxy_downloader.downloader.request_with_retries")
def test_download_url_multiple_resume_attempts(mock_request, temp_dir):
    """A download truncated twice is completed across three GET attempts.

    The mocked transport answers a HEAD with the full length, then three
    GETs delivering bytes 0-9, 10-24 and 25-end respectively (the latter
    two as 206 Partial Content). The test verifies the reassembled file
    and that each resume asked for the correct byte offset.

    ``max_retries=3`` is passed explicitly so the test does not depend on
    the function's default retry budget.
    """
    url = "http://example.com/partial_file.txt"
    content = b"This is a file that will need multiple resume attempts to complete."
    file_path = os.path.join(temp_dir, "partial_file.txt")

    # Mock HEAD request
    mock_head_response = mock.Mock()
    mock_head_response.headers = {"content-length": str(len(content))}

    responses = [
        # HEAD request
        mock_head_response,
        # First attempt - gets first 10 bytes
        create_mock_response(
            content[:10],
            headers={"content-length": str(len(content))},
        ),
        # Second attempt - gets next 15 bytes with range request
        create_mock_response(
            content[10:25],
            status_code=206,
            headers={
                "content-length": str(len(content[10:25])),
                "content-range": f"bytes 10-24/{len(content)}",
            },
        ),
        # Final attempt - gets remaining bytes with range request
        create_mock_response(
            content[25:],
            status_code=206,
            headers={
                "content-length": str(len(content[25:])),
                "content-range": f"bytes 25-{len(content)-1}/{len(content)}",
            },
        ),
    ]

    mock_request.side_effect = responses

    # Previously the test stopped here and therefore verified nothing.
    result = download_url(url, file_path, max_retries=3)

    assert result == file_path
    assert os.path.exists(file_path)
    with open(file_path, "rb") as f:
        assert f.read() == content

    # One HEAD, then three GETs; each resume starts at the size of the
    # partial file left behind by the previous attempt.
    calls = mock_request.call_args_list
    assert len(calls) == 4
    assert calls[0][0] == ("HEAD", url)
    for get_call in calls[1:]:
        assert get_call[0] == ("GET", url)
    assert "Range" not in calls[1][1]["headers"]
    assert calls[2][1]["headers"]["Range"] == "bytes=10-"
    assert calls[3][1]["headers"]["Range"] == "bytes=25-"


@mock.patch("laxy_downloader.downloader.request_with_retries")
def test_download_url_server_ignores_range(mock_request, temp_dir):
    """Test behavior when server ignores range requests and sends full file.

    After a truncated first GET, the retry sends a ``Range`` header but the
    mocked server replies 200 with the entire body. The downloader is
    expected to discard the partial data and keep only the full body, so
    the final file must equal ``content`` exactly (not partial + full).
    """
    url = "http://example.com/partial_file.txt"
    content = b"This is a file where the server ignores range requests."
    file_path = os.path.join(temp_dir, "partial_file.txt")

    # Mock HEAD request
    mock_head_response = mock.Mock()
    mock_head_response.headers = {"content-length": str(len(content))}

    # First attempt - partial download
    mock_partial_response = create_mock_response(
        content[:20],
        headers={"content-length": str(len(content))},
    )

    # Second attempt - server advertises range support via Accept-Ranges
    # yet still answers 200 with the whole file.
    mock_full_response = create_mock_response(
        content,
        status_code=200,
        headers={
            "content-length": str(len(content)),
            "Accept-Ranges": "bytes",
        },
    )

    # Exactly three responses: an unexpected extra retry would exhaust the
    # side_effect list and fail loudly instead of being silently tolerated.
    mock_request.side_effect = [
        mock_head_response,
        mock_partial_response,
        mock_full_response,
    ]

    result = download_url(url, file_path)

    assert result == file_path
    assert os.path.exists(file_path)
    with open(file_path, "rb") as f:
        assert f.read() == content

    # Verify the sequence of requests: HEAD, plain GET, ranged GET.
    calls = mock_request.call_args_list
    assert len(calls) == 3
    assert calls[0][0] == ("HEAD", url)
    assert calls[1][0] == ("GET", url)
    assert calls[2][0] == ("GET", url)
    # The retry must have asked to resume from the 20 bytes already saved.
    assert calls[2][1]["headers"]["Range"] == "bytes=20-"


@mock.patch("laxy_downloader.downloader.request_with_retries")
def test_download_url_no_content_length(mock_request, temp_dir):
    """A download succeeds when neither HEAD nor GET reports a length.

    Both mocked replies carry empty header dicts; the test only checks
    that the body delivered by the GET ends up at the returned path.
    """
    url = "http://example.com/no_length.txt"
    content = b"This file has no Content-Length header."
    file_path = os.path.join(temp_dir, "no_length.txt")

    # HEAD reply without a content-length entry.
    head_reply = mock.Mock()
    head_reply.headers = {}
    head_reply.raise_for_status = mock.Mock()

    # GET reply, likewise without a content-length entry.
    get_reply = create_mock_response(content, headers={})

    mock_request.side_effect = [head_reply, get_reply]

    result = download_url(url, file_path)

    assert result == file_path
    assert os.path.exists(file_path)
    with open(file_path, "rb") as fh:
        written = fh.read()
    assert written == content


@mock.patch("laxy_downloader.downloader.request_with_retries")
def test_download_url_lock_file_cleanup(mock_request, temp_dir):
    """Test that lock files are properly cleaned up after download completion or failure."""
    url = "http://example.com/test_file.txt"
    content = b"Test content for lock file cleanup"
    file_path = os.path.join(temp_dir, "test_file.txt")
    lock_path = f"{file_path}.lock"

    # Phase 1: a normal, successful download. The same mocked reply is
    # returned for every request made through the patched transport.
    mock_request.return_value = create_mock_response(
        content, headers={"content-length": str(len(content))}
    )

    result = download_url(url, file_path)
    assert result == file_path
    assert os.path.exists(file_path)
    assert not os.path.exists(lock_path), "Lock file should be removed after successful download"

    # Phases 2 and 3: the transport raises, first an ordinary request
    # error and then a KeyboardInterrupt; the lock must be gone each time.
    failure_cases = [
        (
            requests.exceptions.RequestException("Download failed"),
            "Lock file should be removed after failed download",
        ),
        (
            KeyboardInterrupt(),
            "Lock file should be removed after keyboard interrupt",
        ),
    ]
    for raised, message in failure_cases:
        mock_request.side_effect = raised

        with pytest.raises(type(raised)):
            download_url(url, file_path)

        assert not os.path.exists(lock_path), message


if __name__ == "__main__":
    # Propagate pytest's exit code so running this file as a script fails
    # (non-zero status) when any test fails, instead of always exiting 0.
    raise SystemExit(pytest.main([__file__]))

0 comments on commit c207277

Please sign in to comment.