Skip to content

Commit

Permalink
perf(page): add max limit streaming chunk size
Browse files: browse the repository at this point in the history
  • Loading branch information
j-mendez committed Dec 30, 2024
1 parent e5e0638 commit 264e18c
Show file tree
Hide file tree
Showing 9 changed files with 17 additions and 24 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ A web crawler and scraper, building blocks for data curation workloads.
- Subscriptions
- Smart Mode
- Anti-Bot mitigation
- Disk persistence
- Privacy and Efficiency through Ad, Analytics, and Custom Tiered Network Blocking
- Blacklisting, Whitelisting, and Budgeting Depth
- Dynamic AI Prompt Scripting Headless with Step Caching
Expand Down
2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.22.14"
version = "2.22.16"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
16 changes: 4 additions & 12 deletions spider/src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,8 @@ lazy_static! {

/// The chunk size for the rewriter. Can be adjusted using the env var "SPIDER_STREAMING_CHUNK_SIZE".
pub(crate) static ref STREAMING_CHUNK_SIZE: usize = {
let default_streaming_chunk_size: usize = 8192 * num_cpus::get_physical();
let min_streaming_chunk_size: usize = default_streaming_chunk_size / 4;
let default_streaming_chunk_size: usize = 8192 * num_cpus::get_physical().min(64);
let min_streaming_chunk_size: usize = default_streaming_chunk_size * 2 / 3;

std::env::var("SPIDER_STREAMING_CHUNK_SIZE")
.ok()
Expand Down Expand Up @@ -405,6 +405,7 @@ pub fn validate_empty(content: &Option<Box<Bytes>>, is_success: bool) -> bool {
}

/// Extract a specific type of error from a chain of errors.
#[cfg(not(feature = "decentralized"))]
fn extract_specific_error<'a, T: std::error::Error + 'static>(
error: &'a (dyn std::error::Error + 'static),
) -> Option<&'a T> {
Expand All @@ -419,6 +420,7 @@ fn extract_specific_error<'a, T: std::error::Error + 'static>(
}

/// Determine whether the error is an HTTP/2 GOAWAY and the request should be retried.
#[cfg(not(feature = "decentralized"))]
fn should_attempt_retry(error: &(dyn std::error::Error + 'static)) -> bool {
if let Some(e) = extract_specific_error::<h2::Error>(error) {
if e.is_go_away() && e.is_remote() && e.reason() == Some(h2::Reason::NO_ERROR) {
Expand Down Expand Up @@ -2116,16 +2118,6 @@ impl Page {
fn abs_path(&self, href: &str) -> Option<Url> {
self.base.as_ref().map(|b| convert_abs_path(b, href))
}

/// Convert a URL to its absolute path without any fragments or params. [Currently unused in the worker; by default everything is returned.]
#[inline(never)]
#[cfg(feature = "decentralized")]
fn abs_path(&self, href: &str) -> Option<Url> {
match Url::parse(&href) {
Ok(u) => Some(convert_abs_path(&u, href)),
_ => None,
}
}
}

/// Get the content with proper encoding. Pass in a proper encoding label like SHIFT_JIS.
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.22.14"
version = "2.22.16"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.22.14"
version = "2.22.16"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.22.14"
version = "2.22.16"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.22.14"
version = "2.22.16"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.22.14"
version = "2.22.16"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit 264e18c

Please sign in to comment.