diff --git a/Cargo.lock b/Cargo.lock index 970e48d44..a0e0f9013 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5367,7 +5367,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.22.14" +version = "2.22.16" dependencies = [ "ahash", "aho-corasick", @@ -5428,7 +5428,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.22.14" +version = "2.22.16" dependencies = [ "adblock", "aho-corasick", @@ -5518,7 +5518,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.22.14" +version = "2.22.16" dependencies = [ "clap", "env_logger", @@ -5561,7 +5561,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.22.14" +version = "2.22.16" dependencies = [ "aho-corasick", "fast_html2md", @@ -5584,7 +5584,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.22.14" +version = "2.22.16" dependencies = [ "indexmap 1.9.3", "serde", @@ -5597,7 +5597,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.22.14" +version = "2.22.16" dependencies = [ "env_logger", "lazy_static", diff --git a/README.md b/README.md index c2667d852..7bf21f0f2 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ A web crawler and scraper, building blocks for data curation workloads. - Subscriptions - Smart Mode - Anti-Bot mitigation +- Disk persistence - Privacy and Efficiency through Ad, Analytics, and Custom Tiered Network Blocking - Blacklisting, Whitelisting, and Budgeting Depth - Dynamic AI Prompt Scripting Headless with Step Caching diff --git a/spider/Cargo.toml b/spider/Cargo.toml index d30c15f75..55f666557 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.22.14" +version = "2.22.16" authors = [ "j-mendez " ] diff --git a/spider/src/page.rs b/spider/src/page.rs index 159e18716..7acc72ea2 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -94,8 +94,8 @@ lazy_static! { /// The chunk size for the rewriter. Can be adjusted using the env var "SPIDER_STREAMING_CHUNK_SIZE". pub(crate) static ref STREAMING_CHUNK_SIZE: usize = { - let default_streaming_chunk_size: usize = 8192 * num_cpus::get_physical(); - let min_streaming_chunk_size: usize = default_streaming_chunk_size / 4; + let default_streaming_chunk_size: usize = 8192 * num_cpus::get_physical().min(64); + let min_streaming_chunk_size: usize = default_streaming_chunk_size * 2 / 3; std::env::var("SPIDER_STREAMING_CHUNK_SIZE") .ok() @@ -405,6 +405,7 @@ pub fn validate_empty(content: &Option>, is_success: bool) -> bool { } /// Extract a specific type of error from a chain of errors. +#[cfg(not(feature = "decentralized"))] fn extract_specific_error<'a, T: std::error::Error + 'static>( error: &'a (dyn std::error::Error + 'static), ) -> Option<&'a T> { @@ -419,6 +420,7 @@ fn extract_specific_error<'a, T: std::error::Error + 'static>( } /// Determine if the response is goaway and should retry. +#[cfg(not(feature = "decentralized"))] fn should_attempt_retry(error: &(dyn std::error::Error + 'static)) -> bool { if let Some(e) = extract_specific_error::(error) { if e.is_go_away() && e.is_remote() && e.reason() == Some(h2::Reason::NO_ERROR) { @@ -2116,16 +2118,6 @@ impl Page { fn abs_path(&self, href: &str) -> Option { self.base.as_ref().map(|b| convert_abs_path(b, href)) } - - /// Convert a URL to its absolute path without any fragments or params. [unused in the worker atm by default all is returned] - #[inline(never)] - #[cfg(feature = "decentralized")] - fn abs_path(&self, href: &str) -> Option { - match Url::parse(&href) { - Ok(u) => Some(convert_abs_path(&u, href)), - _ => None, - } - } } /// Get the content with proper encoding. Pass in a proper encoding label like SHIFT_JIS. diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index d5bad3419..b177e2e27 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.22.14" +version = "2.22.16" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index d8d9577e8..da3b1f1de 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.22.14" +version = "2.22.16" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index f6df6f4a0..7c9135adc 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.22.14" +version = "2.22.16" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index fbf6e93ca..e6e1a78e7 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_utils" -version = "2.22.14" +version = "2.22.16" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 59adb305d..4715d9e52 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.22.14" +version = "2.22.16" authors = [ "j-mendez " ]