From 2b393122d98f01ef06b90b515a8c149965ba2f55 Mon Sep 17 00:00:00 2001 From: j-mendez Date: Sat, 4 Jan 2025 20:39:28 -0500 Subject: [PATCH] feat(page): add blocked_crawl [#242] --- Cargo.lock | 12 ++++++------ spider/Cargo.toml | 2 +- spider/src/page.rs | 3 +++ spider/src/website.rs | 26 ++++++++++++++++++++++---- spider_chrome/Cargo.toml | 2 +- spider_cli/Cargo.toml | 2 +- spider_transformations/Cargo.toml | 2 +- spider_utils/Cargo.toml | 2 +- spider_worker/Cargo.toml | 2 +- 9 files changed, 37 insertions(+), 16 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 80e2218e4..27c99a5c3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5400,7 +5400,7 @@ dependencies = [ [[package]] name = "spider" -version = "2.24.0" +version = "2.24.1" dependencies = [ "ahash", "aho-corasick", @@ -5464,7 +5464,7 @@ dependencies = [ [[package]] name = "spider_chrome" -version = "2.24.0" +version = "2.24.1" dependencies = [ "adblock", "aho-corasick", @@ -5554,7 +5554,7 @@ dependencies = [ [[package]] name = "spider_cli" -version = "2.24.0" +version = "2.24.1" dependencies = [ "clap", "env_logger", @@ -5597,7 +5597,7 @@ dependencies = [ [[package]] name = "spider_transformations" -version = "2.24.0" +version = "2.24.1" dependencies = [ "aho-corasick", "fast_html2md", @@ -5620,7 +5620,7 @@ dependencies = [ [[package]] name = "spider_utils" -version = "2.24.0" +version = "2.24.1" dependencies = [ "indexmap 1.9.3", "serde", @@ -5633,7 +5633,7 @@ dependencies = [ [[package]] name = "spider_worker" -version = "2.24.0" +version = "2.24.1" dependencies = [ "env_logger", "lazy_static", diff --git a/spider/Cargo.toml b/spider/Cargo.toml index 2129c25d5..38e0263bf 100644 --- a/spider/Cargo.toml +++ b/spider/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider" -version = "2.24.0" +version = "2.24.1" authors = [ "j-mendez " ] diff --git a/spider/src/page.rs b/spider/src/page.rs index cda93614f..896708910 100644 --- a/spider/src/page.rs +++ b/spider/src/page.rs @@ -177,6 +177,8 @@ pub struct Page { 
pub waf_check: bool, /// The total byte transferred for the page. Mainly used for chrome events. Inspect the content for bytes when using http instead. pub bytes_transferred: Option, + /// The page was blocked from crawling, usually by website::on_should_crawl_callback. + pub blocked_crawl: bool, } /// Represent a page visited. @@ -493,6 +495,7 @@ pub fn build(url: &str, res: PageResponse) -> Page { should_retry, waf_check: res.waf_check, bytes_transferred: res.bytes_transferred, + blocked_crawl: false, } } diff --git a/spider/src/website.rs b/spider/src/website.rs index 4fc5362b0..26d86d251 100644 --- a/spider/src/website.rs +++ b/spider/src/website.rs @@ -1626,6 +1626,7 @@ impl Website { if let Some(cb) = self.on_should_crawl_callback { if !cb(&page) { + page.blocked_crawl = true; channel_send_page(&self.channel, page, &self.channel_guard); return Default::default(); } @@ -1735,6 +1736,7 @@ impl Website { if let Some(cb) = self.on_should_crawl_callback { if !cb(&page) { + page.blocked_crawl = true; channel_send_page(&self.channel, page, &self.channel_guard); return Default::default(); } @@ -1931,6 +1933,7 @@ impl Website { if let Some(cb) = self.on_should_crawl_callback { if !cb(&page) { + page.blocked_crawl = true; channel_send_page(&self.channel, page, &self.channel_guard); return Default::default(); } @@ -2582,6 +2585,7 @@ impl Website { if let Some(cb) = on_should_crawl_callback { if !cb(&page) { + page.blocked_crawl = true; channel_send_page(&shared.2, page, &shared.4); drop(permit); return Default::default() @@ -2889,14 +2893,15 @@ impl Website { page.base = prev_domain; - if let Some(cb) = on_should_crawl_callback { + if let Some(cb) = on_should_crawl_callback { if !cb(&page) { + page.blocked_crawl = true; channel_send_page(&shared.2, page, &shared.4); drop(permit); return Default::default() } } - + channel_send_page( &shared.2, page, &shared.4, ); @@ -3335,14 +3340,15 @@ impl Website { page.base = prev_domain; - if let Some(cb) = 
on_should_crawl_callback { + if let Some(cb) = on_should_crawl_callback { if !cb(&page) { + page.blocked_crawl = true; channel_send_page(&shared.2, page, &shared.3); drop(permit); return Default::default() } } - + channel_send_page(&shared.2, page, &shared.3); drop(permit); @@ -4226,6 +4232,18 @@ impl Website { self } + /// Use a callback to determine if a page should be ignored. Return false to ensure that the discovered links are not crawled. + pub fn with_on_should_crawl_callback( + &mut self, + on_should_crawl_callback: Option<fn(&Page) -> bool>, + ) -> &mut Self { + match on_should_crawl_callback { + Some(callback) => self.on_should_crawl_callback = Some(callback), + _ => self.on_should_crawl_callback = None, + }; + self + } + /// Cookie string to use in request. This does nothing without the `cookies` flag enabled. pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self { self.configuration.with_cookies(cookie_str); diff --git a/spider_chrome/Cargo.toml b/spider_chrome/Cargo.toml index 18d9f219c..e08dd2972 100644 --- a/spider_chrome/Cargo.toml +++ b/spider_chrome/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_chrome" -version = "2.24.0" +version = "2.24.1" rust-version = "1.70" authors = [ "j-mendez " diff --git a/spider_cli/Cargo.toml b/spider_cli/Cargo.toml index c9b846fbf..41e2703ba 100644 --- a/spider_cli/Cargo.toml +++ b/spider_cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_cli" -version = "2.24.0" +version = "2.24.1" authors = [ "j-mendez " ] diff --git a/spider_transformations/Cargo.toml b/spider_transformations/Cargo.toml index b5fbdc679..8b02ba7f7 100644 --- a/spider_transformations/Cargo.toml +++ b/spider_transformations/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_transformations" -version = "2.24.0" +version = "2.24.1" authors = [ "j-mendez " ] diff --git a/spider_utils/Cargo.toml b/spider_utils/Cargo.toml index dbc55a4f1..66f30061e 100644 --- a/spider_utils/Cargo.toml +++ b/spider_utils/Cargo.toml @@ -1,6 +1,6 @@ [package] name = 
"spider_utils" -version = "2.24.0" +version = "2.24.1" authors = [ "j-mendez " ] diff --git a/spider_worker/Cargo.toml b/spider_worker/Cargo.toml index 1a6ec64b9..85b83134d 100644 --- a/spider_worker/Cargo.toml +++ b/spider_worker/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "spider_worker" -version = "2.24.0" +version = "2.24.1" authors = [ "j-mendez " ]