Skip to content

Commit

Permalink
feat(page): add blocked_crawl [#242]
Browse files Browse the repository at this point in the history
  • Loading branch information
j-mendez committed Jan 5, 2025
1 parent 31113dd commit 2b39312
Show file tree
Hide file tree
Showing 9 changed files with 37 additions and 16 deletions.
12 changes: 6 additions & 6 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion spider/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider"
version = "2.24.0"
version = "2.24.1"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
3 changes: 3 additions & 0 deletions spider/src/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,8 @@ pub struct Page {
pub waf_check: bool,
/// The total byte transferred for the page. Mainly used for chrome events. Inspect the content for bytes when using http instead.
pub bytes_transferred: Option<f64>,
/// The page was blocked from crawling, usually when using website::on_should_crawl_callback
pub blocked_crawl: bool,
}

/// Represent a page visited.
Expand Down Expand Up @@ -493,6 +495,7 @@ pub fn build(url: &str, res: PageResponse) -> Page {
should_retry,
waf_check: res.waf_check,
bytes_transferred: res.bytes_transferred,
blocked_crawl: false,
}
}

Expand Down
26 changes: 22 additions & 4 deletions spider/src/website.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1626,6 +1626,7 @@ impl Website {

if let Some(cb) = self.on_should_crawl_callback {
if !cb(&page) {
page.blocked_crawl = true;
channel_send_page(&self.channel, page, &self.channel_guard);
return Default::default();
}
Expand Down Expand Up @@ -1735,6 +1736,7 @@ impl Website {

if let Some(cb) = self.on_should_crawl_callback {
if !cb(&page) {
page.blocked_crawl = true;
channel_send_page(&self.channel, page, &self.channel_guard);
return Default::default();
}
Expand Down Expand Up @@ -1931,6 +1933,7 @@ impl Website {

if let Some(cb) = self.on_should_crawl_callback {
if !cb(&page) {
page.blocked_crawl = true;
channel_send_page(&self.channel, page, &self.channel_guard);
return Default::default();
}
Expand Down Expand Up @@ -2582,6 +2585,7 @@ impl Website {

if let Some(cb) = on_should_crawl_callback {
if !cb(&page) {
page.blocked_crawl = true;
channel_send_page(&shared.2, page, &shared.4);
drop(permit);
return Default::default()
Expand Down Expand Up @@ -2889,14 +2893,15 @@ impl Website {

page.base = prev_domain;

if let Some(cb) = on_should_crawl_callback {
if let Some(cb) = on_should_crawl_callback {
if !cb(&page) {
page.blocked_crawl = true;
channel_send_page(&shared.2, page, &shared.4);
drop(permit);
return Default::default()
}
}

channel_send_page(
&shared.2, page, &shared.4,
);
Expand Down Expand Up @@ -3335,14 +3340,15 @@ impl Website {
page.base = prev_domain;


if let Some(cb) = on_should_crawl_callback {
if let Some(cb) = on_should_crawl_callback {
if !cb(&page) {
page.blocked_crawl = true;
channel_send_page(&shared.2, page, &shared.3);
drop(permit);
return Default::default()
}
}


channel_send_page(&shared.2, page, &shared.3);
drop(permit);
Expand Down Expand Up @@ -4226,6 +4232,18 @@ impl Website {
self
}

/// Use a callback to determine if a page should be ignored. Return `false`
/// from the callback to ensure that the discovered links of that page are
/// not crawled. Passing `None` clears any previously configured callback.
pub fn with_on_should_crawl_callback(
    &mut self,
    on_should_crawl_callback: Option<fn(&Page) -> bool>,
) -> &mut Self {
    // The previous `match` destructured the Option only to rebuild the
    // identical Option in both arms; a direct assignment covers the
    // Some and None cases equivalently.
    self.on_should_crawl_callback = on_should_crawl_callback;
    self
}

/// Cookie string to use in request. This does nothing without the `cookies` flag enabled.
pub fn with_cookies(&mut self, cookie_str: &str) -> &mut Self {
self.configuration.with_cookies(cookie_str);
Expand Down
2 changes: 1 addition & 1 deletion spider_chrome/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_chrome"
version = "2.24.0"
version = "2.24.1"
rust-version = "1.70"
authors = [
"j-mendez <[email protected]>"
Expand Down
2 changes: 1 addition & 1 deletion spider_cli/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_cli"
version = "2.24.0"
version = "2.24.1"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_transformations/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_transformations"
version = "2.24.0"
version = "2.24.1"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_utils/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_utils"
version = "2.24.0"
version = "2.24.1"
authors = [
"j-mendez <[email protected]>"
]
Expand Down
2 changes: 1 addition & 1 deletion spider_worker/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "spider_worker"
version = "2.24.0"
version = "2.24.1"
authors = [
"j-mendez <[email protected]>"
]
Expand Down

0 comments on commit 2b39312

Please sign in to comment.