Skip to content

Commit

Permalink
fix: deprecation messages in CheerioCrawler and ProxyConfiguration (#1129)
Browse files Browse the repository at this point in the history

* progress commit

* fix: deprecation messages
  • Loading branch information
mnmkng authored Aug 27, 2021
1 parent 94a71bf commit 7b7a971
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 33 deletions.
68 changes: 39 additions & 29 deletions src/crawlers/cheerio_crawler.js
Original file line number Diff line number Diff line change
Expand Up @@ -473,7 +473,10 @@ class CheerioCrawler extends BasicCrawler {
* @type {Array<Hook>}
* @ignore
* */
this.postNavigationHooks = postNavigationHooks;
this.postNavigationHooks = [
({ request, response }) => this._abortDownloadOfBody(request, response),
...postNavigationHooks,
];
/**
* @type {RequestAsBrowserOptions}
* @ignore
Expand Down Expand Up @@ -639,9 +642,6 @@ class CheerioCrawler extends BasicCrawler {
* @internal
*/
async _requestFunction({ request, session, proxyUrl, requestAsBrowserOptions }) {
// Using the streaming API of Request to be able to
// handle the response based on headers received.

if (this.useSessionPool) {
const { headers } = request;
headers.Cookie = session.getCookieString(request.url);
Expand Down Expand Up @@ -712,46 +712,35 @@ class CheerioCrawler extends BasicCrawler {
* @internal
*/
_getRequestOptions(request, session, proxyUrl, requestAsBrowserOptions) {
const mandatoryRequestOptions = {
const requestOptions = {
url: request.url,
method: request.method,
headers: { ...request.headers, ...requestAsBrowserOptions.headers },
ignoreSslErrors: this.ignoreSslErrors,
proxyUrl,
timeoutSecs: this.requestTimeoutMillis / 1000,
timeout: { request: this.requestTimeoutMillis },
sessionToken: session,
...requestAsBrowserOptions,
stream: true,
abortFunction: (res) => {
const { statusCode } = res;
const { type } = parseContentTypeFromResponse(res);

if (statusCode === 406) {
request.noRetry = true;
throw new Error(`Resource ${request.url} is not available in HTML format. Skipping resource.`);
}

if (!this.supportedMimeTypes.has(type) && statusCode < 500) {
request.noRetry = true;
throw new Error(`Resource ${request.url} served Content-Type ${type}, `
+ `but only ${Array.from(this.supportedMimeTypes).join(', ')} are allowed. Skipping resource.`);
}

return false;
headers: { ...request.headers, ...requestAsBrowserOptions.headers },
https: {
...requestAsBrowserOptions.https,
rejectUnauthorized: !this.ignoreSslErrors,
},
isStream: true,
};

// TODO this is incorrect, the check for man in the middle needs to be done
// on individual proxy level, not on the `proxyConfiguration` level,
// because users can use normal + MITM proxies in a single configuration.
// Disable SSL verification for MITM proxies
if (this.proxyConfiguration && this.proxyConfiguration.isManInTheMiddle) {
mandatoryRequestOptions.https = {
...mandatoryRequestOptions.https,
requestOptions.https = {
...requestOptions.https,
rejectUnauthorized: false,
};
}

if (/PATCH|POST|PUT/.test(request.method)) mandatoryRequestOptions.payload = request.payload;
if (/PATCH|POST|PUT/.test(request.method)) requestOptions.body = request.payload;

return { ...this.requestOptions, ...mandatoryRequestOptions };
return requestOptions;
}

/**
Expand Down Expand Up @@ -860,6 +849,27 @@ class CheerioCrawler extends BasicCrawler {
if (session) session.markBad();
throw new Error(`request timed out after ${this.handlePageTimeoutMillis / 1000} seconds.`);
}

/**
* @param {Request} request
* @param {IncomingMessage|Readable} response
* @private
*/
_abortDownloadOfBody(request, response) {
const { statusCode } = response;
const { type } = parseContentTypeFromResponse(response);

if (statusCode === 406) {
request.noRetry = true;
throw new Error(`Resource ${request.url} is not available in the format requested by the Accept header. Skipping resource.`);
}

if (!this.supportedMimeTypes.has(type) && statusCode < 500) {
request.noRetry = true;
throw new Error(`Resource ${request.url} served Content-Type ${type}, `
+ `but only ${Array.from(this.supportedMimeTypes).join(', ')} are allowed. Skipping resource.`);
}
}
}

export default CheerioCrawler;
Expand Down
6 changes: 3 additions & 3 deletions src/proxy_configuration.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import { Configuration } from './configuration';
const PROTOCOL = 'http';
// https://docs.apify.com/proxy/datacenter-proxy#username-parameters
const MAX_SESSION_ID_LENGTH = 50;
const CHECK_ACCESS_REQUEST_TIMEOUT_SECS = 4;
const CHECK_ACCESS_REQUEST_TIMEOUT_MILLIS = 4_000;
const CHECK_ACCESS_MAX_ATTEMPTS = 2;

/**
Expand Down Expand Up @@ -371,8 +371,8 @@ export class ProxyConfiguration {
const requestOpts = {
url: `${this.config.get('proxyStatusUrl')}/?format=json`,
proxyUrl: this.newUrl(),
json: true,
timeoutSecs: CHECK_ACCESS_REQUEST_TIMEOUT_SECS,
timeout: { request: CHECK_ACCESS_REQUEST_TIMEOUT_MILLIS },
responseType: 'json',
};
for (let attempt = 1; attempt <= CHECK_ACCESS_MAX_ATTEMPTS; attempt++) {
try {
Expand Down
4 changes: 3 additions & 1 deletion test/crawlers/cheerio_crawler.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,9 @@ describe('CheerioCrawler', () => {

expect(handlePageInvocationCount).toBe(0);
expect(errorMessages).toHaveLength(4);
errorMessages.forEach((msg) => expect(msg).toMatch('is not available in HTML format. Skipping resource.'));
errorMessages.forEach((msg) => {
expect(msg).toMatch('is not available in the format requested by the Accept header. Skipping resource.');
});
});
});
});
Expand Down

0 comments on commit 7b7a971

Please sign in to comment.