From 063774beaea1197349cca3fd5490463dc679a97d Mon Sep 17 00:00:00 2001 From: Ondra Urban Date: Sat, 6 Jun 2020 15:44:28 +0200 Subject: [PATCH] Fix and improve proxyConf, update changelog --- CHANGELOG.md | 14 +++++++++++--- package.json | 2 +- src/crawlers/puppeteer_crawler.js | 3 +++ src/proxy_configuration.js | 16 ++++++++-------- src/puppeteer_pool.js | 20 ++++++++++++-------- src/request_list.js | 2 +- test/proxy_configuration.test.js | 4 ++-- 7 files changed, 38 insertions(+), 23 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2a2b97a0c43a..be012ebd137a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,10 +6,12 @@ for more information and examples. First large change is a redesigned proxy configuration. `Cheerio` and `Puppeteer` crawlers now accept a `proxyConfiguration` parameter, which is an instance of `ProxyConfiguration`. -This class now exclusively manages both Apify Proxy and custom proxies. +This class now exclusively manages both Apify Proxy and custom proxies. Visit the new +[proxy management guide](https://sdk.apify.com/docs/guides/proxy-management). We also removed `Apify.utils.getRandomUserAgent()` as it was no longer effective -in avoiding bot detection. +in avoiding bot detection and changed the default values for empty properties in +`Request` instances. - **BREAKING:** Removed `Apify.getApifyProxyUrl()`. To get an Apify Proxy url, use `proxyConfiguration.newUrl([sessionId])`. @@ -18,17 +20,23 @@ in avoiding bot detection. in `requestAsBrowser` and `Apify.launchPuppeteer`. - **BREAKING:** Removed `Apify.utils.getRandomUserAgent()` as it was no longer effective in avoiding bot detection. +- **BREAKING:** `Request` instances no longer initialize empty properties with `null`, + which means that: + - empty `errorMessages` are now represented by `[]`, and + - empty `loadedUrl`, `payload` and `handledAt` are `undefined`. - Add `Apify.createProxyConfiguration()` `async` function to create `ProxyConfiguration` instances. 
`ProxyConfiguration` itself is not exposed. - Add `proxyConfiguration` to `CheerioCrawlerOptions` and `PuppeteerCrawlerOptions`. - Add `proxyInfo` to `CheerioHandlePageInputs` and `PuppeteerHandlePageInputs`. You can use this object to retrieve information about the currently used proxy - in `Puppeteer` and `Cheerio` crawlers. + in `Puppeteer` and `Cheerio` crawlers. +- Add click buttons and scroll up options to `Apify.utils.puppeteer.infiniteScroll()`. - Fixed a bug where intercepted requests would never continue. - Fixed a bug where `Apify.utils.requestAsBrowser()` would get into redirect loops. - Fix `Apify.utils.getMemoryInfo()` crashing the process on AWS Lambda and on systems running in Docker without memory cgroups enabled. +- Update Puppeteer to 3.3.0. 0.20.4 / 2020-05-11 diff --git a/package.json b/package.json index 731cb8d2297f..e306eec39b7a 100644 --- a/package.json +++ b/package.json @@ -51,7 +51,7 @@ "lint:fix": "eslint ./src ./test --ext .js,.jsx --fix" }, "dependencies": { - "@apify/http-request": "^2.0.0", + "@apify/http-request": "^2.0.1", "@apify/ps-tree": "^1.1.3", "@types/cheerio": "^0.22.18", "@types/node": "^12", diff --git a/src/crawlers/puppeteer_crawler.js b/src/crawlers/puppeteer_crawler.js index e08bf309826d..ddefc86da00b 100644 --- a/src/crawlers/puppeteer_crawler.js +++ b/src/crawlers/puppeteer_crawler.js @@ -1,4 +1,5 @@ import { checkParamOrThrow } from 'apify-client/build/utils'; +import { checkParamPrototypeOrThrow } from 'apify-shared/utilities'; import * as _ from 'underscore'; import PuppeteerPool from '../puppeteer_pool'; // eslint-disable-line import/no-duplicates import { BASIC_CRAWLER_TIMEOUT_MULTIPLIER } from '../constants'; @@ -20,6 +21,7 @@ import AutoscaledPool, { AutoscaledPoolOptions } from '../autoscaling/autoscaled import { LaunchPuppeteerOptions } from '../puppeteer'; // eslint-disable-line no-unused-vars,import/named import { Session } from '../session_pool/session'; // eslint-disable-line no-unused-vars import { 
SessionPoolOptions } from '../session_pool/session_pool'; +import { ProxyConfiguration } from '../proxy_configuration'; // eslint-enable-line import/no-duplicates /** @@ -250,6 +252,7 @@ class PuppeteerCrawler { checkParamOrThrow(useSessionPool, 'options.useSessionPool', 'Boolean'); checkParamOrThrow(sessionPoolOptions, 'options.sessionPoolOptions', 'Object'); checkParamOrThrow(persistCookiesPerSession, 'options.persistCookiesPerSession', 'Boolean'); + checkParamPrototypeOrThrow(proxyConfiguration, 'options.proxyConfiguration', ProxyConfiguration, 'ProxyConfiguration', true); if (proxyConfiguration && (launchPuppeteerOptions && launchPuppeteerOptions.proxyUrl)) { throw new Error('It is not possible to combine "options.proxyConfiguration" together with ' diff --git a/src/proxy_configuration.js b/src/proxy_configuration.js index 68342b1e2c35..c53e95d61b02 100644 --- a/src/proxy_configuration.js +++ b/src/proxy_configuration.js @@ -352,14 +352,14 @@ export class ProxyConfiguration { * @ignore */ _callNewUrlFunction(sessionId) { - const urlToReturn = this.newUrlFunction(sessionId); + let proxyUrl; try { - // eslint-disable-next-line no-new - new URL(urlToReturn); + proxyUrl = this.newUrlFunction(sessionId); + new URL(proxyUrl); // eslint-disable-line no-new } catch (err) { - this._throwNewUrlFunctionInvalidReturn(urlToReturn); + this._throwNewUrlFunctionInvalid(err); } - return urlToReturn; + return proxyUrl; } /** @@ -410,11 +410,11 @@ export class ProxyConfiguration { /** * Throws invalid custom newUrlFunction return - * @param {string} url + * @param {Error} err * @ignore */ - _throwNewUrlFunctionInvalidReturn(url) { - throw new Error(`The return value "${url}" of provided "options.newUrlFunction" is not a valid URL.`); + _throwNewUrlFunctionInvalid(err) { + throw new Error(`The provided newUrlFunction did not return a valid URL.\nCause: ${err.message}`); } /** diff --git a/src/puppeteer_pool.js b/src/puppeteer_pool.js index 6e90ec2cc644..be4f085596aa 100644 --- 
a/src/puppeteer_pool.js +++ b/src/puppeteer_pool.js @@ -54,16 +54,20 @@ class PuppeteerInstance { // TODO: this is in progress and it will be refactored } async launch() { - this.browserPromise = new Promise(async (resolve) => { - if (this.sessionPool) { - this.session = await this.sessionPool.getSession(); - } + this.browserPromise = new Promise(async (resolve, reject) => { + try { + if (this.sessionPool) { + this.session = await this.sessionPool.getSession(); + } - if (this.proxyConfiguration) { - this.proxyInfo = this.proxyConfiguration.newProxyInfo(this.session ? this.session.id : undefined); + if (this.proxyConfiguration) { + this.proxyInfo = this.proxyConfiguration.newProxyInfo(this.session ? this.session.id : undefined); + } + const proxyUrl = this.proxyInfo ? this.proxyInfo.url : null; + resolve(this.launchPuppeteerFunction({ proxyUrl })); + } catch (err) { + reject(err); } - const proxyUrl = this.proxyInfo ? this.proxyInfo.url : null; - resolve(this.launchPuppeteerFunction({ proxyUrl })); }); } } diff --git a/src/request_list.js b/src/request_list.js index 004c87bd83bf..3b816a71eaf6 100644 --- a/src/request_list.js +++ b/src/request_list.js @@ -368,7 +368,7 @@ export class RequestList { this._addRequest(source); } } catch (err) { - throw new Error(`Loading requests with sourcesFunction failed. 
Cause:\n${err.stack}`); + throw new Error(`Loading requests with sourcesFunction failed.\nCause: ${err.message}`); } } } diff --git a/test/proxy_configuration.test.js b/test/proxy_configuration.test.js index a6ee0206c0fe..69c4448d9965 100644 --- a/test/proxy_configuration.test.js +++ b/test/proxy_configuration.test.js @@ -134,7 +134,7 @@ describe('ProxyConfiguration', () => { expect(() => proxyConfiguration.newUrl(123456)).not.toThrowError(); }); - test('should throw invalid newUrlFunction return value', async () => { + test('should throw on invalid newUrlFunction', async () => { const newUrlFunction = () => { return 'http://proxy.com:1111*invalid_url'; }; @@ -146,7 +146,7 @@ describe('ProxyConfiguration', () => { proxyConfiguration.newUrl(); throw new Error('wrong error'); } catch (err) { - expect(err.message).toMatch('The return value "http://proxy.com:1111*invalid_url"'); + expect(err.message).toMatch('The provided newUrlFunction did not return'); } });