Skip to content

Commit

Permalink
Fix and improve proxyConf, update changelog
Browse files Browse the repository at this point in the history
  • Loading branch information
mnmkng committed Jun 6, 2020
1 parent cb0bbf4 commit 063774b
Show file tree
Hide file tree
Showing 7 changed files with 38 additions and 23 deletions.
14 changes: 11 additions & 3 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,12 @@ for more information and examples.

First large change is a redesigned proxy configuration. `Cheerio` and `Puppeteer` crawlers
now accept a `proxyConfiguration` parameter, which is an instance of `ProxyConfiguration`.
This class now exclusively manages both Apify Proxy and custom proxies.
This class now exclusively manages both Apify Proxy and custom proxies. Visit the new
[proxy management guide](https://sdk.apify.com/docs/guides/proxy-management) for details.

We also removed `Apify.utils.getRandomUserAgent()` as it was no longer effective
in avoiding bot detection.
in avoiding bot detection and changed the default values for empty properties in
`Request` instances.

- **BREAKING:** Removed `Apify.getApifyProxyUrl()`. To get an Apify Proxy URL,
use `proxyConfiguration.newUrl([sessionId])`.
Expand All @@ -18,17 +20,23 @@ in avoiding bot detection.
in `requestAsBrowser` and `Apify.launchPuppeteer`.
- **BREAKING:** Removed `Apify.utils.getRandomUserAgent()` as it was no longer effective
in avoiding bot detection.
- **BREAKING:** `Request` instances no longer initialize empty properties with `null`,
which means that:
- empty `errorMessages` are now represented by `[]`, and
- empty `loadedUrl`, `payload` and `handledAt` are `undefined`.
- Add `Apify.createProxyConfiguration()` `async` function to create `ProxyConfiguration`
instances. `ProxyConfiguration` itself is not exposed.
- Add `proxyConfiguration` to `CheerioCrawlerOptions`
and `PuppeteerCrawlerOptions`.
- Add `proxyInfo` to `CheerioHandlePageInputs` and `PuppeteerHandlePageInputs`.
You can use this object to retrieve information about the currently used proxy
in `Puppeteer` and `Cheerio` crawlers.
in `Puppeteer` and `Cheerio` crawlers.
- Add click buttons and scroll up options to `Apify.utils.puppeteer.infiniteScroll()`.
- Fixed a bug where intercepted requests would never continue.
- Fixed a bug where `Apify.utils.requestAsBrowser()` would get into redirect loops.
- Fix `Apify.utils.getMemoryInfo()` crashing the process on AWS Lambda and on systems
running in Docker without memory cgroups enabled.
- Update Puppeteer to 3.3.0.


0.20.4 / 2020-05-11
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@
"lint:fix": "eslint ./src ./test --ext .js,.jsx --fix"
},
"dependencies": {
"@apify/http-request": "^2.0.0",
"@apify/http-request": "^2.0.1",
"@apify/ps-tree": "^1.1.3",
"@types/cheerio": "^0.22.18",
"@types/node": "^12",
Expand Down
3 changes: 3 additions & 0 deletions src/crawlers/puppeteer_crawler.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { checkParamOrThrow } from 'apify-client/build/utils';
import { checkParamPrototypeOrThrow } from 'apify-shared/utilities';
import * as _ from 'underscore';
import PuppeteerPool from '../puppeteer_pool'; // eslint-disable-line import/no-duplicates
import { BASIC_CRAWLER_TIMEOUT_MULTIPLIER } from '../constants';
Expand All @@ -20,6 +21,7 @@ import AutoscaledPool, { AutoscaledPoolOptions } from '../autoscaling/autoscaled
import { LaunchPuppeteerOptions } from '../puppeteer'; // eslint-disable-line no-unused-vars,import/named
import { Session } from '../session_pool/session'; // eslint-disable-line no-unused-vars
import { SessionPoolOptions } from '../session_pool/session_pool';
import { ProxyConfiguration } from '../proxy_configuration';
// eslint-enable-line import/no-duplicates

/**
Expand Down Expand Up @@ -250,6 +252,7 @@ class PuppeteerCrawler {
checkParamOrThrow(useSessionPool, 'options.useSessionPool', 'Boolean');
checkParamOrThrow(sessionPoolOptions, 'options.sessionPoolOptions', 'Object');
checkParamOrThrow(persistCookiesPerSession, 'options.persistCookiesPerSession', 'Boolean');
checkParamPrototypeOrThrow(proxyConfiguration, 'options.proxyConfiguration', ProxyConfiguration, 'ProxyConfiguration', true);

if (proxyConfiguration && (launchPuppeteerOptions && launchPuppeteerOptions.proxyUrl)) {
throw new Error('It is not possible to combine "options.proxyConfiguration" together with '
Expand Down
16 changes: 8 additions & 8 deletions src/proxy_configuration.js
Original file line number Diff line number Diff line change
Expand Up @@ -352,14 +352,14 @@ export class ProxyConfiguration {
* @ignore
*/
_callNewUrlFunction(sessionId) {
const urlToReturn = this.newUrlFunction(sessionId);
let proxyUrl;
try {
// eslint-disable-next-line no-new
new URL(urlToReturn);
proxyUrl = this.newUrlFunction(sessionId);
new URL(proxyUrl); // eslint-disable-line no-new
} catch (err) {
this._throwNewUrlFunctionInvalidReturn(urlToReturn);
this._throwNewUrlFunctionInvalid(err);
}
return urlToReturn;
return proxyUrl;
}

/**
Expand Down Expand Up @@ -410,11 +410,11 @@ export class ProxyConfiguration {

/**
* Throws invalid custom newUrlFunction return
* @param {string} url
* @param {Error} err
* @ignore
*/
_throwNewUrlFunctionInvalidReturn(url) {
throw new Error(`The return value "${url}" of provided "options.newUrlFunction" is not a valid URL.`);
_throwNewUrlFunctionInvalid(err) {
throw new Error(`The provided newUrlFunction did not return a valid URL.\nCause: ${err.message}`);
}

/**
Expand Down
20 changes: 12 additions & 8 deletions src/puppeteer_pool.js
Original file line number Diff line number Diff line change
Expand Up @@ -54,16 +54,20 @@ class PuppeteerInstance { // TODO: this is in progress and it will be refactored
}

async launch() {
this.browserPromise = new Promise(async (resolve) => {
if (this.sessionPool) {
this.session = await this.sessionPool.getSession();
}
this.browserPromise = new Promise(async (resolve, reject) => {
try {
if (this.sessionPool) {
this.session = await this.sessionPool.getSession();
}

if (this.proxyConfiguration) {
this.proxyInfo = this.proxyConfiguration.newProxyInfo(this.session ? this.session.id : undefined);
if (this.proxyConfiguration) {
this.proxyInfo = this.proxyConfiguration.newProxyInfo(this.session ? this.session.id : undefined);
}
const proxyUrl = this.proxyInfo ? this.proxyInfo.url : null;
resolve(this.launchPuppeteerFunction({ proxyUrl }));
} catch (err) {
reject(err);
}
const proxyUrl = this.proxyInfo ? this.proxyInfo.url : null;
resolve(this.launchPuppeteerFunction({ proxyUrl }));
});
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/request_list.js
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,7 @@ export class RequestList {
this._addRequest(source);
}
} catch (err) {
throw new Error(`Loading requests with sourcesFunction failed. Cause:\n${err.stack}`);
throw new Error(`Loading requests with sourcesFunction failed.\nCause: ${err.message}`);
}
}
}
Expand Down
4 changes: 2 additions & 2 deletions test/proxy_configuration.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ describe('ProxyConfiguration', () => {
expect(() => proxyConfiguration.newUrl(123456)).not.toThrowError();
});

test('should throw invalid newUrlFunction return value', async () => {
test('should throw on invalid newUrlFunction', async () => {
const newUrlFunction = () => {
return 'http://proxy.com:1111*invalid_url';
};
Expand All @@ -146,7 +146,7 @@ describe('ProxyConfiguration', () => {
proxyConfiguration.newUrl();
throw new Error('wrong error');
} catch (err) {
expect(err.message).toMatch('The return value "http://proxy.com:1111*invalid_url"');
expect(err.message).toMatch('The provided newUrlFunction did not return');
}
});

Expand Down

0 comments on commit 063774b

Please sign in to comment.