From e8c70cf4dbdf43054a79a09159a9dbfdd1a9f6b9 Mon Sep 17 00:00:00 2001 From: Ondra Urban Date: Mon, 25 Jan 2021 19:44:24 +0100 Subject: [PATCH] Build docs v1.0.0 --- .../versioned_docs/version-1.0.0/api/Apify.md | 767 ++++++++++++++++++ .../version-1.0.0/api/BasicCrawler.md | 128 +++ .../version-1.0.0/api/CheerioCrawler.md | 158 ++++ .../version-1.0.0/api/Dataset.md | 239 ++++++ .../version-1.0.0/api/PlaywrightCrawler.md | 144 ++++ .../version-1.0.0/api/ProxyConfiguration.md | 101 +++ .../version-1.0.0/api/PuppeteerCrawler.md | 142 ++++ .../version-1.0.0/api/Request.md | 165 ++++ .../version-1.0.0/api/RequestList.md | 217 +++++ .../version-1.0.0/api/RequestQueue.md | 235 ++++++ .../version-1.0.0/api/playwright.md | 48 ++ .../version-1.0.0/guides/apify_platform.md | 71 ++ .../version-1.0.0/guides/docker_images.md | 174 ++++ .../version-1.0.0/guides/motivation.md | 22 + .../version-1.0.0/guides/quick_start.md | 108 +++ .../version-1.0.0/guides/request_storage.md | 132 +++ .../version-1.0.0/guides/result_storage.md | 116 +++ .../typedefs/BasicCrawlerOptions.md | 149 ++++ .../typedefs/BrowserLaunchContext.md | 46 ++ .../typedefs/CheerioCrawlerOptions.md | 328 ++++++++ .../typedefs/CheerioHandlePageInputs.md | 77 ++ .../typedefs/HandleFailedRequestInput.md | 37 + .../typedefs/HandleRequestInputs.md | 32 + .../typedefs/PlaywrightCrawlerOptions.md | 243 ++++++ .../typedefs/PlaywrightLaunchContext.md | 65 ++ .../typedefs/PostResponseInputs.md | 47 ++ .../typedefs/PrepareRequestInputs.md | 39 + .../typedefs/PuppeteerCrawlerOptions.md | 244 ++++++ .../typedefs/PuppeteerLaunchContext.md | 93 +++ .../version-1.0.0-sidebars.json | 184 +++++ website/versions.json | 1 + 31 files changed, 4552 insertions(+) create mode 100644 website/versioned_docs/version-1.0.0/api/Apify.md create mode 100644 website/versioned_docs/version-1.0.0/api/BasicCrawler.md create mode 100644 website/versioned_docs/version-1.0.0/api/CheerioCrawler.md create mode 100644 website/versioned_docs/version-1.0.0/api/Dataset.md create mode 100644 website/versioned_docs/version-1.0.0/api/PlaywrightCrawler.md create mode 100644 website/versioned_docs/version-1.0.0/api/ProxyConfiguration.md create mode 100644 website/versioned_docs/version-1.0.0/api/PuppeteerCrawler.md create mode 100644 website/versioned_docs/version-1.0.0/api/Request.md create mode 100644 website/versioned_docs/version-1.0.0/api/RequestList.md create mode 100644 website/versioned_docs/version-1.0.0/api/RequestQueue.md create mode 100644 website/versioned_docs/version-1.0.0/api/playwright.md create mode 100644 website/versioned_docs/version-1.0.0/guides/apify_platform.md create mode 100644 website/versioned_docs/version-1.0.0/guides/docker_images.md create mode 100644 website/versioned_docs/version-1.0.0/guides/motivation.md create mode 100644 website/versioned_docs/version-1.0.0/guides/quick_start.md create mode 100644 website/versioned_docs/version-1.0.0/guides/request_storage.md create mode 100644 website/versioned_docs/version-1.0.0/guides/result_storage.md create mode 100644 website/versioned_docs/version-1.0.0/typedefs/BasicCrawlerOptions.md create mode 100644 website/versioned_docs/version-1.0.0/typedefs/BrowserLaunchContext.md create mode 100644 website/versioned_docs/version-1.0.0/typedefs/CheerioCrawlerOptions.md create mode 100644 website/versioned_docs/version-1.0.0/typedefs/CheerioHandlePageInputs.md create mode 100644 website/versioned_docs/version-1.0.0/typedefs/HandleFailedRequestInput.md create mode 100644 
website/versioned_docs/version-1.0.0/typedefs/HandleRequestInputs.md create mode 100644 website/versioned_docs/version-1.0.0/typedefs/PlaywrightCrawlerOptions.md create mode 100644 website/versioned_docs/version-1.0.0/typedefs/PlaywrightLaunchContext.md create mode 100644 website/versioned_docs/version-1.0.0/typedefs/PostResponseInputs.md create mode 100644 website/versioned_docs/version-1.0.0/typedefs/PrepareRequestInputs.md create mode 100644 website/versioned_docs/version-1.0.0/typedefs/PuppeteerCrawlerOptions.md create mode 100644 website/versioned_docs/version-1.0.0/typedefs/PuppeteerLaunchContext.md create mode 100644 website/versioned_sidebars/version-1.0.0-sidebars.json diff --git a/website/versioned_docs/version-1.0.0/api/Apify.md b/website/versioned_docs/version-1.0.0/api/Apify.md new file mode 100644 index 000000000000..c71d2c3ad1e0 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/api/Apify.md @@ -0,0 +1,767 @@ +--- +id: version-1.0.0-apify +title: Apify +original_id: apify +--- + + + +The following section describes all functions and properties provided by the `apify` package, except individual classes and namespaces that have their +separate, detailed, documentation pages accessible from the left sidebar. To learn how Apify SDK works, we suggest following the +[Getting Started](../guides/getting-started) tutorial. + +**Important:** + +> The following functions: `addWebhook`, `call`, `callTask` and `newClient` invoke features of the [Apify platform](../guides/apify-platform) and +> require your scripts to be authenticated. See the [authentication guide](../guides/apify-platform#logging-into-apify-platform-from-apify-sdk) for +> instructions. + +--- + + + +## `Apify.addWebhook(options)` + +Creates an ad-hoc webhook for the current actor run, which lets you receive a notification when the actor run finished or failed. For more information +about Apify actor webhooks, please see the [documentation](https://docs.apify.com/webhooks). + +Note that webhooks are only supported for actors running on the Apify platform. In local environment, the function will print a warning and have no +effect. + +**Parameters**: + +- **`options`**: `Object` + - **`eventTypes`**: `Array` - Array of event types, which you can set for actor run, see the + [actor run events](https://docs.apify.com/webhooks/events#actor-run) in the Apify doc. + - **`requestUrl`**: `string` - URL which will be requested using HTTP POST request, when actor run will reach the set event type. + - **`[payloadTemplate]`**: `string` - Payload template is a JSON-like string that describes the structure of the webhook POST request payload. + It uses JSON syntax, extended with a double curly braces syntax for injecting variables `{{variable}}`. Those variables are resolved at the + time of the webhook's dispatch, and a list of available variables with their descriptions is available in the + [Apify webhook documentation](https://docs.apify.com/webhooks). If `payloadTemplate` is omitted, the default payload template is used + ([view docs](https://docs.apify.com/webhooks/actions#payload-template)). + - **`[idempotencyKey]`**: `string` - Idempotency key enables you to ensure that a webhook will not be added multiple times in case of an actor + restart or other situation that would cause the `addWebhook()` function to be called again. We suggest using the actor run ID as the + idempotency key. You can get the run ID by calling [`Apify.getEnv()`](../api/apify#getenv) function. 
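+
+For illustration, a run might register a webhook keyed to its own run ID like this (a minimal sketch; the request URL is a placeholder endpoint):
+
+```javascript
+const Apify = require('apify');
+
+Apify.main(async () => {
+    // Use the current run ID as the idempotency key, as suggested above
+    const { actorRunId } = Apify.getEnv();
+
+    await Apify.addWebhook({
+        eventTypes: ['ACTOR.RUN.SUCCEEDED', 'ACTOR.RUN.FAILED'],
+        requestUrl: 'https://example.com/webhooks/apify', // placeholder URL
+        idempotencyKey: actorRunId,
+    });
+});
+```
+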
+ +**Returns**: + +`Promise` - The return value is the Webhook object. For more information, see the +[Get webhook](https://apify.com/docs/api/v2#/reference/webhooks/webhook-object/get-webhook) API endpoint. + +--- + + + +## `Apify.call(actId, [input], [options])` + +Runs an actor on the Apify platform using the current user account (determined by the `APIFY_TOKEN` environment variable), waits for the actor to +finish and fetches its output. + +By passing the `waitSecs` option you can reduce the maximum amount of time to wait for the run to finish. If the value is less than or equal to zero, +the function returns immediately after the run is started. + +The result of the function is an [`ActorRun`](../typedefs/actor-run) object that contains details about the actor run and its output (if any). If the +actor run fails, the function throws the [`ApifyCallError`](../api/apify-call-error) exception. + +If you want to run an actor task rather than an actor, please use the [`Apify.callTask()`](../api/apify#calltask) function instead. + +For more information about actors, read the [documentation](https://docs.apify.com/actor). + +**Example usage:** + +```javascript +const run = await Apify.call('apify/hello-world', { myInput: 123 }); +console.log(`Received message: ${run.output.body.message}`); +``` + +Internally, the `call()` function invokes the [Run actor](https://apify.com/docs/api/v2#/reference/actors/run-collection/run-actor) and several other +API endpoints to obtain the output. + +**Throws**: + +- [`ApifyCallError`](../api/apify-call-error) If the run did not succeed, e.g. if it failed or timed out. + +**Parameters**: + +- **`actId`**: `string` - Allowed formats are `username/actor-name`, `userId/actor-name` or actor ID. +- **`[input]`**: `object` - Input for the actor. If it is an object, it will be stringified to JSON and its content type set to + `application/json; charset=utf-8`. Otherwise the `options.contentType` parameter must be provided. +- **`[options]`**: `Object` = {} - Object with the settings below: + - **`[contentType]`**: `string` - Content type for the `input`. If not specified, `input` is expected to be an object that will be stringified + to JSON and content type set to `application/json; charset=utf-8`. If `options.contentType` is specified, then `input` must be a `String` or + `Buffer`. + - **`[token]`**: `string` - User API token that is used to run the actor. By default, it is taken from the `APIFY_TOKEN` environment variable. + - **`[memoryMbytes]`**: `number` - Memory in megabytes which will be allocated for the new actor run. If not provided, the run uses memory of + the default actor run configuration. + - **`[timeoutSecs]`**: `number` - Timeout for the actor run in seconds. Zero value means there is no timeout. If not provided, the run uses + timeout of the default actor run configuration. + - **`[build]`**: `string` - Tag or number of the actor build to run (e.g. `beta` or `1.2.345`). If not provided, the run uses build tag or + number from the default actor run configuration (typically `latest`). + - **`[waitSecs]`**: `number` - Maximum time to wait for the actor run to finish, in seconds. If the limit is reached, the returned promise is + resolved to a run object that will have status `READY` or `RUNNING` and it will not contain the actor run output. If `waitSecs` is null or + undefined, the function waits for the actor to finish (default behavior). + - **`[fetchOutput]`**: `boolean` = true - If `false` then the function does not fetch output of the actor. 
+ - **`[disableBodyParser]`**: `boolean` = false - If `true` then the function will not attempt to parse the actor's output and will + return it in a raw `Buffer`. + - **`[webhooks]`**: `Array` - Specifies optional webhooks associated with the actor run, which can be used to receive a notification + e.g. when the actor finished or failed, see [ad hook webhooks documentation](https://docs.apify.com/webhooks/ad-hoc-webhooks) for detailed + description. + +**Returns**: + +[`Promise`](../typedefs/actor-run) + +--- + + + +## `Apify.callTask(taskId, [input], [options])` + +Runs an actor task on the Apify platform using the current user account (determined by the `APIFY_TOKEN` environment variable), waits for the task to +finish and fetches its output. + +By passing the `waitSecs` option you can reduce the maximum amount of time to wait for the run to finish. If the value is less than or equal to zero, +the function returns immediately after the run is started. + +The result of the function is an [`ActorRun`](../typedefs/actor-run) object that contains details about the actor run and its output (if any). If the +actor run failed, the function fails with [`ApifyCallError`](../api/apify-call-error) exception. + +Note that an actor task is a saved input configuration and options for an actor. If you want to run an actor directly rather than an actor task, +please use the [`Apify.call()`](../api/apify#call) function instead. + +For more information about actor tasks, read the [documentation](https://docs.apify.com/tasks). + +**Example usage:** + +```javascript +const run = await Apify.callTask('bob/some-task'); +console.log(`Received message: ${run.output.body.message}`); +``` + +Internally, the `callTask()` function calls the [Run task](https://apify.com/docs/api/v2#/reference/actor-tasks/run-collection/run-task) and several +other API endpoints to obtain the output. + +**Throws**: + +- [`ApifyCallError`](../api/apify-call-error) If the run did not succeed, e.g. if it failed or timed out. + +**Parameters**: + +- **`taskId`**: `string` - Allowed formats are `username/task-name`, `userId/task-name` or task ID. +- **`[input]`**: `object` - Input overrides for the actor task. If it is an object, it will be stringified to JSON and its content type set to + `application/json; charset=utf-8`. Provided input will be merged with actor task input. +- **`[options]`**: `Object` = {} - Object with the settings below: + - **`[token]`**: `string` - User API token that is used to run the actor. By default, it is taken from the `APIFY_TOKEN` environment variable. + - **`[memoryMbytes]`**: `number` - Memory in megabytes which will be allocated for the new actor task run. If not provided, the run uses memory + of the default actor run configuration. + - **`[timeoutSecs]`**: `number` - Timeout for the actor task run in seconds. Zero value means there is no timeout. If not provided, the run uses + timeout of the default actor run configuration. + - **`[build]`**: `string` - Tag or number of the actor build to run (e.g. `beta` or `1.2.345`). If not provided, the run uses build tag or + number from the default actor run configuration (typically `latest`). + - **`[waitSecs]`**: `number` - Maximum time to wait for the actor task run to finish, in seconds. If the limit is reached, the returned promise + is resolved to a run object that will have status `READY` or `RUNNING` and it will not contain the actor run output. If `waitSecs` is null or + undefined, the function waits for the actor task to finish (default behavior). 
+ - **`[webhooks]`**: `Array` - Specifies optional webhooks associated with the actor run, which can be used to receive a notification + e.g. when the actor finished or failed, see [ad hook webhooks documentation](https://docs.apify.com/webhooks/ad-hoc-webhooks) for detailed + description. + +**Returns**: + +[`Promise`](../typedefs/actor-run) + +--- + + + +## `Apify.createProxyConfiguration([proxyConfigurationOptions])` + +Creates a proxy configuration and returns a promise resolving to an instance of the [`ProxyConfiguration`](../api/proxy-configuration) class that is +already initialized. + +Configures connection to a proxy server with the provided options. Proxy servers are used to prevent target websites from blocking your crawlers based +on IP address rate limits or blacklists. Setting proxy configuration in your crawlers automatically configures them to use the selected proxies for +all connections. + +For more details and code examples, see the [`ProxyConfiguration`](../api/proxy-configuration) class. + +```javascript + +// Returns initialized proxy configuration class +const proxyConfiguration = await Apify.createProxyConfiguration({ + groups: ['GROUP1', 'GROUP2'] // List of Apify proxy groups + countryCode: 'US' +}); + +const crawler = new Apify.CheerioCrawler({ + // ... + proxyConfiguration, + handlePageFunction: ({ proxyInfo }) => { + const usedProxyUrl = proxyInfo.url; // Getting the proxy URL + } +}) + +``` + +For compatibility with existing Actor Input UI (Input Schema), this function returns `undefined` when the following object is passed as +`proxyConfigurationOptions`. + +``` +{ useApifyProxy: false } +``` + +**Parameters**: + +- **`[proxyConfigurationOptions]`**: [`ProxyConfigurationOptions`](../typedefs/proxy-configuration-options) + +**Returns**: + +[`Promise`](../api/proxy-configuration) + +--- + + + +## `Apify.events` + +Gets an instance of a Node.js' [EventEmitter](https://nodejs.org/api/events.html#events_class_eventemitter) class that emits various events from the +SDK or the Apify platform. The event emitter is initialized by calling the [`Apify.main()`](../api/apify#main) function. + +**Example usage:** + +```javascript +Apify.events.on('cpuInfo', data => { + if (data.isCpuOverloaded) console.log('Oh no, the CPU is overloaded!'); +}); +``` + +The following events are emitted: + +- `cpuInfo`: `{ "isCpuOverloaded": Boolean }` The event is emitted approximately every second and it indicates whether the actor is using the + maximum of available CPU resources. If that's the case, the actor should not add more workload. For example, this event is used by the + [`AutoscaledPool`](../api/autoscaled-pool) class. +- `migrating`: `void` Emitted when the actor running on the Apify platform is going to be migrated to another worker server soon. You can use it to + persist the state of the actor and abort the run, to speed up migration. For example, this is used by the [`RequestList`](../api/request-list) + class. +- `persistState`: `{ "isMigrating": Boolean }` Emitted in regular intervals (by default 60 seconds) to notify all components of Apify SDK that it is + time to persist their state, in order to avoid repeating all work when the actor restarts. This event is automatically emitted together with the + `migrating` event, in which case the `isMigrating` flag is set to `true`. Otherwise the flag is `false`. 
Note that the `persistState` event is + provided merely for user convenience, you can achieve the same effect using `setInterval()` and listening for the `migrating` event. + +--- + + + +## `Apify.getEnv()` + +Returns a new [`ApifyEnv`](../typedefs/apify-env) object which contains information parsed from all the `APIFY_XXX` environment variables. + +For the list of the `APIFY_XXX` environment variables, see [Actor documentation](https://docs.apify.com/actor/run#environment-variables). If some of +the variables are not defined or are invalid, the corresponding value in the resulting object will be null. + +**Returns**: + +[`ApifyEnv`](../typedefs/apify-env) + +--- + + + +## `Apify.getInput()` + +Gets the actor input value from the default [`KeyValueStore`](../api/key-value-store) associated with the current actor run. + +This is just a convenient shortcut for [`keyValueStore.getValue('INPUT')`](key-value-store#getvalue). For example, calling the following code: + +```javascript +const input = await Apify.getInput(); +``` + +is equivalent to: + +```javascript +const store = await Apify.openKeyValueStore(); +await store.getValue('INPUT'); +``` + +For more information, see [`Apify.openKeyValueStore()`](../api/apify#openkeyvaluestore) and +[`KeyValueStore.getValue()`](../api/key-value-store#getvalue). + +**Returns**: + +`Promise<(object|string|Buffer|null)>` - Returns a promise that resolves to an object, string or [`Buffer`](https://nodejs.org/api/buffer.html), +depending on the MIME content type of the record, or `null` if the record is missing. + +--- + + + +## `Apify.getMemoryInfo()` + +Returns memory statistics of the process and the system, see [`MemoryInfo`](../typedefs/memory-info). + +If the process runs inside of Docker, the `getMemoryInfo` gets container memory limits, otherwise it gets system memory limits. + +Beware that the function is quite inefficient because it spawns a new process. Therefore you shouldn't call it too often, like more than once per +second. + +**Returns**: + +[`Promise`](../typedefs/memory-info) + +--- + + + +## `Apify.getValue(key)` + +Gets a value from the default [`KeyValueStore`](../api/key-value-store) associated with the current actor run. + +This is just a convenient shortcut for [`KeyValueStore.getValue()`](../api/key-value-store#getvalue). For example, calling the following code: + +```javascript +const value = await Apify.getValue('my-key'); +``` + +is equivalent to: + +```javascript +const store = await Apify.openKeyValueStore(); +const value = await store.getValue('my-key'); +``` + +To store the value to the default key-value store, you can use the [`Apify.setValue()`](../api/apify#setvalue) function. + +For more information, see [`Apify.openKeyValueStore()`](../api/apify#openkeyvaluestore) and +[`KeyValueStore.getValue()`](../api/key-value-store#getvalue). + +**Parameters**: + +- **`key`**: `string` - Unique record key. + +**Returns**: + +`Promise<(object|string|Buffer|null)>` - Returns a promise that resolves to an object, string or [`Buffer`](https://nodejs.org/api/buffer.html), +depending on the MIME content type of the record, or `null` if the record is missing. + +--- + + + +## `Apify.isAtHome()` + +Returns `true` when code is running on Apify platform and `false` otherwise (for example locally). + +**Returns**: + +`boolean` + +--- + + + +## `Apify.launchPlaywright([options])` + +Launches headless browsers using Playwright pre-configured to work within the Apify platform. The function has the same return value as +`browserType.launch()`. 
See Playwright documentation for more +details. + +The `launchPlaywright()` function alters the following Playwright options: + +- Passes the setting from the `APIFY_HEADLESS` environment variable to the `headless` option, unless it was already defined by the caller or + `APIFY_XVFB` environment variable is set to `1`. Note that Apify Actor cloud platform automatically sets `APIFY_HEADLESS=1` to all running actors. +- Takes the `proxyUrl` option, validates it and adds it to `launchOptions` in a proper format. The proxy URL must define a port number and have one + of the following schemes: `http://`, `https://`, `socks4://` or `socks5://`. If the proxy is HTTP (i.e. has the `http://` scheme) and contains + username or password, the `launchPlaywright` functions sets up an anonymous proxy HTTP to make the proxy work with headless Chrome. For more + information, read the + blog post about proxy-chain library. + +To use this function, you need to have the [Playwright](https://www.npmjs.com/package/playwright) NPM package installed in your project. When running +on the Apify Platform, you can achieve that simply by using the `apify/actor-node-playwright-*` base Docker image for your actor - see +[Apify Actor documentation](https://docs.apify.com/actor/build#base-images) for details. + +**Parameters**: + +- **`[options]`**: [`PlaywrightLaunchContext`](../typedefs/playwright-launch-context) - Optional settings passed to `browserType.launch()`. In + addition to [Playwright's options](https://playwright.dev/docs/api/class-browsertype?_highlight=launch#browsertypelaunchoptions) the object may + contain our own [`PlaywrightLaunchContext`](../typedefs/playwright-launch-context) that enable additional features. + +**Returns**: + +`Promise` - Promise that resolves to Playwright's `Browser` instance. + +--- + + + +## `Apify.launchPuppeteer([launchContext])` + +Launches headless Chrome using Puppeteer pre-configured to work within the Apify platform. The function has the same argument and the return value as +`puppeteer.launch()`. See Puppeteer +documentation for more details. + +The `launchPuppeteer()` function alters the following Puppeteer options: + +- Passes the setting from the `APIFY_HEADLESS` environment variable to the `headless` option, unless it was already defined by the caller or + `APIFY_XVFB` environment variable is set to `1`. Note that Apify Actor cloud platform automatically sets `APIFY_HEADLESS=1` to all running actors. +- Takes the `proxyUrl` option, validates it and adds it to `args` as `--proxy-server=XXX`. The proxy URL must define a port number and have one of + the following schemes: `http://`, `https://`, `socks4://` or `socks5://`. If the proxy is HTTP (i.e. has the `http://` scheme) and contains + username or password, the `launchPuppeteer` functions sets up an anonymous proxy HTTP to make the proxy work with headless Chrome. For more + information, read the + blog post about proxy-chain library. + +To use this function, you need to have the [puppeteer](https://www.npmjs.com/package/puppeteer) NPM package installed in your project. When running on +the Apify cloud, you can achieve that simply by using the `apify/actor-node-chrome` base Docker image for your actor - see +[Apify Actor documentation](https://docs.apify.com/actor/build#base-images) for details. 
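+
+A minimal local sketch, assuming the `puppeteer` package is installed (the proxy URL is a placeholder and can be omitted entirely):
+
+```javascript
+const Apify = require('apify');
+
+Apify.main(async () => {
+    const browser = await Apify.launchPuppeteer({
+        proxyUrl: 'http://bob:password@proxy.example.com:8000', // placeholder proxy
+        launchOptions: { headless: true },
+    });
+
+    const page = await browser.newPage();
+    await page.goto('http://www.example.com');
+    console.log(await page.title());
+
+    await browser.close();
+});
+```
+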
+ +For an example of usage, see the [Synchronous run Example](../examples/synchronous-run) or the +[Puppeteer proxy Example](../examples/puppeteer-with-proxy) + +**Parameters**: + +- **`[launchContext]`**: [`PuppeteerLaunchContext`](../typedefs/puppeteer-launch-context) - All `PuppeteerLauncher` parameters are passed via an + launchContext object. If you want to pass custom `puppeteer.launch(options)` options you can use the `PuppeteerLaunchContext.launchOptions` + property. + +**Returns**: + +`Promise` - Promise that resolves to Puppeteer's `Browser` instance. + +--- + + + +## `Apify.main(userFunc)` + +Runs the main user function that performs the job of the actor and terminates the process when the user function finishes. + +**The `Apify.main()` function is optional** and is provided merely for your convenience. It is mainly useful when you're running your code as an actor +on the [Apify platform](https://apify.com/actors). However, if you want to use Apify SDK tools directly inside your existing projects, e.g. running in +an [Express](https://expressjs.com/) server, on [Google Cloud functions](https://cloud.google.com/functions) or +[AWS Lambda](https://aws.amazon.com/lambda/), it's better to avoid it since the function terminates the main process when it finishes! + +The `Apify.main()` function performs the following actions: + +- When running on the Apify platform (i.e. APIFY_IS_AT_HOME environment variable is set), it sets up a connection to listen for + platform events. For example, to get a notification about an imminent migration to another server. See [`Apify.events`](../api/apify#events) for + details. +- It checks that either APIFY_TOKEN or APIFY_LOCAL_STORAGE_DIR environment variable is defined. If not, the functions sets + APIFY_LOCAL_STORAGE_DIR to ./apify_storage inside the current working directory. This is to simplify running code + examples. +- It invokes the user function passed as the userFunc parameter. +- If the user function returned a promise, waits for it to resolve. +- If the user function throws an exception or some other error is encountered, prints error details to console so that they are stored to the log. +- Exits the Node.js process, with zero exit code on success and non-zero on errors. + +The user function can be synchronous: + +```javascript +Apify.main(() => { + // My synchronous function that returns immediately + console.log('Hello world from actor!'); +}); +``` + +If the user function returns a promise, it is considered asynchronous: + +```javascript +const { requestAsBrowser } = require('some-request-library'); + +Apify.main(() => { + // My asynchronous function that returns a promise + return request('http://www.example.com').then(html => { + console.log(html); + }); +}); +``` + +To simplify your code, you can take advantage of the `async`/`await` keywords: + +```javascript +const request = require('some-request-library'); + +Apify.main(async () => { + // My asynchronous function + const html = await request('http://www.example.com'); + console.log(html); +}); +``` + +**Parameters**: + +- **`userFunc`**: [`UserFunc`](../typedefs/user-func) - User function to be executed. If it returns a promise, the promise will be awaited. The user + function is called with no arguments. + +--- + + + +## `Apify.metamorph(targetActorId, [input], [options])` + +Transforms this actor run to an actor run of a given actor. The system stops the current container and starts the new container instead. 
All the +default storages are preserved and the new input is stored under the `INPUT-METAMORPH-1` key in the same default key-value store. + +**Parameters**: + +- **`targetActorId`**: `string` - Either `username/actor-name` or actor ID of an actor to which we want to metamorph. +- **`[input]`**: `object` - Input for the actor. If it is an object, it will be stringified to JSON and its content type set to + `application/json; charset=utf-8`. Otherwise the `options.contentType` parameter must be provided. +- **`[options]`**: `Object` = {} - Object with the settings below: + - **`[contentType]`**: `string` - Content type for the `input`. If not specified, `input` is expected to be an object that will be stringified + to JSON and content type set to `application/json; charset=utf-8`. If `options.contentType` is specified, then `input` must be a `String` or + `Buffer`. + - **`[build]`**: `string` - Tag or number of the target actor build to metamorph into (e.g. `beta` or `1.2.345`). If not provided, the run uses + build tag or number from the default actor run configuration (typically `latest`). + +**Returns**: + +`Promise` + +--- + + + +## `Apify.newClient([options])` + +Returns a new instance of the Apify API client. The `ApifyClient` class is provided by the +apify-client NPM package, and it is automatically configured using the +`APIFY_API_BASE_URL`, and `APIFY_TOKEN` environment variables. You can override the token via the available options. That's useful if you want to use +the client as a different Apify user than the SDK internals are using. + +**Parameters**: + +- **`[options]`**: `object` + - **`[token]`**: `string` + - **`[maxRetries]`**: `string` + - **`[minDelayBetweenRetriesMillis]`**: `string` + +**Returns**: + +[`ApifyClient`](../api/apify) + +--- + + + +## `Apify.openDataset([datasetIdOrName], [options])` + +Opens a dataset and returns a promise resolving to an instance of the [`Dataset`](../api/dataset) class. + +Datasets are used to store structured data where each object stored has the same attributes, such as online store products or real estate offers. The +actual data is stored either on the local filesystem or in the cloud. + +For more details and code examples, see the [`Dataset`](../api/dataset) class. + +**Parameters**: + +- **`[datasetIdOrName]`**: `string` - ID or name of the dataset to be opened. If `null` or `undefined`, the function returns the default dataset + associated with the actor run. +- **`[options]`**: `Object` + - **`[forceCloud]`**: `boolean` = false - If set to `true` then the function uses cloud storage usage even if the + `APIFY_LOCAL_STORAGE_DIR` environment variable is set. This way it is possible to combine local and cloud storage. + +**Returns**: + +[`Promise`](../api/dataset) + +--- + + + +## `Apify.openKeyValueStore([storeIdOrName], [options])` + +Opens a key-value store and returns a promise resolving to an instance of the [`KeyValueStore`](../api/key-value-store) class. + +Key-value stores are used to store records or files, along with their MIME content type. The records are stored and retrieved using a unique key. The +actual data is stored either on a local filesystem or in the Apify cloud. + +For more details and code examples, see the [`KeyValueStore`](../api/key-value-store) class. + +**Parameters**: + +- **`[storeIdOrName]`**: `string` - ID or name of the key-value store to be opened. If `null` or `undefined`, the function returns the default + key-value store associated with the actor run. 
+- **`[options]`**: `object` + - **`[forceCloud]`**: `boolean` = false - If set to `true` then the function uses cloud storage usage even if the + `APIFY_LOCAL_STORAGE_DIR` environment variable is set. This way it is possible to combine local and cloud storage. + +**Returns**: + +[`Promise`](../api/key-value-store) + +--- + + + +## `Apify.openRequestList(listName, sources, [options])` + +Opens a request list and returns a promise resolving to an instance of the [`RequestList`](../api/request-list) class that is already initialized. + +[`RequestList`](../api/request-list) represents a list of URLs to crawl, which is always stored in memory. To enable picking up where left off after a +process restart, the request list sources are persisted to the key-value store at initialization of the list. Then, while crawling, a small state +object is regularly persisted to keep track of the crawling status. + +For more details and code examples, see the [`RequestList`](../api/request-list) class. + +**Example usage:** + +```javascript +const sources = ['https://www.example.com', 'https://www.google.com', 'https://www.bing.com']; + +const requestList = await Apify.openRequestList('my-name', sources); +``` + +**Parameters**: + +- **`listName`**: `string` | `null` - Name of the request list to be opened. Setting a name enables the `RequestList`'s state to be persisted in the + key-value store. This is useful in case of a restart or migration. Since `RequestList` is only stored in memory, a restart or migration wipes it + clean. Setting a name will enable the `RequestList`'s state to survive those situations and continue where it left off. + + The name will be used as a prefix in key-value store, producing keys such as `NAME-REQUEST_LIST_STATE` and `NAME-REQUEST_LIST_SOURCES`. + + If `null`, the list will not be persisted and will only be stored in memory. Process restart will then cause the list to be crawled again from the + beginning. We suggest always using a name. + +- **`sources`**: [`Array<(RequestOptions|Request|string)>`](../typedefs/request-options) - An array of sources of URLs for the + [`RequestList`](../api/request-list). It can be either an array of strings, plain objects that define at least the `url` property, or an array of + [`Request`](../api/request) instances. + + **IMPORTANT:** The `sources` array will be consumed (left empty) after [`RequestList`](../api/request-list) initializes. This is a measure to + prevent memory leaks in situations when millions of sources are added. + +Additionally, the `requestsFromUrl` property may be used instead of `url`, which will instruct [`RequestList`](../api/request-list) to download the +source URLs from a given remote location. The URLs will be parsed from the received response. In this case you can limit the URLs using `regex` +parameter containing regular expression pattern for URLs to be included. + +For details, see the [`RequestListOptions.sources`](../typedefs/request-list-options#sources) + +- **`[options]`**: [`RequestListOptions`](../typedefs/request-list-options) - The [`RequestList`](../api/request-list) options. Note that the + `listName` parameter supersedes the [`RequestListOptions.persistStateKey`](../typedefs/request-list-options#persiststatekey) and + [`RequestListOptions.persistRequestsKey`](../typedefs/request-list-options#persistrequestskey) options and the `sources` parameter supersedes the + [`RequestListOptions.sources`](../typedefs/request-list-options#sources) option. 
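+
+For illustration, the remote-source variant described above might look like this (a sketch; the file URL and the regular expression are placeholders):
+
+```javascript
+const sources = [
+    // Download and parse URLs from a remote text file (placeholder location);
+    // 'regex' keeps only the parsed URLs that match, as described above
+    { requestsFromUrl: 'https://example.com/crawl-urls.txt', regex: /https:\/\/www\.example\.com\/.+/g },
+    // Plain strings and request objects can be mixed in as well
+    'https://www.example.com/page-1',
+    { url: 'https://www.example.com/page-2', userData: { label: 'DETAIL' } },
+];
+
+const requestList = await Apify.openRequestList('my-remote-list', sources);
+```
+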
+ +**Returns**: + +[`Promise`](../api/request-list) + +--- + + + +## `Apify.openRequestQueue([queueIdOrName], [options])` + +Opens a request queue and returns a promise resolving to an instance of the [`RequestQueue`](../api/request-queue) class. + +[`RequestQueue`](../api/request-queue) represents a queue of URLs to crawl, which is stored either on local filesystem or in the cloud. The queue is +used for deep crawling of websites, where you start with several URLs and then recursively follow links to other pages. The data structure supports +both breadth-first and depth-first crawling orders. + +For more details and code examples, see the [`RequestQueue`](../api/request-queue) class. + +**Parameters**: + +- **`[queueIdOrName]`**: `string` - ID or name of the request queue to be opened. If `null` or `undefined`, the function returns the default request + queue associated with the actor run. +- **`[options]`**: `object` + - **`[forceCloud]`**: `boolean` = false - If set to `true` then the function uses cloud storage usage even if the + `APIFY_LOCAL_STORAGE_DIR` environment variable is set. This way it is possible to combine local and cloud storage. + +**Returns**: + +[`Promise`](../api/request-queue) + +--- + + + +## `Apify.openSessionPool(sessionPoolOptions)` + +Opens a SessionPool and returns a promise resolving to an instance of the [`SessionPool`](../api/session-pool) class that is already initialized. + +For more details and code examples, see the [`SessionPool`](../api/session-pool) class. + +**Parameters**: + +- **`sessionPoolOptions`**: [`SessionPoolOptions`](../typedefs/session-pool-options) + +**Returns**: + +[`Promise`](../api/session-pool) + +--- + + + +## `Apify.pushData(item)` + +Stores an object or an array of objects to the default [`Dataset`](../api/dataset) of the current actor run. + +This is just a convenient shortcut for [`Dataset.pushData()`](../api/dataset#pushdata). For example, calling the following code: + +```javascript +await Apify.pushData({ myValue: 123 }); +``` + +is equivalent to: + +```javascript +const dataset = await Apify.openDataset(); +await dataset.pushData({ myValue: 123 }); +``` + +For more information, see [`Apify.openDataset()`](../api/apify#opendataset) and [`Dataset.pushData()`](../api/dataset#pushdata) + +**IMPORTANT**: Make sure to use the `await` keyword when calling `pushData()`, otherwise the actor process might finish before the data are stored! + +**Parameters**: + +- **`item`**: `object` - Object or array of objects containing data to be stored in the default dataset. The objects must be serializable to JSON + and the JSON representation of each object must be smaller than 9MB. + +**Returns**: + +`Promise` + +--- + + + +## `Apify.setValue(key, value, [options])` + +Stores or deletes a value in the default [`KeyValueStore`](../api/key-value-store) associated with the current actor run. + +This is just a convenient shortcut for [`KeyValueStore.setValue()`](../api/key-value-store#setvalue). For example, calling the following code: + +```javascript +await Apify.setValue('OUTPUT', { foo: 'bar' }); +``` + +is equivalent to: + +```javascript +const store = await Apify.openKeyValueStore(); +await store.setValue('OUTPUT', { foo: 'bar' }); +``` + +To get a value from the default key-value store, you can use the [`Apify.getValue()`](../api/apify#getvalue) function. + +For more information, see [`Apify.openKeyValueStore()`](../api/apify#openkeyvaluestore) and +[`KeyValueStore.getValue()`](../api/key-value-store#getvalue). 
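+
+Two more variants follow directly from the parameter list below (a sketch; the key name and content are illustrative): storing a record with an explicit content type, and deleting a record by passing `null`.
+
+```javascript
+// Store a plain-text record with an explicit MIME content type
+await Apify.setValue('OUTPUT.txt', 'Hello world!', { contentType: 'text/plain' });
+
+// Delete the record by setting its value to null
+await Apify.setValue('OUTPUT.txt', null);
+```
+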
+ +**Parameters**: + +- **`key`**: `string` - Unique record key. +- **`value`**: `object` - Record data, which can be one of the following values: + - If `null`, the record in the key-value store is deleted. + - If no `options.contentType` is specified, `value` can be any JavaScript object and it will be stringified to JSON. + - If `options.contentType` is set, `value` is taken as is and it must be a `String` or [`Buffer`](https://nodejs.org/api/buffer.html). For any + other value an error will be thrown. +- **`[options]`**: `Object` + - **`[contentType]`**: `string` - Specifies a custom MIME content type of the record. + +**Returns**: + +`Promise` + +--- diff --git a/website/versioned_docs/version-1.0.0/api/BasicCrawler.md b/website/versioned_docs/version-1.0.0/api/BasicCrawler.md new file mode 100644 index 000000000000..588e39f26279 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/api/BasicCrawler.md @@ -0,0 +1,128 @@ +--- +id: version-1.0.0-basic-crawler +title: BasicCrawler +original_id: basic-crawler +--- + + + +Provides a simple framework for parallel crawling of web pages. The URLs to crawl are fed either from a static list of URLs or from a dynamic queue of +URLs enabling recursive crawling of websites. + +`BasicCrawler` is a low-level tool that requires the user to implement the page download and data extraction functionality themselves. If you want a +crawler that already facilitates this functionality, please consider using [`CheerioCrawler`](../api/cheerio-crawler), +[`PuppeteerCrawler`](../api/puppeteer-crawler) or [`PlaywrightCrawler`](../api/playwright-crawler). + +`BasicCrawler` invokes the user-provided [`BasicCrawlerOptions.handleRequestFunction`](../typedefs/basic-crawler-options#handlerequestfunction) for +each [`Request`](../api/request) object, which represents a single URL to crawl. The [`Request`](../api/request) objects are fed from the +[`RequestList`](../api/request-list) or the [`RequestQueue`](../api/request-queue) instances provided by the +[`BasicCrawlerOptions.requestList`](../typedefs/basic-crawler-options#requestlist) or +[`BasicCrawlerOptions.requestQueue`](../typedefs/basic-crawler-options#requestqueue) constructor options, respectively. + +If both [`BasicCrawlerOptions.requestList`](../typedefs/basic-crawler-options#requestlist) and +[`BasicCrawlerOptions.requestQueue`](../typedefs/basic-crawler-options#requestqueue) options are used, the instance first processes URLs from the +[`RequestList`](../api/request-list) and automatically enqueues all of them to [`RequestQueue`](../api/request-queue) before it starts their +processing. This ensures that a single URL is not crawled multiple times. + +The crawler finishes if there are no more [`Request`](../api/request) objects to crawl. + +New requests are only dispatched when there is enough free CPU and memory available, using the functionality provided by the +[`AutoscaledPool`](../api/autoscaled-pool) class. All [`AutoscaledPool`](../api/autoscaled-pool) configuration options can be passed to the +`autoscaledPoolOptions` parameter of the `BasicCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency` +[`AutoscaledPool`](../api/autoscaled-pool) options are available directly in the `BasicCrawler` constructor. 
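+
+For instance, the concurrency limits can be set directly on the crawler, while any other [`AutoscaledPool`](../api/autoscaled-pool) option goes into `autoscaledPoolOptions` (a sketch; the values are illustrative):
+
+```javascript
+const crawler = new Apify.BasicCrawler({
+    // requestList prepared as shown in the example below
+    requestList,
+    minConcurrency: 5,
+    maxConcurrency: 50,
+    // Other AutoscaledPool options can be passed via autoscaledPoolOptions if needed
+    handleRequestFunction: async ({ request }) => {
+        // process the request here
+    },
+});
+```
+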
+ +**Example usage:** + +```javascript +// Prepare a list of URLs to crawl +const requestList = new Apify.RequestList({ + sources: [{ url: 'http://www.example.com/page-1' }, { url: 'http://www.example.com/page-2' }], +}); +await requestList.initialize(); + +// Crawl the URLs +const crawler = new Apify.BasicCrawler({ + requestList, + handleRequestFunction: async ({ request }) => { + // 'request' contains an instance of the Request class + // Here we simply fetch the HTML of the page and store it to a dataset + const { body } = await Apify.utils.requestAsBrowser(request); + await Apify.pushData({ + url: request.url, + html: body, + }); + }, +}); + +await crawler.run(); +``` + +## Properties + +### `stats` + +**Type**: [`Statistics`](../api/statistics) + +Contains statistics about the current run. + +--- + +### `requestList` + +**Type**: [`RequestList`](../api/request-list) + +A reference to the underlying [`RequestList`](../api/request-list) class that manages the crawler's [`Request`](../api/request)s. Only available if +used by the crawler. + +--- + +### `requestQueue` + +**Type**: [`RequestQueue`](../api/request-queue) + +A reference to the underlying [`RequestQueue`](../api/request-queue) class that manages the crawler's [`Request`](../api/request)s. Only available if +used by the crawler. + +--- + +### `sessionPool` + +**Type**: [`SessionPool`](../api/session-pool) + +A reference to the underlying [`SessionPool`](../api/session-pool) class that manages the crawler's [`Session`](../api/session)s. Only available if +used by the crawler. + +--- + +### `autoscaledPool` + +**Type**: [`AutoscaledPool`](../api/autoscaled-pool) + +A reference to the underlying [`AutoscaledPool`](../api/autoscaled-pool) class that manages the concurrency of the crawler. Note that this property is +only initialized after calling the [`BasicCrawler.run()`](../api/basic-crawler#run) function. You can use it to change the concurrency settings on the +fly, to pause the crawler by calling [`AutoscaledPool.pause()`](../api/autoscaled-pool#pause) or to abort it by calling +[`AutoscaledPool.abort()`](../api/autoscaled-pool#abort). + +--- + + + +## `new BasicCrawler(options)` + +**Parameters**: + +- **`options`**: [`BasicCrawlerOptions`](../typedefs/basic-crawler-options) - All `BasicCrawler` parameters are passed via an options object. + +--- + + + +## `basicCrawler.run()` + +Runs the crawler. Returns a promise that gets resolved once all the requests are processed. + +**Returns**: + +`Promise` + +--- diff --git a/website/versioned_docs/version-1.0.0/api/CheerioCrawler.md b/website/versioned_docs/version-1.0.0/api/CheerioCrawler.md new file mode 100644 index 000000000000..ef3b5426283a --- /dev/null +++ b/website/versioned_docs/version-1.0.0/api/CheerioCrawler.md @@ -0,0 +1,158 @@ +--- +id: version-1.0.0-cheerio-crawler +title: CheerioCrawler +original_id: cheerio-crawler +--- + + + +Provides a framework for the parallel crawling of web pages using plain HTTP requests and [cheerio](https://www.npmjs.com/package/cheerio) HTML +parser. The URLs to crawl are fed either from a static list of URLs or from a dynamic queue of URLs enabling recursive crawling of websites. + +Since `CheerioCrawler` uses raw HTTP requests to download web pages, it is very fast and efficient on data bandwidth. 
However, if the target website +requires JavaScript to display the content, you might need to use [`PuppeteerCrawler`](../api/puppeteer-crawler) or +[`PlaywrightCrawler`](../api/playwright-crawler) instead, because it loads the pages using full-featured headless Chrome browser. + +`CheerioCrawler` downloads each URL using a plain HTTP request, parses the HTML content using [Cheerio](https://www.npmjs.com/package/cheerio) and +then invokes the user-provided [`CheerioCrawlerOptions.handlePageFunction`](../typedefs/cheerio-crawler-options#handlepagefunction) to extract page +data using a [jQuery](https://jquery.com/)-like interface to the parsed HTML DOM. + +The source URLs are represented using [`Request`](../api/request) objects that are fed from [`RequestList`](../api/request-list) or +[`RequestQueue`](../api/request-queue) instances provided by the +[`CheerioCrawlerOptions.requestList`](../typedefs/cheerio-crawler-options#requestlist) or +[`CheerioCrawlerOptions.requestQueue`](../typedefs/cheerio-crawler-options#requestqueue) constructor options, respectively. + +If both [`CheerioCrawlerOptions.requestList`](../typedefs/cheerio-crawler-options#requestlist) and +[`CheerioCrawlerOptions.requestQueue`](../typedefs/cheerio-crawler-options#requestqueue) are used, the instance first processes URLs from the +[`RequestList`](../api/request-list) and automatically enqueues all of them to [`RequestQueue`](../api/request-queue) before it starts their +processing. This ensures that a single URL is not crawled multiple times. + +The crawler finishes when there are no more [`Request`](../api/request) objects to crawl. + +`CheerioCrawler` downloads the web pages using the [`utils.requestAsBrowser()`](../api/utils#requestasbrowser) utility function. + +By default, `CheerioCrawler` only processes web pages with the `text/html` and `application/xhtml+xml` MIME content types (as reported by the +`Content-Type` HTTP header), and skips pages with other content types. If you want the crawler to process other content types, use the +[`CheerioCrawlerOptions.additionalMimeTypes`](../typedefs/cheerio-crawler-options#additionalmimetypes) constructor option. Beware that the parsing +behavior differs for HTML, XML, JSON and other types of content. For details, see +[`CheerioCrawlerOptions.handlePageFunction`](../typedefs/cheerio-crawler-options#handlepagefunction). + +New requests are only dispatched when there is enough free CPU and memory available, using the functionality provided by the +[`AutoscaledPool`](../api/autoscaled-pool) class. All [`AutoscaledPool`](../api/autoscaled-pool) configuration options can be passed to the +`autoscaledPoolOptions` parameter of the `CheerioCrawler` constructor. For user convenience, the `minConcurrency` and `maxConcurrency` +[`AutoscaledPool`](../api/autoscaled-pool) options are available directly in the `CheerioCrawler` constructor. + +**Example usage:** + +```javascript +// Prepare a list of URLs to crawl +const requestList = new Apify.RequestList({ + sources: [{ url: 'http://www.example.com/page-1' }, { url: 'http://www.example.com/page-2' }], +}); +await requestList.initialize(); + +// Crawl the URLs +const crawler = new Apify.CheerioCrawler({ + requestList, + handlePageFunction: async ({ request, response, body, contentType, $ }) => { + const data = []; + + // Do some data extraction from the page with Cheerio. + $('.some-collection').each((index, el) => { + data.push({ + title: $(el) + .find('.some-title') + .text(), + }); + }); + + // Save the data to dataset. 
+ await Apify.pushData({ + url: request.url, + html: body, + data, + }); + }, +}); + +await crawler.run(); +``` + +## Properties + +### `stats` + +**Type**: [`Statistics`](../api/statistics) + +Contains statistics about the current run. + +--- + +### `requestList` + +**Type**: [`RequestList`](../api/request-list) + +A reference to the underlying [`RequestList`](../api/request-list) class that manages the crawler's [`Request`](../api/request)s. Only available if +used by the crawler. + +--- + +### `requestQueue` + +**Type**: [`RequestQueue`](../api/request-queue) + +A reference to the underlying [`RequestQueue`](../api/request-queue) class that manages the crawler's [`Request`](../api/request)s. Only available if +used by the crawler. + +--- + +### `sessionPool` + +**Type**: [`SessionPool`](../api/session-pool) + +A reference to the underlying [`SessionPool`](../api/session-pool) class that manages the crawler's [`Session`](../api/session)s. Only available if +used by the crawler. + +--- + +### `proxyConfiguration` + +**Type**: [`ProxyConfiguration`](../api/proxy-configuration) + +A reference to the underlying [`ProxyConfiguration`](../api/proxy-configuration) class that manages the crawler's proxies. Only available if used by +the crawler. + +--- + +### `autoscaledPool` + +**Type**: [`AutoscaledPool`](../api/autoscaled-pool) + +A reference to the underlying [`AutoscaledPool`](../api/autoscaled-pool) class that manages the concurrency of the crawler. Note that this property is +only initialized after calling the [`CheerioCrawler.run()`](../api/cheerio-crawler#run) function. You can use it to change the concurrency settings on +the fly, to pause the crawler by calling [`AutoscaledPool.pause()`](../api/autoscaled-pool#pause) or to abort it by calling +[`AutoscaledPool.abort()`](../api/autoscaled-pool#abort). + +--- + + + +## `new CheerioCrawler(options)` + +**Parameters**: + +- **`options`**: [`CheerioCrawlerOptions`](../typedefs/cheerio-crawler-options) - All `CheerioCrawler` parameters are passed via an options object. + +--- + + + +## `cheerioCrawler.use(extension)` + +**EXPERIMENTAL** Function for attaching CrawlerExtensions such as the Unblockers. + +**Parameters**: + +- **`extension`**: `CrawlerExtension` - Crawler extension that overrides the crawler configuration. + +--- diff --git a/website/versioned_docs/version-1.0.0/api/Dataset.md b/website/versioned_docs/version-1.0.0/api/Dataset.md new file mode 100644 index 000000000000..ddf1aaab5047 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/api/Dataset.md @@ -0,0 +1,239 @@ +--- +id: version-1.0.0-dataset +title: Dataset +original_id: dataset +--- + + + +The `Dataset` class represents a store for structured data where each object stored has the same attributes, such as online store products or real +estate offers. You can imagine it as a table, where each object is a row and its attributes are columns. Dataset is an append-only storage - you can +only add new records to it but you cannot modify or remove existing records. Typically it is used to store crawling results. + +Do not instantiate this class directly, use the [`Apify.openDataset()`](../api/apify#opendataset) function instead. + +`Dataset` stores its data either on local disk or in the Apify cloud, depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment +variables are set. 
+ +If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the data is stored in the local directory in the following files: + +``` +{APIFY_LOCAL_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json +``` + +Note that `{DATASET_ID}` is the name or ID of the dataset. The default dataset has ID: `default`, unless you override it by setting the +`APIFY_DEFAULT_DATASET_ID` environment variable. Each dataset item is stored as a separate JSON file, where `{INDEX}` is a zero-based index of the +item in the dataset. + +If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` not, the data is stored in the +[Apify Dataset](https://docs.apify.com/storage/dataset) cloud storage. Note that you can force usage of the cloud storage also by passing the +`forceCloud` option to [`Apify.openDataset()`](../api/apify#opendataset) function, even if the `APIFY_LOCAL_STORAGE_DIR` variable is set. + +**Example usage:** + +```javascript +// Write a single row to the default dataset +await Apify.pushData({ col1: 123, col2: 'val2' }); + +// Open a named dataset +const dataset = await Apify.openDataset('some-name'); + +// Write a single row +await dataset.pushData({ foo: 'bar' }); + +// Write multiple rows +await dataset.pushData([{ foo: 'bar2', col2: 'val2' }, { col3: 123 }]); +``` + +--- + + + +## `dataset.pushData(data)` + +Stores an object or an array of objects to the dataset. The function returns a promise that resolves when the operation finishes. It has no result, +but throws on invalid args or other errors. + +**IMPORTANT**: Make sure to use the `await` keyword when calling `pushData()`, otherwise the actor process might finish before the data is stored! + +The size of the data is limited by the receiving API and therefore `pushData()` will only allow objects whose JSON representation is smaller than 9MB. +When an array is passed, none of the included objects may be larger than 9MB, but the array itself may be of any size. + +The function internally chunks the array into separate items and pushes them sequentially. The chunking process is stable (keeps order of data), but +it does not provide a transaction safety mechanism. Therefore, in the event of an uploading error (after several automatic retries), the function's +Promise will reject and the dataset will be left in a state where some of the items have already been saved to the dataset while other items from the +source array were not. To overcome this limitation, the developer may, for example, read the last item saved in the dataset and re-attempt the save of +the data from this item onwards to prevent duplicates. + +**Parameters**: + +- **`data`**: `object` | `Array` - Object or array of objects containing data to be stored in the default dataset. The objects must be + serializable to JSON and the JSON representation of each object must be smaller than 9MB. + +**Returns**: + +`Promise` + +--- + + + +## `dataset.getData([options])` + +Returns [`DatasetContent`](../typedefs/dataset-content) object holding the items in the dataset based on the provided parameters. 
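+
+A short usage sketch (the option values and field names are illustrative):
+
+```javascript
+const dataset = await Apify.openDataset();
+
+// Fetch the 10 most recently added items, keeping only two fields
+const { items } = await dataset.getData({
+    limit: 10,
+    desc: true,
+    fields: ['url', 'title'],
+});
+
+items.forEach((item) => console.log(item.url));
+```
+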
+ +If you need to get data in an unparsed format, use the [`Apify.newClient()`](../api/apify#newclient) function to get a new `apify-client` instance and +call [`datasetClient.downloadItems()`](https://github.com/apify/apify-client-js#DatasetClient+downloadItems) + +**Parameters**: + +- **`[options]`**: `Object` - All `getData()` parameters are passed via an options object with the following keys: + - **`[offset]`**: `number` = 0 - Number of array elements that should be skipped at the start. + - **`[limit]`**: `number` = 250000 - Maximum number of array elements to return. + - **`[desc]`**: `boolean` = false - If `true` then the objects are sorted by `createdAt` in descending order. Otherwise they are + sorted in ascending order. + - **`[fields]`**: `Array` - An array of field names that will be included in the result. If omitted, all fields are included in the + results. + - **`[unwind]`**: `string` - Specifies a name of the field in the result objects that will be used to unwind the resulting objects. By default, + the results are returned as they are. + - **`[clean]`**: `boolean` = false - If `true` then the function returns only non-empty items and skips hidden fields (i.e. fields + starting with `#` character). Note that the `clean` parameter is a shortcut for `skipHidden: true` and `skipEmpty: true` options. + - **`[skipHidden]`**: `boolean` = false - If `true` then the function doesn't return hidden fields (fields starting with "#" + character). + - **`[skipEmpty]`**: `boolean` = false - If `true` then the function doesn't return empty items. Note that in this case the + returned number of items might be lower than limit parameter and pagination must be done using the `limit` value. + +**Returns**: + +[`Promise`](../typedefs/dataset-content) + +--- + + + +## `dataset.getInfo()` + +Returns an object containing general information about the dataset. + +The function returns the same object as the Apify API Client's +[getDataset](https://docs.apify.com/api/apify-client-js/latest#ApifyClient-datasets-getDataset) function, which in turn calls the +[Get dataset](https://apify.com/docs/api/v2#/reference/datasets/dataset/get-dataset) API endpoint. + +**Example:** + +``` +{ + id: "WkzbQMuFYuamGv3YF", + name: "my-dataset", + userId: "wRsJZtadYvn4mBZmm", + createdAt: new Date("2015-12-12T07:34:14.202Z"), + modifiedAt: new Date("2015-12-13T08:36:13.202Z"), + accessedAt: new Date("2015-12-14T08:36:13.202Z"), + itemCount: 14, + cleanItemCount: 10 +} +``` + +**Returns**: + +`Promise` + +--- + + + +## `dataset.forEach(iteratee, [options], [index])` + +Iterates over dataset items, yielding each in turn to an `iteratee` function. Each invocation of `iteratee` is called with two arguments: +`(item, index)`. + +If the `iteratee` function returns a Promise then it is awaited before the next call. If it throws an error, the iteration is aborted and the +`forEach` function throws the error. + +**Example usage** + +```javascript +const dataset = await Apify.openDataset('my-results'); +await dataset.forEach(async (item, index) => { + console.log(`Item at ${index}: ${JSON.stringify(item)}`); +}); +``` + +**Parameters**: + +- **`iteratee`**: [`DatasetConsumer`](../typedefs/dataset-consumer) - A function that is called for every item in the dataset. +- **`[options]`**: `Object` - All `forEach()` parameters are passed via an options object with the following keys: + - **`[desc]`**: `boolean` = false - If `true` then the objects are sorted by `createdAt` in descending order. 
+ - **`[fields]`**: `Array` - If provided then returned objects will only contain specified keys. + - **`[unwind]`**: `string` - If provided then objects will be unwound based on provided field. +- **`[index]`**: `number` = 0 - Specifies the initial index number passed to the `iteratee` function. + +**Returns**: + +`Promise` + +--- + + + +## `dataset.map(iteratee, [options])` + +Produces a new array of values by mapping each value in list through a transformation function `iteratee()`. Each invocation of `iteratee()` is called +with two arguments: `(element, index)`. + +If `iteratee` returns a `Promise` then it's awaited before a next call. + +**Parameters**: + +- **`iteratee`**: [`DatasetMapper`](../typedefs/dataset-mapper) +- **`[options]`**: `Object` - All `map()` parameters are passed via an options object with the following keys: + - **`[desc]`**: `boolean` = false - If `true` then the objects are sorted by createdAt in descending order. + - **`[fields]`**: `Array` - If provided then returned objects will only contain specified keys + - **`[unwind]`**: `string` - If provided then objects will be unwound based on provided field. + +**Returns**: + +`Promise>` + +--- + + + +## `dataset.reduce(iteratee, memo, [options])` + +Reduces a list of values down to a single value. + +Memo is the initial state of the reduction, and each successive step of it should be returned by `iteratee()`. The `iteratee()` is passed three +arguments: the `memo`, then the `value` and `index` of the iteration. + +If no `memo` is passed to the initial invocation of reduce, the `iteratee()` is not invoked on the first element of the list. The first element is +instead passed as the memo in the invocation of the `iteratee()` on the next element in the list. + +If `iteratee()` returns a `Promise` then it's awaited before a next call. + +**Parameters**: + +- **`iteratee`**: [`DatasetReducer`](../typedefs/dataset-reducer) +- **`memo`**: `object` - Initial state of the reduction. +- **`[options]`**: `Object` - All `reduce()` parameters are passed via an options object with the following keys: + - **`[desc]`**: `boolean` = false - If `true` then the objects are sorted by createdAt in descending order. + - **`[fields]`**: `Array` - If provided then returned objects will only contain specified keys + - **`[unwind]`**: `string` - If provided then objects will be unwound based on provided field. + +**Returns**: + +`Promise` + +--- + + + +## `dataset.drop()` + +Removes the dataset either from the Apify cloud storage or from the local directory, depending on the mode of operation. + +**Returns**: + +`Promise` + +--- diff --git a/website/versioned_docs/version-1.0.0/api/PlaywrightCrawler.md b/website/versioned_docs/version-1.0.0/api/PlaywrightCrawler.md new file mode 100644 index 000000000000..509a11e3af62 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/api/PlaywrightCrawler.md @@ -0,0 +1,144 @@ +--- +id: version-1.0.0-playwright-crawler +title: PlaywrightCrawler +original_id: playwright-crawler +--- + + + +Provides a simple framework for parallel crawling of web pages using headless Chromium, Firefox and Webkit browsers with +[Playwright](https://github.com/microsoft/playwright). The URLs to crawl are fed either from a static list of URLs or from a dynamic queue of URLs +enabling recursive crawling of websites. + +Since `Playwright` uses headless browser to download web pages and extract data, it is useful for crawling of websites that require to execute +JavaScript. 
If the target website doesn't need JavaScript, consider using [`CheerioCrawler`](../api/cheerio-crawler), which downloads the pages using
+raw HTTP requests and is about 10x faster.
+
+The source URLs are represented using [`Request`](../api/request) objects that are fed from [`RequestList`](../api/request-list) or
+[`RequestQueue`](../api/request-queue) instances provided by the
+[`PlaywrightCrawlerOptions.requestList`](../typedefs/playwright-crawler-options#requestlist) or
+[`PlaywrightCrawlerOptions.requestQueue`](../typedefs/playwright-crawler-options#requestqueue) constructor options, respectively.
+
+If both [`PlaywrightCrawlerOptions.requestList`](../typedefs/playwright-crawler-options#requestlist) and
+[`PlaywrightCrawlerOptions.requestQueue`](../typedefs/playwright-crawler-options#requestqueue) are used, the instance first processes URLs from the
+[`RequestList`](../api/request-list) and automatically enqueues all of them to [`RequestQueue`](../api/request-queue) before it starts their
+processing. This ensures that a single URL is not crawled multiple times.
+
+The crawler finishes when there are no more [`Request`](../api/request) objects to crawl.
+
+`PlaywrightCrawler` opens a new browser page (i.e. tab) for each [`Request`](../api/request) object to crawl and then calls the function provided by
+the user as the [`PlaywrightCrawlerOptions.handlePageFunction`](../typedefs/playwright-crawler-options#handlepagefunction) option.
+
+New pages are only opened when there is enough free CPU and memory available, using the functionality provided by the
+[`AutoscaledPool`](../api/autoscaled-pool) class. All [`AutoscaledPool`](../api/autoscaled-pool) configuration options can be passed to the
+[`PlaywrightCrawlerOptions.autoscaledPoolOptions`](../typedefs/playwright-crawler-options#autoscaledpooloptions) parameter of the `PlaywrightCrawler`
+constructor. For user convenience, the `minConcurrency` and `maxConcurrency` [`AutoscaledPoolOptions`](../typedefs/autoscaled-pool-options) are
+available directly in the `PlaywrightCrawler` constructor.
+
+Note that the pool of Playwright browser instances is internally managed by the `BrowserPool` class.
+
+**Example usage:**
+
+```javascript
+const crawler = new Apify.PlaywrightCrawler({
+    requestList,
+    handlePageFunction: async ({ page, request }) => {
+        // This function is called to extract data from a single web page
+        // 'page' is an instance of Playwright.Page with page.goto(request.url) already called
+        // 'request' is an instance of Request class with information about the page to load
+        await Apify.pushData({
+            title: await page.title(),
+            url: request.url,
+            succeeded: true,
+        });
+    },
+    handleFailedRequestFunction: async ({ request }) => {
+        // This function is called when the crawling of a request failed too many times
+        await Apify.pushData({
+            url: request.url,
+            succeeded: false,
+            errors: request.errorMessages,
+        });
+    },
+});
+
+await crawler.run();
+```
+
+## Properties
+
+### `stats`
+
+**Type**: [`Statistics`](../api/statistics)
+
+Contains statistics about the current run.
+
+---
+
+### `requestList`
+
+**Type**: [`RequestList`](../api/request-list)
+
+A reference to the underlying [`RequestList`](../api/request-list) class that manages the crawler's [`Request`](../api/request)s. Only available if
+used by the crawler.
+
+---
+
+### `requestQueue`
+
+**Type**: [`RequestQueue`](../api/request-queue)
+
+A reference to the underlying [`RequestQueue`](../api/request-queue) class that manages the crawler's [`Request`](../api/request)s. Only available if
+used by the crawler.
+
+---
+
+### `sessionPool`
+
+**Type**: [`SessionPool`](../api/session-pool)
+
+A reference to the underlying [`SessionPool`](../api/session-pool) class that manages the crawler's [`Session`](../api/session)s. Only available if
+used by the crawler.
+
+---
+
+### `proxyConfiguration`
+
+**Type**: [`ProxyConfiguration`](../api/proxy-configuration)
+
+A reference to the underlying [`ProxyConfiguration`](../api/proxy-configuration) class that manages the crawler's proxies. Only available if used by
+the crawler.
+
+---
+
+### `browserPool`
+
+**Type**: `BrowserPool`
+
+A reference to the underlying `BrowserPool` class that manages the crawler's browsers. For more information about it, see the
+[`browser-pool` module](https://github.com/apify/browser-pool).
+
+---
+
+### `autoscaledPool`
+
+**Type**: [`AutoscaledPool`](../api/autoscaled-pool)
+
+A reference to the underlying [`AutoscaledPool`](../api/autoscaled-pool) class that manages the concurrency of the crawler. Note that this property is
+only initialized after calling the `PlaywrightCrawler.run()` function. You can use it to change the concurrency settings on
+the fly, to pause the crawler by calling [`AutoscaledPool.pause()`](../api/autoscaled-pool#pause) or to abort it by calling
+[`AutoscaledPool.abort()`](../api/autoscaled-pool#abort).
+
+---
+
+
+
+## `new PlaywrightCrawler(options)`
+
+**Parameters**:
+
+- **`options`**: [`PlaywrightCrawlerOptions`](../typedefs/playwright-crawler-options) - All `PlaywrightCrawler` parameters are passed via an options
+  object.
+
+---
diff --git a/website/versioned_docs/version-1.0.0/api/ProxyConfiguration.md b/website/versioned_docs/version-1.0.0/api/ProxyConfiguration.md
new file mode 100644
index 000000000000..151cce0184eb
--- /dev/null
+++ b/website/versioned_docs/version-1.0.0/api/ProxyConfiguration.md
@@ -0,0 +1,101 @@
+---
+id: version-1.0.0-proxy-configuration
+title: ProxyConfiguration
+original_id: proxy-configuration
+---
+
+
+
+Configures connection to a proxy server with the provided options. Proxy servers are used to prevent target websites from blocking your crawlers based
+on IP address rate limits or blacklists. Setting proxy configuration in your crawlers automatically configures them to use the selected proxies for
+all connections. You can get information about the currently used proxy by inspecting the [`ProxyInfo`](../typedefs/proxy-info) object passed to your
+crawler's page function. There, you can inspect the proxy's URL and other attributes.
+
+The proxy servers are managed by [Apify Proxy](https://docs.apify.com/proxy). To be able to use Apify Proxy, you need an Apify account and access to
+the selected proxies. If you provide no configuration option, the proxies will be managed automatically using a smart algorithm.
+
+If you want to use your own proxies, use the [`ProxyConfigurationOptions.proxyUrls`](../typedefs/proxy-configuration-options#proxyurls) option. Your
+list of proxy URLs will be rotated by the configuration if this option is provided.
+
+**Example usage:**
+
+```javascript
+const proxyConfiguration = await Apify.createProxyConfiguration({
+    groups: ['GROUP1', 'GROUP2'], // List of Apify Proxy groups
+    countryCode: 'US',
+});
+
+const crawler = new Apify.CheerioCrawler({
+    // ...
+ proxyConfiguration, + handlePageFunction: ({ proxyInfo }) => { + const usedProxyUrl = proxyInfo.url; // Getting the proxy URL + } +}) + +``` + +--- + + + +## `proxyConfiguration.initialize()` + +Loads proxy password if token is provided and checks access to Apify Proxy and provided proxy groups if Apify Proxy configuration is used. Also checks +if country has access to Apify Proxy groups if the country code is provided. + +You should use the [`Apify.createProxyConfiguration`](../api/apify#createproxyconfiguration) function to create a pre-initialized `ProxyConfiguration` +instance instead of calling this manually. + +**Returns**: + +`Promise` + +--- + + + +## `proxyConfiguration.newProxyInfo([sessionId])` + +This function creates a new [`ProxyInfo`](../typedefs/proxy-info) info object. It is used by CheerioCrawler and PuppeteerCrawler to generate proxy +URLs and also to allow the user to inspect the currently used proxy via the handlePageFunction parameter: proxyInfo. Use it if you want to work with a +rich representation of a proxy URL. If you need the URL string only, use [`ProxyConfiguration.newUrl`](../api/proxy-configuration#newurl). + +**Parameters**: + +- **`[sessionId]`**: `string` | `number` - Represents the identifier of user [`Session`](../api/session) that can be managed by the + [`SessionPool`](../api/session-pool) or you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier. When the provided + sessionId is a number, it's converted to a string. Property sessionId of [`ProxyInfo`](../typedefs/proxy-info) is always returned as a type + string. + +All the HTTP requests going through the proxy with the same session identifier will use the same target proxy server (i.e. the same IP address). The +identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`. + +**Returns**: + +[`ProxyInfo`](../typedefs/proxy-info) - represents information about used proxy and its configuration. + +--- + + + +## `proxyConfiguration.newUrl([sessionId])` + +Returns a new proxy URL based on provided configuration options and the `sessionId` parameter. + +**Parameters**: + +- **`[sessionId]`**: `string` | `number` - Represents the identifier of user [`Session`](../api/session) that can be managed by the + [`SessionPool`](../api/session-pool) or you can use the Apify Proxy [Session](https://docs.apify.com/proxy#sessions) identifier. When the provided + sessionId is a number, it's converted to a string. + +All the HTTP requests going through the proxy with the same session identifier will use the same target proxy server (i.e. the same IP address). The +identifier must not be longer than 50 characters and include only the following: `0-9`, `a-z`, `A-Z`, `"."`, `"_"` and `"~"`. + +**Returns**: + +`string` - A string with a proxy URL, including authentication credentials and port number. For example, +`http://bob:password123@proxy.example.com:8000` + +--- diff --git a/website/versioned_docs/version-1.0.0/api/PuppeteerCrawler.md b/website/versioned_docs/version-1.0.0/api/PuppeteerCrawler.md new file mode 100644 index 000000000000..9ac7fcd5c4b4 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/api/PuppeteerCrawler.md @@ -0,0 +1,142 @@ +--- +id: version-1.0.0-puppeteer-crawler +title: PuppeteerCrawler +original_id: puppeteer-crawler +--- + + + +Provides a simple framework for parallel crawling of web pages using headless Chrome with [Puppeteer](https://github.com/puppeteer/puppeteer). 
The +URLs to crawl are fed either from a static list of URLs or from a dynamic queue of URLs enabling recursive crawling of websites. + +Since `PuppeteerCrawler` uses headless Chrome to download web pages and extract data, it is useful for crawling of websites that require to execute +JavaScript. If the target website doesn't need JavaScript, consider using [`CheerioCrawler`](../api/cheerio-crawler), which downloads the pages using +raw HTTP requests and is about 10x faster. + +The source URLs are represented using [`Request`](../api/request) objects that are fed from [`RequestList`](../api/request-list) or +[`RequestQueue`](../api/request-queue) instances provided by the +[`PuppeteerCrawlerOptions.requestList`](../typedefs/puppeteer-crawler-options#requestlist) or +[`PuppeteerCrawlerOptions.requestQueue`](../typedefs/puppeteer-crawler-options#requestqueue) constructor options, respectively. + +If both [`PuppeteerCrawlerOptions.requestList`](../typedefs/puppeteer-crawler-options#requestlist) and +[`PuppeteerCrawlerOptions.requestQueue`](../typedefs/puppeteer-crawler-options#requestqueue) are used, the instance first processes URLs from the +[`RequestList`](../api/request-list) and automatically enqueues all of them to [`RequestQueue`](../api/request-queue) before it starts their +processing. This ensures that a single URL is not crawled multiple times. + +The crawler finishes when there are no more [`Request`](../api/request) objects to crawl. + +`PuppeteerCrawler` opens a new Chrome page (i.e. tab) for each [`Request`](../api/request) object to crawl and then calls the function provided by +user as the [`PuppeteerCrawlerOptions.handlePageFunction`](../typedefs/puppeteer-crawler-options#handlepagefunction) option. + +New pages are only opened when there is enough free CPU and memory available, using the functionality provided by the +[`AutoscaledPool`](../api/autoscaled-pool) class. All [`AutoscaledPool`](../api/autoscaled-pool) configuration options can be passed to the +[`PuppeteerCrawlerOptions.autoscaledPoolOptions`](../typedefs/puppeteer-crawler-options#autoscaledpooloptions) parameter of the `PuppeteerCrawler` +constructor. For user convenience, the `minConcurrency` and `maxConcurrency` [`AutoscaledPoolOptions`](../typedefs/autoscaled-pool-options) are +available directly in the `PuppeteerCrawler` constructor. + +Note that the pool of Puppeteer instances is internally managed by the `BrowserPool` class. + +**Example usage:** + +```javascript +const crawler = new Apify.PuppeteerCrawler({ + requestList, + handlePageFunction: async ({ page, request }) => { + // This function is called to extract data from a single web page + // 'page' is an instance of Puppeteer.Page with page.goto(request.url) already called + // 'request' is an instance of Request class with information about the page to load + await Apify.pushData({ + title: await page.title(), + url: request.url, + succeeded: true, + }); + }, + handleFailedRequestFunction: async ({ request }) => { + // This function is called when the crawling of a request failed too many times + await Apify.pushData({ + url: request.url, + succeeded: false, + errors: request.errorMessages, + }); + }, +}); + +await crawler.run(); +``` + +## Properties + +### `stats` + +**Type**: [`Statistics`](../api/statistics) + +Contains statistics about the current run. 
+ +--- + +### `requestList` + +**Type**: [`RequestList`](../api/request-list) + +A reference to the underlying [`RequestList`](../api/request-list) class that manages the crawler's [`Request`](../api/request)s. Only available if +used by the crawler. + +--- + +### `requestQueue` + +**Type**: [`RequestQueue`](../api/request-queue) + +A reference to the underlying [`RequestQueue`](../api/request-queue) class that manages the crawler's [`Request`](../api/request)s. Only available if +used by the crawler. + +--- + +### `sessionPool` + +**Type**: [`SessionPool`](../api/session-pool) + +A reference to the underlying [`SessionPool`](../api/session-pool) class that manages the crawler's [`Session`](../api/session)s. Only available if +used by the crawler. + +--- + +### `proxyConfiguration` + +**Type**: [`ProxyConfiguration`](../api/proxy-configuration) + +A reference to the underlying [`ProxyConfiguration`](../api/proxy-configuration) class that manages the crawler's proxies. Only available if used by +the crawler. + +--- + +### `browserPool` + +**Type**: `BrowserPool` + +A reference to the underlying `BrowserPool` class that manages the crawler's browsers. For more information about it, see the +[`browser-pool` module](https://github.com/apify/browser-pool). + +--- + +### `autoscaledPool` + +**Type**: [`AutoscaledPool`](../api/autoscaled-pool) + +A reference to the underlying [`AutoscaledPool`](../api/autoscaled-pool) class that manages the concurrency of the crawler. Note that this property is +only initialized after calling the [`CheerioCrawler.run()`](../api/cheerio-crawler#run) function. You can use it to change the concurrency settings on +the fly, to pause the crawler by calling [`AutoscaledPool.pause()`](../api/autoscaled-pool#pause) or to abort it by calling +[`AutoscaledPool.abort()`](../api/autoscaled-pool#abort). + +--- + + + +## `new PuppeteerCrawler(options)` + +**Parameters**: + +- **`options`**: [`PuppeteerCrawlerOptions`](../typedefs/puppeteer-crawler-options) - All `PuppeteerCrawler` parameters are passed via an options + object. + +--- diff --git a/website/versioned_docs/version-1.0.0/api/Request.md b/website/versioned_docs/version-1.0.0/api/Request.md new file mode 100644 index 000000000000..cc6b8a1c03ef --- /dev/null +++ b/website/versioned_docs/version-1.0.0/api/Request.md @@ -0,0 +1,165 @@ +--- +id: version-1.0.0-request +title: Request +original_id: request +--- + + + +Represents a URL to be crawled, optionally including HTTP method, headers, payload and other metadata. The `Request` object also stores information +about errors that occurred during processing of the request. + +Each `Request` instance has the `uniqueKey` property, which can be either specified manually in the constructor or generated automatically from the +URL. Two requests with the same `uniqueKey` are considered as pointing to the same web resource. This behavior applies to all Apify SDK classes, such +as [`RequestList`](../api/request-list), [`RequestQueue`](../api/request-queue), [`PuppeteerCrawler`](../api/puppeteer-crawler) or +[`PlaywrightCrawler`](../api/playwright-crawler). + +Example use: + +```javascript +const request = new Apify.Request({ + url: 'http://www.example.com', + headers: { Accept: 'application/json' }, +}); + +... + +request.userData.foo = 'bar'; +request.pushErrorMessage(new Error('Request failed!')); + +... 
+ +const foo = request.userData.foo; +``` + +## Properties + +### `id` + +**Type**: `string` + +Request ID + +--- + +### `url` + +**Type**: `string` + +URL of the web page to crawl. + +--- + +### `loadedUrl` + +**Type**: `string` + +An actually loaded URL after redirects, if present. HTTP redirects are guaranteed to be included. + +When using [`PuppeteerCrawler`](../api/puppeteer-crawler) or [`PlaywrightCrawler`](../api/playwright-crawler), meta tag and JavaScript redirects may, +or may not be included, depending on their nature. This generally means that redirects, which happen immediately will most likely be included, but +delayed redirects will not. + +--- + +### `uniqueKey` + +**Type**: `string` + +A unique key identifying the request. Two requests with the same `uniqueKey` are considered as pointing to the same URL. + +--- + +### `method` + +**Type**: `string` + +HTTP method, e.g. `GET` or `POST`. + +--- + +### `payload` + +**Type**: `string` | `Buffer` + +HTTP request payload, e.g. for POST requests. + +--- + +### `noRetry` + +**Type**: `boolean` + +The `true` value indicates that the request will not be automatically retried on error. + +--- + +### `retryCount` + +**Type**: `number` + +Indicates the number of times the crawling of the request has been retried on error. + +--- + +### `errorMessages` + +**Type**: `Array` + +An array of error messages from request processing. + +--- + +### `headers` + +**Type**: `object` + +Object with HTTP headers. Key is header name, value is the value. + +--- + +### `userData` + +**Type**: `object` + +Custom user data assigned to the request. + +--- + +### `handledAt` + +**Type**: `Date` + +Indicates the time when the request has been processed. Is `null` if the request has not been crawled yet. + +--- + + + +## `new Request(options)` + +**Parameters**: + +- **`options`**: [`RequestOptions`](../typedefs/request-options) - `Request` parameters including the URL, HTTP method and headers, and others. + +--- + + + +## `request.pushErrorMessage(errorOrMessage, [options])` + +Stores information about an error that occurred during processing of this request. + +You should always use Error instances when throwing errors in JavaScript. + +Nevertheless, to improve the debugging experience when using third party libraries that may not always throw an Error instance, the function performs +a type inspection of the passed argument and attempts to extract as much information as possible, since just throwing a bad type error makes any +debugging rather difficult. + +**Parameters**: + +- **`errorOrMessage`**: `Error` | `string` - Error object or error message to be stored in the request. +- **`[options]`**: `Object` + - **`[omitStack]`**: `boolean` = false - Only push the error message without stack trace when true. + +--- diff --git a/website/versioned_docs/version-1.0.0/api/RequestList.md b/website/versioned_docs/version-1.0.0/api/RequestList.md new file mode 100644 index 000000000000..32e3fada13c2 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/api/RequestList.md @@ -0,0 +1,217 @@ +--- +id: version-1.0.0-request-list +title: RequestList +original_id: request-list +--- + + + +Represents a static list of URLs to crawl. The URLs can be provided either in code or parsed from a text file hosted on the web. `RequestList` is used +by [`BasicCrawler`](../api/basic-crawler), [`CheerioCrawler`](../api/cheerio-crawler), [`PuppeteerCrawler`](../api/puppeteer-crawler) and +[`PlaywrightCrawler`](../api/playwright-crawler) as a source of URLs to crawl. 
+ +Each URL is represented using an instance of the [`Request`](../api/request) class. The list can only contain unique URLs. More precisely, it can only +contain `Request` instances with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can also be overridden. To +add a single URL to the list multiple times, corresponding [`Request`](../api/request) objects will need to have different `uniqueKey` properties. You +can use the `keepDuplicateUrls` option to do this for you when initializing the `RequestList` from sources. + +Once you create an instance of `RequestList`, you need to call the [`RequestList.initialize()`](../api/request-list#initialize) function before the +instance can be used. After that, no more URLs can be added to the list. Unlike [`RequestQueue`](../api/request-queue), `RequestList` is static but it +can contain even millions of URLs. + +> Note that `RequestList` can be used together with `RequestQueue` by the same crawler. In such cases, each request from `RequestList` is enqueued +> into `RequestQueue` first and then consumed from the latter. This is necessary to avoid the same URL being processed more than once (from the list +> first and then possibly from the queue). In practical terms, such a combination can be useful when there is a large number of initial URLs, but more +> URLs would be added dynamically by the crawler. + +`RequestList` has an internal state where it stores information about which requests were already handled, which are in progress and which were +reclaimed. The state may be automatically persisted to the default [`KeyValueStore`](../api/key-value-store) by setting the `persistStateKey` option +so that if the Node.js process is restarted, the crawling can continue where it left off. The automated persisting is launched upon receiving the +`persistState` event that is periodically emitted by [`Apify.events`](../api/apify#events). + +The internal state is closely tied to the provided sources (URLs). If the sources change on actor restart, the state will become corrupted and +`RequestList` will raise an exception. This typically happens when the sources is a list of URLs downloaded from the web. In such case, use the +`persistRequestsKey` option in conjunction with `persistStateKey`, to make the `RequestList` store the initial sources to the default key-value store +and load them after restart, which will prevent any issues that a live list of URLs might cause. + +**Basic usage:** + +```javascript +// Use a helper function to simplify request list initialization. +// State and sources are automatically persisted. This is a preferred usage. +const requestList = await Apify.openRequestList('my-request-list', [ + 'http://www.example.com/page-1', + { url: 'http://www.example.com/page-2', method: 'POST', userData: { foo: 'bar' } }, + { requestsFromUrl: 'http://www.example.com/my-url-list.txt', userData: { isFromUrl: true } }, +]); +``` + +**Advanced usage:** + +```javascript +// Use the constructor to get more control over the initialization. +const requestList = new Apify.RequestList({ + sources: [ + // Separate requests + { url: 'http://www.example.com/page-1', method: 'GET', headers: { ... 
} }, + { url: 'http://www.example.com/page-2', userData: { foo: 'bar' }}, + + // Bulk load of URLs from file `http://www.example.com/my-url-list.txt` + // Note that all URLs must start with http:// or https:// + { requestsFromUrl: 'http://www.example.com/my-url-list.txt', userData: { isFromUrl: true } }, + ], + + // Persist the state to avoid re-crawling which can lead to data duplications. + // Keep in mind that the sources have to be immutable or this will throw an error. + persistStateKey: 'my-state', +}); + +await requestList.initialize(); +``` + +--- + + + +## `new RequestList(options)` + +**Parameters**: + +- **`options`**: [`RequestListOptions`](../typedefs/request-list-options) - All `RequestList` configuration options + +--- + + + +## `requestList.initialize()` + +Loads all remote sources of URLs and potentially starts periodic state persistence. This function must be called before you can start using the +instance in a meaningful way. + +**Returns**: + +`Promise` + +--- + + + +## `requestList.persistState()` + +Persists the current state of the `RequestList` into the default [`KeyValueStore`](../api/key-value-store). The state is persisted automatically in +regular intervals, but calling this method manually is useful in cases where you want to have the most current state available after you pause or stop +fetching its requests. For example after you pause or abort a crawl. Or just before a server migration. + +**Returns**: + +`Promise` + +--- + + + +## `requestList.getState()` + +Returns an object representing the internal state of the `RequestList` instance. Note that the object's fields can change in future releases. + +**Returns**: + +[`RequestListState`](../typedefs/request-list-state) + +--- + + + +## `requestList.isEmpty()` + +Resolves to `true` if the next call to [`RequestList.fetchNextRequest()`](../api/request-list#fetchnextrequest) function would return `null`, +otherwise it resolves to `false`. Note that even if the list is empty, there might be some pending requests currently being processed. + +**Returns**: + +`Promise` + +--- + + + +## `requestList.isFinished()` + +Returns `true` if all requests were already handled and there are no more left. + +**Returns**: + +`Promise` + +--- + + + +## `requestList.fetchNextRequest()` + +Gets the next [`Request`](../api/request) to process. First, the function gets a request previously reclaimed using the +[`RequestList.reclaimRequest()`](../api/request-list#reclaimrequest) function, if there is any. Otherwise it gets the next request from sources. + +The function's `Promise` resolves to `null` if there are no more requests to process. + +**Returns**: + +[`Promise<(Request|null)>`](../api/request) + +--- + + + +## `requestList.markRequestHandled(request)` + +Marks request as handled after successful processing. + +**Parameters**: + +- **`request`**: [`Request`](../api/request) + +**Returns**: + +`Promise` + +--- + + + +## `requestList.reclaimRequest(request)` + +Reclaims request to the list if its processing failed. The request will become available in the next `this.fetchNextRequest()`. + +**Parameters**: + +- **`request`**: [`Request`](../api/request) + +**Returns**: + +`Promise` + +--- + + + +## `requestList.length()` + +Returns the total number of unique requests present in the `RequestList`. + +**Returns**: + +`number` + +--- + + + +## `requestList.handledCount()` + +Returns number of handled requests. 
+ +**Returns**: + +`number` + +--- diff --git a/website/versioned_docs/version-1.0.0/api/RequestQueue.md b/website/versioned_docs/version-1.0.0/api/RequestQueue.md new file mode 100644 index 000000000000..36ef63ea8258 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/api/RequestQueue.md @@ -0,0 +1,235 @@ +--- +id: version-1.0.0-request-queue +title: RequestQueue +original_id: request-queue +--- + + + +Represents a queue of URLs to crawl, which is used for deep crawling of websites where you start with several URLs and then recursively follow links +to other pages. The data structure supports both breadth-first and depth-first crawling orders. + +Each URL is represented using an instance of the [`Request`](../api/request) class. The queue can only contain unique URLs. More precisely, it can +only contain [`Request`](../api/request) instances with distinct `uniqueKey` properties. By default, `uniqueKey` is generated from the URL, but it can +also be overridden. To add a single URL multiple times to the queue, corresponding [`Request`](../api/request) objects will need to have different +`uniqueKey` properties. + +Do not instantiate this class directly, use the [`Apify.openRequestQueue()`](../api/apify#openrequestqueue) function instead. + +`RequestQueue` is used by [`BasicCrawler`](../api/basic-crawler), [`CheerioCrawler`](../api/cheerio-crawler), +[`PuppeteerCrawler`](../api/puppeteer-crawler) and [`PlaywrightCrawler`](../api/playwright-crawler) as a source of URLs to crawl. Unlike +[`RequestList`](../api/request-list), `RequestQueue` supports dynamic adding and removing of requests. On the other hand, the queue is not optimized +for operations that add or remove a large number of URLs in a batch. + +`RequestQueue` stores its data either on local disk or in the Apify Cloud, depending on whether the `APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` +environment variable is set. + +If the `APIFY_LOCAL_STORAGE_DIR` environment variable is set, the queue data is stored in that directory in an SQLite database file. + +If the `APIFY_TOKEN` environment variable is set but `APIFY_LOCAL_STORAGE_DIR` is not, the data is stored in the +[Apify Request Queue](https://docs.apify.com/storage/request-queue) cloud storage. Note that you can force usage of the cloud storage also by passing +the `forceCloud` option to [`Apify.openRequestQueue()`](../api/apify#openrequestqueue) function, even if the `APIFY_LOCAL_STORAGE_DIR` variable is +set. + +**Example usage:** + +```javascript +// Open the default request queue associated with the actor run +const queue = await Apify.openRequestQueue(); + +// Open a named request queue +const queueWithName = await Apify.openRequestQueue('some-name'); + +// Enqueue few requests +await queue.addRequest({ url: 'http://example.com/aaa' }); +await queue.addRequest({ url: 'http://example.com/bbb' }); +await queue.addRequest({ url: 'http://example.com/foo/bar' }, { forefront: true }); +``` + +--- + + + +## `requestQueue.addRequest(requestLike, [options])` + +Adds a request to the queue. + +If a request with the same `uniqueKey` property is already present in the queue, it will not be updated. You can find out whether this happened from +the resulting [`QueueOperationInfo`](../typedefs/queue-operation-info) object. + +To add multiple requests to the queue by extracting links from a webpage, see the [`utils.enqueueLinks()`](../api/utils#enqueuelinks) helper function. 
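+
+For illustration, a minimal sketch, assuming `queue` is an open queue and that the resulting [`QueueOperationInfo`](../typedefs/queue-operation-info) exposes a `wasAlreadyPresent` flag:
+
+```javascript
+const { wasAlreadyPresent } = await queue.addRequest({ url: 'https://example.com/detail' });
+
+if (wasAlreadyPresent) {
+    console.log('The URL was already in the queue, so it was not added again.');
+}
+```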
+ +**Parameters**: + +- **`requestLike`**: [`Request`](../api/request) | [`RequestOptions`](../typedefs/request-options) - [`Request`](../api/request) object or vanilla + object with request data. Note that the function sets the `uniqueKey` and `id` fields to the passed Request. +- **`[options]`**: `Object` + - **`[forefront]`**: `boolean` = false - If `true`, the request will be added to the foremost position in the queue. + +**Returns**: + +[`Promise`](../typedefs/queue-operation-info) + +--- + + + +## `requestQueue.getRequest(id)` + +Gets the request from the queue specified by ID. + +**Parameters**: + +- **`id`**: `string` - ID of the request. + +**Returns**: + +[`Promise<(Request|null)>`](../api/request) - Returns the request object, or `null` if it was not found. + +--- + + + +## `requestQueue.fetchNextRequest()` + +Returns a next request in the queue to be processed, or `null` if there are no more pending requests. + +Once you successfully finish processing of the request, you need to call +[`RequestQueue.markRequestHandled()`](../api/request-queue#markrequesthandled) to mark the request as handled in the queue. If there was some error in +processing the request, call [`RequestQueue.reclaimRequest()`](../api/request-queue#reclaimrequest) instead, so that the queue will give the request +to some other consumer in another call to the `fetchNextRequest` function. + +Note that the `null` return value doesn't mean the queue processing finished, it means there are currently no pending requests. To check whether all +requests in queue were finished, use [`RequestQueue.isFinished()`](../api/request-queue#isfinished) instead. + +**Returns**: + +[`Promise<(Request|null)>`](../api/request) - Returns the request object or `null` if there are no more pending requests. + +--- + + + +## `requestQueue.markRequestHandled(request)` + +Marks a request that was previously returned by the [`RequestQueue.fetchNextRequest()`](../api/request-queue#fetchnextrequest) function as handled +after successful processing. Handled requests will never again be returned by the `fetchNextRequest` function. + +**Parameters**: + +- **`request`**: [`Request`](../api/request) + +**Returns**: + +[`Promise`](../typedefs/queue-operation-info) + +--- + + + +## `requestQueue.reclaimRequest(request, [options])` + +Reclaims a failed request back to the queue, so that it can be returned for processed later again by another call to +[`RequestQueue.fetchNextRequest()`](../api/request-queue#fetchnextrequest). The request record in the queue is updated using the provided `request` +parameter. For example, this lets you store the number of retries or error messages for the request. + +**Parameters**: + +- **`request`**: [`Request`](../api/request) +- **`[options]`**: `Object` - **`[forefront]`**: `boolean` = false - If `true` then the request it placed to the beginning of the + queue, so that it's returned in the next call to [`RequestQueue.fetchNextRequest()`](../api/request-queue#fetchnextrequest). By default, it's put + to the end of the queue. + +**Returns**: + +[`Promise`](../typedefs/queue-operation-info) + +--- + + + +## `requestQueue.isEmpty()` + +Resolves to `true` if the next call to [`RequestQueue.fetchNextRequest()`](../api/request-queue#fetchnextrequest) would return `null`, otherwise it +resolves to `false`. Note that even if the queue is empty, there might be some pending requests currently being processed. 
If you need to ensure that
+there is no activity in the queue, use [`RequestQueue.isFinished()`](../api/request-queue#isfinished).
+
+**Returns**:
+
+`Promise`
+
+---
+
+
+
+## `requestQueue.isFinished()`
+
+Resolves to `true` if all requests were already handled and there are no more left. Due to the nature of distributed storage used by the queue, the
+function might occasionally return a false negative, but it will never return a false positive.
+
+**Returns**:
+
+`Promise`
+
+---
+
+
+
+## `requestQueue.drop()`
+
+Removes the queue either from the Apify Cloud storage or from the local database, depending on the mode of operation.
+
+**Returns**:
+
+`Promise`
+
+---
+
+
+
+## `requestQueue.handledCount()`
+
+Returns the number of handled requests.
+
+This function is just a convenient shortcut for:
+
+```javascript
+const { handledRequestCount } = await queue.getInfo();
+```
+
+**Returns**:
+
+`Promise`
+
+---
+
+
+
+## `requestQueue.getInfo()`
+
+Returns an object containing general information about the request queue.
+
+The function returns the same object as the Apify API Client's [getQueue](https://docs.apify.com/api/apify-client-js/latest#ApifyClient-requestQueues)
+function, which in turn calls the [Get request queue](https://apify.com/docs/api/v2#/reference/request-queues/queue/get-request-queue) API endpoint.
+
+**Example:**
+
+```
+{
+    id: "WkzbQMuFYuamGv3YF",
+    name: "my-queue",
+    userId: "wRsJZtadYvn4mBZmm",
+    createdAt: new Date("2015-12-12T07:34:14.202Z"),
+    modifiedAt: new Date("2015-12-13T08:36:13.202Z"),
+    accessedAt: new Date("2015-12-14T08:36:13.202Z"),
+    totalRequestCount: 25,
+    handledRequestCount: 5,
+    pendingRequestCount: 20,
+}
+```
+
+**Returns**:
+
+[`Promise`](../typedefs/request-queue-info)
+
+---
diff --git a/website/versioned_docs/version-1.0.0/api/playwright.md b/website/versioned_docs/version-1.0.0/api/playwright.md
new file mode 100644
index 000000000000..a001bbb54142
--- /dev/null
+++ b/website/versioned_docs/version-1.0.0/api/playwright.md
@@ -0,0 +1,48 @@
+---
+id: version-1.0.0-playwright
+title: utils.playwright
+original_id: playwright
+---
+
+
+
+A namespace that contains various utilities for [Playwright](https://github.com/microsoft/playwright) - a browser automation library for Chromium, Firefox and WebKit.
+
+**Example usage:**
+
+```javascript
+const Apify = require('apify');
+const { playwright } = Apify.utils;
+
+// Navigate to https://www.example.com in Playwright with a POST request
+const browser = await Apify.launchPlaywright();
+const page = await browser.newPage();
+await playwright.gotoExtended(page, new Apify.Request({
+    url: 'https://www.example.com',
+    method: 'POST',
+}));
+```
+
+---
+
+
+
+## `playwright.gotoExtended`
+
+Extended version of Playwright's `page.goto()` which lets you perform requests with an HTTP method other than GET, with custom headers and with a POST
+payload. The URL, method, headers and payload are taken from the `request` parameter, which must be an instance of the `Apify.Request` class.
+
+_NOTE:_ In recent versions of Playwright, using requests other than GET, overriding headers or adding payloads disables the browser cache, which
+degrades performance.
+
+**Parameters**:
+
+- **`page`**: `Page` - Playwright [`Page`](https://playwright.dev/docs/api/class-page) object.
+- **`request`**: [`Request`](../api/request)
+- **`[gotoOptions]`**: `DirectNavigationOptions` - Custom options for `page.goto()`.
+ +**Returns**: + +`Promise<(Response|null)>` + +--- diff --git a/website/versioned_docs/version-1.0.0/guides/apify_platform.md b/website/versioned_docs/version-1.0.0/guides/apify_platform.md new file mode 100644 index 000000000000..c77f5b321a06 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/guides/apify_platform.md @@ -0,0 +1,71 @@ +--- +id: version-1.0.0-apify-platform +title: Apify Platform +original_id: apify-platform +--- + +Apify is a [platform](https://apify.com) built to serve large scale and high performance web scraping +and automation needs. It provides easy access to [compute instances (Actors)](#what-is-an-actor), +convenient [request](../guides/request-storage) and [result](../guides/result-storage) storages, [proxies](../guides/proxy-management), +[scheduling](https://docs.apify.com/scheduler), [webhooks](https://docs.apify.com/webhooks) +and [more](https://docs.apify.com/), accessible through a [web interface](https://my.apify.com) +or an [API](https://docs.apify.com/api). + +While we think that the Apify platform is super cool, and you should definitely try the +[free account](https://my.apify.com/sign-up), **Apify SDK is and will always be open source**, +runnable locally or on any cloud infrastructure. + +> Note that we do not test Apify SDK in other cloud environments such as Lambda or on specific +> architectures such as Raspberry PI. We strive to make it work, but there's no guarantee. + +## Logging into Apify platform from Apify SDK +To access your [Apify account](https://my.apify.com/sign-up) from the SDK, you must provide +credentials - [your API token](https://my.apify.com/account#/integrations). You can do that +either by utilizing [Apify CLI](https://github.com/apify/apify-cli) or by environment +variables. + +Once you provide credentials to your scraper, you will be able to use all the Apify platform +features of the SDK, such as calling Actors, saving to cloud storages, using Apify proxies, +setting up webhooks and so on. + +### Log in with CLI +Apify CLI allows you to log in to your Apify account on your computer. If you then run your +scraper using the CLI, your credentials will automatically be added. + +``` +npm install -g apify-cli +``` +``` +apify login -t YOUR_API_TOKEN +``` +In your project folder: +``` +apify run -p +``` + +### Log in with environment variables +If you prefer not to use Apify CLI, you can always provide credentials to your scraper +by setting the [`APIFY_TOKEN`](../guides/environment-variables#apify_token) environment +variable to your API token. + +> There's also the [`APIFY_PROXY_PASSWORD`](../guides/environment-variables#apify_proxy_password) +> environment variable. It is automatically inferred from your token by the SDK, but it can be useful +> when you need to access proxies from a different account than your token represents. + +## What is an Actor +When you deploy your script to the Apify platform, it becomes an [actor](https://apify.com/actors). +Actor is a serverless microservice that accepts an input and produces an output. It can run for +a few seconds, hours or even infinitely. An actor can perform anything from a simple action such +as filling out a web form or sending an email, to complex operations such as crawling an entire website +and removing duplicates from a large dataset. + +Actors can be shared in the [Apify Store](https://apify.com/store) so that other people can use them. +But don't worry, if you share your actor in the store and somebody uses it, it runs under their account, +not yours. 
+ +**Related links** + +- [Store of existing actors](https://apify.com/store) +- [Documentation](https://docs.apify.com/actor) +- [View actors in Apify app](https://my.apify.com/actors) +- [API reference](https://apify.com/docs/api/v2#/reference/actors) diff --git a/website/versioned_docs/version-1.0.0/guides/docker_images.md b/website/versioned_docs/version-1.0.0/guides/docker_images.md new file mode 100644 index 000000000000..c23b19c76b7f --- /dev/null +++ b/website/versioned_docs/version-1.0.0/guides/docker_images.md @@ -0,0 +1,174 @@ +--- +id: version-1.0.0-docker-images +title: Running in Docker +original_id: docker-images +--- + +Running headless browsers in Docker requires a lot of setup to do it right. But you don't need to +worry about that, because we already did it for you and created base images that you can freely use. +We use them every day on the [Apify Platform](../guides/apify_platform.md). + +All images can be found in their [GitHub repo](https://github.com/apify/apify-actor-docker) +and in our [DockerHub](https://hub.docker.com/orgs/apify). + +## Overview +Browsers are pretty big, so we try to provide a wide variety of images to suit your needs. Here's a full list +of our Docker images. + +- [`apify/actor-node`](#actor-node) +- [`apify/actor-node-puppeteer-chrome`](#actor-node-puppeteer-chrome) +- [`apify/actor-node-playwright`](#actor-node-playwright) +- [`apify/actor-node-playwright-chrome`](#actor-node-playwright-chrome) +- [`apify/actor-node-playwright-firefox`](#actor-node-playwright-firefox) +- [`apify/actor-node-playwright-webkit`](#actor-node-playwright-webkit) + +## Example Dockerfile +To use our images, you need a [`Dockerfile`](https://docs.docker.com/engine/reference/builder/). +You can either use this example, or bootstrap your projects with the [Apify CLI](../guides/getting_started.md#creating-a-new-project) +which automatically copies the correct Dockerfile into your project folder. + +```dockerfile +# First, specify the base Docker image. You can read more about +# the available images at https://sdk.apify.com/docs/guides/docker-images +# You can also use any other image from Docker Hub. +FROM apify/actor-node + +# Second, copy just package.json and package-lock.json since it should be +# the only file that affects "npm install" in the next step, to speed up the build +COPY package*.json ./ + +# Install NPM packages, skip optional and development dependencies to +# keep the image small. Avoid logging too much and print the dependency +# tree for debugging +RUN npm --quiet set progress=false \ + && npm install --only=prod --no-optional \ + && echo "Installed NPM packages:" \ + && (npm list || true) \ + && echo "Node.js version:" \ + && node --version \ + && echo "NPM version:" \ + && npm --version + +# Next, copy the remaining files and directories with the source code. +# Since we do this after NPM install, quick build will be really fast +# for most source file changes. +COPY . ./ + +# Optionally, specify how to launch the source code of your actor. +# By default, Apify's base Docker images define the CMD instruction +# that runs the Node.js source code using the command specified +# in the "scripts.start" section of the package.json file. +# In short, the instruction looks something like this: +# +# CMD npm start +``` + +## Versioning +The images are tagged with the version of the library that's preinstalled in the image. This ensures +compatibility. 
For example, `apify/actor-node-puppeteer-chrome:5.5.0` comes with Puppeteer v5.5.0, +which bundles Chromium 88, and we add Chrome 88. If you try to install a different version of Puppeteer +into this image, you may run into compatibility issues. + +Similarly `apify/actor-node-playwright-firefox:1.7.1` is pre-installed with the Firefox version that comes +with v1.7.1. + +Installing `apify/actor-node-puppeteer-chrome` (without a tag) will install the latest available version. + +We recommend reflecting this in your `package.json` files. Either by providing the same version of +the library as the selected image: + +```dockerfile +FROM apify/actor-node-playwright-chrome:1.7.1 +``` + +```json +{ + "dependencies": { + "playwright": "1.7.1" + } +} +``` + +Or by using an asterisk as your version. This will make sure the library version pre-installed in the +docker image is left untouched. + +```json +{ + "dependencies": { + "playwright": "*" + } +} +``` + +## Warning about image size +Browsers are huge. If you don't need them all in your image, it's better to use a smaller image with +only the one browser you need. + +Be careful when installing new dependencies. Nothing prevents you from installing Playwright into the +`actor-node-puppeteer-chrome` image, but the resulting image will be about 3 times larger and extremely +slow to download and build. + +Use only what you need and you'll be rewarded with reasonable build and start times. + +## actor-node +This is the smallest image we have based on Alpine Linux. It does not include any browsers, and it's therefore +best used with [`CheerioCrawler`](../api/cheerio-crawler). It benefits from lightning fast builds and container startups. + +[`PuppeteerCrawler`](../api/puppeteer-crawler), [`PlaywrightCrawler`](../api/playwright-crawler) +and other browser based features will **NOT** work with this image. + +```dockerfile +FROM apify/actor-node +``` + +## actor-node-puppeteer-chrome +This image includes Puppeteer (Chromium) and the Chrome browser. It can be used with +[`CheerioCrawler`](../api/cheerio-crawler) and [`PuppeteerCrawler`](../api/puppeteer-crawler), but **NOT** with +[`PlaywrightCrawler`](../api/playwright-crawler). + +The image supports XVFB by default, so you can run both `headless` and `headful` browsers with it. + +```dockerfile +FROM apify/actor-node-puppeteer-chrome +``` + +## actor-node-playwright +A very large and slow image that can run all Playwright browsers: Chromium, Chrome, Firefox, +WebKit. Everything is installed. If you need to develop or test with multiple browsers, this is the image to choose, +but in most cases, we suggest using the specialized images below. + +```dockerfile +FROM apify/actor-node-playwright +``` + +## actor-node-playwright-chrome +Similar to [`actor-node-puppeteer-chrome`](#actor-node-puppeteer-chrome), but for Playwright. You can run +[`CheerioCrawler`](../api/cheerio-crawler) and [`PlaywrightCrawler`](../api/playwright-crawler), +but **NOT** [`PuppeteerCrawler`](../api/puppeteer-crawler). + +It uses the [`PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD`](https://playwright.dev/docs/api/environment-variables/) +environment variable to block installation of more browsers into your images (to keep them small). +If you want more browsers, either choose the [`actor-node-playwright`](#actor-node-playwright) image +or override this env var. + +The image supports XVFB by default, so you can run both `headless` and `headful` browsers with it. 
+ +```dockerfile +FROM apify/actor-node-playwright-chrome +``` + +## actor-node-playwright-firefox +Same idea as [`actor-node-playwright-chrome`](#actor-node-playwright-chrome), but with Firefox +pre-installed. + +```dockerfile +FROM apify/actor-node-playwright-firefox +``` + +## actor-node-playwright-webkit +Same idea as [`actor-node-playwright-chrome`](#actor-node-playwright-chrome), but with WebKit +pre-installed. + +```dockerfile +FROM apify/actor-node-playwright-webkit +``` diff --git a/website/versioned_docs/version-1.0.0/guides/motivation.md b/website/versioned_docs/version-1.0.0/guides/motivation.md new file mode 100644 index 000000000000..82dbc9d7e81c --- /dev/null +++ b/website/versioned_docs/version-1.0.0/guides/motivation.md @@ -0,0 +1,22 @@ +--- +id: version-1.0.0-motivation +title: Motivation +original_id: motivation +--- + +Thanks to tools like [Playwright](https://github.com/microsoft/playwright), [Puppeteer](https://github.com/puppeteer/puppeteer) or +[Cheerio](https://www.npmjs.com/package/cheerio), it is easy to write Node.js code to extract data from web pages. But +eventually things will get complicated. For example, when you try to: + +- Perform a deep crawl of an entire website using a persistent queue of URLs. +- Run your scraping code on a list of 100k URLs in a CSV file, without losing any data when your code crashes. +- Rotate proxies to hide your browser origin and keep user-like sessions. +- Disable browser fingerprinting protections used by websites. + +Python has [Scrapy](https://scrapy.org/) for these tasks, but there was no such library for **JavaScript, the language of +the web**. The use of JavaScript is natural, since the same language is used to write the scripts as well as the data extraction code running in a +browser. + +The goal of the Apify SDK is to fill this gap and provide a toolbox for generic web scraping, crawling and automation tasks in JavaScript. So don't +reinvent the wheel every time you need data from the web, and focus on writing code specific to the target website, rather than developing +commonalities. diff --git a/website/versioned_docs/version-1.0.0/guides/quick_start.md b/website/versioned_docs/version-1.0.0/guides/quick_start.md new file mode 100644 index 000000000000..dd928afe342d --- /dev/null +++ b/website/versioned_docs/version-1.0.0/guides/quick_start.md @@ -0,0 +1,108 @@ +--- +id: version-1.0.0-quick-start +title: Quick Start +original_id: quick-start +--- + +This short tutorial will set you up to start using Apify SDK in a minute or two. +If you want to learn more, proceed to the [Getting Started](../guides/getting-started) +tutorial that will take you step by step through creating your first scraper. + +## Local stand-alone usage +Apify SDK requires [Node.js](https://nodejs.org/en/) 10.17 or later, with the exception of Node.js 11. +Add Apify SDK to any Node.js project by running: + +```bash +npm install apify playwright +``` + +> Neither `playwright` nor `puppeteer` are bundled with the SDK to reduce install size and allow greater +> flexibility. That's why we install it with NPM. You can choose one, both, or neither. + +Run the following example to perform a recursive crawl of a website using Playwright. For more examples showcasing various features of the Apify SDK, +[see the Examples section of the documentation](../examples/crawl-multiple-urls). + +```javascript +const Apify = require('apify'); + +// Apify.main is a helper function, you don't need to use it. 
+Apify.main(async () => { + const requestQueue = await Apify.openRequestQueue(); + // Choose the first URL to open. + await requestQueue.addRequest({ url: 'https://www.iana.org/' }); + + const crawler = new Apify.PlaywrightCrawler({ + requestQueue, + handlePageFunction: async ({ request, page }) => { + // Extract HTML title of the page. + const title = await page.title(); + console.log(`Title of ${request.url}: ${title}`); + + // Add URLs that match the provided pattern. + await Apify.utils.enqueueLinks({ + page, + requestQueue, + pseudoUrls: ['https://www.iana.org/[.*]'], + }); + }, + }); + + await crawler.run(); +}); +``` + +When you run the example, you should see Apify SDK automating a Chrome browser. + +![Chrome Scrape](/img/chrome_scrape.gif) + +By default, Apify SDK stores data to `./apify_storage` in the current working directory. You can override this behavior by setting either the +`APIFY_LOCAL_STORAGE_DIR` or `APIFY_TOKEN` environment variable. For details, see [Environment variables](../guides/environment-variables), [Request storage](../guides/request-storage) and [Result storage](../guides/result-storage). + +## Local usage with Apify command-line interface (CLI) + +To avoid the need to set the environment variables manually, to create a boilerplate of your project, and to enable pushing and running your code on +the [Apify platform](../guides/apify-platform), you can use the [Apify command-line interface (CLI)](https://github.com/apify/apify-cli) tool. + +Install the CLI by running: + +```bash +npm -g install apify-cli +``` + +Now create a boilerplate of your new web crawling project by running: + +```bash +apify create my-hello-world +``` + +The CLI will prompt you to select a project boilerplate template - just pick "Hello world". The tool will create a directory called `my-hello-world` +with a Node.js project files. You can run the project as follows: + +```bash +cd my-hello-world +apify run +``` + +By default, the crawling data will be stored in a local directory at `./apify_storage`. For example, the input JSON file for the actor is expected to +be in the default key-value store in `./apify_storage/key_value_stores/default/INPUT.json`. + +Now you can easily deploy your code to the Apify platform by running: + +```bash +apify login +``` + +```bash +apify push +``` + +Your script will be uploaded to the Apify platform and built there so that it can be run. For more information, view the +[Apify Actor](https://docs.apify.com/cli) documentation. + +## Usage on the Apify platform + +You can also develop your web scraping project in an online code editor directly on the [Apify platform](../guides/apify-platform). +You'll need to have an Apify Account. Go to [Actors](https://my.apify.com/actors), page in the app, click Create new +and then go to the Source tab and start writing your code or paste one of the examples from the Examples section. + +For more information, view the [Apify actors quick start guide](https://docs.apify.com/actor/quick-start). diff --git a/website/versioned_docs/version-1.0.0/guides/request_storage.md b/website/versioned_docs/version-1.0.0/guides/request_storage.md new file mode 100644 index 000000000000..dd7cc9a68a8e --- /dev/null +++ b/website/versioned_docs/version-1.0.0/guides/request_storage.md @@ -0,0 +1,132 @@ +--- +id: version-1.0.0-request-storage +title: Request Storage +original_id: request-storage +--- + +The Apify SDK has several request storage types that are useful for specific tasks. 
The requests are stored either on local disk to a directory defined by the +`APIFY_LOCAL_STORAGE_DIR` environment variable, or on the [Apify platform](/docs/guides/apify-platform) under the user account identified by the API token defined by the `APIFY_TOKEN` environment variable. If neither of these variables is defined, by default Apify SDK sets `APIFY_LOCAL_STORAGE_DIR` to `./apify_storage` in the current working directory and prints a warning. + +Typically, you will be developing the code on your local computer and thus set the `APIFY_LOCAL_STORAGE_DIR` environment variable. Once the code is ready, you will deploy it to the Apify platform, where it will automatically set the `APIFY_TOKEN` environment variable and thus use cloud storage. No code changes are needed. + +**Related links** + +- [Apify platform storage documentation](https://docs.apify.com/storage) +- [View storage in Apify app](https://my.apify.com/storage) +- [Request queues API reference](https://docs.apify.com/api/v2#/reference/request-queues) + +## Request queue + +The request queue is a storage of URLs to crawl. The queue is used for the deep crawling of websites, where you start with several URLs and then recursively follow links to other pages. The data structure supports both breadth-first and depth-first crawling orders. + +Each actor run is associated with a **default request queue**, which is created exclusively for the actor run. Typically, it is used to store URLs to crawl in the specific actor run. Its usage is optional. + +In Apify SDK, the request queue is represented by the [`RequestQueue`](/docs/api/request-queue) class. + +In local configuration, the request queue is emulated by [@apify/storage-local](https://github.com/apify/apify-storage-local-js) NPM package and its data is stored in SQLite database in the directory specified by the `APIFY_LOCAL_STORAGE_DIR` environment variable as follows: + +``` +{APIFY_LOCAL_STORAGE_DIR}/request_queues/{QUEUE_ID}/db.sqlite +``` + +Note that `{QUEUE_ID}` is the name or ID of the request queue. The default queue has ID `default`, unless you override it by setting the `APIFY_DEFAULT_REQUEST_QUEUE_ID` environment variable. + +The following code demonstrates basic operations of the request queue: + +```javascript +// Open the default request queue associated with the actor run +const requestQueue = await Apify.openRequestQueue(); +// Enqueue the initial request +await requestQueue.addRequest({ url: 'https://example.com' }); + +// The crawler will automatically process requests from the queue +const crawler = new Apify.CheerioCrawler({ + requestQueue, + handlePageFunction: async ({ $, request }) => { + // Add new request to the queue + await requestQueue.addRequest({ url: 'https://example.com/new-page' }); + // Add links found on page to the queue + await Apify.utils.enqueueLinks({ $, requestQueue }); + }, +}); +``` + +To see more detailed example of how to use the request queue with a crawler, see the [Puppeteer Crawler](/docs/examples/puppeteer-crawler) example. + +## Request list + +The request list is not a storage per se - it represents the list of URLs to crawl that is stored in a run memory (or optionally in default [Key-Value Store](../guides/results-storage#key-value-store) associated with the run, if specified). The list is used for the crawling of a large number of URLs, when you know all the URLs which should be visited by the crawler and no URLs would be added during the run. 
The URLs can be provided either in code or parsed from a text file hosted on the web. + +Request list is created exclusively for the actor run and only if its usage is explicitly specified in the code. Its usage is optional. + +In Apify SDK, the request list is represented by the [`RequestList`](/docs/api/request-list) class. + +The following code demonstrates basic operations of the request list: + +```javascript +// Prepare the sources array with URLs to visit +const sources = [ + { url: 'http://www.example.com/page-1' }, + { url: 'http://www.example.com/page-2' }, + { url: 'http://www.example.com/page-3' }, +]; +// Open the request list. +// List name is used to persist the sources and the list state in the key-value store +const requestList = await Apify.openRequestList('my-list', sources); + +// The crawler will automatically process requests from the list +const crawler = new Apify.PuppeteerCrawler({ + requestList, + handlePageFunction: async ({ page, request }) => { + // Process the page (extract data, take page screenshot, etc). + // No more requests could be added to the request list here + }, +}); +``` + +To see more detailed example of how to use the request list with a crawler, see the [Puppeteer with proxy](/docs/examples/puppeteer-with-proxy) example. + +## Which one to choose? + +When using Request queue - you would normally have several start URLs (e.g. category pages on e-commerce website) and then recursively add more (e.g. individual item pages) programmatically to the queue, it supports dynamic adding and removing of requests. No more URLs can be added to Request list after its initialization as it is immutable, URLs cannot be removed from the list either. + +On the other hand, the Request queue is not optimized for adding or removing numerous URLs in a batch. This is technically possible, but requests are added one by one to the queue, and thus it would take significant time with a larger number of requests. Request list however can contain even millions of URLs, and it would take significantly less time to add them to the list, compared to the queue. + +Note that Request queue and Request list can be used together by the same crawler. +In such cases, each request from the Request list is enqueued into the Request queue first (to the foremost position in the queue, even if Request queue is not empty) and then consumed from the latter. +This is necessary to avoid the same URL being processed more than once (from the list first and then possibly from the queue). +In practical terms, such a combination can be useful when there are numerous initial URLs, but more URLs would be added dynamically by the crawler. + +The following code demonstrates how to use Request queue and Request list in the same crawler: +```javascript +// Prepare the sources array with URLs to visit (it can contain millions of URLs) +const sources = [ + { url: 'http://www.example.com/page-1' }, + { url: 'http://www.example.com/page-2' }, + { url: 'http://www.example.com/page-3' }, +]; +// Open the request list +const requestList = await Apify.openRequestList('my-list', sources); + +// Open the default request queue. It's not necessary to add any requests to the queue +const requestQueue = await Apify.openRequestQueue(); + +// The crawler will automatically process requests from the list and the queue +const crawler = new Apify.PuppeteerCrawler({ + requestList, + requestQueue, + // Each request from the request list is enqueued to the request queue one by one. 
+ // At this point request with the same URL would exist in the list and the queue + handlePageFunction: async ({ request, page }) => { + // Add new request to the queue + await requestQueue.addRequest({ url: 'http://www.example.com/new-page' }); + + // Add links found on page to the queue + await Apify.utils.enqueueLinks({ page, requestQueue }); + + // The requests above would be added to the queue (but not to the list) + // and would be processed after the request list is empty. + // No more requests could be added to the list here + }, +}); +``` diff --git a/website/versioned_docs/version-1.0.0/guides/result_storage.md b/website/versioned_docs/version-1.0.0/guides/result_storage.md new file mode 100644 index 000000000000..b3f74ec049a3 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/guides/result_storage.md @@ -0,0 +1,116 @@ +--- +id: version-1.0.0-result-storage +title: Result Storage +original_id: result-storage +--- + +The Apify SDK has several result storage types that are useful for specific tasks. The data is stored either on local disk to a directory defined by the +`APIFY_LOCAL_STORAGE_DIR` environment variable, or on the [Apify platform](../guides/apify-platform) under the user account +identified by the API token defined by the `APIFY_TOKEN` environment variable. If neither of these variables is defined, by default Apify SDK sets +`APIFY_LOCAL_STORAGE_DIR` to `./apify_storage` in the current working directory and prints a warning. + +Typically, you will be developing the code on your local computer and thus set the `APIFY_LOCAL_STORAGE_DIR` environment variable. Once the code is +ready, you will deploy it to the Apify platform, where it will automatically set the `APIFY_TOKEN` environment variable and thus use cloud storage. No +code changes are needed. + +**Related links** + +- [Apify platform storage documentation](https://docs.apify.com/storage) +- [View storage in Apify app](https://my.apify.com/storage) +- [Key-value stores API reference](https://apify.com/docs/api/v2#/reference/key-value-stores) +- [Datasets API reference](https://docs.apify.com/api/v2#/reference/datasets) + +## Key-value store + +The key-value store is used for saving and reading data records or files. Each data record is represented by a unique key and associated with a MIME +content type. Key-value stores are ideal for saving screenshots of web pages, PDFs or to persist the state of crawlers. + +Each actor run is associated with a **default key-value store**, which is created exclusively for the actor run. By convention, the actor run input +and output is stored in the default key-value store under the `INPUT` and `OUTPUT` key, respectively. Typically the input and output is a JSON file, +although it can be any other format. + +In the Apify SDK, the key-value store is represented by the [`KeyValueStore`](../api/key-value-store) class. In order to simplify access to the default +key-value store, the SDK also provides [`Apify.getValue()`](../api/apify#getvalue) and +[`Apify.setValue()`](../api/apify#setvalue) functions. + +In local configuration, the data is stored in the directory specified by the `APIFY_LOCAL_STORAGE_DIR` environment variable as follows: + +``` +{APIFY_LOCAL_STORAGE_DIR}/key_value_stores/{STORE_ID}/{KEY}.{EXT} +``` + +Note that `{STORE_ID}` is the name or ID of the key-value store. The default key value store has ID `default`, unless you override it by setting the +`APIFY_DEFAULT_KEY_VALUE_STORE_ID` environment variable. 
The `{KEY}` is the key of the record and `{EXT}` corresponds to the MIME content type of the +data value. + +The following code demonstrates basic operations of key-value stores: + +```javascript +// Get actor input from the default key-value store +const input = await Apify.getInput(); + +// Write actor output to the default key-value store. +await Apify.setValue('OUTPUT', { myResult: 123 }); + +// Open a named key-value store +const store = await Apify.openKeyValueStore('some-name'); + +// Write record. JavaScript object is automatically converted to JSON, +// strings and binary buffers are stored as they are +await store.setValue('some-key', { foo: 'bar' }); + +// Read record. Note that JSON is automatically parsed to a JavaScript object, +// text data returned as a string and other data is returned as binary buffer +const value = await store.getValue('some-key'); + +// Delete record +await store.setValue('some-key', null); +``` + +To see a real-world example of how to get the input from the key-value store, see the [Screenshots](../examples/screenshots) example. + +## Dataset + +Datasets are used to store structured data where each object stored has the same attributes, such as online store products or real estate offers. You +can imagine a dataset as a table, where each object is a row and its attributes are columns. Dataset is an append-only storage - you can only add new +records to it but you cannot modify or remove existing records. + +When the dataset is stored on the [Apify platform](../guides/apify-platform), you can export its data to the following formats: HTML, +JSON, CSV, Excel, XML and RSS. The datasets are displayed on the actor run details page and in the +[Storage](https://my.apify.com/storage) section in the Apify app. The actual data is exported using the +[Get dataset items](https://apify.com/docs/api/v2#/reference/datasets/item-collection/get-items) Apify API endpoint. This +way you can easily share crawling results. + +Each actor run is associated with a **default dataset**, which is created exclusively for the actor run. Typically, it is used to store crawling +results specific for the actor run. Its usage is optional. + +In the Apify SDK, the dataset is represented by the [`Dataset`](../api/dataset) class. In order to simplify writes to the default dataset, the SDK +also provides the [`Apify.pushData()`](../api/apify#pushdata) function. + +In local configuration, the data is stored in the directory specified by the `APIFY_LOCAL_STORAGE_DIR` environment variable as follows: + +``` +{APIFY_LOCAL_STORAGE_DIR}/datasets/{DATASET_ID}/{INDEX}.json +``` + +Note that `{DATASET_ID}` is the name or ID of the dataset. The default dataset has ID `default`, unless you override it by setting the +`APIFY_DEFAULT_DATASET_ID` environment variable. Each dataset item is stored as a separate JSON file, where `{INDEX}` is a zero-based index of the +item in the dataset. + +The following code demonstrates basic operations of the dataset: + +```javascript +// Write a single row to the default dataset +await Apify.pushData({ col1: 123, col2: 'val2' }); + +// Open a named dataset +const dataset = await Apify.openDataset('some-name'); + +// Write a single row +await dataset.pushData({ foo: 'bar' }); + +// Write multiple rows +await dataset.pushData([{ foo: 'bar2', col2: 'val2' }, { col3: 123 }]); +``` + +To see how to use the dataset to store crawler results, see the [Cheerio Crawler](../examples/cheerio-crawler) example. 
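If you later need to read the stored items back, the [`Dataset`](../api/dataset) class also provides iteration helpers such as `forEach()`. A minimal sketch, reusing the named dataset from the snippet above:

```javascript
const Apify = require('apify');

Apify.main(async () => {
    // Open the same named dataset as in the example above
    const dataset = await Apify.openDataset('some-name');

    // Iterate over all stored items in insertion order
    await dataset.forEach(async (item, index) => {
        console.log(`Item ${index}:`, item);
    });
});
```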
diff --git a/website/versioned_docs/version-1.0.0/typedefs/BasicCrawlerOptions.md b/website/versioned_docs/version-1.0.0/typedefs/BasicCrawlerOptions.md new file mode 100644 index 000000000000..99314598878d --- /dev/null +++ b/website/versioned_docs/version-1.0.0/typedefs/BasicCrawlerOptions.md @@ -0,0 +1,149 @@ +--- +id: version-1.0.0-basic-crawler-options +title: BasicCrawlerOptions +original_id: basic-crawler-options +--- + + + +## Properties + +### `handleRequestFunction` + +**Type**: [`HandleRequest`](../typedefs/handle-request) + +User-provided function that performs the logic of the crawler. It is called for each URL to crawl. + +The function receives the following object as an argument: + +``` +{ + request: Request, + session: Session, + crawler: BasicCrawler, +} +``` + +where the [`Request`](../api/request) instance represents the URL to crawl. + +The function must return a promise, which is then awaited by the crawler. + +If the function throws an exception, the crawler will try to re-crawl the request later, up to `option.maxRequestRetries` times. If all the retries +fail, the crawler calls the function provided to the `handleFailedRequestFunction` parameter. To make this work, you should **always** let your +function throw exceptions rather than catch them. The exceptions are logged to the request using the +[`Request.pushErrorMessage()`](../api/request#pusherrormessage) function. + +--- + +### `requestList` + +**Type**: [`RequestList`](../api/request-list) + +Static list of URLs to be processed. Either `requestList` or `requestQueue` option must be provided (or both). + +--- + +### `requestQueue` + +**Type**: [`RequestQueue`](../api/request-queue) + +Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites. Either `requestList` or `requestQueue` option must be +provided (or both). + +--- + +### `handleRequestTimeoutSecs` + +**Type**: `number` = 60 + +Timeout in which the function passed as `handleRequestFunction` needs to finish, in seconds. + +--- + +### `handleFailedRequestFunction` + +**Type**: [`HandleFailedRequest`](../typedefs/handle-failed-request) + +A function to handle requests that failed more than `option.maxRequestRetries` times. + +The function receives the following object as an argument: + +``` +{ + request: Request, + error: Error, + session: Session, + crawler: BasicCrawler, +} +``` + +where the [`Request`](../api/request) instance corresponds to the failed request, and the `Error` instance represents the last error thrown during +processing of the request. + +See [source code](https://github.com/apify/apify-js/blob/master/src/crawlers/basic_crawler.js#L11) for the default implementation of this function. + +--- + +### `maxRequestRetries` + +**Type**: `number` = 3 + +Indicates how many times the request is retried if +[`BasicCrawlerOptions.handleRequestFunction`](../typedefs/basic-crawler-options#handlerequestfunction) fails. + +--- + +### `maxRequestsPerCrawl` + +**Type**: `number` + +Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached. Always set this value in order to prevent infinite +loops in misconfigured crawlers. Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value. + +--- + +### `autoscaledPoolOptions` + +**Type**: [`AutoscaledPoolOptions`](../typedefs/autoscaled-pool-options) + +Custom options passed to the underlying [`AutoscaledPool`](../api/autoscaled-pool) constructor. 
Note that the `runTaskFunction` and `isTaskReadyFunction` options are provided by `BasicCrawler` and cannot be overridden. However, you can provide a custom implementation of `isFinishedFunction`.

---

### `minConcurrency`

**Type**: `number` = 1

Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding [`AutoscaledPool`](../api/autoscaled-pool) option.

_WARNING:_ If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slow or crash. If you're not sure, just keep the default value and the concurrency will scale up automatically.

---

### `maxConcurrency`

**Type**: `number` = 1000

Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding [`AutoscaledPool`](../api/autoscaled-pool) option.

---

### `useSessionPool`

**Type**: `boolean` = true

Basic crawler will initialize the [`SessionPool`](../api/session-pool) with the corresponding `sessionPoolOptions`. The session instance will then be available in the `handleRequestFunction`.

---

### `sessionPoolOptions`

**Type**: [`SessionPoolOptions`](../typedefs/session-pool-options)

The configuration options for [`SessionPool`](../api/session-pool) to use.

---
diff --git a/website/versioned_docs/version-1.0.0/typedefs/BrowserLaunchContext.md b/website/versioned_docs/version-1.0.0/typedefs/BrowserLaunchContext.md
new file mode 100644
index 000000000000..8ad324e021d4
--- /dev/null
+++ b/website/versioned_docs/version-1.0.0/typedefs/BrowserLaunchContext.md
@@ -0,0 +1,46 @@
---
id: version-1.0.0-browser-launch-context
title: BrowserLaunchContext
original_id: browser-launch-context
---

## Properties

### `launchOptions`

**Type**: `object`

Options passed to the browser launcher function. The available options depend on the underlying browser library.

---

### `proxyUrl`

**Type**: `string`

URL to a HTTP proxy server. It must define the port number, and it may also contain proxy username and password.

Example: `http://bob:pass123@proxy.example.com:1234`.

---

### `useChrome`

**Type**: `boolean` = false

If `true` and `executablePath` is not set, Playwright will launch the full Google Chrome browser available on the machine rather than the bundled Chromium. The path to the Chrome executable is taken from the `APIFY_CHROME_EXECUTABLE_PATH` environment variable if provided, or defaults to the typical Google Chrome executable location specific for the operating system. By default, this option is `false`.

---

### `launcher`

**Type**: `Object`

By default, `require("playwright").chromium` is used. If you want to use a different browser, you can pass it via this property, e.g. `require("playwright").firefox`.

---
diff --git a/website/versioned_docs/version-1.0.0/typedefs/CheerioCrawlerOptions.md b/website/versioned_docs/version-1.0.0/typedefs/CheerioCrawlerOptions.md
new file mode 100644
index 000000000000..5cb4634cdcaa
--- /dev/null
+++ b/website/versioned_docs/version-1.0.0/typedefs/CheerioCrawlerOptions.md
@@ -0,0 +1,328 @@
---
id: version-1.0.0-cheerio-crawler-options
title: CheerioCrawlerOptions
original_id: cheerio-crawler-options
---

## Properties

### `handlePageFunction`

**Type**: [`CheerioHandlePage`](../typedefs/cheerio-handle-page)

User-provided function that performs the logic of the crawler. It is called for each page loaded and parsed by the crawler.
+ +The function receives the following object as an argument: + +``` +{ + // The Cheerio object's function with the parsed HTML. + $: Cheerio, + + // The request body of the web page, whose type depends on the content type. + body: String|Buffer, + + // The parsed object from JSON for responses with the "application/json" content types. + // For other content types it's null. + json: Object, + + // Apify.Request object with details of the requested web page + request: Request, + + // Parsed Content-Type HTTP header: { type, encoding } + contentType: Object, + + // An instance of Node's http.IncomingMessage object, + response: Object, + + // Session object, useful to work around anti-scraping protections + session: Session + + // ProxyInfo object with information about currently used proxy + proxyInfo: ProxyInfo + + // The running cheerio crawler instance. + crawler: CheerioCrawler +} +``` + +Type of `body` depends on the `Content-Type` header of the web page: + +- String for `text/html`, `application/xhtml+xml`, `application/xml` MIME content types +- Buffer for others MIME content types + +Parsed `Content-Type` header using [content-type package](https://www.npmjs.com/package/content-type) is stored in `contentType`. + +Cheerio is available only for HTML and XML content types. + +With the [`Request`](../api/request) object representing the URL to crawl. + +If the function returns, the returned promise is awaited by the crawler. + +If the function throws an exception, the crawler will try to re-crawl the request later, up to `option.maxRequestRetries` times. If all the retries +fail, the crawler calls the function provided to the `handleFailedRequestFunction` parameter. To make this work, you should **always** let your +function throw exceptions rather than catch them. The exceptions are logged to the request using the +[`Request.pushErrorMessage()`](../api/request#pusherrormessage) function. + +--- + +### `requestList` + +**Type**: [`RequestList`](../api/request-list) + +Static list of URLs to be processed. Either `requestList` or `requestQueue` option must be provided (or both). + +--- + +### `requestQueue` + +**Type**: [`RequestQueue`](../api/request-queue) + +Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites. Either `requestList` or `requestQueue` option must be +provided (or both). + +--- + +### `prepareRequestFunction` + +**Type**: [`PrepareRequest`](../typedefs/prepare-request) + +A function that executes before the HTTP request is made to the target resource. This function is suitable for setting dynamic properties such as +cookies to the [`Request`](../api/request). + +The function receives the following object as an argument: + +``` +{ + request: Request, + session: Session, + proxyInfo: ProxyInfo, + crawler: CheerioCrawler, +} +``` + +where the [`Request`](../api/request) instance corresponds to the initialized request and the [`Session`](../api/session) instance corresponds to used +session. + +The function should modify the properties of the passed [`Request`](../api/request) instance in place because there are already earlier references to +it. Making a copy and returning it from this function is therefore not supported, because it would create inconsistencies where different parts of SDK +would have access to a different [`Request`](../api/request) instance. 
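For illustration, a minimal sketch of such a function, in the same style as the `postResponseFunction` example below; the cookie value is purely a placeholder:

```javascript
const cheerioCrawlerOptions = {
    // ...
    prepareRequestFunction: ({ request, session }) => {
        // Modify the request in place, e.g. attach a cookie header
        // before the HTTP request is sent (placeholder value).
        request.headers = {
            ...request.headers,
            Cookie: 'sessionid=abc123',
        };
    },
};
```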
+ +--- + +### `postResponseFunction` + +**Type**: [`PostResponse`](../typedefs/post-response) + +A function that executes right after the HTTP request is made to the target resource and response is returned. This function is suitable for +overriding custom properties of response e.g. setting headers because of response parsing. + +**Example usage:** + +```javascript +const cheerioCrawlerOptions = { + // ... + postResponseFunction: ({ request, response }) => { + if (request.userData.parseAsJSON) { + response.headers['content-type'] = 'application/json; charset=utf-8'; + } + }, +}; +``` + +The function receives the following object as an argument: + +``` +{ + response: Object, + request: Request, + session: Session, + proxyInfo: ProxyInfo, + crawler: CheerioCrawler, +} +``` + +The response is an instance of Node's http.IncomingMessage object. + +--- + +### `handlePageTimeoutSecs` + +**Type**: `number` = 60 + +Timeout in which the function passed as `handlePageFunction` needs to finish, given in seconds. + +--- + +### `requestTimeoutSecs` + +**Type**: `number` = 30 + +Timeout in which the HTTP request to the resource needs to finish, given in seconds. + +--- + +### `ignoreSslErrors` + +**Type**: `boolean` = true + +If set to true, SSL certificate errors will be ignored. + +--- + +### `proxyConfiguration` + +**Type**: [`ProxyConfiguration`](../api/proxy-configuration) + +If set, `CheerioCrawler` will be configured for all connections to use [Apify Proxy](https://my.apify.com/proxy) or your own Proxy URLs provided and +rotated according to the configuration. For more information, see the [documentation](https://docs.apify.com/proxy). + +--- + +### `handleFailedRequestFunction` + +**Type**: [`HandleFailedRequest`](../typedefs/handle-failed-request) + +A function to handle requests that failed more than `option.maxRequestRetries` times. + +The function receives the following object as an argument: + +``` +{ + error: Error, + request: Request, + session: Session, + $: Cheerio, + body: String|Buffer, + json: Object, + contentType: Object, + response: Object, + proxyInfo: ProxyInfo, + crawler: CheerioCrawler, +} +``` + +where the [`Request`](../api/request) instance corresponds to the failed request, and the `Error` instance represents the last error thrown during +processing of the request. + +See [source code](https://github.com/apify/apify-js/blob/master/src/crawlers/cheerio_crawler.js#L13) for the default implementation of this function. + +--- + +### `additionalMimeTypes` + +**Type**: `Array` + +An array of MIME types you want the crawler to load and process. By default, only `text/html` and `application/xhtml+xml` MIME types are +supported. + +--- + +### `suggestResponseEncoding` + +**Type**: `string` + +By default `CheerioCrawler` will extract correct encoding from the HTTP response headers. Sadly, there are some websites which use invalid headers. +Those are encoded using the UTF-8 encoding. If those sites actually use a different encoding, the response will be corrupted. You can use +`suggestResponseEncoding` to fall back to a certain encoding, if you know that your target website uses it. 
To force a certain encoding, disregarding +the response headers, use [`CheerioCrawlerOptions.forceResponseEncoding`](../typedefs/cheerio-crawler-options#forceresponseencoding) + +``` +// Will fall back to windows-1250 encoding if none found +suggestResponseEncoding: 'windows-1250' +``` + +--- + +### `forceResponseEncoding` + +**Type**: `string` + +By default `CheerioCrawler` will extract correct encoding from the HTTP response headers. Use `forceResponseEncoding` to force a certain encoding, +disregarding the response headers. To only provide a default for missing encodings, use +[`CheerioCrawlerOptions.suggestResponseEncoding`](../typedefs/cheerio-crawler-options#suggestresponseencoding) + +``` +// Will force windows-1250 encoding even if headers say otherwise +forceResponseEncoding: 'windows-1250' +``` + +--- + +### `maxRequestRetries` + +**Type**: `number` = 3 + +Indicates how many times the request is retried if either `requestFunction` or `handlePageFunction` fails. + +--- + +### `maxRequestsPerCrawl` + +**Type**: `number` + +Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached. Always set this value in order to prevent infinite +loops in misconfigured crawlers. Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value. + +--- + +### `autoscaledPoolOptions` + +**Type**: [`AutoscaledPoolOptions`](../typedefs/autoscaled-pool-options) + +Custom options passed to the underlying [`AutoscaledPool`](../api/autoscaled-pool) constructor. Note that the `runTaskFunction`, `isTaskReadyFunction` +and `isFinishedFunction` options are provided by `CheerioCrawler` and cannot be overridden. Reasonable [`Snapshotter`](../api/snapshotter) and +[`SystemStatus`](../api/system-status) defaults are provided to account for the fact that `cheerio` parses HTML synchronously and therefore blocks the +event loop. + +--- + +### `minConcurrency` + +**Type**: `number` = 1 + +Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding [`AutoscaledPool`](../api/autoscaled-pool) option. + +_WARNING:_ If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slow or crash. If +you're not sure, just keep the default value and the concurrency will scale up automatically. + +--- + +### `maxConcurrency` + +**Type**: `number` = 1000 + +Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding [`AutoscaledPool`](../api/autoscaled-pool) option. + +--- + +### `useSessionPool` + +**Type**: `boolean` = false + +If set to true Crawler will automatically use Session Pool. It will automatically retire sessions on 403, 401 and 429 status codes. It also marks +Session as bad after a request timeout. + +--- + +### `sessionPoolOptions` + +**Type**: [`SessionPoolOptions`](../typedefs/session-pool-options) + +Custom options passed to the underlying [`SessionPool`](../api/session-pool) constructor. + +--- + +### `persistCookiesPerSession` + +**Type**: `boolean` + +Automatically saves cookies to Session. Works only if Session Pool is used. + +It parses cookie from response "set-cookie" header saves or updates cookies for session and once the session is used for next request. It passes the +"Cookie" header to the request with the session cookies. 
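As a quick sketch of how these session-related options fit together (assuming `requestList` and `handlePageFunction` are defined elsewhere):

```javascript
const crawler = new Apify.CheerioCrawler({
    requestList,
    handlePageFunction,
    // Enable the session pool and keep cookies per session
    useSessionPool: true,
    persistCookiesPerSession: true,
    sessionPoolOptions: {
        maxPoolSize: 50,
    },
});
```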
---
diff --git a/website/versioned_docs/version-1.0.0/typedefs/CheerioHandlePageInputs.md b/website/versioned_docs/version-1.0.0/typedefs/CheerioHandlePageInputs.md
new file mode 100644
index 000000000000..8c2b06d28dc3
--- /dev/null
+++ b/website/versioned_docs/version-1.0.0/typedefs/CheerioHandlePageInputs.md
@@ -0,0 +1,77 @@
---
id: version-1.0.0-cheerio-handle-page-inputs
title: CheerioHandlePageInputs
original_id: cheerio-handle-page-inputs
---

## Properties

### `$`

**Type**: `cheerio.Selector`

The [Cheerio](https://cheerio.js.org/) object with parsed HTML.

---

### `body`

**Type**: `string` | `Buffer`

The request body of the web page.

---

### `json`

**Type**: `*`

The object parsed from the JSON string, if the response has the `application/json` content type.

---

### `request`

**Type**: [`Request`](../api/request)

The original [`Request`](../api/request) object.

---

### `contentType`

**Type**: `Object`

Parsed `Content-Type` header: `{ type, encoding }`.

---

### `response`

**Type**: `IncomingMessage`

An instance of Node's `http.IncomingMessage` object.

---

### `session`

**Type**: [`Session`](../api/session)

---

### `proxyInfo`

**Type**: [`ProxyInfo`](../typedefs/proxy-info)

An object with information about the proxy currently used by the crawler, configured via the [`ProxyConfiguration`](../api/proxy-configuration) class.

---

### `crawler`

**Type**: [`CheerioCrawler`](../api/cheerio-crawler)

---
diff --git a/website/versioned_docs/version-1.0.0/typedefs/HandleFailedRequestInput.md b/website/versioned_docs/version-1.0.0/typedefs/HandleFailedRequestInput.md
new file mode 100644
index 000000000000..cbefb9fbdc96
--- /dev/null
+++ b/website/versioned_docs/version-1.0.0/typedefs/HandleFailedRequestInput.md
@@ -0,0 +1,37 @@
---
id: version-1.0.0-handle-failed-request-input
title: HandleFailedRequestInput
original_id: handle-failed-request-input
---

## Properties

### `error`

**Type**: `Error`

The Error thrown by `handleRequestFunction`.

---

### `request`

**Type**: [`Request`](../api/request)

The original {Request} object.

---

### `session`

**Type**: [`Session`](../api/session)

---

### `proxyInfo`

**Type**: [`ProxyInfo`](../typedefs/proxy-info)

---
diff --git a/website/versioned_docs/version-1.0.0/typedefs/HandleRequestInputs.md b/website/versioned_docs/version-1.0.0/typedefs/HandleRequestInputs.md
new file mode 100644
index 000000000000..16e0baee2a65
--- /dev/null
+++ b/website/versioned_docs/version-1.0.0/typedefs/HandleRequestInputs.md
@@ -0,0 +1,32 @@
---
id: version-1.0.0-handle-request-inputs
title: HandleRequestInputs
original_id: handle-request-inputs
---

## Properties

### `request`

**Type**: [`Request`](../api/request)

The original {Request} object.

A reference to the underlying [`AutoscaledPool`](../api/autoscaled-pool) class that manages the concurrency of the crawler is available as `crawler.autoscaledPool`. Note that it is only initialized after calling the [`BasicCrawler.run()`](../api/basic-crawler#run) function. You can use it to change the concurrency settings on the fly, to pause the crawler by calling [`AutoscaledPool.pause()`](../api/autoscaled-pool#pause) or to abort it by calling [`AutoscaledPool.abort()`](../api/autoscaled-pool#abort).
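To show how these inputs are typically consumed, here is a minimal, hypothetical `handleRequestFunction` sketch. The download via `Apify.utils.requestAsBrowser()` is just one possible approach, since `BasicCrawler` leaves the processing of each request entirely up to you:

```javascript
const crawler = new Apify.BasicCrawler({
    requestList,
    handleRequestFunction: async ({ request, session }) => {
        // Download the page manually - BasicCrawler does not do it for you
        const { body } = await Apify.utils.requestAsBrowser({ url: request.url });
        console.log(`Downloaded ${request.url}: ${body.length} bytes`);
    },
});
```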
+ +--- + +### `session` + +**Type**: [`Session`](../api/session) + +--- + +### `crawler` + +**Type**: [`BasicCrawler`](../api/basic-crawler) + +--- diff --git a/website/versioned_docs/version-1.0.0/typedefs/PlaywrightCrawlerOptions.md b/website/versioned_docs/version-1.0.0/typedefs/PlaywrightCrawlerOptions.md new file mode 100644 index 000000000000..7eba8592c5a6 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/typedefs/PlaywrightCrawlerOptions.md @@ -0,0 +1,243 @@ +--- +id: version-1.0.0-playwright-crawler-options +title: PlaywrightCrawlerOptions +original_id: playwright-crawler-options +--- + + + +## Properties + +### `handlePageFunction` + +**Type**: `function` + +Function that is called to process each request. It is passed an object with the following fields: + +``` +{ + request: Request, + response: Response, + page: Page, + session: Session, + browserController: BrowserController, + proxyInfo: ProxyInfo, + crawler: PlaywrightCrawler, +} +``` + +`request` is an instance of the [`Request`](../api/request) object with details about the URL to open, HTTP method etc. `page` is an instance of the +`Playwright` [`Page`](https://playwright.dev/docs/api/class-page) `browserController` is an instance of the +[`BrowserController`](https://github.com/apify/browser-pool#browsercontroller), `response` is an instance of the `Playwright` +[`Response`](https://playwright.dev/docs/api/class-response), which is the main resource response as returned by `page.goto(request.url)`. The +function must return a promise, which is then awaited by the crawler. + +If the function throws an exception, the crawler will try to re-crawl the request later, up to `option.maxRequestRetries` times. If all the retries +fail, the crawler calls the function provided to the `handleFailedRequestFunction` parameter. To make this work, you should **always** let your +function throw exceptions rather than catch them. The exceptions are logged to the request using the +[`Request.pushErrorMessage()`](../api/request#pusherrormessage) function. + +--- + +### `navigationTimeoutSecs` + +**Type**: `number` = 60 + +Timeout in which page navigation needs to finish, in seconds. When `gotoFunction()` is used and thus the default function is overridden, this timeout +will not be used and needs to be configured in the new `gotoFunction()`. + +--- + +### `handleFailedRequestFunction` + +**Type**: [`HandleFailedRequest`](../typedefs/handle-failed-request) + +A function to handle requests that failed more than `option.maxRequestRetries` times. + +The function receives the following object as an argument: + +``` +{ + request: Request, + response: Response, + page: Page, + session: Session, + browserController: BrowserController, + proxyInfo: ProxyInfo, + crawler: PlaywrightCrawler, +} +``` + +Where the [`Request`](../api/request) instance corresponds to the failed request, and the `Error` instance represents the last error thrown during +processing of the request. + +--- + +### `preNavigationHooks` + +**Type**: `Array` + +Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies or browser properties before navigation. +The function accepts two parameters, `crawlingContext` and `gotoOptions`, which are passed to the `gotoFunction` the crawler calls to navigate. 
Example:

```
preNavigationHooks: [
    async (crawlingContext, gotoOptions) => {
        const { page } = crawlingContext;
        await page.evaluate((attr) => { window.foo = attr; }, 'bar');
    },
]
```

---

### `postNavigationHooks`

**Type**: `Array`

Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful. The function accepts `crawlingContext` as its only parameter. Example:

```
postNavigationHooks: [
    async (crawlingContext) => {
        const { page } = crawlingContext;
        if (hasCaptcha(page)) {
            await solveCaptcha(page);
        }
    },
]
```

---

### `launchContext`

**Type**: [`PlaywrightLaunchContext`](../typedefs/playwright-launch-context)

The same options as used by [`Apify.launchPlaywright()`](../api/apify#launchplaywright).

---

### `handlePageTimeoutSecs`

**Type**: `number` = 60

Timeout in which the function passed as `handlePageFunction` needs to finish, in seconds.

---

### `browserPoolOptions`

**Type**: `BrowserPoolOptions`

Custom options passed to the underlying [`BrowserPool`](https://github.com/apify/browser-pool#BrowserPool) constructor. You can tweak those to fine-tune browser management.

---

### `persistCookiesPerSession`

**Type**: `boolean` = true

Automatically saves cookies to the Session. Works only if the Session Pool is used.

---

### `proxyConfiguration`

**Type**: [`ProxyConfiguration`](../api/proxy-configuration)

If set, `PlaywrightCrawler` will be configured for all connections to use [Apify Proxy](https://my.apify.com/proxy) or your own Proxy URLs provided and rotated according to the configuration. For more information, see the [documentation](https://docs.apify.com/proxy).

---

### `requestList`

**Type**: [`RequestList`](../api/request-list)

Static list of URLs to be processed. Either `requestList` or `requestQueue` option must be provided (or both).

---

### `requestQueue`

**Type**: [`RequestQueue`](../api/request-queue)

Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites. Either `requestList` or `requestQueue` option must be provided (or both).

---

### `handleRequestTimeoutSecs`

**Type**: `number` = 60

Timeout in which the function passed as `handleRequestFunction` needs to finish, in seconds.

---

### `maxRequestRetries`

**Type**: `number` = 3

Indicates how many times the request is retried if [`PlaywrightCrawlerOptions.handlePageFunction`](../typedefs/playwright-crawler-options#handlepagefunction) fails.

---

### `maxRequestsPerCrawl`

**Type**: `number`

Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached. Always set this value in order to prevent infinite loops in misconfigured crawlers. Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.

---

### `autoscaledPoolOptions`

**Type**: [`AutoscaledPoolOptions`](../typedefs/autoscaled-pool-options)

Custom options passed to the underlying [`AutoscaledPool`](../api/autoscaled-pool) constructor. Note that the `runTaskFunction` and `isTaskReadyFunction` options are provided by `BasicCrawler` and cannot be overridden. However, you can provide a custom implementation of `isFinishedFunction`.

---

### `minConcurrency`

**Type**: `number` = 1

Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding [`AutoscaledPool`](../api/autoscaled-pool) option.
_WARNING:_ If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slow or crash. If you're not sure, just keep the default value and the concurrency will scale up automatically.

---

### `maxConcurrency`

**Type**: `number` = 1000

Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding [`AutoscaledPool`](../api/autoscaled-pool) option.

---

### `useSessionPool`

**Type**: `boolean` = true

Playwright crawler will initialize the [`SessionPool`](../api/session-pool) with the corresponding `sessionPoolOptions`. The session instance will then be available in the `handleRequestFunction`.

---

### `sessionPoolOptions`

**Type**: [`SessionPoolOptions`](../typedefs/session-pool-options)

The configuration options for [`SessionPool`](../api/session-pool) to use.

---
diff --git a/website/versioned_docs/version-1.0.0/typedefs/PlaywrightLaunchContext.md b/website/versioned_docs/version-1.0.0/typedefs/PlaywrightLaunchContext.md
new file mode 100644
index 000000000000..d0048e401963
--- /dev/null
+++ b/website/versioned_docs/version-1.0.0/typedefs/PlaywrightLaunchContext.md
@@ -0,0 +1,65 @@
---
id: version-1.0.0-playwright-launch-context
title: PlaywrightLaunchContext
original_id: playwright-launch-context
---

Apify extends the launch options of Playwright. You can use any of the Playwright compatible [`LaunchOptions`](https://playwright.dev/docs/api/class-browsertype#browsertypelaunchoptions) options by providing the `launchOptions` property.

**Example:**

```js
// launch a headless Chrome (not Chromium)
const launchContext = {
    // Apify helpers
    useChrome: true,
    proxyUrl: 'http://user:password@some.proxy.com',
    // Native Playwright options
    launchOptions: {
        headless: true,
        args: ['--some-flag'],
    },
};
```

## Properties

### `launchOptions`

**Type**: `object`

`browserType.launch` [options](https://playwright.dev/docs/api/class-browsertype?_highlight=launch#browsertypelaunchoptions)

---

### `proxyUrl`

**Type**: `string`

URL to a HTTP proxy server. It must define the port number, and it may also contain proxy username and password.

Example: `http://bob:pass123@proxy.example.com:1234`.

---

### `useChrome`

**Type**: `boolean` = false

If `true` and `executablePath` is not set, Playwright will launch the full Google Chrome browser available on the machine rather than the bundled Chromium. The path to the Chrome executable is taken from the `APIFY_CHROME_EXECUTABLE_PATH` environment variable if provided, or defaults to the typical Google Chrome executable location specific for the operating system. By default, this option is `false`.

---

### `launcher`

**Type**: `Object`

By default, `require("playwright").chromium` is used. If you want to use a different browser, you can pass it via this property, e.g.
+`require("playwright").firefox` + +--- diff --git a/website/versioned_docs/version-1.0.0/typedefs/PostResponseInputs.md b/website/versioned_docs/version-1.0.0/typedefs/PostResponseInputs.md new file mode 100644 index 000000000000..81f94652be32 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/typedefs/PostResponseInputs.md @@ -0,0 +1,47 @@ +--- +id: version-1.0.0-post-response-inputs +title: PostResponseInputs +original_id: post-response-inputs +--- + + + +## Properties + +### `response` + +**Type**: `IncomingMessage` | `Readable` + +stream + +--- + +### `request` + +**Type**: [`Request`](../api/request) + +Original instance fo the {Request} object. Must be modified in-place. + +--- + +### `session` + +**Type**: [`Session`](../api/session) + +The current session + +--- + +### `proxyInfo` + +**Type**: [`ProxyInfo`](../typedefs/proxy-info) + +An object with information about currently used proxy by the crawler and configured by the [`ProxyConfiguration`](../api/proxy-configuration) class. + +--- + +### `crawler` + +**Type**: [`CheerioCrawler`](../api/cheerio-crawler) + +--- diff --git a/website/versioned_docs/version-1.0.0/typedefs/PrepareRequestInputs.md b/website/versioned_docs/version-1.0.0/typedefs/PrepareRequestInputs.md new file mode 100644 index 000000000000..50132409dff6 --- /dev/null +++ b/website/versioned_docs/version-1.0.0/typedefs/PrepareRequestInputs.md @@ -0,0 +1,39 @@ +--- +id: version-1.0.0-prepare-request-inputs +title: PrepareRequestInputs +original_id: prepare-request-inputs +--- + + + +## Properties + +### `request` + +**Type**: [`Request`](../api/request) + +Original instance fo the {Request} object. Must be modified in-place. + +--- + +### `session` + +**Type**: [`Session`](../api/session) + +The current session + +--- + +### `proxyInfo` + +**Type**: [`ProxyInfo`](../typedefs/proxy-info) + +An object with information about currently used proxy by the crawler and configured by the [`ProxyConfiguration`](../api/proxy-configuration) class. + +--- + +### `crawler` + +**Type**: [`CheerioCrawler`](../api/cheerio-crawler) + +--- diff --git a/website/versioned_docs/version-1.0.0/typedefs/PuppeteerCrawlerOptions.md b/website/versioned_docs/version-1.0.0/typedefs/PuppeteerCrawlerOptions.md new file mode 100644 index 000000000000..7f1ab65cbc2c --- /dev/null +++ b/website/versioned_docs/version-1.0.0/typedefs/PuppeteerCrawlerOptions.md @@ -0,0 +1,244 @@ +--- +id: version-1.0.0-puppeteer-crawler-options +title: PuppeteerCrawlerOptions +original_id: puppeteer-crawler-options +--- + + + +## Properties + +### `handlePageFunction` + +**Type**: `PuppeteerHandlePage` + +Function that is called to process each request. It is passed an object with the following fields: + +``` +{ + request: Request, + response: Response, + page: Page, + session: Session, + browserController: BrowserController, + proxyInfo: ProxyInfo, + crawler: PuppeteerCrawler, +} +``` + +`request` is an instance of the [`Request`](../api/request) object with details about the URL to open, HTTP method etc. 
`page` is an instance of the +`Puppeteer` [`Page`](https://pptr.dev/#?product=Puppeteer&show=api-class-page) `browserPool` is an instance of the +[`BrowserPool`](https://github.com/apify/browser-pool#BrowserPool), `browserController` is an instance of the +[`BrowserController`](https://github.com/apify/browser-pool#browsercontroller), `response` is an instance of the `Puppeteer` +[`Response`](https://pptr.dev/#?product=Puppeteer&show=api-class-response), which is the main resource response as returned by +`page.goto(request.url)`. The function must return a promise, which is then awaited by the crawler. + +If the function throws an exception, the crawler will try to re-crawl the request later, up to `option.maxRequestRetries` times. If all the retries +fail, the crawler calls the function provided to the `handleFailedRequestFunction` parameter. To make this work, you should **always** let your +function throw exceptions rather than catch them. The exceptions are logged to the request using the +[`Request.pushErrorMessage()`](../api/request#pusherrormessage) function. + +--- + +### `navigationTimeoutSecs` + +**Type**: `number` = 60 + +Timeout in which page navigation needs to finish, in seconds. When `gotoFunction()` is used and thus the default function is overridden, this timeout +will not be used and needs to be configured in the new `gotoFunction()`. + +--- + +### `handleFailedRequestFunction` + +**Type**: [`HandleFailedRequest`](../typedefs/handle-failed-request) + +A function to handle requests that failed more than `option.maxRequestRetries` times. + +The function receives the following object as an argument: + +``` +{ + request: Request, + response: Response, + page: Page, + session: Session, + browserController: BrowserController, + proxyInfo: ProxyInfo, + crawler: PuppeteerCrawler, +} +``` + +Where the [`Request`](../api/request) instance corresponds to the failed request, and the `Error` instance represents the last error thrown during +processing of the request. + +--- + +### `launchContext` + +**Type**: `object` + +Options used by [`Apify.launchPuppeteer()`](../api/apify#launchpuppeteer) to start new Puppeteer instances. + +--- + +### `handlePageTimeoutSecs` + +**Type**: `number` = 60 + +Timeout in which the function passed as `handlePageFunction` needs to finish, in seconds. + +--- + +### `browserPoolOptions` + +**Type**: `BrowserPoolOptions` + +Custom options passed to the underlying [`BrowserPool`](https://github.com/apify/browser-pool#BrowserPool) constructor. You can tweak those to +fine-tune browser management. + +--- + +### `persistCookiesPerSession` + +**Type**: `boolean` = true + +Automatically saves cookies to Session. Works only if Session Pool is used. + +--- + +### `proxyConfiguration` + +**Type**: [`ProxyConfiguration`](../api/proxy-configuration) + +If set, `PuppeteerCrawler` will be configured for all connections to use [Apify Proxy](https://my.apify.com/proxy) or your own Proxy URLs provided and +rotated according to the configuration. For more information, see the [documentation](https://docs.apify.com/proxy). + +--- + +### `preNavigationHooks` + +**Type**: `Array` + +Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies or browser properties before navigation. +The function accepts two parameters, `crawlingContext` and `gotoOptions`, which are passed to the `gotoFunction` the crawler calls to navigate. 
Example:

```
preNavigationHooks: [
    async (crawlingContext, gotoOptions) => {
        const { page } = crawlingContext;
        await page.evaluate((attr) => { window.foo = attr; }, 'bar');
    },
]
```

---

### `postNavigationHooks`

**Type**: `Array`

Async functions that are sequentially evaluated after the navigation. Good for checking if the navigation was successful. The function accepts `crawlingContext` as its only parameter. Example:

```
postNavigationHooks: [
    async (crawlingContext) => {
        const { page } = crawlingContext;
        if (hasCaptcha(page)) {
            await solveCaptcha(page);
        }
    },
]
```

---

### `requestList`

**Type**: [`RequestList`](../api/request-list)

Static list of URLs to be processed. Either `requestList` or `requestQueue` option must be provided (or both).

---

### `requestQueue`

**Type**: [`RequestQueue`](../api/request-queue)

Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites. Either `requestList` or `requestQueue` option must be provided (or both).

---

### `handleRequestTimeoutSecs`

**Type**: `number` = 60

Timeout in which the function passed as `handleRequestFunction` needs to finish, in seconds.

---

### `maxRequestRetries`

**Type**: `number` = 3

Indicates how many times the request is retried if [`PuppeteerCrawlerOptions.handlePageFunction`](../typedefs/puppeteer-crawler-options#handlepagefunction) fails.

---

### `maxRequestsPerCrawl`

**Type**: `number`

Maximum number of pages that the crawler will open. The crawl will stop when this limit is reached. Always set this value in order to prevent infinite loops in misconfigured crawlers. Note that in cases of parallel crawling, the actual number of pages visited might be slightly higher than this value.

---

### `autoscaledPoolOptions`

**Type**: [`AutoscaledPoolOptions`](../typedefs/autoscaled-pool-options)

Custom options passed to the underlying [`AutoscaledPool`](../api/autoscaled-pool) constructor. Note that the `runTaskFunction` and `isTaskReadyFunction` options are provided by `BasicCrawler` and cannot be overridden. However, you can provide a custom implementation of `isFinishedFunction`.

---

### `minConcurrency`

**Type**: `number` = 1

Sets the minimum concurrency (parallelism) for the crawl. Shortcut to the corresponding [`AutoscaledPool`](../api/autoscaled-pool) option.

_WARNING:_ If you set this value too high with respect to the available system memory and CPU, your crawler will run extremely slow or crash. If you're not sure, just keep the default value and the concurrency will scale up automatically.

---

### `maxConcurrency`

**Type**: `number` = 1000

Sets the maximum concurrency (parallelism) for the crawl. Shortcut to the corresponding [`AutoscaledPool`](../api/autoscaled-pool) option.

---

### `useSessionPool`

**Type**: `boolean` = true

Puppeteer crawler will initialize the [`SessionPool`](../api/session-pool) with the corresponding `sessionPoolOptions`. The session instance will then be available in the `handleRequestFunction`.

---

### `sessionPoolOptions`

**Type**: [`SessionPoolOptions`](../typedefs/session-pool-options)

The configuration options for [`SessionPool`](../api/session-pool) to use.
---
diff --git a/website/versioned_docs/version-1.0.0/typedefs/PuppeteerLaunchContext.md b/website/versioned_docs/version-1.0.0/typedefs/PuppeteerLaunchContext.md
new file mode 100644
index 000000000000..9af7be4061e2
--- /dev/null
+++ b/website/versioned_docs/version-1.0.0/typedefs/PuppeteerLaunchContext.md
@@ -0,0 +1,93 @@
---
id: version-1.0.0-puppeteer-launch-context
title: PuppeteerLaunchContext
original_id: puppeteer-launch-context
---

Apify extends the launch options of Puppeteer. You can use any of the Puppeteer compatible [`LaunchOptions`](https://pptr.dev/#?product=Puppeteer&show=api-puppeteerlaunchoptions) options by providing the `launchOptions` property.

**Example:**

```js
// launch a headless Chrome (not Chromium)
const launchContext = {
    // Apify helpers
    useChrome: true,
    proxyUrl: 'http://user:password@some.proxy.com',
    // Native Puppeteer options
    launchOptions: {
        headless: true,
        args: ['--some-flag'],
    },
};
```

## Properties

### `launchOptions`

**Type**: `object`

`puppeteer.launch` [options](https://pptr.dev/#?product=Puppeteer&version=v5.5.0&show=api-puppeteerlaunchoptions)

---

### `proxyUrl`

**Type**: `string`

URL to a HTTP proxy server. It must define the port number, and it may also contain proxy username and password.

Example: `http://bob:pass123@proxy.example.com:1234`.

---

### `userAgent`

**Type**: `string`

The `User-Agent` HTTP header used by the browser. If not provided, the function sets `User-Agent` to a reasonable default to reduce the chance of detection of the crawler.

---

### `useChrome`

**Type**: `boolean` = false

If `true` and `executablePath` is not set, Puppeteer will launch the full Google Chrome browser available on the machine rather than the bundled Chromium. The path to the Chrome executable is taken from the `APIFY_CHROME_EXECUTABLE_PATH` environment variable if provided, or defaults to the typical Google Chrome executable location specific for the operating system. By default, this option is `false`.

---

### `launcher`

**Type**: `Object`

An already required module (`Object`). This enables usage of various Puppeteer wrappers such as `puppeteer-extra`.

Take caution, because it can cause all kinds of unexpected errors and weird behavior. Apify SDK is not tested with any other library besides `puppeteer` itself.

---

### `stealth`

**Type**: `boolean`

This setting hides most of the known properties that identify headless Chrome and makes it nearly undetectable. It is recommended to use it together with `useChrome` set to `true`.

---

### `stealthOptions`

**Type**: [`StealthOptions`](../typedefs/stealth-options)

Using this configuration, you can disable some of the hiding tricks.
For these settings to take effect `stealth` must be set to true + +--- diff --git a/website/versioned_sidebars/version-1.0.0-sidebars.json b/website/versioned_sidebars/version-1.0.0-sidebars.json new file mode 100644 index 000000000000..d1c5e94f033d --- /dev/null +++ b/website/versioned_sidebars/version-1.0.0-sidebars.json @@ -0,0 +1,184 @@ +{ + "version-1.0.0-docs": { + "Guide": [ + "version-1.0.0-guides/motivation", + "version-1.0.0-guides/quick-start", + "version-1.0.0-guides/apify-platform", + "version-1.0.0-guides/getting-started", + "version-1.0.0-guides/request-storage", + "version-1.0.0-guides/result-storage", + "version-1.0.0-guides/environment-variables", + "version-1.0.0-guides/proxy-management", + "version-1.0.0-guides/session-management", + "version-1.0.0-guides/type-script-actor", + "version-1.0.0-guides/docker-images" + ], + "Examples": [ + "version-1.0.0-examples/accept-user-input", + "version-1.0.0-examples/add-data-to-dataset", + "version-1.0.0-examples/basic-crawler", + "version-1.0.0-examples/call-actor", + "version-1.0.0-examples/capture-screenshot", + "version-1.0.0-examples/cheerio-crawler", + "version-1.0.0-examples/crawl-all-links", + "version-1.0.0-examples/crawl-multiple-urls", + "version-1.0.0-examples/crawl-relative-links", + "version-1.0.0-examples/crawl-single-url", + "version-1.0.0-examples/crawl-sitemap", + "version-1.0.0-examples/crawl-some-links", + "version-1.0.0-examples/forms", + "version-1.0.0-examples/handle-broken-links", + "version-1.0.0-examples/map-and-reduce", + "version-1.0.0-examples/puppeteer-crawler", + "version-1.0.0-examples/puppeteer-recursive-crawl", + "version-1.0.0-examples/puppeteer-sitemap", + "version-1.0.0-examples/puppeteer-with-proxy", + "version-1.0.0-examples/screenshots", + "version-1.0.0-examples/synchronous-run" + ], + "API Reference": [ + "version-1.0.0-api/apify", + { + "type": "subcategory", + "label": "Crawlers", + "ids": [ + "version-1.0.0-api/basic-crawler", + "version-1.0.0-api/cheerio-crawler", + "version-1.0.0-api/playwright-crawler", + "version-1.0.0-api/puppeteer-crawler" + ] + }, + { + "type": "subcategory", + "label": "Result Stores", + "ids": [ + "version-1.0.0-api/dataset", + "version-1.0.0-api/key-value-store" + ] + }, + { + "type": "subcategory", + "label": "Scaling", + "ids": [ + "version-1.0.0-api/autoscaled-pool", + "version-1.0.0-api/session", + "version-1.0.0-api/session-pool", + "version-1.0.0-api/proxy-configuration", + "version-1.0.0-api/snapshotter", + "version-1.0.0-api/system-status" + ] + }, + { + "type": "subcategory", + "label": "Sources", + "ids": [ + "version-1.0.0-api/request", + "version-1.0.0-api/request-list", + "version-1.0.0-api/request-queue", + "version-1.0.0-api/pseudo-url" + ] + }, + { + "type": "subcategory", + "label": "Utilities", + "ids": [ + "version-1.0.0-api/utils", + "version-1.0.0-api/log", + "version-1.0.0-api/playwright", + "version-1.0.0-api/puppeteer", + "version-1.0.0-api/social" + ] + } + ], + "Type Definitions": [ + { + "type": "subcategory", + "label": "Constructor Options", + "ids": [ + "version-1.0.0-typedefs/autoscaled-pool-options", + "version-1.0.0-typedefs/basic-crawler-options", + "version-1.0.0-typedefs/cheerio-crawler-options", + "version-1.0.0-typedefs/playwright-crawler-options", + "version-1.0.0-typedefs/playwright-launch-context", + "version-1.0.0-typedefs/puppeteer-crawler-options", + "version-1.0.0-typedefs/puppeteer-launch-context", + "version-1.0.0-typedefs/proxy-configuration-options", + "version-1.0.0-typedefs/proxy-info", + 
"version-1.0.0-typedefs/logger-options", + "version-1.0.0-typedefs/request-options", + "version-1.0.0-typedefs/request-list-options", + "version-1.0.0-typedefs/request-as-browser-options", + "version-1.0.0-typedefs/session-options", + "version-1.0.0-typedefs/session-pool-options", + "version-1.0.0-typedefs/snapshotter-options", + "version-1.0.0-typedefs/stealth-options", + "version-1.0.0-typedefs/system-status-options" + ] + }, + { + "type": "subcategory", + "label": "Functions - Crawlers", + "ids": [ + "version-1.0.0-typedefs/cheerio-handle-page", + "version-1.0.0-typedefs/cheerio-handle-page-inputs", + "version-1.0.0-typedefs/handle-failed-request", + "version-1.0.0-typedefs/handle-failed-request-input", + "version-1.0.0-typedefs/handle-request", + "version-1.0.0-typedefs/handle-request-inputs", + "version-1.0.0-typedefs/prepare-request", + "version-1.0.0-typedefs/prepare-request-inputs", + "version-1.0.0-typedefs/playwright-handle-page", + "version-1.0.0-typedefs/playwright-handle-page-inputs", + "version-1.0.0-typedefs/puppeteer-handle-page", + "version-1.0.0-typedefs/puppeteer-handle-page-inputs" + ] + }, + { + "type": "subcategory", + "label": "Functions - Dataset", + "ids": [ + "version-1.0.0-typedefs/dataset-consumer", + "version-1.0.0-typedefs/dataset-mapper", + "version-1.0.0-typedefs/dataset-reducer" + ] + }, + { + "type": "subcategory", + "label": "Functions - KeyValueStore", + "ids": [ + "version-1.0.0-typedefs/key-consumer" + ] + }, + { + "type": "subcategory", + "label": "Functions - Sessions", + "ids": [ + "version-1.0.0-typedefs/create-session" + ] + }, + { + "type": "subcategory", + "label": "Functions - Utilities", + "ids": [ + "version-1.0.0-typedefs/request-transform" + ] + }, + { + "type": "subcategory", + "label": "Return Types", + "ids": [ + "version-1.0.0-typedefs/actor-run", + "version-1.0.0-api/apify-call-error", + "version-1.0.0-typedefs/apify-env", + "version-1.0.0-typedefs/dataset-content", + "version-1.0.0-typedefs/memory-info", + "version-1.0.0-typedefs/queue-operation-info", + "version-1.0.0-typedefs/request-list-state", + "version-1.0.0-typedefs/session-state", + "version-1.0.0-typedefs/social-handles", + "version-1.0.0-typedefs/system-info" + ] + } + ] + } +} diff --git a/website/versions.json b/website/versions.json index 47f36f88e75c..4fcd1189d98c 100644 --- a/website/versions.json +++ b/website/versions.json @@ -1,3 +1,4 @@ [ + "1.0.0", "0.22.4" ]