Puppeteer sitemap
This example demonstrates how to use PuppeteerCrawler to crawl a list of web pages specified in a sitemap. The
crawler extracts the URL, page title, and HTML of each page and stores them as a record in the default dataset. When run locally, the results are
stored as JSON files in ./apify_storage/datasets/default.
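Each record is saved there as a separate JSON file. As a minimal sketch (assuming the default local storage layout and that the crawl below has already finished), the stored records could be read back like this:

const fs = require('fs');
const path = require('path');

// Read every record the crawler stored in the default local dataset.
const datasetDir = path.join('apify_storage', 'datasets', 'default');
for (const file of fs.readdirSync(datasetDir)) {
    const record = JSON.parse(fs.readFileSync(path.join(datasetDir, file), 'utf8'));
    console.log(`${record.url}: ${record.title}`);
}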
To run this example on the Apify Platform, select the
Node.js 12 + Chrome on Debian (apify/actor-node-chrome) base image on the Source tab when configuring the actor.
const Apify = require('apify');
Apify.main(async () => {
    // Create a RequestList with the URLs downloaded from the sitemap.
    const requestList = new Apify.RequestList({
        sources: [{ requestsFromUrl: 'https://apify.com/sitemaps.xml' }],
    });
    // The list must be initialized before the crawler can use it.
    await requestList.initialize();
    // Create a crawler that visits each URL from the list with Puppeteer.
    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        // Stop after 10 pages; remove this limit to process the entire sitemap.
        maxRequestsPerCrawl: 10,
        // This function is called for every page the crawler opens.
        handlePageFunction: async ({ page, request }) => {
            console.log(`Processing ${request.url}...`);
            // Store the page's URL, title, and HTML as a record in the default dataset.
            await Apify.pushData({
                url: request.url,
                title: await page.title(),
                html: await page.content(),
            });
        },
    });
    // Run the crawler and wait for it to finish.
    await crawler.run();
    console.log('Done.');
});
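If your version of the Apify SDK includes the Apify.openRequestList() helper, the explicit construction and initialize() call above can be collapsed into a single call that also persists the list's state, letting an interrupted crawl resume where it left off. A sketch of the same crawler using it (the list name 'sitemap-urls' is arbitrary):

const Apify = require('apify');

Apify.main(async () => {
    // openRequestList() creates and initializes the list in one step and
    // persists its state under the given name between runs.
    const requestList = await Apify.openRequestList('sitemap-urls', [
        { requestsFromUrl: 'https://apify.com/sitemaps.xml' },
    ]);

    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        maxRequestsPerCrawl: 10,
        handlePageFunction: async ({ page, request }) => {
            await Apify.pushData({ url: request.url, title: await page.title() });
        },
    });

    await crawler.run();
});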