Actor

mnmkng/create-har-from-url-list

  • Builds
  • latest 0.0.15 / 2018-07-14
  • Created 2018-07-13
  • Last modified 2018-07-14
  • grade 2

Description

This Actor takes a country code and a list of URLs as input (see the RequestList docs or the example input below) and returns a Dataset with one record per URL processed. Each record contains basic request metadata and a link to the KeyValueStore record where a HAR archive of that URL's crawl is saved. The requests are routed through proxies located in the given country.


API

To run the actor, send an HTTP POST request to:

https://api.apify.com/v2/acts/mnmkng~create-har-from-url-list/runs?token=<YOUR_API_TOKEN>

The POST payload will be passed as input for the actor. For more information, read the docs.


Example input

Content type: application/json; charset=utf-8

{
  "country": "US",
  "sources": [
    { "url": "http://makdevelopers.com/dr/" },
    { "url": "http://partiwa-adiputra.com/invoice/nD/" },
    { "url": "https://goldstoneco.net/images/doc/auth/" },
    { "requestsFromUrl": "https://apifier-user-uploaded-files-prod.s3.amazonaws.com/46FWvpepkbTtGvDxL-url_list.csv" }
  ]
}

Source code

Based on the apify/actor-node-chrome Docker image (see docs).

const Apify = require('apify');
const PuppeteerHar = require('puppeteer-har');
const crypto = require('crypto');

// Proxy group handed to the crawler's `apifyProxyGroups`, keyed by the
// country code supplied in the actor INPUT.
const PROXY_MAP = {
    UK: 'BUYPROXIES63811',
    US: 'BUYPROXIES68277',
    CZ: 'di5BsrawgufWetctm',
};

// Actor entry point: validates INPUT, crawls every source URL through a proxy of the
// requested country, records a HAR per URL, stores it in the default key-value store
// and pushes one dataset record per URL that links to the stored HAR.
Apify.main(async () => {

    // Apify.getValue() yields null when the INPUT record is absent; default to an empty
    // object so the checks below report the missing field instead of a destructuring TypeError.
    const { country, sources } = (await Apify.getValue('INPUT')) || {};
    if (!sources || !Array.isArray(sources)) throw new Error("Input is missing sources declaration.");
    if (!country) throw new Error("Input is missing proxy country declaration.");
    // Own-property check so inherited keys such as "constructor" are not accepted as countries.
    const proxyGroup = Object.prototype.hasOwnProperty.call(PROXY_MAP, country) ? PROXY_MAP[country] : undefined;
    if (!proxyGroup) throw new Error(`Proxies for the given country: ${country} are not available.`);

    const requestList = new Apify.RequestList({sources});
    await requestList.initialize(); // Load requests.

    // HAR recorder of the current attempt, keyed by the request object.
    const requestHarMap = new WeakMap();
    // Responses captured during the current attempt, keyed by request.uniqueKey.
    const responseMap = new Map();

    // Stops the HAR recording of `request`, enriches it with the captured response bodies,
    // stores it in the key-value store and pushes the dataset record linking to it.
    // Shared by the success and the failure handler.
    const saveHar = async (request) => {
        try {
            const har = requestHarMap.get(request);
            if (!har) {
                // gotoFunction never ran for this request (the failure happened before a
                // page was available), so there is no recording to stop or store.
                console.log(`No HAR was recorded for ${request.url}`);
                await Apify.pushData({ request, har: null });
                return;
            }
            const result = await har.stop();
            await assignContentToRequests(request, responseMap, result);
            // Record key: first 17 characters of the base64 SHA-256 of the unique key,
            // with "+" and "/" replaced by "x".
            const id = crypto.createHash("sha256")
                .update(request.uniqueKey)
                .digest("base64")
                .replace(/[+/]/g, "x")
                .slice(0, 17);
            await Apify.setValue(id, result);
            await Apify.pushData({
                request,
                har: getKeyValueStoreUrl(id),
            });
        } finally {
            // Release the captured responses even when saving failed.
            responseMap.delete(request.uniqueKey);
        }
    };

    const crawler = new Apify.PuppeteerCrawler({
        requestList,
        launchPuppeteerOptions: {
            useApifyProxy: true,
            apifyProxyGroups: [proxyGroup],
        },
        gotoFunction: async ({request, page}) => {
            const har = new PuppeteerHar(page);
            requestHarMap.set(request, har);
            // Start every attempt with a fresh list so responses captured by an earlier,
            // failed attempt of the same request are not mixed into this attempt's HAR.
            const responses = [];
            responseMap.set(request.uniqueKey, responses);
            console.log(`Initiating HAR for ${request.url}`);
            await har.start();
            page.on("response", (res) => {
                responses.push(res);
            });
            // NOTE(review): presumably resets the tab before the recorded navigation - confirm.
            await page.goto("about:blank");
            return page.goto(request.url, {waitUntil: ["load", "networkidle2"]});
        },
        handlePageFunction: async ({request}) => {
            console.log(`Finalizing HAR for ${request.url}`);
            await saveHar(request);
            console.log(`HAR of ${request.url} saved successfully.`);
        },
        handleFailedRequestFunction: async ({request}) => {
            console.log(`Closing HAR after request to: ${request.url} failed.`);
            await saveHar(request);
            console.log(`Saving error data for ${request.url}`);
        },
    });

    await crawler.run();
});

/**
 * Builds the Apify API URL of a record in the run's default key-value store.
 *
 * The store id is read from the APIFY_DEFAULT_KEY_VALUE_STORE_ID environment
 * variable at call time.
 *
 * @param {string} recordKey - Key of the record inside the store.
 * @returns {string} URL of the record.
 */
function getKeyValueStoreUrl(recordKey) {
    const { APIFY_DEFAULT_KEY_VALUE_STORE_ID: storeId } = process.env;
    const storeUrl = `https://api.apify.com/v2/key-value-stores/${storeId}`;
    return `${storeUrl}/records/${recordKey}`;
}

/**
 * Copies the captured response bodies into the matching entries of a HAR log.
 *
 * For every response recorded for `request`, the HAR entry with the same request URL
 * is looked up and its `response.content.text` is filled with the body. Bodies whose
 * content type mentions "text" or "javascript" are stored as UTF-8 text; everything
 * else is stored base64-encoded and flagged with `content.encoding = "base64"`.
 * The HAR object is mutated in place.
 *
 * @param {{uniqueKey: string}} request - Request whose captured responses are attached.
 * @param {Map<string, Array>} responseMap - Captured responses keyed by `request.uniqueKey`.
 * @param {Object} result - HAR object; its `log.entries` are enriched.
 * @returns {Promise<void>}
 */
async function assignContentToRequests(request, responseMap, result) {
    // No list exists when the navigation failed before any response arrived.
    const responses = responseMap.get(request.uniqueKey) || [];
    const entries = (result && result.log && result.log.entries) || [];
    for (const res of responses) {
        const requestUrl = res.request().url();
        const harEntry = findHarEntryByUrl(entries, requestUrl);
        // A response without a HAR counterpart has nowhere to put its body.
        if (!harEntry || !harEntry.response || !harEntry.response.content) continue;
        try {
            const body = await res.buffer();
            const cType = res.headers()["content-type"];
            const encoding = (cType && cType.match(/(text|javascript)/)) ? "utf8" : "base64";
            harEntry.response.content.text = body.toString(encoding);
            if (encoding === "base64") harEntry.response.content.encoding = "base64";
        } catch (e) {
            // Best effort: a body that cannot be retrieved must not abort the whole HAR,
            // so the entry is simply left without content.
        }
    }
}

// Finds the HAR entry for `url`: an exact URL match wins, otherwise the first entry whose
// URL contains the first 100 characters of `url`. Comparison is literal, never a regex.
function findHarEntryByUrl(entries, url) {
    const exact = entries.find(entry => entry.request.url === url);
    if (exact) return exact;
    const prefix = url.slice(0, 100);
    return entries.find(entry => entry.request.url.includes(prefix));
}