Actor

jancurn/analyse-pages

  • Latest build: 0.0.12 (2018-04-17)
  • Created: 2017-08-31
  • Last modified: 2018-09-18

Description

See more at https://github.com/jancurn/act-analyse-pages


API

To run the actor, send an HTTP POST request to:

https://api.apify.com/v2/acts/jancurn~analyse-pages/runs?token=<YOUR_API_TOKEN>

The POST payload will be passed as input for the actor. For more information, read the docs.


Example input

Content type: application/json

{
  "urls": [
    "https://www.example.com",
    "https://www.apifier.com",
    "https://www.apify.com",
    "https://www.cnn.com",
    "https://www.idnes.cz",
    "https://www.seznam.cz",
    "https://www.yahoo.com",
    "https://www.microsoft.com"
  ],
  "concurrency": 2,
  "storePagesInterval": 20,
  "screenshotWidth": 1200,
  "screenshotHeight": 900
}
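
For example, a run can be started from Node.js roughly as follows. This is a minimal sketch, assuming Node 18+ with the global fetch API; the token is a placeholder and the response handling is only illustrative:

const startRun = async () => {
    // Input for the actor, matching the example input above
    const input = {
        urls: ['https://www.example.com', 'https://www.apify.com'],
        concurrency: 2,
    };

    // POST the input as JSON to the run endpoint
    const response = await fetch(
        'https://api.apify.com/v2/acts/jancurn~analyse-pages/runs?token=<YOUR_API_TOKEN>',
        {
            method: 'POST',
            headers: { 'Content-Type': 'application/json' },
            body: JSON.stringify(input),
        },
    );
    console.log('Run started:', await response.json());
};

startRun().catch(console.error);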

Source code

Based on the apify/actor-node-chrome Docker image (see docs).

// NOTE: This code is quite outdated, using the Apify SDK you can achieve
// the same results in about 10x shorter code. I hope to fix this example soon.

const URL = require('url');
const util = require('util');
const puppeteer = require('puppeteer');
const _ = require('underscore');
const Apify = require('apify');
const leftPad = require('left-pad');
const typeCheck = require('type-check').typeCheck;
const request = require('request');
const async = require('async');

console.log(`Starting (${JSON.stringify(process.versions)})`);

// Definition of the input
const INPUT_TYPE = `{
    urls: [String],
    urlToTextFileWithUrls: Maybe String,
    concurrency: Maybe Number,
    storePagesInterval: Maybe Number,
    screenshotWidth: Maybe Number,
    screenshotHeight: Maybe Number
}`;

const DEFAULT_STATE = {
    storeCount: 0,
    pageCount: 0,
};
const DEFAULT_STORE_PAGES_INTERVAL = 20;
const DEFAULT_CONCURRENCY = 1;


// Input object
let input;

// Objects holding the state of the crawler, which is stored under the 'STATE' key in the KV store
let state;

// Array of Result records that were finished but not yet stored to KV store
const results = [];

// Date when state and data were last stored
let lastStoredAt = new Date();

// Indicates that results and state are currently being stored
let isStoring = false;

// Native properties of the 'window' object on an about:blank page.
// It is an object where keys are property names and values are true, to allow quick lookups.
let nativeWindowsProperties = null;


const requestPromised = async (opts) => {
    return new Promise((resolve, reject) => {
        request(opts, (error, response, body) => {
            if (error) return reject(error);
            resolve({ body: body, response: response });
        });
    });
};

// Downloads a list of URLs from an external text file and adds valid URLs to input.urls
const addUrlsFromTextFile = async (input) => {
    console.log(`Fetching text file from ${input.urlToTextFileWithUrls}`);
    const request = await requestPromised({ url: input.urlToTextFileWithUrls });
    const textFile = request.body;

    console.log(`Processing URLs from text file (length: ${textFile.length})`);
    let count = 0;
    textFile.split('\n').forEach((url) => {
        url = url.trim();
        const parsed = URL.parse(url);
        if (parsed.host) {
            count++;
            input.urls.push(url);
        }
    });

    console.log(`Added ${count} URLs from the text file`);
};


// Stores the buffered page results and the current state to the key-value store,
// either when enough results have accumulated or when the store is forced.
const maybeStoreResults = async (force) => {
    // Is there anything to store?
    if (results.length === 0) return;

    // Have we buffered enough results since the last store?
    if (!force && results.length < input.storePagesInterval) return;

    // Skip if another invocation is already storing data
    if (isStoring) return;
    isStoring = true;

    try {
        // Store buffered pages under the key RESULTS-XXX
        // Careful here: more records might be added to the results array while awaiting setValue()
        const recordsToStore = _.clone(results);
        const key = `RESULTS-${leftPad(state.storeCount+1, 9, '0')}`;

        console.log(`Storing ${recordsToStore.length} page records to ${key} (total pages crawled: ${state.pageCount + recordsToStore.length})`);
        await Apify.setValue(key, recordsToStore);

        results.splice(0, recordsToStore.length);

        // Update and save state (but only after saving pages!)
        state.pageCount += recordsToStore.length;
        state.storeCount++;
        await Apify.setValue('STATE', state);

        lastStoredAt = new Date();
    } catch(e) {
        // This is a fatal error, immediately stop the act
        if (e.message && e.message.indexOf('The POST payload is too large') >= 0) {
            console.log('FATAL ERROR');
            console.log(e.stack || e);
            process.exit(1);
        }
        if (force) throw e;
        console.log(`ERROR: Cannot store data (will be ignored): ${e.stack || e}`);
    } finally {
        isStoring = false;
    }
};


// Worker function: loads one URL from the list and analyses the page content
const analysePage = async ({ browser, url }) => {
    const result = {
        url,
        // Fields are listed up front to fix their order in the output
        errorInfo: null,
        loadedUrl: null,
        requestedAt: null,
        loadedAt: null,
        analysedAt: null,
        responseStatus: null,
        responseHeaders: null,
        responseTotalBytes: 0,
        iframeCount: null,
        scriptCount: null,
        windowProperties: null,
        requests: [],
        html: null,
        text: null,
        screenshotPngBase64: null,
    };

    let page = null;

    try {
        page = await browser.newPage();

        page.on('error', (err) => {
            console.log(`Web page crashed (${url}): ${err}`);
            page.close().catch((err2) => console.log(`Error closing page 1 (${url}): ${err2}`));
        });

        // On first run, get list of native window properties from the browser
        if (!nativeWindowsProperties) {
            const keys = await page.evaluate(() => {
                return Object.keys(window);
            });
            // Another concurrent worker might have done the same in the meantime
            if (!nativeWindowsProperties) {
                console.log(`Found ${keys.length} native 'window' object properties`);
                nativeWindowsProperties = {};
                _.each(keys, (key) => {
                    nativeWindowsProperties[key] = true;
                });
            }
        }

        if (input.screenshotWidth > 0 && input.screenshotHeight > 0) {
            await page.setViewport({
                width: input.screenshotWidth,
                height: input.screenshotHeight,
            });
        }

        // Key is requestId, value is record in result.requests
        const requestIdToRecord = {};

        // ID of the main page request
        let initialRequestId = null;


        const getOrCreateRequestRecord = (requestId) => {
            let rec = requestIdToRecord[requestId];
            if (!rec) {
                rec = requestIdToRecord[requestId] = {
                    url: null,
                    method: null,
                    responseStatus: null,
                    responseHeaders: null,
                    responseBytes: 0,
                };
                result.requests.push(rec);
            }
            return rec;
        };

        page.on('request', (request) => {
            if (!initialRequestId) initialRequestId = request._requestId;
            const rec = getOrCreateRequestRecord(request._requestId);
            rec.url = request.url;
            rec.method = request.method;
        });

        // WORKAROUND: Puppeteer's Network.loadingFinished handler doesn't store encodedDataLength field
        page._networkManager._client.on('Network.dataReceived', (params) => {
            const rec = getOrCreateRequestRecord(params.requestId);
            if (rec) rec.responseBytes += params.encodedDataLength || 0;
            result.responseTotalBytes += params.encodedDataLength || 0;
        });

        page.on('response', (response) => {
            const request = response.request();
            const rec = getOrCreateRequestRecord(request._requestId);
            if (rec) {
                rec.responseStatus = response.status;
                rec.responseHeaders = response.headers;
            }
        });

        console.log(`Loading page: ${url}`);
        result.requestedAt = new Date();
        await page.goto(url);

        console.log(`Page loaded: ${url}`);

        const rec = requestIdToRecord[initialRequestId];
        if (rec) {
            result.responseStatus = rec.responseStatus;
            result.responseHeaders = rec.responseHeaders;
        }

        result.loadedAt = new Date();
        result.loadedUrl = await page.url();

        const evalData = await page.evaluate(() => {
            return {
                html: document.documentElement.innerHTML,
                text: document.documentElement.innerText,
                iframeCount: document.querySelectorAll('iframe').length,
                scriptCount: document.querySelectorAll('script').length,
                allWindowProperties: Object.keys(window),
            }
        });
        Object.assign(result, _.pick(evalData, 'html', 'text', 'iframeCount', 'scriptCount'));

        // Extract list of non-native window properties
        result.windowProperties = [];
        _.each(evalData.allWindowProperties, (propName) => {
            if (!nativeWindowsProperties[propName]) result.windowProperties.push(propName);
        });

        if (input.screenshotWidth > 0 && input.screenshotHeight > 0) {
            console.log(`Taking screenshot: ${result.url}`);
            const buffer = await page.screenshot({ type: 'png', fullPage: true });
            result.screenshotPngBase64 = buffer.toString('base64');
        }

        result.analysedAt = new Date();
    } catch (e) {
        console.log(`Loading of web page failed (${url}): ${e}`);
        result.errorInfo = e.stack || e.message || String(e);
    } finally {
        if (page) {
            page.close().catch((e) => console.log(`Error closing page 2 (${url}): ${e}`));
        }
    }

    console.log(`Page finished: ${result.url}`);

    results.push(result);
    await maybeStoreResults();
};



Apify.main(async () => {
    // Fetch and check the input
    input = await Apify.getValue('INPUT');
    if (!typeCheck(INPUT_TYPE, input)) {
        console.error('Expected input:');
        console.error(INPUT_TYPE);
        console.error('Received input:');
        console.error(util.inspect(input));
        throw new Error('Received invalid input');
    }

    if (!(input.storePagesInterval > 0)) input.storePagesInterval = DEFAULT_STORE_PAGES_INTERVAL;
    if (!(input.concurrency > 0)) input.concurrency = DEFAULT_CONCURRENCY;

    // Prepare list of URLs
    input.urls = input.urls || [];
    if (input.urlToTextFileWithUrls) await addUrlsFromTextFile(input);

    // Get the state of crawling (the act might have been restarted)
    state = await Apify.getValue('STATE');
    if (state) console.log(`Reusing persisted state: ${JSON.stringify(state)}`);
    else state = DEFAULT_STATE;

    const browser = await puppeteer.launch({
        args: ['--no-sandbox'],
        headless: !!process.env.APIFY_HEADLESS,
        //dumpio: true,
        // slowMo: 1000
    });

    // Load pages in asynchronous queue with a specified concurrency
    const queue = async.queue(analysePage, input.concurrency);

    // Push all not-yet-crawled URLs to the queue
    if (state.pageCount > 0) {
        console.log(`Skipping first ${state.pageCount} pages that were already crawled`);
        input.urls.splice(0, state.pageCount);
    }
    input.urls.forEach((url) => {
        queue.push({ browser, url }, (err) => {
            if (err) console.log(`WARNING: Unhandled exception from worker function: ${err.stack || err}`);
        });
    });

    // Wait for the queue to finish all tasks
    if (input.urls.length > 0) {
        await new Promise((resolve) => {
            queue.drain = resolve;
        });
    }

    // Force store results
    await maybeStoreResults(true);

    console.log('Done');
});
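
The analysed pages are saved in batches to the run's default key-value store under keys such as RESULTS-000000001 (see maybeStoreResults above). Below is a minimal sketch of downloading one such batch over the Apify API; the store ID and token are placeholders, and the endpoint shape is assumed from the current key-value store API:

const fetchResultsBatch = async (storeId, key) => {
    // Key-value store records are exposed by the Apify API (assumed endpoint shape)
    const url = `https://api.apify.com/v2/key-value-stores/${storeId}/records/${key}?token=<YOUR_API_TOKEN>`;
    const response = await fetch(url);
    if (!response.ok) throw new Error(`Request failed with status ${response.status}`);
    return response.json();
};

fetchResultsBatch('<STORE_ID>', 'RESULTS-000000001')
    .then((pages) => console.log(`Fetched ${pages.length} page records`))
    .catch(console.error);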