Act

petr_cermak/crawler-results-dedup-xls

  • Builds
  • latest 0.0.27 / 2017-10-09
  • Created 2017-09-03
  • Last modified 2017-10-06
  • grade 2

Description

This act downloads crawler execution results, deduplicates them, stores them as an XLSX file and sends a link to that file to the specified e-mail address. It can be used from a crawler finish webhook; the data attribute must be a stringified JSON. Example input: { "_id": "EXECUTION_ID", "data": "{ \"email\": \"YOUR_E_MAIL_ADDRESS\", \"title\": \"E_MAIL_TITLE\" }" }


API

To run the act, send an HTTP POST request to:

https://api.apify.com/v2/acts/petr_cermak~crawler-results-dedup-xls/runs?token=<YOUR_API_TOKEN>

The POST payload will be passed as input for the act. For more information, read the docs.


Example input

Content type: application/json

{ "hello": 123 }

Source code

Based on the apify/actor-node-basic Docker image (see docs).

const XLSX = require('xlsx');
const Apify = require('apify');
const _ = require('underscore');
const Promise = require('bluebird');

Apify.main(async () => {
    Apify.setPromisesDependency(Promise);
    const rowSplit = process.env.MULTIROW ? parseInt(process.env.MULTIROW) : 10;
    
    // get Act input and validate it
    const input = await Apify.getValue('INPUT');
    const data = input.data ? (typeof input.data === 'string' ? JSON.parse(input.data) : input.data) : {};
    if(!input._id){
        return console.log('missing "_id" attribute in INPUT');
    }
    if(!input.data){
        console.log('WARNING: missing "data" attribute in INPUT, row JSON will be used as a deduplication key');
    }
    
    /**
     * Returns the deduplication key for a single result row.
     *
     * When the parsed input data names a `compareKey` and the row has a
     * defined value under that attribute, that value is the key. Otherwise
     * the JSON serialisation of the whole row is used.
     *
     * The parsed `data` object is consulted rather than the raw `input.data`,
     * because `input.data` may be a JSON string, on which `.compareKey` is
     * always undefined.
     *
     * @param {*} value - one result row
     * @returns {*} the row's `compareKey` value, or its JSON string
     */
    function getKey(value){
        const compareKey = data && data.compareKey;
        if(compareKey && value != null && value[compareKey] !== undefined){
            return value[compareKey];
        }
        return JSON.stringify(value);
    }
    
    // Deduplicated rows indexed by their key. The object has no prototype so
    // that keys such as "constructor" or "toString" are not mistaken for
    // entries that already exist.
    const hashMap = Object.create(null);

    /**
     * Stores a result row under its deduplication key unless a row with the
     * same key has been stored before (the first occurrence wins).
     *
     * @param {*} result - one result row
     */
    function processResult(result){
        const key = getKey(result);
        if(!(key in hashMap)){hashMap[key] = result;}
    }
    /**
     * Feeds every row of one page of results into processResult.
     *
     * Each item's `pageFunctionResult` is either a single row or an array of
     * rows. Items without a truthy `pageFunctionResult` are skipped, and an
     * empty array contributes no rows (it is not stored as a row itself).
     *
     * @param {{items: Array}} results - one page of results
     */
    function processResults(results){
        const items = (results && Array.isArray(results.items)) ? results.items : [];
        for(const item of items){
            const pfr = item && item.pageFunctionResult;
            if(!pfr){continue;}
            if(Array.isArray(pfr)){
                pfr.forEach((row) => processResult(row));
            }
            else{processResult(pfr);}
        }
    }
    
    // set global executionId
    Apify.client.setOptions({executionId: input._id});

    // loop through pages of results and deduplicate them
    const limit = 200;
    // total is only known after the first response, hence the null sentinel
    let total = null, offset = 0;
    // offset already points at the NEXT page when the condition is evaluated,
    // so it is compared to total directly; comparing offset + limit would
    // skip the final page.
    while(total === null || offset < total){
        const results = await Apify.client.crawlers.getExecutionResults({limit: limit, offset: offset});
        processResults(results);
        total = results.total;
        offset += limit;
    }
    //await Apify.setValue('OUTPUT', Object.values(hashMap));
    //const storeId = Apify.getEnv('defaultKeyValueStoreId');
    
    /**
     * Serialises an array of row objects into an XLSX workbook held in memory.
     *
     * The workbook has a single sheet named "results", built with
     * XLSX.utils.json_to_sheet.
     *
     * @param {Array<Object>} array - rows to write
     * @returns {Buffer} the bytes of the workbook
     */
    function toXlsxBuffer(array){
        const sheetName = 'results';
        const sheet = XLSX.utils.json_to_sheet(array);
        const workbook = {
            SheetNames: [sheetName],
            Sheets: {[sheetName]: sheet},
        };
        const binaryString = XLSX.write(workbook, {
            bookType: 'xlsx',
            bookSST: false,
            type: 'binary',
        });

        // The 'binary' (latin1) encoding keeps the low byte of every
        // character, i.e. one output byte per character of the string.
        return Buffer.from(binaryString, 'binary');
    }
    
    /* the saveAs call downloads a file on the local machine */
    //saveAs(new Blob([s2ab(wbout)],{type:"application/octet-stream"}), "test.xlsx");
    
    // Convert the deduplicated rows to XLSX and store the file in the
    // default key-value store under the key "results.xlsx".
    const output = Object.values(hashMap);
    const buffer = toXlsxBuffer(output);
    const type = "application/octet-stream";
    await Apify.setValue('results.xlsx', buffer, {contentType: type});

    // Build the download link for the stored record and log it.
    const storeId = (await Apify.getEnv()).defaultKeyValueStoreId;
    const url = "https://api.apifier.com/v2/key-value-stores/" + storeId + "/records/results.xlsx?rawBody=1&disableRedirect=1";
    console.log('XLSX URL: ' + url);

    // A missing "data" attribute is only a warning earlier in the run, so a
    // recipient may be absent here; in that case the file is still stored and
    // its URL logged, but no e-mail is attempted.
    if(!data.email){
        console.log('WARNING: missing "email" in INPUT data, e-mail will not be sent');
    }
    else{
        await Apify.call('apify/send-mail', {
            to: data.email,
            subject: data.title,
            text: url
        });
    }

});