Actor

juansgaitan/pdf-scraping

  • Builds
  • latest 0.0.70 / 2017-11-02
  • Created 2017-08-16
  • Last modified 2018-09-18
  • grade 3

Description

Scrape PDF text using pdfExtract.


API

To run the actor, send an HTTP POST request to:

https://api.apify.com/v2/acts/juansgaitan~pdf-scraping/runs?token=<YOUR_API_TOKEN>

The POST payload will be passed as input for the actor. For more information, read the docs.


Example input

Content type: application/json

{ "url": "http://www.ripuc.org/utilityinfo/electric/NPP_List.pdf"}

Source code

Based on the apify/actor-node-puppeteer Docker image (see docs).

const fs = require('fs');
const path = require('path');
const Apify = require('apify');
const Promise = require("bluebird");
const { typeCheck } = require('type-check');
const pdfExtract = require('pdf-text-extract');
const requestPromise = require('request-promise');

// Shape the actor input must satisfy, written in `type-check` syntax.
const INPUT_TYPE = `{
  url: String,
}`;

// Short aliases for the console methods used for logging throughout the file.
const log = console.log;
const dir = console.dir;
const error = console.error;

/**
 * Parses the text of the first extracted PDF page into a table-like object.
 *
 * The expected layout is specific to the target PDF and will vary with the
 * formatting of each document: the first non-empty line holds the title, the
 * second non-empty line holds the column headers, and every following
 * non-empty line is one company. Columns are separated by runs of four or
 * more whitespace characters.
 *
 * NOTE: only `arr[0]` is read; any further pages are ignored.
 *
 * @param {string[]} arr - Page texts; only the first entry is parsed.
 * @returns {{Title: ?string, 'Number of Registered Companies': number, Companies: Object[]}}
 *   The title (or `null` when the page has no rows), the number of parsed
 *   company rows, and one object per company keyed by the column headers.
 *   Cells missing from a short row are left `undefined`.
 */
function crawlResult(arr) {
  // Guard against a missing or empty page list instead of crashing on `arr[0]`.
  const firstPage = Array.isArray(arr) && typeof arr[0] === 'string' ? arr[0] : '';

  const rows = firstPage
    .split(/\n/g)
    .map(line => line
      .split(/\s{4,}/g)
      // Collapse inner whitespace and trim, so that whitespace-only cells
      // are removed by the filter and values carry no stray padding.
      .map(cell => cell.replace(/\s+/g, ' ').trim())
      .filter(Boolean))
    .filter(row => row.length > 0);

  const titleRow = rows[0];
  const headers = rows[1] || [];

  // Rows 0 and 1 are the title and the header; the rest are companies.
  const companies = rows.slice(2).map((row) => {
    const company = {};
    headers.forEach((header, index) => {
      company[header] = row[index];
    });
    return company;
  });

  return {
    'Title': titleRow ? titleRow[0] : null,
    // Count only the company rows, not the title and header rows.
    'Number of Registered Companies': companies.length,
    'Companies': companies,
  };
}

// Actor entry point: validates the input, downloads the PDF at `input.url`,
// extracts its text, parses it with `crawlResult`, and stores the result
// under the 'OUTPUT' key.
Apify.main(async () => {
  // Fetch and check the input
  const input = await Apify.getValue('INPUT');
  if (!typeCheck(INPUT_TYPE, input)) {
    error('Expected input:');
    error(INPUT_TYPE);
    error('Received input:');
    // Actually print the offending input announced by the line above.
    error(input);
    throw new Error('Received invalid input');
  }
  const options = {
    url: input.url,
    encoding: null, // set to `null`, if you expect binary data.
  };

  log('Requesting URL: ', input.url);
  const response = await requestPromise(options);

  // Write and read the file through the same absolute path; a bare relative
  // name would be resolved against the current working directory, which is
  // not necessarily this script's directory.
  const pathToPdf = path.join(__dirname, 'temp.pdf');
  log(`Saving file to: ${pathToPdf}`);
  // `response` is passed straight to writeFileSync, without an extra copy.
  fs.writeFileSync(pathToPdf, response);
  log('File saved.');

  // `Promise` is bluebird here; wrap the callback-style extractor.
  const extract = Promise.promisify(pdfExtract);

  log('Extracting PDF...');
  const arrayOfPages = await extract(pathToPdf);
  log('Crawling result...');
  const json = crawlResult(arrayOfPages);

  const output = {
    actAt: new Date(),
    actResult: json,
  };
  // Log the JSON text directly; `dir` on a string would print it quoted
  // with escaped newlines.
  log(JSON.stringify(output, null, 2));

  log('Setting OUTPUT...');
  await Apify.setValue('OUTPUT', output);
  log('Finished');
});