Actor

juansgaitan/link-extractor

  • Builds
  • latest 0.0.18 / 2018-01-18
  • Created 2017-10-23
  • Last modified 2018-09-18
  • grade 10

Description

Extracts links for every path/username in an array: each entry is resolved against a baseUrl, the resulting page is opened, and a pageFunction is evaluated on it to collect the links.


API

To run the actor, send an HTTP POST request to:

https://api.apify.com/v2/acts/juansgaitan~link-extractor/runs?token=<YOUR_API_TOKEN>

The POST payload will be passed as input for the actor. For more information, read the docs.


Example input

Content type: application/json; charset=utf-8

{
  "baseUrl": "https://www.instagram.com/",
  "pageFunction": "const anchors = Array.from(document.querySelectorAll('._mck9w._gvoze._f2mse')); return anchors.map(anchor => anchor.firstElementChild.getAttribute('href'));",
  "waitForCssSelector": "._mck9w._gvoze._f2mse",
  "usernames": [
    "mcuban",
    "lewishowes",
    "garyvee",
    "ba&sh"
  ]
}

Source code

Based on the apify/actor-node-puppeteer Docker image (see docs).

const { URL } = require('url');
const Apify = require('apify');
const { typeCheck } = require('type-check');

// Short aliases for the two console methods this file uses for its output.
const log = console.log;
const dir = console.dir;

/**
 * Type descriptor passed to `typeCheck` to validate the actor input.
 * It is also printed verbatim when validation fails, so the text is kept
 * in a human-readable, one-field-per-line layout.
 */
const INPUT_TYPE = [
  '{',
  '  baseUrl: String,',
  '  pageFunction: String,',
  '  waitForCssSelector: String,',
  '  usernames: [String],',
  '}',
].join('\n');

/**
 * Creates a resolver bound to `baseUrl`.
 *
 * @param {string} baseUrl - Base against which inputs are resolved.
 * @returns {function(string): URL} Function turning an input (relative or
 *   absolute) into a `URL` instance resolved against `baseUrl`.
 */
const parseUrlFor = (baseUrl) => {
  const resolve = (input) => new URL(input, baseUrl);
  return resolve;
};

// Resolver shared by the whole run; remains null until the main routine assigns it.
let parseUrl = null;

/**
 * Opens one page, runs the user-supplied page function in it and collects the
 * links that function returns.
 *
 * @param {Object} browser - Puppeteer browser; only `newPage()` is used here.
 * @param {string} username - Entry being processed; echoed back in the result.
 * @param {string} url - Absolute URL to navigate to.
 * @param {string} pageFunc - Body of a function evaluated inside the page; it
 *   is expected to return an array of link strings.
 * @param {string} cssSelector - Selector that must appear before `pageFunc` runs.
 * @returns {Promise<Object>} `{ username, postsLinks }`, where `postsLinks`
 *   holds the values produced by the module-level `parseUrl`. When the
 *   navigation answers with a non-2xx status, `postsLinks` is empty and an
 *   `errorMessage` string is added instead.
 * @throws {Error} When the page cannot be opened, the selector never shows up
 *   or the page function fails. The original error is attached as `cause`.
 */
async function extractUrls(browser, username, url, pageFunc, cssSelector) {
  let page = null;
  const result = {
    username,
    postsLinks: [],
  };
  try {
    page = await browser.newPage();
    log(`New browser page for: ${url}`);

    const response = await page.goto(url, { waitUntil: 'networkidle2' });
    // Newer Puppeteer releases expose `status()` as a method, early ones had a
    // plain `status` property. Testing the regex against the method itself
    // would never match, so every page would be reported as failed.
    const status = typeof response.status === 'function'
      ? response.status()
      : response.status;
    if (!/^2\d{2}$/.test(status)) {
      log('Response:', status);
      return Object.assign({}, result, {
        errorMessage: `${url} responded ${status}. Verify the username.`,
      });
    }
    await page.waitForSelector(cssSelector);

    // NOTE(security): `pageFunc` is arbitrary code taken from the actor input.
    // It is compiled with `new Function` inside the callback handed to
    // `page.evaluate`; only pass input from a trusted source.
    const postsUrls = await page.evaluate((fn) => {
      const func = new Function(fn);
      return func();
    }, pageFunc);

    // Explicit one-argument call so the index/array that `map` passes along
    // can never leak into the resolver.
    result.postsLinks = postsUrls.map(link => parseUrl(link));
  } catch (error) {
    const wrapped = new Error(`The page ${url}, could not be loaded: ${error}`);
    // Keep the original error (and its stack) reachable for debugging.
    wrapped.cause = error;
    throw wrapped;
  } finally {
    if (page) {
      await page.close().catch(error => log(`Error closing page: (${url}): ${error}.`));
    }
  }
  return result;
}

// Actor entry point: reads and validates the input, extracts the links for
// every username concurrently and stores the combined result under 'OUTPUT'.
Apify.main(async () => {
  let input = await Apify.getValue('INPUT');
  // The stored input may arrive as a raw JSON string; parse it in that case.
  if (typeof input === 'string') {
    input = JSON.parse(input);
  }
  log(input);
  if (!typeCheck(INPUT_TYPE, input)) {
    log('Expected input:');
    log(INPUT_TYPE);
    log('Received input:');
    dir(input);
    throw new Error('Received invalid input');
  }
  const {
    baseUrl,
    usernames,
    pageFunction,
    waitForCssSelector,
  } = input;
  log(baseUrl, usernames);

  log('Openning browser...');
  const browser = await Apify.launchPuppeteer();
  log('New browser window.');

  // Everything that uses the browser lives inside try/finally so the browser
  // is closed even when an extraction rejects or storing the output fails.
  try {
    // extractUrls reads this module-level resolver, so set it before any call.
    parseUrl = parseUrlFor(baseUrl);
    const allExtractedUrls = usernames.map((username) => {
      const { href } = parseUrl(username);
      return extractUrls(browser, username, href, pageFunction, waitForCssSelector);
    });
    // Fail-fast: the first rejected extraction aborts the whole run.
    const urls = await Promise.all(allExtractedUrls);
    await Apify.setValue('OUTPUT', urls);
    log(JSON.stringify(urls, null, 2));
  } finally {
    log('Closing browser.');
    await browser.close();
  }
});