Act

jaroslavhejlek/bf-mysql-deduplicate

  • Builds
  • latest 0.0.30 / 2017-11-28
  • Created 2017-11-26
  • Last modified 2017-11-28
  • grade 1

Description

Loads data from mysql table and creates a new table without duplicates


API

To run the act, send a HTTP POST request to:

https://api.apify.com/v2/acts/jaroslavhejlek~bf-mysql-deduplicate/runs?token=<YOUR_API_TOKEN>

The POST payload will be passed as input for the act. For more information, read the docs.


Example input

Content type: application/json; charset=utf-8

{ "hello": 123 }

Source code

Based on the apify/actor-node-puppeteer Docker image (see docs).

const Apify = require('apify');
const request = require('request-promise');
const _ = require('underscore');
const mysql = require('mysql');
const Promise = require('bluebird');

// Credentials to mysql stored securely in Environment variables
const MYSQL_CREDENTIALS = {
    host: process.env.MYSQL_HOST,
    user: process.env.MYSQL_USER,
    database: process.env.MYSQL_DATABASE,
    password: process.env.MYSQL_PASSWORD,
};

let connection;
let queryPromised;

/**
 * Creates connection to mysql database
 * @return {Promise} Promise of successful connection
 */
const connect = () => {
    Promise
        .resolve()
        .then(() => {
            connection = mysql.createConnection(MYSQL_CREDENTIALS);
            queryPromised = Promise.promisify(connection.query, { context: connection });

            return Promise.promisify(connection.connect, { context: connection })();
        });
}

/**
 * Creates deduplication query for day and eshop
 * @param  {String} day   Day for which we should deduplicate data
 * @param  {String} eshop Eshop for which we should deduplicate data
 * @return {String}       Query
 */
const getQuery = (day, eshop) => {
    // look for results for provided date and eshop
    const condition = `date > '${day} 00:00:00' AND date < '${day} 23:59:59' and eshop = "${eshop}"`;
    
    const groupByItemName = `GROUP BY eshop, itemName, DATE_FORMAT(date, '%Y-%m-%d')`
    const groupByItemUrl = `GROUP BY eshop, itemUrl, DATE_FORMAT(date, '%Y-%m-%d')`
    const groupBy = eshop === 'mall' ? groupByItemName : groupByItemUrl;

    return `INSERT INTO all_data_dedu
SELECT a.* FROM (
    SELECT * FROM all_data_last
    WHERE ${condition}
) a
JOIN (
    select eshop, itemName, DATE_FORMAT(date, '%Y-%m-%d') as day, MAX(id) as id from all_data_last a
    WHERE ${condition}
    ${groupBy}
) aa
ON aa.id = a.id`;
}

Apify.main(async () => {
    await connect();
    
    // setup of variables used for deduplication
    const days = [
        '2017-10-20', '2017-10-21', '2017-10-22', '2017-10-23', '2017-10-24', '2017-10-25', '2017-10-26',
        '2017-10-27', '2017-10-28', '2017-10-29', '2017-10-30', '2017-10-31', '2017-11-01', '2017-11-02', 
        '2017-11-03', '2017-11-04', '2017-11-05', '2017-11-06', '2017-11-07', '2017-11-08', '2017-11-09', 
        '2017-11-10', '2017-11-11', '2017-11-12', '2017-11-13', '2017-11-14', '2017-11-15', '2017-11-16',
        '2017-11-17', '2017-11-18', '2017-11-19', '2017-11-20', '2017-11-21', '2017-11-22', '2017-11-23', 
        '2017-11-24', '2017-11-25', '2017-11-26'
    ];
    const eshops = ['czc', 'mall', 'alza'];
    const queries = [];
    
    const totalQueries = eshops.length * days.length;

    // Generates queries for each option
    days.forEach(day => {
        eshops.forEach(eshop => {
            const query = getQuery(
                day, 
                eshop,
            );
            queries.push(query);
        })
    })

    // Process queries in order to ensure correct order of items after deduplication
    for (let i = 0; i < queries.length; i++) {
        const query = queries[i];
        console.log(`Running query ${i}/${totalQueries}`);
        console.log(query);
        // Process query with larger timeout to make sure that everything finishes in time
        await queryPromised({ sql: query , timeout: 10 * 60000 }); // timeout each query after ten minutes
        console.log('Query finished');
    }
});