Actor

petr_cermak/booking-best-hotels

  • Builds
  • latest 0.0.147 / 2018-10-18
  • Created 2018-10-11
  • Last modified 2018-10-18
  • grade 5

Description

Crawler for the best-rated hotels on Booking.com in any city. Note: this crawler does not work without US buyproxies!


API

To run the actor, send an HTTP POST request to:

https://api.apify.com/v2/acts/petr_cermak~booking-best-hotels/runs?token=<YOUR_API_TOKEN>

The POST payload will be passed as input for the actor. For more information, read the docs.


Example input

Content type: application/json; charset=utf-8

{
    "search": "paris", 
    "minScore": 8.4,
    "maxPages": 20,
    "concurrency": 20,
    "checkIn": "10-18-2018", 
    "checkOut": "10-19-2018", 
    "currency": "eur",
    "language": "en-gb",
    "proxyGroup": "RESIDENTIAL"
}

Source code

Based on the apify/actor-node-chrome Docker image (see docs).

const Apify = require('apify');
const request = require('request-promise');

/**
 * Best-effort debug dump of a page into the key-value store.
 *
 * Stores a PNG screenshot under `<name>.png` and the body's inner HTML under
 * `<name>.html`. Any failure is logged and otherwise ignored, so this never
 * rejects.
 *
 * @param {string} name - Key prefix for both stored records.
 * @param {Object} page - Puppeteer page to capture.
 */
async function saveScreenshot(name, page){
    // Small local helper: write one record with the given key suffix and MIME type.
    const store = (suffix, body, contentType) => Apify.setValue(name + suffix, body, { contentType });
    try{
        await store('.png', await page.screenshot(), 'image/png');
        await store('.html', await page.evaluate(() => document.body.innerHTML), 'text/html');
    }
    catch(err){
        console.log('unable to save screenshot: ' + name);
    }
}

/**
 * Reads a DOM property from an element handle and returns it trimmed.
 *
 * @param {Object} element - Handle exposing `getProperty(attr)`.
 * @param {string} attr - Property name, e.g. 'href' or 'textContent'.
 * @returns {Promise<?string>} The trimmed value, or null when anything fails
 *     (missing handle, non-string value, evaluation error).
 */
async function getAttribute(element, attr){
    let text = null;
    try{
        const handle = await element.getProperty(attr);
        const raw = await handle.jsonValue();
        text = raw.trim();
    }
    catch(err){
        text = null;
    }
    return text;
}

/**
 * Adds a request to the queue for every element matching `selector` that has
 * a non-empty `href`.
 *
 * @param {Object} page - Puppeteer page to search.
 * @param {Object} requestQueue - Queue receiving the new requests.
 * @param {string} selector - CSS selector for the link elements.
 * @param {?Function} condition - Optional async predicate on the element; falsy result skips it.
 * @param {string} label - Stored as `userData.label` on each request.
 * @param {?Function} urlMod - Optional transform applied to the href to get the request URL.
 * @param {?Function} keyMod - Optional async function on the element giving the uniqueKey
 *     (the raw href is used otherwise).
 */
async function enqueueLinks(page, requestQueue, selector, condition, label, urlMod, keyMod){
    const anchors = await page.$$(selector);
    for(const anchor of anchors){
        const href = await getAttribute(anchor, 'href');
        if(!href){continue;}
        if(condition && !(await condition(anchor))){continue;}

        const options = {userData: {label: label}};
        options.url = urlMod ? urlMod(href) : href;
        options.uniqueKey = keyMod ? (await keyMod(anchor)) : href;
        await requestQueue.addRequest(new Apify.Request(options));
    }
}

/**
 * Actor entry point.
 *
 * Reads the actor INPUT, builds a Booking.com search URL ordered by review
 * score and crawls it with a PuppeteerCrawler. INPUT fields read here:
 * search (required), minScore, maxPages, concurrency, checkIn, checkOut,
 * currency, language, country, proxyGroup, simple.
 *
 * With `simple` set, hotels are scraped straight from the result list;
 * otherwise every hotel detail page is enqueued and scraped including its
 * rooms. Results are stored with Apify.pushData; requests that ultimately
 * fail are stored too, flagged with `succeeded: false`.
 */
Apify.main(async () => {

    const input = await Apify.getValue('INPUT');

    if(!input || !input.search){
        throw new Error('Missing "search" attribute in INPUT!');
    }

    // Inclusive minimum review score; falls back to 8.4 when absent or not numeric.
    const parsedMinScore = parseFloat(input.minScore);
    const minScore = Number.isNaN(parsedMinScore) ? 8.4 : parsedMinScore;
    const maxPages = input.maxPages || 20;

    const requestQueue = await Apify.openRequestQueue();

    const query = encodeURIComponent(input.search);
    let startUrl = `https://www.booking.com/searchresults.html?dest_type=city;ss=${query}&order=bayesian_review_score`;
    if(input.checkIn && input.checkOut){
        // Dates are given month-first (e.g. "10-18-2018" or "10/18/2018") and
        // are reordered to year-month-day for the URL parameters.
        const ci = input.checkIn.split(/-|\//);
        const co = input.checkOut.split(/-|\//);
        startUrl += `&checkin_year_month_monthday=${ci[2]}-${ci[0]}-${ci[1]}`;
        startUrl += `&checkout_year_month_monthday=${co[2]}-${co[0]}-${co[1]}`;
    }
    if(input.currency){
        startUrl += `&selected_currency=${input.currency.toUpperCase()}&changed_currency=1&top_currency=1`;
    }
    if(input.language){
        const lng = input.language.replace('_','-');
        startUrl += `&lang=${lng}`;
    }
    console.log('startUrl: ' + startUrl);
    await requestQueue.addRequest(new Apify.Request({url: startUrl, userData: {label: 'start'}}));

    const {
        APIFY_PROXY_PASSWORD,
        APIFY_PROXY_HOSTNAME,
        APIFY_PROXY_PORT,
    } = process.env;

    const session = 'my_session_1';
    const country = input.country || 'gb';
    const proxyGroup = input.proxyGroup || 'RESIDENTIAL';
    const username = `groups-${proxyGroup},session-${session},country-${country}`;
    const proxyUrl = `http://${username}:${APIFY_PROXY_PASSWORD}@${APIFY_PROXY_HOSTNAME}:${APIFY_PROXY_PORT}`;

    // The proxy is only used when a proxy group was explicitly requested.
    const launchPuppeteerOptions = input.proxyGroup ? {proxyUrl} : {};

    const crawler = new Apify.PuppeteerCrawler({
        requestQueue,

        maxConcurrency: input.concurrency || 10,

        launchPuppeteerOptions,

        handlePageFunction: async ({ page, request }) => {

            /**
             * Runs inside the browser (needs jQuery injected). Resolves with the
             * hotels of the current result page whose score is at least
             * `threshold`, best score first.
             */
            const listPageFunction = (threshold) => new Promise((resolve) => {

                const $ = jQuery;

                // Polls `condition` every 500 ms; gives up with callback(null) after ~10 retries.
                function waitFor(condition, callback, i){
                    var val = condition();
                    if(val){callback(val);}
                    else if(i > 10){callback(null);}
                    else{setTimeout(function(){waitFor(condition, callback, i ? i+1 : 1);}, 500);}
                }

                // Text of the selection with whitespace, dots and commas removed.
                var compactText = function(node){
                    return node.text().replace(/(\s|\.|,)+/g, '');
                };

                // Total number of properties found, taken from the first header that has one.
                var getHeaderNumber = function(){
                    var selectors = ['.availability_nr', '.sr_header h1', '.sr_header h2', '#results_prev_next h4'];
                    for(var i = 0; i < selectors.length; i++){
                        var m = compactText($(selectors[i])).match(/\d+/);
                        if(m){return m[0];}
                    }
                    var fd = compactText($('#sr-filter-descr')).match(/(\d+)de/);
                    return fd ? fd[1] : null;
                };

                var result = [];
                var num = getHeaderNumber();
                var found = num ? parseInt(num, 10) : null;
                var items = $('.sr_item');
                console.log('items: ' + items.length);

                // Without this the promise would never settle on an empty page.
                if(items.length === 0){
                    resolve(result);
                    return;
                }

                var finished = 0;
                items.each(function(index, sr){
                    var n1 = compactText($(sr).find('.score_from_number_of_reviews')).match(/\d+/);
                    var n2 = compactText($(sr).find('.review-score-widget__subtext')).match(/\d+/);
                    // Scrolling the item into view before polling for its price element.
                    sr.scrollIntoView();
                    var getPrice = function(){
                        return $(sr).find(':not(strong).site_price, .totalPrice, strong.price');
                    };
                    waitFor(function(){return getPrice().length > 0;}, function(){
                        var pr = getPrice().eq(0).text().trim().replace(/(\d)\s(\d)/g, '$1$2').split(/\s+/);
                        var amount = parseFloat(pr[pr.length - 1].replace('.', ''));
                        var rat = $(sr).attr('data-score');
                        // Link of THIS item, not of the first hotel on the page.
                        var href = $(sr).find('.hotel_name_link').attr('href');
                        var item = {
                            'url': href ? window.location.origin + href.trim() : null,
                            'total': found,
                            'name': $(sr).find('.sr-hotel__name').text().trim(),
                            'score': rat ? parseFloat(rat.replace(',', '.')) : null,
                            'reviews': n1 ? parseInt(n1[0], 10) : (n2 ? parseInt(n2[0], 10) : null),
                            'price': isNaN(amount) ? null : amount
                        };
                        if(item.score && item.score >= threshold){result.push(item);}
                        // Compare against the total item count: callbacks may fire
                        // synchronously, before later items have even been started.
                        if(++finished >= items.length){
                            resolve(result.sort(function(a, b){return b.score - a.score;}));
                        }
                    });
                });
            });

            /**
             * Returns a function that strips the #fragment from an href and adds
             * the language / currency parameters (joined with separator `s`)
             * when the href does not already carry them.
             */
            const fixUrl = s => href => {
                let url = href.replace(/#([a-zA-Z_]+)/g, '');
                if(input.language && url.indexOf('lang') < 0){
                    const lng = input.language.replace('_','-');
                    if(url.indexOf(s) > -1){
                        // Insert right at the first separator.
                        url = url.replace(s, `${s}lang=${lng}${s}`);
                    }
                    else{url += `${s}lang=${lng}`;}
                }
                if(input.currency && url.indexOf('currency') < 0){
                    url += `${s}selected_currency=${input.currency.toUpperCase()}${s}changed_currency=1${s}top_currency=1`;
                }
                return url;
            };

            console.log('open url: ' + await page.url());

            /**
             * Collects one record per row of the room table on a detail page.
             * Room type, bed type and features are carried over from the last
             * row that had a room-type cell to the rows that follow it.
             */
            const extractRooms = async () => {
                let roomType, bedText, features;
                const rooms = [];

                const rows = await page.$$('.hprt-table > tbody > tr:not(.hprt-cheapest-block-row)');
                for(const row of rows){
                    const roomRow = await row.$('.hprt-table-cell-roomtype');
                    if(roomRow){
                        roomType = await row.$('.hprt-roomtype-icon-link');
                        const bedType = await row.$('.hprt-roomtype-bed');
                        bedText = bedType ? await getAttribute(bedType, 'textContent') : null;

                        const facilities = await roomRow.$$('.hprt-facilities-facility');
                        features = [];
                        for(const f of facilities){
                            const rawText = await getAttribute(f, 'textContent');
                            if(!rawText){continue;}
                            const fText = rawText.replace('•', '').trim();
                            if(fText.indexOf('ft²') > -1){
                                // Convert square feet to whole square metres.
                                const num = parseInt(fText.split(' ')[0].replace(/,/g, ''), 10);
                                features.push(Math.trunc(num*0.092903) + ' m²');
                            }
                            else{features.push(fText);}
                        }
                    }

                    // row.$ yields null for a missing element (row.$eval would throw),
                    // and the lookup is scoped to this row's own occupancy cell.
                    const occupancyElem = await row.$('.hprt-occupancy-occupancy-info');
                    const occupancy = occupancyElem ? await page.evaluate(el => {
                        const spoken = el.querySelector('.invisible_spoken');
                        return spoken ? spoken.textContent : (el.getAttribute('data-title') || el.textContent);
                    }, occupancyElem) : null;
                    const persons = occupancy ? occupancy.match(/\d+/) : null;

                    const priceE = await row.$('.hprt-price-price');
                    const priceText = priceE ? await getAttribute(priceE, 'textContent') : null;
                    const compactPrice = priceText ? priceText.replace(/\s|,/g, '') : null;
                    const priceT = compactPrice ? compactPrice.match(/(\d|\.)+/) : null;
                    const priceC = compactPrice ? compactPrice.match(/[^\d\.]+/) : null;

                    const room = {available: true};
                    if(roomType){room.roomType = await getAttribute(roomType, 'textContent');}
                    if(bedText){room.bedType = bedText.replace(/\n+/g, ' ');}
                    if(persons){room.persons = parseInt(persons[0], 10);}
                    if(priceT && priceC){
                        room.price = parseFloat(priceT[0]);
                        room.currency = priceC[0];
                        room.features = features;
                    }
                    else{room.available = false;}
                    rooms.push(room);
                }
                return rooms;
            };

            if(request.userData.label === 'detail'){
                // Best effort: the page is scraped even if the room table never shows up.
                try{await page.waitForSelector('.hprt-occupancy-occupancy-info');}
                catch(e){console.log('room table not found: ' + request.url);}

                await saveScreenshot('page_' + Math.random(), page);

                const ldElem = await page.$('script[type="application/ld+json"]');
                const ldText = ldElem ? await getAttribute(ldElem, 'textContent') : null;
                const ld = ldText ? JSON.parse(ldText) : null;
                if(!ld || !ld.aggregateRating){
                    // Still an error (so the request is retried), but a readable one.
                    throw new Error('Missing aggregateRating in ld+json data: ' + request.url);
                }

                // Inclusive threshold, same as the list-page filter.
                if(ld.aggregateRating.ratingValue < minScore){return;}

                const name = await page.$('#hp_hotel_name');
                const starIcon = await page.$('i.bk-icon-stars');
                const starTitle = await getAttribute(starIcon, 'title');
                const stars = starTitle ? starTitle.match(/\d/) : null;

                const rooms = await extractRooms();
                await Apify.pushData({
                    url: await page.url(),
                    name: await getAttribute(name, 'textContent'),
                    stars: stars ? stars[0] : null,
                    rating: ld.aggregateRating.ratingValue,
                    reviews: ld.aggregateRating.reviewCount,
                    rooms: rooms
                });
                return;
            }

            if(input.simple){
                console.log('extracting data...');
                await Apify.utils.puppeteer.injectJQuery(page);
                const result = await page.evaluate(listPageFunction, minScore);
                if(result.length > 0){await Apify.pushData(result);}
            }
            else{
                console.log('enqueuing detail pages...');
                await enqueueLinks(page, requestQueue, '.hotel_name_link', null, 'detail', fixUrl('&'));
            }

            // Follow pagination up to maxPages; the page number serves as the unique key.
            await enqueueLinks(page, requestQueue, '.sr_pagination_item a', async elem => {
                const text = await getAttribute(elem, 'textContent');
                return Boolean(text) && /\d+/.test(text) && parseInt(text, 10) <= maxPages;
            }, 'page', fixUrl('&'), async l => await getAttribute(l, 'textContent'));
        },
        handleFailedRequestFunction: async ({ request }) => {
            await Apify.pushData({
                url: request.url,
                succeeded: false,
                errors: request.errorMessages,
            });
        },
        gotoFunction: async ({ page, request }) => {
            // Images and stylesheets are aborted; all other requests continue.
            await page.setRequestInterception(true);
            page.on('request', intercepted => {
                const type = intercepted.resourceType();
                if(type === 'image' || type === 'stylesheet'){intercepted.abort();}
                else{intercepted.continue();}
            });
            await Apify.utils.puppeteer.hideWebDriver(page);
            return page.goto(request.url, {timeout: 200000});
        }
    });

    await crawler.run();
});