On-demand scraping with PhantomJS and Node

Problem: I needed up to date size availability for clothing from multiple UK fashion stores.
Problem 2: None of the retailers wanted to provide realtime size availability.
Solution: Make my own API that takes a product URL, checks if it can be scraped, fires up the page in PhantomJS, grabs the sizes and then returns them.

First lets install the dependencies.

npm install express lodash phantom --save  

Now lets set up a simple express server to handle the requests.

var express = require('express');  
var phantom = require('phantom');  
var _ = require('lodash');  
var app     = express();

var scraper = require('./scraper');  
var asos = require('./scrapers/asos'); // our scraper for ASOS  
var site;

app.use(function(req, res, next) {  
  res.setHeader('Access-Control-Allow-Origin', '*');
  res.setHeader('Access-Control-Allow-Methods', 'GET');
  res.setHeader('Access-Control-Allow-Headers', 'X-Requested-With,content-type, Authorization,If-Modified-Since');
  next();
});

// this intercepts the favicon request to stop it raising an error.
app.get('/favicon.ico', function(req, res){  
  console.log("No favicon to find here!");
})

app.get('/', function(req, res){  
  res.send("Place a product url after the url to scrape for sizes!");
})

// Catch all URL to grab the url for the product to be scraped
app.get('*', function(req, res){  
  var url = req.originalUrl.substring(1);
  console.log("Scraping: " + url);

  // check the url matches a site that can be scraped

  if (url.indexOf("www.asos.com") !== -1) {
    // set the correct scraper for the site
    site = asos;
  }

  // If we are given a url we can scrape, pass the response, the url 
  // and the site module to use to the scraper function
  if (site) {
    scraper(res, url, site);
  } else {
    res.send("cant scrape this currently");
  }

});

app.listen(process.env.PORT || 5000);

console.log('Magic happens on port 5000');

exports = module.exports = app;  

In our scraper.js file we need to fire up PhantomJS, wait for any async JS on the page to complete and then grab sizes with JS and then parse them.

var phantom = require('phantom');

module.exports = function(res, url, site){  
  phantom.create(function(ph) {
    return ph.createPage(function(page) {
      return page.open(url, function(status) {
        console.log("opened site? ", status);         

          page.injectJs('http://ajax.googleapis.com/ajax/libs/jquery/1.7.2/jquery.min.js', function() {
            //jQuery Loaded.
            //Wait for a bit for AJAX content to load on the page. Here, we are waiting 0.1 seconds.
            setTimeout(function() {
              return page.evaluate(function(selector) {

                // Get what you want from the page using jQuery. 
                // A good way is to populate an object with all the 
                // jQuery commands that you need and then return the object.
                var sArr = [];
                if ($(selector[0]).length) {
                  $(selector[0]).each(function() {
                    sArr.push($(this).html());
                  });
                // if you don't get any thing try the next scraper string
                // this helps when shoes might display sizes differently
                // to jeans etc
                } else if (selector[1]) {
                  $(selector[1]).each(function() {
                    sArr.push($(this).html());
                  });
                }

                return {
                  sizes: sArr
                };
              }, function(result) {
                // Call the sites filter function to filter out unavailable sizes
                result = site.filterFunction(result);
                // send the result back to the user.
                res.send(result);
                // Close PhantomJS
                ph.exit();
              }, site.queryStrings);
            }, 100);

          });
        });
      });
  });
}

Now for an example of the scraper for ASOS (in /scrapers/asos.js).

var _ = require('lodash');

module.exports = {  
  // List of strings to query for sizes with jQuery
  queryStrings: ['#ctl00_ContentMainPage_ctlSeparateProduct_drpdwnSize option'],
  // function to filter the results will vary between sites
  filterFunction: function(result){
    // drop the first option since its the placeholder
    result.sizes = _.drop(result.sizes, 1);
    // reject sizes that are not available
    result.sizes = _.reject(result.sizes, function(s){
      return s.indexOf("- Not available") !== -1
    });
    // map the sizes to somthing a bit more readable
    result.sizes = _.map(result.sizes, function(n) {
      return {name: n}
    });
    // return the tidy sizes.
    return result;
  }
}

Putting it all together, now you should be able to call http://localhost:5000/{ASOS PRODUCT URL GOES HERE "http/s included"}

Sample code on Github with more scrapers here.