Creating a recursive web scraper with Node Crawler

So you want to scrape lots of data from the internet using Node Crawler? No problem. You’ll need to know JavaScript, jQuery, and a bit of Node.

First things first:

1) goto Terminal and create a new javascript folder called node crawler and save a file called “craigslist.js”
2) in your folder, run npm install crawler
3) Make sure you know how to select elements from the DOM using jQuery
4) Pick a site you want to scrape from. In our example, I’m using craigslist.

Lets jump into it and I’ll show you the code in full and then break it down piece by piece:

var Crawler = require("crawler");
var url = require('url');
var fs = require('fs');

var jsonArray = [];

var craigslist = new Crawler({
    maxConnections : 10,
    callback : function (error, result, $) {

$('.row').each(function(index, a) {
          var jsonOutput = {};

var dataSplit = $(a).text().split(' ');
          jsonOutput.price = dataSplit[1];
          jsonOutput.date = dataSplit[5] + dataSplit[6];
          jsonOutput.siteLink = "http://sfbay.craigslist.org" + $(a).children().attr('href');

console.log("price:" + jsonOutput.price + "  date:" + jsonOutput.date 
                      + "   date:" + jsonOutput.siteLink)

jsonArray.push(jsonOutput)

});
        
        var rangeNumber = $($('.range')[0]).text().split(' ')[2]
        
        console.log("We are on item range: " + rangeNumber)
        
        var toQueueUrl = 'http://sfbay.craigslist.org/search/bia?s=' + rangeNumber

if (parseInt(rangeNumber) < 1000) {
          craigslist.queue(toQueueUrl);  
        } else {
          fs.appendFile('craigsListData.txt', JSON.stringify(jsonArray), function (err) {
            if (err) throw err;
            console.log('The "data to append" was appended to file!');
          });
        }
    }
});

craigslist.queue('http://sfbay.craigslist.org/search/bia')

var Crawler = require("crawler");

var url = require('url');

var fs = require('fs');

var jsonArray = [];

var craigslist = new Crawler({

maxConnections : 10,

callback : function (error, result, $) {

$('.row').each(function(index, a) {

var jsonOutput = {};

var dataSplit = $(a).text().split(' ');

jsonOutput.price = dataSplit[1];

jsonOutput.date = dataSplit[5] + dataSplit[6];

jsonOutput.siteLink = "http://sfbay.craigslist.org" + $(a).children().attr('href');

console.log("price:" + jsonOutput.price + " date:" + jsonOutput.date

+ " date:" + jsonOutput.siteLink)

jsonArray.push(jsonOutput)

});

var rangeNumber = $($('.range')[0]).text().split(' ')[2]

console.log("We are on item range: " + rangeNumber)

var toQueueUrl = 'http://sfbay.craigslist.org/search/bia?s=' + rangeNumber

if (parseInt(rangeNumber) < 1000) {

craigslist.queue(toQueueUrl);

} else {

fs.appendFile('craigsListData.txt', JSON.stringify(jsonArray), function (err) {

if (err) throw err;

console.log('The "data to append" was appended to file!');

});

}

});

craigslist.queue('http://sfbay.craigslist.org/search/bia')

when you run node craigslist.js This will give you a result such as this:

Beautiful bike data

Awesome, lets break down how it works:

Lines 1 -3: All of our require statements. Only module you need to require is crawler.

var Crawler = require("crawler");

var url = require('url');

var fs = require('fs');

Lines 7-9: instantiate a new Crawler and pass in standard options. on line 9, create your callback function.

var craigslist = new Crawler({

maxConnections : 10,

callback : function (error, result, $) {

line 11: This will loop over every instance of .row class on the page. As you can see, the row class contains all the pertinent info we need.
$('.row').each(function(index, a) {

blogpost

lines 14-18: using jQuery, select data you want to save and put that data into the json object we’ve created

var dataSplit = $(a).text().split(' ');

jsonOutput.price = dataSplit[1];

jsonOutput.date = dataSplit[5] + dataSplit[6];

jsonOutput.siteLink = "http://sfbay.craigslist.org" + $(a).children().attr('href')

We want to append that data to the JSON array, and when all is complete — we want to find the next page.
Go recursion go!

Line 26: Here we look at the an item on the on the HTML document that we can pass into our crawler and find the the next page. Thankfully, craigslist gives us a range element that we can pass into the URL to find the next 100 elements. We extract that of course.
var rangeNumber = $($('.range')[0]).text().split(' ')[2]

Line 30: Create a ‘link’ to pass into the crawler based on the range we just scraped
var toQueueUrl = 'http://sfbay.craigslist.org/search/bia?s=' + rangeNumber

Lines 32-38: We want our base case — which is “If the range is less than 1000, run the next link into our crawler. Otherwise, save all the data to our system.”

if (parseInt(rangeNumber) < 1000) {

craigslist.queue(toQueueUrl);

} else {

fs.appendFile('craigsListData.txt', JSON.stringify(jsonArray), function (err) {

if (err) throw err;

console.log('The "data to append" was appended to file!');

});

Line 43: Start the recursion and pass in the first page
craigslist.queue('http://sfbay.craigslist.org/search/bia')

3 thoughts on “Creating a recursive web scraper with Node Crawler”

Deb says:

October 28, 2017 at 3:42 am

Using your code to fetch the details from the same website. But it’s giving the below error. I tried with all the CSS selectors but didn’t work. Please just check it once. I am new to this.

G:\node crawler\craigslist.js:13
$(“.result-row”).each(function(index, a) {
^

TypeError: Cannot read property ‘each’ of undefined
at Object.callback (G:\node crawler\craigslist.js:13:25)
at Crawler._injected (G:\node crawler\node_modules\crawler\lib\crawler.js:403:13)
at Crawler._inject (G:\node crawler\node_modules\crawler\lib\crawler.js:144:9)
at Crawler._onContent (G:\node crawler\node_modules\crawler\lib\crawler.js:396:10)
at Request._callback (G:\node crawler\node_modules\crawler\lib\crawler.js:341:18)
at Request.self.callback (G:\node crawler\node_modules\request\request.js:186:22)
at emitTwo (events.js:106:13)
at Request.emit (events.js:191:7)
at Request. (G:\node crawler\node_modules\request\request.js:1163:10)
at emitOne (events.js:96:13)

1. admin says:
  
  October 28, 2017 at 11:57 pm
  
  Hey Deb,
  
  It looks like the new crawler callback signature has changed.
  
  callback : function (error, result, $) {
  
  Has changed to
  
  callback : function (error, res, done) {
  
  and jQuery is included within the response.
  
  —————–
  
  It looks like you’ll need to do this:
  
  var craigslist = new Crawler({ maxConnections : 10, callback : function (error, res, done) { var $ = res.$; $('.row').each(function(index, a) { var jsonOutput = {};
  
  ————————
  
  Look at https://github.com/bda-research/node-crawler for further info.
  
  Best,
  Marc
  
Sahith says:

August 3, 2018 at 12:52 am

Awesome! It was helpfull.

Marc Balaban

Version 1.02

Creating a recursive web scraper with Node Crawler

3 thoughts on “Creating a recursive web scraper with Node Crawler”

Leave a Reply Cancel reply