Author: @benjamincoe
Over the past couple of years in the industry, there have been several times when I have needed to scrape structured information from (relatively) unstructured XHTML websites.
My approach to doing this has gradually evolved to include the following technologies:
I was starting to notice a lot of code duplication in my scraping scripts. Enter jDistiller:
npm install jdistiller
// Load the jDistiller constructor exported by the jdistiller package.
var jDistiller = require('jdistiller').jDistiller;
// Step 1: create an instance of jDistiller.
new jDistiller()
// Step 2: on an instance, chain set(key, selector) calls. Each call pairs a
// key of the distilled output with the CSS selector its value is read from.
new jDistiller()
.set('headline', '#article h1.articleHeadline')
.set('firstParagraph', '#article .articleBody p:eq(0)');
Simple Example (New York Times)
// Distill the headline and first paragraph of a New York Times article.
var jDistiller = require('jdistiller').jDistiller;

new jDistiller()
  // With no closure given, each key is set to the text value of its selector.
  .set('headline', '#article h1.articleHeadline')
  .set('firstParagraph', '#article .articleBody p:eq(0)')
  // distill() takes the page URL and an error-first callback.
  .distill('http://www.nytimes.com/2012/09/09/us/politics/obama-and-romney-battle-for-votes-in-2-swing-states.html?_r=1&hp', function(err, distilledPage) {
    // Surface a failed scrape rather than serializing a missing result.
    if (err) {
      console.error(err);
      return;
    }
    console.log(JSON.stringify(distilledPage));
  });
Output
{"headline":"Obama Tries to Turn Focus to Medicare From Jobs Figures","firstParagraph":"SEMINOLE, Fla. — President Obama on Saturday began hammering away at the Republican ticket’s plans for Medicare, using a campaign swing through Florida, with its large number of retired and elderly voters, to try to turn the page from anemic employment growth, his biggest weakness, to entitlements, a Democratic strength."}
A closure can optionally be provided as the third parameter for the set() method.
If a closure is given, the return value of the closure will be set as the key's value, rather than the text value of the selector.
DSL Using an Optional Data Processing Closure
// Distill a New York Times article, using a closure to extract an image URL.
var jDistiller = require('jdistiller').jDistiller;

new jDistiller()
  .set('headline', '#article h1.articleHeadline')
  .set('firstParagraph', '#article .articleBody p:eq(0)')
  // The closure's return value becomes the value of 'image', replacing the
  // default text value of the selector; here it is the image's src attribute.
  .set('image', '#article .articleBody .articleSpanImage img', function(element, prev) {
    return element.attr('src');
  })
  .distill('http://www.nytimes.com/2012/09/09/us/politics/obama-and-romney-battle-for-votes-in-2-swing-states.html?_r=1&hp', function(err, distilledPage) {
    // Surface a failed scrape rather than serializing a missing result.
    if (err) {
      console.error(err);
      return;
    }
    console.log(JSON.stringify(distilledPage));
  });
Output
{"headline":"Obama Tries to Turn Focus to Medicare From Jobs Figures","firstParagraph":"SEMINOLE, Fla. — President Obama on Saturday began hammering away at the Republican ticket’s plans for Medicare, using a campaign swing through Florida, with its large number of retired and elderly voters, to try to turn the page from anemic employment growth, his biggest weakness, to entitlements, a Democratic strength.","image":"http://graphics8.nytimes.com/images/2012/09/09/us/JP-CANDIDATE-1/JP-CANDIDATE-1-articleLarge.jpg"}
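The closure will be passed the following values:
- element: the element matched by the selector; the examples call element.attr() and element.text() on it.
- prev: the value built up for the key by earlier executions of the closure (see the Key/Object-Pair Example, which reads prev[key]).
- this: can be used to share state between executions of the closure (see the Object Merging Example, which keeps a counter on this).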
Array Merging Example
// Collect the text of every paragraph in the article body into one array.
var jDistiller = require('jdistiller').jDistiller;

new jDistiller()
  // The closure returns a one-element array per matching paragraph; as the
  // sample output shows, these arrays end up merged into a single list.
  .set('paragraphs', '#article .articleBody p', function(element) {
    return [element.text()];
  })
  .distill('http://www.nytimes.com/2012/09/09/us/politics/obama-and-romney-battle-for-votes-in-2-swing-states.html?_r=1&hp', function(err, distilledPage) {
    // Surface a failed scrape rather than serializing a missing result.
    if (err) {
      console.error(err);
      return;
    }
    console.log(JSON.stringify(distilledPage));
  });
Output
{"paragraphs": ["SEMINOLE, Fla. — President Obama on Saturday began hammering away at the Republican ticket’s...", "Kicking off a two-day bus tour through...", ...]}
Object Merging Example
// Pick out the second and third section headings of a Wikipedia article.
var jDistiller = require('jdistiller').jDistiller;

new jDistiller()
  // Must stay a regular function (not an arrow function): the counter lives on
  // `this`, which this example relies on being shared between executions.
  .set('headlines', '.mw-headline', function(element) {
    // Count how many headings have been visited so far.
    this.count = (this.count || 0) + 1;

    // Each returned object contributes its key to 'headlines'; as the sample
    // output shows, the objects end up merged into one.
    if (this.count === 2) {
      return {
        'second_heading': element.text().trim()
      };
    }
    if (this.count === 3) {
      return {
        'third_heading': element.text().trim()
      };
    }
    // Every other heading returns nothing.
  })
  .distill('http://en.wikipedia.org/wiki/Dog', function(err, distilledPage) {
    // Surface a failed scrape rather than serializing a missing result.
    if (err) {
      console.error(err);
      return;
    }
    console.log(JSON.stringify(distilledPage));
  });
Output
{"headlines":{"second_heading":"Taxonomy","third_heading":"History and evolution"}}
Key/Object-Pair Example
// Index the links in a Wikipedia article's body paragraphs by their href,
// counting how many times each href occurs.
var jDistiller = require('jdistiller').jDistiller;

new jDistiller()
  .set('links', '#bodyContent p a', function(element, prev) {
    var key = element.attr('href');

    // Returning a [key, object] pair stores the object under that key in
    // 'links'. prev holds the entries stored so far, so an href seen before
    // has its previous occurrence count incremented.
    return [key, {
      title: element.attr('title'),
      href: key,
      occurrences: prev[key] ? prev[key].occurrences + 1 : 1
    }];
  })
  .distill('http://en.wikipedia.org/wiki/Dog', function(err, distilledPage) {
    // Surface a failed scrape rather than serializing a missing result.
    if (err) {
      console.error(err);
      return;
    }
    console.log(JSON.stringify(distilledPage));
  });
Output
{"links":{"#cite_note-MSW3_Lupus-1":{"title":"","href":"#cite_note-MSW3_Lupus-1","occurrences":1},"#cite_note-ADW-2":{"title":"","href":"#cite_note-ADW-2","occurrences":1},"/wiki/Gray_wolf_subspecies":{"title":"Gray wolf subspecies","href":"/wiki/Gray_wolf_subspecies","occurrences":1},"/wiki/Gray_wolf":{"title":"Gray wolf","href":"/wiki/Gray_wolf","occurrences":1},"/wiki/Canidae":{"title":"Canidae","href":"/wiki/Canidae","occurrences":1}}}
I'm excited about jDistiller; I think it solves the scraping problem in an elegant way.
Don't be shy with your feedback, and please contribute.
-- Ben @benjamincoe