From 2ab2c350f364310844fc4f7805cb61240bec3140 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20De=20Bollivier?= Date: Sat, 28 Apr 2012 13:12:48 +0400 Subject: [PATCH 1/2] Issue #2 : Special Chars getting butchered MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before : Readability: ---DOM created Braquage cette nuit � Saint-Pierre

Deux hommes, arriv�s � bord d’un scooter, ont fait irruption cette nuit vers 3h30 chez un marchand de fruits et l�gumes ouvert 24h/24 � Saint-Pierre. Ils auraient alors menac� d’une arme � feu le g�rant en r�clamant la caisse. Mais ne seraient repartis qu’avec la balance, croyant sans doute qu’elle pouvait contenir de l’argent. Pour le magasin, le pr�judice �conomique est donc plut�t l�ger. Mais si personne n’a �t� bless�, le braqu� est �videmment choqu�.

After: Readability: ---DOM created Braquage cette nuit à Saint-Pierre

Deux hommes, arrivés à bord d’un scooter, ont fait irruption cette nuit vers 3h30 chez un marchand de fruits et légumes ouvert 24h/24 à Saint-Pierre. Ils auraient alors menacé d’une arme à feu le gérant en réclamant la caisse. Mais ne seraient repartis qu’avec la balance, croyant sans doute qu’elle pouvait contenir de l’argent. Pour le magasin, le préjudice économique est donc plutôt léger. Mais si personne n’a été blessé, le braqué est évidemment choqué.

Plus d’informations demain dans votre Journal de l’île.

if you use request (from mikeal for example), you will just have to do this : var readability = require('../lib/readability'); var url = "http://www.clicanoo.re/322520-braquage-cette-nuit-a-saint-pierre.html"; var request = require('request'); request({url:url, 'encoding':'binary'}, function (error, response, html) { var content_type = response['headers']['content-type'].split('='); var encoding = content_type[1].toUpperCase(); if (!error && response.statusCode == 200) { readability.parse(html, url, {encoding:encoding}, function(result) { console.log(result.title, result.content); }); } }); --- lib/readability.js | 11 ++++++++--- package.json | 5 ++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/lib/readability.js b/lib/readability.js index c8b5a52..cd8e0c3 100644 --- a/lib/readability.js +++ b/lib/readability.js @@ -1,6 +1,7 @@ /*jslint undef: true, nomen: true, eqeqeq: true, plusplus: true, newcap: true, immed: true, browser: true, devel: true, passfail: false */ /*global window: false, readConvertLinksToFootnotes: false, readStyle: false, readSize: false, readMargin: false, Typekit: false, ActiveXObject: false */ - +var Buffer = require('buffer').Buffer; +var Iconv = require('iconv').Iconv; var dbg = (typeof console !== 'undefined') ? 
function(s) { if (readability.debugging) { console.log("Readability: " + s); @@ -2225,7 +2226,11 @@ exports.parse = function parse(theHtml, url, options, callback) { removeClassNames: true }; options = Utils.extend({}, defaultOptions, options); - + if(options.encoding && options.encoding != 'utf8') { + body = new Buffer(theHtml, 'binary'); + iconv = new Iconv(options.encoding, 'utf8'); + theHtml = iconv.convert(body).toString('utf8'); + } var startTime = new Date().getTime(); //dbg(html); var html = theHtml.replace(/]*>([\s\S]*?)<\/script>/gi, ''); @@ -2239,7 +2244,7 @@ exports.parse = function parse(theHtml, url, options, callback) { features : { FetchExternalResources : [], ProcessExternalResources : false - } + }, }; function createDocWithHTMLParser() { diff --git a/package.json b/package.json index 417beaa..cad8180 100644 --- a/package.json +++ b/package.json @@ -42,7 +42,10 @@ "dependencies": { "mjsunit.runner": ">=0.1.0", "jsdom": ">=0.1.21", - "htmlparser": ">=1.7.3" + "htmlparser": ">=1.7.3", + "html5":">0.1", + "main": "iconv", + "iconv":">=1.1.3" }, "engines" : { "node" : ">=0.2.5" }, "directories": { From 0141305595102e98f0c94da3bdadb08e4339117e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20De=20Bollivier?= Date: Sun, 29 Apr 2012 10:48:37 +0400 Subject: [PATCH 2/2] Issue #6 : Should be able to fetch the next page #gklst You can now fetch the next page of an article. I modified the xhr function to use mikeal's request module instead. Also, the callback is now invoked only once all pages have been fetched. This pull request contains an updated package.json, a simple "test" for multi-page articles, and the patch for the encoding problem.
--- lib/readability.js | 71 ++++++++++++++++++++-------------------------- package.json | 3 +- test/multi-page.js | 17 +++++++++++ 3 files changed, 49 insertions(+), 42 deletions(-) create mode 100644 test/multi-page.js diff --git a/lib/readability.js b/lib/readability.js index cd8e0c3..63351e4 100644 --- a/lib/readability.js +++ b/lib/readability.js @@ -42,7 +42,9 @@ var readability = { maxPages: 30, /* The maximum number of pages to loop through before we call it quits and just show a link. */ parsedPages: {}, /* The list of pages we've parsed in this call of readability, for autopaging. As a key store for easier searching. */ pageETags: {}, /* A list of the ETag headers of pages we've parsed, in case they happen to match, we'll know it's a duplicate. */ - + success: function (html) { + + }, /** * All of the regular expressions in use within readability. * Defined up here so we don't instantiate them repeatedly in loops. @@ -92,7 +94,6 @@ var readability = { /* Pull out any possible next page link first */ var nextPageLink = readability.findNextPageLink(document.body); - readability.prepDocument(); /* Build readability's DOM tree */ @@ -189,6 +190,8 @@ var readability = { window.setTimeout(function() { readability.appendNextPage(nextPageLink); }, 500); + } else { + readability.success(document.body.innerHTML); } /** Smooth scrolling **/ @@ -1409,49 +1412,26 @@ timed(function(){ * TODO: This could likely be simplified beyond what we have here right now. There's still a bit of excess junk. 
**/ xhr: function () { - if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) { + /*if (typeof XMLHttpRequest !== 'undefined' && (window.location.protocol !== 'file:' || !window.ActiveXObject)) { return new XMLHttpRequest(); } else { try { return new ActiveXObject('Msxml2.XMLHTTP.6.0'); } catch(sixerr) { } try { return new ActiveXObject('Msxml2.XMLHTTP.3.0'); } catch(threrr) { } try { return new ActiveXObject('Msxml2.XMLHTTP'); } catch(err) { } - } + }*/ + var request = require('request'); - return false; + return request; }, successfulRequest: function (request) { return (request.status >= 200 && request.status < 300) || request.status === 304 || (request.status === 0 && request.responseText); }, - ajax: function (url, options) { + ajax: function (url, callback) { var request = readability.xhr(); - - function respondToReadyState(readyState) { - if (request.readyState === 4) { - if (readability.successfulRequest(request)) { - if (options.success) { options.success(request); } - } - else { - if (options.error) { options.error(request); } - } - } - } - - if (typeof options === 'undefined') { options = {}; } - - request.onreadystatechange = respondToReadyState; - - request.open('get', url, true); - request.setRequestHeader('Accept', 'text/html'); - - try { - request.send(options.postBody); - } - catch (e) { - if (options.error) { options.error(); } - } + request({url:url, 'encoding':'binary'}, callback); return request; }, @@ -1483,11 +1463,20 @@ timed(function(){ * asynchronously and load the cleaned content into the div we created for it. 
**/ (function(pageUrl, thisPage) { - readability.ajax(pageUrl, { - success: function(r) { - + readability.ajax(pageUrl, function(error, r, html) { + var encoding = undefined; + if(r['headers']['content-type']) { + var content_type = r['headers']['content-type'].split('='); + if(content_type.length == 2) encoding = content_type[1].toUpperCase(); + } + if(encoding) { + body = new Buffer(html, 'binary'); + iconv = new Iconv(encoding, 'utf8'); + html = iconv.convert(body).toString('utf8'); + } + r.responseText = html; /* First, check to see if we have a matching ETag in headers - if we do, this is a duplicate page. */ - var eTag = r.getResponseHeader('ETag'); + var eTag = r['headers']['ETag']; if(eTag) { if(eTag in readability.pageETags) { dbg("Exact duplicate page found via ETag. Aborting."); @@ -1563,8 +1552,9 @@ timed(function(){ if(nextPageLink) { readability.appendNextPage(nextPageLink); + } else { + readability.success(document.body.innerHTML); } - } }); }(nextPageLink, articlePage)); }, @@ -1962,7 +1952,7 @@ var jsdom = require('jsdom'), var R = readability; var patch = { reComma: /[\uff0c,]/, // chinese comma, too - findNextPageLink: function() {return null;}, + /*findNextPageLink: function() {return null;},*/ getArticleTools: function() {return document.createElement('div');}, getArticleTitle: (function() { var old = R.getArticleTitle; @@ -2196,6 +2186,7 @@ function start(w, options, cb) { if (options.profile) { MyProfiler.reset(); } + readability.success = cb; readability.init(); @@ -2203,9 +2194,9 @@ function start(w, options, cb) { if (options.removeReadabilityArtifacts) removeReadabilityArtifacts(); if (options.removeClassNames) removeClassNames(); - + document.body.innerHTML = '
' + document.body.innerHTML + '
'; //dbg('[Readability] done'); - cb(document.body.innerHTML); + //cb(document.body.innerHTML); } var HTML5; @@ -2284,7 +2275,7 @@ exports.parse = function parse(theHtml, url, options, callback) { return callback({title: '', content: '', error: true}); } - dbg('---DOM created'); + //dbg('---DOM created'); var win = doc.parentWindow; win = win || doc.createWindow(); //for backward compatibility with jsdom <= 0.1.20 diff --git a/package.json b/package.json index cad8180..5de26fb 100644 --- a/package.json +++ b/package.json @@ -44,8 +44,7 @@ "jsdom": ">=0.1.21", "htmlparser": ">=1.7.3", "html5":">0.1", - "main": "iconv", - "iconv":">=1.1.3" + "iconv":">=1.1.3" }, "engines" : { "node" : ">=0.2.5" }, "directories": { diff --git a/test/multi-page.js b/test/multi-page.js new file mode 100644 index 0000000..67cbace --- /dev/null +++ b/test/multi-page.js @@ -0,0 +1,17 @@ +var readability = require('../lib/readability'), + request = require('request'), + encoding = 'utf8'; +var url = "http://www.washingtonpost.com/world/national-security/manhunt-details-us-mission-to-find-osama-bin-laden/2012/04/27/gIQAz5pLoT_story.html"; + + +request({url:url, 'encoding':'binary'}, function (error, response, html) { + if(response['headers']['content-type']) { + var content_type = response['headers']['content-type'].split('='); + if(content_type.length == 2) encoding = content_type[1].toUpperCase(); + } + if(!error && response.statusCode == 200) { + readability.parse(html, url, {encoding:encoding}, function(result) { + console.log(result.title, result.content); + }); + } +}); \ No newline at end of file