Chris Winberry needed an HTML parser for a project he was working on and started out using John's parser, but found it a touch too strict for some of the HTML he was using (sloppy HTML? Never). It was also too heavy to run on a server that would see considerable traffic, and so, being lazy, he wrote a new one from the ground up that is both lightweight (it builds an extremely simple DOM) and very forgiving.
Which brings us to node-htmlparser, which works both in Node:
JAVASCRIPT:
-
-
var htmlparser = require("node-htmlparser");
-
var rawHtml = "Xyz <script language= javascript>var foo = '<<bar>>';</ script><!--<!-- Waah! -- -->";
-
var handler = new htmlparser.DefaultHandler(function (error) {
-
if (error)
-
[...do something for errors...]
-
else
-
[...parsing done, do something...]
-
});
-
var parser = new htmlparser.Parser(handler);
-
parser.ParseComplete(rawHtml);
-
sys.puts(sys.inspect(handler.dom, false, null));
-
and in a modern browser:
JAVASCRIPT:
-
-
var handler = new Tautologistics.NodeHtmlParser.DefaultHandler(function (error) {
-
if (error)
-
[...do something for errors...]
-
else
-
[...parsing done, do something...]
-
});
-
var parser = new Tautologistics.NodeHtmlParser.Parser(handler);
-
parser.ParseComplete(document.body.innerHTML);
-
alert(JSON.stringify(handler.dom, null, 2));
-