| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822 |
- /***********************************************
- Copyright 2010, Chris Winberry <chris@winberry.net>. All rights reserved.
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to
- deal in the Software without restriction, including without limitation the
- rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- sell copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
- The above copyright notice and this permission notice shall be included in
- all copies or substantial portions of the Software.
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- IN THE SOFTWARE.
- ***********************************************/
- /* v1.7.2 */
- (function () {
//Reports whether the script is executing as a CommonJS (Node.js) module.
//Returns true only when require, exports, module, __filename and __dirname
//all exist with the types Node gives them; false otherwise.
function runningInNode () {
    if (typeof require !== "function")
        return(false);
    if (typeof exports !== "object")
        return(false);
    if (typeof module !== "object")
        return(false);
    if (typeof __filename !== "string")
        return(false);
    return(typeof __dirname === "string");
}
- if (!runningInNode()) { //Not a CommonJS module: publish the API on a namespace hung off `this` instead
- if (!this.Tautologistics) //NOTE(review): `this` is presumably the global object of a non-strict script -- confirm
- this.Tautologistics = {};
- else if (this.Tautologistics.NodeHtmlParser)
- return; //NodeHtmlParser already defined! Leaves the enclosing function early so the existing copy is kept
- this.Tautologistics.NodeHtmlParser = {};
- exports = this.Tautologistics.NodeHtmlParser; //Assigned without `var`; the exports.* assignments at the end of the file then populate this namespace object
- }
//Node kinds the parser assigns to the "type" property of every element it emits
var ElementType = {
    Text: "text", //Plain text
    Directive: "directive", //Special tag <!...>
    Comment: "comment", //Special tag <!--...-->
    Script: "script", //Special tag <script>...</script>
    Style: "style", //Special tag <style>...</style>
    Tag: "tag" //Any tag that isn't special
};
//Creates a parser that reports parsed elements to the given handler.
//handler: object passed to validateHandler() and stored as this._handler
//options: optional settings object; it is used as-is (not copied) and its
//         includeLocation property is set to false when left null/undefined
function Parser (handler, options) {
    var settings = options ? options : { };
    if (settings.includeLocation == undefined) {
        settings.includeLocation = false; //Element positions are not tracked unless asked for
    }
    this._options = settings;
    this.validateHandler(handler);
    this._handler = handler;
    this.reset();
}
- //**"Static"**//
- //Regular expressions used for cleaning up and parsing (only used with replace/split/exec-from-start, so no state carries over between uses)
- Parser._reTrim = /(^\s+|\s+$)/g; //Trim leading/trailing whitespace
- Parser._reTrimComment = /(^\!--|--$)/g; //Remove the leading "!--" and trailing "--" comment markup from comment contents
- Parser._reWhitespace = /\s/g; //Used to find any whitespace to split on
- Parser._reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element; group 1 is the optional closing "/", group 2 the name
- //Regular expressions used for parsing (stateful: both are /g, so exec()/test() resume from lastIndex)
- Parser._reAttrib = //Find attributes in a tag; groups 1/2 = name="value", 3/4 = name='value', 5/6 = name=value, 7 = bare name
- /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;
- Parser._reTags = /[\<\>]/g; //Find tag markers; parseTags reads lastIndex after each test() to locate the marker
- //**Public**//
- //Methods//
//One-shot entry point: resets the parser, parses the whole document and finishes.
//data: the complete HTML text
Parser.prototype.parseComplete = function Parser$parseComplete (data) {
    var parser = this;
    parser.reset();
    parser.parseChunk(data);
    parser.done();
};
//Parses a piece of an HTML document.
//data: the next chunk of HTML text; it is appended to the unparsed buffer and
//      every complete tag/text run found so far is processed by parseTags().
//If done() has already been called (and reset() has not), the problem is
//reported through handleError() and the chunk is ignored.
Parser.prototype.parseChunk = function Parser$parseChunk (data) {
    if (this._done) {
        this.handleError(new Error("Attempted to parse chunk after parsing already done"));
        //handleError only throws when the handler has no error() method, so
        //stop here explicitly rather than feeding a finished handler
        return;
    }
    this._buffer += data; //FIXME: this can be a bottleneck
    this.parseTags();
};
//Tells the parser that the HTML being parsed is complete.
//Turns whatever is left in the buffer into one last element, flushes every
//pending element to the handler and then calls the handler's done().
//Calling it again before reset() does nothing.
Parser.prototype.done = function Parser$done () {
    if (this._done)
        return;
    this._done = true;

    //Push any unparsed text into a final element in the element list
    if (this._buffer.length) {
        var rawData = this._buffer;
        this._buffer = "";
        var state = this._parseState;
        var element = {
            raw: rawData
            , data: (state == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "")
            , type: state
        };
        if (state == ElementType.Tag || state == ElementType.Script || state == ElementType.Style)
            element.name = this.parseTagName(element.data);
        this.parseAttribs(element); //Ignores anything that is not a tag/script/style element
        this._elements.push(element);
    }

    //Force the flush: writeHandler() holds elements back while the parser is
    //inside a script/style/comment, and an unterminated one at end of input
    //would otherwise cause everything buffered since it opened to be dropped
    this.writeHandler(true);
    this._handler.done();
};
//Returns the parser to its initial state so a new document can be parsed,
//then resets the handler as well.
Parser.prototype.reset = function Parser$reset () {
    var origin = { row: 0, col: 0, charOffset: 0, inBuffer: 0 };

    //Input buffer and cursor positions
    this._buffer = "";
    this._current = 0;
    this._next = 0;
    this._location = origin;

    //Parsed-but-unflushed elements
    this._elements = [];
    this._elementsCurrent = 0;

    //State machine
    this._done = false;
    this._parseState = ElementType.Text;
    this._prevTagSep = '';
    this._tagStack = [];

    this._handler.reset();
};
-
- //**Private**//
- //Properties// (prototype-level defaults; reset() gives every instance its own values)
- Parser.prototype._options = null; //Parser options for how to behave
- Parser.prototype._handler = null; //Handler for parsed elements
- Parser.prototype._buffer = null; //Buffer of unparsed data
- Parser.prototype._done = false; //Flag indicating whether parsing is done
- Parser.prototype._elements = null; //Array of parsed elements waiting to be written to the handler
- Parser.prototype._elementsCurrent = 0; //Pointer to last element in _elements that has been processed; NOTE(review): only reset() touches it in the visible code
- Parser.prototype._current = 0; //Position in the buffer up to which data has already been parsed
- Parser.prototype._next = 0; //Position in the buffer of the most recently found tag marker (<>)
- Parser.prototype._location = null; //Position tracking for elements in a stream (row, col, charOffset, inBuffer)
- Parser.prototype._parseState = ElementType.Text; //Current type of element being parsed
- Parser.prototype._prevTagSep = ''; //Previous tag marker found
- //Stack of element types previously encountered; keeps track of when
- //parsing occurs inside a script/comment/style tag
- Parser.prototype._tagStack = null;
- //Methods//
//Takes an array of elements and parses any found attributes.
//elements: array of element objects; parseAttribs() is run on every
//          tag, script and style element
//Returns the same array that was passed in.
Parser.prototype.parseTagAttribs = function Parser$parseTagAttribs (elements) {
    var idxEnd = elements.length;

    for (var idx = 0; idx < idxEnd; idx++) {
        var element = elements[idx];
        //Must be ElementType.Style: the lowercase "style" key does not exist,
        //so comparing against it silently skipped every style element
        if (element.type == ElementType.Tag || element.type == ElementType.Script || element.type == ElementType.Style)
            this.parseAttribs(element);
    }

    return(elements);
};
//Parses the attributes out of an element's data and stores them on the
//element as an "attribs" object (name -> value).
//element: element object; only tag, script and style elements are processed.
//The "attribs" property is created only when at least one attribute is found.
//A bare attribute without a value gets its own name as the value.
Parser.prototype.parseAttribs = function Parser$parseAttribs (element) {
    var kind = element.type;
    var isTagLike = (kind == ElementType.Script || kind == ElementType.Style || kind == ElementType.Tag);
    if (!isTagLike)
        return;

    //Everything after the first whitespace-delimited word is attribute text
    var tagName = element.data.split(Parser._reWhitespace, 1)[0];
    var attribRaw = element.data.substring(tagName.length);
    if (attribRaw.length < 1)
        return;

    var pattern = Parser._reAttrib;
    pattern.lastIndex = 0; //The regex is shared and global, so always start from the beginning
    for (var found = pattern.exec(attribRaw); found; found = pattern.exec(attribRaw)) {
        if (element.attribs == undefined)
            element.attribs = {};

        //Groups 1/2, 3/4 and 5/6 are the name/value pairs of the double-quoted,
        //single-quoted and unquoted forms; group 7 is a bare attribute name
        var stored = false;
        for (var group = 1; group <= 5 && !stored; group += 2) {
            if (typeof found[group] == "string" && found[group].length) {
                element.attribs[found[group]] = found[group + 1];
                stored = true;
            }
        }
        if (!stored && typeof found[7] == "string" && found[7].length)
            element.attribs[found[7]] = found[7];
    }
};
//Extracts the tag name from the data value of an element.
//data: the text found between "<" and ">"
//Returns the name, prefixed with "/" for a closing tag, or "" when data is
//empty or contains no name.
Parser.prototype.parseTagName = function Parser$parseTagName (data) {
    var isBlank = (data == null || data == "");
    var parts = isBlank ? null : Parser._reTagName.exec(data);
    if (!parts)
        return("");
    var closingSlash = parts[1] ? "/" : "";
    return(closingSlash + parts[2]);
};
- //Parses through the buffered HTML text, appends every element found to this._elements and hands them to writeHandler(); nothing is returned
- //I admit, this function is rather large but splitting it up had a noticeable impact on speed
- Parser.prototype.parseTags = function Parser$parseTags () {
- var bufferEnd = this._buffer.length - 1; //Index of the last character currently in the buffer
- while (Parser._reTags.test(this._buffer)) { //_reTags is global, so each test() continues after the previously found marker
- this._next = Parser._reTags.lastIndex - 1; //Index of the marker that was just matched
- var tagSep = this._buffer.charAt(this._next); //The currently found tag marker
- var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse
- 
- //A new element to eventually be appended to the element list
- var element = {
- raw: rawData
- , data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "") //Text keeps its whitespace; anything else is trimmed
- , type: this._parseState
- };
- 
- var elementName = this.parseTagName(element.data);
- 
- //This section inspects the current tag stack and modifies the current
- //element if we're actually parsing a special area (script/comment/style tag)
- if (this._tagStack.length) { //We're parsing inside a script/comment/style tag
- if (this._tagStack[this._tagStack.length - 1] == ElementType.Script) { //We're currently in a script tag
- if (elementName == "/script") //Actually, we're no longer in a script tag, so pop it off the stack
- this._tagStack.pop();
- else { //Not a closing script tag
- if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
- //All data from here to script close is now a text element
- element.type = ElementType.Text;
- //If the previous element is text, append the current text to it
- if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) {
- var prevElement = this._elements[this._elements.length - 1];
- prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw; //Re-insert the marker that split the script text
- element.raw = element.data = ""; //This causes the current element to not be added to the element list
- }
- }
- }
- }
- else if (this._tagStack[this._tagStack.length - 1] == ElementType.Style) { //We're currently in a style tag
- if (elementName == "/style") //Actually, we're no longer in a style tag, so pop it off the stack
- this._tagStack.pop();
- else {
- if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
- //All data from here to style close is now a text element
- element.type = ElementType.Text;
- //If the previous element is text, append the current text to it
- if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) {
- var prevElement = this._elements[this._elements.length - 1];
- if (element.raw != "") {
- prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
- element.raw = element.data = ""; //This causes the current element to not be added to the element list
- } else { //Element is empty, so just append the last tag marker found
- prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep;
- }
- } else { //The previous element was not text
- if (element.raw != "") {
- element.raw = element.data = element.raw; //Use the untrimmed raw text as the data too
- }
- }
- }
- }
- }
- else if (this._tagStack[this._tagStack.length - 1] == ElementType.Comment) { //We're currently in a comment tag
- var rawLen = element.raw.length;
- if (element.raw.charAt(rawLen - 2) == "-" && element.raw.charAt(rawLen - 1) == "-" && tagSep == ">") {
- //Actually, we're no longer in a comment, so pop it off the stack
- this._tagStack.pop();
- //If the previous element is a comment, append the current text to it
- if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) {
- var prevElement = this._elements[this._elements.length - 1];
- prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(Parser._reTrimComment, "");
- element.raw = element.data = ""; //This causes the current element to not be added to the element list
- element.type = ElementType.Text;
- }
- else //Previous element not a comment
- element.type = ElementType.Comment; //Change the current element's type to a comment
- }
- else { //Still in a comment tag
- element.type = ElementType.Comment;
- //If the previous element is a comment, append the current text to it
- if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) {
- var prevElement = this._elements[this._elements.length - 1];
- prevElement.raw = prevElement.data = prevElement.raw + element.raw + tagSep;
- element.raw = element.data = ""; //This causes the current element to not be added to the element list
- element.type = ElementType.Text;
- }
- else
- element.raw = element.data = element.raw + tagSep;
- }
- }
- }
- 
- //Processing of non-special tags
- if (element.type == ElementType.Tag) {
- element.name = elementName;
- 
- if (element.raw.indexOf("!--") == 0) { //This tag is really comment
- element.type = ElementType.Comment;
- delete element["name"];
- var rawLen = element.raw.length;
- //Check if the comment is terminated in the current element
- if (element.raw.charAt(rawLen - 1) == "-" && element.raw.charAt(rawLen - 2) == "-" && tagSep == ">")
- element.raw = element.data = element.raw.replace(Parser._reTrimComment, "");
- else { //It's not so push the comment onto the tag stack
- element.raw += tagSep;
- this._tagStack.push(ElementType.Comment);
- }
- }
- else if (element.raw.indexOf("!") == 0 || element.raw.indexOf("?") == 0) {
- element.type = ElementType.Directive;
- //TODO: what about CDATA?
- }
- else if (element.name == "script") {
- element.type = ElementType.Script;
- //Special tag, push onto the tag stack if not terminated
- if (element.data.charAt(element.data.length - 1) != "/")
- this._tagStack.push(ElementType.Script);
- }
- else if (element.name == "/script")
- element.type = ElementType.Script;
- else if (element.name == "style") {
- element.type = ElementType.Style;
- //Special tag, push onto the tag stack if not terminated
- if (element.data.charAt(element.data.length - 1) != "/")
- this._tagStack.push(ElementType.Style);
- }
- else if (element.name == "/style")
- element.type = ElementType.Style;
- if (element.name && element.name.charAt(0) == "/")
- element.data = element.name; //Closing tags carry only their name as data
- }
- 
- //Add all tags and non-empty text elements to the element list
- if (element.raw != "" || element.type != ElementType.Text) {
- if (this._options.includeLocation && !element.location) {
- element.location = this.getLocation(element.type == ElementType.Tag);
- }
- this.parseAttribs(element);
- this._elements.push(element);
- //If tag self-terminates, add an explicit, separate closing tag
- if (
- element.type != ElementType.Text
- &&
- element.type != ElementType.Comment
- &&
- element.type != ElementType.Directive
- &&
- element.data.charAt(element.data.length - 1) == "/"
- )
- this._elements.push({
- raw: "/" + element.name
- , data: "/" + element.name
- , name: "/" + element.name
- , type: element.type
- });
- }
- this._parseState = (tagSep == "<") ? ElementType.Tag : ElementType.Text; //After "<" comes tag content, after ">" comes text
- this._current = this._next + 1; //Continue just past the marker
- this._prevTagSep = tagSep;
- }
- if (this._options.includeLocation) {
- this.getLocation(); //Advance the row/column counters up to _current
- this._location.row += this._location.inBuffer; //Fold the newlines seen in this buffer into the running row count
- this._location.inBuffer = 0;
- this._location.charOffset = 0; //The buffer is about to be cut down below, so offsets restart at 0
- }
- this._buffer = (this._current <= bufferEnd) ? this._buffer.substring(this._current) : ""; //Keep only the tail that has not been parsed yet
- this._current = 0;
- 
- this.writeHandler(); //Without forceFlush this does nothing while the tag stack is non-empty
- }
//Advances the line/column counters through the buffer up to the current
//parse position and reports where that position is.
//startTag: truthy when locating a tag element; the scan then stops one
//          character before _current
//Returns { line, col } with a 1-based line. col is 1-based too, except that a
//start tag found at the very beginning of a fresh buffer reports the raw column.
Parser.prototype.getLocation = function Parser$getLocation (startTag) {
    var loc = this._location;
    var stop = startTag ? this._current - 1 : this._current;
    var atBufferStart = startTag && loc.charOffset == 0 && this._current == 0;

    while (loc.charOffset < stop) {
        var ch = this._buffer.charAt(loc.charOffset);
        if (ch == '\n') {
            loc.inBuffer++;
            loc.col = 0;
        } else if (ch != '\r') { //Carriage returns do not take up a column
            loc.col++;
        }
        loc.charOffset++;
    }

    var column = atBufferStart ? loc.col : loc.col + 1;
    return {
        line: loc.row + loc.inBuffer + 1
        , col: column
    };
};
//Checks the handler to make sure it is an object with the right "interface".
//handler: candidate handler; must provide reset, done, writeTag, writeText,
//         writeComment and writeDirective as functions
//Throws an Error naming the first requirement that is not met.
Parser.prototype.validateHandler = function Parser$validateHandler (handler) {
    if ((typeof handler) != "object")
        throw new Error("Handler is not an object");

    var requiredMethods = ["reset", "done", "writeTag", "writeText", "writeComment", "writeDirective"];
    for (var i = 0; i < requiredMethods.length; i++) {
        var methodName = requiredMethods[i];
        if ((typeof handler[methodName]) != "function")
            throw new Error("Handler method '" + methodName + "' is invalid");
    }
};
//Hands every pending element to the handler, oldest first, emptying the queue.
//forceFlush: when truthy, write even while the parser is inside a
//            script/style/comment; otherwise such calls do nothing
Parser.prototype.writeHandler = function Parser$writeHandler (forceFlush) {
    var insideSpecialTag = this._tagStack.length > 0;
    if (insideSpecialTag && !forceFlush)
        return;

    while (this._elements.length) {
        var element = this._elements.shift();
        var kind = element.type;
        if (kind === ElementType.Comment) {
            this._handler.writeComment(element);
        } else if (kind === ElementType.Directive) {
            this._handler.writeDirective(element);
        } else if (kind === ElementType.Text) {
            this._handler.writeText(element);
        } else { //tag, script, style and anything unrecognised
            this._handler.writeTag(element);
        }
    }
};
//Reports a parsing error.
//error: the Error to report; it is passed to the handler's error() method when
//       the handler has one, and thrown otherwise
Parser.prototype.handleError = function Parser$handleError (error) {
    var handlerTakesErrors = (typeof this._handler.error) == "function";
    if (!handlerTakesErrors)
        throw error;
    this._handler.error(error);
};
- //TODO: make this a truly streamable handler
//Handler for RSS/Atom feeds, built on DefaultHandler.
//callback: passed through to the DefaultHandler constructor
//The base handler is configured to drop whitespace-only text, strip the
//verbose properties and allow children on tags HTML defines as empty.
function RssHandler (callback) {
    var feedOptions = { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false };
    RssHandler.super_.call(this, callback, feedOptions);
}
inherits(RssHandler, DefaultHandler);
//Converts the parsed DOM into a simplified feed object and signals completion.
//When a top-level <rss> or <feed> element exists, this.dom is replaced with
//  { type, id, title, link, description, updated, author, items: [ { id, title, link, description, pubDate } ] }
//where type is "rss" or "atom". A field whose source element, text or
//attribute is missing is left off the object. Without a feed root this.dom is
//left as it is. The base handler's done() is called in every case.
RssHandler.prototype.done = function RssHandler$done () {
    //Returns the first element named tagName within nodes, or null
    function firstNamed (tagName, nodes, recurse) {
        var matches = DomUtils.getElementsByTagName(tagName, nodes, recurse);
        return(matches.length ? matches[0] : null);
    }
    //Returns the data of the first child of the first element named tagName, or undefined
    function textOf (tagName, nodes, recurse) {
        var node = firstNamed(tagName, nodes, recurse);
        if (!node || !node.children || !node.children.length)
            return(undefined);
        return(node.children[0].data);
    }
    //Returns the href attribute of the first element named tagName, or undefined
    function hrefOf (tagName, nodes) {
        var node = firstNamed(tagName, nodes, false);
        return((node && node.attribs) ? node.attribs.href : undefined);
    }
    //Stores value on target[key] unless the value is missing
    function assign (target, key, value) {
        if (value !== undefined)
            target[key] = value;
    }
    //Stores new Date(text) on target[key] unless the text is missing
    function assignDate (target, key, text) {
        if (text !== undefined)
            target[key] = new Date(text);
    }

    var roots = DomUtils.getElementsByTagName(function (value) { return(value == "rss" || value == "feed"); }, this.dom, false);
    if (roots.length) {
        var feedRoot = roots[0];
        var feed = { };
        var nodes;

        if (feedRoot.name == "rss") {
            feed.type = "rss";
            //The first child is taken to be <channel/>; an <rss> element with
            //no children yields a feed without fields instead of a TypeError
            var channel = (feedRoot.children && feedRoot.children.length) ? feedRoot.children[0] : null;
            nodes = channel ? channel.children : undefined;
            feed.id = "";
            assign(feed, "title", textOf("title", nodes, false));
            assign(feed, "link", textOf("link", nodes, false));
            assign(feed, "description", textOf("description", nodes, false));
            assignDate(feed, "updated", textOf("lastBuildDate", nodes, false));
            assign(feed, "author", textOf("managingEditor", nodes, false));
            feed.items = DomUtils.getElementsByTagName("item", nodes).map(function (item) {
                var entry = {};
                assign(entry, "id", textOf("guid", item.children, false));
                assign(entry, "title", textOf("title", item.children, false));
                assign(entry, "link", textOf("link", item.children, false));
                assign(entry, "description", textOf("description", item.children, false));
                assignDate(entry, "pubDate", textOf("pubDate", item.children, false));
                return(entry);
            });
        } else {
            feed.type = "atom";
            nodes = feedRoot.children;
            assign(feed, "id", textOf("id", nodes, false));
            assign(feed, "title", textOf("title", nodes, false));
            assign(feed, "link", hrefOf("link", nodes));
            assign(feed, "description", textOf("subtitle", nodes, false));
            assignDate(feed, "updated", textOf("updated", nodes, false));
            assign(feed, "author", textOf("email", nodes, true)); //The only lookup that searches nested elements
            feed.items = DomUtils.getElementsByTagName("entry", nodes).map(function (item) {
                var entry = {};
                assign(entry, "id", textOf("id", item.children, false));
                assign(entry, "title", textOf("title", item.children, false));
                assign(entry, "link", hrefOf("link", item.children));
                assign(entry, "description", textOf("summary", item.children, false));
                assignDate(entry, "pubDate", textOf("updated", item.children, false));
                return(entry);
            });
        }

        this.dom = feed;
    }
    RssHandler.super_.prototype.done.call(this);
};
- ///////////////////////////////////////////////////
//Handler that assembles parsed elements into a DOM-like tree (this.dom).
//callback: optional function; stored as this._callback only when it really is a function
//options: optional settings object, used as-is (not copied); any of the
//         following that are null/undefined are filled in on it:
//           ignoreWhitespace = false  (keep whitespace-only text nodes)
//           verbose = true            (keep data property for tags and raw property for all)
//           enforceEmptyTags = true   (don't allow children for HTML tags defined as empty in spec)
function DefaultHandler (callback, options) {
    this.reset();

    var settings = options ? options : { };
    var optionNames = ["ignoreWhitespace", "verbose", "enforceEmptyTags"];
    var optionDefaults = [false, true, true];
    for (var i = 0; i < optionNames.length; i++) {
        if (settings[optionNames[i]] == undefined)
            settings[optionNames[i]] = optionDefaults[i];
    }
    this._options = settings;

    if ((typeof callback) == "function")
        this._callback = callback;
}
- //**"Static"**//
- //HTML Tags that shouldn't contain child nodes; consulted by isEmptyTag() when the enforceEmptyTags option is on (only the key matters, the value 1 is a placeholder)
- DefaultHandler._emptyTags = {
- area: 1
- , base: 1
- , basefont: 1
- , br: 1
- , col: 1
- , frame: 1
- , hr: 1
- , img: 1
- , input: 1
- , isindex: 1
- , link: 1
- , meta: 1
- , param: 1
- , embed: 1
- }
- //Regex to detect whitespace only text nodes (also matches the empty string); used by writeText() when ignoreWhitespace is on
- DefaultHandler.reWhitespace = /^\s*$/;
- //**Public**//
- //Properties//
- DefaultHandler.prototype.dom = null; //The hierarchical object containing the parsed HTML; reset() replaces it with a fresh array
- //Methods//
//Puts the handler back into its starting state: an empty DOM, not done, and
//an empty stack of open tags. The stack is a plain array that also carries a
//last() helper returning its top entry, or null when it is empty.
DefaultHandler.prototype.reset = function DefaultHandler$reset() {
    var openTags = [];
    openTags.last = function DefaultHandler$_tagStack$last () {
        if (!this.length)
            return(null);
        return(this[this.length - 1]);
    };

    this.dom = [];
    this._done = false;
    this._tagStack = openTags;
};
//Called by the parser when the document is complete: marks the handler as
//finished and reports the result through handleCallback() with no error.
DefaultHandler.prototype.done = function DefaultHandler$done () {
    var noError = null;
    this._done = true;
    this.handleCallback(noError);
};
//Receives a tag, script or style element from the parser and adds it to the DOM
DefaultHandler.prototype.writeTag = function DefaultHandler$writeTag (element) {
    var handler = this;
    handler.handleElement(element);
};
//Receives a text element from the parser and adds it to the DOM, unless the
//ignoreWhitespace option is on and the text consists of whitespace only
DefaultHandler.prototype.writeText = function DefaultHandler$writeText (element) {
    var skip = this._options.ignoreWhitespace && DefaultHandler.reWhitespace.test(element.data);
    if (skip)
        return;
    this.handleElement(element);
};
//Receives a comment element from the parser and adds it to the DOM
DefaultHandler.prototype.writeComment = function DefaultHandler$writeComment (element) {
    var handler = this;
    handler.handleElement(element);
};
//Receives a directive element (<!...> or <?...>) from the parser and adds it to the DOM
DefaultHandler.prototype.writeDirective = function DefaultHandler$writeDirective (element) {
    var handler = this;
    handler.handleElement(element);
};
//Receives a parser error and forwards it through handleCallback()
DefaultHandler.prototype.error = function DefaultHandler$error (error) {
    var handler = this;
    handler.handleCallback(error);
};
- //**Private**//
- //Properties// (prototype-level defaults; the constructor and reset() set the per-instance values)
- DefaultHandler.prototype._options = null; //Handler options for how to behave (ignoreWhitespace, verbose, enforceEmptyTags)
- DefaultHandler.prototype._callback = null; //Callback to respond to when parsing done; called as _callback(error, dom)
- DefaultHandler.prototype._done = false; //Flag indicating whether handler has been notified of parsing completed
- DefaultHandler.prototype._tagStack = null; //List of parents to the element currently being processed
- //Methods//
//Reports the outcome of parsing.
//error: an Error, or null/undefined on success
//With a callback registered it is called as callback(error, dom). Without one,
//an error is thrown and success is silent.
DefaultHandler.prototype.handleCallback = function DefaultHandler$handleCallback (error) {
    var hasCallback = (typeof this._callback) == "function";
    if (hasCallback) {
        this._callback(error, this.dom);
        return;
    }
    if (error) {
        throw error;
    }
};
-
//Tells whether an element is one of the HTML tags that cannot have children.
//element: tag element; its name is compared case-insensitively and a leading
//         "/" (closing tag) is ignored
//Returns true for a listed tag while the enforceEmptyTags option is on. With
//the option off, the option's own falsy value is returned.
DefaultHandler.prototype.isEmptyTag = function(element) {
    var name = element.name.toLowerCase();
    if (name.charAt(0) == '/') {
        name = name.substring(1);
    }
    //Own-property test: a plain _emptyTags[name] lookup also finds members
    //inherited from Object.prototype, so tags named "constructor" or
    //"__proto__" were mistaken for empty tags
    return this._options.enforceEmptyTags && Object.prototype.hasOwnProperty.call(DefaultHandler._emptyTags, name);
};
-
//Places one parsed element into the DOM tree.
//element: element object from the parser
//Text, comments and directives become children of the innermost open tag, or
//top-level nodes when no tag is open. An opening tag is added the same way and
//then becomes the innermost open tag, unless isEmptyTag() says it cannot have
//children. A closing tag is never added: it closes the nearest open tag with
//the same name together with everything opened inside it, and is ignored when
//no such tag is open.
DefaultHandler.prototype.handleElement = function DefaultHandler$handleElement (element) {
    if (this._done)
        this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()"));

    if (!this._options.verbose) {
        //FIXME: Serious performance problem using delete
        delete element.raw;
        var kind = element.type;
        if (kind == "tag" || kind == "script" || kind == "style")
            delete element.data;
    }

    var openTags = this._tagStack;
    var parent = openTags.last(); //null when there are no parent elements
    var isLeaf = (
        element.type == ElementType.Text
        || element.type == ElementType.Comment
        || element.type == ElementType.Directive
    );

    //Elements that can never be containers only need to be attached
    if (isLeaf) {
        if (!parent) {
            this.dom.push(element);
        } else {
            if (!parent.children)
                parent.children = [];
            parent.children.push(element);
        }
        return;
    }

    var isClosingTag = element.name.charAt(0) == "/";

    if (!isClosingTag) {
        if (!parent) {
            this.dom.push(element);
        } else {
            if (!parent.children)
                parent.children = [];
            parent.children.push(element);
        }
        if (!this.isEmptyTag(element)) //Don't add tags to the tag stack that can't have children
            openTags.push(element);
        return;
    }

    //A closing tag with nothing open obviously has no opening tag: ignore it
    if (!parent)
        return;
    //Closing tags of empty tags have nothing to close
    if (this.isEmptyTag(element))
        return;

    //Scan from the innermost open tag outwards for the matching opening tag
    var baseName = element.name.substring(1);
    var matchPos = -1;
    for (var pos = openTags.length - 1; pos > -1; pos--) {
        if (openTags[pos].name == baseName) {
            matchPos = pos;
            break;
        }
    }
    //Pop the stack up to the opening tag's parent; leave it alone if nothing matched
    if (matchPos > -1) {
        while (openTags.length > matchPos)
            openTags.pop();
    }
};
//Helpers for searching the element trees built by DefaultHandler
var DomUtils = {
    //Tests a single element against a set of predicates.
    //options: object mapping a criterion to a predicate function:
    //           tag_name     - receives element.name; only tag/script/style elements can match
    //           tag_type     - receives element.type
    //           tag_contains - receives element.data; only text/comment/directive elements can match
    //           any other key - receives element.attribs[key]; the element must have attribs
    //element: the element to test
    //Returns true when every predicate passes, false otherwise (or when element is falsy).
    testElement: function DomUtils$testElement (options, element) {
        if (!element) {
            return false;
        }

        for (var key in options) {
            if (key == "tag_name") {
                if (element.type != "tag" && element.type != "script" && element.type != "style") {
                    return false;
                }
                if (!options["tag_name"](element.name)) {
                    return false;
                }
            } else if (key == "tag_type") {
                if (!options["tag_type"](element.type)) {
                    return false;
                }
            } else if (key == "tag_contains") {
                if (element.type != "text" && element.type != "comment" && element.type != "directive") {
                    return false;
                }
                if (!options["tag_contains"](element.data)) {
                    return false;
                }
            } else {
                if (!element.attribs || !options[key](element.attribs[key])) {
                    return false;
                }
            }
        }

        return true;
    },

    //Collects the elements matching the given criteria, in document order.
    //options: criteria as for testElement; a value that is not a function is
    //         compared with == against the tested value. The object itself is
    //         not modified.
    //currentElement: an element or an array of elements to search
    //recurse: search descendants as well; defaults to true when null/undefined
    //limit: maximum number of results; anything that does not parse as a
    //       non-negative integer means "no limit"
    //Returns an array of matching elements ([] when currentElement is falsy).
    getElements: function DomUtils$getElements (options, currentElement, recurse, limit) {
        recurse = (recurse === undefined || recurse === null) || !!recurse;
        limit = isNaN(parseInt(limit, 10)) ? -1 : parseInt(limit, 10);
        if (!currentElement) {
            return([]);
        }

        function getTest (checkVal) {
            return(function (value) { return(value == checkVal); });
        }
        //Build the predicates on a private copy so the caller's options object
        //keeps the values that were passed in
        var tests = {};
        for (var key in options) {
            tests[key] = ((typeof options[key]) == "function") ? options[key] : getTest(options[key]);
        }

        //Searches node (and, depending on recurse, what is below it) for at
        //most budget matches; a negative budget means unlimited
        function collect (node, budget) {
            var found = [];
            if (!node) {
                return(found);
            }

            if (DomUtils.testElement(tests, node)) {
                found.push(node);
            }
            if (budget >= 0 && found.length >= budget) {
                return(found);
            }

            var elementList;
            if (recurse && node.children) {
                elementList = node.children;
            } else if (node instanceof Array) {
                elementList = node;
            } else {
                return(found);
            }

            for (var i = 0; i < elementList.length; i++) {
                //Only ask for what is still missing; handing the full limit to
                //every child let the combined result grow beyond the limit
                var remaining = (budget >= 0) ? budget - found.length : -1;
                found = found.concat(collect(elementList[i], remaining));
                if (budget >= 0 && found.length >= budget) {
                    break;
                }
            }

            return(found);
        }

        return(collect(currentElement, limit));
    },

    //Returns the first element whose id attribute equals id, or null
    getElementById: function DomUtils$getElementById (id, currentElement, recurse) {
        var result = DomUtils.getElements({ id: id }, currentElement, recurse, 1);
        return(result.length ? result[0] : null);
    },

    //Returns the tag/script/style elements whose name matches name (a value or a predicate)
    getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement, recurse, limit) {
        return(DomUtils.getElements({ tag_name: name }, currentElement, recurse, limit));
    },

    //Returns the elements whose type matches type (a value or a predicate)
    getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement, recurse, limit) {
        return(DomUtils.getElements({ tag_type: type }, currentElement, recurse, limit));
    }
};
//Sets up prototype inheritance between two constructors.
//ctor: the constructor that should inherit
//superCtor: the constructor to inherit from; also made available as ctor.super_
//ctor.prototype is replaced by a new object that delegates to
//superCtor.prototype, created without running superCtor itself.
function inherits (ctor, superCtor) {
    function Surrogate () {}
    Surrogate.prototype = superCtor.prototype;

    var proto = new Surrogate();
    proto.constructor = ctor;

    ctor.prototype = proto;
    ctor.super_ = superCtor;
}
- exports.Parser = Parser; //Public API; `exports` is the CommonJS object under Node, otherwise the Tautologistics.NodeHtmlParser namespace assigned near the top of the file
- exports.DefaultHandler = DefaultHandler; //Handler that builds the DOM tree
- exports.RssHandler = RssHandler; //DefaultHandler variant that turns RSS/Atom documents into a feed object
- exports.ElementType = ElementType; //Element type constants
- exports.DomUtils = DomUtils; //Search helpers for the generated DOM
- })();
|