|
|
@@ -1,822 +0,0 @@
|
|
|
-/***********************************************
|
|
|
-Copyright 2010, Chris Winberry <chris@winberry.net>. All rights reserved.
|
|
|
-Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
|
-of this software and associated documentation files (the "Software"), to
|
|
|
-deal in the Software without restriction, including without limitation the
|
|
|
-rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
|
|
|
-sell copies of the Software, and to permit persons to whom the Software is
|
|
|
-furnished to do so, subject to the following conditions:
|
|
|
-
|
|
|
-The above copyright notice and this permission notice shall be included in
|
|
|
-all copies or substantial portions of the Software.
|
|
|
-
|
|
|
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
|
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
-FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
-IN THE SOFTWARE.
|
|
|
-***********************************************/
|
|
|
-/* v1.7.2 */
|
|
|
-
|
|
|
-(function () {
|
|
|
-
|
|
|
//Reports whether the CommonJS/Node module globals are all present.
//Returns true only when require is a function, exports and module are
//objects, and __filename and __dirname are strings; otherwise false.
//Only typeof is used, so a missing global never raises a ReferenceError.
function runningInNode () {
    if ((typeof require) != "function")
        return(false);
    if ((typeof exports) != "object")
        return(false);
    if ((typeof module) != "object")
        return(false);
    if ((typeof __filename) != "string")
        return(false);
    return((typeof __dirname) == "string");
}
|
|
|
-
|
|
|
-if (!runningInNode()) {
|
|
|
- if (!this.Tautologistics)
|
|
|
- this.Tautologistics = {};
|
|
|
- else if (this.Tautologistics.NodeHtmlParser)
|
|
|
- return; //NodeHtmlParser already defined!
|
|
|
- this.Tautologistics.NodeHtmlParser = {};
|
|
|
- exports = this.Tautologistics.NodeHtmlParser;
|
|
|
-}
|
|
|
-
|
|
|
//Types of elements found in the DOM.
//The string values are stored in each element's `type` property.
var ElementType = {
    Text: "text",           //Plain text
    Directive: "directive", //Special tag <!...>
    Comment: "comment",     //Special tag <!--...-->
    Script: "script",       //Special tag <script>...</script>
    Style: "style",         //Special tag <style>...</style>
    Tag: "tag"              //Any tag that isn't special
};
|
|
|
-
|
|
|
//Creates a parser that feeds parsed elements to `handler`.
//  handler - object checked by validateHandler(); stored as this._handler
//  options - optional settings object; it is used as-is (not copied) and
//            gets includeLocation = false when that option is null/undefined
//Throws whatever validateHandler() throws for an unusable handler.
function Parser (handler, options) {
    var settings = options || { };
    if (settings.includeLocation == null) {
        settings.includeLocation = false; //Do not track element position in document by default
    }
    this._options = settings;

    this.validateHandler(handler);
    this._handler = handler;
    this.reset();
}
|
|
|
-
|
|
|
//**"Static"**//
//Regular expressions used for cleaning up and parsing (stateless)
Parser._reTrim = /(^\s+|\s+$)/g; //Trim leading/trailing whitespace
Parser._reTrimComment = /(^\!--|--$)/g; //Remove comment tag markup from comment contents
Parser._reWhitespace = /\s/g; //Used to find any whitespace to split on
Parser._reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element; group 1 is the optional leading "/", group 2 the name

//Regular expressions used for parsing (stateful)
//These are driven through exec()/test(), which advance lastIndex on a global regex
Parser._reAttrib = //Find attributes in a tag
    //Groups 1/2: name="value", groups 3/4: name='value', groups 5/6: name=value (unquoted), group 7: bare name
    /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;
Parser._reTags = /[\<\>]/g; //Find tag markers
|
|
|
-
|
|
|
- //**Public**//
|
|
|
- //Methods//
|
|
|
//Parses a complete HTML document in a single call.
//Resets the parser, feeds `data` through parseChunk() and then calls done(),
//so output reaches the handler before this method returns.
//  data - the whole document as a string
Parser.prototype.parseComplete = function Parser$parseComplete (data) {
    this.reset();
    this.parseChunk(data);
    this.done();
};
|
|
|
-
|
|
|
//Parses a piece of an HTML document.
//Appends `data` to the internal buffer and parses every tag marker found so far.
//  data - the next piece of the document (string)
//If done() has already been called, the problem is reported through
//handleError() (which throws when the handler has no error() method) and the
//chunk is discarded instead of being buffered and parsed.
Parser.prototype.parseChunk = function Parser$parseChunk (data) {
    if (this._done) {
        this.handleError(new Error("Attempted to parse chunk after parsing already done"));
        return; //Nothing may be parsed until reset() is called
    }
    this._buffer += data; //FIXME: this can be a bottleneck
    this.parseTags();
};
|
|
|
-
|
|
|
//Tells the parser that the HTML being parsed is complete.
//Calls after the first one are ignored. Whatever is left in the buffer becomes
//one final element typed by the current parse state, then the queued elements
//are written out and the handler's done() is called.
Parser.prototype.done = function Parser$done () {
    if (this._done)
        return;
    this._done = true;

    //Push any unparsed text into a final element in the element list
    if (this._buffer.length) {
        var leftover = this._buffer;
        var state = this._parseState;
        this._buffer = "";

        //Text is kept verbatim; anything else has surrounding whitespace trimmed
        var trimmed = (state == ElementType.Text) ? leftover : leftover.replace(Parser._reTrim, "");
        var element = { raw: leftover, data: trimmed, type: state };

        var tagLike = (state == ElementType.Tag || state == ElementType.Script || state == ElementType.Style);
        if (tagLike)
            element.name = this.parseTagName(element.data);
        //Called for every type; parseAttribs() ignores elements that are not tags
        this.parseAttribs(element);
        this._elements.push(element);
    }

    this.writeHandler();
    this._handler.done();
};
|
|
|
-
|
|
|
//Resets the parser to a blank state, ready to parse a new HTML document.
//Every piece of per-document state is re-created and the handler's reset() is
//called last.
Parser.prototype.reset = function Parser$reset () {
    //Input buffer and completion flag
    this._buffer = "";
    this._done = false;

    //Parsed elements waiting to be written to the handler
    this._elements = [];
    this._elementsCurrent = 0;

    //Scan positions inside the buffer
    this._current = 0;
    this._next = 0;

    //Position tracking used by getLocation()
    this._location = { row: 0, col: 0, charOffset: 0, inBuffer: 0 };

    //Tokenizer state
    this._parseState = ElementType.Text;
    this._prevTagSep = '';
    this._tagStack = [];

    this._handler.reset();
};
|
|
|
-
|
|
|
//**Private**//
//Properties//
//Prototype-level defaults; the constructor and reset() assign the per-instance values
Parser.prototype._options = null; //Parser options for how to behave
Parser.prototype._handler = null; //Handler for parsed elements
Parser.prototype._buffer = null; //Buffer of unparsed data
Parser.prototype._done = false; //Flag indicating whether parsing is done
Parser.prototype._elements = null; //Array of parsed elements
Parser.prototype._elementsCurrent = 0; //Pointer to last element in _elements that has been processed
Parser.prototype._current = 0; //Position in data that has already been parsed
Parser.prototype._next = 0; //Position in data of the next tag marker (<>)
Parser.prototype._location = null; //Position tracking for elements in a stream
Parser.prototype._parseState = ElementType.Text; //Current type of element being parsed
Parser.prototype._prevTagSep = ''; //Previous tag marker found
//Stack of element types previously encountered; keeps track of when
//parsing occurs inside a script/comment/style tag
Parser.prototype._tagStack = null;
|
|
|
-
|
|
|
- //Methods//
|
|
|
//Takes an array of elements and parses any found attributes.
//Elements typed tag, script or style are passed to parseAttribs(); every other
//element is left untouched.
//  elements - array of parsed elements (modified in place)
//Returns the same array that was passed in.
Parser.prototype.parseTagAttribs = function Parser$parseTagAttribs (elements) {
    for (var idx = 0, idxEnd = elements.length; idx < idxEnd; idx++) {
        var element = elements[idx];
        //Note the capital S: ElementType.style does not exist, and comparing against
        //it silently skipped every style element
        if (element.type == ElementType.Tag || element.type == ElementType.Script || element.type == ElementType.Style)
            this.parseAttribs(element);
    }

    return(elements);
};
|
|
|
-
|
|
|
//Takes an element and adds an "attribs" property for any element attributes found.
//Only tag, script and style elements are inspected. Everything in element.data
//after the first whitespace-delimited word is scanned with Parser._reAttrib.
//A bare attribute (no "=value") is stored with its own name as the value.
//`attribs` is only created once at least one attribute has been matched.
Parser.prototype.parseAttribs = function Parser$parseAttribs (element) {
    //Only parse attributes for tags
    var type = element.type;
    var isTagLike = (type == ElementType.Script || type == ElementType.Style || type == ElementType.Tag);
    if (!isTagLike)
        return;

    var tagName = element.data.split(Parser._reWhitespace, 1)[0];
    var attribRaw = element.data.substring(tagName.length);
    if (!attribRaw.length)
        return;

    //The regex is global, so every scan must restart from the beginning
    Parser._reAttrib.lastIndex = 0;
    for (var match = Parser._reAttrib.exec(attribRaw); match; match = Parser._reAttrib.exec(attribRaw)) {
        if (element.attribs == undefined)
            element.attribs = {};

        //Exactly one of the four name groups (1, 3, 5, 7) is non-empty per match
        if (typeof match[1] == "string" && match[1].length) {
            element.attribs[match[1]] = match[2]; //name="value"
        } else if (typeof match[3] == "string" && match[3].length) {
            element.attribs[match[3]] = match[4]; //name='value'
        } else if (typeof match[5] == "string" && match[5].length) {
            element.attribs[match[5]] = match[6]; //name=value
        } else if (typeof match[7] == "string" && match[7].length) {
            element.attribs[match[7]] = match[7]; //bare name
        }
    }
};
|
|
|
-
|
|
|
//Extracts the base tag name from the data value of an element.
//  data - trimmed tag contents, e.g. 'div class="x"' or '/div'
//Returns the name, prefixed with "/" for a closing tag, or "" when `data` is
//null, undefined, empty or holds nothing that Parser._reTagName matches.
Parser.prototype.parseTagName = function Parser$parseTagName (data) {
    var parts = (data == null || data == "") ? null : Parser._reTagName.exec(data);
    if (!parts)
        return("");
    var prefix = parts[1] ? "/" : "";
    return(prefix + parts[2]);
};
|
|
|
-
|
|
|
//Scans the buffered HTML for tag markers ("<" and ">") and turns every span
//between two markers into an element that is appended to this._elements.
//Nothing is returned: the queued elements are handed to writeHandler() at the
//end, and any text after the last marker stays in this._buffer for the next chunk.
//I admit, this function is rather large but splitting up had a noticeable impact on speed
Parser.prototype.parseTags = function Parser$parseTags () {
    var bufferEnd = this._buffer.length - 1;
    //Parser._reTags is global, so each test() resumes after the previously found marker
    while (Parser._reTags.test(this._buffer)) {
        this._next = Parser._reTags.lastIndex - 1;
        var tagSep = this._buffer.charAt(this._next); //The currently found tag marker
        var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse

        //A new element to eventually be appended to the element list
        //(text keeps its whitespace; tag contents are trimmed)
        var element = {
            raw: rawData
            , data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, "")
            , type: this._parseState
        };

        var elementName = this.parseTagName(element.data);

        //This section inspects the current tag stack and modifies the current
        //element if we're actually parsing a special area (script/comment/style tag)
        if (this._tagStack.length) { //We're parsing inside a script/comment/style tag
            if (this._tagStack[this._tagStack.length - 1] == ElementType.Script) { //We're currently in a script tag
                if (elementName == "/script") //Actually, we're no longer in a script tag, so pop it off the stack
                    this._tagStack.pop();
                else { //Not a closing script tag
                    if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
                        //All data from here to script close is now a text element
                        element.type = ElementType.Text;
                        //If the previous element is text, append the current text to it
                        if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) {
                            var prevElement = this._elements[this._elements.length - 1];
                            //Re-insert the marker that split the script text in two
                            prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
                            element.raw = element.data = ""; //This causes the current element to not be added to the element list
                        }
                    }
                }
            }
            else if (this._tagStack[this._tagStack.length - 1] == ElementType.Style) { //We're currently in a style tag
                if (elementName == "/style") //Actually, we're no longer in a style tag, so pop it off the stack
                    this._tagStack.pop();
                else {
                    if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
                        //All data from here to style close is now a text element
                        element.type = ElementType.Text;
                        //If the previous element is text, append the current text to it
                        if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Text) {
                            var prevElement = this._elements[this._elements.length - 1];
                            if (element.raw != "") {
                                prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
                                element.raw = element.data = ""; //This causes the current element to not be added to the element list
                            } else { //Element is empty, so just append the last tag marker found
                                prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep;
                            }
                        } else { //The previous element was not text
                            if (element.raw != "") {
                                //Use the untrimmed text as the data as well
                                element.raw = element.data = element.raw;
                            }
                        }
                    }
                }
            }
            else if (this._tagStack[this._tagStack.length - 1] == ElementType.Comment) { //We're currently in a comment tag
                var rawLen = element.raw.length;
                if (element.raw.charAt(rawLen - 2) == "-" && element.raw.charAt(rawLen - 1) == "-" && tagSep == ">") {
                    //Actually, we're no longer in a comment tag, so pop it off the stack
                    this._tagStack.pop();
                    //If the previous element is a comment, append the current text to it
                    if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) {
                        var prevElement = this._elements[this._elements.length - 1];
                        prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(Parser._reTrimComment, "");
                        element.raw = element.data = ""; //This causes the current element to not be added to the element list
                        element.type = ElementType.Text;
                    }
                    else //Previous element not a comment
                        element.type = ElementType.Comment; //Change the current element's type to a comment
                }
                else { //Still in a comment tag
                    element.type = ElementType.Comment;
                    //If the previous element is a comment, append the current text to it
                    if (this._elements.length && this._elements[this._elements.length - 1].type == ElementType.Comment) {
                        var prevElement = this._elements[this._elements.length - 1];
                        prevElement.raw = prevElement.data = prevElement.raw + element.raw + tagSep;
                        element.raw = element.data = ""; //This causes the current element to not be added to the element list
                        element.type = ElementType.Text;
                    }
                    else
                        element.raw = element.data = element.raw + tagSep;
                }
            }
        }

        //Processing of non-special tags
        if (element.type == ElementType.Tag) {
            element.name = elementName;

            if (element.raw.indexOf("!--") == 0) { //This tag is really comment
                element.type = ElementType.Comment;
                delete element["name"];
                var rawLen = element.raw.length;
                //Check if the comment is terminated in the current element
                if (element.raw.charAt(rawLen - 1) == "-" && element.raw.charAt(rawLen - 2) == "-" && tagSep == ">")
                    element.raw = element.data = element.raw.replace(Parser._reTrimComment, "");
                else { //It's not so push the comment onto the tag stack
                    element.raw += tagSep;
                    this._tagStack.push(ElementType.Comment);
                }
            }
            else if (element.raw.indexOf("!") == 0 || element.raw.indexOf("?") == 0) {
                element.type = ElementType.Directive;
                //TODO: what about CDATA?
            }
            else if (element.name == "script") {
                element.type = ElementType.Script;
                //Special tag, push onto the tag stack if not terminated
                if (element.data.charAt(element.data.length - 1) != "/")
                    this._tagStack.push(ElementType.Script);
            }
            else if (element.name == "/script")
                element.type = ElementType.Script;
            else if (element.name == "style") {
                element.type = ElementType.Style;
                //Special tag, push onto the tag stack if not terminated
                if (element.data.charAt(element.data.length - 1) != "/")
                    this._tagStack.push(ElementType.Style);
            }
            else if (element.name == "/style")
                element.type = ElementType.Style;
            //For closing tags the data is reduced to just the "/name"
            if (element.name && element.name.charAt(0) == "/")
                element.data = element.name;
        }

        //Add all tags and non-empty text elements to the element list
        if (element.raw != "" || element.type != ElementType.Text) {
            if (this._options.includeLocation && !element.location) {
                element.location = this.getLocation(element.type == ElementType.Tag);
            }
            this.parseAttribs(element);
            this._elements.push(element);
            //If tag self-terminates, add an explicit, separate closing tag
            if (
                element.type != ElementType.Text
                &&
                element.type != ElementType.Comment
                &&
                element.type != ElementType.Directive
                &&
                element.data.charAt(element.data.length - 1) == "/"
                )
                this._elements.push({
                    raw: "/" + element.name
                    , data: "/" + element.name
                    , name: "/" + element.name
                    , type: element.type
                });
        }
        //A "<" starts a tag, a ">" puts the parser back into text
        this._parseState = (tagSep == "<") ? ElementType.Tag : ElementType.Text;
        this._current = this._next + 1;
        this._prevTagSep = tagSep;
    }

    if (this._options.includeLocation) {
        //Advance the location counters up to this._current, then fold the lines
        //seen in this buffer into the running row count before the buffer is trimmed
        this.getLocation();
        this._location.row += this._location.inBuffer;
        this._location.inBuffer = 0;
        this._location.charOffset = 0;
    }
    //Keep only the part of the buffer that has not been consumed yet
    this._buffer = (this._current <= bufferEnd) ? this._buffer.substring(this._current) : "";
    this._current = 0;

    this.writeHandler();
}
|
|
|
-
|
|
|
//Advances the location counters over the buffer and reports a position.
//  startTag - truthy when locating a tag element; the scan then stops one
//             character before this._current
//this._location.charOffset is moved up to the stop position; each "\n" bumps
//inBuffer and zeroes col, "\r" is ignored and every other character bumps col.
//Returns { line, col }. line is 1-based; col gets +1 unless this is a start-tag
//lookup made with both charOffset and this._current still at 0.
Parser.prototype.getLocation = function Parser$getLocation (startTag) {
    var loc = this._location;
    var stop = this._current - (startTag ? 1 : 0);
    //Must be captured before the scan below moves charOffset
    var atChunkStart = startTag && loc.charOffset == 0 && this._current == 0;

    while (loc.charOffset < stop) {
        var ch = this._buffer.charAt(loc.charOffset);
        if (ch == '\n') {
            loc.inBuffer++;
            loc.col = 0;
        } else if (ch != '\r') {
            loc.col++;
        }
        loc.charOffset++;
    }

    var column = loc.col + (atChunkStart ? 0 : 1);
    return {
        line: loc.row + loc.inBuffer + 1
        , col: column
    };
};
|
|
|
-
|
|
|
//Checks the handler to make sure it is an object with the right "interface".
//  handler - candidate handler object
//Throws Error("Handler is not an object") for null and for non-objects, and
//Error("Handler method '<name>' is invalid") for the first required method
//(reset, done, writeTag, writeText, writeComment, writeDirective) that is not a
//function. Returns nothing when the handler is usable.
Parser.prototype.validateHandler = function Parser$validateHandler (handler) {
    //typeof null is "object", so null has to be rejected explicitly; otherwise
    //the first property read below fails with an unrelated TypeError
    if (handler === null || (typeof handler) != "object")
        throw new Error("Handler is not an object");

    var required = ["reset", "done", "writeTag", "writeText", "writeComment", "writeDirective"];
    for (var i = 0; i < required.length; i++) {
        if ((typeof handler[required[i]]) != "function")
            throw new Error("Handler method '" + required[i] + "' is invalid");
    }
};
|
|
|
-
|
|
|
//Writes parsed elements out to the handler.
//  forceFlush - when truthy, write even while a script/style/comment is still
//               open on the tag stack; otherwise such a call does nothing
//Elements are removed from the queue in order and dispatched by type: comments
//to writeComment, directives to writeDirective, text to writeText and
//everything else to writeTag.
Parser.prototype.writeHandler = function Parser$writeHandler (forceFlush) {
    forceFlush = !!forceFlush;
    if (!forceFlush && this._tagStack.length)
        return;

    while (this._elements.length) {
        var element = this._elements.shift();
        var type = element.type;
        if (type === ElementType.Comment) {
            this._handler.writeComment(element);
        } else if (type === ElementType.Directive) {
            this._handler.writeDirective(element);
        } else if (type === ElementType.Text) {
            this._handler.writeText(element);
        } else {
            this._handler.writeTag(element);
        }
    }
};
|
|
|
-
|
|
|
//Reports a parser error.
//  error - the Error to report
//The error is passed to the handler's error() method when it has one;
//otherwise it is thrown to the caller.
Parser.prototype.handleError = function Parser$handleError (error) {
    if ((typeof this._handler.error) != "function")
        throw error;
    this._handler.error(error);
};
|
|
|
-
|
|
|
//TODO: make this a truly streamable handler
//Handler that converts a parsed RSS or Atom document into a feed object (see
//RssHandler.prototype.done). It runs the parent constructor with fixed options:
//whitespace-only text ignored, non-verbose elements, empty tags not enforced.
//  callback - passed through unchanged to the parent constructor
function RssHandler (callback) {
    var rssOptions = { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false };
    RssHandler.super_.call(this, callback, rssOptions);
}
inherits(RssHandler, DefaultHandler);
|
|
|
-
|
|
|
//Converts the parsed DOM into a feed object and then runs the parent's done().
//The first top-level <rss> or <feed> element decides the format. When one is
//found, this.dom is replaced by:
//  { type: "rss"|"atom", id, title, link, description, updated, author, items: [...] }
//where each item is { id, title, link, description, pubDate }.
//Extraction is best effort: a property is simply left unset when its source
//element is missing or empty (rss feeds always get id = ""). When no feed root
//is found, this.dom is left as the parsed element list.
RssHandler.prototype.done = function RssHandler$done () {
    //Returns the first element called `name` below `parent` (direct children
    //only unless `deep` is truthy), or null when there is none
    function firstNamed (name, parent, deep) {
        if (!parent || !parent.children)
            return(null);
        var hits = DomUtils.getElementsByTagName(name, parent.children, !!deep, 1);
        return(hits.length ? hits[0] : null);
    }

    //Returns the data of the first child node of the named element, or undefined
    function textOf (name, parent, deep) {
        var el = firstNamed(name, parent, deep);
        if (!el || !el.children || !el.children.length)
            return(undefined);
        return(el.children[0].data);
    }

    //Copies the named element's text into target[key] when it is available
    function copyText (target, key, name, parent, deep) {
        var value = textOf(name, parent, deep);
        if (value !== undefined)
            target[key] = value;
    }

    //Same as copyText, but stores the text converted to a Date
    function copyDate (target, key, name, parent) {
        var value = textOf(name, parent, false);
        if (value !== undefined)
            target[key] = new Date(value);
    }

    //Copies the href attribute of the named element into target[key]
    function copyHref (target, key, name, parent) {
        var el = firstNamed(name, parent, false);
        if (el && el.attribs)
            target[key] = el.attribs.href;
    }

    var feed = { };
    var roots = DomUtils.getElementsByTagName(function (value) { return(value == "rss" || value == "feed"); }, this.dom, false);
    var feedRoot = roots.length ? roots[0] : null;

    if (feedRoot) {
        if (feedRoot.name == "rss") {
            feed.type = "rss";
            //Look <channel/> up by name so a leading comment or directive cannot be
            //mistaken for it; fall back to the first child, and tolerate an empty <rss/>
            var channel = firstNamed("channel", feedRoot, false)
                || (feedRoot.children && feedRoot.children[0])
                || null;
            feed.id = "";
            copyText(feed, "title", "title", channel, false);
            copyText(feed, "link", "link", channel, false);
            copyText(feed, "description", "description", channel, false);
            copyDate(feed, "updated", "lastBuildDate", channel);
            copyText(feed, "author", "managingEditor", channel, false);
            feed.items = [];
            DomUtils.getElementsByTagName("item", channel ? channel.children : null).forEach(function (item) {
                var entry = {};
                copyText(entry, "id", "guid", item, false);
                copyText(entry, "title", "title", item, false);
                copyText(entry, "link", "link", item, false);
                copyText(entry, "description", "description", item, false);
                copyDate(entry, "pubDate", "pubDate", item);
                feed.items.push(entry);
            });
        } else {
            feed.type = "atom";
            copyText(feed, "id", "id", feedRoot, false);
            copyText(feed, "title", "title", feedRoot, false);
            copyHref(feed, "link", "link", feedRoot);
            copyText(feed, "description", "subtitle", feedRoot, false);
            copyDate(feed, "updated", "updated", feedRoot);
            copyText(feed, "author", "email", feedRoot, true); //searched at any depth below the feed root
            feed.items = [];
            DomUtils.getElementsByTagName("entry", feedRoot.children).forEach(function (item) {
                var entry = {};
                copyText(entry, "id", "id", item, false);
                copyText(entry, "title", "title", item, false);
                copyHref(entry, "link", "link", item);
                copyText(entry, "description", "summary", item, false);
                copyDate(entry, "pubDate", "updated", item);
                feed.items.push(entry);
            });
        }

        this.dom = feed;
    }
    RssHandler.super_.prototype.done.call(this);
};
|
|
|
-
|
|
|
-///////////////////////////////////////////////////
|
|
|
-
|
|
|
//Handler that assembles parsed elements into a tree stored in this.dom.
//  callback - optional function; only stored (as this._callback) when it
//             really is a function
//  options  - optional settings object, used as-is (not copied); any of these
//             that is null/undefined is filled in:
//               ignoreWhitespace (false), verbose (true), enforceEmptyTags (true)
function DefaultHandler (callback, options) {
    this.reset();

    var settings = options || { };
    var defaults = {
        ignoreWhitespace: false  //Keep whitespace-only text nodes
        , verbose: true          //Keep data property for tags and raw property for all
        , enforceEmptyTags: true //Don't allow children for HTML tags defined as empty in spec
    };
    for (var key in defaults) {
        if (settings[key] == undefined)
            settings[key] = defaults[key];
    }
    this._options = settings;

    if ((typeof callback) == "function")
        this._callback = callback;
}
|
|
|
-
|
|
|
//**"Static"**//
//HTML Tags that shouldn't contain child nodes.
//Keys are lower-case tag names; isEmptyTag() looks names up in this table.
DefaultHandler._emptyTags = {
    area: 1,
    base: 1,
    basefont: 1,
    br: 1,
    col: 1,
    frame: 1,
    hr: 1,
    img: 1,
    input: 1,
    isindex: 1,
    link: 1,
    meta: 1,
    param: 1,
    embed: 1
};
//Regex to detect whitespace only text nodes (also matches the empty string)
DefaultHandler.reWhitespace = /^\s*$/;
|
|
|
-
|
|
|
//**Public**//
//Properties//
DefaultHandler.prototype.dom = null; //The hierarchical object containing the parsed HTML
//Methods//
//Resets the handler back to starting state: an empty dom, the done flag
//cleared and a fresh tag stack. The stack is a plain array with an extra
//last() method that returns its top element, or null when it is empty.
DefaultHandler.prototype.reset = function DefaultHandler$reset() {
    var stack = [];
    stack.last = function DefaultHandler$_tagStack$last () {
        if (!this.length)
            return(null);
        return(this[this.length - 1]);
    };

    this.dom = [];
    this._done = false;
    this._tagStack = stack;
};
|
|
|
//Signals the handler that parsing is done; the callback (if any) is invoked
//with a null error
DefaultHandler.prototype.done = function DefaultHandler$done () {
    this._done = true;
    this.handleCallback(null);
};
//Receives a tag, script or style element from the parser
DefaultHandler.prototype.writeTag = function DefaultHandler$writeTag (element) {
    this.handleElement(element);
};
//Receives a text element; whitespace-only text is dropped when the
//ignoreWhitespace option is set
DefaultHandler.prototype.writeText = function DefaultHandler$writeText (element) {
    var skip = this._options.ignoreWhitespace && DefaultHandler.reWhitespace.test(element.data);
    if (skip)
        return;
    this.handleElement(element);
};
//Receives a comment element from the parser
DefaultHandler.prototype.writeComment = function DefaultHandler$writeComment (element) {
    this.handleElement(element);
};
//Receives a directive element from the parser
DefaultHandler.prototype.writeDirective = function DefaultHandler$writeDirective (element) {
    this.handleElement(element);
};
//Receives a parser error and forwards it to handleCallback()
DefaultHandler.prototype.error = function DefaultHandler$error (error) {
    this.handleCallback(error);
};
|
|
|
-
|
|
|
//**Private**//
//Properties//
//Prototype-level defaults; the constructor and reset() assign the per-instance values
DefaultHandler.prototype._options = null; //Handler options for how to behave
DefaultHandler.prototype._callback = null; //Callback to respond to when parsing done
DefaultHandler.prototype._done = false; //Flag indicating whether handler has been notified of parsing completed
DefaultHandler.prototype._tagStack = null; //List of parents to the element currently being processed
|
|
|
- //Methods//
|
|
|
//Reports the outcome of parsing.
//  error - an Error, or null when parsing finished normally
//With a callback registered it is called as callback(error, dom). Without
//one, an error is thrown to the caller and a successful finish is ignored.
DefaultHandler.prototype.handleCallback = function DefaultHandler$handleCallback (error) {
    if ((typeof this._callback) == "function") {
        this._callback(error, this.dom);
        return;
    }
    if (error)
        throw error;
};
|
|
|
-
|
|
|
//Tells whether an element is one of the HTML tags that cannot have children.
//  element - a tag element; its name may be in any case and may carry the
//            leading "/" of a closing tag
//Returns true when the enforceEmptyTags option is on and the name is listed in
//DefaultHandler._emptyTags. When the option is off, the (falsy) option value
//itself is returned.
DefaultHandler.prototype.isEmptyTag = function(element) {
    var name = element.name.toLowerCase();
    if (name.charAt(0) == '/') {
        name = name.substring(1);
    }
    //Only own entries count: a plain property read also finds members inherited
    //from Object.prototype, which made a tag named "constructor" look empty
    var listed = Object.prototype.hasOwnProperty.call(DefaultHandler._emptyTags, name)
        && !!DefaultHandler._emptyTags[name];
    return this._options.enforceEmptyTags && listed;
};
|
|
|
-
|
|
|
//Places one parsed element into the tree.
//  element - element object produced by the parser
//Text, comments and directives are appended to the current parent (or to the
//top-level list when nothing is open). An opening tag is appended the same way
//and becomes the new parent unless isEmptyTag() says it cannot have children.
//A closing tag is never stored: it unwinds the stack of open tags down to (and
//including) the nearest open tag with the same name, and is ignored when there
//is none. With the verbose option off, `raw` is removed from every element and
//`data` from tag/script/style elements.
//A write after done() is reported through handleCallback(); the element is
//still processed afterwards.
DefaultHandler.prototype.handleElement = function DefaultHandler$handleElement (element) {
    if (this._done)
        this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()"));

    if (!this._options.verbose) {
        //FIXME: Serious performance problem using delete
        delete element.raw;
        if (element.type == "tag" || element.type == "script" || element.type == "style")
            delete element.data;
    }

    var stack = this._tagStack;
    var parent = stack.last(); //null when no tag is currently open
    var isContainer = element.type != ElementType.Text
        && element.type != ElementType.Comment
        && element.type != ElementType.Directive;

    //Text, comments and directives can never have children
    if (!isContainer) {
        if (!parent) {
            this.dom.push(element);
        } else {
            if (!parent.children)
                parent.children = [];
            parent.children.push(element);
        }
        return;
    }

    var isClosing = element.name.charAt(0) == "/";

    if (!parent) { //There are no parent elements
        if (!isClosing) { //Ignore closing tags that obviously don't have an opening tag
            this.dom.push(element);
            if (!this.isEmptyTag(element)) //Don't add tags to the tag stack that can't have children
                stack.push(element);
        }
        return;
    }

    if (isClosing) {
        if (this.isEmptyTag(element))
            return;
        //Scan the stack from the top for the matching opening tag and pop
        //everything from there up, leaving the opening tag's parent on top
        var baseName = element.name.substring(1);
        var idx = stack.length - 1;
        while (idx > -1 && stack[idx].name != baseName)
            idx--;
        if (idx > -1) {
            while (stack.length > idx)
                stack.pop();
        }
        return;
    }

    //An opening tag below an open parent
    if (!parent.children)
        parent.children = [];
    parent.children.push(element);
    if (!this.isEmptyTag(element)) //Don't add tags to the tag stack that can't have children
        stack.push(element);
};
|
|
|
-
|
|
|
//Helpers for searching the element tree produced by the handlers
var DomUtils = {
    //Checks a single element against a set of tests.
    //  options - object whose values are all functions returning truthy/falsy.
    //            Special keys: tag_name (tested against element.name, only for
    //            tag/script/style elements), tag_type (tested against
    //            element.type) and tag_contains (tested against element.data,
    //            only for text/comment/directive elements). Any other key is
    //            tested against the attribute of the same name.
    //  element - the element to test
    //Returns true only when every test passes; false for a missing element.
    testElement: function DomUtils$testElement (options, element) {
        if (!element) {
            return false;
        }

        for (var key in options) {
            if (key == "tag_name") {
                if (element.type != "tag" && element.type != "script" && element.type != "style") {
                    return false;
                }
                if (!options["tag_name"](element.name)) {
                    return false;
                }
            } else if (key == "tag_type") {
                if (!options["tag_type"](element.type)) {
                    return false;
                }
            } else if (key == "tag_contains") {
                if (element.type != "text" && element.type != "comment" && element.type != "directive") {
                    return false;
                }
                if (!options["tag_contains"](element.data)) {
                    return false;
                }
            } else {
                //Attribute test: elements without attributes can never match
                if (!element.attribs || !options[key](element.attribs[key])) {
                    return false;
                }
            }
        }

        return true;
    }

    //Collects, in document order, the elements that pass testElement().
    //  options        - tests as for testElement(); a value that is not a
    //                   function is compared with == instead. The object that is
    //                   passed in is not modified.
    //  currentElement - an element or an array of elements to search
    //  recurse        - search below children as well (default true). An array
    //                   is always iterated, even with recurse off.
    //  limit          - maximum number of results; negative or non-numeric means
    //                   no limit
    //Returns an array of matching elements ([] when currentElement is falsy).
    , getElements: function DomUtils$getElements (options, currentElement, recurse, limit) {
        recurse = (recurse === undefined || recurse === null) || !!recurse;
        limit = parseInt(limit, 10);
        if (isNaN(limit)) {
            limit = -1;
        }

        if (!currentElement) {
            return([]);
        }

        function getTest (checkVal) {
            return(function (value) { return(value == checkVal); });
        }

        //Normalize into a private copy so the caller's object keeps its plain values
        var tests = {};
        for (var key in options) {
            tests[key] = ((typeof options[key]) == "function") ? options[key] : getTest(options[key]);
        }

        var found = [];

        function limitReached () {
            return(limit >= 0 && found.length >= limit);
        }

        //All levels share one result list, so the limit applies to the total
        //number of results rather than to each subtree separately
        function collect (node) {
            if (!node) {
                return;
            }
            if (DomUtils.testElement(tests, node)) {
                found.push(node);
            }
            if (limitReached()) {
                return;
            }

            var elementList;
            if (recurse && node.children) {
                elementList = node.children;
            } else if (node instanceof Array) {
                elementList = node;
            } else {
                return;
            }

            for (var i = 0; i < elementList.length; i++) {
                collect(elementList[i]);
                if (limitReached()) {
                    break;
                }
            }
        }

        collect(currentElement);
        return(found);
    }

    //Returns the first element whose id attribute equals `id`, or null
    , getElementById: function DomUtils$getElementById (id, currentElement, recurse) {
        var result = DomUtils.getElements({ id: id }, currentElement, recurse, 1);
        return(result.length ? result[0] : null);
    }

    //Returns the tag/script/style elements whose name matches `name`
    //(a value compared with ==, or a test function)
    , getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement, recurse, limit) {
        return(DomUtils.getElements({ tag_name: name }, currentElement, recurse, limit));
    }

    //Returns the elements whose type matches `type`
    //(a value compared with ==, or a test function)
    , getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement, recurse, limit) {
        return(DomUtils.getElements({ tag_type: type }, currentElement, recurse, limit));
    }
};
|
|
|
-
|
|
|
//Makes `ctor` inherit from `superCtor`.
//ctor.prototype is replaced by a fresh object whose prototype is
//superCtor.prototype (superCtor itself is never run), ctor.super_ is set to the
//parent constructor and the new prototype's `constructor` points back at ctor.
function inherits (ctor, superCtor) {
    var proto = Object.create(superCtor.prototype);
    proto.constructor = ctor;
    ctor.super_ = superCtor;
    ctor.prototype = proto;
}
|
|
|
-
|
|
|
//Public API. `exports` is either the module's exports object or, when
//runningInNode() reported false, the Tautologistics.NodeHtmlParser namespace
//object assigned near the top of this file.
exports.Parser = Parser;

exports.DefaultHandler = DefaultHandler;

exports.RssHandler = RssHandler;

exports.ElementType = ElementType;

exports.DomUtils = DomUtils;
|
|
|
-
|
|
|
-})();
|