// node-htmlparser.old.js (28 KB)
  1. /***********************************************
  2. Copyright 2010, Chris Winberry <chris@winberry.net>. All rights reserved.
  3. Permission is hereby granted, free of charge, to any person obtaining a copy
  4. of this software and associated documentation files (the "Software"), to
  5. deal in the Software without restriction, including without limitation the
  6. rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  7. sell copies of the Software, and to permit persons to whom the Software is
  8. furnished to do so, subject to the following conditions:
  9. The above copyright notice and this permission notice shall be included in
  10. all copies or substantial portions of the Software.
  11. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  12. IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  13. FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  14. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  15. LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  16. FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  17. IN THE SOFTWARE.
  18. ***********************************************/
  19. /* v1.5.0 */
  20. (function () {
  //Reports whether the script is executing under Node by checking that the
  //CommonJS module-scope names (require, exports, module, __filename and
  //__dirname) all have the expected types
  function runningInNode () {
    if ((typeof require) != "function")
      return(false);
    if ((typeof exports) != "object")
      return(false);
    if ((typeof module) != "object")
      return(false);
    if ((typeof __filename) != "string")
      return(false);
    return((typeof __dirname) == "string");
  }
  34. if (!runningInNode()) {
  35. if (!this.Tautologistics)
  36. this.Tautologistics = {};
  37. else if (this.Tautologistics.NodeHtmlParser)
  38. return; //NodeHtmlParser already defined!
  39. this.Tautologistics.NodeHtmlParser = {};
  40. exports = this.Tautologistics.NodeHtmlParser;
  41. }
  //Types of elements found in the DOM
  var ElementType = {
    Text: "text",           //Plain text
    Directive: "directive", //Special tag <!...>
    Comment: "comment",     //Special tag <!--...-->
    Script: "script",       //Special tag <script>...</script>
    Style: "style",         //Special tag <style>...</style>
    Tag: "tag"              //Any tag that isn't special
  };
  //Creates a parser bound to the given handler. The handler is checked with
  //validateHandler() first, then stored, and finally reset() puts both the
  //parser and the handler into their starting state.
  function Parser (handler) {
    this.validateHandler(handler);

    this._handler = handler;

    this.reset();
  }
  56. //**"Static"**//
  //Regular expressions used for cleaning up and parsing (stateless)

  //Trim leading/trailing whitespace
  Parser._reTrim = /(^\s+|\s+$)/g;
  //Remove comment tag markup from comment contents
  Parser._reTrimComment = /(^\!--|--$)/g;
  //Used to find any whitespace to split on
  Parser._reWhitespace = /\s/g;
  //Used to find the tag name for an element
  Parser._reTagName = /^\s*(\/?)\s*([^\s\/]+)/;

  //Regular expressions used for parsing (stateful)

  //Find attributes in a tag
  Parser._reAttrib =
    /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;
  //Find tag markers
  Parser._reTags = /[\<\>]/g;
  66. //**Public**//
  67. //Methods//
  //Parses a complete HTML document in one go: resets the parser, feeds the
  //whole string through parseChunk() and then signals completion with done()
  Parser.prototype.parseComplete = function Parser$parseComplete (data) {
    this.reset();

    this.parseChunk(data);

    this.done();
  };
  //Parses a piece of an HTML document: the chunk is appended to the buffer of
  //unparsed data and parseTags() is run over the buffer.
  //Once done() has been called no further chunks are accepted; the attempt is
  //reported through handleError() and the chunk is dropped, so nothing more is
  //written to a handler that was already told parsing had finished.
  Parser.prototype.parseChunk = function Parser$parseChunk (data) {
    if (this._done) {
      this.handleError(new Error("Attempted to parse chunk after parsing already done"));
      return;
    }
    this._buffer += data; //FIXME: this can be a bottleneck
    this.parseTags();
  };
  //Tells the parser that the HTML being parsed is complete. Calling it a
  //second time (without a reset) does nothing.
  Parser.prototype.done = function Parser$done () {
    if (this._done) {
      return;
    }
    this._done = true;

    //Whatever is still buffered becomes one last element of the current parse state
    var leftover = this._buffer;
    if (leftover.length) {
      this._buffer = "";
      var state = this._parseState;
      var cleaned = leftover;
      if (state != ElementType.Text) {
        cleaned = leftover.replace(Parser._reTrim, "");
      }
      var element = { raw: leftover, data: cleaned, type: state };
      var isTagLike = (state == ElementType.Tag || state == ElementType.Script || state == ElementType.Style);
      if (isTagLike) {
        element.name = this.parseTagName(element.data);
      }
      this.parseAttribs(element);
      this._elements.push(element);
    }

    this.writeHandler();
    this._handler.done();
  };
  //Resets the parser to a blank state, ready to parse a new HTML document.
  //The attached handler is reset as well.
  Parser.prototype.reset = function Parser$reset () {
    //Input buffer and completion flag
    this._buffer = "";
    this._done = false;

    //Parsed-element queue
    this._elements = [];
    this._elementsCurrent = 0;

    //Scan positions and scanner state
    this._current = 0;
    this._next = 0;
    this._parseState = ElementType.Text;
    this._prevTagSep = '';
    this._tagStack = [];

    this._handler.reset();
  };
  //**Private**//
  //Properties//

  //Handler for parsed elements
  Parser.prototype._handler = null;
  //Buffer of unparsed data
  Parser.prototype._buffer = null;
  //Flag indicating whether parsing is done
  Parser.prototype._done = false;
  //Array of parsed elements
  Parser.prototype._elements = null;
  //Pointer to last element in _elements that has been processed
  Parser.prototype._elementsCurrent = 0;
  //Position in data that has already been parsed
  Parser.prototype._current = 0;
  //Position in data of the next tag marker (<>)
  Parser.prototype._next = 0;
  //Current type of element being parsed
  Parser.prototype._parseState = ElementType.Text;
  //Previous tag marker found
  Parser.prototype._prevTagSep = '';
  //Stack of element types previously encountered; keeps track of when
  //parsing occurs inside a script/comment/style tag
  Parser.prototype._tagStack = null;
  130. //Methods//
  //Takes an array of elements and parses the attributes of every tag, script
  //and style element in it (via parseAttribs). Other element types are left
  //untouched. Returns the same array that was passed in.
  Parser.prototype.parseTagAttribs = function Parser$parseTagAttribs (elements) {
    for (var idx = 0; idx < elements.length; idx++) {
      var element = elements[idx];
      //Note: the key is "Style"; a lower-case "style" key does not exist on ElementType
      if (element.type == ElementType.Tag || element.type == ElementType.Script || element.type == ElementType.Style)
        this.parseAttribs(element);
    }
    return(elements);
  };
  //Takes an element and, when it is a tag/script/style element whose data has
  //something after the tag name, adds an "attribs" object holding every
  //attribute found. Four attribute forms are recognised by Parser._reAttrib:
  //name="value", name='value', name=value and a bare name (stored with the
  //name as its own value).
  Parser.prototype.parseAttribs = function Parser$parseAttribs (element) {
    var type = element.type;
    var isTagLike = (type == ElementType.Script || type == ElementType.Style || type == ElementType.Tag);
    if (!isTagLike) {
      return;
    }

    //Everything after the first whitespace-delimited token is attribute text
    var tagName = element.data.split(Parser._reWhitespace, 1)[0];
    var attribRaw = element.data.substring(tagName.length);
    if (attribRaw.length < 1) {
      return;
    }

    var reAttrib = Parser._reAttrib;
    reAttrib.lastIndex = 0; //The regex is global, so rewind it before scanning
    for (var match = reAttrib.exec(attribRaw); match; match = reAttrib.exec(attribRaw)) {
      if (element.attribs == undefined) {
        element.attribs = {};
      }
      var name;
      var value;
      if (typeof match[1] == "string" && match[1].length) { //name="value"
        name = match[1];
        value = match[2];
      } else if (typeof match[3] == "string" && match[3].length) { //name='value'
        name = match[3].toString();
        value = match[4].toString();
      } else if (typeof match[5] == "string" && match[5].length) { //name=value
        name = match[5];
        value = match[6];
      } else if (typeof match[7] == "string" && match[7].length) { //bare name
        name = match[7];
        value = match[7];
      } else {
        continue;
      }
      element.attribs[name] = value;
    }
  };
  //Extracts the base tag name from the data value of an element. A closing
  //tag keeps its leading "/" (with any whitespace after the slash removed);
  //an empty string is returned when no name can be found.
  Parser.prototype.parseTagName = function Parser$parseTagName (data) {
    var hasData = !(data == null || data == "");
    var match = hasData ? Parser._reTagName.exec(data) : null;
    if (match) {
      var slash = match[1] ? "/" : "";
      return(slash + match[2]);
    }
    return("");
  };
  //Parses through the buffered HTML and queues every element found on
  //this._elements, then hands the queue to writeHandler().
  //The buffer is scanned for tag markers ("<" and ">"); the text between two
  //consecutive markers becomes one element whose type is the current parse
  //state (text after ">", tag after "<"). While this._tagStack is non-empty
  //the parser is inside a script/style/comment area and chunks are folded
  //into text/comment elements instead of being treated as markup.
  //Anything after the last marker stays in the buffer for the next chunk.
  //I admit, this function is rather large but splitting up had an noticeable impact on speed
  Parser.prototype.parseTags = function Parser$parseTags () {
    var bufferEnd = this._buffer.length - 1;
    var prevElement; //Most recently queued element (or null); looked up once per chunk inside special areas
    var rawLen;

    //The marker regex is shared between parser instances and keeps its scan
    //position in lastIndex; rewind it so a scan left unfinished elsewhere
    //(e.g. by an exception) cannot make this one start mid-buffer
    Parser._reTags.lastIndex = 0;

    while (Parser._reTags.test(this._buffer)) {
      this._next = Parser._reTags.lastIndex - 1;
      var tagSep = this._buffer.charAt(this._next); //The currently found tag marker
      var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse

      //A new element to eventually be appended to the element list
      var element = {
        raw: rawData,
        data: (this._parseState == ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, ""),
        type: this._parseState
      };

      var elementName = this.parseTagName(element.data);

      //This section inspects the current tag stack and modifies the current
      //element if we're actually parsing a special area (script/comment/style tag)
      if (this._tagStack.length) {
        var specialType = this._tagStack[this._tagStack.length - 1];
        prevElement = this._elements.length ? this._elements[this._elements.length - 1] : null;

        if (specialType == ElementType.Script) { //We're currently in a script tag
          if (elementName == "/script") //Actually, we're no longer in a script tag, so pop it off the stack
            this._tagStack.pop();
          else if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
            //All data from here to script close is now a text element
            element.type = ElementType.Text;
            //If the previous element is text, append the current text to it
            if (prevElement && prevElement.type == ElementType.Text) {
              prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
              element.raw = element.data = ""; //This causes the current element to not be added to the element list
            }
          }
        }
        else if (specialType == ElementType.Style) { //We're currently in a style tag
          if (elementName == "/style") //Actually, we're no longer in a style tag, so pop it off the stack
            this._tagStack.pop();
          else if (element.raw.indexOf("!--") != 0) { //Make sure we're not in a comment
            //All data from here to style close is now a text element
            element.type = ElementType.Text;
            if (prevElement && prevElement.type == ElementType.Text) {
              //Append the last tag marker plus the current text (which may be
              //empty when two markers are adjacent) to the previous text element
              prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
              element.raw = element.data = ""; //This causes the current element to not be added to the element list
            }
            else //The previous element was not text; keep the untrimmed text as the data
              element.data = element.raw;
          }
        }
        else if (specialType == ElementType.Comment) { //We're currently in a comment tag
          rawLen = element.raw.length;
          if (element.raw.charAt(rawLen - 2) == "-" && element.raw.charAt(rawLen - 1) == "-" && tagSep == ">") {
            //Actually, we're no longer in a comment tag, so pop it off the stack
            this._tagStack.pop();
            //If the previous element is a comment, append the current text to it
            if (prevElement && prevElement.type == ElementType.Comment) {
              prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(Parser._reTrimComment, "");
              element.raw = element.data = ""; //This causes the current element to not be added to the element list
              element.type = ElementType.Text;
            }
            else //Previous element not a comment
              element.type = ElementType.Comment; //Change the current element's type to a comment
          }
          else { //Still in a comment tag
            element.type = ElementType.Comment;
            //If the previous element is a comment, append the current text to it
            if (prevElement && prevElement.type == ElementType.Comment) {
              prevElement.raw = prevElement.data = prevElement.raw + element.raw + tagSep;
              element.raw = element.data = ""; //This causes the current element to not be added to the element list
              element.type = ElementType.Text;
            }
            else
              element.raw = element.data = element.raw + tagSep;
          }
        }
      }

      //Processing of non-special tags
      if (element.type == ElementType.Tag) {
        element.name = elementName;

        if (element.raw.indexOf("!--") == 0) { //This tag is really comment
          element.type = ElementType.Comment;
          delete element["name"];
          rawLen = element.raw.length;
          //Check if the comment is terminated in the current element
          if (element.raw.charAt(rawLen - 1) == "-" && element.raw.charAt(rawLen - 2) == "-" && tagSep == ">")
            element.raw = element.data = element.raw.replace(Parser._reTrimComment, "");
          else { //It's not so push the comment onto the tag stack
            element.raw += tagSep;
            this._tagStack.push(ElementType.Comment);
          }
        }
        else if (element.raw.indexOf("!") == 0 || element.raw.indexOf("?") == 0) {
          element.type = ElementType.Directive;
          //TODO: what about CDATA?
        }
        else if (element.name == "script") {
          element.type = ElementType.Script;
          //Special tag, push onto the tag stack if not terminated
          if (element.data.charAt(element.data.length - 1) != "/")
            this._tagStack.push(ElementType.Script);
        }
        else if (element.name == "/script")
          element.type = ElementType.Script;
        else if (element.name == "style") {
          element.type = ElementType.Style;
          //Special tag, push onto the tag stack if not terminated
          if (element.data.charAt(element.data.length - 1) != "/")
            this._tagStack.push(ElementType.Style);
        }
        else if (element.name == "/style")
          element.type = ElementType.Style;

        //Closing tags carry nothing but their name
        if (element.name && element.name.charAt(0) == "/")
          element.data = element.name;
      }

      //Add all tags and non-empty text elements to the element list
      if (element.raw != "" || element.type != ElementType.Text) {
        this.parseAttribs(element);
        this._elements.push(element);
        //If tag self-terminates, add an explicit, separate closing tag
        if (
          element.type != ElementType.Text
          &&
          element.type != ElementType.Comment
          &&
          element.type != ElementType.Directive
          &&
          element.data.charAt(element.data.length - 1) == "/"
          )
          this._elements.push({
            raw: "/" + element.name,
            data: "/" + element.name,
            name: "/" + element.name,
            type: element.type
          });
      }

      //A "<" means the next chunk is tag content, a ">" means it is text
      this._parseState = (tagSep == "<") ? ElementType.Tag : ElementType.Text;
      this._current = this._next + 1;
      this._prevTagSep = tagSep;
    }

    //Keep only the part of the buffer that has not been consumed yet
    this._buffer = (this._current <= bufferEnd) ? this._buffer.substring(this._current) : "";
    this._current = 0;

    this.writeHandler();
  };
  //Checks the handler to make sure it is an object with the right "interface":
  //the methods reset, done, writeTag, writeText, writeComment and
  //writeDirective must all be functions.
  //Throws an Error naming the first problem found. null is rejected as "not
  //an object" as well (typeof null is "object", so it needs its own check).
  Parser.prototype.validateHandler = function Parser$validateHandler (handler) {
    if ((typeof handler) != "object" || handler === null)
      throw new Error("Handler is not an object");
    var required = ["reset", "done", "writeTag", "writeText", "writeComment", "writeDirective"];
    for (var i = 0; i < required.length; i++) {
      if ((typeof handler[required[i]]) != "function")
        throw new Error("Handler method '" + required[i] + "' is invalid");
    }
  };
  //Writes parsed elements out to the handler, emptying the element queue.
  //While the parser is inside a script/style/comment area (non-empty tag
  //stack) nothing is written unless forceFlush is truthy.
  Parser.prototype.writeHandler = function Parser$writeHandler (forceFlush) {
    forceFlush = !!forceFlush;
    var insideSpecialArea = this._tagStack.length;
    if (insideSpecialArea && !forceFlush) {
      return;
    }
    while (this._elements.length) {
      var element = this._elements.shift();
      var type = element.type;
      if (type === ElementType.Comment) {
        this._handler.writeComment(element);
      } else if (type === ElementType.Directive) {
        this._handler.writeDirective(element);
      } else if (type === ElementType.Text) {
        this._handler.writeText(element);
      } else {
        this._handler.writeTag(element);
      }
    }
  };
  //Reports an error: it is passed to the handler's error() method when the
  //handler has one, otherwise it is thrown
  Parser.prototype.handleError = function Parser$handleError (error) {
    var handlerAcceptsErrors = ((typeof this._handler.error) == "function");
    if (!handlerAcceptsErrors) {
      throw error;
    }
    this._handler.error(error);
  };
  //TODO: make this a truly streamable handler
  //Handler for RSS/Atom feeds. It is set up through its super constructor
  //with whitespace-only text ignored, non-verbose elements and no
  //empty-tag enforcement.
  function RssHandler (callback) {
    var feedOptions = {
      ignoreWhitespace: true,
      verbose: false,
      enforceEmptyTags: false
    };
    RssHandler.super_.call(this, callback, feedOptions);
  }
  inherits(RssHandler, DefaultHandler);
  //Signals that parsing is done. When the parsed DOM has a top-level <rss> or
  //<feed> element, this.dom is replaced by a plain feed object:
  //  { type, id, title, link, description, updated, author, items: [...] }
  //where each item is { id, title, link, description, pubDate }. Properties
  //whose source element is missing are simply left off. Afterwards the super
  //class done() is invoked.
  RssHandler.prototype.done = function RssHandler$done () {
    //First element called `name` in `elements` (undefined when there is none)
    function findFirst (name, elements, recurse) {
      return(DomUtils.getElementsByTagName(name, elements, recurse)[0]);
    }
    //Data of the first child node of the first `name` element (undefined when missing)
    function textOf (name, elements, recurse) {
      var match = findFirst(name, elements, recurse);
      var firstChild = match && match.children && match.children[0];
      return(firstChild ? firstChild.data : undefined);
    }
    //href attribute of the first `name` element (undefined when missing)
    function hrefOf (name, elements) {
      var match = findFirst(name, elements, false);
      return((match && match.attribs) ? match.attribs.href : undefined);
    }
    //Sets target[key] only when a value was actually found
    function put (target, key, value) {
      if (value !== undefined)
        target[key] = value;
    }
    //Same as put() but stores the value converted to a Date
    function putDate (target, key, value) {
      if (value !== undefined)
        target[key] = new Date(value);
    }

    var feed = { };
    var found = DomUtils.getElementsByTagName(function (value) { return(value == "rss" || value == "feed"); }, this.dom, false);
    var feedRoot = found.length ? found[0] : undefined;

    if (feedRoot) {
      if (feedRoot.name == "rss") {
        //The feed data lives in <channel/>. Look it up by name; fall back to
        //the first child, and cope with an <rss> element that has no children
        var channel = findFirst("channel", feedRoot.children, false)
          || (feedRoot.children ? feedRoot.children[0] : undefined);
        var channelNodes = channel ? channel.children : undefined;

        feed.type = "rss";
        feed.id = "";
        put(feed, "title", textOf("title", channelNodes, false));
        put(feed, "link", textOf("link", channelNodes, false));
        put(feed, "description", textOf("description", channelNodes, false));
        putDate(feed, "updated", textOf("lastBuildDate", channelNodes, false));
        put(feed, "author", textOf("managingEditor", channelNodes, false));
        feed.items = DomUtils.getElementsByTagName("item", channelNodes).map(function (item) {
          var entry = {};
          put(entry, "id", textOf("guid", item.children, false));
          put(entry, "title", textOf("title", item.children, false));
          put(entry, "link", textOf("link", item.children, false));
          put(entry, "description", textOf("description", item.children, false));
          putDate(entry, "pubDate", textOf("pubDate", item.children, false));
          return(entry);
        });
      } else {
        var feedNodes = feedRoot.children;

        feed.type = "atom";
        put(feed, "id", textOf("id", feedNodes, false));
        put(feed, "title", textOf("title", feedNodes, false));
        put(feed, "link", hrefOf("link", feedNodes));
        put(feed, "description", textOf("subtitle", feedNodes, false));
        putDate(feed, "updated", textOf("updated", feedNodes, false));
        put(feed, "author", textOf("email", feedNodes, true)); //The email sits below <author>, hence the recursive search
        feed.items = DomUtils.getElementsByTagName("entry", feedNodes).map(function (item) {
          var entry = {};
          put(entry, "id", textOf("id", item.children, false));
          put(entry, "title", textOf("title", item.children, false));
          put(entry, "link", hrefOf("link", item.children));
          put(entry, "description", textOf("summary", item.children, false));
          putDate(entry, "pubDate", textOf("updated", item.children, false));
          return(entry);
        });
      }

      this.dom = feed;
    }
    RssHandler.super_.prototype.done.call(this);
  };
  474. ///////////////////////////////////////////////////
  //Creates a handler that builds a DOM from parser events.
  //callback: optional function stored as the completion/error callback
  //options:  optional settings object; any of ignoreWhitespace, verbose and
  //          enforceEmptyTags that is missing gets its default filled in on
  //          the object itself
  function DefaultHandler (callback, options) {
    this.reset();
    var settings = options ? options : { };
    this._options = settings;
    if (settings.ignoreWhitespace == undefined) {
      settings.ignoreWhitespace = false; //Keep whitespace-only text nodes
    }
    if (settings.verbose == undefined) {
      settings.verbose = true; //Keep data property for tags and raw property for all
    }
    if (settings.enforceEmptyTags == undefined) {
      settings.enforceEmptyTags = true; //Don't allow children for HTML tags defined as empty in spec
    }
    if ((typeof callback) == "function") {
      this._callback = callback;
    }
  }
  487. //**"Static"**//
  //HTML Tags that shouldn't contain child nodes
  DefaultHandler._emptyTags = {
    area: 1,
    base: 1,
    basefont: 1,
    br: 1,
    col: 1,
    frame: 1,
    hr: 1,
    img: 1,
    input: 1,
    isindex: 1,
    link: 1,
    meta: 1,
    param: 1,
    embed: 1
  };

  //Regex to detect whitespace only text nodes
  DefaultHandler.reWhitespace = /^\s*$/;

  //**Public**//
  //Properties//

  //The hierarchical object containing the parsed HTML
  DefaultHandler.prototype.dom = null;
  510. //Methods//
  //Resets the handler back to starting state: an empty DOM, the done flag
  //cleared and an empty tag stack. The stack gets a last() helper that
  //returns its top entry, or null when it is empty.
  DefaultHandler.prototype.reset = function DefaultHandler$reset() {
    var stack = [];
    stack.last = function DefaultHandler$_tagStack$last () {
      if (this.length) {
        return(this[this.length - 1]);
      }
      return(null);
    };

    this.dom = [];
    this._done = false;
    this._tagStack = stack;
  };
  //Signals the handler that parsing is done: the done flag is raised and the
  //callback handling is triggered without an error
  DefaultHandler.prototype.done = function DefaultHandler$done () {
    this._done = true;

    this.handleCallback(null);
  };
  //Receives a tag (including script/style) element from the parser
  DefaultHandler.prototype.writeTag = function DefaultHandler$writeTag (element) {
    this.handleElement(element);
  };

  //Receives a text element from the parser. Whitespace-only text is dropped
  //when the ignoreWhitespace option is set.
  DefaultHandler.prototype.writeText = function DefaultHandler$writeText (element) {
    var skip = this._options.ignoreWhitespace && DefaultHandler.reWhitespace.test(element.data);
    if (!skip) {
      this.handleElement(element);
    }
  };

  //Receives a comment element from the parser
  DefaultHandler.prototype.writeComment = function DefaultHandler$writeComment (element) {
    this.handleElement(element);
  };

  //Receives a directive element from the parser
  DefaultHandler.prototype.writeDirective = function DefaultHandler$writeDirective (element) {
    this.handleElement(element);
  };

  //Receives an error and forwards it to the callback handling
  DefaultHandler.prototype.error = function DefaultHandler$error (error) {
    this.handleCallback(error);
  };
  //**Private**//
  //Properties//

  //Handler options for how to behave
  DefaultHandler.prototype._options = null;
  //Callback to respond to when parsing done
  DefaultHandler.prototype._callback = null;
  //Flag indicating whether handler has been notified of parsing completed
  DefaultHandler.prototype._done = false;
  //List of parents to the element currently being processed
  DefaultHandler.prototype._tagStack = null;
  549. //Methods//
  //Passes (error, dom) to the stored callback. Without a callback an error is
  //thrown instead, and a call without an error does nothing.
  DefaultHandler.prototype.handleCallback = function DefaultHandler$handleCallback (error) {
    if ((typeof this._callback) == "function") {
      this._callback(error, this.dom);
      return;
    }
    if (error) {
      throw error;
    }
  };
  //Places one parsed element into the DOM being built.
  //Text, comment and directive elements become children of the element on top
  //of the tag stack (or top-level entries when the stack is empty). Opening
  //tags are placed the same way and then pushed on the stack, unless
  //enforceEmptyTags is on and the tag is listed in _emptyTags. Closing tags
  //are never stored; they unwind the stack down to the parent of the nearest
  //matching open tag and are ignored when nothing matches.
  DefaultHandler.prototype.handleElement = function DefaultHandler$handleElement (element) {
    if (this._done)
      this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()"));

    var options = this._options;
    if (!options.verbose) {
      //FIXME: Serious performance problem using delete
      delete element.raw;
      if (element.type == "tag" || element.type == "script" || element.type == "style")
        delete element.data;
    }

    var stack = this._tagStack;
    var parent = stack.last();

    //Adds the element below the current parent, or at the top level
    var self = this;
    function attach () {
      if (!parent) {
        self.dom.push(element);
        return;
      }
      if (!parent.children)
        parent.children = [];
      parent.children.push(element);
    }

    var isContainer = (
      element.type != ElementType.Text
      && element.type != ElementType.Comment
      && element.type != ElementType.Directive
    );
    if (!isContainer) {
      attach();
      return;
    }

    if (element.name.charAt(0) == "/") {
      if (!parent)
        return; //Ignore closing tags that obviously don't have an opening tag
      var baseName = element.name.substring(1);
      if (options.enforceEmptyTags && DefaultHandler._emptyTags[baseName])
        return; //Empty tags were never put on the stack
      //Find the nearest open tag with this name, searching from the top of the stack
      var matchIdx = -1;
      for (var idx = stack.length - 1; idx > -1; idx--) {
        if (stack[idx].name == baseName) {
          matchIdx = idx;
          break;
        }
      }
      //Pop the matching tag and everything opened after it
      if (matchIdx > -1)
        while (stack.length > matchIdx)
          stack.pop();
      return;
    }

    attach();
    //Don't add tags to the tag stack that can't have children
    if (!options.enforceEmptyTags || !DefaultHandler._emptyTags[element.name])
      stack.push(element);
  };
  //Helpers for searching a parsed DOM (an element, or an array of elements).
  //Search options are objects whose keys name what to test:
  //  tag_name     - the name of tag/script/style elements
  //  tag_type     - the element type
  //  tag_contains - the data of text/comment/directive elements
  //  anything else is treated as an attribute name
  //Each value is either a test function (receives the value, returns
  //truthy/falsy) or a plain value that is compared with ==.
  var DomUtils = {
    //Returns true when the element satisfies EVERY test in options. The
    //option values must already be test functions here.
    testElement: function DomUtils$testElement (options, element) {
      if (!element) {
        return(false);
      }

      for (var key in options) {
        if (key == "tag_name") {
          if (element.type != "tag" && element.type != "script" && element.type != "style") {
            return(false);
          }
          if (!options["tag_name"](element.name)) {
            return(false);
          }
        } else if (key == "tag_type") {
          if (!options["tag_type"](element.type)) {
            return(false);
          }
        } else if (key == "tag_contains") {
          if (element.type != "text" && element.type != "comment" && element.type != "directive") {
            return(false);
          }
          if (!options["tag_contains"](element.data)) {
            return(false);
          }
        } else {
          if (!element.attribs || !options[key](element.attribs[key])) {
            return(false);
          }
        }
      }

      return(true);
    }

    //Returns every element at or below currentElement that passes all tests in
    //options, in document order. With recurse false only currentElement itself
    //(or, for an array, its direct entries) is tested. recurse defaults to true.
    //The caller's options object is not modified.
    , getElements: function DomUtils$getElements (options, currentElement, recurse) {
      recurse = (recurse === undefined || recurse === null) || !!recurse;

      if (!currentElement) {
        return([]);
      }

      //Plain values are turned into equality tests
      function equalTo (expected) {
        return(function (value) { return(value == expected); });
      }
      //Normalise once, into a private copy
      var tests = {};
      for (var key in options) {
        tests[key] = ((typeof options[key]) == "function") ? options[key] : equalTo(options[key]);
      }

      var found = [];
      function collect (node) {
        if (!node) {
          return;
        }
        if (DomUtils.testElement(tests, node)) {
          found.push(node);
        }
        var elementList;
        if (recurse && node.children)
          elementList = node.children;
        else if (node instanceof Array)
          elementList = node;
        else
          return;
        for (var i = 0; i < elementList.length; i++)
          collect(elementList[i]);
      }
      collect(currentElement);

      return(found);
    }

    //Returns the first element whose id attribute equals/passes `id`, or null
    , getElementById: function DomUtils$getElementById (id, currentElement, recurse) {
      recurse = (recurse === undefined || recurse === null) || !!recurse;
      var result = DomUtils.getElements({ id: id }, currentElement, recurse);
      return(result.length ? result[0] : null);
    }

    //Returns all tag/script/style elements whose name equals/passes `name`
    , getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement, recurse) {
      recurse = (recurse === undefined || recurse === null) || !!recurse;
      return(DomUtils.getElements({ tag_name: name }, currentElement, recurse));
    }

    //Returns all elements whose type equals/passes `type`
    , getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement, recurse) {
      recurse = (recurse === undefined || recurse === null) || !!recurse;
      return(DomUtils.getElements({ tag_type: type }, currentElement, recurse));
    }
  };
  //Makes ctor inherit from superCtor without running superCtor itself: the
  //new prototype is created through an empty surrogate constructor that
  //shares superCtor's prototype. The parent is recorded as ctor.super_.
  function inherits (ctor, superCtor) {
    function Surrogate () {}
    Surrogate.prototype = superCtor.prototype;

    var proto = new Surrogate();
    proto.constructor = ctor;

    ctor.prototype = proto;
    ctor.super_ = superCtor;
  }
  //Public API (in a browser "exports" is the Tautologistics.NodeHtmlParser
  //namespace object assigned near the top of this function)
  exports.Parser = Parser;
  exports.DefaultHandler = DefaultHandler;
  exports.RssHandler = RssHandler;
  exports.ElementType = ElementType;
  exports.DomUtils = DomUtils;
  688. })();