// node-htmlparser.js
/***********************************************
Copyright 2010, Chris Winberry <chris@winberry.net>. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
***********************************************/
/* v1.6.3 */
(function () {
  //Reports whether the script appears to be running under Node.js (CommonJS).
  //Returns true only when `require` is a function, `exports` and `module` are
  //objects, and `__filename` and `__dirname` are strings; otherwise false.
  //`typeof` is used throughout so that undeclared names do not throw.
  function runningInNode () {
    return (
      typeof require === "function" &&
      typeof exports === "object" &&
      typeof module === "object" &&
      typeof __filename === "string" &&
      typeof __dirname === "string"
    );
  }
  34. if (!runningInNode()) {
  35. if (!this.Tautologistics)
  36. this.Tautologistics = {};
  37. else if (this.Tautologistics.NodeHtmlParser)
  38. return; //NodeHtmlParser already defined!
  39. this.Tautologistics.NodeHtmlParser = {};
  40. exports = this.Tautologistics.NodeHtmlParser;
  41. }
  //Types of elements found in the DOM
  var ElementType = {
    Text: "text",           //Plain text
    Directive: "directive", //Special tag <!...>
    Comment: "comment",     //Special tag <!--...-->
    Script: "script",       //Special tag <script>...</script>
    Style: "style",         //Special tag <style>...</style>
    Tag: "tag"              //Any tag that isn't special
  };
  //Creates a parser that reports parsed elements to `handler`.
  //The handler is validated first (validateHandler throws if it is unusable),
  //then stored, then the parser is reset to its initial state.
  function Parser (handler) {
    this.validateHandler(handler);
    this._handler = handler;
    this.reset();
  }
  //**"Static"**//
  //Regular expressions used for cleaning up and parsing (stateless)
  Parser._reTrim = /(^\s+|\s+$)/g; //Trim leading/trailing whitespace
  Parser._reTrimComment = /(^\!--|--$)/g; //Remove comment tag markup from comment contents
  Parser._reWhitespace = /\s/g; //Used to find any whitespace to split on
  Parser._reTagName = /^\s*(\/?)\s*([^\s\/]+)/; //Used to find the tag name for an element
  //Regular expressions used for parsing (stateful: global flag, callers rely on lastIndex)
  //Find attributes in a tag. Capture groups: 1/2 = name="value", 3/4 = name='value',
  //5/6 = name=value (unquoted), 7 = bare attribute name
  Parser._reAttrib =
    /([^=<>\"\'\s]+)\s*=\s*"([^"]*)"|([^=<>\"\'\s]+)\s*=\s*'([^']*)'|([^=<>\"\'\s]+)\s*=\s*([^'"\s]+)|([^=<>\"\'\s\/]+)/g;
  Parser._reTags = /[\<\>]/g; //Find tag markers
  //**Public**//
  //Methods//
  //Parses a complete HTML document in one call and pushes it to the handler:
  //resets the parser, parses `data` as a single chunk, then signals completion.
  Parser.prototype.parseComplete = function Parser$parseComplete (data) {
    this.reset();
    this.parseChunk(data);
    this.done();
  };
  //Parses a piece of an HTML document.
  //`data` is appended to the internal buffer and any complete tags are parsed.
  //Calling this after done() reports an error through handleError (which throws
  //when the handler has no error method) and leaves the chunk unprocessed.
  Parser.prototype.parseChunk = function Parser$parseChunk (data) {
    if (this._done) {
      this.handleError(new Error("Attempted to parse chunk after parsing already done"));
      //Do not keep parsing when the handler swallowed the error
      return;
    }
    this._buffer += data; //FIXME: this can be a bottleneck
    this.parseTags();
  };
  //Tells the parser that the HTML being parsed is complete.
  //Any text still in the buffer becomes one final element, all pending elements
  //are written to the handler, and the handler's done() is called.
  //Calling done() again is a no-op until reset().
  Parser.prototype.done = function Parser$done () {
    if (this._done) {
      return;
    }
    this._done = true;

    //Push any unparsed text into a final element in the element list
    if (this._buffer.length) {
      var rawData = this._buffer;
      this._buffer = "";
      var element = {
        raw: rawData,
        data: (this._parseState === ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, ""),
        type: this._parseState
      };
      if (
        this._parseState === ElementType.Tag ||
        this._parseState === ElementType.Script ||
        this._parseState === ElementType.Style
      ) {
        element.name = this.parseTagName(element.data);
      }
      //parseAttribs ignores non-tag elements itself
      this.parseAttribs(element);
      this._elements.push(element);
    }

    //Force the flush: no more input will arrive, so elements held back because
    //a script/style/comment was never closed would otherwise be lost
    this.writeHandler(true);
    this._handler.done();
  };
  //Resets the parser to a blank state, ready to parse a new HTML document.
  //Clears the buffer, pending elements, positions and tag stack, then resets the handler.
  Parser.prototype.reset = function Parser$reset () {
    this._buffer = "";
    this._done = false;
    this._elements = [];
    this._elementsCurrent = 0;
    this._current = 0;
    this._next = 0;
    this._parseState = ElementType.Text;
    this._prevTagSep = '';
    this._tagStack = [];
    this._handler.reset();
  };
  //**Private**//
  //Properties//
  //Prototype-level defaults; reset() assigns the per-instance values.
  Parser.prototype._handler = null; //Handler for parsed elements
  Parser.prototype._buffer = null; //Buffer of unparsed data
  Parser.prototype._done = false; //Flag indicating whether parsing is done
  Parser.prototype._elements = null; //Array of parsed elements
  Parser.prototype._elementsCurrent = 0; //Pointer to last element in _elements that has been processed
  Parser.prototype._current = 0; //Position in data that has already been parsed
  Parser.prototype._next = 0; //Position in data of the next tag marker (<>)
  Parser.prototype._parseState = ElementType.Text; //Current type of element being parsed
  Parser.prototype._prevTagSep = ''; //Previous tag marker found
  //Stack of element types previously encountered; keeps track of when
  //parsing occurs inside a script/comment/style tag
  Parser.prototype._tagStack = null;
  //Methods//
  //Takes an array of elements and parses the attributes of every tag, script
  //and style element in it. Elements are modified in place.
  //Returns the same array that was passed in.
  Parser.prototype.parseTagAttribs = function Parser$parseTagAttribs (elements) {
    for (var idx = 0, idxEnd = elements.length; idx < idxEnd; idx++) {
      var element = elements[idx];
      if (
        element.type === ElementType.Tag ||
        element.type === ElementType.Script ||
        element.type === ElementType.Style
      ) {
        this.parseAttribs(element);
      }
    }
    return elements;
  };
  //Takes an element and adds an "attribs" property for any element attributes found.
  //Only tag, script and style elements are inspected; element.data is expected to
  //hold the tag text (name followed by attributes). No "attribs" property is
  //created when nothing matches. A bare attribute gets its own name as its value.
  Parser.prototype.parseAttribs = function Parser$parseAttribs (element) {
    //Only parse attributes for tags
    if (
      element.type !== ElementType.Script &&
      element.type !== ElementType.Style &&
      element.type !== ElementType.Tag
    ) {
      return;
    }

    //Everything after the first whitespace-delimited token is attribute text
    var tagName = element.data.split(Parser._reWhitespace, 1)[0];
    var attribRaw = element.data.substring(tagName.length);
    if (attribRaw.length < 1) {
      return;
    }

    var match;
    Parser._reAttrib.lastIndex = 0; //The regex is global and shared, so restart the scan
    while ((match = Parser._reAttrib.exec(attribRaw))) {
      if (element.attribs == null) {
        element.attribs = {};
      }
      if (typeof match[1] === "string" && match[1].length) {
        element.attribs[match[1]] = match[2]; //name="value"
      } else if (typeof match[3] === "string" && match[3].length) {
        element.attribs[match[3]] = match[4]; //name='value'
      } else if (typeof match[5] === "string" && match[5].length) {
        element.attribs[match[5]] = match[6]; //name=value
      } else if (typeof match[7] === "string" && match[7].length) {
        element.attribs[match[7]] = match[7]; //bare name
      }
    }
  };
  //Extracts the base tag name from the data value of an element.
  //Returns "" when data is null/undefined/empty or holds no name; closing tags
  //are returned with a leading "/" (whitespace around the slash is dropped).
  Parser.prototype.parseTagName = function Parser$parseTagName (data) {
    if (data == null || data == "") {
      return "";
    }
    var match = Parser._reTagName.exec(data);
    if (!match) {
      return "";
    }
    return (match[1] ? "/" : "") + match[2];
  };
  //Parses the buffered HTML text, appending every element found to this._elements,
  //then trims the consumed part off the buffer and writes pending elements to the
  //handler. Text after the last tag marker stays in the buffer for the next chunk.
  //I admit, this function is rather large but splitting up had an noticeable impact on speed
  Parser.prototype.parseTags = function Parser$parseTags () {
    var bufferEnd = this._buffer.length - 1;
    var prevElement; //Last element of the element list, when a branch needs it
    var rawLen;

    //_reTags is global and shared; make sure the scan starts at the buffer start
    Parser._reTags.lastIndex = 0;
    while (Parser._reTags.test(this._buffer)) {
      this._next = Parser._reTags.lastIndex - 1;
      var tagSep = this._buffer.charAt(this._next); //The currently found tag marker
      var rawData = this._buffer.substring(this._current, this._next); //The next chunk of data to parse

      //A new element to eventually be appended to the element list
      var element = {
        raw: rawData,
        data: (this._parseState === ElementType.Text) ? rawData : rawData.replace(Parser._reTrim, ""),
        type: this._parseState
      };

      var elementName = this.parseTagName(element.data);

      //This section inspects the current tag stack and modifies the current
      //element if we're actually parsing a special area (script/comment/style tag)
      if (this._tagStack.length) { //We're parsing inside a script/comment/style tag
        var stackTop = this._tagStack[this._tagStack.length - 1];
        prevElement = this._elements.length ? this._elements[this._elements.length - 1] : null;

        if (stackTop === ElementType.Script) { //We're currently in a script tag
          if (elementName === "/script") { //Actually, we're no longer in a script tag, so pop it off the stack
            this._tagStack.pop();
          } else if (element.raw.indexOf("!--") !== 0) { //Make sure we're not in a comment
            //All data from here to script close is now a text element
            element.type = ElementType.Text;
            //If the previous element is text, append the current text to it
            if (prevElement && prevElement.type === ElementType.Text) {
              prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
              element.raw = element.data = ""; //This causes the current element to not be added to the element list
            }
          }
        } else if (stackTop === ElementType.Style) { //We're currently in a style tag
          if (elementName === "/style") { //Actually, we're no longer in a style tag, so pop it off the stack
            this._tagStack.pop();
          } else if (element.raw.indexOf("!--") !== 0) { //Make sure we're not in a comment
            //All data from here to style close is now a text element
            element.type = ElementType.Text;
            if (prevElement && prevElement.type === ElementType.Text) {
              //If the previous element is text, append the current text to it.
              //When the current text is empty this still appends the last tag
              //marker found, so consecutive markers such as "<>" are kept.
              prevElement.raw = prevElement.data = prevElement.raw + this._prevTagSep + element.raw;
              element.raw = element.data = ""; //This causes the current element to not be added to the element list
            } else {
              //The previous element was not text: keep the untrimmed text as the data
              element.data = element.raw;
            }
          }
        } else if (stackTop === ElementType.Comment) { //We're currently in a comment tag
          rawLen = element.raw.length;
          if (element.raw.charAt(rawLen - 2) === "-" && element.raw.charAt(rawLen - 1) === "-" && tagSep === ">") {
            //Actually, we're no longer in a comment tag, so pop it off the stack
            this._tagStack.pop();
            //If the previous element is a comment, append the current text to it
            if (prevElement && prevElement.type === ElementType.Comment) {
              prevElement.raw = prevElement.data = (prevElement.raw + element.raw).replace(Parser._reTrimComment, "");
              element.raw = element.data = ""; //This causes the current element to not be added to the element list
              element.type = ElementType.Text;
            } else { //Previous element not a comment
              element.type = ElementType.Comment; //Change the current element's type to a comment
            }
          } else { //Still in a comment tag
            element.type = ElementType.Comment;
            //If the previous element is a comment, append the current text to it
            if (prevElement && prevElement.type === ElementType.Comment) {
              prevElement.raw = prevElement.data = prevElement.raw + element.raw + tagSep;
              element.raw = element.data = ""; //This causes the current element to not be added to the element list
              element.type = ElementType.Text;
            } else {
              element.raw = element.data = element.raw + tagSep;
            }
          }
        }
      }

      //Processing of non-special tags
      if (element.type === ElementType.Tag) {
        element.name = elementName;

        if (element.raw.indexOf("!--") === 0) { //This tag is really comment
          element.type = ElementType.Comment;
          delete element["name"];
          rawLen = element.raw.length;
          //Check if the comment is terminated in the current element
          if (element.raw.charAt(rawLen - 1) === "-" && element.raw.charAt(rawLen - 2) === "-" && tagSep === ">") {
            element.raw = element.data = element.raw.replace(Parser._reTrimComment, "");
          } else { //It's not so push the comment onto the tag stack
            element.raw += tagSep;
            this._tagStack.push(ElementType.Comment);
          }
        } else if (element.raw.indexOf("!") === 0 || element.raw.indexOf("?") === 0) {
          element.type = ElementType.Directive;
          //TODO: what about CDATA?
        } else if (element.name === "script") {
          element.type = ElementType.Script;
          //Special tag, push onto the tag stack if not terminated
          if (element.data.charAt(element.data.length - 1) !== "/") {
            this._tagStack.push(ElementType.Script);
          }
        } else if (element.name === "/script") {
          element.type = ElementType.Script;
        } else if (element.name === "style") {
          element.type = ElementType.Style;
          //Special tag, push onto the tag stack if not terminated
          if (element.data.charAt(element.data.length - 1) !== "/") {
            this._tagStack.push(ElementType.Style);
          }
        } else if (element.name === "/style") {
          element.type = ElementType.Style;
        }

        //Closing tags carry only their name as data
        if (element.name && element.name.charAt(0) === "/") {
          element.data = element.name;
        }
      }

      //Add all tags and non-empty text elements to the element list
      if (element.raw !== "" || element.type !== ElementType.Text) {
        this.parseAttribs(element);
        this._elements.push(element);
        //If tag self-terminates, add an explicit, separate closing tag
        if (
          element.type !== ElementType.Text &&
          element.type !== ElementType.Comment &&
          element.type !== ElementType.Directive &&
          element.data.charAt(element.data.length - 1) === "/"
        ) {
          this._elements.push({
            raw: "/" + element.name,
            data: "/" + element.name,
            name: "/" + element.name,
            type: element.type
          });
        }
      }

      //A "<" means the next chunk is tag content, a ">" means it is text
      this._parseState = (tagSep === "<") ? ElementType.Tag : ElementType.Text;
      this._current = this._next + 1;
      this._prevTagSep = tagSep;
    }

    //Keep only the part of the buffer that has not been consumed yet
    this._buffer = (this._current <= bufferEnd) ? this._buffer.substring(this._current) : "";
    this._current = 0;

    this.writeHandler();
  };
  //Checks the handler to make sure it is an object with the right "interface".
  //Throws an Error when the handler is not an object (null included) or when one
  //of the methods reset, done, writeTag, writeText, writeComment or
  //writeDirective is missing or not a function.
  Parser.prototype.validateHandler = function Parser$validateHandler (handler) {
    //typeof null is "object", so null has to be rejected explicitly
    if (handler === null || typeof handler !== "object") {
      throw new Error("Handler is not an object");
    }
    var required = ["reset", "done", "writeTag", "writeText", "writeComment", "writeDirective"];
    for (var i = 0; i < required.length; i++) {
      if (typeof handler[required[i]] !== "function") {
        throw new Error("Handler method '" + required[i] + "' is invalid");
      }
    }
  };
  //Writes parsed elements out to the handler, in order, removing each one from
  //the pending list before it is handed over.
  //Nothing is written while the tag stack is non-empty (i.e. while inside an
  //open script/style/comment) unless forceFlush is truthy.
  Parser.prototype.writeHandler = function Parser$writeHandler (forceFlush) {
    forceFlush = !!forceFlush;
    if (this._tagStack.length && !forceFlush) {
      return;
    }
    while (this._elements.length) {
      var element = this._elements.shift();
      switch (element.type) {
        case ElementType.Comment:
          this._handler.writeComment(element);
          break;
        case ElementType.Directive:
          this._handler.writeDirective(element);
          break;
        case ElementType.Text:
          this._handler.writeText(element);
          break;
        default: //tag, script and style elements
          this._handler.writeTag(element);
          break;
      }
    }
  };
  //Reports a parser error: passes it to the handler's error method when the
  //handler has one, otherwise throws it.
  Parser.prototype.handleError = function Parser$handleError (error) {
    if (typeof this._handler.error === "function") {
      this._handler.error(error);
    } else {
      throw error;
    }
  };
  //TODO: make this a truly streamable handler
  //Handler that turns a parsed RSS or Atom document into a plain feed object.
  //It is a DefaultHandler configured to ignore whitespace-only text, to drop the
  //raw/data properties of tags and to allow children on every tag.
  function RssHandler (callback) {
    RssHandler.super_.call(this, callback, { ignoreWhitespace: true, verbose: false, enforceEmptyTags: false });
  }
  inherits(RssHandler, DefaultHandler);

  //Called when parsing is complete. If the DOM has a top-level <rss> or <feed>
  //element, this.dom is replaced by a feed object:
  //  { type, id, title, link, description, updated, author, items: [ { id, title, link, description, pubDate } ] }
  //Fields that cannot be found are simply left off (best effort). When no feed
  //root is found the DOM is left as is. The base handler's done() always runs last.
  RssHandler.prototype.done = function RssHandler$done () {
    //Field tables: [ property, tag name, kind, recurse ]
    //kind "text" reads the tag's first child data, "date" wraps that in a Date,
    //"href" reads the tag's href attribute
    var rssFeedFields = [
      ["title", "title", "text", false],
      ["link", "link", "text", false],
      ["description", "description", "text", false],
      ["updated", "lastBuildDate", "date", false],
      ["author", "managingEditor", "text", false]
    ];
    var rssItemFields = [
      ["id", "guid", "text", false],
      ["title", "title", "text", false],
      ["link", "link", "text", false],
      ["description", "description", "text", false],
      ["pubDate", "pubDate", "date", false]
    ];
    var atomFeedFields = [
      ["id", "id", "text", false],
      ["title", "title", "text", false],
      ["link", "link", "href", false],
      ["description", "subtitle", "text", false],
      ["updated", "updated", "date", false],
      ["author", "email", "text", true]
    ];
    var atomItemFields = [
      ["id", "id", "text", false],
      ["title", "title", "text", false],
      ["link", "link", "href", false],
      ["description", "summary", "text", false],
      ["pubDate", "updated", "date", false]
    ];

    //Copies every field that can be found among `nodes` onto `target`
    function copyFields (target, nodes, fields) {
      for (var i = 0; i < fields.length; i++) {
        var field = fields[i];
        try {
          var node = DomUtils.getElementsByTagName(field[1], nodes, field[3])[0];
          if (field[2] === "href") {
            target[field[0]] = node.attribs.href;
          } else if (field[2] === "date") {
            target[field[0]] = new Date(node.children[0].data);
          } else {
            target[field[0]] = node.children[0].data;
          }
        } catch (ex) {
          //Deliberate best effort: a missing tag, child or attribute leaves the field unset
        }
      }
    }

    var feed = { };
    var found = DomUtils.getElementsByTagName(function (value) { return(value == "rss" || value == "feed"); }, this.dom, false);
    var feedRoot = found.length ? found[0] : undefined;

    if (feedRoot) {
      var nodes;
      var feedFields;
      var itemFields;
      var itemTag;

      if (feedRoot.name == "rss") {
        feed.type = "rss";
        //The first child of <rss> is taken to be <channel/>; an empty <rss>
        //yields a feed without fields instead of throwing
        var channel = feedRoot.children ? feedRoot.children[0] : undefined;
        nodes = channel ? channel.children : undefined;
        feed.id = "";
        feedFields = rssFeedFields;
        itemFields = rssItemFields;
        itemTag = "item";
      } else {
        feed.type = "atom";
        nodes = feedRoot.children;
        feedFields = atomFeedFields;
        itemFields = atomItemFields;
        itemTag = "entry";
      }

      copyFields(feed, nodes, feedFields);

      feed.items = [];
      DomUtils.getElementsByTagName(itemTag, nodes).forEach(function (item) {
        var entry = {};
        copyFields(entry, item.children, itemFields);
        feed.items.push(entry);
      });

      this.dom = feed;
    }
    RssHandler.super_.prototype.done.call(this);
  };
  ///////////////////////////////////////////////////
  //Handler that builds a hierarchical DOM (this.dom) from parsed elements.
  //callback: optional function (error, dom); ignored unless it is a function.
  //options: optional object; missing settings are filled in on that same object:
  //  ignoreWhitespace (default false), verbose (default true), enforceEmptyTags (default true)
  function DefaultHandler (callback, options) {
    this.reset();
    this._options = options ? options : { };
    if (this._options.ignoreWhitespace == null) {
      this._options.ignoreWhitespace = false; //Keep whitespace-only text nodes
    }
    if (this._options.verbose == null) {
      this._options.verbose = true; //Keep data property for tags and raw property for all
    }
    if (this._options.enforceEmptyTags == null) {
      this._options.enforceEmptyTags = true; //Don't allow children for HTML tags defined as empty in spec
    }
    if (typeof callback === "function") {
      this._callback = callback;
    }
  }
  //**"Static"**//
  //HTML Tags that shouldn't contain child nodes
  DefaultHandler._emptyTags = {
    area: 1,
    base: 1,
    basefont: 1,
    br: 1,
    col: 1,
    frame: 1,
    hr: 1,
    img: 1,
    input: 1,
    isindex: 1,
    link: 1,
    meta: 1,
    param: 1,
    embed: 1
  };
  //Regex to detect whitespace only text nodes (also matches the empty string)
  DefaultHandler.reWhitespace = /^\s*$/;
  //**Public**//
  //Properties//
  DefaultHandler.prototype.dom = null; //The hierarchical object containing the parsed HTML
  //Methods//
  //Resets the handler back to starting state: empty DOM, not done, empty tag stack.
  //The tag stack array gets a last() helper that returns its top entry, or null when empty.
  DefaultHandler.prototype.reset = function DefaultHandler$reset() {
    this.dom = [];
    this._done = false;
    this._tagStack = [];
    this._tagStack.last = function DefaultHandler$_tagStack$last () {
      return (this.length ? this[this.length - 1] : null);
    };
  };
  //Signals the handler that parsing is done; the callback (if any) receives (null, dom)
  DefaultHandler.prototype.done = function DefaultHandler$done () {
    this._done = true;
    this.handleCallback(null);
  };

  //Receives a tag, script or style element from the parser
  DefaultHandler.prototype.writeTag = function DefaultHandler$writeTag (element) {
    this.handleElement(element);
  };

  //Receives a text element; whitespace-only text is dropped when the
  //ignoreWhitespace option is set
  DefaultHandler.prototype.writeText = function DefaultHandler$writeText (element) {
    if (this._options.ignoreWhitespace && DefaultHandler.reWhitespace.test(element.data)) {
      return;
    }
    this.handleElement(element);
  };

  //Receives a comment element
  DefaultHandler.prototype.writeComment = function DefaultHandler$writeComment (element) {
    this.handleElement(element);
  };

  //Receives a directive element
  DefaultHandler.prototype.writeDirective = function DefaultHandler$writeDirective (element) {
    this.handleElement(element);
  };

  //Receives an error; it is forwarded to the callback
  //(handleCallback throws it when there is no callback)
  DefaultHandler.prototype.error = function DefaultHandler$error (error) {
    this.handleCallback(error);
  };
  //**Private**//
  //Properties//
  //Prototype-level defaults; the constructor and reset() assign per-instance values.
  DefaultHandler.prototype._options = null; //Handler options for how to behave
  DefaultHandler.prototype._callback = null; //Callback to respond to when parsing done
  DefaultHandler.prototype._done = false; //Flag indicating whether handler has been notified of parsing completed
  DefaultHandler.prototype._tagStack = null; //List of parents to the element currently being processed
  //Methods//
  //Reports a result to the callback as (error, dom).
  //Without a callback: throws `error` if there is one, otherwise does nothing.
  DefaultHandler.prototype.handleCallback = function DefaultHandler$handleCallback (error) {
    if (typeof this._callback !== "function") {
      if (error) {
        throw error;
      }
      return;
    }
    this._callback(error, this.dom);
  };
  //Adds one parsed element to the DOM being built.
  //Tags (including script/style) become children of the innermost open tag, or
  //top-level entries when no tag is open, and are then opened themselves unless
  //they are "empty" tags while enforceEmptyTags is set. A closing tag closes the
  //nearest matching open tag together with everything opened inside it; a closing
  //tag without a match is ignored. Text, comments and directives are appended
  //to the innermost open tag or to the top level.
  //When verbose is off, `raw` is removed from every element and `data` from tags.
  //Writing after done() reports an error through handleCallback and adds nothing.
  DefaultHandler.prototype.handleElement = function DefaultHandler$handleElement (element) {
    if (this._done) {
      this.handleCallback(new Error("Writing to the handler after done() called is not allowed without a reset()"));
      //The DOM has already been handed over; do not modify it any further
      return;
    }

    if (!this._options.verbose) {
      //FIXME: Serious performance problem using delete
      delete element.raw;
      if (element.type == "tag" || element.type == "script" || element.type == "style") {
        delete element.data;
      }
    }

    var enforceEmptyTags = this._options.enforceEmptyTags;
    //Own-property check, so names such as "constructor" are not mistaken for empty tags
    function canHaveChildren (name) {
      return !enforceEmptyTags || !Object.prototype.hasOwnProperty.call(DefaultHandler._emptyTags, name);
    }

    var isContainer =
      element.type != ElementType.Text &&
      element.type != ElementType.Comment &&
      element.type != ElementType.Directive;
    var parent = this._tagStack.last();

    if (!parent) { //There are no parent elements
      if (!isContainer) { //Just add to the top level list
        this.dom.push(element);
      } else if (element.name.charAt(0) != "/") { //Ignore closing tags that obviously don't have an opening tag
        this.dom.push(element);
        if (canHaveChildren(element.name)) { //Don't add tags to the tag stack that can't have children
          this._tagStack.push(element);
        }
      }
      return;
    }

    //There are parent elements
    if (isContainer && element.name.charAt(0) == "/") {
      //This is a closing tag, scan the tagStack to find the matching opening tag
      //and pop the stack up to the opening tag's parent
      var baseName = element.name.substring(1);
      if (canHaveChildren(baseName)) {
        var pos = this._tagStack.length - 1;
        while (pos > -1 && this._tagStack[pos].name != baseName) {
          pos--;
        }
        if (pos > -1) {
          while (this._tagStack.length > pos) {
            this._tagStack.pop();
          }
        }
      }
      return;
    }

    //Opening tag or non-container element: add it as a child of the innermost open tag
    if (!parent.children) {
      parent.children = [];
    }
    parent.children.push(element);
    if (isContainer && canHaveChildren(element.name)) { //Don't add tags to the tag stack that can't have children
      this._tagStack.push(element);
    }
  };
  //Helpers for searching a DOM produced by the handlers
  var DomUtils = {
    //Tests one element against a set of predicate functions.
    //options maps a key to a function returning truthy on a match. Special keys:
    //  tag_name     - tested against element.name (tag/script/style elements only)
    //  tag_type     - tested against element.type
    //  tag_contains - tested against element.data (text/comment/directive elements only)
    //any other key is tested against element.attribs[key].
    //Returns true only if every test passes; false for a missing element.
    testElement: function DomUtils$testElement (options, element) {
      if (!element) {
        return false;
      }

      for (var key in options) {
        if (key == "tag_name") {
          if (element.type != "tag" && element.type != "script" && element.type != "style") {
            return false;
          }
          if (!options["tag_name"](element.name)) {
            return false;
          }
        } else if (key == "tag_type") {
          if (!options["tag_type"](element.type)) {
            return false;
          }
        } else if (key == "tag_contains") {
          if (element.type != "text" && element.type != "comment" && element.type != "directive") {
            return false;
          }
          if (!options["tag_contains"](element.data)) {
            return false;
          }
        } else {
          if (!element.attribs || !options[key](element.attribs[key])) {
            return false;
          }
        }
      }

      return true;
    },

    //Collects the elements matching options, in document order.
    //options: as for testElement, except that a non-function value means
    //  "loosely equal (==) to this value". The object passed in is not modified.
    //currentElement: an element or an array of elements to search.
    //recurse: search descendants as well; defaults to true when null/undefined.
    //  Items of an array passed in are always tested.
    //limit: maximum number of results; anything that does not parse to a
    //  non-negative integer means no limit, 0 yields an empty result.
    getElements: function DomUtils$getElements (options, currentElement, recurse, limit) {
      recurse = (recurse === undefined || recurse === null) || !!recurse;
      var parsedLimit = parseInt(limit, 10);
      limit = isNaN(parsedLimit) ? -1 : parsedLimit;

      if (!currentElement || limit === 0) {
        return [];
      }

      function getTest (checkVal) {
        return function (value) { return (value == checkVal); };
      }
      //Build the predicates in a private object so the caller's options stay untouched
      var tests = {};
      for (var key in options) {
        tests[key] = (typeof options[key] == "function") ? options[key] : getTest(options[key]);
      }

      var found = [];
      if (DomUtils.testElement(tests, currentElement)) {
        found.push(currentElement);
      }
      if (limit >= 0 && found.length >= limit) {
        return found;
      }

      var elementList;
      if (recurse && currentElement.children) {
        elementList = currentElement.children;
      } else if (currentElement instanceof Array) {
        elementList = currentElement;
      } else {
        return found;
      }

      for (var i = 0; i < elementList.length; i++) {
        //Only ask each subtree for what is still missing, so the total never exceeds the limit
        var remaining = (limit >= 0) ? limit - found.length : -1;
        found = found.concat(DomUtils.getElements(tests, elementList[i], recurse, remaining));
        if (limit >= 0 && found.length >= limit) {
          break;
        }
      }

      return found;
    },

    //Returns the first element whose id attribute matches, or null
    getElementById: function DomUtils$getElementById (id, currentElement, recurse) {
      var result = DomUtils.getElements({ id: id }, currentElement, recurse, 1);
      return (result.length ? result[0] : null);
    },

    //Returns the tag/script/style elements whose name matches (value or predicate)
    getElementsByTagName: function DomUtils$getElementsByTagName (name, currentElement, recurse, limit) {
      return DomUtils.getElements({ tag_name: name }, currentElement, recurse, limit);
    },

    //Returns the elements whose type matches (value or predicate)
    getElementsByTagType: function DomUtils$getElementsByTagType (type, currentElement, recurse, limit) {
      return DomUtils.getElements({ tag_type: type }, currentElement, recurse, limit);
    }
  };
  //Makes ctor inherit from superCtor: ctor.prototype becomes a fresh object whose
  //prototype is superCtor.prototype (superCtor itself is not invoked), with
  //`constructor` pointing back at ctor. The parent is exposed as ctor.super_.
  //Must be called before methods are added to ctor.prototype, which is replaced.
  function inherits (ctor, superCtor) {
    ctor.super_ = superCtor;
    ctor.prototype = Object.create(superCtor.prototype);
    ctor.prototype.constructor = ctor;
  }
  //Public API. `exports` is the host-provided exports object, or the
  //Tautologistics.NodeHtmlParser namespace assigned by the non-Node bootstrap
  //near the top of the file.
  exports.Parser = Parser;
  exports.DefaultHandler = DefaultHandler;
  exports.RssHandler = RssHandler;
  exports.ElementType = ElementType;
  exports.DomUtils = DomUtils;
})();