// sax.js (18 KB)
  // Public surface of the module: `sax` aliases the CommonJS exports object,
  // and exposes both the constructor and a `new`-free factory for it.
  var sax = exports;
  sax.SAXParser = SAXParser;
  sax.parser = function (strict, opt) {
    return new SAXParser(strict, opt);
  };
  // Parser instance: holds every buffer, flag and counter that write()
  // reads and updates while walking the input.
  //
  // strict - coerced to a boolean; when true, strictFail() reports problems
  //          through error() instead of ignoring them.
  // opt    - optional settings object (defaults to {}). `lowercasetags` is
  //          read here; `trim` and `normalize` are read by textopts().
  //
  // end() re-runs this function on an existing parser via SAXParser.call()
  // to reset it, which is why "onready" is emitted at the bottom.
  function SAXParser (strict, opt) {
    // Tolerate being called without `new`: otherwise every assignment below
    // would land on the wrong `this` (or throw in strict-mode code).
    if (!(this instanceof SAXParser)) return new SAXParser(strict, opt);

    // all text buffers start out empty
    this.c = this.comment = this.sgmlDecl =
      this.textNode = this.tagName = this.doctype =
      this.procInstName = this.procInstBody = this.entity =
      this.attribName = this.attribValue = this.q =
      this.cdata = "";
    this.opt = opt || {};
    // name of the String method applied to tag names in non-strict mode
    this.tagCase = this.opt.lowercasetags ? "toLowerCase" : "toUpperCase";
    this.tags = [];
    this.closed = this.closedRoot = this.sawRoot = false;
    this.tag = this.error = null;
    this.strict = !!strict;
    this.state = S.BEGIN;
    // per-parser entity table that inherits the shared defaults
    this.ENTITIES = Object.create(sax.ENTITIES);
    // just for error reporting
    this.position = this.line = this.column = 0;
    emit(this, "onready");
  }
  // Methods shared by every parser instance.
  //   write(chunk) - feed text to the parser (see write below)
  //   resume()     - clear a stored error so write() can be called again
  //   close()      - signal end of input by writing null
  SAXParser.prototype = {
    // Assigning a fresh object literal discards the default `constructor`
    // link, so put it back.
    constructor : SAXParser,
    write : write,
    resume : function () { this.error = null; return this; },
    close : function () { return this.write(null); }
  };
  // character classes and tokens
  // Each class is a plain string; membership is tested with is()/not().
  // "\r" is included so CRLF line endings count as whitespace.
  var whitespace = "\r\n\t ";
  // this really needs to be replaced with character classes.
  // XML allows all manner of ridiculous numbers and digits.
  var number = "0123456789";
  var letter = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ";
  // (Letter | '_' | ':')
  var nameStart = letter + "_:";
  var nameBody = nameStart + number + "-.";
  var quote = "'\"";
  // characters allowed between "&" and ";"
  var entity = number + letter + "#";
  // literal tokens compared against the text following "<!"
  var CDATA = "[CDATA[";
  var DOCTYPE = "DOCTYPE";
  // True when `c` occurs somewhere in the string `charclass`.
  function is (charclass, c) {
    return charclass.indexOf(c) >= 0;
  }
  // Negation of is(): true when `c` does not occur in `charclass`.
  function not (charclass, c) {
    if (is(charclass, c)) return false;
    return true;
  }
  // Parser states. Names are numbered from 0 in the order listed; `S` is the
  // running counter while the table is built.
  var S = 0;
  sax.STATE = {};
  [
    "BEGIN",
    "TEXT",                   // general stuff
    "TEXT_ENTITY",            // &amp and such.
    "OPEN_WAKA",              // <
    "SGML_DECL",              // <!BLARG
    "SGML_DECL_QUOTED",       // <!BLARG foo "bar
    "DOCTYPE",                // <!DOCTYPE
    "DOCTYPE_QUOTED",         // <!DOCTYPE "//blah
    "DOCTYPE_DTD",            // <!DOCTYPE "//blah" [ ...
    "DOCTYPE_DTD_QUOTED",     // <!DOCTYPE "//blah" [ "foo
    "COMMENT_STARTING",       // <!-
    "COMMENT",                // <!--
    "COMMENT_ENDING",         // <!-- blah -
    "COMMENT_ENDED",          // <!-- blah --
    "CDATA",                  // <![CDATA[ something
    "CDATA_ENDING",           // ]
    "CDATA_ENDING_2",         // ]]
    "PROC_INST",              // <?hi
    "PROC_INST_BODY",         // <?hi there
    "PROC_INST_QUOTED",       // <?hi "there
    "PROC_INST_ENDING",       // <?hi there ?
    "OPEN_TAG",               // <strong
    "OPEN_TAG_SLASH",         // <strong /
    "ATTRIB",                 // <a
    "ATTRIB_NAME",            // <a foo
    "ATTRIB_NAME_SAW_WHITE",  // <a foo _
    "ATTRIB_VALUE",           // <a foo=
    "ATTRIB_VALUE_QUOTED",    // <a foo="bar
    "ATTRIB_VALUE_UNQUOTED",  // <a foo=bar
    "ATTRIB_VALUE_ENTITY_Q",  // <foo bar="&quot;"
    "ATTRIB_VALUE_ENTITY_U",  // <foo bar=&quot;
    "CLOSE_TAG",              // </a
    "CLOSE_TAG_SAW_WHITE"     // </a >
  ].forEach(function (stateName) {
    sax.STATE[stateName] = S++;
  });
  // Default named entities, keyed by the name that appears between "&" and
  // ";". Each parser gets its own table that inherits from this one.
  sax.ENTITIES = {
    "apos" : "'",
    "quot" : '"',
    "amp" : "&",
    "gt" : ">",
    "lt" : "<"
  };
  // Add the reverse mapping (state number -> state name) to the same table.
  // The names are snapshotted with Object.keys() first: adding properties to
  // an object while a for...in loop is enumerating it leaves it unspecified
  // whether the new keys are visited, and visiting one would overwrite a name
  // entry with a numeric string.
  Object.keys(sax.STATE).forEach(function (stateName) {
    sax.STATE[sax.STATE[stateName]] = stateName;
  });
  // shorthand
  var S = sax.STATE;
  // Event names, listed for discoverability. The parser looks handlers up
  // on itself under "on" + name (e.g. "ontext", "onopentag").
  sax.EVENTS = [
    "text",
    "processinginstruction",
    "sgmldeclaration",
    "doctype",
    "comment",
    "attribute",
    "opentag",
    "closetag",
    "cdata",
    "error",
    "end",
    "ready"
  ];
  // Invoke the handler stored on `parser` under the property name `event`
  // (called as a method of the parser, with `data` as its only argument).
  // Does nothing when no handler is set.
  function emit (parser, event, data) {
    if (parser[event]) {
      parser[event](data);
    }
  }
  // Emit a node event, first flushing any buffered text so that the text
  // event is delivered before the node that follows it.
  function emitNode (parser, nodeType, data) {
    if (parser.textNode) {
      closeText(parser);
    }
    emit(parser, nodeType, data);
  }
  // Flush the buffered text: run it through textopts(), emit "ontext" if
  // anything is left, then empty the buffer.
  function closeText (parser) {
    var text = textopts(parser.opt, parser.textNode);
    parser.textNode = text;
    if (text) {
      emit(parser, "ontext", text);
    }
    parser.textNode = "";
  }
  // Apply the text-related options to a string and return the result:
  //   opt.trim      - strip leading and trailing whitespace
  //   opt.normalize - collapse every run of whitespace into one space
  // Trimming is applied before normalizing.
  function textopts (opt, text) {
    var result = text;
    if (opt.trim) {
      result = result.trim();
    }
    if (opt.normalize) {
      result = result.replace(/\s+/g, " ");
    }
    return result;
  }
  // Record and report a parse error. Buffered text is flushed first, the
  // message is extended with the current line, column and character, and
  // the resulting Error is stored on parser.error and passed to "onerror".
  // Returns the parser.
  function error (parser, er) {
    closeText(parser);
    var message = er +
      "\nLine: " + parser.line +
      "\nColumn: " + parser.column +
      "\nChar: " + parser.c;
    var failure = new Error(message);
    parser.error = failure;
    emit(parser, "onerror", failure);
    return parser;
  }
  // Finish the current document: report "Unexpected end" unless the parser
  // is in the TEXT state, flush remaining text, emit "onend", and then
  // re-run the constructor on the same object so every field is reset.
  // Returns the parser.
  function end (parser) {
    var inText = parser.state === S.TEXT;
    if (!inText) {
      error(parser, "Unexpected end");
    }
    closeText(parser);
    parser.c = "";
    parser.closed = true;
    emit(parser, "onend");
    // reset in place, keeping the same strictness and options
    SAXParser.call(parser, parser.strict, parser.opt);
    return parser;
  }
  // Report `message` through error(), but only for strict parsers;
  // non-strict parsers silently carry on.
  function strictFail (parser, message) {
    if (!parser.strict) return;
    error(parser, message);
  }
  // Start a new tag record from parser.tagName. In non-strict mode the name
  // is first case-folded with the String method named by parser.tagCase.
  function newTag (parser) {
    var name = parser.tagName;
    if (!parser.strict) {
      name = name[parser.tagCase]();
      parser.tagName = name;
    }
    parser.tag = { name : name, attributes : {} };
  }
  // Commit the tag being built: push it on the open-tag stack, emit
  // "onopentag", clear the per-tag buffers and go back to reading text.
  function openTag (parser) {
    var opened = parser.tag;
    parser.sawRoot = true;
    parser.tags.push(opened);
    emitNode(parser, "onopentag", opened);
    parser.tag = null;
    parser.tagName = "";
    parser.attribName = "";
    parser.attribValue = "";
    parser.state = S.TEXT;
  }
  // Handle a completed closing tag whose name is in parser.tagName.
  //
  // - An empty name is reported (strict) and kept as the literal text "</>".
  // - If no open tag has this name, the stack is left alone: the problem is
  //   reported (strict) and the tag is kept as literal text. Previously this
  //   case popped every open tag and then threw a bare string.
  // - Otherwise open tags are popped up to and including the matching one,
  //   emitting "onclosetag" for each; every tag closed implicitly on the way
  //   is reported as "Unexpected close tag." in strict mode.
  //
  // In non-strict mode the name is case-folded with parser.tagCase before
  // it is compared. Always leaves the parser in the TEXT state.
  function closeTag (parser) {
    if (!parser.tagName) {
      strictFail(parser, "Weird empty close tag.");
      parser.textNode += "</>";
      parser.state = S.TEXT;
      return;
    }

    var closeTo = parser.strict
      ? parser.tagName
      : parser.tagName[parser.tagCase]();

    // Find the innermost open tag with this name; ends at -1 if none.
    var target = parser.tags.length;
    while (target--) {
      if (parser.tags[target].name === closeTo) break;
    }

    if (target < 0) {
      strictFail(parser, "Unmatched closing tag: " + parser.tagName);
      parser.textNode += "</" + parser.tagName + ">";
      parser.tagName = parser.attribValue = parser.attribName = "";
      parser.tag = null;
      parser.state = S.TEXT;
      return;
    }

    while (parser.tags.length > target) {
      var close = parser.tags.pop();
      if (closeTo !== close.name) strictFail(parser, "Unexpected close tag.");
      parser.tag = close;
      parser.tagName = close.name;
      emitNode(parser, "onclosetag", parser.tagName);
    }

    if (parser.tags.length === 0) parser.closedRoot = true;
    parser.tagName = parser.attribValue = parser.attribName = "";
    parser.tag = null;
    parser.state = S.TEXT;
  }
  // Resolve the entity name collected in parser.entity (the text between
  // "&" and ";") and return its replacement string.
  //
  // - Named entities are looked up, lower-cased, in parser.ENTITIES. Only
  //   string values count, so members inherited from Object.prototype
  //   (e.g. "constructor") are not mistaken for entities.
  // - "#123" (decimal) and "#x7b" (hex) are character references. Leading
  //   zeros are accepted, and code points above U+FFFF are returned as a
  //   surrogate pair.
  // - Anything else is reported through strictFail() and handed back
  //   unchanged as "&name;".
  function parseEntity (parser) {
    var entity = parser.entity.toLowerCase(), num, numStr = "";
    var named = parser.ENTITIES[entity];
    if (typeof named === "string") return named;

    if (entity.charAt(0) === "#") {
      if (entity.charAt(1) === "x") {
        // drop leading zeros (keeping one digit) so the round-trip check
        // below compares like with like
        entity = entity.slice(2).replace(/^0+(?=.)/, "");
        num = parseInt(entity, 16);
        numStr = num.toString(16);
      } else {
        entity = entity.slice(1).replace(/^0+(?=.)/, "");
        num = parseInt(entity, 10);
        numStr = num.toString(10);
      }
    }

    // `num` is undefined for non-numeric names and NaN for "#"/"#x" with no
    // digits; the round-trip comparison rejects trailing junk like "#12ab".
    if (isNaN(num) || num > 0x10FFFF || numStr.toLowerCase() !== entity) {
      strictFail(parser, "Invalid character entity");
      return "&" + parser.entity + ";";
    }

    if (num > 0xFFFF) {
      // String.fromCharCode works in 16-bit units; build the surrogate pair
      num -= 0x10000;
      return String.fromCharCode(0xD800 + (num >> 10), 0xDC00 + (num & 0x3FF));
    }
    return String.fromCharCode(num);
  }
  // Feed a chunk of text through the state machine, one character at a
  // time, emitting events as nodes are completed. Installed as
  // SAXParser.prototype.write, so `this` is the parser.
  //
  // chunk - a string to parse, or null to signal end of input (handled by
  //         end()).
  // Returns the parser.
  // Throws the stored parser.error if an earlier error has not been cleared
  // with resume(), and an Error if parser.state is not a known state.
  //
  // State is kept entirely on the parser, so a token may be split across
  // any number of write() calls.
  function write (chunk) {
    var parser = this;
    if (this.error) throw this.error;
    if (parser.closed) return error(parser,
      "Cannot write after close. Assign an onready handler.");
    if (chunk === null) return end(parser);
    var i = 0, c = "";
    var prevLine, prevColumn;   // position before the current character
    var returnState, buffer;    // where an entity state hands back to
    var selfClosing;            // tag name kept across openTag() for <x/>
    while (parser.c = c = chunk.charAt(i++)) {
      prevLine = parser.line;
      prevColumn = parser.column;
      parser.position ++;
      if (c === "\n") {
        parser.line ++;
        parser.column = 0;
      } else parser.column ++;
      switch (parser.state) {
        case S.BEGIN:
          if (c === "<") parser.state = S.OPEN_WAKA;
          else if (not(whitespace,c)) {
            // have to process this as a text node.
            // weird, but happens.
            strictFail(parser, "Non-whitespace before first tag.");
            parser.textNode = c;
            parser.state = S.TEXT;
          }
          continue;
        case S.TEXT:
          if (c === "<") parser.state = S.OPEN_WAKA;
          else {
            if (not(whitespace, c) && (!parser.sawRoot || parser.closedRoot))
              strictFail(parser, "Text data outside of root node.");
            if (c === "&") parser.state = S.TEXT_ENTITY;
            else parser.textNode += c;
          }
          continue;
        case S.OPEN_WAKA:
          // either a /, ?, !, or text is coming next.
          if (c === "!") {
            parser.state = S.SGML_DECL;
            parser.sgmlDecl = "";
          } else if (is(whitespace, c)) {
            // wait for it...
          } else if (is(nameStart,c)) {
            parser.state = S.OPEN_TAG;
            parser.tagName = c;
          } else if (c === "/") {
            parser.state = S.CLOSE_TAG;
            parser.tagName = "";
          } else if (c === "?") {
            parser.state = S.PROC_INST;
            parser.procInstName = parser.procInstBody = "";
          } else {
            strictFail(parser, "Unencoded <");
            parser.textNode += "<" + c;
            parser.state = S.TEXT;
          }
          continue;
        case S.SGML_DECL:
          // "<!" has been read; decide what kind of declaration follows
          if ((parser.sgmlDecl+c).toUpperCase() === CDATA) {
            parser.state = S.CDATA;
            parser.sgmlDecl = "";
            parser.cdata = "";
          } else if (parser.sgmlDecl+c === "--") {
            parser.state = S.COMMENT;
            parser.comment = "";
            parser.sgmlDecl = "";
          } else if ((parser.sgmlDecl+c).toUpperCase() === DOCTYPE) {
            parser.state = S.DOCTYPE;
            if (parser.doctype || parser.sawRoot) strictFail(parser,
              "Inappropriately located doctype declaration");
            parser.doctype = "";
            parser.sgmlDecl = "";
          } else if (c === ">") {
            emitNode(parser, "onsgmldeclaration", parser.sgmlDecl);
            parser.sgmlDecl = "";
            parser.state = S.TEXT;
          } else if (is(quote, c)) {
            parser.state = S.SGML_DECL_QUOTED;
            // remember which quote opened the string so the quoted state
            // knows when to end
            parser.q = c;
            parser.sgmlDecl += c;
          } else parser.sgmlDecl += c;
          continue;
        case S.SGML_DECL_QUOTED:
          if (c === parser.q) {
            parser.state = S.SGML_DECL;
            parser.q = "";
          }
          parser.sgmlDecl += c;
          continue;
        case S.DOCTYPE:
          if (c === ">") {
            parser.state = S.TEXT;
            emitNode(parser, "ondoctype", parser.doctype);
            parser.doctype = true; // just remember that we saw it.
          } else {
            parser.doctype += c;
            if (c === "[") parser.state = S.DOCTYPE_DTD;
            else if (is(quote, c)) {
              parser.state = S.DOCTYPE_QUOTED;
              parser.q = c;
            }
          }
          continue;
        case S.DOCTYPE_QUOTED:
          parser.doctype += c;
          if (c === parser.q) {
            parser.q = "";
            parser.state = S.DOCTYPE;
          }
          continue;
        case S.DOCTYPE_DTD:
          parser.doctype += c;
          if (c === "]") parser.state = S.DOCTYPE;
          else if (is(quote,c)) {
            parser.state = S.DOCTYPE_DTD_QUOTED;
            parser.q = c;
          }
          continue;
        case S.DOCTYPE_DTD_QUOTED:
          parser.doctype += c;
          if (c === parser.q) {
            parser.state = S.DOCTYPE_DTD;
            parser.q = "";
          }
          continue;
        case S.COMMENT:
          if (c === "-") parser.state = S.COMMENT_ENDING;
          else parser.comment += c;
          continue;
        case S.COMMENT_ENDING:
          if (c === "-") {
            parser.state = S.COMMENT_ENDED;
            parser.comment = textopts(parser.opt, parser.comment);
            if (parser.comment) emitNode(parser, "oncomment", parser.comment);
            parser.comment = "";
          } else {
            // A lone "-" is ordinary comment content (only "--" is special),
            // so keep it and go back to reading the comment body.
            parser.comment += "-" + c;
            parser.state = S.COMMENT;
          }
          continue;
        case S.COMMENT_ENDED:
          if (c !== ">") strictFail(parser, "Malformed comment");
          else parser.state = S.TEXT;
          continue;
        case S.CDATA:
          if (c === "]") parser.state = S.CDATA_ENDING;
          else parser.cdata += c;
          continue;
        case S.CDATA_ENDING:
          if (c === "]") parser.state = S.CDATA_ENDING_2;
          else {
            parser.cdata += "]" + c;
            parser.state = S.CDATA;
          }
          continue;
        case S.CDATA_ENDING_2:
          if (c === ">") {
            emitNode(parser, "oncdata", parser.cdata);
            parser.cdata = "";
            parser.state = S.TEXT;
          } else if (c === "]") {
            // "]]]": the first bracket was data, the last two may still be
            // the start of "]]>"
            parser.cdata += "]";
          } else {
            parser.cdata += "]]" + c;
            parser.state = S.CDATA;
          }
          continue;
        case S.PROC_INST:
          if (c === "?") parser.state = S.PROC_INST_ENDING;
          else if (is(whitespace, c)) parser.state = S.PROC_INST_BODY;
          else parser.procInstName += c;
          continue;
        case S.PROC_INST_BODY:
          // skip whitespace between the name and the body
          if (!parser.procInstBody && is(whitespace, c)) continue;
          else if (c === "?") parser.state = S.PROC_INST_ENDING;
          else if (is(quote, c)) {
            parser.state = S.PROC_INST_QUOTED;
            parser.q = c;
            parser.procInstBody += c;
          } else parser.procInstBody += c;
          continue;
        case S.PROC_INST_ENDING:
          if (c === ">") {
            emitNode(parser, "onprocessinginstruction", {
              name : parser.procInstName,
              body : parser.procInstBody
            });
            parser.procInstName = parser.procInstBody = "";
            parser.state = S.TEXT;
          } else {
            parser.procInstBody += "?" + c;
            parser.state = S.PROC_INST_BODY;
          }
          continue;
        case S.PROC_INST_QUOTED:
          parser.procInstBody += c;
          if (c === parser.q) {
            parser.state = S.PROC_INST_BODY;
            parser.q = "";
          }
          continue;
        case S.OPEN_TAG:
          if (is(nameBody, c)) parser.tagName += c;
          else {
            newTag(parser);
            if (c === ">") openTag(parser);
            else if (c === "/") parser.state = S.OPEN_TAG_SLASH;
            else {
              if (not(whitespace, c)) strictFail(
                parser, "Invalid character in tag name");
              parser.state = S.ATTRIB;
            }
          }
          continue;
        case S.OPEN_TAG_SLASH:
          if (c === ">") {
            // openTag() clears parser.tagName, which closeTag() needs in
            // order to know what to close; carry it across.
            selfClosing = parser.tagName;
            openTag(parser);
            parser.tagName = selfClosing;
            closeTag(parser);
          } else {
            strictFail(parser, "Forward-slash in opening tag not followed by >");
            parser.state = S.ATTRIB;
          }
          continue;
        case S.ATTRIB:
          // haven't read the attribute name yet.
          if (is(whitespace, c)) continue;
          else if (c === ">") openTag(parser);
          // self-closing tag that has attributes or a space before "/"
          else if (c === "/") parser.state = S.OPEN_TAG_SLASH;
          else if (is(nameStart, c)) {
            parser.attribName = c;
            parser.attribValue = "";
            parser.state = S.ATTRIB_NAME;
          } else strictFail(parser, "Invalid attribute name");
          continue;
        case S.ATTRIB_NAME:
          if (c === "=") parser.state = S.ATTRIB_VALUE;
          else if (c === ">") {
            // <a foo> : same treatment as a valueless attribute that is
            // followed by whitespace (see ATTRIB_NAME_SAW_WHITE)
            strictFail(parser, "Attribute without value");
            parser.tag.attributes[parser.attribName] = "";
            emitNode(parser, "onattribute", { name : parser.attribName, value : "" });
            parser.attribName = parser.attribValue = "";
            openTag(parser);
          }
          else if (is(whitespace, c)) parser.state = S.ATTRIB_NAME_SAW_WHITE;
          else if (is(nameBody, c)) parser.attribName += c;
          else strictFail(parser, "Invalid attribute name");
          continue;
        case S.ATTRIB_NAME_SAW_WHITE:
          if (c === "=") parser.state = S.ATTRIB_VALUE;
          else if (is(whitespace, c)) continue;
          else {
            strictFail(parser, "Attribute without value");
            parser.tag.attributes[parser.attribName] = "";
            parser.attribValue = "";
            emitNode(parser, "onattribute", { name : parser.attribName, value : "" });
            parser.attribName = "";
            if (c === ">") openTag(parser);
            else if (is(nameStart, c)) {
              parser.attribName = c;
              parser.state = S.ATTRIB_NAME;
            } else {
              strictFail(parser, "Invalid attribute name");
              parser.state = S.ATTRIB;
            }
          }
          continue;
        case S.ATTRIB_VALUE:
          // whitespace is allowed between "=" and the value
          if (is(whitespace, c)) continue;
          else if (is(quote, c)) {
            parser.q = c;
            parser.state = S.ATTRIB_VALUE_QUOTED;
          } else {
            strictFail(parser, "Unquoted attribute value");
            parser.state = S.ATTRIB_VALUE_UNQUOTED;
            parser.attribValue = c;
          }
          continue;
        case S.ATTRIB_VALUE_QUOTED:
          if (c !== parser.q) {
            if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_Q;
            else parser.attribValue += c;
            continue;
          }
          parser.tag.attributes[parser.attribName] = parser.attribValue;
          emitNode(parser, "onattribute", {
            name:parser.attribName, value:parser.attribValue});
          parser.attribName = parser.attribValue = "";
          parser.q = "";
          parser.state = S.ATTRIB;
          continue;
        case S.ATTRIB_VALUE_UNQUOTED:
          if (not(whitespace+">",c)) {
            if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_U;
            else parser.attribValue += c;
            continue;
          }
          // record on the tag as well, as the quoted branch does
          parser.tag.attributes[parser.attribName] = parser.attribValue;
          emitNode(parser, "onattribute", {
            name:parser.attribName, value:parser.attribValue});
          parser.attribName = parser.attribValue = "";
          if (c === ">") openTag(parser);
          else parser.state = S.ATTRIB;
          continue;
        case S.CLOSE_TAG:
          if (!parser.tagName) {
            if (is(whitespace, c)) continue;
            else if (not(nameStart, c)) strictFail(parser,
              "Invalid tagname in closing tag.");
            else parser.tagName = c;
          }
          else if (c === ">") closeTag(parser);
          else if (is(nameBody, c)) parser.tagName += c;
          else {
            if (not(whitespace, c)) strictFail(parser,
              "Invalid tagname in closing tag");
            parser.state = S.CLOSE_TAG_SAW_WHITE;
          }
          continue;
        case S.CLOSE_TAG_SAW_WHITE:
          if (is(whitespace, c)) continue;
          if (c === ">") closeTag(parser);
          else strictFail(parser, "Invalid characters in closing tag");
          continue;
        case S.TEXT_ENTITY:
        case S.ATTRIB_VALUE_ENTITY_Q:
        case S.ATTRIB_VALUE_ENTITY_U:
          // pick the buffer the entity belongs to and the state to resume
          if (parser.state === S.TEXT_ENTITY) {
            returnState = S.TEXT;
            buffer = "textNode";
          } else if (parser.state === S.ATTRIB_VALUE_ENTITY_Q) {
            returnState = S.ATTRIB_VALUE_QUOTED;
            buffer = "attribValue";
          } else {
            returnState = S.ATTRIB_VALUE_UNQUOTED;
            buffer = "attribValue";
          }
          if (c === ";") {
            parser[buffer] += parseEntity(parser);
            parser.entity = "";
            parser.state = returnState;
          }
          else if (is(entity, c)) parser.entity += c;
          else {
            strictFail(parser, "Invalid character entity");
            parser[buffer] += "&" + parser.entity;
            parser.entity = "";
            parser.state = returnState;
            // `c` is not part of the entity. Step back so the resumed state
            // sees it on the next iteration instead of it being dropped
            // (it may be "<", a closing quote, or plain text).
            i --;
            parser.position --;
            parser.line = prevLine;
            parser.column = prevColumn;
          }
          continue;
        default:
          throw new Error("Unknown state: " + parser.state);
      }
    }
    return parser;
  }