sax.js 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534
  // Public surface of the module: `sax` aliases the CommonJS `exports` object.
  var sax = exports;

  // Convenience factory so callers can write sax.parser(strict, opt)
  // instead of using `new` themselves.
  sax.parser = function (strict, opt) {
    return new SAXParser(strict, opt);
  };

  // The constructor itself is exported as well.
  sax.SAXParser = SAXParser;
  /**
   * Holds all state for one parse. Also invoked via SAXParser.call(parser, ...)
   * by end() to reset an existing parser for reuse.
   *
   * @param {*} strict Truthy to report well-formedness problems as errors.
   * @param {Object} [opt] Options; this file reads `lowercasetags`, `trim`
   *   and `normalize`.
   */
  function SAXParser (strict, opt) {
    var self = this;

    // Every text accumulator starts out as the empty string.
    [ "sgmlDecl", "cdata", "q", "attribValue", "attribName", "entity",
      "procInstBody", "procInstName", "doctype", "tagName", "textNode",
      "comment", "c"
    ].forEach(function (field) {
      self[field] = "";
    });

    self.opt = opt || {};
    // Name of the String method used to normalize tag names in non-strict mode.
    self.tagCase = self.opt.lowercasetags ? "toLowerCase" : "toUpperCase";
    self.tags = [];

    self.sawRoot = false;
    self.closedRoot = false;
    self.closed = false;

    self.error = null;
    self.tag = null;

    self.strict = Boolean(strict);
    self.state = S.BEGIN;
    // Per-parser entity table that inherits the module-wide defaults.
    self.ENTITIES = Object.create(sax.ENTITIES);

    // just for error reporting
    self.column = 0;
    self.line = 0;
    self.position = 0;

    emit(self, "onready");
  }
  // Instance methods shared by every parser.
  SAXParser.prototype = {
    // Feed a chunk of text to the parser.
    write: write,

    // Clear a stored error so that write() can be called again.
    resume: function () {
      this.error = null;
      return this;
    },

    // Signal the end of the document; implemented as write(null).
    close: function () {
      return this.write(null);
    }
  };
  27. // character classes and tokens
  // Character classes are plain strings tested with is()/not().
  // XML whitespace is space, tab, CR and LF; "\r" must be included so that
  // CRLF input is not treated as text.
  var whitespace = "\r\n\t ",
      // this really needs to be replaced with character classes.
      // XML allows all manner of ridiculous numbers and digits.
      number = "0123456789",
      letter = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ",
      // (Letter | '_' | ':')
      nameStart = letter + "_:",
      nameBody = nameStart + number + "-.",
      quote = "'\"",
      // characters allowed between "&" and ";"
      entity = number + letter + "#",
      // literal tokens recognized after "<!"
      CDATA = "[CDATA[",
      DOCTYPE = "DOCTYPE";
  // True when `c` occurs somewhere in the character-class string `charclass`.
  function is (charclass, c) {
    var index = charclass.indexOf(c);
    return index >= 0;
  }
  // True when `c` does not occur in the character-class string `charclass`.
  function not (charclass, c) {
    return charclass.indexOf(c) === -1;
  }
  // Tokenizer states. Each name is numbered consecutively from 0 in the order
  // listed; S is used as the running counter while the table is built.
  var S = 0;
  sax.STATE = {};
  [ "BEGIN"                 // nothing but whitespace seen so far
  , "TEXT"                  // general stuff
  , "TEXT_ENTITY"           // &amp and such.
  , "OPEN_WAKA"             // <
  , "SGML_DECL"             // <!BLARG
  , "SGML_DECL_QUOTED"      // <!BLARG foo "bar
  , "DOCTYPE"               // <!DOCTYPE
  , "DOCTYPE_QUOTED"        // <!DOCTYPE "//blah
  , "DOCTYPE_DTD"           // <!DOCTYPE "//blah" [ ...
  , "DOCTYPE_DTD_QUOTED"    // <!DOCTYPE "//blah" [ "foo
  , "COMMENT_STARTING"      // <!-
  , "COMMENT"               // <!--
  , "COMMENT_ENDING"        // <!-- blah -
  , "COMMENT_ENDED"         // <!-- blah --
  , "CDATA"                 // <![CDATA[ something
  , "CDATA_ENDING"          // ]
  , "CDATA_ENDING_2"        // ]]
  , "PROC_INST"             // <?hi
  , "PROC_INST_BODY"        // <?hi there
  , "PROC_INST_QUOTED"      // <?hi "there
  , "PROC_INST_ENDING"      // <?hi there ?
  , "OPEN_TAG"              // <strong
  , "OPEN_TAG_SLASH"        // <strong /
  , "ATTRIB"                // <a
  , "ATTRIB_NAME"           // <a foo
  , "ATTRIB_NAME_SAW_WHITE" // <a foo _
  , "ATTRIB_VALUE"          // <a foo=
  , "ATTRIB_VALUE_QUOTED"   // <a foo="bar
  , "ATTRIB_VALUE_UNQUOTED" // <a foo=bar
  , "ATTRIB_VALUE_ENTITY_Q" // <foo bar="&quot;"
  , "ATTRIB_VALUE_ENTITY_U" // <foo bar=&quot;
  , "CLOSE_TAG"             // </a
  , "CLOSE_TAG_SAW_WHITE"   // </a   >
  ].forEach(function (stateName) {
    sax.STATE[stateName] = S++;
  });
  // Default named entities. Each parser's own ENTITIES table is created with
  // this object as its prototype.
  sax.ENTITIES = {
    "apos": "'",
    "quot": '"',
    "amp": "&",
    "gt": ">",
    "lt": "<"
  };
  // Add the reverse mapping (state number -> state name) for debugging.
  // Iterate over a snapshot of the keys: the loop adds properties to the very
  // object it walks, and whether a for-in visits properties added during
  // enumeration is not guaranteed.
  Object.keys(sax.STATE).forEach(function (stateName) {
    sax.STATE[sax.STATE[stateName]] = stateName;
  });
  // shorthand
  S = sax.STATE;
  // Event names, listed for discoverability. The parser looks a handler up
  // under "on" + name (e.g. "ontext").
  sax.EVENTS = [
    "text",
    "processinginstruction",
    "sgmldeclaration",
    "doctype",
    "comment",
    "attribute",
    "opentag",
    "closetag",
    "cdata",
    "error",
    "end",
    "ready"
  ];
  // Invoke the handler stored on the parser under `event`, if there is one.
  // The handler is called as a method, so `this` inside it is the parser.
  function emit (parser, event, data) {
    if (!parser[event]) return;
    parser[event](data);
  }
  // Emit a node event, first flushing any buffered text so that the text
  // event is delivered before the node that follows it.
  function emitNode (parser, nodeType, data) {
    var hasPendingText = Boolean(parser.textNode);
    if (hasPendingText) {
      closeText(parser);
    }
    emit(parser, nodeType, data);
  }
  // Flush the buffered text: apply the text options, emit "ontext" if
  // anything is left, then empty the buffer.
  function closeText (parser) {
    var text = textopts(parser.opt, parser.textNode);
    // Store the processed text back first so a handler that inspects
    // parser.textNode during the event sees the same value it was given.
    parser.textNode = text;
    if (text) {
      emit(parser, "ontext", text);
    }
    parser.textNode = "";
  }
  // Apply the text-related options to a string: `trim` strips leading and
  // trailing whitespace, `normalize` collapses each whitespace run to one space.
  function textopts (opt, text) {
    var result = text;
    if (opt.trim) {
      result = result.trim();
    }
    return opt.normalize ? result.replace(/\s+/g, " ") : result;
  }
  /**
   * Record and report a parse error. Buffered text is flushed first, the
   * message is extended with the current line, column and character, and the
   * resulting Error is stored on parser.error and passed to "onerror".
   *
   * @param {SAXParser} parser
   * @param {string} er Description of the problem.
   * @returns {SAXParser} the parser.
   */
  function error (parser, er) {
    closeText(parser);
    var message = er +
      "\nLine: " + parser.line +
      "\nColumn: " + parser.column +
      "\nChar: " + parser.c;
    var failure = new Error(message);
    parser.error = failure;
    emit(parser, "onerror", failure);
    return parser;
  }
  /**
   * Finish the document: report an error unless the parser stopped in the
   * TEXT state, flush buffered text, emit "onend", then reset the parser by
   * re-running the constructor on it.
   *
   * @param {SAXParser} parser
   * @returns {SAXParser} the (reset) parser.
   */
  function end (parser) {
    var stoppedInText = parser.state === S.TEXT;
    if (!stoppedInText) {
      error(parser, "Unexpected end");
    }
    closeText(parser);
    parser.c = "";
    // `closed` is true only while the "onend" handler runs; the constructor
    // call below sets it back to false.
    parser.closed = true;
    emit(parser, "onend");
    SAXParser.call(parser, parser.strict, parser.opt);
    return parser;
  }
  // Report `message` as an error, but only when the parser is in strict mode;
  // in non-strict mode the problem is silently tolerated.
  function strictFail (parser, message) {
    if (!parser.strict) return;
    error(parser, message);
  }
  // Start a new tag record from parser.tagName. In non-strict mode the name
  // is first case-normalized with the method named by parser.tagCase.
  function newTag (parser) {
    var name = parser.tagName;
    if (!parser.strict) {
      name = name[parser.tagCase]();
      parser.tagName = name;
    }
    parser.tag = { name: name, attributes: {} };
  }
  // Finish an opening tag: push it on the stack of open elements, emit
  // "onopentag", clear the per-tag scratch fields and go back to TEXT.
  function openTag (parser) {
    var opened = parser.tag;
    parser.sawRoot = true;
    parser.tags.push(opened);
    emitNode(parser, "onopentag", opened);

    parser.tag = null;
    parser.attribValue = "";
    parser.attribName = "";
    parser.tagName = "";
    parser.state = S.TEXT;
  }
  /**
   * Handle a complete closing tag whose name is in parser.tagName.
   *
   * Open elements are popped, emitting "onclosetag" for each, until the
   * element named by the closing tag has been closed. Elements closed
   * implicitly on the way are reported through strictFail().
   *
   * A closing tag that matches no open element is reported through
   * strictFail() and kept as literal text instead of unwinding the whole
   * stack and throwing.
   *
   * @param {SAXParser} parser
   */
  function closeTag (parser) {
    if (!parser.tagName) {
      strictFail(parser, "Weird empty close tag.");
      parser.textNode += "</>";
      parser.state = S.TEXT;
      return;
    }

    var rawName = parser.tagName;
    var closeTo = parser.strict ? rawName : rawName[parser.tagCase]();

    // Find the innermost open element with this name before popping
    // anything, so <a><b></c> does not close <b> and <a>.
    var target = parser.tags.length - 1;
    while (target >= 0 && parser.tags[target].name !== closeTo) target--;

    if (target < 0) {
      strictFail(parser, "Unmatched closing tag: " + rawName);
      parser.textNode += "</" + rawName + ">";
      parser.tagName = "";
      parser.state = S.TEXT;
      return;
    }

    while (parser.tags.length > target) {
      var close = parser.tags.pop();
      if (closeTo !== close.name) strictFail(parser, "Unexpected close tag.");
      parser.tag = close;
      parser.tagName = close.name;
      emitNode(parser, "onclosetag", parser.tagName);
    }

    if (parser.tags.length === 0) parser.closedRoot = true;
    parser.tagName = parser.attribValue = parser.attribName = "";
    parser.tag = null;
    parser.state = S.TEXT;
  }
  /**
   * Expand the entity whose name (the text between "&" and ";") is stored in
   * parser.entity.
   *
   * Named entities are looked up case-insensitively in parser.ENTITIES.
   * Numeric references ("#65", "#x41") are decoded, with leading zeros
   * allowed and code points above U+FFFF returned as a surrogate pair.
   * Anything else is reported through strictFail() and returned unchanged
   * as "&name;".
   *
   * @param {SAXParser} parser
   * @returns {string} the replacement text.
   */
  function parseEntity (parser) {
    var entity = parser.entity.toLowerCase();

    // Only accept real string replacements: ENTITIES ultimately inherits
    // from Object.prototype, so "&constructor;" would otherwise resolve to
    // a function.
    var named = parser.ENTITIES[entity];
    if (named && typeof named === "string") return named;

    var radix = 0, digits = "";
    if (entity.charAt(0) === "#") {
      if (entity.charAt(1) === "x") {
        radix = 16;
        digits = entity.slice(2);
      } else {
        radix = 10;
        digits = entity.slice(1);
      }
    }

    // Validate the digits directly (rather than comparing against the
    // re-serialized number) so leading zeros such as "#x0A" are accepted
    // and an empty name is rejected.
    var wellFormed = false;
    if (radix === 16) wellFormed = /^[0-9a-f]+$/.test(digits);
    else if (radix === 10) wellFormed = /^[0-9]+$/.test(digits);

    if (wellFormed) {
      var num = parseInt(digits, radix);
      if (num <= 0xFFFF) return String.fromCharCode(num);
      if (num <= 0x10FFFF) {
        // String.fromCharCode works in UTF-16 code units, so characters
        // beyond the BMP have to be written as a surrogate pair.
        num -= 0x10000;
        return String.fromCharCode(0xD800 + (num >> 10), 0xDC00 + (num & 0x3FF));
      }
    }

    strictFail(parser, "Invalid character entity");
    return "&" + parser.entity + ";";
  }
  /**
   * Feed one chunk of text to the parser (`this`). The chunk is consumed one
   * character at a time by a state machine; all state lives on the parser,
   * so a document may be split across any number of calls.
   *
   * @param {?string} chunk Text to parse, or `null` to end the document.
   * @returns {SAXParser} the parser, for chaining.
   * @throws the stored parser.error when an earlier error has not been
   *   cleared with resume(); an Error when parser.state is not a known state.
   */
  function write (chunk) {
    var parser = this;
    if (this.error) throw this.error;
    if (parser.closed) {
      return error(parser,
        "Cannot write after close. Assign an onready handler.");
    }
    if (chunk === null) return end(parser);

    var i = 0, c = "";
    var columnBefore, returnState, buffer, selfClosingName;
    while ((parser.c = c = chunk.charAt(i++))) {
      // Remembered so an entity state can hand the character back (below).
      columnBefore = parser.column;
      parser.position++;
      if (c === "\n") {
        parser.line++;
        parser.column = 0;
      } else parser.column++;

      switch (parser.state) {
        case S.BEGIN:
          if (c === "<") parser.state = S.OPEN_WAKA;
          else if (not(whitespace, c)) {
            // have to process this as a text node.
            // weird, but happens.
            strictFail(parser, "Non-whitespace before first tag.");
            parser.textNode = c;
            // Must be set on the parser; a bare `state` would only create
            // a global and leave the parser stuck in BEGIN.
            parser.state = S.TEXT;
          }
          continue;

        case S.TEXT:
          if (c === "<") parser.state = S.OPEN_WAKA;
          else {
            if (not(whitespace, c) && (!parser.sawRoot || parser.closedRoot)) {
              strictFail(parser, "Text data outside of root node.");
            }
            // The character is still kept, as the BEGIN state does for the
            // first stray character.
            if (c === "&") parser.state = S.TEXT_ENTITY;
            else parser.textNode += c;
          }
          continue;

        case S.OPEN_WAKA:
          // either a /, ?, !, or text is coming next.
          if (c === "!") {
            parser.state = S.SGML_DECL;
            parser.sgmlDecl = "";
          } else if (is(whitespace, c)) {
            // wait for it...
          } else if (is(nameStart, c)) {
            parser.state = S.OPEN_TAG;
            parser.tagName = c;
          } else if (c === "/") {
            parser.state = S.CLOSE_TAG;
            parser.tagName = "";
          } else if (c === "?") {
            parser.state = S.PROC_INST;
            parser.procInstName = parser.procInstBody = "";
          } else {
            strictFail(parser, "Unencoded <");
            parser.textNode += "<" + c;
            parser.state = S.TEXT;
          }
          continue;

        case S.SGML_DECL:
          if ((parser.sgmlDecl + c).toUpperCase() === CDATA) {
            parser.state = S.CDATA;
            parser.sgmlDecl = "";
            parser.cdata = "";
          } else if (parser.sgmlDecl + c === "--") {
            parser.state = S.COMMENT;
            parser.comment = "";
            parser.sgmlDecl = "";
          } else if ((parser.sgmlDecl + c).toUpperCase() === DOCTYPE) {
            parser.state = S.DOCTYPE;
            if (parser.doctype || parser.sawRoot) strictFail(parser,
              "Inappropriately located doctype declaration");
            parser.doctype = "";
            parser.sgmlDecl = "";
          } else if (c === ">") {
            emitNode(parser, "onsgmldeclaration", parser.sgmlDecl);
            parser.sgmlDecl = "";
            parser.state = S.TEXT;
          } else if (is(quote, c)) {
            parser.state = S.SGML_DECL_QUOTED;
            // Remember which quote opened the string so that
            // SGML_DECL_QUOTED can recognize the matching one.
            parser.q = c;
            parser.sgmlDecl += c;
          } else parser.sgmlDecl += c;
          continue;

        case S.SGML_DECL_QUOTED:
          if (c === parser.q) {
            parser.state = S.SGML_DECL;
            parser.q = "";
          }
          parser.sgmlDecl += c;
          continue;

        case S.DOCTYPE:
          if (c === ">") {
            parser.state = S.TEXT;
            emitNode(parser, "ondoctype", parser.doctype);
            parser.doctype = true; // just remember that we saw it.
          } else {
            parser.doctype += c;
            if (c === "[") parser.state = S.DOCTYPE_DTD;
            else if (is(quote, c)) {
              parser.state = S.DOCTYPE_QUOTED;
              parser.q = c;
            }
          }
          continue;

        case S.DOCTYPE_QUOTED:
          parser.doctype += c;
          if (c === parser.q) {
            parser.q = "";
            parser.state = S.DOCTYPE;
          }
          continue;

        case S.DOCTYPE_DTD:
          parser.doctype += c;
          if (c === "]") parser.state = S.DOCTYPE;
          else if (is(quote, c)) {
            parser.state = S.DOCTYPE_DTD_QUOTED;
            parser.q = c;
          }
          continue;

        case S.DOCTYPE_DTD_QUOTED:
          parser.doctype += c;
          if (c === parser.q) {
            parser.state = S.DOCTYPE_DTD;
            parser.q = "";
          }
          continue;

        case S.COMMENT:
          if (c === "-") parser.state = S.COMMENT_ENDING;
          else parser.comment += c;
          continue;

        case S.COMMENT_ENDING:
          if (c === "-") {
            parser.state = S.COMMENT_ENDED;
            parser.comment = textopts(parser.opt, parser.comment);
            if (parser.comment) emitNode(parser, "oncomment", parser.comment);
            parser.comment = "";
          } else {
            // A single "-" is ordinary comment text (only "--" is special),
            // so keep it and go back to reading the comment body.
            parser.comment += "-" + c;
            parser.state = S.COMMENT;
          }
          continue;

        case S.COMMENT_ENDED:
          if (c !== ">") strictFail(parser, "Malformed comment");
          else parser.state = S.TEXT;
          continue;

        case S.CDATA:
          if (c === "]") parser.state = S.CDATA_ENDING;
          else parser.cdata += c;
          continue;

        case S.CDATA_ENDING:
          if (c === "]") parser.state = S.CDATA_ENDING_2;
          else {
            parser.cdata += "]" + c;
            parser.state = S.CDATA;
          }
          continue;

        case S.CDATA_ENDING_2:
          if (c === ">") {
            emitNode(parser, "oncdata", parser.cdata);
            parser.cdata = "";
            parser.state = S.TEXT;
          } else if (c === "]") {
            // "]]]": the oldest bracket is content, the last two may still
            // turn out to be the terminator.
            parser.cdata += "]";
          } else {
            parser.cdata += "]]" + c;
            parser.state = S.CDATA;
          }
          continue;

        case S.PROC_INST:
          if (c === "?") parser.state = S.PROC_INST_ENDING;
          else if (is(whitespace, c)) parser.state = S.PROC_INST_BODY;
          else parser.procInstName += c;
          continue;

        case S.PROC_INST_BODY:
          if (!parser.procInstBody && is(whitespace, c)) continue;
          else if (c === "?") parser.state = S.PROC_INST_ENDING;
          else if (is(quote, c)) {
            parser.state = S.PROC_INST_QUOTED;
            parser.q = c;
            parser.procInstBody += c;
          } else parser.procInstBody += c;
          continue;

        case S.PROC_INST_ENDING:
          if (c === ">") {
            emitNode(parser, "onprocessinginstruction", {
              name : parser.procInstName,
              body : parser.procInstBody
            });
            parser.procInstName = parser.procInstBody = "";
            parser.state = S.TEXT;
          } else if (c === "?") {
            // "??": the first "?" is content, the second may still be the
            // start of "?>".
            parser.procInstBody += "?";
          } else {
            parser.procInstBody += "?" + c;
            parser.state = S.PROC_INST_BODY;
          }
          continue;

        case S.PROC_INST_QUOTED:
          parser.procInstBody += c;
          if (c === parser.q) {
            parser.state = S.PROC_INST_BODY;
            parser.q = "";
          }
          continue;

        case S.OPEN_TAG:
          if (is(nameBody, c)) parser.tagName += c;
          else {
            newTag(parser);
            if (c === ">") openTag(parser);
            else if (c === "/") parser.state = S.OPEN_TAG_SLASH;
            else {
              if (not(whitespace, c)) strictFail(
                parser, "Invalid character in tag name");
              parser.state = S.ATTRIB;
            }
          }
          continue;

        case S.OPEN_TAG_SLASH:
          if (c === ">") {
            // openTag() clears parser.tagName, but closeTag() needs it to
            // know which element to close.
            selfClosingName = parser.tagName;
            openTag(parser);
            parser.tagName = selfClosingName;
            closeTag(parser);
          } else {
            strictFail(parser, "Forward-slash in opening tag not followed by >");
            parser.state = S.ATTRIB;
          }
          continue;

        case S.ATTRIB:
          // haven't read the attribute name yet.
          if (is(whitespace, c)) continue;
          else if (c === ">") openTag(parser);
          // self-closing tag that has attributes: <a b="c"/>
          else if (c === "/") parser.state = S.OPEN_TAG_SLASH;
          else if (is(nameStart, c)) {
            parser.attribName = c;
            parser.attribValue = "";
            parser.state = S.ATTRIB_NAME;
          } else strictFail(parser, "Invalid attribute name");
          continue;

        case S.ATTRIB_NAME:
          if (c === "=") parser.state = S.ATTRIB_VALUE;
          else if (is(whitespace, c)) parser.state = S.ATTRIB_NAME_SAW_WHITE;
          else if (is(nameBody, c)) parser.attribName += c;
          else strictFail(parser, "Invalid attribute name");
          continue;

        case S.ATTRIB_NAME_SAW_WHITE:
          if (c === "=") parser.state = S.ATTRIB_VALUE;
          else if (is(whitespace, c)) continue;
          else {
            strictFail(parser, "Attribute without value");
            parser.tag.attributes[parser.attribName] = "";
            parser.attribValue = "";
            emitNode(parser, "onattribute", { name : parser.attribName, value : "" });
            parser.attribName = "";
            if (c === ">") openTag(parser);
            else if (c === "/") parser.state = S.OPEN_TAG_SLASH;
            else if (is(nameStart, c)) {
              parser.attribName = c;
              parser.state = S.ATTRIB_NAME;
            } else {
              strictFail(parser, "Invalid attribute name");
              parser.state = S.ATTRIB;
            }
          }
          continue;

        case S.ATTRIB_VALUE:
          if (is(quote, c)) {
            parser.q = c;
            parser.state = S.ATTRIB_VALUE_QUOTED;
          } else {
            strictFail(parser, "Unquoted attribute value");
            parser.state = S.ATTRIB_VALUE_UNQUOTED;
            parser.attribValue = c;
          }
          continue;

        case S.ATTRIB_VALUE_QUOTED:
          if (c !== parser.q) {
            if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_Q;
            else parser.attribValue += c;
            continue;
          }
          parser.tag.attributes[parser.attribName] = parser.attribValue;
          emitNode(parser, "onattribute", {
            name : parser.attribName, value : parser.attribValue });
          parser.attribName = parser.attribValue = "";
          parser.q = "";
          parser.state = S.ATTRIB;
          continue;

        case S.ATTRIB_VALUE_UNQUOTED:
          if (not(whitespace + ">", c)) {
            if (c === "&") parser.state = S.ATTRIB_VALUE_ENTITY_U;
            else parser.attribValue += c;
            continue;
          }
          // Record the value on the tag, exactly as the quoted case does.
          parser.tag.attributes[parser.attribName] = parser.attribValue;
          emitNode(parser, "onattribute", {
            name : parser.attribName, value : parser.attribValue });
          parser.attribName = parser.attribValue = "";
          if (c === ">") openTag(parser);
          else parser.state = S.ATTRIB;
          continue;

        case S.CLOSE_TAG:
          if (!parser.tagName) {
            if (is(whitespace, c)) continue;
            else if (not(nameStart, c)) strictFail(parser,
              "Invalid tagname in closing tag.");
            else parser.tagName = c;
          }
          else if (c === ">") closeTag(parser);
          else if (is(nameBody, c)) parser.tagName += c;
          else {
            if (not(whitespace, c)) strictFail(parser,
              "Invalid tagname in closing tag");
            parser.state = S.CLOSE_TAG_SAW_WHITE;
          }
          continue;

        case S.CLOSE_TAG_SAW_WHITE:
          if (is(whitespace, c)) continue;
          if (c === ">") closeTag(parser);
          else strictFail(parser, "Invalid characters in closing tag");
          continue;

        case S.TEXT_ENTITY:
        case S.ATTRIB_VALUE_ENTITY_Q:
        case S.ATTRIB_VALUE_ENTITY_U:
          // Work out where the expansion goes and which state to resume.
          if (parser.state === S.TEXT_ENTITY) {
            returnState = S.TEXT;
            buffer = "textNode";
          } else if (parser.state === S.ATTRIB_VALUE_ENTITY_Q) {
            returnState = S.ATTRIB_VALUE_QUOTED;
            buffer = "attribValue";
          } else {
            returnState = S.ATTRIB_VALUE_UNQUOTED;
            buffer = "attribValue";
          }
          if (c === ";") {
            parser[buffer] += parseEntity(parser);
            parser.entity = "";
            parser.state = returnState;
          }
          else if (is(entity, c)) parser.entity += c;
          else {
            strictFail(parser, "Invalid character entity");
            parser[buffer] += "&" + parser.entity;
            parser.entity = "";
            parser.state = returnState;
            // Not an entity after all. Hand the current character back so
            // the resumed state processes it (it may be a closing quote,
            // a "<", or plain text) instead of silently dropping it.
            i--;
            parser.position--;
            if (c === "\n") parser.line--;
            parser.column = columnBefore;
          }
          continue;

        default:
          throw new Error("Unknown state: " + parser.state);
      }
    }
    return parser;
  }