sux.py 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638
  1. # -*- test-case-name: twisted.web.test.test_xml -*-
  2. #
  3. # Copyright (c) Twisted Matrix Laboratories.
  4. # See LICENSE for details.
"""
*S*mall, *U*ncomplicated *X*ML.

This is a very simple implementation of XML/HTML as a network
protocol.  It is not at all clever.  Its main features are that it
does not:

  - support namespaces
  - mung mnemonic entity references
  - validate
  - perform *any* external actions (such as fetching URLs or writing files)
    under *any* circumstances

It does, however, have lots and lots of horrible hacks for supporting broken
HTML (as an option; they're not on by default).
"""
  18. from __future__ import print_function
  19. from twisted.internet.protocol import Protocol
  20. from twisted.python.reflect import prefixedMethodNames
  21. # Elements of the three-tuples in the state table.
  22. BEGIN_HANDLER = 0
  23. DO_HANDLER = 1
  24. END_HANDLER = 2
  25. identChars = '.-_:'
  26. lenientIdentChars = identChars + ';+#/%~'
  27. def nop(*args, **kw):
  28. "Do nothing."
  29. def unionlist(*args):
  30. l = []
  31. for x in args:
  32. l.extend(x)
  33. d = dict([(x, 1) for x in l])
  34. return d.keys()
  35. def zipfndict(*args, **kw):
  36. default = kw.get('default', nop)
  37. d = {}
  38. for key in unionlist(*[fndict.keys() for fndict in args]):
  39. d[key] = tuple([x.get(key, default) for x in args])
  40. return d
  41. def prefixedMethodClassDict(clazz, prefix):
  42. return dict([(name, getattr(clazz, prefix + name)) for name in prefixedMethodNames(clazz, prefix)])
  43. def prefixedMethodObjDict(obj, prefix):
  44. return dict([(name, getattr(obj, prefix + name)) for name in prefixedMethodNames(obj.__class__, prefix)])
  45. class ParseError(Exception):
  46. def __init__(self, filename, line, col, message):
  47. self.filename = filename
  48. self.line = line
  49. self.col = col
  50. self.message = message
  51. def __str__(self):
  52. return "%s:%s:%s: %s" % (self.filename, self.line, self.col,
  53. self.message)
  54. class XMLParser(Protocol):
  55. state = None
  56. encodings = None
  57. filename = "<xml />"
  58. beExtremelyLenient = 0
  59. _prepend = None
  60. # _leadingBodyData will sometimes be set before switching to the
  61. # 'bodydata' state, when we "accidentally" read a byte of bodydata
  62. # in a different state.
  63. _leadingBodyData = None
  64. def connectionMade(self):
  65. self.lineno = 1
  66. self.colno = 0
  67. self.encodings = []
  68. def saveMark(self):
  69. '''Get the line number and column of the last character parsed'''
  70. # This gets replaced during dataReceived, restored afterwards
  71. return (self.lineno, self.colno)
  72. def _parseError(self, message):
  73. raise ParseError(*((self.filename,)+self.saveMark()+(message,)))
  74. def _buildStateTable(self):
  75. '''Return a dictionary of begin, do, end state function tuples'''
  76. # _buildStateTable leaves something to be desired but it does what it
  77. # does.. probably slowly, so I'm doing some evil caching so it doesn't
  78. # get called more than once per class.
  79. stateTable = getattr(self.__class__, '__stateTable', None)
  80. if stateTable is None:
  81. stateTable = self.__class__.__stateTable = zipfndict(
  82. *[prefixedMethodObjDict(self, prefix)
  83. for prefix in ('begin_', 'do_', 'end_')])
  84. return stateTable
  85. def _decode(self, data):
  86. if 'UTF-16' in self.encodings or 'UCS-2' in self.encodings:
  87. assert not len(data) & 1, 'UTF-16 must come in pairs for now'
  88. if self._prepend:
  89. data = self._prepend + data
  90. for encoding in self.encodings:
  91. data = unicode(data, encoding)
  92. return data
  93. def maybeBodyData(self):
  94. if self.endtag:
  95. return 'bodydata'
  96. # Get ready for fun! We're going to allow
  97. # <script>if (foo < bar)</script> to work!
  98. # We do this by making everything between <script> and
  99. # </script> a Text
  100. # BUT <script src="foo"> will be special-cased to do regular,
  101. # lenient behavior, because those may not have </script>
  102. # -radix
  103. if (self.tagName == 'script' and 'src' not in self.tagAttributes):
  104. # we do this ourselves rather than having begin_waitforendscript
  105. # because that can get called multiple times and we don't want
  106. # bodydata to get reset other than the first time.
  107. self.begin_bodydata(None)
  108. return 'waitforendscript'
  109. return 'bodydata'
  110. def dataReceived(self, data):
  111. stateTable = self._buildStateTable()
  112. if not self.state:
  113. # all UTF-16 starts with this string
  114. if data.startswith('\xff\xfe'):
  115. self._prepend = '\xff\xfe'
  116. self.encodings.append('UTF-16')
  117. data = data[2:]
  118. elif data.startswith('\xfe\xff'):
  119. self._prepend = '\xfe\xff'
  120. self.encodings.append('UTF-16')
  121. data = data[2:]
  122. self.state = 'begin'
  123. if self.encodings:
  124. data = self._decode(data)
  125. # bring state, lineno, colno into local scope
  126. lineno, colno = self.lineno, self.colno
  127. curState = self.state
  128. # replace saveMark with a nested scope function
  129. _saveMark = self.saveMark
  130. def saveMark():
  131. return (lineno, colno)
  132. self.saveMark = saveMark
  133. # fetch functions from the stateTable
  134. beginFn, doFn, endFn = stateTable[curState]
  135. try:
  136. for byte in data:
  137. # do newline stuff
  138. if byte == '\n':
  139. lineno += 1
  140. colno = 0
  141. else:
  142. colno += 1
  143. newState = doFn(byte)
  144. if newState is not None and newState != curState:
  145. # this is the endFn from the previous state
  146. endFn()
  147. curState = newState
  148. beginFn, doFn, endFn = stateTable[curState]
  149. beginFn(byte)
  150. finally:
  151. self.saveMark = _saveMark
  152. self.lineno, self.colno = lineno, colno
  153. # state doesn't make sense if there's an exception..
  154. self.state = curState
  155. def connectionLost(self, reason):
  156. """
  157. End the last state we were in.
  158. """
  159. stateTable = self._buildStateTable()
  160. stateTable[self.state][END_HANDLER]()
  161. # state methods
  162. def do_begin(self, byte):
  163. if byte.isspace():
  164. return
  165. if byte != '<':
  166. if self.beExtremelyLenient:
  167. self._leadingBodyData = byte
  168. return 'bodydata'
  169. self._parseError("First char of document [%r] wasn't <" % (byte,))
  170. return 'tagstart'
  171. def begin_comment(self, byte):
  172. self.commentbuf = ''
  173. def do_comment(self, byte):
  174. self.commentbuf += byte
  175. if self.commentbuf.endswith('-->'):
  176. self.gotComment(self.commentbuf[:-3])
  177. return 'bodydata'
  178. def begin_tagstart(self, byte):
  179. self.tagName = '' # name of the tag
  180. self.tagAttributes = {} # attributes of the tag
  181. self.termtag = 0 # is the tag self-terminating
  182. self.endtag = 0
  183. def do_tagstart(self, byte):
  184. if byte.isalnum() or byte in identChars:
  185. self.tagName += byte
  186. if self.tagName == '!--':
  187. return 'comment'
  188. elif byte.isspace():
  189. if self.tagName:
  190. if self.endtag:
  191. # properly strict thing to do here is probably to only
  192. # accept whitespace
  193. return 'waitforgt'
  194. return 'attrs'
  195. else:
  196. self._parseError("Whitespace before tag-name")
  197. elif byte == '>':
  198. if self.endtag:
  199. self.gotTagEnd(self.tagName)
  200. return 'bodydata'
  201. else:
  202. self.gotTagStart(self.tagName, {})
  203. return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBodyData()
  204. elif byte == '/':
  205. if self.tagName:
  206. return 'afterslash'
  207. else:
  208. self.endtag = 1
  209. elif byte in '!?':
  210. if self.tagName:
  211. if not self.beExtremelyLenient:
  212. self._parseError("Invalid character in tag-name")
  213. else:
  214. self.tagName += byte
  215. self.termtag = 1
  216. elif byte == '[':
  217. if self.tagName == '!':
  218. return 'expectcdata'
  219. else:
  220. self._parseError("Invalid '[' in tag-name")
  221. else:
  222. if self.beExtremelyLenient:
  223. self.bodydata = '<'
  224. return 'unentity'
  225. self._parseError('Invalid tag character: %r'% byte)
  226. def begin_unentity(self, byte):
  227. self.bodydata += byte
  228. def do_unentity(self, byte):
  229. self.bodydata += byte
  230. return 'bodydata'
  231. def end_unentity(self):
  232. self.gotText(self.bodydata)
  233. def begin_expectcdata(self, byte):
  234. self.cdatabuf = byte
  235. def do_expectcdata(self, byte):
  236. self.cdatabuf += byte
  237. cdb = self.cdatabuf
  238. cd = '[CDATA['
  239. if len(cd) > len(cdb):
  240. if cd.startswith(cdb):
  241. return
  242. elif self.beExtremelyLenient:
  243. ## WHAT THE CRAP!? MSWord9 generates HTML that includes these
  244. ## bizarre <![if !foo]> <![endif]> chunks, so I've gotta ignore
  245. ## 'em as best I can. this should really be a separate parse
  246. ## state but I don't even have any idea what these _are_.
  247. return 'waitforgt'
  248. else:
  249. self._parseError("Mal-formed CDATA header")
  250. if cd == cdb:
  251. self.cdatabuf = ''
  252. return 'cdata'
  253. self._parseError("Mal-formed CDATA header")
  254. def do_cdata(self, byte):
  255. self.cdatabuf += byte
  256. if self.cdatabuf.endswith("]]>"):
  257. self.cdatabuf = self.cdatabuf[:-3]
  258. return 'bodydata'
  259. def end_cdata(self):
  260. self.gotCData(self.cdatabuf)
  261. self.cdatabuf = ''
  262. def do_attrs(self, byte):
  263. if byte.isalnum() or byte in identChars:
  264. # XXX FIXME really handle !DOCTYPE at some point
  265. if self.tagName == '!DOCTYPE':
  266. return 'doctype'
  267. if self.tagName[0] in '!?':
  268. return 'waitforgt'
  269. return 'attrname'
  270. elif byte.isspace():
  271. return
  272. elif byte == '>':
  273. self.gotTagStart(self.tagName, self.tagAttributes)
  274. return (not self.beExtremelyLenient) and 'bodydata' or self.maybeBodyData()
  275. elif byte == '/':
  276. return 'afterslash'
  277. elif self.beExtremelyLenient:
  278. # discard and move on? Only case I've seen of this so far was:
  279. # <foo bar="baz"">
  280. return
  281. self._parseError("Unexpected character: %r" % byte)
  282. def begin_doctype(self, byte):
  283. self.doctype = byte
  284. def do_doctype(self, byte):
  285. if byte == '>':
  286. return 'bodydata'
  287. self.doctype += byte
  288. def end_doctype(self):
  289. self.gotDoctype(self.doctype)
  290. self.doctype = None
  291. def do_waitforgt(self, byte):
  292. if byte == '>':
  293. if self.endtag or not self.beExtremelyLenient:
  294. return 'bodydata'
  295. return self.maybeBodyData()
  296. def begin_attrname(self, byte):
  297. self.attrname = byte
  298. self._attrname_termtag = 0
  299. def do_attrname(self, byte):
  300. if byte.isalnum() or byte in identChars:
  301. self.attrname += byte
  302. return
  303. elif byte == '=':
  304. return 'beforeattrval'
  305. elif byte.isspace():
  306. return 'beforeeq'
  307. elif self.beExtremelyLenient:
  308. if byte in '"\'':
  309. return 'attrval'
  310. if byte in lenientIdentChars or byte.isalnum():
  311. self.attrname += byte
  312. return
  313. if byte == '/':
  314. self._attrname_termtag = 1
  315. return
  316. if byte == '>':
  317. self.attrval = 'True'
  318. self.tagAttributes[self.attrname] = self.attrval
  319. self.gotTagStart(self.tagName, self.tagAttributes)
  320. if self._attrname_termtag:
  321. self.gotTagEnd(self.tagName)
  322. return 'bodydata'
  323. return self.maybeBodyData()
  324. # something is really broken. let's leave this attribute where it
  325. # is and move on to the next thing
  326. return
  327. self._parseError("Invalid attribute name: %r %r" % (self.attrname, byte))
  328. def do_beforeattrval(self, byte):
  329. if byte in '"\'':
  330. return 'attrval'
  331. elif byte.isspace():
  332. return
  333. elif self.beExtremelyLenient:
  334. if byte in lenientIdentChars or byte.isalnum():
  335. return 'messyattr'
  336. if byte == '>':
  337. self.attrval = 'True'
  338. self.tagAttributes[self.attrname] = self.attrval
  339. self.gotTagStart(self.tagName, self.tagAttributes)
  340. return self.maybeBodyData()
  341. if byte == '\\':
  342. # I saw this in actual HTML once:
  343. # <font size=\"3\"><sup>SM</sup></font>
  344. return
  345. self._parseError("Invalid initial attribute value: %r; Attribute values must be quoted." % byte)
  346. attrname = ''
  347. attrval = ''
  348. def begin_beforeeq(self,byte):
  349. self._beforeeq_termtag = 0
  350. def do_beforeeq(self, byte):
  351. if byte == '=':
  352. return 'beforeattrval'
  353. elif byte.isspace():
  354. return
  355. elif self.beExtremelyLenient:
  356. if byte.isalnum() or byte in identChars:
  357. self.attrval = 'True'
  358. self.tagAttributes[self.attrname] = self.attrval
  359. return 'attrname'
  360. elif byte == '>':
  361. self.attrval = 'True'
  362. self.tagAttributes[self.attrname] = self.attrval
  363. self.gotTagStart(self.tagName, self.tagAttributes)
  364. if self._beforeeq_termtag:
  365. self.gotTagEnd(self.tagName)
  366. return 'bodydata'
  367. return self.maybeBodyData()
  368. elif byte == '/':
  369. self._beforeeq_termtag = 1
  370. return
  371. self._parseError("Invalid attribute")
  372. def begin_attrval(self, byte):
  373. self.quotetype = byte
  374. self.attrval = ''
  375. def do_attrval(self, byte):
  376. if byte == self.quotetype:
  377. return 'attrs'
  378. self.attrval += byte
  379. def end_attrval(self):
  380. self.tagAttributes[self.attrname] = self.attrval
  381. self.attrname = self.attrval = ''
  382. def begin_messyattr(self, byte):
  383. self.attrval = byte
  384. def do_messyattr(self, byte):
  385. if byte.isspace():
  386. return 'attrs'
  387. elif byte == '>':
  388. endTag = 0
  389. if self.attrval.endswith('/'):
  390. endTag = 1
  391. self.attrval = self.attrval[:-1]
  392. self.tagAttributes[self.attrname] = self.attrval
  393. self.gotTagStart(self.tagName, self.tagAttributes)
  394. if endTag:
  395. self.gotTagEnd(self.tagName)
  396. return 'bodydata'
  397. return self.maybeBodyData()
  398. else:
  399. self.attrval += byte
  400. def end_messyattr(self):
  401. if self.attrval:
  402. self.tagAttributes[self.attrname] = self.attrval
  403. def begin_afterslash(self, byte):
  404. self._after_slash_closed = 0
  405. def do_afterslash(self, byte):
  406. # this state is only after a self-terminating slash, e.g. <foo/>
  407. if self._after_slash_closed:
  408. self._parseError("Mal-formed")#XXX When does this happen??
  409. if byte != '>':
  410. if self.beExtremelyLenient:
  411. return
  412. else:
  413. self._parseError("No data allowed after '/'")
  414. self._after_slash_closed = 1
  415. self.gotTagStart(self.tagName, self.tagAttributes)
  416. self.gotTagEnd(self.tagName)
  417. # don't need maybeBodyData here because there better not be
  418. # any javascript code after a <script/>... we'll see :(
  419. return 'bodydata'
  420. def begin_bodydata(self, byte):
  421. if self._leadingBodyData:
  422. self.bodydata = self._leadingBodyData
  423. del self._leadingBodyData
  424. else:
  425. self.bodydata = ''
  426. def do_bodydata(self, byte):
  427. if byte == '<':
  428. return 'tagstart'
  429. if byte == '&':
  430. return 'entityref'
  431. self.bodydata += byte
  432. def end_bodydata(self):
  433. self.gotText(self.bodydata)
  434. self.bodydata = ''
  435. def do_waitforendscript(self, byte):
  436. if byte == '<':
  437. return 'waitscriptendtag'
  438. self.bodydata += byte
  439. def begin_waitscriptendtag(self, byte):
  440. self.temptagdata = ''
  441. self.tagName = ''
  442. self.endtag = 0
  443. def do_waitscriptendtag(self, byte):
  444. # 1 enforce / as first byte read
  445. # 2 enforce following bytes to be subset of "script" until
  446. # tagName == "script"
  447. # 2a when that happens, gotText(self.bodydata) and gotTagEnd(self.tagName)
  448. # 3 spaces can happen anywhere, they're ignored
  449. # e.g. < / script >
  450. # 4 anything else causes all data I've read to be moved to the
  451. # bodydata, and switch back to waitforendscript state
  452. # If it turns out this _isn't_ a </script>, we need to
  453. # remember all the data we've been through so we can append it
  454. # to bodydata
  455. self.temptagdata += byte
  456. # 1
  457. if byte == '/':
  458. self.endtag = True
  459. elif not self.endtag:
  460. self.bodydata += "<" + self.temptagdata
  461. return 'waitforendscript'
  462. # 2
  463. elif byte.isalnum() or byte in identChars:
  464. self.tagName += byte
  465. if not 'script'.startswith(self.tagName):
  466. self.bodydata += "<" + self.temptagdata
  467. return 'waitforendscript'
  468. elif self.tagName == 'script':
  469. self.gotText(self.bodydata)
  470. self.gotTagEnd(self.tagName)
  471. return 'waitforgt'
  472. # 3
  473. elif byte.isspace():
  474. return 'waitscriptendtag'
  475. # 4
  476. else:
  477. self.bodydata += "<" + self.temptagdata
  478. return 'waitforendscript'
  479. def begin_entityref(self, byte):
  480. self.erefbuf = ''
  481. self.erefextra = '' # extra bit for lenient mode
  482. def do_entityref(self, byte):
  483. if byte.isspace() or byte == "<":
  484. if self.beExtremelyLenient:
  485. # '&foo' probably was '&amp;foo'
  486. if self.erefbuf and self.erefbuf != "amp":
  487. self.erefextra = self.erefbuf
  488. self.erefbuf = "amp"
  489. if byte == "<":
  490. return "tagstart"
  491. else:
  492. self.erefextra += byte
  493. return 'spacebodydata'
  494. self._parseError("Bad entity reference")
  495. elif byte != ';':
  496. self.erefbuf += byte
  497. else:
  498. return 'bodydata'
  499. def end_entityref(self):
  500. self.gotEntityReference(self.erefbuf)
  501. # hacky support for space after & in entityref in beExtremelyLenient
  502. # state should only happen in that case
  503. def begin_spacebodydata(self, byte):
  504. self.bodydata = self.erefextra
  505. self.erefextra = None
  506. do_spacebodydata = do_bodydata
  507. end_spacebodydata = end_bodydata
  508. # Sorta SAX-ish API
  509. def gotTagStart(self, name, attributes):
  510. '''Encountered an opening tag.
  511. Default behaviour is to print.'''
  512. print('begin', name, attributes)
  513. def gotText(self, data):
  514. '''Encountered text
  515. Default behaviour is to print.'''
  516. print('text:', repr(data))
  517. def gotEntityReference(self, entityRef):
  518. '''Encountered mnemonic entity reference
  519. Default behaviour is to print.'''
  520. print('entityRef: &%s;' % entityRef)
  521. def gotComment(self, comment):
  522. '''Encountered comment.
  523. Default behaviour is to ignore.'''
  524. pass
  525. def gotCData(self, cdata):
  526. '''Encountered CDATA
  527. Default behaviour is to call the gotText method'''
  528. self.gotText(cdata)
  529. def gotDoctype(self, doctype):
  530. """Encountered DOCTYPE
  531. This is really grotty: it basically just gives you everything between
  532. '<!DOCTYPE' and '>' as an argument.
  533. """
  534. print('!DOCTYPE', repr(doctype))
  535. def gotTagEnd(self, name):
  536. '''Encountered closing tag
  537. Default behaviour is to print.'''
  538. print('end', name)