parsehtml.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197
  1. __all__ = ['findHTMLMeta', 'MetaNotFound']
  2. from HTMLParser import HTMLParser, HTMLParseError
  3. import htmlentitydefs
  4. import re
  5. from openid.yadis.constants import YADIS_HEADER_NAME
  6. # Size of the chunks to search at a time (also the amount that gets
  7. # read at a time)
  8. CHUNK_SIZE = 1024 * 16 # 16 KB
  9. class ParseDone(Exception):
  10. """Exception to hold the URI that was located when the parse is
  11. finished. If the parse finishes without finding the URI, set it to
  12. None."""
  13. class MetaNotFound(Exception):
  14. """Exception to hold the content of the page if we did not find
  15. the appropriate <meta> tag"""
# Flags for the entity pattern: case-insensitive matching, Unicode-aware
# character classes, and verbose mode so that the whitespace used to lay
# the pattern out below is not part of what it matches.
re_flags = re.IGNORECASE | re.UNICODE | re.VERBOSE

# Matches one reference of the form "&...;".  Exactly one of the named
# groups takes part in a match:
#   hex  - the digits of a "&#x...;" reference
#   dec  - the digits of a "&#...;" reference
#   word - the name of a "&name;" reference
ent_pat = r'''
&
(?: \#x (?P<hex> [a-f0-9]+ )
| \# (?P<dec> \d+ )
| (?P<word> \w+ )
)
;'''

# Compiled once at import time.
ent_re = re.compile(ent_pat, re_flags)
  25. def substituteMO(mo):
  26. if mo.lastgroup == 'hex':
  27. codepoint = int(mo.group('hex'), 16)
  28. elif mo.lastgroup == 'dec':
  29. codepoint = int(mo.group('dec'))
  30. else:
  31. assert mo.lastgroup == 'word'
  32. codepoint = htmlentitydefs.name2codepoint.get(mo.group('word'))
  33. if codepoint is None:
  34. return mo.group()
  35. else:
  36. return unichr(codepoint)
  37. def substituteEntities(s):
  38. return ent_re.sub(substituteMO, s)
  39. class YadisHTMLParser(HTMLParser):
  40. """Parser that finds a meta http-equiv tag in the head of a html
  41. document.
  42. When feeding in data, if the tag is matched or it will never be
  43. found, the parser will raise ParseDone with the uri as the first
  44. attribute.
  45. Parsing state diagram
  46. =====================
  47. Any unlisted input does not affect the state::
  48. 1, 2, 5 8
  49. +--------------------------+ +-+
  50. | | | |
  51. 4 | 3 1, 2, 5, 7 v | v
  52. TOP -> HTML -> HEAD ----------> TERMINATED
  53. | | ^ | ^ ^
  54. | | 3 | | | |
  55. | +------------+ +-> FOUND ------+ |
  56. | 6 8 |
  57. | 1, 2 |
  58. +------------------------------------+
  59. 1. any of </body>, </html>, </head> -> TERMINATE
  60. 2. <body> -> TERMINATE
  61. 3. <head> -> HEAD
  62. 4. <html> -> HTML
  63. 5. <html> -> TERMINATE
  64. 6. <meta http-equiv='X-XRDS-Location'> -> FOUND
  65. 7. <head> -> TERMINATE
  66. 8. Any input -> TERMINATE
  67. """
  68. TOP = 0
  69. HTML = 1
  70. HEAD = 2
  71. FOUND = 3
  72. TERMINATED = 4
  73. def __init__(self):
  74. HTMLParser.__init__(self)
  75. self.phase = self.TOP
  76. def _terminate(self):
  77. self.phase = self.TERMINATED
  78. raise ParseDone(None)
  79. def handle_endtag(self, tag):
  80. # If we ever see an end of head, body, or html, bail out right away.
  81. # [1]
  82. if tag in ['head', 'body', 'html']:
  83. self._terminate()
  84. def handle_starttag(self, tag, attrs):
  85. # if we ever see a start body tag, bail out right away, since
  86. # we want to prevent the meta tag from appearing in the body
  87. # [2]
  88. if tag=='body':
  89. self._terminate()
  90. if self.phase == self.TOP:
  91. # At the top level, allow a html tag or a head tag to move
  92. # to the head or html phase
  93. if tag == 'head':
  94. # [3]
  95. self.phase = self.HEAD
  96. elif tag == 'html':
  97. # [4]
  98. self.phase = self.HTML
  99. elif self.phase == self.HTML:
  100. # if we are in the html tag, allow a head tag to move to
  101. # the HEAD phase. If we get another html tag, then bail
  102. # out
  103. if tag == 'head':
  104. # [3]
  105. self.phase = self.HEAD
  106. elif tag == 'html':
  107. # [5]
  108. self._terminate()
  109. elif self.phase == self.HEAD:
  110. # If we are in the head phase, look for the appropriate
  111. # meta tag. If we get a head or body tag, bail out.
  112. if tag == 'meta':
  113. attrs_d = dict(attrs)
  114. http_equiv = attrs_d.get('http-equiv', '').lower()
  115. if http_equiv == YADIS_HEADER_NAME.lower():
  116. raw_attr = attrs_d.get('content')
  117. yadis_loc = substituteEntities(raw_attr)
  118. # [6]
  119. self.phase = self.FOUND
  120. raise ParseDone(yadis_loc)
  121. elif tag in ['head', 'html']:
  122. # [5], [7]
  123. self._terminate()
  124. def feed(self, chars):
  125. # [8]
  126. if self.phase in [self.TERMINATED, self.FOUND]:
  127. self._terminate()
  128. return HTMLParser.feed(self, chars)
  129. def findHTMLMeta(stream):
  130. """Look for a meta http-equiv tag with the YADIS header name.
  131. @param stream: Source of the html text
  132. @type stream: Object that implements a read() method that works
  133. like file.read
  134. @return: The URI from which to fetch the XRDS document
  135. @rtype: str
  136. @raises MetaNotFound: raised with the content that was
  137. searched as the first parameter.
  138. """
  139. parser = YadisHTMLParser()
  140. chunks = []
  141. while 1:
  142. chunk = stream.read(CHUNK_SIZE)
  143. if not chunk:
  144. # End of file
  145. break
  146. chunks.append(chunk)
  147. try:
  148. parser.feed(chunk)
  149. except HTMLParseError, why:
  150. # HTML parse error, so bail
  151. chunks.append(stream.read())
  152. break
  153. except ParseDone, why:
  154. uri = why[0]
  155. if uri is None:
  156. # Parse finished, but we may need the rest of the file
  157. chunks.append(stream.read())
  158. break
  159. else:
  160. return uri
  161. content = ''.join(chunks)
  162. raise MetaNotFound(content)