# -*- coding: utf-8 -*-
"""
    pygments.lexers.textfmts
    ~~~~~~~~~~~~~~~~~~~~~~~~

    Lexers for various text formats.

    :copyright: Copyright 2006-2017 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re

from pygments.lexer import RegexLexer, bygroups
from pygments.token import Text, Comment, Operator, Keyword, Name, String, \
    Number, Generic, Literal
from pygments.util import ClassNotFound

__all__ = ['IrcLogsLexer', 'TodotxtLexer', 'HttpLexer', 'GettextLexer']


class IrcLogsLexer(RegexLexer):
    """
    Lexer for IRC logs in *irssi*, *xchat* or *weechat* style.
    """

    name = 'IRC logs'
    aliases = ['irc']
    filenames = ['*.weechatlog']
    mimetypes = ['text/x-irclog']

    flags = re.VERBOSE | re.MULTILINE
    timestamp = r"""
        (
          # irssi / xchat and others
          (?: \[|\()?                  # Opening bracket or paren for the timestamp
            (?:                        # Timestamp
                (?: (?:\d{1,4} [-/])*  # Date as - or /-separated groups of digits
                    (?:\d{1,4})
                 [T ])?                # Date/time separator: T or space
                (?: \d?\d [:.])*       # Time as :/.-separated groups of 1 or 2 digits
                    (?: \d?\d)
            )
          (?: \]|\))?\s+               # Closing bracket or paren for the timestamp
        |
          # weechat
          \d{4}\s\w{3}\s\d{2}\s        # Date
          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace
        |
          # xchat
          \w{3}\s\d{2}\s               # Date
          \d{2}:\d{2}:\d{2}\s+         # Time + Whitespace
        )?
    """
    tokens = {
        'root': [
            # log start/end
            (r'^\*\*\*\*(.*)\*\*\*\*$', Comment),
            # hack
            ("^" + timestamp + r'(\s*<[^>]*>\s*)$',
             bygroups(Comment.Preproc, Name.Tag)),
            # normal msgs
            ("^" + timestamp + r"""
                (\s*<.*?>\s*)          # Nick """,
             bygroups(Comment.Preproc, Name.Tag), 'msg'),
            # /me msgs
            ("^" + timestamp + r"""
                (\s*[*]\s+)            # Star
                (\S+\s+.*?\n)          # Nick + rest of message """,
             bygroups(Comment.Preproc, Keyword, Generic.Inserted)),
            # join/part msgs
            ("^" + timestamp + r"""
                (\s*(?:\*{3}|<?-[!@=P]?->?)\s*)  # Star(s) or symbols
                (\S+\s+)                         # Nick + Space
                (.*?\n)                          # Rest of message """,
             bygroups(Comment.Preproc, Keyword, String, Comment)),
            (r"^.*?\n", Text),
        ],
        'msg': [
            (r"\S+:(?!//)", Name.Attribute),  # Prefix
            (r".*\n", Text, '#pop'),
        ],
    }
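

# A minimal usage sketch: runs IrcLogsLexer over a made-up irssi-style line.
# The helper name and sample text are assumptions for illustration only.
def _demo_irc_logs():
    sample = '[12:34] <alice> pygments: the build is green\n'
    for index, token, value in IrcLogsLexer().get_tokens_unprocessed(sample):
        print(index, token, repr(value))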


class GettextLexer(RegexLexer):
    """
    Lexer for Gettext catalog files.

    .. versionadded:: 0.9
    """
    name = 'Gettext Catalog'
    aliases = ['pot', 'po']
    filenames = ['*.pot', '*.po']
    mimetypes = ['application/x-gettext', 'text/x-gettext', 'text/gettext']

    tokens = {
        'root': [
            (r'^#,\s.*?$', Keyword.Type),
            (r'^#:\s.*?$', Keyword.Declaration),
            # (r'^#$', Comment),
            (r'^(#|#\.\s|#\|\s|#~\s|#\s).*$', Comment.Single),
            (r'^(")([A-Za-z-]+:)(.*")$',
             bygroups(String, Name.Property, String)),
            (r'^".*"$', String),
            (r'^(msgid|msgid_plural|msgstr|msgctxt)(\s+)(".*")$',
             bygroups(Name.Variable, Text, String)),
            (r'^(msgstr\[)(\d)(\])(\s+)(".*")$',
             bygroups(Name.Variable, Number.Integer, Name.Variable, Text, String)),
        ]
    }
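

# A minimal usage sketch: the catalog entry below is a made-up example of the
# kinds of lines the rules above tokenize.
def _demo_gettext():
    sample = (
        '#: hello.c:42\n'
        '#, c-format\n'
        'msgid "Hello, world!"\n'
        'msgstr "Hallo, Welt!"\n'
    )
    for index, token, value in GettextLexer().get_tokens_unprocessed(sample):
        print(index, token, repr(value))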


class HttpLexer(RegexLexer):
    """
    Lexer for HTTP sessions.

    .. versionadded:: 1.5
    """

    name = 'HTTP'
    aliases = ['http']

    flags = re.DOTALL

    def get_tokens_unprocessed(self, text, stack=('root',)):
        """Reset the content-type state."""
        self.content_type = None
        return RegexLexer.get_tokens_unprocessed(self, text, stack)
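
    # For a header line such as "Content-Type: application/json; charset=utf-8",
    # the callback below records the bare mimetype ("application/json") so that
    # content_callback() can later pick a matching sub-lexer for the body.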
    def header_callback(self, match):
        if match.group(1).lower() == 'content-type':
            content_type = match.group(5).strip()
            if ';' in content_type:
                content_type = content_type[:content_type.find(';')].strip()
            self.content_type = content_type
        yield match.start(1), Name.Attribute, match.group(1)
        yield match.start(2), Text, match.group(2)
        yield match.start(3), Operator, match.group(3)
        yield match.start(4), Text, match.group(4)
        yield match.start(5), Literal, match.group(5)
        yield match.start(6), Text, match.group(6)

    def continuous_header_callback(self, match):
        yield match.start(1), Text, match.group(1)
        yield match.start(2), Literal, match.group(2)
        yield match.start(3), Text, match.group(3)

    def content_callback(self, match):
        content_type = getattr(self, 'content_type', None)
        content = match.group()
        offset = match.start()
        if content_type:
            from pygments.lexers import get_lexer_for_mimetype
            possible_lexer_mimetypes = [content_type]
            if '+' in content_type:
                # application/calendar+xml can be treated as application/xml
                # if there's not a better match.
                general_type = re.sub(r'^(.*)/.*\+(.*)$', r'\1/\2',
                                      content_type)
                possible_lexer_mimetypes.append(general_type)

            for i in possible_lexer_mimetypes:
                try:
                    lexer = get_lexer_for_mimetype(i)
                except ClassNotFound:
                    pass
                else:
                    for idx, token, value in lexer.get_tokens_unprocessed(content):
                        yield offset + idx, token, value
                    return

        yield offset, Text, content

    tokens = {
        'root': [
            (r'(GET|POST|PUT|DELETE|HEAD|OPTIONS|TRACE|PATCH)( +)([^ ]+)( +)'
             r'(HTTP)(/)(1\.[01])(\r?\n|\Z)',
             bygroups(Name.Function, Text, Name.Namespace, Text,
                      Keyword.Reserved, Operator, Number, Text),
             'headers'),
            (r'(HTTP)(/)(1\.[01])( +)(\d{3})( +)([^\r\n]+)(\r?\n|\Z)',
             bygroups(Keyword.Reserved, Operator, Number, Text, Number,
                      Text, Name.Exception, Text),
             'headers'),
        ],
        'headers': [
            (r'([^\s:]+)( *)(:)( *)([^\r\n]+)(\r?\n|\Z)', header_callback),
            (r'([\t ]+)([^\r\n]+)(\r?\n|\Z)', continuous_header_callback),
            (r'\r?\n', Text, 'content')
        ],
        'content': [
            (r'.+', content_callback)
        ]
    }

    def analyse_text(text):
        return text.startswith(('GET /', 'POST /', 'PUT /', 'DELETE /',
                                'HEAD /', 'OPTIONS /', 'TRACE /', 'PATCH /'))
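

# A minimal usage sketch: the request below is a made-up example whose JSON
# body exercises the Content-Type based delegation in content_callback().
def _demo_http():
    sample = (
        'POST /api/things HTTP/1.1\r\n'
        'Host: example.com\r\n'
        'Content-Type: application/json\r\n'
        '\r\n'
        '{"name": "widget"}'
    )
    for index, token, value in HttpLexer().get_tokens_unprocessed(sample):
        print(index, token, repr(value))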


class TodotxtLexer(RegexLexer):
    """
    Lexer for `Todo.txt <http://todotxt.com/>`_ todo list format.

    .. versionadded:: 2.0
    """

    name = 'Todotxt'
    aliases = ['todotxt']
    # *.todotxt is not a standard extension for Todo.txt files; including it
    # makes testing easier, and also makes autodetecting file type easier.
    filenames = ['todo.txt', '*.todotxt']
    mimetypes = ['text/x-todo']

    # Aliases mapping Todo.txt concepts to standard token types
    CompleteTaskText = Operator  # Chosen to de-emphasize complete tasks
    IncompleteTaskText = Text    # Incomplete tasks should look like plain text

    # Priority should have most emphasis to indicate importance of tasks
    Priority = Generic.Heading
    # Dates should have next most emphasis because time is important
    Date = Generic.Subheading

    # Project and context should have equal weight, and be in different colors
    Project = Generic.Error
    Context = String

    # If tag functionality is added, it should have the same weight as Project
    # and Context, and a different color. Generic.Traceback would work well.

    # Regex patterns for building up rules; dates, priorities, projects, and
    # contexts are all atomic
    # TODO: Make date regex more ISO 8601 compliant
    date_regex = r'\d{4,}-\d{2}-\d{2}'
    priority_regex = r'\([A-Z]\)'
    project_regex = r'\+\S+'
    context_regex = r'@\S+'

    # Compound regex expressions
    complete_one_date_regex = r'(x )(' + date_regex + r')'
    complete_two_date_regex = (complete_one_date_regex + r'( )(' +
                               date_regex + r')')
    priority_date_regex = r'(' + priority_regex + r')( )(' + date_regex + r')'
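
    # Illustrative task lines the compound regexes accept (made-up examples):
    #   complete_two_date_regex:  "x 2017-01-02 2017-01-01 Answer mail @home"
    #   complete_one_date_regex:  "x 2017-01-02 Water plants +garden"
    #   priority_date_regex:      "(A) 2017-01-01 Call the bank @phone"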

    tokens = {
        # Should parse starting at beginning of line; each line is a task
        'root': [
            # Complete task entry points: two total:
            # 1. Complete task with two dates
            (complete_two_date_regex, bygroups(CompleteTaskText, Date,
                                               CompleteTaskText, Date),
             'complete'),
            # 2. Complete task with one date
            (complete_one_date_regex, bygroups(CompleteTaskText, Date),
             'complete'),

            # Incomplete task entry points: six total:
            # 1. Priority plus date
            (priority_date_regex, bygroups(Priority, IncompleteTaskText, Date),
             'incomplete'),
            # 2. Priority only
            (priority_regex, Priority, 'incomplete'),
            # 3. Leading date
            (date_regex, Date, 'incomplete'),
            # 4. Leading context
            (context_regex, Context, 'incomplete'),
            # 5. Leading project
            (project_regex, Project, 'incomplete'),
            # 6. Non-whitespace catch-all
            (r'\S+', IncompleteTaskText, 'incomplete'),
        ],

        # Parse a complete task
        'complete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', CompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', CompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', CompleteTaskText),
        ],

        # Parse an incomplete task
        'incomplete': [
            # Newline indicates end of task, should return to root
            (r'\s*\n', IncompleteTaskText, '#pop'),
            # Tokenize contexts and projects
            (context_regex, Context),
            (project_regex, Project),
            # Tokenize non-whitespace text
            (r'\S+', IncompleteTaskText),
            # Tokenize whitespace not containing a newline
            (r'\s+', IncompleteTaskText),
        ],
    }
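

# A minimal usage sketch: the task lines are made-up examples covering one
# complete and one incomplete entry.
def _demo_todotxt():
    sample = (
        'x 2017-01-02 2017-01-01 Answer mail +Inbox @home\n'
        '(A) 2017-01-03 Call the bank @phone\n'
    )
    for index, token, value in TodotxtLexer().get_tokens_unprocessed(sample):
        print(index, token, repr(value))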