pofile.py 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584
  1. # -*- coding: utf-8 -*-
  2. """
  3. babel.messages.pofile
  4. ~~~~~~~~~~~~~~~~~~~~~
  5. Reading and writing of files in the ``gettext`` PO (portable object)
  6. format.
  7. :copyright: (c) 2013 by the Babel Team.
  8. :license: BSD, see LICENSE for more details.
  9. """
  10. from __future__ import print_function
  11. import os
  12. import re
  13. from babel.messages.catalog import Catalog, Message
  14. from babel.util import wraptext
  15. from babel._compat import text_type
  16. def unescape(string):
  17. r"""Reverse `escape` the given string.
  18. >>> print(unescape('"Say:\\n \\"hello, world!\\"\\n"'))
  19. Say:
  20. "hello, world!"
  21. <BLANKLINE>
  22. :param string: the string to unescape
  23. """
  24. def replace_escapes(match):
  25. m = match.group(1)
  26. if m == 'n':
  27. return '\n'
  28. elif m == 't':
  29. return '\t'
  30. elif m == 'r':
  31. return '\r'
  32. # m is \ or "
  33. return m
  34. return re.compile(r'\\([\\trn"])').sub(replace_escapes, string[1:-1])
  35. def denormalize(string):
  36. r"""Reverse the normalization done by the `normalize` function.
  37. >>> print(denormalize(r'''""
  38. ... "Say:\n"
  39. ... " \"hello, world!\"\n"'''))
  40. Say:
  41. "hello, world!"
  42. <BLANKLINE>
  43. >>> print(denormalize(r'''""
  44. ... "Say:\n"
  45. ... " \"Lorem ipsum dolor sit "
  46. ... "amet, consectetur adipisicing"
  47. ... " elit, \"\n"'''))
  48. Say:
  49. "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
  50. <BLANKLINE>
  51. :param string: the string to denormalize
  52. """
  53. if '\n' in string:
  54. escaped_lines = string.splitlines()
  55. if string.startswith('""'):
  56. escaped_lines = escaped_lines[1:]
  57. lines = map(unescape, escaped_lines)
  58. return ''.join(lines)
  59. else:
  60. return unescape(string)
  61. class _NormalizedString(object):
  62. def __init__(self, *args):
  63. self._strs = []
  64. for arg in args:
  65. self.append(arg)
  66. def append(self, s):
  67. self._strs.append(s.strip())
  68. def denormalize(self):
  69. return ''.join(map(unescape, self._strs))
  70. def __nonzero__(self):
  71. return bool(self._strs)
  72. class PoFileParser(object):
  73. """Support class to read messages from a ``gettext`` PO (portable object) file
  74. and add them to a `Catalog`
  75. See `read_po` for simple cases.
  76. """
  77. _keywords = [
  78. 'msgid',
  79. 'msgstr',
  80. 'msgctxt',
  81. 'msgid_plural',
  82. ]
  83. def __init__(self, catalog, ignore_obsolete=False):
  84. self.catalog = catalog
  85. self.ignore_obsolete = ignore_obsolete
  86. self.counter = 0
  87. self.offset = 0
  88. self._reset_message_state()
  89. def _reset_message_state(self):
  90. self.messages = []
  91. self.translations = []
  92. self.locations = []
  93. self.flags = []
  94. self.user_comments = []
  95. self.auto_comments = []
  96. self.context = None
  97. self.obsolete = False
  98. self.in_msgid = False
  99. self.in_msgstr = False
  100. self.in_msgctxt = False
  101. def _add_message(self):
  102. """
  103. Add a message to the catalog based on the current parser state and
  104. clear the state ready to process the next message.
  105. """
  106. self.translations.sort()
  107. if len(self.messages) > 1:
  108. msgid = tuple([m.denormalize() for m in self.messages])
  109. else:
  110. msgid = self.messages[0].denormalize()
  111. if isinstance(msgid, (list, tuple)):
  112. string = ['' for _ in range(self.catalog.num_plurals)]
  113. for idx, translation in self.translations:
  114. if idx >= self.catalog.num_plurals:
  115. self._invalid_pofile("", self.offset, "msg has more translations than num_plurals of catalog")
  116. continue
  117. string[idx] = translation.denormalize()
  118. string = tuple(string)
  119. else:
  120. string = self.translations[0][1].denormalize()
  121. if self.context:
  122. msgctxt = self.context.denormalize()
  123. else:
  124. msgctxt = None
  125. message = Message(msgid, string, list(self.locations), set(self.flags),
  126. self.auto_comments, self.user_comments, lineno=self.offset + 1,
  127. context=msgctxt)
  128. if self.obsolete:
  129. if not self.ignore_obsolete:
  130. self.catalog.obsolete[msgid] = message
  131. else:
  132. self.catalog[msgid] = message
  133. self.counter += 1
  134. self._reset_message_state()
  135. def _finish_current_message(self):
  136. if self.messages:
  137. self._add_message()
  138. def _process_message_line(self, lineno, line, obsolete=False):
  139. if line.startswith('"'):
  140. self._process_string_continuation_line(line, lineno)
  141. else:
  142. self._process_keyword_line(lineno, line, obsolete)
  143. def _process_keyword_line(self, lineno, line, obsolete=False):
  144. for keyword in self._keywords:
  145. if line.startswith(keyword) and line[len(keyword)] in [' ', '[']:
  146. arg = line[len(keyword):]
  147. break
  148. else:
  149. self._invalid_pofile(line, lineno, "Start of line didn't match any expected keyword.")
  150. return
  151. if keyword in ['msgid', 'msgctxt']:
  152. self._finish_current_message()
  153. self.obsolete = obsolete
  154. # The line that has the msgid is stored as the offset of the msg
  155. # should this be the msgctxt if it has one?
  156. if keyword == 'msgid':
  157. self.offset = lineno
  158. if keyword in ['msgid', 'msgid_plural']:
  159. self.in_msgctxt = False
  160. self.in_msgid = True
  161. self.messages.append(_NormalizedString(arg))
  162. elif keyword == 'msgstr':
  163. self.in_msgid = False
  164. self.in_msgstr = True
  165. if arg.startswith('['):
  166. idx, msg = arg[1:].split(']', 1)
  167. self.translations.append([int(idx), _NormalizedString(msg)])
  168. else:
  169. self.translations.append([0, _NormalizedString(arg)])
  170. elif keyword == 'msgctxt':
  171. self.in_msgctxt = True
  172. self.context = _NormalizedString(arg)
  173. def _process_string_continuation_line(self, line, lineno):
  174. if self.in_msgid:
  175. s = self.messages[-1]
  176. elif self.in_msgstr:
  177. s = self.translations[-1][1]
  178. elif self.in_msgctxt:
  179. s = self.context
  180. else:
  181. self._invalid_pofile(line, lineno, "Got line starting with \" but not in msgid, msgstr or msgctxt")
  182. return
  183. s.append(line)
  184. def _process_comment(self, line):
  185. self._finish_current_message()
  186. if line[1:].startswith(':'):
  187. for location in line[2:].lstrip().split():
  188. pos = location.rfind(':')
  189. if pos >= 0:
  190. try:
  191. lineno = int(location[pos + 1:])
  192. except ValueError:
  193. continue
  194. self.locations.append((location[:pos], lineno))
  195. else:
  196. self.locations.append((location, None))
  197. elif line[1:].startswith(','):
  198. for flag in line[2:].lstrip().split(','):
  199. self.flags.append(flag.strip())
  200. elif line[1:].startswith('.'):
  201. # These are called auto-comments
  202. comment = line[2:].strip()
  203. if comment: # Just check that we're not adding empty comments
  204. self.auto_comments.append(comment)
  205. else:
  206. # These are called user comments
  207. self.user_comments.append(line[1:].strip())
  208. def parse(self, fileobj):
  209. """
  210. Reads from the file-like object `fileobj` and adds any po file
  211. units found in it to the `Catalog` supplied to the constructor.
  212. """
  213. for lineno, line in enumerate(fileobj):
  214. line = line.strip()
  215. if not isinstance(line, text_type):
  216. line = line.decode(self.catalog.charset)
  217. if not line:
  218. continue
  219. if line.startswith('#'):
  220. if line[1:].startswith('~'):
  221. self._process_message_line(lineno, line[2:].lstrip(), obsolete=True)
  222. else:
  223. self._process_comment(line)
  224. else:
  225. self._process_message_line(lineno, line)
  226. self._finish_current_message()
  227. # No actual messages found, but there was some info in comments, from which
  228. # we'll construct an empty header message
  229. if not self.counter and (self.flags or self.user_comments or self.auto_comments):
  230. self.messages.append(_NormalizedString(u'""'))
  231. self.translations.append([0, _NormalizedString(u'""')])
  232. self._add_message()
  233. def _invalid_pofile(self, line, lineno, msg):
  234. print("WARNING:", msg)
  235. print("WARNING: Problem on line {0}: {1}".format(lineno + 1, line))
  236. def read_po(fileobj, locale=None, domain=None, ignore_obsolete=False, charset=None):
  237. """Read messages from a ``gettext`` PO (portable object) file from the given
  238. file-like object and return a `Catalog`.
  239. >>> from datetime import datetime
  240. >>> from babel._compat import StringIO
  241. >>> buf = StringIO('''
  242. ... #: main.py:1
  243. ... #, fuzzy, python-format
  244. ... msgid "foo %(name)s"
  245. ... msgstr "quux %(name)s"
  246. ...
  247. ... # A user comment
  248. ... #. An auto comment
  249. ... #: main.py:3
  250. ... msgid "bar"
  251. ... msgid_plural "baz"
  252. ... msgstr[0] "bar"
  253. ... msgstr[1] "baaz"
  254. ... ''')
  255. >>> catalog = read_po(buf)
  256. >>> catalog.revision_date = datetime(2007, 4, 1)
  257. >>> for message in catalog:
  258. ... if message.id:
  259. ... print((message.id, message.string))
  260. ... print(' ', (message.locations, sorted(list(message.flags))))
  261. ... print(' ', (message.user_comments, message.auto_comments))
  262. (u'foo %(name)s', u'quux %(name)s')
  263. ([(u'main.py', 1)], [u'fuzzy', u'python-format'])
  264. ([], [])
  265. ((u'bar', u'baz'), (u'bar', u'baaz'))
  266. ([(u'main.py', 3)], [])
  267. ([u'A user comment'], [u'An auto comment'])
  268. .. versionadded:: 1.0
  269. Added support for explicit charset argument.
  270. :param fileobj: the file-like object to read the PO file from
  271. :param locale: the locale identifier or `Locale` object, or `None`
  272. if the catalog is not bound to a locale (which basically
  273. means it's a template)
  274. :param domain: the message domain
  275. :param ignore_obsolete: whether to ignore obsolete messages in the input
  276. :param charset: the character set of the catalog.
  277. """
  278. catalog = Catalog(locale=locale, domain=domain, charset=charset)
  279. parser = PoFileParser(catalog, ignore_obsolete)
  280. parser.parse(fileobj)
  281. return catalog
# Separator pattern used by `normalize` to cut a long line into chunks at
# which it may be wrapped.  The alternatives are enclosed in one capturing
# group, so ``WORD_SEP.split`` returns the separators along with the text
# between them.
WORD_SEP = re.compile('('
                      r'\s+|'                                  # any whitespace
                      r'[^\s\w]*\w+[a-zA-Z]-(?=\w+[a-zA-Z])|'  # hyphenated words
                      r'(?<=[\w\!\"\'\&\.\,\?])-{2,}(?=\w)'    # em-dash
                      ')')
  287. def escape(string):
  288. r"""Escape the given string so that it can be included in double-quoted
  289. strings in ``PO`` files.
  290. >>> escape('''Say:
  291. ... "hello, world!"
  292. ... ''')
  293. '"Say:\\n \\"hello, world!\\"\\n"'
  294. :param string: the string to escape
  295. """
  296. return '"%s"' % string.replace('\\', '\\\\') \
  297. .replace('\t', '\\t') \
  298. .replace('\r', '\\r') \
  299. .replace('\n', '\\n') \
  300. .replace('\"', '\\"')
  301. def normalize(string, prefix='', width=76):
  302. r"""Convert a string into a format that is appropriate for .po files.
  303. >>> print(normalize('''Say:
  304. ... "hello, world!"
  305. ... ''', width=None))
  306. ""
  307. "Say:\n"
  308. " \"hello, world!\"\n"
  309. >>> print(normalize('''Say:
  310. ... "Lorem ipsum dolor sit amet, consectetur adipisicing elit, "
  311. ... ''', width=32))
  312. ""
  313. "Say:\n"
  314. " \"Lorem ipsum dolor sit "
  315. "amet, consectetur adipisicing"
  316. " elit, \"\n"
  317. :param string: the string to normalize
  318. :param prefix: a string that should be prepended to every line
  319. :param width: the maximum line width; use `None`, 0, or a negative number
  320. to completely disable line wrapping
  321. """
  322. if width and width > 0:
  323. prefixlen = len(prefix)
  324. lines = []
  325. for line in string.splitlines(True):
  326. if len(escape(line)) + prefixlen > width:
  327. chunks = WORD_SEP.split(line)
  328. chunks.reverse()
  329. while chunks:
  330. buf = []
  331. size = 2
  332. while chunks:
  333. l = len(escape(chunks[-1])) - 2 + prefixlen
  334. if size + l < width:
  335. buf.append(chunks.pop())
  336. size += l
  337. else:
  338. if not buf:
  339. # handle long chunks by putting them on a
  340. # separate line
  341. buf.append(chunks.pop())
  342. break
  343. lines.append(u''.join(buf))
  344. else:
  345. lines.append(line)
  346. else:
  347. lines = string.splitlines(True)
  348. if len(lines) <= 1:
  349. return escape(string)
  350. # Remove empty trailing line
  351. if lines and not lines[-1]:
  352. del lines[-1]
  353. lines[-1] += '\n'
  354. return u'""\n' + u'\n'.join([(prefix + escape(line)) for line in lines])
  355. def write_po(fileobj, catalog, width=76, no_location=False, omit_header=False,
  356. sort_output=False, sort_by_file=False, ignore_obsolete=False,
  357. include_previous=False, include_lineno=True):
  358. r"""Write a ``gettext`` PO (portable object) template file for a given
  359. message catalog to the provided file-like object.
  360. >>> catalog = Catalog()
  361. >>> catalog.add(u'foo %(name)s', locations=[('main.py', 1)],
  362. ... flags=('fuzzy',))
  363. <Message...>
  364. >>> catalog.add((u'bar', u'baz'), locations=[('main.py', 3)])
  365. <Message...>
  366. >>> from babel._compat import BytesIO
  367. >>> buf = BytesIO()
  368. >>> write_po(buf, catalog, omit_header=True)
  369. >>> print(buf.getvalue().decode("utf8"))
  370. #: main.py:1
  371. #, fuzzy, python-format
  372. msgid "foo %(name)s"
  373. msgstr ""
  374. <BLANKLINE>
  375. #: main.py:3
  376. msgid "bar"
  377. msgid_plural "baz"
  378. msgstr[0] ""
  379. msgstr[1] ""
  380. <BLANKLINE>
  381. <BLANKLINE>
  382. :param fileobj: the file-like object to write to
  383. :param catalog: the `Catalog` instance
  384. :param width: the maximum line width for the generated output; use `None`,
  385. 0, or a negative number to completely disable line wrapping
  386. :param no_location: do not emit a location comment for every message
  387. :param omit_header: do not include the ``msgid ""`` entry at the top of the
  388. output
  389. :param sort_output: whether to sort the messages in the output by msgid
  390. :param sort_by_file: whether to sort the messages in the output by their
  391. locations
  392. :param ignore_obsolete: whether to ignore obsolete messages and not include
  393. them in the output; by default they are included as
  394. comments
  395. :param include_previous: include the old msgid as a comment when
  396. updating the catalog
  397. :param include_lineno: include line number in the location comment
  398. """
  399. def _normalize(key, prefix=''):
  400. return normalize(key, prefix=prefix, width=width)
  401. def _write(text):
  402. if isinstance(text, text_type):
  403. text = text.encode(catalog.charset, 'backslashreplace')
  404. fileobj.write(text)
  405. def _write_comment(comment, prefix=''):
  406. # xgettext always wraps comments even if --no-wrap is passed;
  407. # provide the same behaviour
  408. if width and width > 0:
  409. _width = width
  410. else:
  411. _width = 76
  412. for line in wraptext(comment, _width):
  413. _write('#%s %s\n' % (prefix, line.strip()))
  414. def _write_message(message, prefix=''):
  415. if isinstance(message.id, (list, tuple)):
  416. if message.context:
  417. _write('%smsgctxt %s\n' % (prefix,
  418. _normalize(message.context, prefix)))
  419. _write('%smsgid %s\n' % (prefix, _normalize(message.id[0], prefix)))
  420. _write('%smsgid_plural %s\n' % (
  421. prefix, _normalize(message.id[1], prefix)
  422. ))
  423. for idx in range(catalog.num_plurals):
  424. try:
  425. string = message.string[idx]
  426. except IndexError:
  427. string = ''
  428. _write('%smsgstr[%d] %s\n' % (
  429. prefix, idx, _normalize(string, prefix)
  430. ))
  431. else:
  432. if message.context:
  433. _write('%smsgctxt %s\n' % (prefix,
  434. _normalize(message.context, prefix)))
  435. _write('%smsgid %s\n' % (prefix, _normalize(message.id, prefix)))
  436. _write('%smsgstr %s\n' % (
  437. prefix, _normalize(message.string or '', prefix)
  438. ))
  439. sort_by = None
  440. if sort_output:
  441. sort_by = "message"
  442. elif sort_by_file:
  443. sort_by = "location"
  444. for message in _sort_messages(catalog, sort_by=sort_by):
  445. if not message.id: # This is the header "message"
  446. if omit_header:
  447. continue
  448. comment_header = catalog.header_comment
  449. if width and width > 0:
  450. lines = []
  451. for line in comment_header.splitlines():
  452. lines += wraptext(line, width=width,
  453. subsequent_indent='# ')
  454. comment_header = u'\n'.join(lines)
  455. _write(comment_header + u'\n')
  456. for comment in message.user_comments:
  457. _write_comment(comment)
  458. for comment in message.auto_comments:
  459. _write_comment(comment, prefix='.')
  460. if not no_location:
  461. locs = []
  462. for filename, lineno in sorted(message.locations):
  463. if lineno and include_lineno:
  464. locs.append(u'%s:%d' % (filename.replace(os.sep, '/'), lineno))
  465. else:
  466. locs.append(u'%s' % filename.replace(os.sep, '/'))
  467. _write_comment(' '.join(locs), prefix=':')
  468. if message.flags:
  469. _write('#%s\n' % ', '.join([''] + sorted(message.flags)))
  470. if message.previous_id and include_previous:
  471. _write_comment('msgid %s' % _normalize(message.previous_id[0]),
  472. prefix='|')
  473. if len(message.previous_id) > 1:
  474. _write_comment('msgid_plural %s' % _normalize(
  475. message.previous_id[1]
  476. ), prefix='|')
  477. _write_message(message)
  478. _write('\n')
  479. if not ignore_obsolete:
  480. for message in _sort_messages(
  481. catalog.obsolete.values(),
  482. sort_by=sort_by
  483. ):
  484. for comment in message.user_comments:
  485. _write_comment(comment)
  486. _write_message(message, prefix='#~ ')
  487. _write('\n')
  488. def _sort_messages(messages, sort_by):
  489. """
  490. Sort the given message iterable by the given criteria.
  491. Always returns a list.
  492. :param messages: An iterable of Messages.
  493. :param sort_by: Sort by which criteria? Options are `message` and `location`.
  494. :return: list[Message]
  495. """
  496. messages = list(messages)
  497. if sort_by == "message":
  498. messages.sort()
  499. elif sort_by == "location":
  500. messages.sort(key=lambda m: m.locations)
  501. return messages