# pdfpage.py
  1. #!/usr/bin/env python
  2. import sys
  3. from psparser import LIT
  4. from pdftypes import PDFObjectNotFound
  5. from pdftypes import resolve1
  6. from pdftypes import int_value, list_value, dict_value
  7. from pdfparser import PDFParser
  8. from pdfdocument import PDFDocument
  9. from pdfdocument import PDFEncryptionError
  10. from pdfdocument import PDFTextExtractionNotAllowed
# Predefined PDF name literals for the /Type values that mark page-tree
# nodes: 'Page' for a leaf page object, 'Pages' for an intermediate node.
# They are compared with `is` when walking the page tree, so LIT() is
# presumably returning interned objects -- confirm in psparser.
LITERAL_PAGE = LIT('Page')
LITERAL_PAGES = LIT('Pages')
  14. ## PDFPage
  15. ##
  16. class PDFPage(object):
  17. """An object that holds the information about a page.
  18. A PDFPage object is merely a convenience class that has a set
  19. of keys and values, which describe the properties of a page
  20. and point to its contents.
  21. Attributes:
  22. doc: a PDFDocument object.
  23. pageid: any Python object that can uniquely identify the page.
  24. attrs: a dictionary of page attributes.
  25. contents: a list of PDFStream objects that represents the page content.
  26. lastmod: the last modified time of the page.
  27. resources: a list of resources used by the page.
  28. mediabox: the physical size of the page.
  29. cropbox: the crop rectangle of the page.
  30. rotate: the page rotation (in degree).
  31. annots: the page annotations.
  32. beads: a chain that represents natural reading order.
  33. """
  34. def __init__(self, doc, pageid, attrs):
  35. """Initialize a page object.
  36. doc: a PDFDocument object.
  37. pageid: any Python object that can uniquely identify the page.
  38. attrs: a dictionary of page attributes.
  39. """
  40. self.doc = doc
  41. self.pageid = pageid
  42. self.attrs = dict_value(attrs)
  43. self.lastmod = resolve1(self.attrs.get('LastModified'))
  44. self.resources = resolve1(self.attrs['Resources'])
  45. self.mediabox = resolve1(self.attrs['MediaBox'])
  46. if 'CropBox' in self.attrs:
  47. self.cropbox = resolve1(self.attrs['CropBox'])
  48. else:
  49. self.cropbox = self.mediabox
  50. self.rotate = (int_value(self.attrs.get('Rotate', 0))+360) % 360
  51. self.annots = self.attrs.get('Annots')
  52. self.beads = self.attrs.get('B')
  53. if 'Contents' in self.attrs:
  54. contents = resolve1(self.attrs['Contents'])
  55. else:
  56. contents = []
  57. if not isinstance(contents, list):
  58. contents = [contents]
  59. self.contents = contents
  60. return
  61. def __repr__(self):
  62. return '<PDFPage: Resources=%r, MediaBox=%r>' % (self.resources, self.mediabox)
  63. INHERITABLE_ATTRS = set(['Resources', 'MediaBox', 'CropBox', 'Rotate'])
  64. @classmethod
  65. def create_pages(klass, document, debug=0):
  66. def search(obj, parent):
  67. if isinstance(obj, int):
  68. objid = obj
  69. tree = dict_value(document.getobj(objid)).copy()
  70. else:
  71. objid = obj.objid
  72. tree = dict_value(obj).copy()
  73. for (k, v) in parent.iteritems():
  74. if k in klass.INHERITABLE_ATTRS and k not in tree:
  75. tree[k] = v
  76. if tree.get('Type') is LITERAL_PAGES and 'Kids' in tree:
  77. if 1 <= debug:
  78. print >>sys.stderr, 'Pages: Kids=%r' % tree['Kids']
  79. for c in list_value(tree['Kids']):
  80. for x in search(c, tree):
  81. yield x
  82. elif tree.get('Type') is LITERAL_PAGE:
  83. if 1 <= debug:
  84. print >>sys.stderr, 'Page: %r' % tree
  85. yield (objid, tree)
  86. pages = False
  87. if 'Pages' in document.catalog:
  88. for (objid, tree) in search(document.catalog['Pages'], document.catalog):
  89. yield klass(document, objid, tree)
  90. pages = True
  91. if not pages:
  92. # fallback when /Pages is missing.
  93. for xref in document.xrefs:
  94. for objid in xref.get_objids():
  95. try:
  96. obj = document.getobj(objid)
  97. if isinstance(obj, dict) and obj.get('Type') is LITERAL_PAGE:
  98. yield klass(document, objid, obj)
  99. except PDFObjectNotFound:
  100. pass
  101. return
  102. @classmethod
  103. def get_pages(klass, fp,
  104. pagenos=None, maxpages=0, password='',
  105. caching=True, check_extractable=True):
  106. # Create a PDF parser object associated with the file object.
  107. parser = PDFParser(fp)
  108. # Create a PDF document object that stores the document structure.
  109. doc = PDFDocument(parser, password=password, caching=caching)
  110. # Check if the document allows text extraction. If not, abort.
  111. if check_extractable and not doc.is_extractable:
  112. raise PDFTextExtractionNotAllowed('Text extraction is not allowed: %r' % fp)
  113. # Process each page contained in the document.
  114. for (pageno, page) in enumerate(klass.create_pages(doc)):
  115. if pagenos and (pageno not in pagenos):
  116. continue
  117. yield page
  118. if maxpages and maxpages <= pageno+1:
  119. break
  120. return