layout.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703
  1. #!/usr/bin/env python
  2. from utils import INF, Plane, get_bound, uniq, csort, fsplit
  3. from utils import bbox2str, matrix2str, apply_matrix_pt
  4. ## IndexAssigner
  5. ##
  6. class IndexAssigner(object):
  7. def __init__(self, index=0):
  8. self.index = index
  9. return
  10. def run(self, obj):
  11. if isinstance(obj, LTTextBox):
  12. obj.index = self.index
  13. self.index += 1
  14. elif isinstance(obj, LTTextGroup):
  15. for x in obj:
  16. self.run(x)
  17. return
  18. ## LAParams
  19. ##
  20. class LAParams(object):
  21. def __init__(self,
  22. line_overlap=0.5,
  23. char_margin=2.0,
  24. line_margin=0.5,
  25. word_margin=0.1,
  26. boxes_flow=0.5,
  27. detect_vertical=False,
  28. all_texts=False):
  29. self.line_overlap = line_overlap
  30. self.char_margin = char_margin
  31. self.line_margin = line_margin
  32. self.word_margin = word_margin
  33. self.boxes_flow = boxes_flow
  34. self.detect_vertical = detect_vertical
  35. self.all_texts = all_texts
  36. return
  37. def __repr__(self):
  38. return ('<LAParams: char_margin=%.1f, line_margin=%.1f, word_margin=%.1f all_texts=%r>' %
  39. (self.char_margin, self.line_margin, self.word_margin, self.all_texts))
  40. ## LTItem
  41. ##
  42. class LTItem(object):
  43. def analyze(self, laparams):
  44. """Perform the layout analysis."""
  45. return
  46. ## LTText
  47. ##
  48. class LTText(object):
  49. def __repr__(self):
  50. return ('<%s %r>' %
  51. (self.__class__.__name__, self.get_text()))
  52. def get_text(self):
  53. raise NotImplementedError
  54. ## LTComponent
  55. ##
  56. class LTComponent(LTItem):
  57. def __init__(self, bbox):
  58. LTItem.__init__(self)
  59. self.set_bbox(bbox)
  60. return
  61. def __repr__(self):
  62. return ('<%s %s>' %
  63. (self.__class__.__name__, bbox2str(self.bbox)))
  64. def set_bbox(self, (x0, y0, x1, y1)):
  65. self.x0 = x0
  66. self.y0 = y0
  67. self.x1 = x1
  68. self.y1 = y1
  69. self.width = x1-x0
  70. self.height = y1-y0
  71. self.bbox = (x0, y0, x1, y1)
  72. return
  73. def is_empty(self):
  74. return self.width <= 0 or self.height <= 0
  75. def is_hoverlap(self, obj):
  76. assert isinstance(obj, LTComponent)
  77. return obj.x0 <= self.x1 and self.x0 <= obj.x1
  78. def hdistance(self, obj):
  79. assert isinstance(obj, LTComponent)
  80. if self.is_hoverlap(obj):
  81. return 0
  82. else:
  83. return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
  84. def hoverlap(self, obj):
  85. assert isinstance(obj, LTComponent)
  86. if self.is_hoverlap(obj):
  87. return min(abs(self.x0-obj.x1), abs(self.x1-obj.x0))
  88. else:
  89. return 0
  90. def is_voverlap(self, obj):
  91. assert isinstance(obj, LTComponent)
  92. return obj.y0 <= self.y1 and self.y0 <= obj.y1
  93. def vdistance(self, obj):
  94. assert isinstance(obj, LTComponent)
  95. if self.is_voverlap(obj):
  96. return 0
  97. else:
  98. return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
  99. def voverlap(self, obj):
  100. assert isinstance(obj, LTComponent)
  101. if self.is_voverlap(obj):
  102. return min(abs(self.y0-obj.y1), abs(self.y1-obj.y0))
  103. else:
  104. return 0
  105. ## LTCurve
  106. ##
  107. class LTCurve(LTComponent):
  108. def __init__(self, linewidth, pts):
  109. LTComponent.__init__(self, get_bound(pts))
  110. self.pts = pts
  111. self.linewidth = linewidth
  112. return
  113. def get_pts(self):
  114. return ','.join('%.3f,%.3f' % p for p in self.pts)
  115. ## LTLine
  116. ##
  117. class LTLine(LTCurve):
  118. def __init__(self, linewidth, p0, p1):
  119. LTCurve.__init__(self, linewidth, [p0, p1])
  120. return
  121. ## LTRect
  122. ##
  123. class LTRect(LTCurve):
  124. def __init__(self, linewidth, (x0, y0, x1, y1)):
  125. LTCurve.__init__(self, linewidth, [(x0, y0), (x1, y0), (x1, y1), (x0, y1)])
  126. return
  127. ## LTImage
  128. ##
  129. class LTImage(LTComponent):
  130. def __init__(self, name, stream, bbox):
  131. LTComponent.__init__(self, bbox)
  132. self.name = name
  133. self.stream = stream
  134. self.srcsize = (stream.get_any(('W', 'Width')),
  135. stream.get_any(('H', 'Height')))
  136. self.imagemask = stream.get_any(('IM', 'ImageMask'))
  137. self.bits = stream.get_any(('BPC', 'BitsPerComponent'), 1)
  138. self.colorspace = stream.get_any(('CS', 'ColorSpace'))
  139. if not isinstance(self.colorspace, list):
  140. self.colorspace = [self.colorspace]
  141. return
  142. def __repr__(self):
  143. return ('<%s(%s) %s %r>' %
  144. (self.__class__.__name__, self.name,
  145. bbox2str(self.bbox), self.srcsize))
  146. ## LTAnno
  147. ##
  148. class LTAnno(LTItem, LTText):
  149. def __init__(self, text):
  150. self._text = text
  151. return
  152. def get_text(self):
  153. return self._text
  154. ## LTChar
  155. ##
  156. class LTChar(LTComponent, LTText):
  157. def __init__(self, matrix, font, fontsize, scaling, rise,
  158. text, textwidth, textdisp):
  159. LTText.__init__(self)
  160. self._text = text
  161. self.matrix = matrix
  162. self.fontname = font.fontname
  163. self.adv = textwidth * fontsize * scaling
  164. # compute the boundary rectangle.
  165. if font.is_vertical():
  166. # vertical
  167. width = font.get_width() * fontsize
  168. (vx, vy) = textdisp
  169. if vx is None:
  170. vx = width//2
  171. else:
  172. vx = vx * fontsize * .001
  173. vy = (1000 - vy) * fontsize * .001
  174. tx = -vx
  175. ty = vy + rise
  176. bll = (tx, ty+self.adv)
  177. bur = (tx+width, ty)
  178. else:
  179. # horizontal
  180. height = font.get_height() * fontsize
  181. descent = font.get_descent() * fontsize
  182. ty = descent + rise
  183. bll = (0, ty)
  184. bur = (self.adv, ty+height)
  185. (a, b, c, d, e, f) = self.matrix
  186. self.upright = (0 < a*d*scaling and b*c <= 0)
  187. (x0, y0) = apply_matrix_pt(self.matrix, bll)
  188. (x1, y1) = apply_matrix_pt(self.matrix, bur)
  189. if x1 < x0:
  190. (x0, x1) = (x1, x0)
  191. if y1 < y0:
  192. (y0, y1) = (y1, y0)
  193. LTComponent.__init__(self, (x0, y0, x1, y1))
  194. if font.is_vertical():
  195. self.size = self.width
  196. else:
  197. self.size = self.height
  198. return
  199. def __repr__(self):
  200. return ('<%s %s matrix=%s font=%r adv=%s text=%r>' %
  201. (self.__class__.__name__, bbox2str(self.bbox),
  202. matrix2str(self.matrix), self.fontname, self.adv,
  203. self.get_text()))
  204. def get_text(self):
  205. return self._text
  206. def is_compatible(self, obj):
  207. """Returns True if two characters can coexist in the same line."""
  208. return True
  209. ## LTContainer
  210. ##
  211. class LTContainer(LTComponent):
  212. def __init__(self, bbox):
  213. LTComponent.__init__(self, bbox)
  214. self._objs = []
  215. return
  216. def __iter__(self):
  217. return iter(self._objs)
  218. def __len__(self):
  219. return len(self._objs)
  220. def add(self, obj):
  221. self._objs.append(obj)
  222. return
  223. def extend(self, objs):
  224. for obj in objs:
  225. self.add(obj)
  226. return
  227. def analyze(self, laparams):
  228. for obj in self._objs:
  229. obj.analyze(laparams)
  230. return
  231. ## LTExpandableContainer
  232. ##
  233. class LTExpandableContainer(LTContainer):
  234. def __init__(self):
  235. LTContainer.__init__(self, (+INF, +INF, -INF, -INF))
  236. return
  237. def add(self, obj):
  238. LTContainer.add(self, obj)
  239. self.set_bbox((min(self.x0, obj.x0), min(self.y0, obj.y0),
  240. max(self.x1, obj.x1), max(self.y1, obj.y1)))
  241. return
  242. ## LTTextContainer
  243. ##
  244. class LTTextContainer(LTExpandableContainer, LTText):
  245. def __init__(self):
  246. LTText.__init__(self)
  247. LTExpandableContainer.__init__(self)
  248. return
  249. def get_text(self):
  250. return ''.join(obj.get_text() for obj in self if isinstance(obj, LTText))
  251. ## LTTextLine
  252. ##
  253. class LTTextLine(LTTextContainer):
  254. def __init__(self, word_margin):
  255. LTTextContainer.__init__(self)
  256. self.word_margin = word_margin
  257. return
  258. def __repr__(self):
  259. return ('<%s %s %r>' %
  260. (self.__class__.__name__, bbox2str(self.bbox),
  261. self.get_text()))
  262. def analyze(self, laparams):
  263. LTTextContainer.analyze(self, laparams)
  264. LTContainer.add(self, LTAnno('\n'))
  265. return
  266. def find_neighbors(self, plane, ratio):
  267. raise NotImplementedError
  268. class LTTextLineHorizontal(LTTextLine):
  269. def __init__(self, word_margin):
  270. LTTextLine.__init__(self, word_margin)
  271. self._x1 = +INF
  272. return
  273. def add(self, obj):
  274. if isinstance(obj, LTChar) and self.word_margin:
  275. margin = self.word_margin * max(obj.width, obj.height)
  276. if self._x1 < obj.x0-margin:
  277. LTContainer.add(self, LTAnno(' '))
  278. self._x1 = obj.x1
  279. LTTextLine.add(self, obj)
  280. return
  281. def find_neighbors(self, plane, ratio):
  282. d = ratio*self.height
  283. objs = plane.find((self.x0, self.y0-d, self.x1, self.y1+d))
  284. return [obj for obj in objs
  285. if (isinstance(obj, LTTextLineHorizontal) and
  286. abs(obj.height-self.height) < d and
  287. (abs(obj.x0-self.x0) < d or
  288. abs(obj.x1-self.x1) < d))]
  289. class LTTextLineVertical(LTTextLine):
  290. def __init__(self, word_margin):
  291. LTTextLine.__init__(self, word_margin)
  292. self._y0 = -INF
  293. return
  294. def add(self, obj):
  295. if isinstance(obj, LTChar) and self.word_margin:
  296. margin = self.word_margin * max(obj.width, obj.height)
  297. if obj.y1+margin < self._y0:
  298. LTContainer.add(self, LTAnno(' '))
  299. self._y0 = obj.y0
  300. LTTextLine.add(self, obj)
  301. return
  302. def find_neighbors(self, plane, ratio):
  303. d = ratio*self.width
  304. objs = plane.find((self.x0-d, self.y0, self.x1+d, self.y1))
  305. return [obj for obj in objs
  306. if (isinstance(obj, LTTextLineVertical) and
  307. abs(obj.width-self.width) < d and
  308. (abs(obj.y0-self.y0) < d or
  309. abs(obj.y1-self.y1) < d))]
  310. ## LTTextBox
  311. ##
  312. ## A set of text objects that are grouped within
  313. ## a certain rectangular area.
  314. ##
  315. class LTTextBox(LTTextContainer):
  316. def __init__(self):
  317. LTTextContainer.__init__(self)
  318. self.index = -1
  319. return
  320. def __repr__(self):
  321. return ('<%s(%s) %s %r>' %
  322. (self.__class__.__name__,
  323. self.index, bbox2str(self.bbox), self.get_text()))
  324. class LTTextBoxHorizontal(LTTextBox):
  325. def analyze(self, laparams):
  326. LTTextBox.analyze(self, laparams)
  327. self._objs = csort(self._objs, key=lambda obj: -obj.y1)
  328. return
  329. def get_writing_mode(self):
  330. return 'lr-tb'
  331. class LTTextBoxVertical(LTTextBox):
  332. def analyze(self, laparams):
  333. LTTextBox.analyze(self, laparams)
  334. self._objs = csort(self._objs, key=lambda obj: -obj.x1)
  335. return
  336. def get_writing_mode(self):
  337. return 'tb-rl'
  338. ## LTTextGroup
  339. ##
  340. class LTTextGroup(LTTextContainer):
  341. def __init__(self, objs):
  342. LTTextContainer.__init__(self)
  343. self.extend(objs)
  344. return
  345. class LTTextGroupLRTB(LTTextGroup):
  346. def analyze(self, laparams):
  347. LTTextGroup.analyze(self, laparams)
  348. # reorder the objects from top-left to bottom-right.
  349. self._objs = csort(self._objs, key=lambda obj:
  350. (1-laparams.boxes_flow)*(obj.x0) -
  351. (1+laparams.boxes_flow)*(obj.y0+obj.y1))
  352. return
  353. class LTTextGroupTBRL(LTTextGroup):
  354. def analyze(self, laparams):
  355. LTTextGroup.analyze(self, laparams)
  356. # reorder the objects from top-right to bottom-left.
  357. self._objs = csort(self._objs, key=lambda obj:
  358. -(1+laparams.boxes_flow)*(obj.x0+obj.x1)
  359. - (1-laparams.boxes_flow)*(obj.y1))
  360. return
  361. ## LTLayoutContainer
  362. ##
  363. class LTLayoutContainer(LTContainer):
  364. def __init__(self, bbox):
  365. LTContainer.__init__(self, bbox)
  366. self.groups = None
  367. return
  368. # group_objects: group text object to textlines.
  369. def group_objects(self, laparams, objs):
  370. obj0 = None
  371. line = None
  372. for obj1 in objs:
  373. if obj0 is not None:
  374. # halign: obj0 and obj1 is horizontally aligned.
  375. #
  376. # +------+ - - -
  377. # | obj0 | - - +------+ -
  378. # | | | obj1 | | (line_overlap)
  379. # +------+ - - | | -
  380. # - - - +------+
  381. #
  382. # |<--->|
  383. # (char_margin)
  384. halign = (obj0.is_compatible(obj1) and
  385. obj0.is_voverlap(obj1) and
  386. (min(obj0.height, obj1.height) * laparams.line_overlap <
  387. obj0.voverlap(obj1)) and
  388. (obj0.hdistance(obj1) <
  389. max(obj0.width, obj1.width) * laparams.char_margin))
  390. # valign: obj0 and obj1 is vertically aligned.
  391. #
  392. # +------+
  393. # | obj0 |
  394. # | |
  395. # +------+ - - -
  396. # | | | (char_margin)
  397. # +------+ - -
  398. # | obj1 |
  399. # | |
  400. # +------+
  401. #
  402. # |<-->|
  403. # (line_overlap)
  404. valign = (laparams.detect_vertical and
  405. obj0.is_compatible(obj1) and
  406. obj0.is_hoverlap(obj1) and
  407. (min(obj0.width, obj1.width) * laparams.line_overlap <
  408. obj0.hoverlap(obj1)) and
  409. (obj0.vdistance(obj1) <
  410. max(obj0.height, obj1.height) * laparams.char_margin))
  411. if ((halign and isinstance(line, LTTextLineHorizontal)) or
  412. (valign and isinstance(line, LTTextLineVertical))):
  413. line.add(obj1)
  414. elif line is not None:
  415. yield line
  416. line = None
  417. else:
  418. if valign and not halign:
  419. line = LTTextLineVertical(laparams.word_margin)
  420. line.add(obj0)
  421. line.add(obj1)
  422. elif halign and not valign:
  423. line = LTTextLineHorizontal(laparams.word_margin)
  424. line.add(obj0)
  425. line.add(obj1)
  426. else:
  427. line = LTTextLineHorizontal(laparams.word_margin)
  428. line.add(obj0)
  429. yield line
  430. line = None
  431. obj0 = obj1
  432. if line is None:
  433. line = LTTextLineHorizontal(laparams.word_margin)
  434. line.add(obj0)
  435. yield line
  436. return
  437. # group_textlines: group neighboring lines to textboxes.
  438. def group_textlines(self, laparams, lines):
  439. plane = Plane(self.bbox)
  440. plane.extend(lines)
  441. boxes = {}
  442. for line in lines:
  443. neighbors = line.find_neighbors(plane, laparams.line_margin)
  444. if line not in neighbors: continue
  445. members = []
  446. for obj1 in neighbors:
  447. members.append(obj1)
  448. if obj1 in boxes:
  449. members.extend(boxes.pop(obj1))
  450. if isinstance(line, LTTextLineHorizontal):
  451. box = LTTextBoxHorizontal()
  452. else:
  453. box = LTTextBoxVertical()
  454. for obj in uniq(members):
  455. box.add(obj)
  456. boxes[obj] = box
  457. done = set()
  458. for line in lines:
  459. if line not in boxes: continue
  460. box = boxes[line]
  461. if box in done:
  462. continue
  463. done.add(box)
  464. if not box.is_empty():
  465. yield box
  466. return
  467. # group_textboxes: group textboxes hierarchically.
  468. def group_textboxes(self, laparams, boxes):
  469. assert boxes
  470. def dist(obj1, obj2):
  471. """A distance function between two TextBoxes.
  472. Consider the bounding rectangle for obj1 and obj2.
  473. Return its area less the areas of obj1 and obj2,
  474. shown as 'www' below. This value may be negative.
  475. +------+..........+ (x1, y1)
  476. | obj1 |wwwwwwwwww:
  477. +------+www+------+
  478. :wwwwwwwwww| obj2 |
  479. (x0, y0) +..........+------+
  480. """
  481. x0 = min(obj1.x0, obj2.x0)
  482. y0 = min(obj1.y0, obj2.y0)
  483. x1 = max(obj1.x1, obj2.x1)
  484. y1 = max(obj1.y1, obj2.y1)
  485. return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)
  486. def isany(obj1, obj2):
  487. """Check if there's any other object between obj1 and obj2.
  488. """
  489. x0 = min(obj1.x0, obj2.x0)
  490. y0 = min(obj1.y0, obj2.y0)
  491. x1 = max(obj1.x1, obj2.x1)
  492. y1 = max(obj1.y1, obj2.y1)
  493. objs = set(plane.find((x0, y0, x1, y1)))
  494. return objs.difference((obj1, obj2))
  495. # XXX this still takes O(n^2) :(
  496. dists = []
  497. for i in xrange(len(boxes)):
  498. obj1 = boxes[i]
  499. for j in xrange(i+1, len(boxes)):
  500. obj2 = boxes[j]
  501. dists.append((0, dist(obj1, obj2), obj1, obj2))
  502. dists.sort()
  503. plane = Plane(self.bbox)
  504. plane.extend(boxes)
  505. while dists:
  506. (c, d, obj1, obj2) = dists.pop(0)
  507. if c == 0 and isany(obj1, obj2):
  508. dists.append((1, d, obj1, obj2))
  509. continue
  510. if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
  511. isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
  512. group = LTTextGroupTBRL([obj1, obj2])
  513. else:
  514. group = LTTextGroupLRTB([obj1, obj2])
  515. plane.remove(obj1)
  516. plane.remove(obj2)
  517. # this line is optimized -- don't change without profiling
  518. dists = [n for n in dists if n[2] in plane._objs and n[3] in plane._objs]
  519. for other in plane:
  520. dists.append((0, dist(group, other), group, other))
  521. dists.sort()
  522. plane.add(group)
  523. assert len(plane) == 1
  524. return list(plane)
  525. def analyze(self, laparams):
  526. # textobjs is a list of LTChar objects, i.e.
  527. # it has all the individual characters in the page.
  528. (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
  529. for obj in otherobjs:
  530. obj.analyze(laparams)
  531. if not textobjs:
  532. return
  533. textlines = list(self.group_objects(laparams, textobjs))
  534. (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
  535. for obj in empties:
  536. obj.analyze(laparams)
  537. textboxes = list(self.group_textlines(laparams, textlines))
  538. if textboxes:
  539. self.groups = self.group_textboxes(laparams, textboxes)
  540. assigner = IndexAssigner()
  541. for group in self.groups:
  542. group.analyze(laparams)
  543. assigner.run(group)
  544. textboxes.sort(key=lambda box: box.index)
  545. self._objs = textboxes + otherobjs + empties
  546. return
  547. ## LTFigure
  548. ##
  549. class LTFigure(LTLayoutContainer):
  550. def __init__(self, name, bbox, matrix):
  551. self.name = name
  552. self.matrix = matrix
  553. (x, y, w, h) = bbox
  554. bbox = get_bound(apply_matrix_pt(matrix, (p, q))
  555. for (p, q) in ((x, y), (x+w, y), (x, y+h), (x+w, y+h)))
  556. LTLayoutContainer.__init__(self, bbox)
  557. return
  558. def __repr__(self):
  559. return ('<%s(%s) %s matrix=%s>' %
  560. (self.__class__.__name__, self.name,
  561. bbox2str(self.bbox), matrix2str(self.matrix)))
  562. def analyze(self, laparams):
  563. if not laparams.all_texts:
  564. return
  565. LTLayoutContainer.analyze(self, laparams)
  566. return
  567. ## LTPage
  568. ##
  569. class LTPage(LTLayoutContainer):
  570. def __init__(self, pageid, bbox, rotate=0):
  571. LTLayoutContainer.__init__(self, bbox)
  572. self.pageid = pageid
  573. self.rotate = rotate
  574. return
  575. def __repr__(self):
  576. return ('<%s(%r) %s rotate=%r>' %
  577. (self.__class__.__name__, self.pageid,
  578. bbox2str(self.bbox), self.rotate))