12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3 |
- #!/usr/bin/env python
- from utils import INF, Plane, get_bound, uniq, csort, fsplit
- from utils import bbox2str, matrix2str, apply_matrix_pt
- ## IndexAssigner
- ##
class IndexAssigner(object):
    """Hand out consecutive index numbers to text boxes.

    ``index`` holds the number that the next LTTextBox seen by ``run``
    will receive; it starts at the value given to the constructor.
    """

    def __init__(self, index=0):
        self.index = index

    def run(self, obj):
        """Number ``obj`` or, if it is a group, everything nested inside it.

        An LTTextBox receives the current counter value and the counter is
        then advanced by one.  An LTTextGroup is walked recursively in its
        iteration order.  Objects of any other type are left untouched.
        """
        if isinstance(obj, LTTextBox):
            obj.index = self.index
            self.index += 1
            return
        if isinstance(obj, LTTextGroup):
            for member in obj:
                self.run(member)
- ## LAParams
- ##
class LAParams(object):
    """Settings bundle consumed by the layout analysis.

    Every constructor argument is stored unchanged on the instance under
    the same name:

    line_overlap    -- overlap ratio used when deciding whether two
                       characters belong to the same line.
    char_margin     -- distance ratio used for the same decision.
    line_margin     -- ratio handed to ``find_neighbors`` when lines are
                       grouped into boxes.
    word_margin     -- ratio controlling when a space is inserted between
                       characters of a line (a false value disables it).
    boxes_flow      -- weight used when ordering grouped text boxes.
    detect_vertical -- enables detection of vertically aligned characters.
    all_texts       -- when true, figures are analyzed as well.
    """

    def __init__(self,
                 line_overlap=0.5,
                 char_margin=2.0,
                 line_margin=0.5,
                 word_margin=0.1,
                 boxes_flow=0.5,
                 detect_vertical=False,
                 all_texts=False):
        self.line_overlap = line_overlap
        self.char_margin = char_margin
        self.line_margin = line_margin
        self.word_margin = word_margin
        self.boxes_flow = boxes_flow
        self.detect_vertical = detect_vertical
        self.all_texts = all_texts

    def __repr__(self):
        # Only these four settings are part of the textual representation.
        template = ('<LAParams: char_margin=%.1f, line_margin=%.1f, '
                    'word_margin=%.1f all_texts=%r>')
        shown = (self.char_margin, self.line_margin,
                 self.word_margin, self.all_texts)
        return template % shown
- ## LTItem
- ##
class LTItem(object):
    """Root of the layout object hierarchy."""

    def analyze(self, laparams):
        """Perform the layout analysis.

        The base implementation ignores ``laparams`` and does nothing;
        subclasses override it.
        """
        return None
- ## LTText
- ##
class LTText(object):
    """Interface for layout objects that carry text.

    Subclasses must implement ``get_text``; the representation shown by
    ``repr`` is built from the class name and that text.
    """

    def get_text(self):
        """Return the text of this object (must be overridden)."""
        raise NotImplementedError

    def __repr__(self):
        cls_name = self.__class__.__name__
        return '<%s %r>' % (cls_name, self.get_text())
- ## LTComponent
- ##
class LTComponent(LTItem):
    """A layout item that occupies an axis-aligned rectangle.

    The rectangle is exposed as ``x0``, ``y0``, ``x1``, ``y1``, the derived
    ``width`` and ``height``, and the tuple ``bbox``; all of them are
    (re)computed by ``set_bbox``.
    """

    def __init__(self, bbox):
        LTItem.__init__(self)
        self.set_bbox(bbox)

    def __repr__(self):
        return ('<%s %s>' %
                (self.__class__.__name__, bbox2str(self.bbox)))

    def set_bbox(self, bbox):
        """Store ``bbox`` (an iterable of exactly x0, y0, x1, y1).

        The argument is unpacked explicitly instead of in the signature:
        tuple parameters are a syntax error on Python 3, while this form
        behaves the same for every positional caller on Python 2 and 3.
        """
        (x0, y0, x1, y1) = bbox
        self.x0 = x0
        self.y0 = y0
        self.x1 = x1
        self.y1 = y1
        self.width = x1 - x0
        self.height = y1 - y0
        self.bbox = (x0, y0, x1, y1)

    def is_empty(self):
        """Return True when the width or the height is not positive."""
        return self.width <= 0 or self.height <= 0

    def is_hoverlap(self, obj):
        """Return True when the x-ranges of self and ``obj`` intersect
        (touching edges count as intersecting)."""
        assert isinstance(obj, LTComponent)
        return obj.x0 <= self.x1 and self.x0 <= obj.x1

    def hdistance(self, obj):
        """Return 0 if the x-ranges intersect, otherwise the smaller of
        the two edge-to-opposite-edge distances along x."""
        assert isinstance(obj, LTComponent)
        if self.is_hoverlap(obj):
            return 0
        return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))

    def hoverlap(self, obj):
        """Return 0 if the x-ranges do not intersect, otherwise
        ``min(|self.x0 - obj.x1|, |self.x1 - obj.x0|)``."""
        assert isinstance(obj, LTComponent)
        if self.is_hoverlap(obj):
            return min(abs(self.x0 - obj.x1), abs(self.x1 - obj.x0))
        return 0

    def is_voverlap(self, obj):
        """Return True when the y-ranges of self and ``obj`` intersect
        (touching edges count as intersecting)."""
        assert isinstance(obj, LTComponent)
        return obj.y0 <= self.y1 and self.y0 <= obj.y1

    def vdistance(self, obj):
        """Return 0 if the y-ranges intersect, otherwise the smaller of
        the two edge-to-opposite-edge distances along y."""
        assert isinstance(obj, LTComponent)
        if self.is_voverlap(obj):
            return 0
        return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))

    def voverlap(self, obj):
        """Return 0 if the y-ranges do not intersect, otherwise
        ``min(|self.y0 - obj.y1|, |self.y1 - obj.y0|)``."""
        assert isinstance(obj, LTComponent)
        if self.is_voverlap(obj):
            return min(abs(self.y0 - obj.y1), abs(self.y1 - obj.y0))
        return 0
- ## LTCurve
- ##
class LTCurve(LTComponent):
    """A path described by a sequence of points.

    The bounding box is whatever ``get_bound`` computes for ``pts``; the
    points and the line width are kept as ``pts`` and ``linewidth``.
    """

    def __init__(self, linewidth, pts):
        bounds = get_bound(pts)
        LTComponent.__init__(self, bounds)
        self.pts = pts
        self.linewidth = linewidth

    def get_pts(self):
        """Return the points as one comma-separated string, each
        coordinate formatted with three decimals."""
        formatted = ['%.3f,%.3f' % pt for pt in self.pts]
        return ','.join(formatted)
- ## LTLine
- ##
class LTLine(LTCurve):
    """A curve made of exactly two points, ``p0`` and ``p1``."""

    def __init__(self, linewidth, p0, p1):
        endpoints = [p0, p1]
        LTCurve.__init__(self, linewidth, endpoints)
- ## LTRect
- ##
class LTRect(LTCurve):
    """A curve tracing the four corners of a rectangle."""

    def __init__(self, linewidth, bbox):
        """Build the rectangle from ``bbox`` = (x0, y0, x1, y1).

        The box is unpacked in the body rather than in the signature:
        tuple parameters are a syntax error on Python 3, and positional
        callers cannot tell the difference.
        """
        (x0, y0, x1, y1) = bbox
        corners = [(x0, y0), (x1, y0), (x1, y1), (x0, y1)]
        LTCurve.__init__(self, linewidth, corners)
- ## LTImage
- ##
class LTImage(LTComponent):
    """An image placed on the page.

    Besides the bounding box, the constructor copies a few entries out of
    ``stream`` (looked up through ``stream.get_any`` under a short and a
    long key): ``srcsize`` (width, height), ``imagemask``, ``bits``
    (defaulting to 1) and ``colorspace``, which is always stored as a list.
    """

    def __init__(self, name, stream, bbox):
        LTComponent.__init__(self, bbox)
        self.name = name
        self.stream = stream
        src_width = stream.get_any(('W', 'Width'))
        src_height = stream.get_any(('H', 'Height'))
        self.srcsize = (src_width, src_height)
        self.imagemask = stream.get_any(('IM', 'ImageMask'))
        self.bits = stream.get_any(('BPC', 'BitsPerComponent'), 1)
        colorspace = stream.get_any(('CS', 'ColorSpace'))
        # A single colour-space value is wrapped so callers always get a list.
        if isinstance(colorspace, list):
            self.colorspace = colorspace
        else:
            self.colorspace = [colorspace]

    def __repr__(self):
        fields = (self.__class__.__name__, self.name,
                  bbox2str(self.bbox), self.srcsize)
        return '<%s(%s) %s %r>' % fields
- ## LTAnno
- ##
class LTAnno(LTItem, LTText):
    """A piece of text that has no bounding box of its own.

    Within this module it is used for the spaces and newlines that the
    text-line classes insert between characters.
    """

    def __init__(self, text):
        self._text = text

    def get_text(self):
        """Return the text given at construction time."""
        return self._text
- ## LTChar
- ##
class LTChar(LTComponent, LTText):
    """A single character together with its computed bounding box.

    Attributes set by the constructor: ``matrix``, ``fontname``, ``adv``
    (textwidth * fontsize * scaling), ``upright``, ``size`` and the
    bounding-box attributes inherited from LTComponent.
    """

    def __init__(self, matrix, font, fontsize, scaling, rise,
                 text, textwidth, textdisp):
        """Compute the character box.

        ``matrix`` must unpack into six values.  ``font`` must provide
        ``fontname``, ``is_vertical()`` and either ``get_width()`` (vertical
        fonts) or ``get_height()``/``get_descent()`` (horizontal fonts).
        ``textdisp`` is only read for vertical fonts, where it must be a
        pair (vx, vy); vx may be None.
        NOTE(review): the .001 factor suggests vx/vy are given in 1/1000
        units of the font size -- confirm against the caller.
        """
        LTText.__init__(self)
        self._text = text
        self.matrix = matrix
        self.fontname = font.fontname
        self.adv = textwidth * fontsize * scaling
        # Boundary rectangle in text space, before the matrix is applied.
        if font.is_vertical():
            glyph_width = font.get_width() * fontsize
            (disp_x, disp_y) = textdisp
            if disp_x is None:
                disp_x = glyph_width // 2
            else:
                disp_x = disp_x * fontsize * .001
            disp_y = (1000 - disp_y) * fontsize * .001
            left = -disp_x
            base = disp_y + rise
            lower_left = (left, base + self.adv)
            upper_right = (left + glyph_width, base)
        else:
            glyph_height = font.get_height() * fontsize
            glyph_descent = font.get_descent() * fontsize
            base = glyph_descent + rise
            lower_left = (0, base)
            upper_right = (self.adv, base + glyph_height)
        (a, b, c, d, e, f) = self.matrix
        self.upright = (a * d * scaling > 0 and b * c <= 0)
        (x0, y0) = apply_matrix_pt(self.matrix, lower_left)
        (x1, y1) = apply_matrix_pt(self.matrix, upper_right)
        # The transform may flip the corners; normalise so x0<=x1, y0<=y1.
        if x0 > x1:
            x0, x1 = x1, x0
        if y0 > y1:
            y0, y1 = y1, y0
        LTComponent.__init__(self, (x0, y0, x1, y1))
        # The nominal size is the box extent across the writing direction.
        if font.is_vertical():
            self.size = self.width
        else:
            self.size = self.height

    def __repr__(self):
        fields = (self.__class__.__name__, bbox2str(self.bbox),
                  matrix2str(self.matrix), self.fontname, self.adv,
                  self.get_text())
        return '<%s %s matrix=%s font=%r adv=%s text=%r>' % fields

    def get_text(self):
        """Return the character's text."""
        return self._text

    def is_compatible(self, obj):
        """Returns True if two characters can coexist in the same line.

        This implementation accepts every ``obj``.
        """
        return True
- ## LTContainer
- ##
class LTContainer(LTComponent):
    """A component with a fixed bounding box that holds child objects.

    Children live in the list ``_objs``; iterating the container or taking
    its ``len`` operates on that list.
    """

    def __init__(self, bbox):
        LTComponent.__init__(self, bbox)
        self._objs = []

    def __iter__(self):
        return iter(self._objs)

    def __len__(self):
        return len(self._objs)

    def add(self, obj):
        """Append ``obj`` to the children."""
        self._objs.append(obj)

    def extend(self, objs):
        """Add every element of ``objs`` through ``self.add`` so that
        subclass overrides of ``add`` take effect."""
        for item in objs:
            self.add(item)

    def analyze(self, laparams):
        """Run ``analyze(laparams)`` on each child in order."""
        for child in self._objs:
            child.analyze(laparams)
- ## LTExpandableContainer
- ##
class LTExpandableContainer(LTContainer):
    """A container whose bounding box grows to enclose what is added.

    It starts from the inverted box (+INF, +INF, -INF, -INF), so the first
    added object defines the box.
    """

    def __init__(self):
        initial = (+INF, +INF, -INF, -INF)
        LTContainer.__init__(self, initial)

    def add(self, obj):
        """Append ``obj`` and widen the bounding box to include it."""
        LTContainer.add(self, obj)
        low_x = min(self.x0, obj.x0)
        low_y = min(self.y0, obj.y0)
        high_x = max(self.x1, obj.x1)
        high_y = max(self.y1, obj.y1)
        self.set_bbox((low_x, low_y, high_x, high_y))
- ## LTTextContainer
- ##
class LTTextContainer(LTExpandableContainer, LTText):
    """An expandable container whose text is the text of its children."""

    def __init__(self):
        LTText.__init__(self)
        LTExpandableContainer.__init__(self)

    def get_text(self):
        """Concatenate ``get_text()`` of every child that is an LTText,
        in iteration order; other children are skipped."""
        pieces = [child.get_text() for child in self
                  if isinstance(child, LTText)]
        return ''.join(pieces)
- ## LTTextLine
- ##
class LTTextLine(LTTextContainer):
    """Base class for a single line of characters.

    ``word_margin`` is stored for the subclasses, which use it to decide
    when to insert a space between characters.
    """

    def __init__(self, word_margin):
        LTTextContainer.__init__(self)
        self.word_margin = word_margin

    def __repr__(self):
        fields = (self.__class__.__name__, bbox2str(self.bbox),
                  self.get_text())
        return '<%s %s %r>' % fields

    def analyze(self, laparams):
        """Analyze the children, then terminate the line with a newline.

        The newline LTAnno is appended through ``LTContainer.add`` so the
        bounding box is not touched.
        """
        LTTextContainer.analyze(self, laparams)
        newline = LTAnno('\n')
        LTContainer.add(self, newline)

    def find_neighbors(self, plane, ratio):
        """Return neighbouring lines found in ``plane`` (must be overridden)."""
        raise NotImplementedError
class LTTextLineHorizontal(LTTextLine):
    """A line of text running along the x axis.

    ``_x1`` remembers the right edge of the most recently added object and
    starts at +INF so that no space precedes the first character.
    """

    def __init__(self, word_margin):
        LTTextLine.__init__(self, word_margin)
        self._x1 = +INF

    def add(self, obj):
        """Add ``obj``, inserting a space first when it is an LTChar that
        starts more than ``word_margin * max(width, height)`` to the right
        of the previous object's right edge."""
        if isinstance(obj, LTChar) and self.word_margin:
            gap_limit = self.word_margin * max(obj.width, obj.height)
            if self._x1 < obj.x0 - gap_limit:
                # The space goes in without affecting the bounding box.
                LTContainer.add(self, LTAnno(' '))
        self._x1 = obj.x1
        LTTextLine.add(self, obj)

    def find_neighbors(self, plane, ratio):
        """Return horizontal lines near this one.

        The search area is this line's box extended by ``ratio * height``
        above and below.  A candidate qualifies when it is horizontal, its
        height differs by less than that amount, and its left or right
        edge lies within that amount of this line's corresponding edge.
        """
        tolerance = ratio * self.height
        search_box = (self.x0, self.y0 - tolerance,
                      self.x1, self.y1 + tolerance)
        found = []
        for candidate in plane.find(search_box):
            if not isinstance(candidate, LTTextLineHorizontal):
                continue
            if not abs(candidate.height - self.height) < tolerance:
                continue
            if (abs(candidate.x0 - self.x0) < tolerance or
                    abs(candidate.x1 - self.x1) < tolerance):
                found.append(candidate)
        return found
class LTTextLineVertical(LTTextLine):
    """A line of text running along the y axis.

    ``_y0`` remembers the bottom edge of the most recently added object
    and starts at -INF so that no space precedes the first character.
    """

    def __init__(self, word_margin):
        LTTextLine.__init__(self, word_margin)
        self._y0 = -INF

    def add(self, obj):
        """Add ``obj``, inserting a space first when it is an LTChar whose
        top edge lies more than ``word_margin * max(width, height)`` below
        the previous object's bottom edge."""
        if isinstance(obj, LTChar) and self.word_margin:
            gap_limit = self.word_margin * max(obj.width, obj.height)
            if obj.y1 + gap_limit < self._y0:
                # The space goes in without affecting the bounding box.
                LTContainer.add(self, LTAnno(' '))
        self._y0 = obj.y0
        LTTextLine.add(self, obj)

    def find_neighbors(self, plane, ratio):
        """Return vertical lines near this one.

        The search area is this line's box extended by ``ratio * width``
        to the left and right.  A candidate qualifies when it is vertical,
        its width differs by less than that amount, and its bottom or top
        edge lies within that amount of this line's corresponding edge.
        """
        tolerance = ratio * self.width
        search_box = (self.x0 - tolerance, self.y0,
                      self.x1 + tolerance, self.y1)
        found = []
        for candidate in plane.find(search_box):
            if not isinstance(candidate, LTTextLineVertical):
                continue
            if not abs(candidate.width - self.width) < tolerance:
                continue
            if (abs(candidate.y0 - self.y0) < tolerance or
                    abs(candidate.y1 - self.y1) < tolerance):
                found.append(candidate)
        return found
- ## LTTextBox
- ##
- ## A set of text objects that are grouped within
- ## a certain rectangular area.
- ##
class LTTextBox(LTTextContainer):
    """A group of text objects occupying a rectangular area.

    ``index`` is -1 until a number is assigned from outside.
    """

    def __init__(self):
        LTTextContainer.__init__(self)
        self.index = -1

    def __repr__(self):
        fields = (self.__class__.__name__, self.index,
                  bbox2str(self.bbox), self.get_text())
        return '<%s(%s) %s %r>' % fields
class LTTextBoxHorizontal(LTTextBox):
    """A text box whose writing mode is 'lr-tb'."""

    def analyze(self, laparams):
        """Analyze the children, then reorder them with ``csort`` by
        descending top edge (``y1``)."""
        LTTextBox.analyze(self, laparams)

        def top_first(child):
            return -child.y1

        self._objs = csort(self._objs, key=top_first)

    def get_writing_mode(self):
        """Return the writing-mode string for this box."""
        return 'lr-tb'
class LTTextBoxVertical(LTTextBox):
    """A text box whose writing mode is 'tb-rl'."""

    def analyze(self, laparams):
        """Analyze the children, then reorder them with ``csort`` by
        descending right edge (``x1``)."""
        LTTextBox.analyze(self, laparams)

        def right_first(child):
            return -child.x1

        self._objs = csort(self._objs, key=right_first)

    def get_writing_mode(self):
        """Return the writing-mode string for this box."""
        return 'tb-rl'
- ## LTTextGroup
- ##
class LTTextGroup(LTTextContainer):
    """A text container created directly from a collection of objects."""

    def __init__(self, objs):
        LTTextContainer.__init__(self)
        # extend() adds one by one, so the bounding box grows to fit them all.
        self.extend(objs)
class LTTextGroupLRTB(LTTextGroup):
    """A text group whose members are ordered left-to-right, top-to-bottom."""

    def analyze(self, laparams):
        """Analyze the members, then reorder them from top-left to
        bottom-right.

        The sort key mixes the left edge and the sum of the vertical edges,
        weighted by ``laparams.boxes_flow``.
        """
        LTTextGroup.analyze(self, laparams)

        def reading_order(member):
            return ((1 - laparams.boxes_flow) * (member.x0) -
                    (1 + laparams.boxes_flow) * (member.y0 + member.y1))

        self._objs = csort(self._objs, key=reading_order)
class LTTextGroupTBRL(LTTextGroup):
    """A text group whose members are ordered top-to-bottom, right-to-left."""

    def analyze(self, laparams):
        """Analyze the members, then reorder them from top-right to
        bottom-left.

        The sort key mixes the sum of the horizontal edges and the top
        edge, weighted by ``laparams.boxes_flow``.
        """
        LTTextGroup.analyze(self, laparams)

        def reading_order(member):
            return (-(1 + laparams.boxes_flow) * (member.x0 + member.x1)
                    - (1 - laparams.boxes_flow) * (member.y1))

        self._objs = csort(self._objs, key=reading_order)
- ## LTLayoutContainer
- ##
class LTLayoutContainer(LTContainer):
    """A container that can run the full layout analysis on its children.

    ``analyze`` groups the LTChar children into text lines, the lines into
    text boxes and the boxes into a hierarchy of text groups.  The
    top-level groups are stored in ``groups`` (None until text boxes have
    been found).
    """

    def __init__(self, bbox):
        LTContainer.__init__(self, bbox)
        self.groups = None

    # group_objects: group text object to textlines.
    def group_objects(self, laparams, objs):
        """Yield text lines built from the consecutive objects in ``objs``.

        Each object is compared with its predecessor only.  An empty
        ``objs`` yields nothing.
        """
        obj0 = None
        line = None
        for obj1 in objs:
            if obj0 is not None:
                # halign: obj0 and obj1 are horizontally aligned, i.e. they
                # overlap vertically by more than line_overlap times the
                # smaller height and are horizontally closer than
                # char_margin times the larger width.
                halign = (obj0.is_compatible(obj1) and
                          obj0.is_voverlap(obj1) and
                          (min(obj0.height, obj1.height) * laparams.line_overlap <
                           obj0.voverlap(obj1)) and
                          (obj0.hdistance(obj1) <
                           max(obj0.width, obj1.width) * laparams.char_margin))

                # valign: obj0 and obj1 are vertically aligned, i.e. they
                # overlap horizontally by more than line_overlap times the
                # smaller width and are vertically closer than char_margin
                # times the larger height.  Only checked when
                # detect_vertical is set.
                valign = (laparams.detect_vertical and
                          obj0.is_compatible(obj1) and
                          obj0.is_hoverlap(obj1) and
                          (min(obj0.width, obj1.width) * laparams.line_overlap <
                           obj0.hoverlap(obj1)) and
                          (obj0.vdistance(obj1) <
                           max(obj0.height, obj1.height) * laparams.char_margin))

                if ((halign and isinstance(line, LTTextLineHorizontal)) or
                        (valign and isinstance(line, LTTextLineVertical))):
                    # obj1 continues the line that is currently open.
                    line.add(obj1)
                elif line is not None:
                    # obj1 does not fit the open line: close it.
                    yield line
                    line = None
                else:
                    if valign and not halign:
                        line = LTTextLineVertical(laparams.word_margin)
                        line.add(obj0)
                        line.add(obj1)
                    elif halign and not valign:
                        line = LTTextLineHorizontal(laparams.word_margin)
                        line.add(obj0)
                        line.add(obj1)
                    else:
                        # Neither or both alignments: obj0 stands alone.
                        line = LTTextLineHorizontal(laparams.word_margin)
                        line.add(obj0)
                        yield line
                        line = None
            obj0 = obj1
        if obj0 is None:
            # Nothing was supplied; there is no trailing object to flush.
            return
        if line is None:
            line = LTTextLineHorizontal(laparams.word_margin)
            line.add(obj0)
        yield line
        return

    # group_textlines: group neighboring lines to textboxes.
    def group_textlines(self, laparams, lines):
        """Yield text boxes made of neighbouring ``lines``.

        Neighbours are obtained from each line's ``find_neighbors`` with
        ``laparams.line_margin``.  Boxes that already contain one of the
        neighbours are merged into the new box.  Empty boxes are dropped.
        """
        plane = Plane(self.bbox)
        plane.extend(lines)
        boxes = {}
        for line in lines:
            neighbors = line.find_neighbors(plane, laparams.line_margin)
            if line not in neighbors:
                continue
            members = []
            for obj1 in neighbors:
                members.append(obj1)
                if obj1 in boxes:
                    members.extend(boxes.pop(obj1))
            if isinstance(line, LTTextLineHorizontal):
                box = LTTextBoxHorizontal()
            else:
                box = LTTextBoxVertical()
            for obj in uniq(members):
                box.add(obj)
                boxes[obj] = box
        done = set()
        for line in lines:
            if line not in boxes:
                continue
            box = boxes[line]
            if box in done:
                continue
            done.add(box)
            if not box.is_empty():
                yield box
        return

    # group_textboxes: group textboxes hierarchically.
    def group_textboxes(self, laparams, boxes):
        """Merge ``boxes`` pairwise into a single tree of text groups.

        ``boxes`` must be a non-empty sequence.  Returns the content of
        the working plane as a list, which holds exactly one object at the
        end (asserted).
        """
        assert boxes

        def dist(obj1, obj2):
            """A distance function between two TextBoxes.

            Consider the bounding rectangle for obj1 and obj2.
            Return its area less the areas of obj1 and obj2,
            shown as 'www' below. This value may be negative.
                    +------+..........+ (x1, y1)
                    | obj1 |wwwwwwwwww:
                    +------+www+------+
                    :wwwwwwwwww| obj2 |
            (x0, y0) +..........+------+
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            return ((x1-x0)*(y1-y0) - obj1.width*obj1.height - obj2.width*obj2.height)

        def isany(obj1, obj2):
            """Check if there's any other object between obj1 and obj2.
            """
            x0 = min(obj1.x0, obj2.x0)
            y0 = min(obj1.y0, obj2.y0)
            x1 = max(obj1.x1, obj2.x1)
            y1 = max(obj1.y1, obj2.y1)
            objs = set(plane.find((x0, y0, x1, y1)))
            return objs.difference((obj1, obj2))

        def rank(entry):
            # Sort on (pass, distance) only.  Sorting the whole tuple would
            # compare the layout objects whenever two entries tie, which
            # raises TypeError on Python 3 and depends on object addresses
            # on Python 2.
            return entry[:2]

        # XXX this still takes O(n^2)  :(
        dists = []
        for (i, obj1) in enumerate(boxes):
            for obj2 in boxes[i+1:]:
                dists.append((0, dist(obj1, obj2), obj1, obj2))
        dists.sort(key=rank)
        plane = Plane(self.bbox)
        plane.extend(boxes)
        while dists:
            (c, d, obj1, obj2) = dists.pop(0)
            if c == 0 and isany(obj1, obj2):
                # Something lies between the pair: retry it later, once.
                dists.append((1, d, obj1, obj2))
                continue
            if (isinstance(obj1, (LTTextBoxVertical, LTTextGroupTBRL)) or
                    isinstance(obj2, (LTTextBoxVertical, LTTextGroupTBRL))):
                group = LTTextGroupTBRL([obj1, obj2])
            else:
                group = LTTextGroupLRTB([obj1, obj2])
            plane.remove(obj1)
            plane.remove(obj2)
            # this line is optimized -- don't change without profiling
            dists = [n for n in dists if n[2] in plane._objs and n[3] in plane._objs]
            for other in plane:
                dists.append((0, dist(group, other), group, other))
            dists.sort(key=rank)
            plane.add(group)
        assert len(plane) == 1
        return list(plane)

    def analyze(self, laparams):
        """Run the layout analysis on the children.

        Non-character children are analyzed individually.  Characters are
        grouped into lines, boxes and groups; afterwards the children are
        replaced by the indexed text boxes, followed by the non-character
        objects and the empty lines.  Without any character only the
        non-character children are analyzed.
        """
        # textobjs is a list of LTChar objects, i.e.
        # it has all the individual characters in the page.
        (textobjs, otherobjs) = fsplit(lambda obj: isinstance(obj, LTChar), self)
        for obj in otherobjs:
            obj.analyze(laparams)
        if not textobjs:
            return
        textlines = list(self.group_objects(laparams, textobjs))
        (empties, textlines) = fsplit(lambda obj: obj.is_empty(), textlines)
        for obj in empties:
            obj.analyze(laparams)
        textboxes = list(self.group_textlines(laparams, textlines))
        if textboxes:
            self.groups = self.group_textboxes(laparams, textboxes)
            assigner = IndexAssigner()
            for group in self.groups:
                group.analyze(laparams)
                assigner.run(group)
            textboxes.sort(key=lambda box: box.index)
        self._objs = textboxes + otherobjs + empties
        return
- ## LTFigure
- ##
class LTFigure(LTLayoutContainer):
    """A layout container placed through a transformation matrix.

    ``bbox`` is given as (x, y, w, h); the stored bounding box is the
    bound of its four corners after ``matrix`` has been applied.
    """

    def __init__(self, name, bbox, matrix):
        self.name = name
        self.matrix = matrix
        (x, y, w, h) = bbox
        corners = ((x, y), (x+w, y), (x, y+h), (x+w, y+h))
        transformed = (apply_matrix_pt(matrix, corner) for corner in corners)
        LTLayoutContainer.__init__(self, get_bound(transformed))

    def __repr__(self):
        fields = (self.__class__.__name__, self.name,
                  bbox2str(self.bbox), matrix2str(self.matrix))
        return '<%s(%s) %s matrix=%s>' % fields

    def analyze(self, laparams):
        """Run the layout analysis only when ``laparams.all_texts`` is set."""
        if laparams.all_texts:
            LTLayoutContainer.analyze(self, laparams)
- ## LTPage
- ##
class LTPage(LTLayoutContainer):
    """A layout container representing one page.

    Stores the page identifier as ``pageid`` and the rotation value as
    ``rotate`` (0 unless given).
    """

    def __init__(self, pageid, bbox, rotate=0):
        LTLayoutContainer.__init__(self, bbox)
        self.rotate = rotate
        self.pageid = pageid

    def __repr__(self):
        fields = (self.__class__.__name__, self.pageid,
                  bbox2str(self.bbox), self.rotate)
        return '<%s(%r) %s rotate=%r>' % fields
|