Skip to content

Reference

paves.image

Various ways of converting PDFs to images for feeding them to models and/or visualisation.

BoxFunc = Callable[[Boxable], Rect] module-attribute

Function to get a bounding box for a Boxable.

Boxable = Union[Annotation, ContentObject, Element, HasBbox, Rect] module-attribute

Object for which we can get a bounding box.

Color = Union[str, Tuple[int, int, int], Tuple[float, float, float]] module-attribute

Type alias for things that can be used as colors.

ColorMaker = Callable[[str], PillowColor] module-attribute

Function that makes a Pillow color for a string label.

Colors = Union[Color, List[Color], Dict[str, Color]] module-attribute

Type alias for colors or collections of colors.

DEFAULT_COLOR_CYCLE = ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan'] module-attribute

Default color cycle (same as matplotlib)

LabelFunc = Callable[[Boxable], Any] module-attribute

Function to get a label for a Boxable.

PillowColor = Union[str, Tuple[int, int, int]] module-attribute

Type alias for things Pillow accepts as colors.

NotInstalledError

Bases: RuntimeError

Exception raised if the dependencies for a particular PDF to image backend are not installed.

Source code in src/paves/image.py
39
40
41
class NotInstalledError(RuntimeError):
    """Raised when a PDF-to-image backend cannot be used because the
    dependencies it needs are not installed."""

box(objs, *, color=DEFAULT_COLOR_CYCLE, label=True, label_color='white', label_size=9, label_margin=1, label_fill=True, image=None, labelfunc=get_label, boxfunc=get_box, dpi=72, page=None)

Draw boxes around things in a page of a PDF.

Source code in src/paves/image.py
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
def box(
    objs: Union[
        Boxable,
        Iterable[Union[Boxable, None]],
    ],
    *,
    color: Colors = DEFAULT_COLOR_CYCLE,
    label: bool = True,
    label_color: Color = "white",
    label_size: float = 9,
    label_margin: float = 1,
    label_fill: bool = True,
    image: Union[Image.Image, None] = None,
    labelfunc: LabelFunc = get_label,
    boxfunc: BoxFunc = get_box,
    dpi: int = 72,
    page: Union[Page, None] = None,
) -> Union[Image.Image, None]:
    """Draw boxes around things in a page of a PDF.

    Args:
        objs: One boxable object or an iterable of them.  `None`
            entries are skipped.
        color: Color specification passed to `color_maker`; the
            resulting function is called with each object's label text
            to pick the box color.
        label: Whether to draw a text label on top of each box.
        label_color: Color of the label text when `label_fill` is
            true.  When `label_fill` is false the text is drawn in the
            box color instead, so that it stays visible on the page.
        label_size: Label font size, multiplied by `dpi / 72`.
        label_margin: Padding around the label text, multiplied by
            `dpi / 72`.
        label_fill: Whether to fill the label background with the box
            color.
        image: Image to draw on.  If `None`, one is rendered with
            `show` for the page found by `_getpage` from the first
            object and `page`.
        labelfunc: Function giving the label for an object (its result
            is converted with `str`).
        boxfunc: Function giving the bounding box for an object.
        dpi: Resolution used for rendering; box coordinates are
            multiplied by `dpi / 72`.
        page: Passed to `_getpage` when a page has to be rendered.

    Returns:
        The image with the boxes drawn on it, or `None` if no image
        was supplied and there was no object to take a page from.
    """
    draw: ImageDraw.ImageDraw
    scale = dpi / 72
    font = ImageFont.load_default(label_size * scale)
    label_margin *= scale
    make_color = color_maker(color)
    image_page: Union[Page, None] = None
    for obj in _make_boxes(objs):
        if obj is None:
            continue
        if image_page is not None:
            # We rendered the page ourselves: stop at the first object
            # that belongs to a different page.
            if hasattr(obj, "page"):
                if cast(HasPage, obj).page != image_page:
                    break
        if image is None:
            image_page = _getpage(obj, page)
            image = show(image_page, dpi)
        try:
            left, top, right, bottom = (x * scale for x in boxfunc(obj))
        except ValueError:  # it has no content and no box
            continue
        draw = ImageDraw.ImageDraw(image)
        text = str(labelfunc(obj))
        obj_color = make_color(text)
        draw.rectangle((left, top, right, bottom), outline=obj_color)
        if label:
            # Only the right and bottom extents of the text are needed
            # to size the label, which sits directly above the box.
            _, _, tr, tb = font.getbbox(text)
            label_box = (
                left,
                top - tb - label_margin * 2,
                left + tr + label_margin * 2,
                top,
            )
            draw.rectangle(
                label_box,
                outline=obj_color,
                fill=obj_color if label_fill else None,
            )
            draw.text(
                xy=(left + label_margin, top - label_margin),
                text=text,
                font=font,
                # Honor the requested label color (previously this was
                # hard-coded to white and `label_color` was ignored).
                fill=pillow_color(label_color) if label_fill else obj_color,
                anchor="ld",
            )
    return image

color_maker(spec, default='red')

Create a function that makes colors.

Source code in src/paves/image.py
523
524
525
526
@functools.singledispatch
def color_maker(spec: Colors, default: Color = "red") -> ColorMaker:
    """Build a function that maps a label string to a Pillow color.

    This is the single-dispatch fallback, used when no more specific
    implementation is registered for the type of `spec`.  The function
    it returns ignores its argument and always gives `default`,
    converted with `pillow_color`.
    """

    def constant_color(_):
        # Conversion happens on every call, as with the original lambda.
        return pillow_color(default)

    return constant_color

convert(pdf, *, dpi=0, width=0, height=0)

Convert a PDF to images.

Parameters:

Name Type Description Default
pdf Union[str, PathLike, Document, Page, PageList]

PLAYA-PDF document, page, pages, or path to a PDF.

required
dpi int

Render to this resolution (default is 72 dpi).

0
width int

Render to this width in pixels (0 to keep aspect ratio).

0
height int

Render to this height in pixels (0 to keep aspect ratio).

0

Yields: Pillow Image.Image objects, one per page. The original page width and height in default user space units are available in the info property of these images as page_width and page_height.

Raises: ValueError: Invalid arguments (e.g. both dpi and width/height). NotInstalledError: If no renderer is available.

Source code in src/paves/image.py
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
def convert(
    pdf: Union[str, PathLike, Document, Page, PageList],
    *,
    dpi: int = 0,
    width: int = 0,
    height: int = 0,
) -> Iterator[Image.Image]:
    """Convert a PDF to images with the first available renderer.

    Each renderer in `METHODS` is tried in order; one that raises
    `NotInstalledError` is skipped in favour of the next.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        dpi: Render to this resolution (default is 72 dpi).
        width: Render to this width in pixels (0 to keep aspect ratio).
        height: Render to this height in pixels (0 to keep aspect ratio).
    Yields:
        Pillow `Image.Image` objects, one per page.  The original page
        width and height in default user space units are available in
        the `info` property of these images as `page_width` and
        `page_height`.
    Raises:
        ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
        NotInstalledError: If no renderer is available

    """
    for renderer in METHODS:
        try:
            yield from renderer(pdf, dpi=dpi, width=width, height=height)
        except NotInstalledError:
            # This backend is unavailable, try the next one.
            continue
        # The backend ran to completion, so there is nothing left to do.
        return
    tried = ", ".join(m.__name__ for m in METHODS)
    raise NotInstalledError("No renderers available, tried: %s" % (tried))

get_box(obj)

Default function to get the bounding box for an object.

Source code in src/paves/image.py
388
389
390
391
392
393
@functools.singledispatch
def get_box(obj) -> Rect:
    """Fallback bounding-box getter for objects of unregistered types.

    Returns the object's `bbox` attribute if it has one.

    Raises:
        RuntimeError: If the object has no `bbox` attribute.
    """
    if not hasattr(obj, "bbox"):
        raise RuntimeError(f"Don't know how to get the box for {obj!r}")
    return obj.bbox

get_box_annotation(obj)

Get the bounding box of an Annotation

Source code in src/paves/image.py
409
410
411
412
@get_box.register(Annotation)
def get_box_annotation(obj: Annotation) -> Rect:
    """Get the bounding box of an Annotation.

    The annotation's `rect` is passed through `transform_bbox` with
    the `ctm` of the annotation's page.
    """
    page_ctm = obj.page.ctm
    return transform_bbox(page_ctm, obj.rect)

get_box_content(obj)

Get the bounding box of a ContentObject or Element

Source code in src/paves/image.py
402
403
404
405
406
@get_box.register(ContentObject)
@get_box.register(Element)
def get_box_content(obj: Union[ContentObject, Element]) -> Rect:
    """Get the bounding box of a ContentObject or Element, which is
    simply its `bbox` attribute."""
    bbox: Rect = obj.bbox
    return bbox

get_box_rect(obj)

Get the bounding box of a Rect (a tuple), which is returned unchanged

Source code in src/paves/image.py
396
397
398
399
@get_box.register(tuple)
def get_box_rect(obj: Rect) -> Rect:
    """Get the bounding box of something that is already a tuple: it
    is returned unchanged."""
    rect: Rect = obj
    return rect

get_label(obj)

Default function to get the label text for an object.

Source code in src/paves/image.py
415
416
417
418
@functools.singledispatch
def get_label(obj: Boxable) -> str:
    """Fallback label getter: the label is the `str()` of the object."""
    text = str(obj)
    return text

get_label_annotation(obj)

Get the default label text for an Annotation.

This is just a default.

This is one of many possible options, so you may wish to define your own custom LabelFunc.

Source code in src/paves/image.py
427
428
429
430
431
432
433
434
435
@get_label.register(Annotation)
def get_label_annotation(obj: Annotation) -> str:
    """Get the default label text for an Annotation: its `subtype`.

    Note: This is just a default.
        It is only one of many possible choices, so you may prefer to
        supply your own custom LabelFunc.
    """
    subtype: str = obj.subtype
    return subtype

get_label_content(obj)

Get the label text for a ContentObject.

Source code in src/paves/image.py
421
422
423
424
@get_label.register(ContentObject)
def get_label_content(obj: ContentObject) -> str:
    """Get the label text for a ContentObject: its `object_type`."""
    object_type: str = obj.object_type
    return object_type

get_label_element(obj)

Get the default label text for an Element.

This is just a default.

This is one of many possible options, so you may wish to define your own custom LabelFunc.

Source code in src/paves/image.py
438
439
440
441
442
443
444
445
446
@get_label.register(Element)
def get_label_element(obj: Element) -> str:
    """Get the default label text for an Element: its `type`.

    Note: This is just a default.
        It is only one of many possible choices, so you may prefer to
        supply your own custom LabelFunc.
    """
    element_type: str = obj.type
    return element_type

mark(objs, *, color=DEFAULT_COLOR_CYCLE, transparency=0.75, label=False, label_color='white', label_size=9, label_margin=1, outline=False, image=None, labelfunc=get_label, boxfunc=get_box, dpi=72, page=None)

Highlight things in a page of a PDF.

Source code in src/paves/image.py
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
def mark(
    objs: Union[
        Boxable,
        Iterable[Union[Boxable, None]],
    ],
    *,
    color: Colors = DEFAULT_COLOR_CYCLE,
    transparency: float = 0.75,
    label: bool = False,
    label_color: Color = "white",
    label_size: float = 9,
    label_margin: float = 1,
    outline: bool = False,
    image: Union[Image.Image, None] = None,
    labelfunc: LabelFunc = get_label,
    boxfunc: BoxFunc = get_box,
    dpi: int = 72,
    page: Union[Page, None] = None,
) -> Union[Image.Image, None]:
    """Highlight things in a page of a PDF.

    Highlights are painted on a separate overlay which is then blended
    with the page image through a mask using `Image.composite`.

    Args:
        objs: One boxable object or an iterable of them.  `None`
            entries are skipped.
        color: Color specification passed to `color_maker`; the
            resulting function is called with each object's label text
            to pick the highlight color.
        transparency: Scaled by 255 and clamped to 0..255 to give the
            mask value written over each highlighted region.  The rest
            of the mask is 255, where only the page image is used.
        label: Whether to draw a text label on top of each highlight.
        label_color: Color of the label text when `outline` is false.
            With `outline` the label text is drawn in opaque black.
        label_size: Label font size, multiplied by `dpi / 72`.
        label_margin: Padding around the label text, multiplied by
            `dpi / 72`.
        outline: Whether to draw an opaque black outline around each
            highlight (and its label).
        image: Image to highlight.  If `None`, one is rendered with
            `show` for the page found by `_getpage` from the first
            object and `page`.
        labelfunc: Function giving the label for an object (its result
            is converted with `str`).
        boxfunc: Function giving the bounding box for an object.
        dpi: Resolution used for rendering; box coordinates are
            multiplied by `dpi / 72`.
        page: Passed to `_getpage` when a page has to be rendered.

    Returns:
        The composited image, the unchanged `image` if there was
        nothing to highlight, or `None` if no image was supplied and
        there was no object to take a page from.
    """
    overlay: Union[Image.Image, None] = None
    mask: Union[Image.Image, None] = None
    draw: ImageDraw.ImageDraw
    scale = dpi / 72
    font = ImageFont.load_default(label_size * scale)
    # Clamp on both sides: the mask is an 8-bit ("L" mode) image, so a
    # negative transparency must not produce a negative fill value.
    alpha = max(0, min(255, int(transparency * 255)))
    label_margin *= scale
    make_color = color_maker(color)
    image_page: Union[Page, None] = None
    for obj in _make_boxes(objs):
        if obj is None:
            continue
        if image_page is not None:
            # We rendered the page ourselves: stop at the first object
            # that belongs to a different page.
            if hasattr(obj, "page"):
                if cast(HasPage, obj).page != image_page:
                    break
        if image is None:
            image_page = _getpage(obj, page)
            image = show(image_page, dpi)
        if overlay is None:
            overlay = Image.new("RGB", image.size)
        if mask is None:
            mask = Image.new("L", image.size, 255)
        try:
            left, top, right, bottom = (x * scale for x in boxfunc(obj))
        except ValueError:  # it has no content and no box
            continue
        draw = ImageDraw.ImageDraw(overlay)
        text = str(labelfunc(obj))
        obj_color = make_color(text)
        draw.rectangle((left, top, right, bottom), fill=obj_color)
        mask_draw = ImageDraw.ImageDraw(mask)
        mask_draw.rectangle((left, top, right, bottom), fill=alpha)
        if outline:
            draw.rectangle((left, top, right, bottom), outline="black")
            mask_draw.rectangle((left, top, right, bottom), outline=0)
        if label:
            # Only the right and bottom extents of the text are needed
            # to size the label, which sits directly above the box.
            _, _, tr, tb = font.getbbox(text)
            label_box = (
                left,
                top - tb - label_margin * 2,
                left + tr + label_margin * 2,
                top,
            )
            text_xy = (left + label_margin, top - label_margin)
            draw.rectangle(
                label_box,
                outline=obj_color,
                fill=obj_color,
            )
            mask_draw.rectangle(
                label_box,
                fill=alpha,
            )
            if outline:
                draw.rectangle(
                    label_box,
                    outline="black",
                )
                mask_draw.rectangle(
                    label_box,
                    outline=0,
                )
                draw.text(
                    xy=text_xy,
                    text=text,
                    font=font,
                    fill="black",
                    anchor="ld",
                )
                # Mask value 0 takes the text entirely from the overlay.
                mask_draw.text(
                    xy=text_xy,
                    text=text,
                    font=font,
                    fill=0,
                    anchor="ld",
                )
            else:
                draw.text(
                    xy=text_xy,
                    text=text,
                    font=font,
                    # Honor the requested label color (previously this
                    # was hard-coded to white and `label_color` was
                    # ignored).
                    fill=pillow_color(label_color),
                    anchor="ld",
                )
    if image is None:
        return None
    if overlay is not None and mask is not None:
        return Image.composite(image, overlay, mask)
    else:
        return image

pdfium(pdf, *, dpi=0, width=0, height=0)

Convert a PDF to images using PyPDFium2

Parameters:

Name Type Description Default
pdf Union[str, PathLike, Document, Page, PageList]

PLAYA-PDF document, page, pages, or path to a PDF.

required
dpi int

Render to this resolution (default is 72 dpi).

0
width int

Render to this width in pixels.

0
height int

Render to this height in pixels.

0

Yields: Pillow Image.Image objects, one per page. Page width and height are available in the info property of the images.

Raises: ValueError: Invalid arguments (e.g. both dpi and width/height). NotInstalledError: If PyPDFium2 is not installed.

Source code in src/paves/image.py
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
def pdfium(
    pdf: Union[str, PathLike, Document, Page, PageList],
    *,
    dpi: int = 0,
    width: int = 0,
    height: int = 0,
) -> Iterator[Image.Image]:
    """Render a PDF to images with PyPDFium2.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        dpi: Render to this resolution (default is 72 dpi).
        width: Render to this width in pixels.
        height: Render to this height in pixels.
    Yields:
        Pillow `Image.Image` objects, one per page.  The page index,
        width and height are stored in the `info` property of each
        image as `page_index`, `page_width` and `page_height`.
    Raises:
        ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
        NotInstalledError: If PyPDFium2 is not installed.
    """
    if dpi and (width or height):
        raise ValueError("Cannot specify both `dpi` and `width` or `height`")
    try:
        import pypdfium2  # noqa: F401
    except ImportError as e:
        raise NotInstalledError("PyPDFium2 does not seem to be installed") from e
    fit_both = bool(width and height)
    for idx, page in _get_pdfium_pages(pdf):
        page_width = page.get_width()
        page_height = page.get_height()
        if fit_both:
            # Scale to longest side (since pypdfium2 doesn't appear to
            # allow non-1:1 aspect ratio); resized to fit below.
            scale = max(width / page_width, height / page_height)
        elif width:
            scale = width / page_width
        elif height:
            scale = height / page_height
        else:
            scale = (dpi or 72) / 72
        img = page.render(scale=scale).to_pil()
        if fit_both:
            # Resize down to desired size
            img = img.resize(size=(width, height))
        img.info.update(
            page_index=idx,
            page_width=page_width,
            page_height=page_height,
        )
        yield img

pillow_color(color)

Convert colors to a form acceptable to Pillow.

Source code in src/paves/image.py
511
512
513
514
515
516
517
518
519
520
def pillow_color(color: Color) -> PillowColor:
    """Convert a color to something Pillow accepts.

    Strings and triples of ints are passed through as they are.  Any
    other triple has each component multiplied by 255 and truncated
    with `int()`.
    """
    if isinstance(color, str):
        return color
    red, green, blue = color
    # Spelled out rather than using all() so that type checkers can
    # narrow the component types.
    if isinstance(red, int) and isinstance(green, int) and isinstance(blue, int):
        return (red, green, blue)
    red, green, blue = [int(component * 255) for component in color]
    return (red, green, blue)

popple(pdf, *, dpi=0, width=0, height=0)

Convert a PDF to images using Poppler's pdftoppm.

Parameters:

Name Type Description Default
pdf Union[str, PathLike, Document, Page, PageList]

PLAYA-PDF document, page, pages, or path to a PDF.

required
dpi int

Render to this resolution (default is 72 dpi).

0
width int

Render to this width in pixels.

0
height int

Render to this height in pixels.

0

Yields: Pillow Image.Image objects, one per page.

Raises: ValueError: Invalid arguments (e.g. both dpi and width/height). NotInstalledError: If Poppler is not installed.

Source code in src/paves/image.py
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
def popple(
    pdf: Union[str, PathLike, Document, Page, PageList],
    *,
    dpi: int = 0,
    width: int = 0,
    height: int = 0,
) -> Iterator[Image.Image]:
    """Convert a PDF to images using Poppler's pdftoppm.

    The pages are rendered into a temporary directory, which is
    removed once the generator is finished.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        dpi: Render to this resolution (default is 72 dpi).
        width: Render to this width in pixels.
        height: Render to this height in pixels.
    Yields:
        Pillow `Image.Image` objects, one per page, fully loaded into
        memory.  The page index, width and height are stored in the
        `info` property of each image as `page_index`, `page_width`
        and `page_height`.
    Raises:
        ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
        NotInstalledError: If Poppler is not installed.
    """
    if dpi and (width or height):
        raise ValueError("Cannot specify both `dpi` and `width` or `height`")
    try:
        # Only used to find out whether the executable exists.
        subprocess.run(["pdftoppm", "-h"], capture_output=True)
    except FileNotFoundError as e:
        raise NotInstalledError("Poppler does not seem to be installed") from e
    args = make_poppler_args(dpi, width, height)
    with tempfile.TemporaryDirectory() as tempdir:
        temppath = Path(tempdir)
        # FIXME: Possible to Popple in a Parallel Pipeline
        page_sizes = _popple(pdf, temppath, args)
        for (page_idx, page_width, page_height), ppm in zip(
            page_sizes,
            (path for path in sorted(temppath.iterdir()) if path.suffix == ".ppm"),
        ):
            img = Image.open(ppm)
            # Image.open() is lazy and keeps the file open.  Read the
            # pixel data now so the image neither depends on nor holds
            # open a file in the temporary directory, which is deleted
            # when this generator finishes.
            img.load()
            img.info["page_index"] = page_idx
            img.info["page_width"] = page_width
            img.info["page_height"] = page_height
            yield img

show(page, dpi=72)

Show a single page with some reasonable defaults.

Source code in src/paves/image.py
367
368
369
def show(page: Page, dpi: int = 72) -> Image.Image:
    """Render a single page to an image, i.e. the first image yielded
    by `convert` for that page at the given resolution."""
    rendered = convert(page, dpi=dpi)
    return next(rendered)

paves.text

Various somewhat-more-heuristic ways of guessing, getting, and processing text in PDFs.

WordObject dataclass

Bases: TextBase

"Word" in a PDF.

This is heuristically determined, either by explicit whitespace (if you're lucky enough to have a Tagged PDF) or by a sufficient gap between adjacent glyphs (otherwise).

It otherwise behaves just like a TextObject. You can iterate over its glyphs, etc. But, as a treat, these glyphs are "finalized" so you don't have to worry about inconsistent graphics states and so forth, and you also get some convenience properties.

The origin of the current (logical) line is also available, to facilitate grouping words into lines, if you so desire (simply use itertools.groupby(words, paves.text.line))

Source code in src/paves/text.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
@dataclass
class WordObject(TextBase):
    """
    A "word" in a PDF.

    Words are found heuristically, either from explicit whitespace
    (if you're lucky enough to have a Tagged PDF) or from a
    sufficiently large gap between adjacent glyphs (otherwise).

    Apart from that it behaves just like a `TextObject`: you can
    iterate over its glyphs, and so on.  As a bonus the glyphs are
    "finalized", so you don't have to worry about inconsistent
    graphics states and so forth, and a few convenience properties
    are provided.

    The origin of the current (logical) line is also available, to
    facilitate grouping words into lines, if you so desire (simply
    use `itertools.groupby(words, paves.text.line)`)
    """

    _glyphs: List[GlyphObject]
    _next_origin: Point
    line: Point

    def __iter__(self) -> Iterator["ContentObject"]:
        """Iterate over the glyphs of this word."""
        return iter(self._glyphs)

    @property
    def matrix(self) -> Matrix:
        """The `matrix` of the first glyph."""
        first_glyph = self._glyphs[0]
        return first_glyph.matrix

    @property
    def chars(self) -> str:
        """Text of all glyphs joined together (glyphs whose text is
        `None` are left out)."""
        pieces = [glyph.text for glyph in self._glyphs if glyph.text is not None]
        return "".join(pieces)

    @property
    def origin(self) -> Point:
        """The `origin` of the first glyph."""
        first_glyph = self._glyphs[0]
        return first_glyph.origin

    @property
    def displacement(self) -> Point:
        """Vector from this word's origin to the stored next origin."""
        start_x, start_y = self.origin
        next_x, next_y = self._next_origin
        return next_x - start_x, next_y - start_y

line_break(glyph, predicted_origin)

Heuristically predict a line break based on the predicted origin from the previous glyph.

Source code in src/paves/text.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def line_break(
    glyph: GlyphObject, predicted_origin: Point, max_line_offset: float = 100
) -> bool:
    """Heuristically predict a line break based on the predicted origin
    from the previous glyph.

    The offset of the glyph from the predicted origin is measured
    across the writing direction: along x for vertical fonts, along y
    otherwise (with the sign flipped when the page space is
    `"screen"`).

    Args:
        glyph: The glyph under consideration.
        predicted_origin: Where the previous glyph predicted this one
            would start.
        max_line_offset: Largest non-negative offset that is still
            taken to be on the same line.  The default of 100 is
            arbitrary.

    Returns:
        True if the offset is negative or greater than
        `max_line_offset`.
    """
    x, y = glyph.origin
    px, py = predicted_origin
    if glyph.font.vertical:
        line_offset = x - px
    else:
        line_offset = y - py
        if glyph.page.space == "screen":
            line_offset = -line_offset
    return line_offset < 0 or line_offset > max_line_offset

text_objects(pdf)

Iterate over all text objects in a PDF, page, or pages

Source code in src/paves/text.py
109
110
111
112
113
114
@singledispatch
def text_objects(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Iterator[TextObject]:
    """Iterate over all text objects in a PDF, page, or pages.

    This is the single-dispatch fallback, reached only when no
    implementation is registered for the type of `pdf`.

    Raises:
        NotImplementedError: Always, naming the unsupported type.
    """
    raise NotImplementedError(f"Not implemented for {type(pdf)}")

word_break(glyph, predicted_origin, prev_displacement)

Heuristically predict a word break based on the predicted origin from the previous glyph.

Source code in src/paves/text.py
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def word_break(
    glyph: GlyphObject, predicted_origin: Point, prev_displacement: Point
) -> bool:
    """Heuristically decide whether a glyph starts a new word, using
    the origin predicted from the previous glyph.

    A space glyph always breaks.  Otherwise the glyph's offset from
    the predicted origin is measured along the writing direction (y
    for vertical fonts, with the sign flipped in `"screen"` space; x
    otherwise) and compared with a fixed gap and with the previous
    glyph's displacement.
    """
    if glyph.text == " ":
        return True
    gx, gy = glyph.origin
    pred_x, pred_y = predicted_origin
    disp_x, disp_y = prev_displacement
    if not glyph.font.vertical:
        offset, advance = gx - pred_x, disp_x
    else:
        offset, advance = gy - pred_y, disp_y
        if glyph.page.space == "screen":
            offset, advance = -offset, -advance
    # Either there is a visible gap ...
    if offset > 0.5:
        return True
    # ... or we have moved back to before the previous glyph.
    return offset < -advance

words(pdf)

Extract "words" (i.e. whitespace-separated text cells) from a PDF or one of its pages.

Parameters:

Name Type Description Default
pdf Union[str, PathLike, Document, Page, PageList]

PLAYA-PDF document, page, pages, or path to a PDF.

required

Yields:

Type Description
WordObject

WordObject objects, which can be visualized with paves.image

WordObject

functions, or you can do various other things with them too.

Source code in src/paves/text.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
def words(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Iterator[WordObject]:
    """Extract "words" (i.e. whitespace-separated text cells) from a
    PDF or one of its pages.

    Glyphs from `text_objects(pdf)` are accumulated until `word_break`
    or `line_break` reports a break, at which point the accumulated
    glyphs are emitted as one word.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Yields:
        `WordObject` objects, which can be visualized with `paves.image`
        functions, or you can do various other things with them too.
    """

    def build_word(
        members: List[GlyphObject], next_origin: Point, line_start: Point
    ) -> WordObject:
        # Page reference and graphics state are borrowed from the
        # first glyph of the word.
        return WordObject(
            _pageref=members[0]._pageref,
            _parentkey=members[0]._parentkey,
            gstate=members[0].gstate,  # Not necessarily correct!
            ctm=members[0].ctm,  # Not necessarily correct!
            mcstack=members[0].mcstack,  # Not necessarily correct!
            _glyphs=members,
            _next_origin=next_origin,
            line=line_start,
        )

    pending: List[GlyphObject] = []
    expected_origin: Union[None, Point] = None
    last_disp: Union[None, Point] = None
    line_start: Union[None, Point] = None
    for text_obj in text_objects(pdf):
        for glyph in text_obj:
            if line_start is None:
                line_start = glyph.origin
            if expected_origin and last_disp:
                starts_word = word_break(glyph, expected_origin, last_disp)
                starts_line = line_break(glyph, expected_origin)
                if pending and (starts_word or starts_line):
                    yield build_word(pending, expected_origin, line_start)
                    # Start a fresh list: the word just yielded keeps
                    # the old one.
                    pending = []
                if starts_line:
                    line_start = glyph.origin
            if glyph.text is not None and glyph.text != " ":
                pending.append(cast(GlyphObject, glyph.finalize()))
            # Every glyph moves the prediction, even ones not kept.
            last_disp = glyph.displacement
            expected_origin = _add_point(glyph.origin, last_disp)
    # Emit whatever is left over after the last glyph.
    if expected_origin and line_start and pending:
        yield build_word(pending, expected_origin, line_start)

paves.tables

Simple and not at all Java-damaged interface for table detection.

TableObject dataclass

Bases: ContentObject

Table on one page of a PDF.

This is a ContentObject and can be treated as one (notably with paves.image functions).

It could either come from a logical structure element, or it could simply be a bounding box (as detected by some sort of visual model). While these TableObjects will never span multiple pages, the underlying logical structure element may do so. This is currently the only way to detect multi-page tables through this interface (they will have an equivalent parent property).

Note that the graphics state and coordinate transformation matrix may just be the page defaults, if Machine Learning™ was used to detect the table in a rendered image of the page.

Source code in src/paves/tables.py
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
@dataclass
class TableObject(ContentObject):
    """Table on one page of a PDF.

    This **is** a ContentObject and can be treated as one (notably
    with `paves.image` functions).

    It could either come from a logical structure element, or it could
    simply be a bounding box (as detected by some sort of visual
    model).  While these `TableObject`s will never span multiple
    pages, the underlying logical structure element may do so.  This
    is currently the only way to detect multi-page tables through this
    interface (they will have an equivalent `parent` property).

    Note that the graphics state and coordinate transformation matrix
    may just be the page defaults, if Machine Learning™ was used to
    detect the table in a rendered image of the page.

    """

    # Explicit bounding box, if one is known (takes priority in `bbox`)
    _bbox: Union[Rect, None]
    # Structure element this table was made from, if any
    _parent: Union[Element, None]

    @property
    def bbox(self) -> Rect:
        """Bounding box of the table.

        Taken from, in order: the explicit `_bbox`; the parent
        element's own `bbox` (only when the parent is on the same page
        and its box is not `BBOX_NONE`); the bounds, computed with
        `get_bound_rects`, of the parent's contents that are on this
        page.  `BBOX_NONE` is returned if there is neither a box nor a
        parent.
        """
        # _bbox takes priority as we *could* have both
        if self._bbox is not None:
            return self._bbox
        elif self._parent is not None:
            # Try to get it from the element but only if it has the
            # same page as us (otherwise it will be wrong!)
            if self._parent.page is self.page:
                bbox = self._parent.bbox
                if bbox is not BBOX_NONE:
                    return bbox
            # We always have a page even if self._parent doesn't
            return get_bound_rects(
                item.bbox
                for item in self._parent.contents
                if item.page is self.page and item.bbox is not BBOX_NONE
            )
        else:
            # This however should never happen
            return BBOX_NONE

    @classmethod
    def from_bbox(cls, page: Page, bbox: Rect) -> "TableObject":
        """Create a table from a bounding box on a page.

        The table gets a fresh `GraphicState`, the page's `ctm`, an
        empty marked content stack and no parent element.
        """
        # Use default values
        return cls(
            _pageref=_ref_page(page),
            _parentkey=None,
            gstate=GraphicState(),
            ctm=page.ctm,
            mcstack=(),
            _bbox=bbox,
            _parent=None,
        )

    @classmethod
    def from_element(
        cls,
        el: Element,
        page: Page,
        contents: Union[Iterable[Union[ContentItem, StructContentObject]], None] = None,
    ) -> Union["TableObject", None]:
        """Create a table for one page from a structure element.

        Args:
            el: The structure element, which becomes the table's parent.
            page: The page this table is on.
            contents: Content items to look at; defaults to
                `el.contents`.

        Returns:
            The table, or `None` if the loop over `contents` finishes
            without finding a usable item on `page`.
        """
        if contents is None:
            contents = el.contents
        # Find a ContentObject so we can get a bbox, mcstack, ctm
        # (they might not be *correct* of course, but oh well)
        gstate: Union[GraphicState, None] = None
        ctm: Union[Matrix, None] = None
        mcstack: Union[Tuple[MarkedContent, ...], None] = None
        bbox: Union[Rect, None] = None
        for kid in contents:
            # For multi-page tables, skip any contents on a different page
            if kid.page != page:
                continue
            if isinstance(kid, StructContentObject):
                obj = kid.obj
                if obj is None:
                    continue
                elif isinstance(obj, Annotation):
                    # FIXME: for the moment just ignore these
                    continue
                else:
                    # First usable object wins: take its state and box
                    gstate = copy(obj.gstate)
                    ctm = obj.ctm
                    mcstack = obj.mcstack
                    bbox = obj.bbox
                    break
            elif isinstance(kid, ContentItem):
                # It's a ContentItem
                try:
                    cobj = next(iter(kid))
                except StopIteration:
                    # Nothing in this item, try the next one
                    continue
                # No box is taken here, so `_bbox` stays None and the
                # `bbox` property falls back on the parent element
                gstate = copy(cobj.gstate)
                ctm = cobj.ctm
                mcstack = cobj.mcstack
                break
        else:
            # No contents, no table for you!
            return None
        return cls(
            _pageref=_ref_page(page),
            _parentkey=None,
            gstate=gstate,
            ctm=ctm,
            mcstack=mcstack,
            _bbox=bbox,
            _parent=el,
        )

table_bounds_to_objects(pdf, bounds)

Create TableObjects from detected bounding boxes.

Source code in src/paves/tables.py
249
250
251
252
253
254
255
256
257
def table_bounds_to_objects(
    pdf: Union[str, PathLike, Document, Page, PageList],
    bounds: Iterable[Tuple[int, Iterable[Rect]]],
) -> Iterator[TableObject]:
    """Turn detected bounding boxes into TableObjects.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        bounds: Pairs of (page index, bounding boxes), matched
            positionally against the pages obtained from `pdf`.

    Yields:
        One `TableObject` per bounding box, built with
        `TableObject.from_bbox`.
    """
    pages = _get_pages(pdf)
    for page, detection in zip(pages, bounds):
        expected_idx, rects = detection
        # Pages and detections are paired by position, so the page
        # index carried by the detection has to agree with the page.
        assert page.page_idx == expected_idx
        for rect in rects:
            yield TableObject.from_bbox(page, rect)

table_elements(pdf)

Iterate over all table elements in a PDF, page, or pages.

Source code in src/paves/tables.py
141
142
143
144
145
146
@singledispatch
def table_elements(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Iterator[Element]:
    """Iterate over all table elements in a PDF, page, or pages.

    This is the generic fallback of a single-dispatch function: it
    only runs when no implementation is registered for the type of
    `pdf`, and it always raises.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Raises:
        NotImplementedError: Always, naming the unsupported type.
    """
    unsupported = type(pdf)
    raise NotImplementedError(f"Not implemented for {unsupported}")

table_elements_to_objects(elements, page=None)

Make TableObjects from Elements.

Source code in src/paves/tables.py
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
def table_elements_to_objects(
    elements: Iterable[Element], page: Union[Page, None] = None
) -> Iterator[TableObject]:
    """Make TableObjects from Elements.

    Args:
        elements: Elements to convert.
        page: If given, only contents located on this page are used.

    Yields:
        A `TableObject` for each run of consecutive contents sharing
        a page, whenever `TableObject.from_element` can build one.
    """
    page_of = attrgetter("page")
    for element in elements:
        # An element's contents can be spread over several pages, so
        # they are walked in runs of consecutive items on the same
        # page.  An explicitly requested page takes precedence over
        # whatever pages the contents say they are on.
        for content_page, run in groupby(element.contents, page_of):
            wanted = content_page is not None and (
                page is None or content_page is page
            )
            if not wanted:
                continue
            obj = TableObject.from_element(element, content_page, run)
            if obj is not None:
                yield obj

tables(pdf, **kwargs)

Identify tables in a PDF or one of its pages.

This will always try to use logical structure (via PLAYA-PDF) first to identify tables.

For the moment, this only works on tagged and accessible PDFs. So, like paves.image, it can also use Machine Learning Models™ to do so, which involves nasty horrible dependencyses (we hates them, they stole the precious) like cudnn-10-gigabytes-of-c++.

If you'd like to try that, then you can do so by installing the transformers[torch] package (if you don't have a GPU, try adding --extra-index-url https://download.pytorch.org/whl/cpu to pip's command line).

These tables cannot span multiple pages.

Often, a table will span multiple pages. With PDF logical structure, this can be represented (and sometimes is), but if there is no logical structure, this is not possible, since tables are detected from the rendered image of a page. Reconstructing this information is both extremely important and also very difficult with current models (perhaps very big VLMs can do it?). Since we also want to visualize tables with paves.image, we don't return multi-page tables here.

Parameters:

Name Type Description Default
pdf Union[str, PathLike, Document, Page, PageList]

PLAYA-PDF document, page, pages, or path to a PDF.

required

Returns:

Type Description
Iterator[TableObject]

An iterator over TableObject. If no method is available to detect tables, this will return an iterator over an empty list. You may wish to use tables_orelse to ensure that tables can be detected.

Source code in src/paves/tables.py
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
def tables(
    pdf: Union[str, PathLike, Document, Page, PageList], **kwargs: Any
) -> Iterator[TableObject]:
    """Identify tables in a PDF or one of its pages.

    Logical structure (via PLAYA-PDF) is always tried first.

    For the moment, that only works on tagged and accessible PDFs.
    So, like `paves.image`, this can also fall back on machine
    learning models, which come with very large dependencies (for
    instance cuDNN).

    To try that, install the `transformers[torch]` package (without
    a GPU, consider adding
    `--extra-index-url https://download.pytorch.org/whl/cpu` to pip's
    command line).

    Note: These tables cannot span multiple pages.
        Tables often do span multiple pages.  PDF logical structure
        can represent this (and sometimes does), but without logical
        structure it is not possible, because tables are detected
        from the rendered image of a single page.  Reconstructing
        this information is both extremely important and very
        difficult with current models (perhaps very big VLMs can do
        it?).  Since tables should also be visualizable with
        `paves.image`, multi-page tables are not returned here.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        **kwargs: Passed through unchanged to `tables_orelse`.

    Returns:
        An iterator over `TableObject`.  If no method is available to
        detect tables, this is an iterator over an empty collection.
        Use `tables_orelse` if you need to know whether tables can be
        detected at all.

    """
    found = tables_orelse(pdf, **kwargs)
    # Hide the "no method available" case behind an empty iterator so
    # callers can always iterate over the result.
    return iter(()) if found is None else found

tables_detr(pdf, device='cpu')

Identify tables in a PDF or one of its pages using IBM's RT-DETR layout detection model

Parameters:

Name Type Description Default
pdf Union[str, PathLike, Document, Page, PageList]

PLAYA-PDF document, page, pages, or path to a PDF.

required
device str

Torch device for running the model.

'cpu'

Returns:

Type Description
Union[Iterator[TableObject], None]

An iterator over TableObject, or None, if the model can't be used

Source code in src/paves/tables.py
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
def tables_detr(
    pdf: Union[str, PathLike, Document, Page, PageList],
    device: str = "cpu",
) -> Union[Iterator[TableObject], None]:
    """Identify tables in a PDF or one of its pages using IBM's
    RT-DETR layout detection model.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        device: Torch device for running the model.

    Returns:
      An iterator over `TableObject`, or `None` if the model can't be
      used because `paves.tables_detr` cannot be imported.
    """
    try:
        # Imported lazily so the model's dependencies stay optional.
        from paves.tables_detr import table_bounds
    except ImportError:
        return None
    else:
        detected = table_bounds(pdf, device=device)
        return table_bounds_to_objects(pdf, detected)

tables_orelse(pdf, **kwargs)

Identify tables in a PDF or one of its pages, or fail.

This works like tables but forces you (if you use type checking) to detect the case where tables cannot be detected by any known method.

Parameters:

Name Type Description Default
pdf Union[str, PathLike, Document, Page, PageList]

PLAYA-PDF document, page, pages, or path to a PDF.

required

Returns:

Type Description
Union[Iterator[TableObject], None]

An iterator over TableObject, or None, if there is no method available to detect tables. This will cause a TypeError if you try to iterate over it anyway.

Source code in src/paves/tables.py
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
def tables_orelse(
    pdf: Union[str, PathLike, Document, Page, PageList], **kwargs: Any
) -> Union[Iterator[TableObject], None]:
    """Identify tables in a PDF or one of its pages, or fail.

    Behaves like `tables`, except that the "no usable method" case is
    reported as `None`, which (with type checking) forces the caller
    to handle it.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        **kwargs: Passed through unchanged to every method tried.

    Returns:
        An iterator over `TableObject` from the first method in
        `METHODS` that returns one, or `None` if every method returns
        `None`.  Iterating over `None` raises a `TypeError`.

    """
    for detect in METHODS:
        result = detect(pdf, **kwargs)
        if result is None:
            # This method is unavailable here; try the next one.
            continue
        return result
    return None

tables_structure(pdf)

Identify tables in a PDF or one of its pages using logical structure.

Parameters:

Name Type Description Default
pdf Union[str, PathLike, Document, Page, PageList]

PLAYA-PDF document, page, pages, or path to a PDF.

required

Returns:

Type Description
Union[Iterator[TableObject], None]

An iterator over TableObject, or None, if there is no logical structure (this will cause a TypeError, if you don't check for it).

Source code in src/paves/tables.py
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
def tables_structure(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Union[Iterator[TableObject], None]:
    """Identify tables in a PDF or one of its pages using logical structure.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Returns:
      An iterator over `TableObject`, or `None` if there is no
      logical structure (iterating over `None` raises a TypeError,
      so check for it).
    """
    # Restrict the result to a single page only when one was passed in.
    only_page: Union[Page, None]
    if isinstance(pdf, Page):
        only_page = pdf
    else:
        only_page = None
    try:
        elements = table_elements(pdf)
        return table_elements_to_objects(elements, only_page)
    except TypeError:
        # A TypeError here means that structure is None.
        return None