Skip to content

Reference

paves.image

Various ways of converting PDFs to images for feeding them to models and/or visualisation.

BoxFunc = Callable[[Boxable], Union[Rect, None]] module-attribute

Function to get a bounding box for a Boxable.

Boxable = Union[Annotation, ContentObject, Element, HasBbox, Rect] module-attribute

Object for which we can get a bounding box.

Color = Union[str, Tuple[int, int, int], Tuple[float, float, float]] module-attribute

Type alias for things that can be used as colors.

ColorMaker = Callable[[str], PillowColor] module-attribute

Function that makes a Pillow color for a string label.

Colors = Union[Color, List[Color], Dict[str, Color]] module-attribute

Type alias for colors or collections of colors.

DEFAULT_COLOR_CYCLE = ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan'] module-attribute

Default color cycle (same as matplotlib)

LabelFunc = Callable[[Boxable], Any] module-attribute

Function to get a label for a Boxable.

PillowColor = Union[str, Tuple[int, int, int]] module-attribute

Type alias for things Pillow accepts as colors.

box(objs, *, color=DEFAULT_COLOR_CYCLE, label=True, label_color='white', label_size=9, label_margin=1, label_fill=True, image=None, labelfunc=get_label, boxfunc=get_box, dpi=72, page=None)

Draw boxes around things in a page of a PDF.

Source code in src/paves/image/__init__.py
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
def box(
    objs: Union[
        Boxable,
        Iterable[Union[Boxable, None]],
    ],
    *,
    color: Colors = DEFAULT_COLOR_CYCLE,
    label: bool = True,
    label_color: Color = "white",
    label_size: float = 9,
    label_margin: float = 1,
    label_fill: bool = True,
    image: Union[Image.Image, None] = None,
    labelfunc: LabelFunc = get_label,
    boxfunc: BoxFunc = get_box,
    dpi: int = 72,
    page: Union[Page, None] = None,
) -> Union[Image.Image, None]:
    """Draw boxes around things in a page of a PDF.

    Args:
        objs: One object, or an iterable of objects, to draw boxes
            around.  `None` entries are skipped.
        color: Color specification handed to `color_maker`; the
            resulting function picks the color for each label text.
        label: Whether to draw a text label above each box.
        label_color: Color of the label text when `label_fill` is true.
            With `label_fill` false the text is drawn in the box color.
        label_size: Label font size, scaled by `dpi / 72`.
        label_margin: Margin around the label text, scaled by `dpi / 72`.
        label_fill: Whether to fill the label background with the box
            color.
        image: Existing image to draw on.  If `None`, the page of the
            first object is rendered with `show`.
        labelfunc: Function producing the label for an object.
        boxfunc: Function producing the bounding box for an object;
            objects for which it returns `None` or raises `ValueError`
            are skipped.
        dpi: Resolution used for rendering and for scaling coordinates.
        page: Passed to `_getpage` together with the first object when
            the page has to be rendered here.

    Returns:
        The image that was drawn on, or `None` if no image was given
        and there was no object to render a page from.

    Note:
        When the page is rendered here, drawing stops at the first
        object whose `page` attribute differs from the rendered page.
    """
    draw: ImageDraw.ImageDraw
    scale = dpi / 72
    font = ImageFont.load_default(label_size * scale)
    label_margin *= scale
    make_color = color_maker(color)
    # Only convert the label color when it will actually be used, so an
    # unusual value that was previously ignored cannot start raising.
    text_color = pillow_color(label_color) if (label and label_fill) else None
    image_page: Union[Page, None] = None
    for obj in _make_boxes(objs):
        if obj is None:
            continue
        if image_page is not None:
            if hasattr(obj, "page"):
                if cast(HasPage, obj).page != image_page:
                    break
        if image is None:
            image_page = _getpage(obj, page)
            image = show(image_page, dpi)
        try:
            box = boxfunc(obj)
            if box is None:  # it has no box
                continue
            left, top, right, bottom = (x * scale for x in box)
        except ValueError:  # it has no content and no box
            continue
        draw = ImageDraw.ImageDraw(image)
        text = str(labelfunc(obj))
        obj_color = make_color(text)
        draw.rectangle((left, top, right, bottom), outline=obj_color)
        if label:
            # Only the right and bottom extents of the text are needed
            # to size the label box, which sits on top of the object box.
            _, _, tr, tb = font.getbbox(text)
            label_box = (
                left,
                top - tb - label_margin * 2,
                left + tr + label_margin * 2,
                top,
            )
            draw.rectangle(
                label_box,
                outline=obj_color,
                fill=obj_color if label_fill else None,
            )
            draw.text(
                xy=(left + label_margin, top - label_margin),
                text=text,
                font=font,
                # Honor `label_color` on a filled label; on an unfilled
                # one use the box color so the text remains visible.
                fill=text_color if label_fill else obj_color,
                anchor="ld",
            )
    return image

color_maker(spec, default='red')

Create a function that makes colors.

Source code in src/paves/image/__init__.py
181
182
183
184
@functools.singledispatch
def color_maker(spec: Colors, default: Color = "red") -> ColorMaker:
    """Build a function mapping a label string to a Pillow color.

    This is the fallback used when no more specific implementation is
    registered for the type of `spec`: it ignores `spec` and the label
    and always produces `default`, converted with `pillow_color`.
    """

    def constant_color(_label: str) -> PillowColor:
        return pillow_color(default)

    return constant_color

convert(pdf, *, dpi=0, width=0, height=0)

Convert a PDF to images.

Parameters:

Name Type Description Default
pdf Union[str, PathLike, Document, Page, PageList]

PLAYA-PDF document, page, pages, or path to a PDF.

required
dpi int

Render to this resolution (default is 72 dpi).

0
width int

Render to this width in pixels (0 to keep aspect ratio).

0
height int

Render to this height in pixels (0 to keep aspect ratio).

0

Yields: Pillow Image.Image objects, one per page. The original page width and height in default user space units are available in the info property of these images as page_width and page_height. Raises: ValueError: Invalid arguments (e.g. both dpi and width/height). NotInstalledError: If no renderer is available.

Source code in src/paves/image/converters.py
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
def convert(
    pdf: Union[str, PathLike, Document, Page, PageList],
    *,
    dpi: int = 0,
    width: int = 0,
    height: int = 0,
) -> Iterator[Image.Image]:
    """Convert a PDF to images.

    The registered converters in `CONVERTERS` are tried in order; one
    that raises `NotInstalledError` is skipped and the next one is
    tried.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        dpi: Render to this resolution (default is 72 dpi).
        width: Render to this width in pixels (0 to keep aspect ratio).
        height: Render to this height in pixels (0 to keep aspect ratio).
    Yields:
        Pillow `Image.Image` objects, one per page.  The original page
        width and height in default user space units are available in
        the `info` property of these images as `page_width` and
        `page_height`
    Raises:
        ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
        NotInstalledError: If no renderer is available

    """
    # The loop variable must not be called `convert`, as that would
    # shadow this function inside its own body.
    for _, render in CONVERTERS:
        try:
            # `yield from` forwards close()/throw() to the converter, so
            # abandoning this generator early also runs the converter's
            # own cleanup.
            yield from render(pdf, dpi=dpi, width=width, height=height)
            break
        except NotInstalledError:
            continue
    else:
        # No converter completed (all unavailable, or none registered).
        tried = ", ".join(m.__name__ for _, m in CONVERTERS)
        raise NotInstalledError(f"No converters available, tried: {tried}")

get_box(obj)

Default function to get the bounding box for an object.

Source code in src/paves/image/__init__.py
59
60
61
62
63
64
@functools.singledispatch
def get_box(obj) -> Union[Rect, None]:
    """Return the bounding box of `obj` (generic fallback).

    Used for any type without a more specific registered
    implementation: the object's `bbox` attribute is returned if it
    has one.

    Raises:
        RuntimeError: If `obj` has no `bbox` attribute.
    """
    if not hasattr(obj, "bbox"):
        raise RuntimeError(f"Don't know how to get the box for {obj!r}")
    return obj.bbox

get_box_rect(obj)

Get the bounding box of a bounding box

Source code in src/paves/image/__init__.py
67
68
69
70
@get_box.register(tuple)
def get_box_rect(obj: Rect) -> Union[Rect, None]:
    """Return `obj` unchanged: a tuple is treated as its own bounding box.

    Registered as the `get_box` implementation for `tuple`.
    """
    return obj

get_label(obj)

Default function to get the label text for an object.

Source code in src/paves/image/__init__.py
73
74
75
76
@functools.singledispatch
def get_label(obj: Boxable) -> str:
    """Return the label text for `obj` (generic fallback).

    Types without a more specific registered implementation are
    labelled with their `str()` representation.
    """
    label = str(obj)
    return label

get_label_annotation(obj)

Get the default label text for an Annotation.

This is just a default.

This is one of many possible options, so you may wish to define your own custom LabelFunc.

Source code in src/paves/image/__init__.py
85
86
87
88
89
90
91
92
93
@get_label.register(Annotation)
def get_label_annotation(obj: Annotation) -> str:
    """Label an Annotation with its `type` attribute.

    Note: This is just a default.
        Many other labels are possible, so you may prefer to supply
        your own custom LabelFunc.
    """
    annotation_type = obj.type
    return annotation_type

get_label_content(obj)

Get the label text for a ContentObject.

Source code in src/paves/image/__init__.py
79
80
81
82
@get_label.register(ContentObject)
def get_label_content(obj: ContentObject) -> str:
    """Label a ContentObject with its `object_type` attribute."""
    kind = obj.object_type
    return kind

get_label_element(obj)

Get the default label text for an Element.

This is just a default.

This is one of many possible options, so you may wish to define your own custom LabelFunc.

Source code in src/paves/image/__init__.py
 96
 97
 98
 99
100
101
102
103
104
@get_label.register(Element)
def get_label_element(obj: Element) -> str:
    """Label an Element with its `type` attribute.

    Note: This is just a default.
        Many other labels are possible, so you may prefer to supply
        your own custom LabelFunc.
    """
    element_type = obj.type
    return element_type

mark(objs, *, color=DEFAULT_COLOR_CYCLE, transparency=0.75, label=False, label_color='white', label_size=9, label_margin=1, outline=False, image=None, labelfunc=get_label, boxfunc=get_box, dpi=72, page=None)

Highlight things in a page of a PDF.

Source code in src/paves/image/__init__.py
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
def mark(
    objs: Union[
        Boxable,
        Iterable[Union[Boxable, None]],
    ],
    *,
    color: Colors = DEFAULT_COLOR_CYCLE,
    transparency: float = 0.75,
    label: bool = False,
    label_color: Color = "white",
    label_size: float = 9,
    label_margin: float = 1,
    outline: bool = False,
    image: Union[Image.Image, None] = None,
    labelfunc: LabelFunc = get_label,
    boxfunc: BoxFunc = get_box,
    dpi: int = 72,
    page: Union[Page, None] = None,
) -> Union[Image.Image, None]:
    """Highlight things in a page of a PDF.

    Highlights are painted on a separate overlay image and blended
    with the page through a mask, so the page stays visible beneath.

    Args:
        objs: One object, or an iterable of objects, to highlight.
            `None` entries are skipped.
        color: Color specification handed to `color_maker`; the
            resulting function picks the color for each label text.
        transparency: How much of the page shows through a highlight.
            It is scaled to a mask value and clamped to 0..255.
        label: Whether to draw a text label above each highlight.
        label_color: Color of the label text when `outline` is false.
            With `outline` true the text is drawn in opaque black, to
            match the outlines.
        label_size: Label font size, scaled by `dpi / 72`.
        label_margin: Margin around the label text, scaled by `dpi / 72`.
        outline: Whether to draw opaque black outlines around the
            highlights and labels.
        image: Existing image to draw on.  If `None`, the page of the
            first object is rendered with `show`.
        labelfunc: Function producing the label for an object.
        boxfunc: Function producing the bounding box for an object;
            objects for which it returns `None` or raises `ValueError`
            are skipped.
        dpi: Resolution used for rendering and for scaling coordinates.
        page: Passed to `_getpage` together with the first object when
            the page has to be rendered here.

    Returns:
        The composited image, or `None` if no image was given and there
        was no object to render a page from.

    Note:
        When the page is rendered here, drawing stops at the first
        object whose `page` attribute differs from the rendered page.
    """
    overlay: Union[Image.Image, None] = None
    mask: Union[Image.Image, None] = None
    draw: ImageDraw.ImageDraw
    scale = dpi / 72
    font = ImageFont.load_default(label_size * scale)
    # Clamp at both ends so an out-of-range transparency cannot produce
    # an invalid 8-bit mask value.
    alpha = max(0, min(255, int(transparency * 255)))
    label_margin *= scale
    make_color = color_maker(color)
    # Only convert the label color when it will actually be used, so an
    # unusual value that was previously ignored cannot start raising.
    text_color = pillow_color(label_color) if (label and not outline) else None
    image_page: Union[Page, None] = None
    for obj in _make_boxes(objs):
        if obj is None:
            continue
        if image_page is not None:
            if hasattr(obj, "page"):
                if cast(HasPage, obj).page != image_page:
                    break
        if image is None:
            image_page = _getpage(obj, page)
            image = show(image_page, dpi)
        if overlay is None:
            overlay = Image.new("RGB", image.size)
        if mask is None:
            # In the final composite a mask value of 255 keeps the page
            # pixel and 0 shows the overlay pixel.
            mask = Image.new("L", image.size, 255)
        try:
            box = boxfunc(obj)
            if box is None:  # it has no box
                continue
            left, top, right, bottom = (x * scale for x in box)
        except ValueError:  # it has no content and no box
            continue
        draw = ImageDraw.ImageDraw(overlay)
        text = str(labelfunc(obj))
        obj_color = make_color(text)
        draw.rectangle((left, top, right, bottom), fill=obj_color)
        mask_draw = ImageDraw.ImageDraw(mask)
        mask_draw.rectangle((left, top, right, bottom), fill=alpha)
        if outline:
            # Outlines are fully opaque (mask value 0).
            draw.rectangle((left, top, right, bottom), outline="black")
            mask_draw.rectangle((left, top, right, bottom), outline=0)
        if label:
            # Only the right and bottom extents of the text are needed
            # to size the label box, which sits on top of the highlight.
            _, _, tr, tb = font.getbbox(text)
            label_box = (
                left,
                top - tb - label_margin * 2,
                left + tr + label_margin * 2,
                top,
            )
            draw.rectangle(
                label_box,
                outline=obj_color,
                fill=obj_color,
            )
            mask_draw.rectangle(
                label_box,
                fill=alpha,
            )
            if outline:
                draw.rectangle(
                    label_box,
                    outline="black",
                )
                mask_draw.rectangle(
                    label_box,
                    outline=0,
                )
                draw.text(
                    xy=(left + label_margin, top - label_margin),
                    text=text,
                    font=font,
                    fill="black",
                    anchor="ld",
                )
                mask_draw.text(
                    xy=(left + label_margin, top - label_margin),
                    text=text,
                    font=font,
                    fill=0,
                    anchor="ld",
                )
            else:
                draw.text(
                    xy=(left + label_margin, top - label_margin),
                    text=text,
                    font=font,
                    # Honor the caller's label color instead of a
                    # hard-coded white.
                    fill=text_color,
                    anchor="ld",
                )
    if image is None:
        return None
    if overlay is not None and mask is not None:
        return Image.composite(image, overlay, mask)
    else:
        return image

pillow_color(color)

Convert colors to a form acceptable to Pillow.

Source code in src/paves/image/__init__.py
169
170
171
172
173
174
175
176
177
178
def pillow_color(color: Color) -> PillowColor:
    """Turn a color into something Pillow accepts.

    Strings are returned unchanged, as are triples of integers.  Any
    other triple has each component multiplied by 255 and truncated
    to an integer.
    """
    if isinstance(color, str):
        return color
    red, green, blue = color
    # Spelled out rather than using all() so that MyPy can narrow the types
    if isinstance(red, int) and isinstance(green, int) and isinstance(blue, int):
        return (red, green, blue)
    scaled_red, scaled_green, scaled_blue = [
        int(component * 255) for component in color
    ]
    return (scaled_red, scaled_green, scaled_blue)

popple(pdf, *, dpi=0, width=0, height=0)

Convert a PDF to images using Poppler's pdftoppm.

Parameters:

Name Type Description Default
pdf Union[str, PathLike, Document, Page, PageList]

PLAYA-PDF document, page, pages, or path to a PDF.

required
dpi int

Render to this resolution (default is 72 dpi).

0
width int

Render to this width in pixels.

0
height int

Render to this height in pixels.

0

Yields: Pillow Image.Image objects, one per page. Raises: ValueError: Invalid arguments (e.g. both dpi and width/height). NotInstalledError: If Poppler is not installed.

Source code in src/paves/image/poppler.py
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
@converter(priority=10)
def popple(
    pdf: Union[str, PathLike, Document, Page, PageList],
    *,
    dpi: int = 0,
    width: int = 0,
    height: int = 0,
) -> Iterator[Image.Image]:
    """Convert a PDF to images using Poppler's pdftoppm.

    Pages are rendered to `.ppm` files in a temporary directory, which
    is removed once the generator finishes or is closed.  Each image is
    fully loaded into memory before it is yielded, so it stays usable
    after that.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        dpi: Render to this resolution (default is 72 dpi).
        width: Render to this width in pixels.
        height: Render to this height in pixels.
    Yields:
        Pillow `Image.Image` objects, one per page, with `page_index`,
        `page_width` and `page_height` set in their `info` property.
    Raises:
        ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
        NotInstalledError: If Poppler is not installed.
    """
    if dpi and (width or height):
        raise ValueError("Cannot specify both `dpi` and `width` or `height`")
    # Probe for the executable; only its presence matters here.
    try:
        subprocess.run(["pdftoppm", "-h"], capture_output=True)
    except FileNotFoundError as e:
        raise NotInstalledError("Poppler does not seem to be installed") from e
    args = make_poppler_args(dpi, width, height)
    with tempfile.TemporaryDirectory() as tempdir:
        temppath = Path(tempdir)
        # FIXME: Possible to Popple in a Parallel Pipeline
        page_sizes = _popple(pdf, temppath, args)
        for (page_idx, page_width, page_height), ppm in zip(
            page_sizes,
            (path for path in sorted(temppath.iterdir()) if path.suffix == ".ppm"),
        ):
            img = Image.open(ppm)
            # Image.open() is lazy and keeps the file open.  Read the
            # pixels now, so the image does not depend on a file that
            # is deleted along with the temporary directory.
            img.load()
            img.info["page_index"] = page_idx
            img.info["page_width"] = page_width
            img.info["page_height"] = page_height
            yield img

show(page, dpi=72)

Show a single page with some reasonable defaults.

Source code in src/paves/image/__init__.py
32
33
34
35
36
37
38
39
40
def show(page: Page, dpi: int = 72) -> Image.Image:
    """Render a single page to an image.

    Returns the first image produced by `convert` for `page` at the
    given resolution.

    Raises:
        ValueError: If conversion raises `NotImplementedError` for this
            kind of object.
    """
    try:
        rendered = convert(page, dpi=dpi)
        first_image = next(rendered)
    except NotImplementedError as e:
        raise ValueError(
            f"Can't call show() on a {type(page).__name__}, "
            "did you mean to call box() or mark()?"
        ) from e
    return first_image

paves.text

Various somewhat-more-heuristic ways of guessing, getting, and processing text in PDFs.

WordObject dataclass

Bases: TextBase

"Word" in a PDF.

This is heuristically determined, either by explicit whitespace (if you're lucky enough to have a Tagged PDF) or by a sufficient gap between adjacent glyphs (otherwise).

It otherwise behaves just like a TextObject. You can iterate over its glyphs, etc. But, as a treat, these glyphs are "finalized" so you don't have to worry about inconsistent graphics states and so forth, and you also get some convenience properties.

The origin of the current (logical) line is also available, to facilitate grouping words into lines, if you so desire (simply use itertools.groupby(words, paves.text.line))

Source code in src/paves/text/__init__.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
@dataclass
class WordObject(TextBase):
    """
    "Word" in a PDF.

    Words are found heuristically: by explicit whitespace (if you're
    lucky enough to have a Tagged PDF), or otherwise by a large enough
    gap between adjacent glyphs.

    Apart from that it behaves just like a `TextObject`: you can
    iterate over its glyphs, and so on.  As a bonus, the glyphs are
    "finalized", so you need not worry about inconsistent graphics
    states, and a few convenience properties are provided.

    The origin of the current (logical) line is stored in the `line`
    attribute, to make it easy to group words into lines, e.g. with
    `itertools.groupby(words, operator.attrgetter("line"))`.
    """

    # Finalized glyphs making up the word, in order.
    _glyphs: List[GlyphObject]
    # Origin predicted for whatever follows the last glyph.
    _next_origin: Point
    # Origin of the (logical) line this word belongs to.
    line: Point

    def __iter__(self) -> Iterator["ContentObject"]:
        """Iterate over the glyphs of this word."""
        return iter(self._glyphs)

    @property
    def matrix(self) -> Matrix:
        """Matrix of the first glyph."""
        first_glyph = self._glyphs[0]
        return first_glyph.matrix

    @property
    def chars(self) -> str:
        """Text of all glyphs joined together, skipping glyphs with no text."""
        pieces = [glyph.text for glyph in self._glyphs if glyph.text is not None]
        return "".join(pieces)

    @property
    def origin(self) -> Point:
        """Origin of the first glyph."""
        first_glyph = self._glyphs[0]
        return first_glyph.origin

    @property
    def displacement(self) -> Point:
        """Vector from this word's origin to the predicted next origin."""
        start_x, start_y = self.origin
        end_x, end_y = self._next_origin
        return end_x - start_x, end_y - start_y

line_break(glyph, predicted_origin)

Heuristically predict a line break based on the predicted origin from the previous glyph.

Source code in src/paves/text/__init__.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def line_break(glyph: GlyphObject, predicted_origin: Point) -> bool:
    """Guess whether `glyph` starts a new line.

    The glyph's origin is compared with the origin predicted from the
    previous glyph.  For vertical fonts the horizontal offset is used;
    otherwise the vertical offset, negated when the page is in
    "screen" space.  A break is reported when that offset is negative
    or larger than 100.
    """
    glyph_x, glyph_y = glyph.origin
    expected_x, expected_y = predicted_origin
    if glyph.font.vertical:
        offset = glyph_x - expected_x
    elif glyph.page.space == "screen":
        offset = -(glyph_y - expected_y)
    else:
        offset = glyph_y - expected_y
    # FIXME: arbitrary!
    return offset < 0 or 100 < offset

text_objects(pdf)

Iterate over all text objects in a PDF, page, or pages

Source code in src/paves/text/__init__.py
109
110
111
112
113
114
@singledispatch
def text_objects(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Iterator[TextObject]:
    """Iterate over all text objects in a PDF, page, or pages.

    This is the generic fallback, used when no implementation is
    registered for the type of `pdf`.

    Raises:
        NotImplementedError: Always, in this fallback.
    """
    raise NotImplementedError()

word_break(glyph, predicted_origin, prev_displacement)

Heuristically predict a word break based on the predicted origin from the previous glyph.

Source code in src/paves/text/__init__.py
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def word_break(
    glyph: GlyphObject, predicted_origin: Point, prev_displacement: Point
) -> bool:
    """Guess whether `glyph` starts a new word.

    A space glyph always breaks.  Otherwise the glyph's origin is
    compared with the origin predicted from the previous glyph, along
    the writing direction: vertically for vertical fonts (with signs
    flipped when the page is in "screen" space), horizontally
    otherwise.
    """
    if glyph.text == " ":
        return True
    glyph_x, glyph_y = glyph.origin
    expected_x, expected_y = predicted_origin
    disp_x, disp_y = prev_displacement
    if glyph.font.vertical:
        offset, advance = glyph_y - expected_y, disp_y
        if glyph.page.space == "screen":
            offset, advance = -offset, -advance
    else:
        offset, advance = glyph_x - expected_x, disp_x
    # If there's a space, *or* if we are before the prev glyph
    return offset > 0.5 or offset < -advance

words(pdf)

Extract "words" (i.e. whitespace-separated text cells) from a PDF or one of its pages.

Parameters:

Name Type Description Default
pdf Union[str, PathLike, Document, Page, PageList]

PLAYA-PDF document, page, pages, or path to a PDF.

required

Yields:

Type Description
WordObject

WordObject objects, which can be visualized with paves.image

WordObject

functions, or you can do various other things with them too.

Source code in src/paves/text/__init__.py
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
def words(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Iterator[WordObject]:
    """Extract "words" (i.e. whitespace-separated text cells) from a
    PDF or one of its pages.

    Glyphs from all text objects are accumulated until `word_break` or
    `line_break` reports a break, and the accumulated glyphs are then
    emitted as one word.  Space glyphs and glyphs without text are
    never part of a word.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Yields:
        `WordObject` objects, which can be visualized with `paves.image`
        functions, or you can do various other things with them too.
    """

    def make_word(
        members: List[GlyphObject], next_origin: Point, line: Point
    ) -> WordObject:
        # Page reference and graphics state are copied from the first glyph.
        first = members[0]
        return WordObject(
            _pageref=first._pageref,
            _parentkey=first._parentkey,
            gstate=first.gstate,  # Not necessarily correct!
            ctm=first.ctm,  # Not necessarily correct!
            mcstack=first.mcstack,  # Not necessarily correct!
            _glyphs=members,
            _next_origin=next_origin,
            line=line,
        )

    pending: List[GlyphObject] = []
    expected_origin: Union[None, Point] = None
    last_disp: Union[None, Point] = None
    line_start: Union[None, Point] = None
    for text_obj in text_objects(pdf):
        for glyph in text_obj:
            if line_start is None:
                line_start = glyph.origin
            # Break checks need a previous glyph to predict from.
            if expected_origin and last_disp:
                starts_word = word_break(glyph, expected_origin, last_disp)
                starts_line = line_break(glyph, expected_origin)
                if pending and (starts_word or starts_line):
                    yield make_word(pending, expected_origin, line_start)
                    pending = []
                if starts_line:
                    line_start = glyph.origin
            if glyph.text is not None and glyph.text != " ":
                pending.append(cast(GlyphObject, glyph.finalize()))
            last_disp = glyph.displacement
            expected_origin = _add_point(glyph.origin, last_disp)
    # Emit whatever is left once the glyphs are exhausted.
    if expected_origin and line_start and pending:
        yield make_word(pending, expected_origin, line_start)

paves.tables

Simple and not at all Java-damaged interface for table detection.

detector(name)

Look up a detector by name.

Source code in src/paves/tables/detectors.py
39
40
41
42
43
44
def lookup(name: str) -> Union[Detector, None]:
    """Find a registered detector by its function name.

    Returns the first entry in `DETECTORS` whose function `__name__`
    equals `name`, or `None` if there is no such entry.
    """
    matching = (found for _, found in DETECTORS if found.__name__ == name)
    return next(matching, None)

tables(pdf)

Identify tables in a PDF or one of its pages.

This will always try to use logical structure (via PLAYA-PDF) first to identify tables.

Of course, that only works on tagged and accessible PDFs. So, like paves.image, we can also use Machine Learning Models™ here, which involves nasty horrible dependencyses (we hates them, they stole the precious) like cudnn-10-gigabytes-of-c++.

If you'd like to try that, then you can do so by installing the transformers[torch] package (if you don't have a GPU, try adding --extra-index-url https://download.pytorch.org/whl/cpu to pip's command line).

These tables cannot span multiple pages.

Often, a table will span multiple pages. With PDF logical structure, this can be represented (and sometimes is), but if there is no logical structure, this is not possible, since tables are detected from the rendered image of a page. Reconstructing this information is both extremely important and also very difficult with current models (perhaps very big VLMs can do it?). Since we also want to visualize tables with paves.image, we don't return multi-page tables here.

Parameters:

Name Type Description Default
pdf Union[str, PathLike, Document, Page, PageList]

PLAYA-PDF document, page, pages, or path to a PDF.

required

Returns:

Type Description
Iterator[TableObject]

An iterator over TableObject. If no method is available to

Iterator[TableObject]

detect tables, this will return an iterator over an empty

Iterator[TableObject]

list. You may wish to use tables_orelse to ensure that

Iterator[TableObject]

tables can be detected.

Source code in src/paves/tables/detectors.py
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
def tables(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Iterator[TableObject]:
    """Identify tables in a PDF or one of its pages.

    Logical structure (via PLAYA-PDF) is always tried first to
    identify tables.

    That only works on tagged and accessible PDFs, of course.  So, as
    with `paves.image`, Machine Learning Models™ can also be used
    here, which involves nasty horrible dependencyses (we hates them,
    they stole the precious) like `cudnn-10-gigabytes-of-c++`.

    To try that, install the `transformers[torch]` package (if you
    don't have a GPU, try adding
    `--extra-index-url https://download.pytorch.org/whl/cpu` to pip's
    command line).

    Note: These tables cannot span multiple pages.
        A table will often span multiple pages.  PDF logical structure
        can represent this (and sometimes does), but without logical
        structure it is not possible, because tables are detected
        from the rendered image of a page.  Reconstructing this
        information is both extremely important and also very
        difficult with current models (perhaps very big VLMs can do
        it?).  Since we also want to visualize tables with
        `paves.image`, multi-page tables are not returned here.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Returns:
        An iterator over `TableObject`.  If no method is available to
        detect tables, this will return an iterator over an empty
        list.  You may wish to use `tables_orelse` to ensure that
        tables can be detected.

    """
    found = tables_orelse(pdf)
    return iter(()) if found is None else found

tables_detr(pdf)

Identify tables in a PDF or one of its pages using Docling Project layout model.

Parameters:

Name Type Description Default
pdf Union[str, PathLike, Document, Page, PageList]

PLAYA-PDF document, page, pages, or path to a PDF.

required

Returns:

Type Description
Union[Iterator[TableObject], None]

An iterator over TableObject, or None, if the model can't be used

Source code in src/paves/tables/detr.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
@detector(priority=10)
def docling_heron(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Union[Iterator[TableObject], None]:
    """Identify tables in a PDF or one of its pages using Docling Project
    layout model.

    Only detections labelled "Table" are turned into table objects.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Returns:
      An iterator over `TableObject`, or `None`, if the model can't be used

    """
    try:
        detected = detect_objects(pdf, "docling-project/docling-layout-heron")
    except ImportError:
        return None

    def table_objects() -> Iterator[TableObject]:
        # Pages and detection results are expected to line up one-to-one.
        for page, (page_idx, objects) in zip(_get_pages(pdf), detected):
            assert page.page_idx == page_idx
            for label, bbox in objects:
                if label != "Table":
                    continue
                yield TableObject.from_bbox(page, bbox)

    return table_objects()

tables_orelse(pdf)

Identify tables in a PDF or one of its pages, or fail.

This works like tables but forces you (if you use type checking) to detect the case where tables cannot be detected by any known method.

Parameters:

Name Type Description Default
pdf Union[str, PathLike, Document, Page, PageList]

PLAYA-PDF document, page, pages, or path to a PDF.

required

Returns:

Type Description
Union[Iterator[TableObject], None]

An iterator over TableObject, or None, if there is no

Union[Iterator[TableObject], None]

method available to detect tables. This will cause a

Union[Iterator[TableObject], None]

TypeError if you try to iterate over it anyway.

Source code in src/paves/tables/detectors.py
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def tables_orelse(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Union[Iterator[TableObject], None]:
    """Identify tables in a PDF or one of its pages, or fail.

    Behaves like `tables`, except that (if you use type checking) you
    are forced to handle the case where no known method can detect
    tables.  The detectors in `DETECTORS` are tried in order and the
    first result that is not `None` is returned.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Returns:
        An iterator over `TableObject`, or `None`, if there is no
        method available to detect tables.  This will cause a
        `TypeError` if you try to iterate over it anyway.

    """
    for _, detect in DETECTORS:
        found = detect(pdf)
        if found is None:
            continue
        return found
    return None

tables_structure(pdf)

Identify tables in a PDF or one of its pages using logical structure.

Parameters:

Name Type Description Default
pdf Union[str, PathLike, Document, Page, PageList]

PLAYA-PDF document, page, pages, or path to a PDF.

required

Returns:

Type Description
Union[Iterator[TableObject], None]

An iterator over TableObject, or None, if there is no

Union[Iterator[TableObject], None]

logical structure (this will cause a TypeError, if you don't

Union[Iterator[TableObject], None]

check for it).

Source code in src/paves/tables/structure.py
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
@detector(priority=0)
def structure(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Union[Iterator[TableObject], None]:
    """Identify tables in a PDF or one of its pages using logical structure.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Returns:
      An iterator over `TableObject`, or `None`, if there is no
      logical structure (this will cause a TypeError, if you don't
      check for it).
    """
    # The page is only passed on when a single page was given.
    page = None
    if isinstance(pdf, Page):
        page = pdf
    try:
        elements = table_elements(pdf)
        return table_elements_to_objects(elements, page)
    except TypeError:  # means that structure is None
        return None

tables_tatr(pdf)

Identify tables in a PDF or one of its pages using Microsoft Table Transformer model.

Parameters:

Name Type Description Default
pdf Union[str, PathLike, Document, Page, PageList]

PLAYA-PDF document, page, pages, or path to a PDF.

required

Returns:

Type Description
Union[Iterator[TableObject], None]

An iterator over TableObject, or None, if the model can't be used

Source code in src/paves/tables/detr.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
@detector(priority=20)
def table_transformer(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Union[Iterator[TableObject], None]:
    """Identify tables in a PDF or one of its pages using Microsoft Table
    Transformer model.

    Every detection is turned into a table object, whatever its label.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Returns:
      An iterator over `TableObject`, or `None`, if the model can't be used

    """
    try:
        detected = detect_objects(
            pdf,
            "microsoft/table-transformer-detection",
            model_kwargs={"revision": "no_timm"},
            threshold=0.9,
        )
    except ImportError:
        return None

    def table_objects() -> Iterator[TableObject]:
        # Pages and detection results are expected to line up one-to-one.
        for page, (page_idx, objects) in zip(_get_pages(pdf), detected):
            assert page.page_idx == page_idx
            for _label, bbox in objects:
                yield TableObject.from_bbox(page, bbox)

    return table_objects()