Reference

`paves.image`

Various ways of converting PDFs to images for feeding them to models and/or visualisation.

`BoxFunc = Callable[[Boxable], Union[Rect, None]]` `module-attribute`

Function to get a bounding box for a Boxable.

`Boxable = Union[Annotation, ContentObject, Element, HasBbox, Rect]` `module-attribute`

Object for which we can get a bounding box.

`Color = Union[str, Tuple[int, int, int], Tuple[float, float, float]]` `module-attribute`

Type alias for things that can be used as colors.

`ColorMaker = Callable[[str], PillowColor]` `module-attribute`

Function that makes a Pillow color for a string label.

`Colors = Union[Color, List[Color], Dict[str, Color]]` `module-attribute`

Type alias for colors or collections of colors.

`DEFAULT_COLOR_CYCLE = ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan']` `module-attribute`

Default color cycle (same as matplotlib)

`LabelFunc = Callable[[Boxable], Any]` `module-attribute`

Function to get a label for a Boxable.

`PillowColor = Union[str, Tuple[int, int, int]]` `module-attribute`

Type alias for things Pillow accepts as colors.

`box(objs, *, color=DEFAULT_COLOR_CYCLE, label=True, label_color='white', label_size=9, label_margin=1, label_fill=True, image=None, labelfunc=get_label, boxfunc=get_box, dpi=72, page=None)`

Draw boxes around things in a page of a PDF.

Source code in src/paves/image/__init__.py

def box(
    objs: Union[
        Boxable,
        Iterable[Union[Boxable, None]],
    ],
    *,
    color: Colors = DEFAULT_COLOR_CYCLE,
    label: bool = True,
    label_color: Color = "white",
    label_size: float = 9,
    label_margin: float = 1,
    label_fill: bool = True,
    image: Union[Image.Image, None] = None,
    labelfunc: LabelFunc = get_label,
    boxfunc: BoxFunc = get_box,
    dpi: int = 72,
    page: Union[Page, None] = None,
) -> Union[Image.Image, None]:
    """Draw boxes around things in a page of a PDF.

    If the page image is rendered here (no `image` was supplied),
    drawing stops at the first object that has a `page` attribute
    differing from the rendered page.

    Args:
        objs: A single boxable object or an iterable of them; `None`
            entries are skipped.
        color: Color specification handed to `color_maker`; the
            resulting function picks the box color from each
            object's label text.
        label: Whether to draw a text label on top of each box.
        label_color: Color of the label text when `label_fill` is
            true.  Without a fill, the text is drawn in the box color.
        label_size: Label font size, scaled by `dpi / 72`.
        label_margin: Padding around the label text, scaled by
            `dpi / 72`.
        label_fill: Whether to fill the label background with the box
            color.
        image: Existing image to draw on.  If omitted, the page found
            by `_getpage` for the first object is rendered with `show`.
        labelfunc: Function returning the label for an object.
        boxfunc: Function returning the bounding box for an object,
            or `None` if it has none.
        dpi: Resolution used for rendering and for scaling boxes.
        page: Page handed to `_getpage` when an image must be rendered.

    Returns:
        The image with boxes drawn on it, or `None` if no image was
        supplied and `objs` contained no objects.
    """
    draw: ImageDraw.ImageDraw
    scale = dpi / 72
    font = ImageFont.load_default(label_size * scale)
    label_margin *= scale
    make_color = color_maker(color)
    image_page: Union[Page, None] = None
    for obj in _make_boxes(objs):
        if obj is None:
            continue
        if image_page is not None:
            if hasattr(obj, "page"):
                if cast(HasPage, obj).page != image_page:
                    break
        if image is None:
            image_page = _getpage(obj, page)
            image = show(image_page, dpi)
        try:
            box = boxfunc(obj)
            if box is None:  # it has no box
                continue
            left, top, right, bottom = (x * scale for x in box)
        except ValueError:  # it has no content and no box
            continue
        draw = ImageDraw.ImageDraw(image)
        text = str(labelfunc(obj))
        obj_color = make_color(text)
        draw.rectangle((left, top, right, bottom), outline=obj_color)
        if label:
            # Only the right and bottom extents of the text are needed
            # to size the label box, which sits just above the object.
            _, _, tr, tb = font.getbbox(text)
            label_box = (
                left,
                top - tb - label_margin * 2,
                left + tr + label_margin * 2,
                top,
            )
            draw.rectangle(
                label_box,
                outline=obj_color,
                fill=obj_color if label_fill else None,
            )
            draw.text(
                xy=(left + label_margin, top - label_margin),
                text=text,
                font=font,
                # Honour the caller's label color on a filled label;
                # on an unfilled one the box color keeps it readable.
                fill=pillow_color(label_color) if label_fill else obj_color,
                anchor="ld",
            )
    return image

`color_maker(spec, default='red')`

Create a function that makes colors.

Source code in src/paves/image/__init__.py

@functools.singledispatch
def color_maker(spec: Colors, default: Color = "red") -> ColorMaker:
    """Build the fallback color function for a color specification.

    This is the `functools.singledispatch` base implementation: it
    ignores `spec` and returns a function that maps every label to
    `default`, converted with `pillow_color` on each call.
    """

    def constant_color(_label):
        return pillow_color(default)

    return constant_color

`convert(pdf, *, dpi=0, width=0, height=0)`

Convert a PDF to images.

Parameters:

Name	Type	Description	Default
`pdf`	`Union[str, PathLike, Document, Page, PageList]`	PLAYA-PDF document, page, pages, or path to a PDF.	required
`dpi`	`int`	Render to this resolution (default is 72 dpi).	`0`
`width`	`int`	Render to this width in pixels (0 to keep aspect ratio).	`0`
`height`	`int`	Render to this height in pixels (0 to keep aspect ratio).	`0`

Yields: Pillow `Image.Image` objects, one per page. The original page width and height in default user space units are available in the `info` property of these images as `page_width` and `page_height`.

Raises:

- `ValueError`: Invalid arguments (e.g. both `dpi` and `width`/`height`).
- `NotInstalledError`: If no renderer is available.

Source code in src/paves/image/converters.py

def convert(
    pdf: Union[str, PathLike, Document, Page, PageList],
    *,
    dpi: int = 0,
    width: int = 0,
    height: int = 0,
) -> Iterator[Image.Image]:
    """Render a PDF to images with the first converter that works.

    The registered converters are tried in the order they appear in
    `CONVERTERS`; one that raises `NotInstalledError` is skipped.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        dpi: Render to this resolution (default is 72 dpi).
        width: Render to this width in pixels (0 to keep aspect ratio).
        height: Render to this height in pixels (0 to keep aspect ratio).
    Yields:
        Pillow `Image.Image` objects, one per page.  The original page
        width and height in default user space units are available in
        the `info` property of these images as `page_width` and
        `page_height`
    Raises:
        ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
        NotInstalledError: If no renderer is available

    """
    for _, render in CONVERTERS:
        try:
            yield from render(pdf, dpi=dpi, width=width, height=height)
        except NotInstalledError:
            # This backend is unusable, move on to the next one.
            continue
        # The backend ran to completion, so there is nothing left to try.
        return
    tried = ", ".join(backend.__name__ for _, backend in CONVERTERS)
    raise NotInstalledError("No converters available, tried: %s" % (tried))

`get_box(obj)`

Default function to get the bounding box for an object.

Source code in src/paves/image/__init__.py

@functools.singledispatch
def get_box(obj) -> Union[Rect, None]:
    """Return the `bbox` attribute of an object, if it has one.

    This is the fallback used when no more specific implementation is
    registered for the type of `obj`.

    Raises:
        RuntimeError: If the object has no `bbox` attribute.
    """
    if not hasattr(obj, "bbox"):
        raise RuntimeError(f"Don't know how to get the box for {obj!r}")
    return obj.bbox

`get_box_rect(obj)`

Get the bounding box of a bounding box

Source code in src/paves/image/__init__.py

@get_box.register(tuple)
def get_box_rect(obj: Rect) -> Union[Rect, None]:
    """Treat a plain tuple as already being a bounding box.

    Registered with `get_box` for `tuple`, so the rectangle is handed
    back unchanged.
    """
    return obj

`get_label(obj)`

Default function to get the label text for an object.

Source code in src/paves/image/__init__.py

@functools.singledispatch
def get_label(obj: Boxable) -> str:
    """Return the label text for an object of an unregistered type.

    This is the fallback implementation, which simply uses the
    string conversion of `obj`.
    """
    return str(obj)

`get_label_annotation(obj)`

Get the default label text for an Annotation.

This is just a default.

This is one of many possible options, so you may wish to define your own custom LabelFunc.

Source code in src/paves/image/__init__.py

@get_label.register(Annotation)
def get_label_annotation(obj: Annotation) -> str:
    """Label an Annotation with its `type` attribute.

    Note: This is only the default labelling.
        Other choices are equally valid, so supply your own custom
        LabelFunc if you want something different.
    """
    return obj.type

`get_label_content(obj)`

Get the label text for a ContentObject.

Source code in src/paves/image/__init__.py

@get_label.register(ContentObject)
def get_label_content(obj: ContentObject) -> str:
    """Label a ContentObject with its `object_type` attribute."""
    return obj.object_type

`get_label_element(obj)`

Get the default label text for an Element.

This is just a default.

This is one of many possible options, so you may wish to define your own custom LabelFunc.

Source code in src/paves/image/__init__.py

@get_label.register(Element)
def get_label_element(obj: Element) -> str:
    """Label an Element with its `type` attribute.

    Note: This is only the default labelling.
        Other choices are equally valid, so supply your own custom
        LabelFunc if you want something different.
    """
    return obj.type

`mark(objs, *, color=DEFAULT_COLOR_CYCLE, transparency=0.75, label=False, label_color='white', label_size=9, label_margin=1, outline=False, image=None, labelfunc=get_label, boxfunc=get_box, dpi=72, page=None)`

Highlight things in a page of a PDF.

Source code in src/paves/image/__init__.py

def mark(
    objs: Union[
        Boxable,
        Iterable[Union[Boxable, None]],
    ],
    *,
    color: Colors = DEFAULT_COLOR_CYCLE,
    transparency: float = 0.75,
    label: bool = False,
    label_color: Color = "white",
    label_size: float = 9,
    label_margin: float = 1,
    outline: bool = False,
    image: Union[Image.Image, None] = None,
    labelfunc: LabelFunc = get_label,
    boxfunc: BoxFunc = get_box,
    dpi: int = 72,
    page: Union[Page, None] = None,
) -> Union[Image.Image, None]:
    """Highlight things in a page of a PDF.

    Each object's bounding box is filled with its color on an overlay
    image, and the overlay is then combined with the page image
    through a mask using `Image.composite`.

    If the page image is rendered here (no `image` was supplied),
    highlighting stops at the first object that has a `page`
    attribute differing from the rendered page.

    Args:
        objs: A single boxable object or an iterable of them; `None`
            entries are skipped.
        color: Color specification handed to `color_maker`; the
            resulting function picks the highlight color from each
            object's label text.
        transparency: Scaled to a 0-255 mask value for highlighted
            regions (the mask is 255 everywhere else).  Values outside
            the range 0 to 1 are clamped.
        label: Whether to draw a text label above each highlight.
        label_color: Color of the label text when `outline` is false.
            With `outline`, the label text is always black.
        label_size: Label font size, scaled by `dpi / 72`.
        label_margin: Padding around the label text, scaled by
            `dpi / 72`.
        outline: Whether to draw a black outline around each highlight
            and its label.
        image: Existing image to draw on.  If omitted, the page found
            by `_getpage` for the first object is rendered with `show`.
        labelfunc: Function returning the label for an object.
        boxfunc: Function returning the bounding box for an object,
            or `None` if it has none.
        dpi: Resolution used for rendering and for scaling boxes.
        page: Page handed to `_getpage` when an image must be rendered.

    Returns:
        The composited image, or `None` if no image was supplied and
        `objs` contained no objects.
    """
    overlay: Union[Image.Image, None] = None
    mask: Union[Image.Image, None] = None
    draw: ImageDraw.ImageDraw
    scale = dpi / 72
    font = ImageFont.load_default(label_size * scale)
    # Clamp on both sides so that an out-of-range transparency cannot
    # produce a mask value outside 0..255.
    alpha = max(0, min(255, int(transparency * 255)))
    label_margin *= scale
    make_color = color_maker(color)
    image_page: Union[Page, None] = None
    for obj in _make_boxes(objs):
        if obj is None:
            continue
        if image_page is not None:
            if hasattr(obj, "page"):
                if cast(HasPage, obj).page != image_page:
                    break
        if image is None:
            image_page = _getpage(obj, page)
            image = show(image_page, dpi)
        if overlay is None:
            overlay = Image.new("RGB", image.size)
        if mask is None:
            mask = Image.new("L", image.size, 255)
        try:
            box = boxfunc(obj)
            if box is None:  # it has no box
                continue
            left, top, right, bottom = (x * scale for x in box)
        except ValueError:  # it has no content and no box
            continue
        draw = ImageDraw.ImageDraw(overlay)
        text = str(labelfunc(obj))
        obj_color = make_color(text)
        draw.rectangle((left, top, right, bottom), fill=obj_color)
        mask_draw = ImageDraw.ImageDraw(mask)
        mask_draw.rectangle((left, top, right, bottom), fill=alpha)
        if outline:
            # A mask value of 0 makes the outline come entirely from
            # the overlay.
            draw.rectangle((left, top, right, bottom), outline="black")
            mask_draw.rectangle((left, top, right, bottom), outline=0)
        if label:
            # Only the right and bottom extents of the text are needed
            # to size the label box, which sits just above the object.
            _, _, tr, tb = font.getbbox(text)
            label_box = (
                left,
                top - tb - label_margin * 2,
                left + tr + label_margin * 2,
                top,
            )
            text_xy = (left + label_margin, top - label_margin)
            draw.rectangle(
                label_box,
                outline=obj_color,
                fill=obj_color,
            )
            mask_draw.rectangle(
                label_box,
                fill=alpha,
            )
            if outline:
                draw.rectangle(
                    label_box,
                    outline="black",
                )
                mask_draw.rectangle(
                    label_box,
                    outline=0,
                )
                draw.text(
                    xy=text_xy,
                    text=text,
                    font=font,
                    fill="black",
                    anchor="ld",
                )
                mask_draw.text(
                    xy=text_xy,
                    text=text,
                    font=font,
                    fill=0,
                    anchor="ld",
                )
            else:
                draw.text(
                    xy=text_xy,
                    text=text,
                    font=font,
                    # Honour the caller's label color instead of a
                    # hard-coded white (which remains the default).
                    fill=pillow_color(label_color),
                    anchor="ld",
                )
    if image is None:
        return None
    if overlay is not None and mask is not None:
        return Image.composite(image, overlay, mask)
    return image

`pillow_color(color)`

Convert colors to a form acceptable to Pillow.

Source code in src/paves/image/__init__.py

def pillow_color(color: Color) -> PillowColor:
    """Turn a color specification into something Pillow can draw with.

    Strings are passed through untouched, and a triple of ints is
    returned as a tuple.  Any other triple has each component
    multiplied by 255 and truncated to an int.
    """
    if not isinstance(color, str):
        red, green, blue = color
        # Checked one channel at a time rather than with all(), which
        # MyPy cannot use to narrow the types.
        if (
            isinstance(red, int)
            and isinstance(green, int)
            and isinstance(blue, int)
        ):
            return (red, green, blue)
        red, green, blue = (int(channel * 255) for channel in color)
        return (red, green, blue)
    return color

`popple(pdf, *, dpi=0, width=0, height=0)`

Convert a PDF to images using Poppler's pdftoppm.

Parameters:

Name	Type	Description	Default
`pdf`	`Union[str, PathLike, Document, Page, PageList]`	PLAYA-PDF document, page, pages, or path to a PDF.	required
`dpi`	`int`	Render to this resolution (default is 72 dpi).	`0`
`width`	`int`	Render to this width in pixels.	`0`
`height`	`int`	Render to this height in pixels.	`0`

Yields: Pillow `Image.Image` objects, one per page.

Raises:

- `ValueError`: Invalid arguments (e.g. both `dpi` and `width`/`height`).
- `NotInstalledError`: If Poppler is not installed.

Source code in src/paves/image/poppler.py

@converter(priority=10)
def popple(
    pdf: Union[str, PathLike, Document, Page, PageList],
    *,
    dpi: int = 0,
    width: int = 0,
    height: int = 0,
) -> Iterator[Image.Image]:
    """Render a PDF to images by running Poppler's pdftoppm.

    The pages are written as `.ppm` files into a temporary directory
    and opened from there one at a time.  Each image carries
    `page_index`, `page_width` and `page_height` in its `info`.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        dpi: Render to this resolution (default is 72 dpi).
        width: Render to this width in pixels.
        height: Render to this height in pixels.
    Yields:
        Pillow `Image.Image` objects, one per page.
    Raises:
        ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
        NotInstalledError: If Poppler is not installed.
    """
    if dpi and (width or height):
        raise ValueError("Cannot specify both `dpi` and `width` or `height`")
    # Probe for the executable before doing any real work.
    try:
        subprocess.run(["pdftoppm", "-h"], capture_output=True)
    except FileNotFoundError as e:
        raise NotInstalledError("Poppler does not seem to be installed") from e
    poppler_args = make_poppler_args(dpi, width, height)
    with tempfile.TemporaryDirectory() as workdir:
        outdir = Path(workdir)
        # FIXME: Possible to Popple in a Parallel Pipeline
        sizes = _popple(pdf, outdir, poppler_args)
        # Pair the page sizes with the output files in sorted name order.
        rendered = [out for out in sorted(outdir.iterdir()) if out.suffix == ".ppm"]
        for (page_idx, page_width, page_height), ppm in zip(sizes, rendered):
            img = Image.open(ppm)
            img.info.update(
                page_index=page_idx,
                page_width=page_width,
                page_height=page_height,
            )
            yield img

`show(page, dpi=72)`

Show a single page with some reasonable defaults.

Source code in src/paves/image/__init__.py

def show(page: Page, dpi: int = 72) -> Image.Image:
    """Render one page to an image at the given resolution.

    Takes the first image produced by `convert` for `page`.

    Raises:
        ValueError: If `convert` raises `NotImplementedError` for
            `page`.
    """
    try:
        rendered = next(convert(page, dpi=dpi))
    except NotImplementedError as err:
        message = (
            f"Can't call show() on a {type(page).__name__}, "
            "did you mean to call box() or mark()?"
        )
        raise ValueError(message) from err
    return rendered

`paves.text`

Various somewhat-more-heuristic ways of guessing, getting, and processing text in PDFs.

`WordObject` `dataclass`

Bases: TextBase

"Word" in a PDF.

This is heuristically determined, either by explicit whitespace (if you're lucky enough to have a Tagged PDF) or by a sufficient gap between adjacent glyphs (otherwise).

It otherwise behaves just like a TextObject. You can iterate over its glyphs, etc. But, as a treat, these glyphs are "finalized" so you don't have to worry about inconsistent graphics states and so forth, and you also get some convenience properties.

The origin of the current (logical) line is also available, to facilitate grouping words into lines, if you so desire (simply use itertools.groupby(words, paves.text.line))

Source code in src/paves/text/__init__.py

@dataclass
class WordObject(TextBase):
    """
    "Word" in a PDF.

    Words are found heuristically: either explicit whitespace marks
    the boundary (if you're lucky enough to have a Tagged PDF) or,
    failing that, a sufficient gap between adjacent glyphs does.

    Apart from that it behaves just like a `TextObject`, so you can
    iterate over its glyphs, etc.  As a treat, these glyphs are
    "finalized", which spares you from worrying about inconsistent
    graphics states and so forth, and some convenience properties are
    provided too.

    The origin of the current (logical) line is also available, to
    facilitate grouping words into lines, if you so desire (simply
    use `itertools.groupby(words, paves.text.line)`)
    """

    # Glyphs making up the word, in order.
    _glyphs: List[GlyphObject]
    # Origin predicted for whatever follows the word.
    _next_origin: Point
    # Origin of the (logical) line the word belongs to.
    line: Point

    def __iter__(self) -> Iterator["ContentObject"]:
        """Iterate over the glyphs of the word."""
        return iter(self._glyphs)

    @property
    def matrix(self) -> Matrix:
        """Matrix of the first glyph in the word."""
        first_glyph = self._glyphs[0]
        return first_glyph.matrix

    @property
    def chars(self) -> str:
        """Text of all glyphs joined together, skipping glyphs without text."""
        pieces = [glyph.text for glyph in self._glyphs if glyph.text is not None]
        return "".join(pieces)

    @property
    def origin(self) -> Point:
        """Origin of the first glyph in the word."""
        first_glyph = self._glyphs[0]
        return first_glyph.origin

    @property
    def displacement(self) -> Point:
        """Vector from the word's origin to the predicted next origin."""
        start_x, start_y = self.origin
        end_x, end_y = self._next_origin
        return end_x - start_x, end_y - start_y

`line_break(glyph, predicted_origin)`

Heuristically predict a line break based on the predicted origin from the previous glyph.

Source code in src/paves/text/__init__.py

def line_break(glyph: GlyphObject, predicted_origin: Point) -> bool:
    """Guess whether a glyph starts a new line, given where the previous
    glyph predicted it would be.

    Args:
        glyph: The glyph being considered.
        predicted_origin: Origin predicted for it from the previous glyph.

    Returns:
        `True` if the glyph lies before the predicted origin in the
        line-advance direction, or more than 100 past it.
    """
    glyph_x, glyph_y = glyph.origin
    expected_x, expected_y = predicted_origin
    if glyph.font.vertical:
        # For vertical fonts the comparison is made on the x axis.
        offset = glyph_x - expected_x
    elif glyph.page.space == "screen":
        # In "screen" space the sign of the y offset is flipped.
        offset = -(glyph_y - expected_y)
    else:
        offset = glyph_y - expected_y
    return offset < 0 or offset > 100  # FIXME: arbitrary!

`text_objects(pdf)`

Iterate over all text objects in a PDF, page, or pages

Source code in src/paves/text/__init__.py

@singledispatch
def text_objects(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Iterator[TextObject]:
    """Iterate over every text object in a PDF, a page, or several pages.

    This is the `singledispatch` base implementation, reached only
    for argument types with no registered implementation.

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError()

`word_break(glyph, predicted_origin, prev_displacement)`

Heuristically predict a word break based on the predicted origin from the previous glyph.

Source code in src/paves/text/__init__.py

def word_break(
    glyph: GlyphObject, predicted_origin: Point, prev_displacement: Point
) -> bool:
    """Heuristically predict a word break based on the predicted origin
    from the previous glyph.

    Args:
        glyph: The glyph being considered.
        predicted_origin: Origin predicted for it from the previous glyph.
        prev_displacement: Displacement `(dx, dy)` of the previous glyph.

    Returns:
        `True` if the glyph is a space, if it starts more than 0.5
        past the predicted origin along the writing direction, or if
        it starts before the previous glyph; `False` otherwise.
    """
    if glyph.text == " ":
        return True
    x, y = glyph.origin
    px, py = predicted_origin
    if glyph.font.vertical:
        glyph_offset = y - py
        _, displacement = prev_displacement
        # Normalise so that "forward" along the writing direction is
        # positive, as it is for horizontal text below.  `line_break`
        # treats y in "screen" space as growing down the page, so a
        # downward advance is already positive there and only the
        # other spaces need flipping.
        # NOTE(review): assumes vertical writing advances down the
        # page and that displacements share the glyph's coordinate
        # space -- confirm against PLAYA.
        if glyph.page.space != "screen":
            glyph_offset = -glyph_offset
            displacement = -displacement
    else:
        glyph_offset = x - px
        displacement, _ = prev_displacement
    # If there's a space, *or* if we are before the prev glyph
    return glyph_offset > 0.5 or glyph_offset < -displacement

`words(pdf)`

Extract "words" (i.e. whitespace-separated text cells) from a PDF or one of its pages.

Parameters:

Name	Type	Description	Default
`pdf`	`Union[str, PathLike, Document, Page, PageList]`	PLAYA-PDF document, page, pages, or path to a PDF.	required

Yields:

Type	Description
`WordObject`	`WordObject` objects, which can be visualized with `paves.image` functions, or you can do various other things with them too.

Source code in src/paves/text/__init__.py

def words(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Iterator[WordObject]:
    """Extract "words" (i.e. whitespace-separated text cells) from a
    PDF or one of its pages.

    Glyphs are accumulated until `word_break` or `line_break` reports
    a boundary, at which point the accumulated glyphs are emitted as
    one word.  Space glyphs and glyphs without text are never added
    to a word.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Yields:
        `WordObject` objects, which can be visualized with `paves.image`
        functions, or you can do various other things with them too.
    """

    def make_word(
        members: List[GlyphObject], next_origin: Point, line: Point
    ) -> WordObject:
        # The word borrows its state from its first glyph.
        first = members[0]
        return WordObject(
            _pageref=first._pageref,
            _parentkey=first._parentkey,
            gstate=first.gstate,  # Not necessarily correct!
            ctm=first.ctm,  # Not necessarily correct!
            mcstack=first.mcstack,  # Not necessarily correct!
            _glyphs=members,
            _next_origin=next_origin,
            line=line,
        )

    pending: List[GlyphObject] = []
    expected: Union[None, Point] = None
    last_disp: Union[None, Point] = None
    line_start: Union[None, Point] = None
    for text_obj in text_objects(pdf):
        for glyph in text_obj:
            if line_start is None:
                line_start = glyph.origin
            if expected and last_disp:
                is_word_break = word_break(glyph, expected, last_disp)
                is_line_break = line_break(glyph, expected)
                if pending and (is_word_break or is_line_break):
                    yield make_word(pending, expected, line_start)
                    # Start a fresh list; the emitted word keeps the old one.
                    pending = []
                if is_line_break:
                    line_start = glyph.origin
            if glyph.text is not None and glyph.text != " ":
                pending.append(cast(GlyphObject, glyph.finalize()))
            last_disp = glyph.displacement
            expected = _add_point(glyph.origin, last_disp)
    # Emit whatever is left over after the last glyph.
    if expected and line_start and pending:
        yield make_word(pending, expected, line_start)

`paves.tables`

Simple and not at all Java-damaged interface for table detection.

`detector(name)`

Look up a detector by name.

Source code in src/paves/tables/detectors.py

def lookup(name: str) -> Union[Detector, None]:
    """Find a registered detector by its function name.

    Returns:
        The first detector in `DETECTORS` whose `__name__` equals
        `name`, or `None` if there is none.
    """
    matching = (candidate for _, candidate in DETECTORS if candidate.__name__ == name)
    return next(matching, None)

`tables(pdf)`

Identify tables in a PDF or one of its pages.

This will always try to use logical structure (via PLAYA-PDF) first to identify tables.

Of course, that only works on tagged and accessible PDFs. So, like paves.image, we can also use Machine Learning Models™ here, which involves nasty horrible dependencyses (we hates them, they stole the precious) like cudnn-10-gigabytes-of-c++.

If you'd like to try that, then you can do so by installing the transformers[torch] package (if you don't have a GPU, try adding --extra-index-url https://download.pytorch.org/whl/cpu to pip's command line).

These tables cannot span multiple pages.

Often, a table will span multiple pages. With PDF logical structure, this can be represented (and sometimes is), but if there is no logical structure, this is not possible, since tables are detected from the rendered image of a page. Reconstructing this information is both extremely important and also very difficult with current models (perhaps very big VLMs can do it?). Since we also want to visualize tables with paves.image, we don't return multi-page tables here.

Parameters:

Name	Type	Description	Default
`pdf`	`Union[str, PathLike, Document, Page, PageList]`	PLAYA-PDF document, page, pages, or path to a PDF.	required

Returns:

Type	Description
`Iterator[TableObject]`	An iterator over `TableObject`. If no method is available to detect tables, this will return an iterator over an empty list. You may wish to use `tables_orelse` to ensure that tables can be detected.

Source code in src/paves/tables/detectors.py

def tables(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Iterator[TableObject]:
    """Identify tables in a PDF or one of its pages.

    Logical structure (via PLAYA-PDF) is always the first thing tried
    when looking for tables.

    That, of course, only helps with tagged and accessible PDFs.  So,
    like `paves.image`, we can also use Machine Learning Models™ here,
    which drags in nasty horrible dependencyses (we hates them, they
    stole the precious) like `cudnn-10-gigabytes-of-c++`.

    To try that, install the `transformers[torch]` package (if you
    don't have a GPU, try adding
    `--extra-index-url https://download.pytorch.org/whl/cpu` to pip's
    command line).

    Note: These tables cannot span multiple pages.
        Tables will often span multiple pages.  PDF logical structure
        can represent this (and sometimes does), but without logical
        structure it is not possible, since tables are then detected
        from the rendered image of a page.  Reconstructing this
        information is both extremely important and also very
        difficult with current models (perhaps very big VLMs can do
        it?).  Since we also want to visualize tables with
        `paves.image`, we don't return multi-page tables here.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Returns:
        An iterator over `TableObject`.  If no method is available to
        detect tables, this will return an iterator over an empty
        tuple.  You may wish to use `tables_orelse` to ensure that
        tables can be detected.

    """
    found = tables_orelse(pdf)
    return iter(()) if found is None else found

`tables_detr(pdf)`

Identify tables in a PDF or one of its pages using Docling Project layout model.

Parameters:

Name	Type	Description	Default
`pdf`	`Union[str, PathLike, Document, Page, PageList]`	PLAYA-PDF document, page, pages, or path to a PDF.	required

Returns:

Type	Description
`Union[Iterator[TableObject], None]`	An iterator over `TableObject`, or `None`, if the model can't be used

Source code in src/paves/tables/detr.py

@detector(priority=10)
def docling_heron(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Union[Iterator[TableObject], None]:
    """Identify tables in a PDF or one of its pages using Docling Project
    layout model.

    Only detections labelled "Table" are turned into `TableObject`s.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Returns:
      An iterator over `TableObject`, or `None` if `detect_objects`
      raises `ImportError` (i.e. the model can't be used)

    """
    try:
        detected = detect_objects(pdf, "docling-project/docling-layout-heron")
    except ImportError:
        return None

    def table_objects() -> Iterator[TableObject]:
        pages = _get_pages(pdf)
        for page, detection in zip(pages, detected):
            page_idx, objects = detection
            # Detections are expected to arrive in page order.
            assert page.page_idx == page_idx
            for label, bbox in objects:
                if label != "Table":
                    continue
                yield TableObject.from_bbox(page, bbox)

    return table_objects()

`tables_orelse(pdf)`

Identify tables in a PDF or one of its pages, or fail.

This works like tables but forces you (if you use type checking) to detect the case where tables cannot be detected by any known method.

Parameters:

Name	Type	Description	Default
`pdf`	`Union[str, PathLike, Document, Page, PageList]`	PLAYA-PDF document, page, pages, or path to a PDF.	required

Returns:

Type	Description
`Union[Iterator[TableObject], None]`	An iterator over `TableObject`, or `None`, if there is no method available to detect tables. This will cause a `TypeError` if you try to iterate over it anyway.

Source code in src/paves/tables/detectors.py

def tables_orelse(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Union[Iterator[TableObject], None]:
    """Identify tables in a PDF or one of its pages, or fail.

    Behaves like `tables`, except that (if you use type checking) you
    are forced to handle the case where no known method is able to
    detect tables.  The detectors in `DETECTORS` are tried in order
    and the first result that is not `None` wins.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Returns:
        An iterator over `TableObject`, or `None`, if there is no
        method available to detect tables.  This will cause a
        `TypeError` if you try to iterate over it anyway.

    """
    for _, detect in DETECTORS:
        found = detect(pdf)
        if found is not None:
            return found
    return None

`tables_structure(pdf)`

Identify tables in a PDF or one of its pages using logical structure.

Parameters:

Name	Type	Description	Default
`pdf`	`Union[str, PathLike, Document, Page, PageList]`	PLAYA-PDF document, page, pages, or path to a PDF.	required

Returns:

Type	Description
`Union[Iterator[TableObject], None]`	An iterator over `TableObject`, or `None`, if there is no logical structure (this will cause a TypeError, if you don't check for it).

Source code in src/paves/tables/structure.py

@detector(priority=0)
def structure(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Union[Iterator[TableObject], None]:
    """Identify tables in a PDF or one of its pages using logical structure.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Returns:
      An iterator over `TableObject`, or `None`, if there is no
      logical structure (this will cause a TypeError, if you don't
      check for it).
    """
    # Only a single Page is handed on as the page argument.
    page: Union[Page, None]
    if isinstance(pdf, Page):
        page = pdf
    else:
        page = None
    try:
        elements = table_elements(pdf)
        return table_elements_to_objects(elements, page)
    except TypeError:  # means that structure is None
        return None

`tables_tatr(pdf)`

Identify tables in a PDF or one of its pages using Microsoft Table Transformer model.

Parameters:

Name	Type	Description	Default
`pdf`	`Union[str, PathLike, Document, Page, PageList]`	PLAYA-PDF document, page, pages, or path to a PDF.	required

Returns:

Type	Description
`Union[Iterator[TableObject], None]`	An iterator over `TableObject`, or `None`, if the model can't be used

Source code in src/paves/tables/detr.py

@detector(priority=20)
def table_transformer(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Union[Iterator[TableObject], None]:
    """Identify tables in a PDF or one of its pages using Microsoft Table
    Transformer model.

    Every detection is turned into a `TableObject`, whatever its label.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Returns:
      An iterator over `TableObject`, or `None` if `detect_objects`
      raises `ImportError` (i.e. the model can't be used)

    """
    try:
        detected = detect_objects(
            pdf,
            "microsoft/table-transformer-detection",
            model_kwargs={"revision": "no_timm"},
            threshold=0.9,
        )
    except ImportError:
        return None

    def table_objects() -> Iterator[TableObject]:
        pages = _get_pages(pdf)
        for page, detection in zip(pages, detected):
            page_idx, objects = detection
            # Detections are expected to arrive in page order.
            assert page.page_idx == page_idx
            for _, bbox in objects:
                yield TableObject.from_bbox(page, bbox)

    return table_objects()

Reference

paves.image

BoxFunc = Callable[[Boxable], Union[Rect, None]] module-attribute

Boxable = Union[Annotation, ContentObject, Element, HasBbox, Rect] module-attribute

Color = Union[str, Tuple[int, int, int], Tuple[float, float, float]] module-attribute

ColorMaker = Callable[[str], PillowColor] module-attribute

Colors = Union[Color, List[Color], Dict[str, Color]] module-attribute

DEFAULT_COLOR_CYCLE = ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan'] module-attribute

LabelFunc = Callable[[Boxable], Any] module-attribute

PillowColor = Union[str, Tuple[int, int, int]] module-attribute

box(objs, *, color=DEFAULT_COLOR_CYCLE, label=True, label_color='white', label_size=9, label_margin=1, label_fill=True, image=None, labelfunc=get_label, boxfunc=get_box, dpi=72, page=None)

color_maker(spec, default='red')

convert(pdf, *, dpi=0, width=0, height=0)

get_box(obj)

get_box_rect(obj)

get_label(obj)

get_label_annotation(obj)

get_label_content(obj)

get_label_element(obj)

mark(objs, *, color=DEFAULT_COLOR_CYCLE, transparency=0.75, label=False, label_color='white', label_size=9, label_margin=1, outline=False, image=None, labelfunc=get_label, boxfunc=get_box, dpi=72, page=None)

pillow_color(color)

popple(pdf, *, dpi=0, width=0, height=0)

show(page, dpi=72)

paves.text

WordObject dataclass

line_break(glyph, predicted_origin)

text_objects(pdf)

word_break(glyph, predicted_origin, prev_displacement)

words(pdf)

paves.tables

detector(name)

tables(pdf)

tables_detr(pdf)

tables_orelse(pdf)

tables_structure(pdf)

tables_tatr(pdf)

`paves.image`

`BoxFunc = Callable[[Boxable], Union[Rect, None]]` `module-attribute`

`Boxable = Union[Annotation, ContentObject, Element, HasBbox, Rect]` `module-attribute`

`Color = Union[str, Tuple[int, int, int], Tuple[float, float, float]]` `module-attribute`

`ColorMaker = Callable[[str], PillowColor]` `module-attribute`

`Colors = Union[Color, List[Color], Dict[str, Color]]` `module-attribute`

`DEFAULT_COLOR_CYCLE = ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan']` `module-attribute`

`LabelFunc = Callable[[Boxable], Any]` `module-attribute`

`PillowColor = Union[str, Tuple[int, int, int]]` `module-attribute`

`box(objs, *, color=DEFAULT_COLOR_CYCLE, label=True, label_color='white', label_size=9, label_margin=1, label_fill=True, image=None, labelfunc=get_label, boxfunc=get_box, dpi=72, page=None)`

`color_maker(spec, default='red')`

`convert(pdf, *, dpi=0, width=0, height=0)`

`get_box(obj)`

`get_box_rect(obj)`

`get_label(obj)`

`get_label_annotation(obj)`

`get_label_content(obj)`

`get_label_element(obj)`

`mark(objs, *, color=DEFAULT_COLOR_CYCLE, transparency=0.75, label=False, label_color='white', label_size=9, label_margin=1, outline=False, image=None, labelfunc=get_label, boxfunc=get_box, dpi=72, page=None)`

`pillow_color(color)`

`popple(pdf, *, dpi=0, width=0, height=0)`

`show(page, dpi=72)`

`paves.text`

`WordObject` `dataclass`

`line_break(glyph, predicted_origin)`

`text_objects(pdf)`

`word_break(glyph, predicted_origin, prev_displacement)`

`words(pdf)`

`paves.tables`

`detector(name)`

`tables(pdf)`

`tables_detr(pdf)`

`tables_orelse(pdf)`

`tables_structure(pdf)`

`tables_tatr(pdf)`