Reference

`paves.image`

Various ways of converting PDFs to images for feeding them to models and/or visualisation.

`BoxFunc = Callable[[Boxable], Rect]` `module-attribute`

Function to get a bounding box for a Boxable.

`Boxable = Union[Annotation, ContentObject, Element, HasBbox, Rect]` `module-attribute`

Object for which we can get a bounding box.

`Color = Union[str, Tuple[int, int, int], Tuple[float, float, float]]` `module-attribute`

Type alias for things that can be used as colors.

`ColorMaker = Callable[[str], PillowColor]` `module-attribute`

Function that makes a Pillow color for a string label.

`Colors = Union[Color, List[Color], Dict[str, Color]]` `module-attribute`

Type alias for colors or collections of colors.

`DEFAULT_COLOR_CYCLE = ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan']` `module-attribute`

Default color cycle (same as matplotlib)

`LabelFunc = Callable[[Boxable], Any]` `module-attribute`

Function to get a label for a Boxable.

`PillowColor = Union[str, Tuple[int, int, int]]` `module-attribute`

Type alias for things Pillow accepts as colors.

`NotInstalledError`

Bases: RuntimeError

Exception raised if the dependencies for a particular PDF to image backend are not installed.

Source code in src/paves/image.py

class NotInstalledError(RuntimeError):
    """Exception raised if the dependencies for a particular PDF to
    image backend are not installed.

    It is a subclass of `RuntimeError`, so code catching
    `RuntimeError` will also catch it.
    """

`box(objs, *, color=DEFAULT_COLOR_CYCLE, label=True, label_color='white', label_size=9, label_margin=1, label_fill=True, image=None, labelfunc=get_label, boxfunc=get_box, dpi=72, page=None)`

Draw boxes around things in a page of a PDF.

Source code in src/paves/image.py

def box(
    objs: Union[
        Boxable,
        Iterable[Union[Boxable, None]],
    ],
    *,
    color: Colors = DEFAULT_COLOR_CYCLE,
    label: bool = True,
    label_color: Color = "white",
    label_size: float = 9,
    label_margin: float = 1,
    label_fill: bool = True,
    image: Union[Image.Image, None] = None,
    labelfunc: LabelFunc = get_label,
    boxfunc: BoxFunc = get_box,
    dpi: int = 72,
    page: Union[Page, None] = None,
) -> Union[Image.Image, None]:
    """Draw boxes around things in a page of a PDF.

    Args:
        objs: One boxable object, or an iterable of them.  `None`
            entries are skipped.
        color: Color specification handed to `color_maker`; the
            resulting function picks a color from each label text.
        label: Draw a text label just above the top-left corner of
            each box.
        label_color: Color of the label text when `label_fill` is
            true.  Without a filled background the text is drawn in
            the box color instead so that it stays visible.
        label_size: Font size of the labels, scaled by `dpi / 72`.
        label_margin: Margin around the label text, scaled by
            `dpi / 72`.
        label_fill: Fill the label background with the box color.
        image: Image to draw on.  If omitted, a page is rendered with
            `show` when the first object is encountered.
        labelfunc: Function returning the label for an object.
        boxfunc: Function returning the bounding box for an object.
        dpi: Resolution of the image; boxes are scaled by `dpi / 72`.
        page: Passed to `_getpage` together with the first object to
            choose the page to render when `image` is not given.

    Returns:
        The image with boxes drawn on it (an `image` passed in is
        modified in place), or `None` if no image was given and there
        was nothing to draw.
    """
    draw: ImageDraw.ImageDraw
    scale = dpi / 72
    font = ImageFont.load_default(label_size * scale)
    label_margin *= scale
    make_color = color_maker(color)
    image_page: Union[Page, None] = None
    for obj in _make_boxes(objs):
        if obj is None:
            continue
        # Once a page has been rendered, stop at the first object that
        # belongs to a different page.
        if image_page is not None:
            if hasattr(obj, "page"):
                if cast(HasPage, obj).page != image_page:
                    break
        if image is None:
            image_page = _getpage(obj, page)
            image = show(image_page, dpi)
        try:
            left, top, right, bottom = (x * scale for x in boxfunc(obj))
        except ValueError:  # it has no content and no box
            continue
        draw = ImageDraw.ImageDraw(image)
        text = str(labelfunc(obj))
        obj_color = make_color(text)
        draw.rectangle((left, top, right, bottom), outline=obj_color)
        if label:
            _, _, tr, tb = font.getbbox(text)
            # The label sits directly above the top-left corner of the box
            label_box = (
                left,
                top - tb - label_margin * 2,
                left + tr + label_margin * 2,
                top,
            )
            draw.rectangle(
                label_box,
                outline=obj_color,
                fill=obj_color if label_fill else None,
            )
            draw.text(
                xy=(left + label_margin, top - label_margin),
                text=text,
                font=font,
                # Honour the requested label color (previously this was
                # hard-coded to white and `label_color` was ignored).
                fill=pillow_color(label_color) if label_fill else obj_color,
                anchor="ld",
            )
    return image

`color_maker(spec, default='red')`

Create a function that makes colors.

Source code in src/paves/image.py

@functools.singledispatch
def color_maker(spec: Colors, default: Color = "red") -> ColorMaker:
    """Create a function that makes colors.

    This is the fallback used when no more specific implementation is
    registered for the type of `spec`: the returned function ignores
    its label and always produces `default`.
    """

    def constant_color(_: str) -> PillowColor:
        # Converted on every call, exactly like the label-specific makers
        return pillow_color(default)

    return constant_color

`convert(pdf, *, dpi=0, width=0, height=0)`

Convert a PDF to images.

Parameters:

Name	Type	Description	Default
`pdf`	`Union[str, PathLike, Document, Page, PageList]`	PLAYA-PDF document, page, pages, or path to a PDF.	required
`dpi`	`int`	Render to this resolution (default is 72 dpi).	`0`
`width`	`int`	Render to this width in pixels (0 to keep aspect ratio).	`0`
`height`	`int`	Render to this height in pixels (0 to keep aspect ratio).	`0`

Yields: Pillow `Image.Image` objects, one per page. The original page width and height in default user space units are available in the `info` property of these images as `page_width` and `page_height`.

Raises: `ValueError`: Invalid arguments (e.g. both `dpi` and `width`/`height`). `NotInstalledError`: If no renderer is available.

Source code in src/paves/image.py

def convert(
    pdf: Union[str, PathLike, Document, Page, PageList],
    *,
    dpi: int = 0,
    width: int = 0,
    height: int = 0,
) -> Iterator[Image.Image]:
    """Convert a PDF to images with the first renderer that is installed.

    The renderers in `METHODS` are tried in order; one that raises
    `NotInstalledError` is skipped in favour of the next.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        dpi: Render to this resolution (default is 72 dpi).
        width: Render to this width in pixels (0 to keep aspect ratio).
        height: Render to this height in pixels (0 to keep aspect ratio).
    Yields:
        Pillow `Image.Image` objects, one per page.  The original page
        width and height in default user space units are available in
        the `info` property of these images as `page_width` and
        `page_height`
    Raises:
        ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
        NotInstalledError: If no renderer is available

    """
    for render in METHODS:
        try:
            yield from render(pdf, dpi=dpi, width=width, height=height)
        except NotInstalledError:
            # This backend is missing, fall through to the next one
            continue
        # A renderer ran to completion, so we are done
        return
    tried = ", ".join(m.__name__ for m in METHODS)
    raise NotInstalledError("No renderers available, tried: %s" % (tried))

`get_box(obj)`

Default function to get the bounding box for an object.

Source code in src/paves/image.py

@functools.singledispatch
def get_box(obj) -> Rect:
    """Default function to get the bounding box for an object.

    Anything exposing a `bbox` attribute is supported by this
    fallback; other objects are rejected with a `RuntimeError`.
    """
    if not hasattr(obj, "bbox"):
        raise RuntimeError(f"Don't know how to get the box for {obj!r}")
    return obj.bbox

`get_box_annotation(obj)`

Get the bounding box of an Annotation

Source code in src/paves/image.py

@get_box.register(Annotation)
def get_box_annotation(obj: Annotation) -> Rect:
    """Get the bounding box of an Annotation.

    The annotation's `rect` is transformed with the `ctm` of the page
    it belongs to.
    """
    page_ctm = obj.page.ctm
    return transform_bbox(page_ctm, obj.rect)

`get_box_content(obj)`

Get the bounding box of a ContentObject or Element.

Source code in src/paves/image.py

@get_box.register(ContentObject)
@get_box.register(Element)
def get_box_content(obj: Union[ContentObject, Element]) -> Rect:
    """Get the bounding box of a ContentObject or an Element.

    Both types expose it directly as their `bbox` attribute.
    """
    bounding_box = obj.bbox
    return bounding_box

`get_box_rect(obj)`

Get the bounding box of a Rect (which is the Rect itself).

Source code in src/paves/image.py

@get_box.register(tuple)
def get_box_rect(obj: Rect) -> Rect:
    """Get the bounding box of a Rect.

    A tuple is taken to already be a bounding box, so it is returned
    unchanged.
    """
    return obj

`get_label(obj)`

Default function to get the label text for an object.

Source code in src/paves/image.py

@functools.singledispatch
def get_label(obj: Boxable) -> str:
    """Default function to get the label text for an object.

    This fallback simply uses the object's string representation.
    """
    text = str(obj)
    return text

`get_label_annotation(obj)`

Get the default label text for an Annotation.

This is just a default.

This is one of many possible options, so you may wish to define your own custom LabelFunc.

Source code in src/paves/image.py

@get_label.register(Annotation)
def get_label_annotation(obj: Annotation) -> str:
    """Get the default label text for an Annotation (its `subtype`).

    Note: This is just a default.
        This is one of many possible options, so you may wish to
        define your own custom LabelFunc.
    """
    subtype = obj.subtype
    return subtype

`get_label_content(obj)`

Get the label text for a ContentObject.

Source code in src/paves/image.py

@get_label.register(ContentObject)
def get_label_content(obj: ContentObject) -> str:
    """Get the label text for a ContentObject (its `object_type`)."""
    object_type = obj.object_type
    return object_type

`get_label_element(obj)`

Get the default label text for an Element.

This is just a default.

This is one of many possible options, so you may wish to define your own custom LabelFunc.

Source code in src/paves/image.py

@get_label.register(Element)
def get_label_element(obj: Element) -> str:
    """Get the default label text for an Element (its `type`).

    Note: This is just a default.
        This is one of many possible options, so you may wish to
        define your own custom LabelFunc.
    """
    element_type = obj.type
    return element_type

`mark(objs, *, color=DEFAULT_COLOR_CYCLE, transparency=0.75, label=False, label_color='white', label_size=9, label_margin=1, outline=False, image=None, labelfunc=get_label, boxfunc=get_box, dpi=72, page=None)`

Highlight things in a page of a PDF.

Source code in src/paves/image.py

def mark(
    objs: Union[
        Boxable,
        Iterable[Union[Boxable, None]],
    ],
    *,
    color: Colors = DEFAULT_COLOR_CYCLE,
    transparency: float = 0.75,
    label: bool = False,
    label_color: Color = "white",
    label_size: float = 9,
    label_margin: float = 1,
    outline: bool = False,
    image: Union[Image.Image, None] = None,
    labelfunc: LabelFunc = get_label,
    boxfunc: BoxFunc = get_box,
    dpi: int = 72,
    page: Union[Page, None] = None,
) -> Union[Image.Image, None]:
    """Highlight things in a page of a PDF.

    The highlights are painted on a separate overlay which is then
    blended with the page image, so an `image` passed in is not
    modified.

    Args:
        objs: One boxable object, or an iterable of them.  `None`
            entries are skipped.
        color: Color specification handed to `color_maker`; the
            resulting function picks a color from each label text.
        transparency: How much of the page shows through a highlight,
            from 0 (opaque highlight) to 1 (invisible highlight).
            Values outside that range are clamped.
        label: Draw a text label just above the top-left corner of
            each highlighted box.
        label_color: Color of the label text.  Ignored when `outline`
            is true, in which case the text is drawn in opaque black.
        label_size: Font size of the labels, scaled by `dpi / 72`.
        label_margin: Margin around the label text, scaled by
            `dpi / 72`.
        outline: Draw an opaque black outline around each box and
            label.
        image: Image to highlight.  If omitted, a page is rendered
            with `show` when the first object is encountered.
        labelfunc: Function returning the label for an object.
        boxfunc: Function returning the bounding box for an object.
        dpi: Resolution of the image; boxes are scaled by `dpi / 72`.
        page: Passed to `_getpage` together with the first object to
            choose the page to render when `image` is not given.

    Returns:
        A new image with the highlights blended in, the unmodified
        `image` if there was nothing to highlight, or `None` if there
        was also no `image`.
    """
    overlay: Union[Image.Image, None] = None
    mask: Union[Image.Image, None] = None
    draw: ImageDraw.ImageDraw
    scale = dpi / 72
    font = ImageFont.load_default(label_size * scale)
    # Mask value for highlighted regions (255 keeps only the page).
    # Clamped on both sides so it is always a valid 8-bit value.
    alpha = max(0, min(255, int(transparency * 255)))
    label_margin *= scale
    make_color = color_maker(color)
    image_page: Union[Page, None] = None
    for obj in _make_boxes(objs):
        if obj is None:
            continue
        # Once a page has been rendered, stop at the first object that
        # belongs to a different page.
        if image_page is not None:
            if hasattr(obj, "page"):
                if cast(HasPage, obj).page != image_page:
                    break
        if image is None:
            image_page = _getpage(obj, page)
            image = show(image_page, dpi)
        if overlay is None:
            overlay = Image.new("RGB", image.size)
        if mask is None:
            # Start fully "page": nothing from the overlay is visible
            mask = Image.new("L", image.size, 255)
        try:
            left, top, right, bottom = (x * scale for x in boxfunc(obj))
        except ValueError:  # it has no content and no box
            continue
        draw = ImageDraw.ImageDraw(overlay)
        text = str(labelfunc(obj))
        obj_color = make_color(text)
        draw.rectangle((left, top, right, bottom), fill=obj_color)
        mask_draw = ImageDraw.ImageDraw(mask)
        mask_draw.rectangle((left, top, right, bottom), fill=alpha)
        if outline:
            # Mask value 0 makes the outline fully opaque
            draw.rectangle((left, top, right, bottom), outline="black")
            mask_draw.rectangle((left, top, right, bottom), outline=0)
        if label:
            _, _, tr, tb = font.getbbox(text)
            # The label sits directly above the top-left corner of the box
            label_box = (
                left,
                top - tb - label_margin * 2,
                left + tr + label_margin * 2,
                top,
            )
            text_xy = (left + label_margin, top - label_margin)
            draw.rectangle(
                label_box,
                outline=obj_color,
                fill=obj_color,
            )
            mask_draw.rectangle(
                label_box,
                fill=alpha,
            )
            if outline:
                draw.rectangle(
                    label_box,
                    outline="black",
                )
                mask_draw.rectangle(
                    label_box,
                    outline=0,
                )
                draw.text(
                    xy=text_xy,
                    text=text,
                    font=font,
                    fill="black",
                    anchor="ld",
                )
                mask_draw.text(
                    xy=text_xy,
                    text=text,
                    font=font,
                    fill=0,
                    anchor="ld",
                )
            else:
                draw.text(
                    xy=text_xy,
                    text=text,
                    font=font,
                    # Honour the requested label color (previously this
                    # was hard-coded to white and `label_color` ignored).
                    fill=pillow_color(label_color),
                    anchor="ld",
                )
    if image is None:
        return None
    if overlay is not None and mask is not None:
        return Image.composite(image, overlay, mask)
    else:
        return image

`pdfium(pdf, *, dpi=0, width=0, height=0)`

Convert a PDF to images using PyPDFium2

Parameters:

Name	Type	Description	Default
`pdf`	`Union[str, PathLike, Document, Page, PageList]`	PLAYA-PDF document, page, pages, or path to a PDF.	required
`dpi`	`int`	Render to this resolution (default is 72 dpi).	`0`
`width`	`int`	Render to this width in pixels.	`0`
`height`	`int`	Render to this height in pixels.	`0`

Yields: Pillow `Image.Image` objects, one per page. Page width and height are available in the `info` property of the images.

Raises: `ValueError`: Invalid arguments (e.g. both `dpi` and `width`/`height`). `NotInstalledError`: If PyPDFium2 is not installed.

Source code in src/paves/image.py

def pdfium(
    pdf: Union[str, PathLike, Document, Page, PageList],
    *,
    dpi: int = 0,
    width: int = 0,
    height: int = 0,
) -> Iterator[Image.Image]:
    """Convert a PDF to images using PyPDFium2

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        dpi: Render to this resolution (default is 72 dpi).
        width: Render to this width in pixels.
        height: Render to this height in pixels.
    Yields:
        Pillow `Image.Image` objects, one per page.  The page index,
        width and height are stored in the `info` property of the
        images as `page_index`, `page_width` and `page_height`.
    Raises:
        ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
        NotInstalledError: If PyPDFium2 is not installed.
    """
    if dpi and (width or height):
        raise ValueError("Cannot specify both `dpi` and `width` or `height`")
    try:
        import pypdfium2  # noqa: F401
    except ImportError as e:
        raise NotInstalledError("PyPDFium2 does not seem to be installed") from e
    for idx, page in _get_pdfium_pages(pdf):
        page_width = page.get_width()
        page_height = page.get_height()
        exact_size = bool(width and height)
        if exact_size:
            # Scale to longest side (since pypdfium2 doesn't
            # appear to allow non-1:1 aspect ratio)
            scale = max(width / page_width, height / page_height)
        elif width:
            scale = width / page_width
        elif height:
            scale = height / page_height
        else:
            # No target size given: go by resolution only
            scale = (dpi or 72) / 72
        img = page.render(scale=scale).to_pil()
        if exact_size:
            # Resize down to desired size
            img = img.resize(size=(width, height))
        img.info.update(
            {
                "page_index": idx,
                "page_width": page_width,
                "page_height": page_height,
            }
        )
        yield img

`pillow_color(color)`

Convert colors to a form acceptable to Pillow.

Source code in src/paves/image.py

def pillow_color(color: Color) -> PillowColor:
    """Convert colors to a form acceptable to Pillow.

    Strings and all-integer triples are passed through as they are.
    Any other triple is taken to hold fractions, which are multiplied
    by 255 and truncated to integers.
    """
    if isinstance(color, str):
        return color
    red, green, blue = color
    # Would sure be nice if MyPy understood all()
    if isinstance(red, int) and isinstance(green, int) and isinstance(blue, int):
        return (red, green, blue)
    scaled = tuple(int(channel * 255) for channel in color)
    red, green, blue = scaled
    return (red, green, blue)

`popple(pdf, *, dpi=0, width=0, height=0)`

Convert a PDF to images using Poppler's pdftoppm.

Parameters:

Name	Type	Description	Default
`pdf`	`Union[str, PathLike, Document, Page, PageList]`	PLAYA-PDF document, page, pages, or path to a PDF.	required
`dpi`	`int`	Render to this resolution (default is 72 dpi).	`0`
`width`	`int`	Render to this width in pixels.	`0`
`height`	`int`	Render to this height in pixels.	`0`

Yields: Pillow `Image.Image` objects, one per page.

Raises: `ValueError`: Invalid arguments (e.g. both `dpi` and `width`/`height`). `NotInstalledError`: If Poppler is not installed.

Source code in src/paves/image.py

def popple(
    pdf: Union[str, PathLike, Document, Page, PageList],
    *,
    dpi: int = 0,
    width: int = 0,
    height: int = 0,
) -> Iterator[Image.Image]:
    """Convert a PDF to images using Poppler's pdftoppm.

    Pages are rendered to PPM files in a temporary directory, which
    is removed once the generator is exhausted or closed.  Every
    image is fully loaded before it is yielded, so it remains usable
    afterwards.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        dpi: Render to this resolution (default is 72 dpi).
        width: Render to this width in pixels.
        height: Render to this height in pixels.
    Yields:
        Pillow `Image.Image` objects, one per page.  The page index,
        width and height are stored in the `info` property of the
        images as `page_index`, `page_width` and `page_height`.
    Raises:
        ValueError: Invalid arguments (e.g. both `dpi` and `width`/`height`)
        NotInstalledError: If Poppler is not installed.
    """
    if dpi and (width or height):
        raise ValueError("Cannot specify both `dpi` and `width` or `height`")
    try:
        # Only probing for the executable, the output is discarded
        subprocess.run(["pdftoppm", "-h"], capture_output=True)
    except FileNotFoundError as e:
        raise NotInstalledError("Poppler does not seem to be installed") from e
    args = make_poppler_args(dpi, width, height)
    with tempfile.TemporaryDirectory() as tempdir:
        temppath = Path(tempdir)
        # FIXME: Possible to Popple in a Parallel Pipeline
        page_sizes = _popple(pdf, temppath, args)
        # Sorted file names are paired with the page sizes in order
        for (page_idx, page_width, page_height), ppm in zip(
            page_sizes,
            (path for path in sorted(temppath.iterdir()) if path.suffix == ".ppm"),
        ):
            img = Image.open(ppm)
            # `Image.open` is lazy and keeps the file open.  Read the
            # pixels now so that the image does not depend on a file in
            # a directory that is about to be deleted (and so that the
            # open handle does not get in the way of deleting it).
            img.load()
            img.info["page_index"] = page_idx
            img.info["page_width"] = page_width
            img.info["page_height"] = page_height
            yield img

`show(page, dpi=72)`

Show a single page with some reasonable defaults.

Source code in src/paves/image.py

def show(page: Page, dpi: int = 72) -> Image.Image:
    """Show a single page with some reasonable defaults.

    The page is rendered with `convert` at the given resolution and
    the first image produced is returned.
    """
    rendered = convert(page, dpi=dpi)
    return next(rendered)

`paves.text`

Various somewhat-more-heuristic ways of guessing, getting, and processing text in PDFs.

`WordObject` `dataclass`

Bases: TextBase

"Word" in a PDF.

This is heuristically determined, either by explicit whitespace (if you're lucky enough to have a Tagged PDF) or by a sufficient gap between adjacent glyphs (otherwise).

It otherwise behaves just like a TextObject. You can iterate over its glyphs, etc. But, as a treat, these glyphs are "finalized" so you don't have to worry about inconsistent graphics states and so forth, and you also get some convenience properties.

The origin of the current (logical) line is also available, to facilitate grouping words into lines, if you so desire (simply use itertools.groupby(words, paves.text.line))

Source code in src/paves/text.py

@dataclass
class WordObject(TextBase):
    """
    "Word" in a PDF.

    Words are determined heuristically, either by explicit whitespace
    (if you're lucky enough to have a Tagged PDF) or by a sufficient
    gap between adjacent glyphs (otherwise).

    Apart from that a word behaves just like a `TextObject`.  You can
    iterate over its glyphs, etc.  But, as a treat, these glyphs are
    "finalized" so you don't have to worry about inconsistent graphics
    states and so forth, and you also get some convenience properties.

    The origin of the current (logical) line is available as `line`,
    to facilitate grouping words into lines, if you so desire (for
    instance with
    `itertools.groupby(words, operator.attrgetter("line"))`)
    """

    _glyphs: List[GlyphObject]
    _next_origin: Point
    line: Point

    def __iter__(self) -> Iterator["ContentObject"]:
        """Iterate over the glyphs in this word."""
        return iter(self._glyphs)

    @property
    def matrix(self) -> Matrix:
        """Matrix of the first glyph in the word."""
        first_glyph = self._glyphs[0]
        return first_glyph.matrix

    @property
    def chars(self) -> str:
        """Text of all the glyphs, leaving out those without any text."""
        pieces = [glyph.text for glyph in self._glyphs if glyph.text is not None]
        return "".join(pieces)

    @property
    def origin(self) -> Point:
        """Origin of the first glyph in the word."""
        first_glyph = self._glyphs[0]
        return first_glyph.origin

    @property
    def displacement(self) -> Point:
        """Vector from the word's origin to the origin following it."""
        start_x, start_y = self.origin
        next_x, next_y = self._next_origin
        return (next_x - start_x, next_y - start_y)

`line_break(glyph, predicted_origin)`

Heuristically predict a line break based on the predicted origin from the previous glyph.

Source code in src/paves/text.py

def line_break(glyph: GlyphObject, predicted_origin: Point) -> bool:
    """Heuristically predict a line break based on the predicted origin
    from the previous glyph.

    The glyph's origin is compared with the predicted one across the
    writing direction: horizontally for vertical fonts, vertically
    otherwise (with the sign flipped in "screen" space).  Moving
    backwards at all, or forwards by more than 100 units, counts as a
    line break.
    """
    glyph_x, glyph_y = glyph.origin
    expected_x, expected_y = predicted_origin
    if glyph.font.vertical:
        offset = glyph_x - expected_x
    elif glyph.page.space == "screen":
        offset = expected_y - glyph_y
    else:
        offset = glyph_y - expected_y
    return offset < 0 or offset > 100  # FIXME: arbitrary!

`text_objects(pdf)`

Iterate over all text objects in a PDF, page, or pages

Source code in src/paves/text.py

@singledispatch
def text_objects(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Iterator[TextObject]:
    """Iterate over all text objects in a PDF, page, or pages

    This is the fallback of a single-dispatch function, reached only
    for argument types that have no registered implementation.

    Raises:
        NotImplementedError: Always, naming the unsupported type.
    """
    # Name the offending type, as `paves.tables.table_elements` does
    raise NotImplementedError(f"Not implemented for {type(pdf)}")

`word_break(glyph, predicted_origin, prev_displacement)`

Heuristically predict a word break based on the predicted origin from the previous glyph.

Source code in src/paves/text.py

def word_break(
    glyph: GlyphObject, predicted_origin: Point, prev_displacement: Point
) -> bool:
    """Heuristically predict a word break based on the predicted origin
    from the previous glyph.

    A space glyph always breaks.  Otherwise the glyph's origin is
    compared with the predicted one along the writing direction, and
    it is a break if the glyph is more than 0.5 units beyond the
    prediction or if it lies before the previous glyph.
    """
    if glyph.text == " ":
        return True
    glyph_x, glyph_y = glyph.origin
    expected_x, expected_y = predicted_origin
    advance_x, advance_y = prev_displacement
    if not glyph.font.vertical:
        gap, advance = glyph_x - expected_x, advance_x
    else:
        gap, advance = glyph_y - expected_y, advance_y
        if glyph.page.space == "screen":
            gap, advance = -gap, -advance
    # If there's a space, *or* if we are before the prev glyph
    return gap > 0.5 or gap < -advance

`words(pdf)`

Extract "words" (i.e. whitespace-separated text cells) from a PDF or one of its pages.

Parameters:

Name	Type	Description	Default
`pdf`	`Union[str, PathLike, Document, Page, PageList]`	PLAYA-PDF document, page, pages, or path to a PDF.	required

Yields:

Type	Description
`WordObject`	`WordObject` objects, which can be visualized with `paves.image` functions, or you can do various other things with them too.

Source code in src/paves/text.py

def words(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Iterator[WordObject]:
    """Extract "words" (i.e. whitespace-separated text cells) from a
    PDF or one of its pages.

    Glyphs from `text_objects(pdf)` are accumulated until `word_break`
    or `line_break` signals a boundary, at which point the accumulated
    glyphs are emitted as one word.  Space glyphs and glyphs without
    text are never part of a word.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Yields:
        `WordObject` objects, which can be visualized with `paves.image`
        functions, or you can do various other things with them too.
    """
    # Glyphs of the word currently being built
    glyphs: List[GlyphObject] = []
    # Where the next glyph should start if it directly follows the last one
    predicted_origin: Union[None, Point] = None
    # Displacement of the last glyph seen
    prev_disp: Union[None, Point] = None
    # Origin of the first glyph on the current (logical) line
    line_origin: Union[None, Point] = None
    for obj in text_objects(pdf):
        for glyph in obj:
            if line_origin is None:
                line_origin = glyph.origin
            # Breaks can only be detected once a previous glyph was seen
            if predicted_origin and prev_disp:
                new_word = word_break(glyph, predicted_origin, prev_disp)
                new_line = line_break(glyph, predicted_origin)
                if glyphs and (new_word or new_line):
                    # Emit the finished word; its position-independent
                    # state is simply taken from its first glyph
                    yield WordObject(
                        _pageref=glyphs[0]._pageref,
                        _parentkey=glyphs[0]._parentkey,
                        gstate=glyphs[0].gstate,  # Not necessarily correct!
                        ctm=glyphs[0].ctm,  # Not necessarily correct!
                        mcstack=glyphs[0].mcstack,  # Not necessarily correct!
                        _glyphs=glyphs,
                        _next_origin=predicted_origin,
                        line=line_origin,
                    )
                    # Start a fresh list, the old one belongs to the word
                    glyphs = []
                if new_line:
                    # The current glyph starts the next line
                    line_origin = glyph.origin
            # Spaces and text-less glyphs only serve as separators
            if glyph.text is not None and glyph.text != " ":
                glyphs.append(cast(GlyphObject, glyph.finalize()))
            prev_disp = glyph.displacement
            predicted_origin = _add_point(glyph.origin, prev_disp)
    # Emit whatever is left over after the last glyph
    if predicted_origin and line_origin and glyphs:
        yield WordObject(
            _pageref=glyphs[0]._pageref,
            _parentkey=glyphs[0]._parentkey,
            gstate=glyphs[0].gstate,  # Not necessarily correct!
            ctm=glyphs[0].ctm,  # Not necessarily correct!
            mcstack=glyphs[0].mcstack,  # Not necessarily correct!
            _glyphs=glyphs,
            _next_origin=predicted_origin,
            line=line_origin,
        )

`paves.tables`

Simple and not at all Java-damaged interface for table detection.

`TableObject` `dataclass`

Bases: ContentObject

Table on one page of a PDF.

This is a ContentObject and can be treated as one (notably with paves.image functions).

It could either come from a logical structure element, or it could simply be a bounding box (as detected by some sort of visual model). While these TableObjects will never span multiple pages, the underlying logical structure element may do so. This is currently the only way to detect multi-page tables through this interface (they will have an equivalent parent property).

Note that the graphics state and coordinate transformation matrix may just be the page defaults, if Machine Learning™ was used to detect the table in a rendered image of the page.

Source code in src/paves/tables.py

@dataclass
class TableObject(ContentObject):
    """Table on one page of a PDF.

    This **is** a ContentObject and can be treated as one (notably
    with `paves.image` functions).

    It could either come from a logical structure element, or it could
    simply be a bounding box (as detected by some sort of visual
    model).  While these `TableObject`s will never span multiple
    pages, the underlying logical structure element may do so.  This
    is currently the only way to detect multi-page tables through this
    interface (they will have an equivalent `parent` property).

    Note that the graphics state and coordinate transformation matrix
    may just be the page defaults, if Machine Learning™ was used to
    detect the table in a rendered image of the page.

    """

    # Explicit bounding box, if one is known
    _bbox: Union[Rect, None]
    # Structure element this table was created from, if any
    _parent: Union[Element, None]

    @property
    def bbox(self) -> Rect:
        """Bounding box of the table on its page.

        An explicit `_bbox` is preferred.  Failing that, the box of
        the parent element is used if that element is on the same
        page and has one, and otherwise the bounds of those contents
        of the parent element that are on this page.
        """
        # _bbox takes priority as we *could* have both
        if self._bbox is not None:
            return self._bbox
        elif self._parent is not None:
            # Try to get it from the element but only if it has the
            # same page as us (otherwise it will be wrong!)
            if self._parent.page is self.page:
                bbox = self._parent.bbox
                if bbox is not BBOX_NONE:
                    return bbox
            # We always have a page even if self._parent doesn't
            return get_bound_rects(
                item.bbox
                for item in self._parent.contents
                if item.page is self.page and item.bbox is not BBOX_NONE
            )
        else:
            # This however should never happen
            return BBOX_NONE

    @classmethod
    def from_bbox(cls, page: Page, bbox: Rect) -> "TableObject":
        """Create a table on `page` from a bounding box alone.

        The table has no parent element; it gets a fresh graphics
        state, the page's `ctm` and an empty marked content stack.
        """
        # Use default values
        return cls(
            _pageref=_ref_page(page),
            _parentkey=None,
            gstate=GraphicState(),
            ctm=page.ctm,
            mcstack=(),
            _bbox=bbox,
            _parent=None,
        )

    @classmethod
    def from_element(
        cls,
        el: Element,
        page: Page,
        contents: Union[Iterable[Union[ContentItem, StructContentObject]], None] = None,
    ) -> Union["TableObject", None]:
        """Create the part of a table element that is on `page`.

        Args:
            el: Element the table is created from.
            page: Page the table is on; contents on other pages are
                ignored.
            contents: Contents to search for a usable content object,
                by default `el.contents`.

        Returns:
            The table, or `None` if no usable content was found on
            `page`.
        """
        if contents is None:
            contents = el.contents
        # Find a ContentObject so we can get a bbox, mcstack, ctm
        # (they might not be *correct* of course, but oh well)
        gstate: Union[GraphicState, None] = None
        ctm: Union[Matrix, None] = None
        mcstack: Union[Tuple[MarkedContent, ...], None] = None
        bbox: Union[Rect, None] = None
        for kid in contents:
            # For multi-page tables, skip any contents on a different page
            if kid.page != page:
                continue
            if isinstance(kid, StructContentObject):
                obj = kid.obj
                if obj is None:
                    continue
                elif isinstance(obj, Annotation):
                    # FIXME: for the moment just ignore these
                    continue
                else:
                    gstate = copy(obj.gstate)
                    ctm = obj.ctm
                    mcstack = obj.mcstack
                    bbox = obj.bbox
                    break
            elif isinstance(kid, ContentItem):
                # It's a ContentItem
                try:
                    cobj = next(iter(kid))
                except StopIteration:
                    # Nothing inside this item, try the next one
                    continue
                # No bbox is recorded here, so the `bbox` property
                # will work it out from the parent element
                gstate = copy(cobj.gstate)
                ctm = cobj.ctm
                mcstack = cobj.mcstack
                break
        else:
            # No contents, no table for you!
            return None
        return cls(
            _pageref=_ref_page(page),
            _parentkey=None,
            gstate=gstate,
            ctm=ctm,
            mcstack=mcstack,
            _bbox=bbox,
            _parent=el,
        )

`table_bounds_to_objects(pdf, bounds)`

Create TableObjects from detected bounding boxes.

Source code in src/paves/tables.py

def table_bounds_to_objects(
    pdf: Union[str, PathLike, Document, Page, PageList],
    bounds: Iterable[Tuple[int, Iterable[Rect]]],
) -> Iterator[TableObject]:
    """Create TableObjects from detected bounding boxes.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        bounds: One `(page_idx, boxes)` pair for each page, in the
            same order as the pages of `pdf`.

    Yields:
        A `TableObject` for every bounding box.
    """
    pages_and_bounds = zip(_get_pages(pdf), bounds)
    for page, (page_idx, tables) in pages_and_bounds:
        assert page.page_idx == page_idx
        yield from (TableObject.from_bbox(page, bbox) for bbox in tables)

`table_elements(pdf)`

Iterate over all table elements in a PDF, page, or pages

Source code in src/paves/tables.py

@singledispatch
def table_elements(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Iterator[Element]:
    """Iterate over all table elements in a PDF, page, or pages

    This is the fallback of a single-dispatch function, reached only
    for argument types that have no registered implementation.

    Raises:
        NotImplementedError: Always, naming the unsupported type.
    """
    raise NotImplementedError(f"Not implemented for {type(pdf)}")

`table_elements_to_objects(elements, page=None)`

Make TableObjects from Elements.

Source code in src/paves/tables.py

def table_elements_to_objects(
    elements: Iterable[Element], page: Union[Page, None] = None
) -> Iterator[TableObject]:
    """Build `TableObject`s from structure `Element`s.

    The contents of each element are grouped into consecutive runs
    that share the same page, and one table is attempted per run.

    Args:
        elements: Elements to convert.
        page: If given, only runs located on exactly this page object
            are converted; runs on any other page are skipped.

    Yields:
        Every `TableObject` that `TableObject.from_element` manages to
        build (a `None` result is silently dropped).
    """
    page_of = attrgetter("page")
    for element in elements:
        # An element usually sits on a single page but may span
        # several, hence the grouping.  An explicitly requested page
        # takes precedence over whatever pages the contents are on.
        for content_page, run in groupby(element.contents, page_of):
            wanted = content_page is not None and (
                page is None or content_page is page
            )
            if not wanted:
                continue
            candidate = TableObject.from_element(element, content_page, run)
            if candidate is None:
                continue
            yield candidate

`tables(pdf, **kwargs)`

Identify tables in a PDF or one of its pages.

This will always try to use logical structure (via PLAYA-PDF) first to identify tables.

For the moment, this only works on tagged and accessible PDFs. So, like paves.image, it can also use Machine Learning Models™ to do so, which involves nasty horrible dependencyses (we hates them, they stole the precious) like cudnn-10-gigabytes-of-c++.

If you'd like to try that, then you can do so by installing the transformers[torch] package (if you don't have a GPU, try adding --extra-index-url https://download.pytorch.org/whl/cpu to pip's command line).

These tables cannot span multiple pages.

Often, a table will span multiple pages. With PDF logical structure, this can be represented (and sometimes is), but if there is no logical structure, this is not possible, since tables are detected from the rendered image of a page. Reconstructing this information is both extremely important and also very difficult with current models (perhaps very big VLMs can do it?). Since we also want to visualize tables with paves.image, we don't return multi-page tables here.

Parameters:

Name	Type	Description	Default
`pdf`	`Union[str, PathLike, Document, Page, PageList]`	PLAYA-PDF document, page, pages, or path to a PDF.	required

Returns:

Type	Description
`Iterator[TableObject]`	An iterator over `TableObject`. If no method is available to
`Iterator[TableObject]`	detect tables, this will return an iterator over an empty
`Iterator[TableObject]`	list. You may wish to use `tables_orelse` to ensure that
`Iterator[TableObject]`	tables can be detected.

Source code in src/paves/tables.py

def tables(
    pdf: Union[str, PathLike, Document, Page, PageList], **kwargs: Any
) -> Iterator[TableObject]:
    """Identify tables in a PDF or one of its pages.

    Logical structure (via PLAYA-PDF) is always tried first.

    For the moment, that only works on tagged and accessible PDFs.
    So, like `paves.image`, this can also fall back on machine
    learning models, which come with very heavy dependencies.

    To try that, install the `transformers[torch]` package (if you
    don't have a GPU, try adding
    `--extra-index-url https://download.pytorch.org/whl/cpu` to pip's
    command line).

    Note: These tables cannot span multiple pages.
        Often, a table will span multiple pages.  With PDF logical
        structure, this can be represented (and sometimes is), but if
        there is no logical structure, this is not possible, since
        tables are detected from the rendered image of a page.
        Reconstructing this information is both extremely important
        and also very difficult with current models.  Since we also
        want to visualize tables with `paves.image`, multi-page
        tables are not returned here.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        **kwargs: Forwarded unchanged to `tables_orelse`.

    Returns:
        An iterator over `TableObject`.  If no method is available to
        detect tables, this is an iterator over an empty sequence.
        Use `tables_orelse` if you need to tell that case apart.

    """
    found = tables_orelse(pdf, **kwargs)
    # Callers of this function always get something iterable.
    return iter(()) if found is None else found

`tables_detr(pdf, device='cpu')`

Identify tables in a PDF or one of its pages using IBM's RT-DETR layout-detection model.

Parameters:

Name	Type	Description	Default
`pdf`	`Union[str, PathLike, Document, Page, PageList]`	PLAYA-PDF document, page, pages, or path to a PDF.	required
`device`	`str`	Torch device for running the model.	`'cpu'`

Returns:

Type	Description
`Union[Iterator[TableObject], None]`	An iterator over `TableObject`, or `None`, if the model can't be used

Source code in src/paves/tables.py

def tables_detr(
    pdf: Union[str, PathLike, Document, Page, PageList],
    device: str = "cpu",
) -> Union[Iterator[TableObject], None]:
    """Identify tables in a PDF or one of its pages using IBM's
    RT-DETR layout detection model.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        device: Torch device for running the model.

    Returns:
        An iterator over `TableObject`, or `None` if the
        `paves.tables_detr` backend cannot be imported.
    """
    # The backend is imported lazily so that its dependencies stay
    # optional; a failed import just means this method is unavailable.
    try:
        from paves.tables_detr import table_bounds
    except ImportError:
        return None
    detected = table_bounds(pdf, device=device)
    return table_bounds_to_objects(pdf, detected)

`tables_orelse(pdf, **kwargs)`

Identify tables in a PDF or one of its pages, or fail.

This works like tables but forces you (if you use type checking) to detect the case where tables cannot be detected by any known method.

Parameters:

Name	Type	Description	Default
`pdf`	`Union[str, PathLike, Document, Page, PageList]`	PLAYA-PDF document, page, pages, or path to a PDF.	required

Returns:

Type	Description
`Union[Iterator[TableObject], None]`	An iterator over `TableObject`, or `None`, if there is no
`Union[Iterator[TableObject], None]`	method available to detect tables. This will cause a
`Union[Iterator[TableObject], None]`	`TypeError` if you try to iterate over it anyway.

Source code in src/paves/tables.py

def tables_orelse(
    pdf: Union[str, PathLike, Document, Page, PageList], **kwargs: Any
) -> Union[Iterator[TableObject], None]:
    """Identify tables in a PDF or one of its pages, or fail.

    Behaves like `tables`, except that the "no detection method
    available" case is reported as `None` instead of being hidden
    behind an empty iterator, so a type checker will make you handle
    it.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.
        **kwargs: Passed as-is to every method in `METHODS` that gets
            tried.

    Returns:
        The result of the first method in `METHODS` that does not
        return `None`, i.e. an iterator over `TableObject`; or `None`
        if every method returned `None`.  Iterating over `None`
        raises `TypeError`.

    """
    for detect in METHODS:
        found = detect(pdf, **kwargs)
        if found is None:
            # This method is unavailable, try the next one.
            continue
        return found
    return None

`tables_structure(pdf)`

Identify tables in a PDF or one of its pages using logical structure.

Parameters:

Name	Type	Description	Default
`pdf`	`Union[str, PathLike, Document, Page, PageList]`	PLAYA-PDF document, page, pages, or path to a PDF.	required

Returns:

Type	Description
`Union[Iterator[TableObject], None]`	An iterator over `TableObject`, or `None`, if there is no
`Union[Iterator[TableObject], None]`	logical structure (this will cause a TypeError, if you don't
`Union[Iterator[TableObject], None]`	check for it).

Source code in src/paves/tables.py

def tables_structure(
    pdf: Union[str, PathLike, Document, Page, PageList],
) -> Union[Iterator[TableObject], None]:
    """Identify tables in a PDF or one of its pages using logical structure.

    Args:
        pdf: PLAYA-PDF document, page, pages, or path to a PDF.

    Returns:
        An iterator over `TableObject`, or `None` if there is no
        logical structure.  Iterating over `None` raises `TypeError`,
        so check for it.
    """
    # When a single page is given, restrict the tables to that page.
    page = None
    if isinstance(pdf, Page):
        page = pdf
    try:
        elements = table_elements(pdf)
        found = table_elements_to_objects(elements, page)
    except TypeError:
        # A TypeError is taken to mean that the structure is None.
        return None
    return found

Reference

paves.image

BoxFunc = Callable[[Boxable], Rect] module-attribute

Boxable = Union[Annotation, ContentObject, Element, HasBbox, Rect] module-attribute

Color = Union[str, Tuple[int, int, int], Tuple[float, float, float]] module-attribute

ColorMaker = Callable[[str], PillowColor] module-attribute

Colors = Union[Color, List[Color], Dict[str, Color]] module-attribute

DEFAULT_COLOR_CYCLE = ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan'] module-attribute

LabelFunc = Callable[[Boxable], Any] module-attribute

PillowColor = Union[str, Tuple[int, int, int]] module-attribute

NotInstalledError

box(objs, *, color=DEFAULT_COLOR_CYCLE, label=True, label_color='white', label_size=9, label_margin=1, label_fill=True, image=None, labelfunc=get_label, boxfunc=get_box, dpi=72, page=None)

color_maker(spec, default='red')

convert(pdf, *, dpi=0, width=0, height=0)

get_box(obj)

get_box_annotation(obj)

get_box_content(obj)

get_box_rect(obj)

get_label(obj)

get_label_annotation(obj)

get_label_content(obj)

get_label_element(obj)

mark(objs, *, color=DEFAULT_COLOR_CYCLE, transparency=0.75, label=False, label_color='white', label_size=9, label_margin=1, outline=False, image=None, labelfunc=get_label, boxfunc=get_box, dpi=72, page=None)

pdfium(pdf, *, dpi=0, width=0, height=0)

pillow_color(color)

popple(pdf, *, dpi=0, width=0, height=0)

show(page, dpi=72)

paves.text

WordObject dataclass

line_break(glyph, predicted_origin)

text_objects(pdf)

word_break(glyph, predicted_origin, prev_displacement)

words(pdf)

paves.tables

TableObject dataclass

table_bounds_to_objects(pdf, bounds)

table_elements(pdf)

table_elements_to_objects(elements, page=None)

tables(pdf, **kwargs)

tables_detr(pdf, device='cpu')

tables_orelse(pdf, **kwargs)

tables_structure(pdf)

`paves.image`

`BoxFunc = Callable[[Boxable], Rect]` `module-attribute`

`Boxable = Union[Annotation, ContentObject, Element, HasBbox, Rect]` `module-attribute`

`Color = Union[str, Tuple[int, int, int], Tuple[float, float, float]]` `module-attribute`

`ColorMaker = Callable[[str], PillowColor]` `module-attribute`

`Colors = Union[Color, List[Color], Dict[str, Color]]` `module-attribute`

`DEFAULT_COLOR_CYCLE = ['blue', 'orange', 'green', 'red', 'purple', 'brown', 'pink', 'gray', 'olive', 'cyan']` `module-attribute`

`LabelFunc = Callable[[Boxable], Any]` `module-attribute`

`PillowColor = Union[str, Tuple[int, int, int]]` `module-attribute`

`NotInstalledError`

`box(objs, *, color=DEFAULT_COLOR_CYCLE, label=True, label_color='white', label_size=9, label_margin=1, label_fill=True, image=None, labelfunc=get_label, boxfunc=get_box, dpi=72, page=None)`

`color_maker(spec, default='red')`

`convert(pdf, *, dpi=0, width=0, height=0)`

`get_box(obj)`

`get_box_annotation(obj)`

`get_box_content(obj)`

`get_box_rect(obj)`

`get_label(obj)`

`get_label_annotation(obj)`

`get_label_content(obj)`

`get_label_element(obj)`

`mark(objs, *, color=DEFAULT_COLOR_CYCLE, transparency=0.75, label=False, label_color='white', label_size=9, label_margin=1, outline=False, image=None, labelfunc=get_label, boxfunc=get_box, dpi=72, page=None)`

`pdfium(pdf, *, dpi=0, width=0, height=0)`

`pillow_color(color)`

`popple(pdf, *, dpi=0, width=0, height=0)`

`show(page, dpi=72)`

`paves.text`

`WordObject` `dataclass`

`line_break(glyph, predicted_origin)`

`text_objects(pdf)`

`word_break(glyph, predicted_origin, prev_displacement)`

`words(pdf)`

`paves.tables`

`TableObject` `dataclass`

`table_bounds_to_objects(pdf, bounds)`

`table_elements(pdf)`

`table_elements_to_objects(elements, page=None)`

`tables(pdf, **kwargs)`

`tables_detr(pdf, device='cpu')`

`tables_orelse(pdf, **kwargs)`

`tables_structure(pdf)`