Skip to content

Reference

playa

PLAYA ain't a LAYout Analyzer... but it can help you get stuff out of PDFs.

Basic usage:

# Using the document as a context manager closes the underlying file on exit.
with playa.open(path) as pdf:
    for page in pdf.pages:
        print(f"page {page.label}:")
        # Iterating over a page yields its content objects.
        for obj in page:
            print(f"    {obj.object_type} at {obj.bbox}")
            # Only text objects are asked for their characters here.
            if obj.object_type == "text":
                print(f"        chars: {obj.chars}")

open(path, password='', space='screen')

Open a PDF document from a path on the filesystem.

Source code in playa/__init__.py
28
29
30
31
32
33
34
35
def open(
    path: Union[PathLike, str], password: str = "", space: DeviceSpace = "screen"
) -> Document:
    """Open a PDF document from a path on the filesystem.

    Args:
      path: Path of the PDF file to open.
      password: Password for decryption, if needed.
      space: the device space to use for interpreting content ("screen"
             or "page")

    Returns:
      A `Document` which owns the opened file; using it as a context
      manager closes the file on exit.

    Raises:
      OSError: if the file cannot be opened.
      Exception: whatever `Document` raises while reading the file; the
          file is closed before the exception propagates.
    """
    fp = builtins.open(path, "rb")
    try:
        pdf = Document(fp, password=password, space=space)
    except BaseException:
        # Nobody else holds a reference to fp yet, so close it here
        # rather than leaking the descriptor when construction fails.
        fp.close()
        raise
    # Hand ownership of the file to the document so that
    # Document.__exit__ can close it.
    pdf._fp = fp
    return pdf

playa.document

Basic classes for PDF document parsing.

Document

Representation of a PDF document on disk.

Since PDF documents can be very large and complex, merely creating a Document does very little aside from opening the file and verifying that the password is correct and it is, in fact, a PDF. This may, however, involve a certain amount of file access since the cross-reference table and trailer must be read in order to determine this (we do not treat linearized PDFs specially for the moment).

Some metadata, such as the structure tree and page tree, will be loaded lazily and cached. We do not handle modification of PDFs.

Parameters:

Name Type Description Default
fp BinaryIO

File-like object in binary mode. Will be read using mmap if possible, otherwise will be read into memory.

required
password str

Password for decryption, if needed.

''
space DeviceSpace

the device space to use for interpreting content ("screen" or "page")

'screen'
Source code in playa/document.py
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
class Document:
    """Representation of a PDF document on disk.

    Since PDF documents can be very large and complex, merely creating
    a `Document` does very little aside from opening the file and
    verifying that the password is correct and it is, in fact, a PDF.
    This may, however, involve a certain amount of file access since
    the cross-reference table and trailer must be read in order to
    determine this (we do not treat linearized PDFs specially for the
    moment).

    Some metadata, such as the structure tree and page tree, will be
    loaded lazily and cached.  We do not handle modification of PDFs.

    Args:
      fp: File-like object in binary mode.  Will be read using
          `mmap` if possible, otherwise will be read into memory.
      password: Password for decryption, if needed.
      space: the device space to use for interpreting content ("screen"
             or "page")

    Raises:
      TypeError: if `fp` is a text-mode file.
      PDFSyntaxError: if no trailer with a /Root entry can be found.
      PDFEncryptionError: if the document uses an unsupported
          encryption filter or algorithm.
    """

    # File object owned by this document (set by `playa.open`), closed
    # on context-manager exit.  None when the caller owns the file.
    _fp: Union[BinaryIO, None] = None
    # Lazily created page list, see the `pages` property.
    _pages: Union["PageList", None] = None

    def __enter__(self) -> "Document":
        """Enter a `with` block; the document itself is the context value."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the underlying file if this document owns one."""
        # If we were opened from a file then close it
        if self._fp:
            self._fp.close()
            self._fp = None

    def __init__(
        self,
        fp: BinaryIO,
        password: str = "",
        space: DeviceSpace = "screen",
    ) -> None:
        """Read the header, cross-reference tables and trailer(s) of a PDF.

        See the class docstring for the arguments and exceptions.
        """
        self.xrefs: List[XRef] = []
        self.space = space
        self.info: List[Dict[str, Any]] = []
        self.catalog: Dict[str, Any] = {}
        self.encryption: Optional[Tuple[Any, Any]] = None
        self.decipher: Optional[DecipherCallable] = None
        self._cached_objs: Dict[int, PDFObject] = {}
        self._parsed_objs: Dict[int, Tuple[List[PDFObject], int]] = {}
        self._cached_fonts: Dict[object, Font] = {}
        # Start positions of xref sections already read, used to break
        # circular /Prev or /XRefStm chains in broken files.
        self._xrefpos = set()
        if isinstance(fp, io.TextIOBase):
            raise TypeError("fp is not a binary file")
        # The header is frequently mangled, in which case we will try to read the
        # file anyway.
        try:
            self.pdf_version = read_header(fp)
        except PDFSyntaxError:
            log.warning("PDF header not found, will try to read the file anyway")
            self.pdf_version = "UNKNOWN"
        # Make sure we read the whole file if we need to read the file!
        fp.seek(0, 0)
        # Any ValueError from mmap (e.g. for an empty file) is deliberately
        # left to propagate to the caller.
        try:
            self.buffer: Union[bytes, mmap.mmap] = mmap.mmap(
                fp.fileno(), 0, access=mmap.ACCESS_READ
            )
        except io.UnsupportedOperation:
            log.warning("mmap not supported on %r, reading document into memory", fp)
            self.buffer = fp.read()
        self.is_printable = self.is_modifiable = self.is_extractable = True
        # Getting the XRef table and trailer is done non-lazily
        # because they contain encryption information among other
        # things.  As noted above we don't try to look for the first
        # page cross-reference table (for linearized PDFs) after the
        # header, it will instead be loaded with all the rest.
        self.parser = IndirectObjectParser(self.buffer, self)
        try:
            pos = self._find_xref()
            self._read_xref_from(pos, self.xrefs)
        except (ValueError, IndexError, StopIteration, PDFSyntaxError) as e:
            log.debug("Using fallback XRef parsing: %s", e)
            newxref = XRefFallback(self.parser)
            self.xrefs.append(newxref)
        # Now find the trailer
        for xref in self.xrefs:
            trailer = xref.trailer
            if not trailer:
                continue
            # If there's an encryption info, remember it.
            if "Encrypt" in trailer:
                if "ID" in trailer:
                    id_value = list_value(trailer["ID"])
                else:
                    # Some documents may not have a /ID, use two empty
                    # byte strings instead. Solves
                    # https://github.com/pdfminer/pdfminer.six/issues/594
                    id_value = (b"", b"")
                self.encryption = (id_value, dict_value(trailer["Encrypt"]))
                self._initialize_password(password)
            if "Info" in trailer:
                try:
                    self.info.append(dict_value(trailer["Info"]))
                except TypeError:
                    log.warning("Info is a broken reference (incorrect xref table?)")
            if "Root" in trailer:
                # Every PDF file must have exactly one /Root dictionary.
                try:
                    self.catalog = dict_value(trailer["Root"])
                except TypeError:
                    log.warning("Root is a broken reference (incorrect xref table?)")
                    self.catalog = {}
                break
        else:
            # The loop ran to completion without finding any /Root.
            raise PDFSyntaxError("No /Root object! - Is this really a PDF?")
        if self.catalog.get("Type") is not LITERAL_CATALOG:
            log.warning("Catalog not found!")
        if "Version" in self.catalog:
            log.debug(
                "Using PDF version %r from catalog instead of %r from header",
                self.catalog["Version"],
                self.pdf_version,
            )
            self.pdf_version = self.catalog["Version"]

    def _initialize_password(self, password: str = "") -> None:
        """Initialize the decryption handler with a given password, if any.

        Internal function, requires the Encrypt dictionary to have
        been read from the trailer into self.encryption.

        Raises:
          PDFEncryptionError: if the filter is not "Standard" or the
              algorithm version has no registered security handler.
        """
        assert self.encryption is not None
        (docid, param) = self.encryption
        if literal_name(param.get("Filter")) != "Standard":
            raise PDFEncryptionError("Unknown filter: param=%r" % param)
        v = int_value(param.get("V", 0))
        # 3 (PDF 1.4) An unpublished algorithm that permits encryption
        # key lengths ranging from 40 to 128 bits. This value shall
        # not appear in a conforming PDF file.
        if v == 3:
            raise PDFEncryptionError("Unpublished algorithm 3 not supported")
        factory = SECURITY_HANDLERS.get(v)
        # 0 An algorithm that is undocumented. This value shall not be used.
        if factory is None:
            raise PDFEncryptionError("Unknown algorithm: param=%r" % param)
        handler = factory(docid, param, password)
        self.decipher = handler.decrypt
        self.is_printable = handler.is_printable
        self.is_modifiable = handler.is_modifiable
        self.is_extractable = handler.is_extractable
        assert self.parser is not None
        # Ensure that no extra data leaks into encrypted streams
        self.parser.strict = True

    def __iter__(self) -> Iterator[IndirectObject]:
        """Iterate over `IndirectObject`"""
        return (obj for pos, obj in IndirectObjectParser(self.buffer, self))

    @property
    def tokens(self) -> Iterator[Token]:
        """Iterate over tokens."""
        return (tok for pos, tok in Lexer(self.buffer))

    @property
    def layout(self) -> Iterator[LayoutDict]:
        """Iterate over `LayoutDict` for all pages."""
        for idx, page in enumerate(self.pages):
            for dic in page.layout:
                dic = cast(LayoutDict, dic)  # ugh
                dic["page_index"] = idx
                dic["page_label"] = page.label
                yield dic

    @property
    def structtree(self) -> StructTree:
        """Return the PDF structure tree."""
        return StructTree(self)

    def _getobj_objstm(
        self, stream: ContentStream, index: int, objid: int
    ) -> PDFObject:
        """Get the object at `index` inside an object stream.

        The parsed contents of each object stream are cached in
        `self._parsed_objs`, keyed by the stream's object ID.

        Raises:
          PDFSyntaxError: if `index` is past the end of the stream.
        """
        if stream.objid in self._parsed_objs:
            (objs, n) = self._parsed_objs[stream.objid]
        else:
            (objs, n) = self._get_objects(stream)
            assert stream.objid is not None
            self._parsed_objs[stream.objid] = (objs, n)
        # Skip 2 * N leading entries; presumably these are the
        # (object number, offset) integer pairs that precede the objects
        # in an object stream (PDF 1.7 sec 7.5.7) - TODO confirm.
        i = n * 2 + index
        try:
            obj = objs[i]
        except IndexError:
            raise PDFSyntaxError("index too big: %r" % index)
        return obj

    def _get_objects(self, stream: ContentStream) -> Tuple[List[PDFObject], int]:
        """Parse everything in an object stream.

        Returns:
          a tuple of (all parsed objects, value of the stream's /N
          entry, or 0 if it has none).
        """
        if stream.get("Type") is not LITERAL_OBJSTM:
            log.warning("Content stream Type is not /ObjStm: %r", stream)
        try:
            n = cast(int, stream["N"])
        except KeyError:
            log.warning("N is not defined in content stream: %r", stream)
            n = 0
        parser = ObjectParser(stream.buffer, self)
        objs: List[PDFObject] = [obj for _, obj in parser]
        return (objs, n)

    def _getobj_parse(self, pos: int, objid: int) -> PDFObject:
        """Parse the indirect object `objid` expected at offset `pos`.

        If no matching object is found there, search the whole buffer
        for its definition and use the occurrence with the highest
        generation number.  The object is decrypted if the document is
        encrypted.

        Raises:
          PDFSyntaxError: if the object cannot be found at all, or the
              object found does not have the requested ID.
        """
        assert self.parser is not None
        log.debug("getobj_parse: seeking to %d for objid %d", pos, objid)
        self.parser.seek(pos)
        try:
            _, obj = next(self.parser)
            if obj.objid != objid:
                raise PDFSyntaxError(f"objid mismatch: {obj.objid!r}={objid!r}")
        except (ValueError, IndexError, PDFSyntaxError) as e:
            log.warning(
                "Indirect object %d not found at position %d: %r", objid, pos, e
            )
            # In case of malformed pdf files where the offset in the
            # xref table doesn't point exactly at the object
            # definition (probably more frequent than you think), just
            # use a regular expression to find the object because we
            # can do that.
            realpos = -1
            lastgen = -1
            found = None
            # The lookbehind stops e.g. objid 2 from matching the tail
            # of "12 0 obj", which would return the wrong object.
            pattern = rb"(?<!\d)%d\s+(\d+)\s+obj" % objid
            for m in re.finditer(pattern, self.buffer):
                genno = int(m.group(1))
                if genno > lastgen:
                    lastgen = genno
                    realpos = m.start(0)
                    found = m
            if found is None:
                raise PDFSyntaxError(
                    f"Indirect object {objid!r} not found in document"
                ) from e
            # Log the match we actually selected, not merely the last one.
            log.debug("found object (%r) seeking to %r", found.group(0), realpos)
            self.parser.seek(realpos)
            (_, obj) = next(self.parser)
        if obj.objid != objid:
            raise PDFSyntaxError(f"objid mismatch: {obj.objid!r}={objid!r}")
        if self.decipher:
            return decipher_all(self.decipher, obj.objid, obj.genno, obj.obj)
        return obj.obj

    def __getitem__(self, objid: int) -> Any:
        """Get an indirect object from the PDF.

        Objects are cached after the first successful lookup.

        Raises:
          ValueError: if Document is not initialized
          IndexError: if objid does not exist in PDF
        """
        if not self.xrefs:
            raise ValueError("Document is not initialized")
        if objid not in self._cached_objs:
            log.debug("getobj: objid=%r", objid)
            obj = None
            # Try each xref section in turn until one yields the object.
            for xref in self.xrefs:
                try:
                    (strmid, index, genno) = xref.get_pos(objid)
                except KeyError:
                    continue
                log.debug("getobj: strmid %r index %r genno %r", strmid, index, genno)
                try:
                    if strmid is not None:
                        # The object lives inside an object stream.
                        stream = stream_value(self[strmid])
                        obj = self._getobj_objstm(stream, index, objid)
                    else:
                        # Here `index` is a position in the buffer.
                        obj = self._getobj_parse(index, objid)
                    break
                # FIXME: We might not actually want to catch these...
                except StopIteration:
                    log.debug("EOF when searching for object %d", objid)
                    continue
                except PDFSyntaxError as e:
                    log.debug("Syntax error when searching for object %d: %s", objid, e)
                    continue
            if obj is None:
                raise IndexError(f"Object with ID {objid} not found")
            log.debug("register: objid=%r: %r", objid, obj)
            self._cached_objs[objid] = obj
        return self._cached_objs[objid]

    def get_font(self, objid: object, spec: Mapping[str, object]) -> Font:
        """Get (or create and cache) the font for a font specification.

        Args:
          objid: cache key for the font; if falsy the font is neither
              looked up in nor stored to the cache.
          spec: the font dictionary.

        Returns:
          a `Font` chosen according to the /Subtype of `spec`.  An
          unknown or missing subtype is logged and treated as Type1.
        """
        if objid and objid in self._cached_fonts:
            return self._cached_fonts[objid]
        log.debug("get_font: create: objid=%r, spec=%r", objid, spec)
        if spec.get("Type") is not LITERAL_FONT:
            log.warning("Font specification Type is not /Font: %r", spec)
        # Create a Font object.
        if "Subtype" in spec:
            subtype = literal_name(spec["Subtype"])
        else:
            log.warning("Font specification Subtype is not specified: %r", spec)
            subtype = ""
        if subtype in ("Type1", "MMType1"):
            # Type1 Font
            font: Font = Type1Font(spec)
        elif subtype == "TrueType":
            # TrueType Font
            font = PDFTrueTypeFont(spec)
        elif subtype == "Type3":
            # Type3 Font
            font = Type3Font(spec)
        elif subtype in ("CIDFontType0", "CIDFontType2"):
            # CID Font
            font = CIDFont(spec)
        elif subtype == "Type0":
            # Type0 Font
            dfonts = list_value(spec["DescendantFonts"])
            assert dfonts
            subspec = dict_value(dfonts[0]).copy()
            # FIXME: Bad tightly coupled with internals of CIDFont
            for k in ("Encoding", "ToUnicode"):
                if k in spec:
                    subspec[k] = resolve1(spec[k])
            # Build the descendant font uncached; the result is cached
            # below under the Type0 font's own objid.
            font = self.get_font(None, subspec)
        else:
            log.warning("Invalid Font spec: %r", spec)
            font = Type1Font(spec)  # FIXME: this is so wrong!
        if objid:
            self._cached_fonts[objid] = font
        return font

    @property
    def outlines(self) -> Iterator[OutlineItem]:
        """
        Iterate over the PDF document outline.

        Only entries with a /Title and either an action (/A) or a
        destination (/Dest) are produced.

        Raises:
          KeyError: if the catalog has no /Outlines entry.
        """
        if "Outlines" not in self.catalog:
            raise KeyError("No 'Outlines' entry in catalog")

        def search(entry: object, level: int) -> Iterator[OutlineItem]:
            entry = dict_value(entry)
            if "Title" in entry:
                if "A" in entry or "Dest" in entry:
                    title = decode_text(str_value(entry["Title"]))
                    dest = entry.get("Dest")
                    action = entry.get("A")
                    se = entry.get("SE")
                    yield OutlineItem(
                        level, title, resolve1(dest), resolve1(action), se
                    )
            # Children first (one level deeper), then following siblings.
            if "First" in entry and "Last" in entry:
                yield from search(entry["First"], level + 1)
            if "Next" in entry:
                yield from search(entry["Next"], level)

        return search(self.catalog["Outlines"], 0)

    @property
    def page_labels(self) -> Iterator[str]:
        """Generate page label strings for the PDF document.

        If the document includes page labels, generates strings, one per page.
        If not, raise KeyError.

        The resulting iterator is unbounded (because the page label
        tree does not actually include all the pages), so it is
        recommended to use `pages` instead.

        Raises:
          KeyError: No page labels are present in the catalog

        """
        assert self.catalog is not None  # really it cannot be None

        page_labels = PageLabels(self.catalog["PageLabels"])
        return page_labels.labels

    PageType = Dict[Any, Dict[Any, Any]]

    def _get_pages_from_xrefs(self) -> Iterator[Tuple[int, PageType]]:
        """Find pages from the cross-reference tables if the page tree
        is missing (note that this only happens in invalid PDFs, but
        it happens.)

        Returns:
          an iterator over (objid, dict) pairs.
        """
        for xref in self.xrefs:
            for object_id in xref.objids:
                try:
                    obj = self[object_id]
                    if isinstance(obj, dict) and obj.get("Type") is LITERAL_PAGE:
                        yield object_id, obj
                except IndexError:
                    # Listed in the xref but not actually retrievable.
                    pass

    def _get_page_objects(self) -> Iterator[Tuple[int, PageType]]:
        """Iterate over the flattened page tree in reading order, propagating
        inheritable attributes.  Returns an iterator over (objid, dict) pairs.

        Raises:
          KeyError: if there is no page tree.
        """
        if "Pages" not in self.catalog:
            raise KeyError("No 'Pages' entry in catalog")
        stack = [(self.catalog["Pages"], self.catalog)]
        visited = set()
        while stack:
            (obj, parent) = stack.pop()
            if isinstance(obj, ObjRef):
                # The PDF specification *requires* both the Pages
                # element of the catalog and the entries in Kids in
                # the page tree to be indirect references.
                object_id = int(obj.objid)
            elif isinstance(obj, int):
                # Should not happen in a valid PDF, but probably does?
                log.warning("Page tree contains bare integer: %r in %r", obj, parent)
                object_id = obj
            else:
                log.warning("Page tree contains unknown object: %r", obj)
                # There is no object ID to look up, so skip this entry
                # instead of using an unbound or stale object_id.
                continue
            page_object = dict_value(self[object_id])

            # Avoid recursion errors by keeping track of visited nodes
            # (again, this should never actually happen in a valid PDF)
            if object_id in visited:
                log.warning("Circular reference %r in page tree", obj)
                continue
            visited.add(object_id)

            # Propagate inheritable attributes
            object_properties = page_object.copy()
            for k, v in parent.items():
                if k in INHERITABLE_PAGE_ATTRS and k not in object_properties:
                    object_properties[k] = v

            # Recurse, depth-first
            object_type = object_properties.get("Type")
            if object_type is None:
                log.warning("Page has no Type, trying type: %r", object_properties)
                object_type = object_properties.get("type")
            if object_type is LITERAL_PAGES and "Kids" in object_properties:
                log.debug("Pages: Kids=%r", object_properties["Kids"])
                # Push in reverse so that the first kid is popped first.
                for child in reversed(list_value(object_properties["Kids"])):
                    stack.append((child, object_properties))
            elif object_type is LITERAL_PAGE:
                log.debug("Page: %r", object_properties)
                yield object_id, object_properties

    @property
    def pages(self) -> "PageList":
        """The pages of the document, created on first access and cached."""
        if self._pages is None:
            self._pages = PageList(self)
        return self._pages

    @property
    def names(self) -> Dict[str, Any]:
        """PDF name dictionary (PDF 1.7 sec 7.7.4).

        Raises:
          KeyError: if nonexistent.
        """
        return dict_value(self.catalog["Names"])

    @property
    def dests(self) -> Iterable[Tuple[str, list]]:
        """Iterable of named destinations as (name, destination) tuples
        (PDF 1.7 sec 12.3.2).

        Note that we assume the names of destinations are either "name
        objects" (that's PDF for UTF-8) or "text strings", since the
        PDF spec says (p. 367):

        > The keys in the name tree may be treated as text strings for
        > display purposes.

        therefore, you get them as `str`.

        Raises:
          KeyError: if no destination tree exists
        """
        try:
            # PDF-1.2 or later
            dests = (
                (decode_text(k), resolve1(v)) for k, v in NameTree(self.names["Dests"])
            )
        except KeyError:
            # PDF-1.1 or prior
            dests = (
                (k, resolve1(v)) for k, v in dict_value(self.catalog["Dests"]).items()
            )
        for name, dest in dests:
            # A destination may be wrapped in a dictionary with a /D entry.
            if isinstance(dest, dict):
                yield name, resolve1(dest["D"])
            else:
                yield name, dest

    def _find_xref(self) -> int:
        """Internal function used to locate the first XRef.

        Returns:
          the position in the buffer of the last cross-reference
          section in the file.

        Raises:
          ValueError: if no usable `startxref` or `xref` line is found
              before reaching an `endobj` or the start of the file.
        """
        # search the last xref table by scanning the file backwards.
        prev = b""
        for pos, line in reverse_iter_lines(self.buffer):
            line = line.strip()
            log.debug("find_xref: %r", line)
            if line == b"startxref":
                # Scanning backwards, the offset that follows
                # "startxref" in the file is the previous non-empty line.
                log.debug("xref found: pos=%r", prev)
                if not prev.isdigit():
                    log.warning("Invalid startxref position: %r", prev)
                    continue
                start = int(prev)
                if not start >= 0:
                    log.warning("Invalid negative startxref position: %d", start)
                    continue
                elif start > pos:
                    log.warning("Invalid startxref position (> %d): %d", pos, start)
                    continue
                return start
            elif line == b"xref":
                return pos
            elif line == b"endobj":
                # Okay, we're probably not in Kansas anymore...
                break
            if line:
                prev = line
        raise ValueError("No xref table found at end of file")

    # read xref table
    def _read_xref_from(
        self,
        start: int,
        xrefs: List[XRef],
    ) -> None:
        """Reads XRefs from the given location.

        The cross-reference table or stream found at `start` is
        appended to `xrefs`, followed by any referenced through the
        /XRefStm and /Prev entries of its trailer.  A position that has
        already been read is skipped, so circular chains terminate.

        Raises:
          ValueError: if there is nothing to read at `start`.
        """
        if start in self._xrefpos:
            log.warning("Detected circular xref chain at %d", start)
            return
        self._xrefpos.add(start)
        parser = ObjectParser(self.buffer, self, start)
        try:
            (pos, token) = parser.nexttoken()
        except StopIteration:
            raise ValueError(f"Unexpected EOF at {start}")
        log.debug("read_xref_from: start=%d, token=%r", start, token)
        if token is KEYWORD_XREF:
            parser.nextline()
            xref: XRef = XRefTable(parser)
        else:
            # It might be an XRefStream, if this is an indirect object...
            _, token = parser.nexttoken()
            _, token = parser.nexttoken()
            if token is KEYWORD_OBJ:
                # XRefStream: PDF-1.5
                self.parser.seek(pos)
                self.parser.reset()
                xref = XRefStream(self.parser)
            else:
                # Well, maybe it's an XRef table without "xref" (but
                # probably not)
                parser.seek(pos)
                xref = XRefTable(parser)
        xrefs.append(xref)
        trailer = xref.trailer
        # For hybrid-reference files, an additional set of xrefs as a
        # stream.
        if "XRefStm" in trailer:
            pos = int_value(trailer["XRefStm"])
            self._read_xref_from(pos, xrefs)
        # Recurse into any previous xref tables or streams
        if "Prev" in trailer:
            # find previous xref
            pos = int_value(trailer["Prev"])
            self._read_xref_from(pos, xrefs)

dests: Iterable[Tuple[str, list]] property

Iterable of named destinations as (name, destination) tuples (PDF 1.7 sec 12.3.2).

Note that we assume the names of destinations are either "name objects" (that's PDF for UTF-8) or "text strings", since the PDF spec says (p. 367):

The keys in the name tree may be treated as text strings for display purposes.

therefore, you get them as str.

Raises:

Type Description
KeyError

if no destination tree exists

layout: Iterator[LayoutDict] property

Iterate over LayoutDict for all pages.

names: Dict[str, Any] property

PDF name dictionary (PDF 1.7 sec 7.7.4).

Raises:

Type Description
KeyError

if nonexistent.

outlines: Iterator[OutlineItem] property

Iterate over the PDF document outline.

page_labels: Iterator[str] property

Generate page label strings for the PDF document.

If the document includes page labels, generates strings, one per page. If not, raise KeyError.

The resulting iterator is unbounded (because the page label tree does not actually include all the pages), so it is recommended to use pages instead.

Raises:

Type Description
KeyError

No page labels are present in the catalog

structtree: StructTree property

Return the PDF structure tree.

tokens: Iterator[Token] property

Iterate over tokens.

__getitem__(objid)

Get an indirect object from the PDF.

Raises:

Type Description
ValueError

if Document is not initialized

IndexError

if objid does not exist in PDF

Source code in playa/document.py
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
def __getitem__(self, objid: int) -> Any:
    """Look up an indirect object by its object ID, caching the result.

    Raises:
      ValueError: if Document is not initialized
      IndexError: if objid does not exist in PDF
    """
    if not self.xrefs:
        raise ValueError("Document is not initialized")
    cache = self._cached_objs
    if objid in cache:
        # Already resolved once; no need to consult the xrefs again.
        return cache[objid]
    log.debug("getobj: objid=%r", objid)
    found = None
    # Ask each xref section in turn until one of them yields the object.
    for table in self.xrefs:
        try:
            (strmid, index, genno) = table.get_pos(objid)
        except KeyError:
            continue
        log.debug("getobj: strmid %r index %r genno %r", strmid, index, genno)
        try:
            if strmid is None:
                found = self._getobj_parse(index, objid)
            else:
                found = self._getobj_objstm(stream_value(self[strmid]), index, objid)
        # FIXME: We might not actually want to catch these...
        except StopIteration:
            log.debug("EOF when searching for object %d", objid)
            continue
        except PDFSyntaxError as e:
            log.debug("Syntax error when searching for object %d: %s", objid, e)
            continue
        else:
            break
    if found is None:
        raise IndexError(f"Object with ID {objid} not found")
    log.debug("register: objid=%r: %r", objid, found)
    cache[objid] = found
    return cache[objid]

__iter__()

Iterate over IndirectObject

Source code in playa/document.py
941
942
943
def __iter__(self) -> Iterator[IndirectObject]:
    """Iterate over every `IndirectObject` parsed from the document buffer."""
    # The parser yields (position, object) pairs; only the objects are
    # passed on to the caller.
    object_parser = IndirectObjectParser(self.buffer, self)
    return (indirect for _, indirect in object_parser)

playa.page

Classes for looking at pages and their contents.

ContentObject dataclass

Any sort of content object.

Attributes:

Name Type Description
gstate GraphicState

Graphics state.

ctm Matrix

Coordinate transformation matrix (PDF 1.7 section 8.3.2).

mcs Union[MarkedContent, None]

Marked content (point or section).

Source code in playa/page.py
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
@dataclass
class ContentObject:
    """Base class shared by every kind of content object.

    Attributes:
      gstate: Graphics state.
      ctm: Coordinate transformation matrix (PDF 1.7 section 8.3.2).
      mcs: Marked content (point or section).
    """

    gstate: GraphicState
    ctm: Matrix
    mcs: Union[MarkedContent, None]

    def __iter__(self) -> Iterator["ContentObject"]:
        """Iterate over contained objects; the base class contains none."""
        yield from ()

    @property
    def object_type(self):
        """Type of this object as a string, e.g. "text", "path", "image".

        This is the class name with its last `len("Object")` characters
        removed, in lower case.
        """
        suffix_length = len("Object")
        class_name = type(self).__name__
        return class_name[:-suffix_length].lower()

    @property
    def bbox(self) -> Rect:
        """Bounding rectangle enclosing the bboxes of all contained objects."""
        # These bboxes have already been computed in device space so
        # two opposite corners of each are enough to find the bounds.
        child_boxes = (child.bbox for child in self)
        corner_pairs = (((x0, y0), (x1, y1)) for x0, y0, x1, y1 in child_boxes)
        return get_bound(itertools.chain.from_iterable(corner_pairs))

object_type property

Type of this object as a string, e.g. "text", "path", "image".

DashPattern

Bases: NamedTuple

Line dash pattern in PDF graphics state (PDF 1.7 section 8.4.3.6).

Attributes:

Name Type Description
dash Tuple[float, ...]

lengths of dashes and gaps in user space units

phase float

starting position in the dash pattern

Source code in playa/page.py
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
class DashPattern(NamedTuple):
    """
    Dash pattern for stroked lines in the PDF graphics state (PDF 1.7
    section 8.4.3.6).

    Attributes:
      dash: lengths of dashes and gaps in user space units
      phase: starting position in the dash pattern
    """

    dash: Tuple[float, ...]
    phase: float

    def __str__(self):
        # A pattern with at least one entry prints as "<dash> <phase>";
        # an empty dash array prints as the empty string.
        if len(self.dash) > 0:
            return f"{self.dash} {self.phase}"
        return ""

GlyphObject dataclass

Bases: ContentObject

Individual glyph on the page.

Attributes:

Name Type Description
textstate TextState

Text state for this glyph. This is a mutable object and you should not expect it to be valid outside the context of iteration over the parent TextObject.

cid int

Character ID for this glyph.

text Union[str, None]

Unicode mapping of this glyph, if any.

adv float

glyph displacement in user space units.

bbox Rect

glyph bounding box in device space.

Source code in playa/page.py
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
@dataclass
class GlyphObject(ContentObject):
    """Individual glyph on the page.

    Attributes:
      textstate: Text state for this glyph.  This is a **mutable**
        object and you should not expect it to be valid outside the
        context of iteration over the parent `TextObject`.
      cid: Character ID for this glyph.
      text: Unicode mapping of this glyph, if any.
      adv: glyph displacement in user space units.
      bbox: glyph bounding box in device space (stored in the `_bbox`
        field and exposed through the `bbox` property).
    """

    textstate: TextState
    cid: int
    text: Union[str, None]
    adv: float
    _bbox: Rect

    @property
    def bbox(self) -> Rect:
        """Return the bounding box stored in `_bbox` unchanged."""
        return self._bbox

object_type property

Type of this object as a string, e.g. "text", "path", "image".

GraphicState dataclass

PDF Graphics state (PDF 1.7 section 8.4)

Attributes:

Name Type Description
linewidth float

Line width in user space units (sec. 8.4.3.2)

linecap int

Line cap style (sec. 8.4.3.3)

linejoin int

Line join style (sec. 8.4.3.4)

miterlimit float

Maximum length of mitered line joins (sec. 8.4.3.5)

dash DashPattern

Dash pattern for stroking (sec 8.4.3.6)

intent PSLiteral

Rendering intent (sec. 8.6.5.8)

flatness float

The precision with which curves shall be rendered on the output device (sec. 10.6.2)

scolor Color

Colour used for stroking operations

scs ColorSpace

Colour space used for stroking operations

ncolor Color

Colour used for non-stroking operations

ncs ColorSpace

Colour space used for non-stroking operations

Source code in playa/page.py
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
@dataclass
class GraphicState:
    """PDF Graphics state (PDF 1.7 section 8.4)

    Every field has a default, so `GraphicState()` can be constructed
    with no arguments.

    Attributes:
      linewidth: Line width in user space units (sec. 8.4.3.2)
      linecap: Line cap style (sec. 8.4.3.3)
      linejoin: Line join style (sec. 8.4.3.4)
      miterlimit: Maximum length of mitered line joins (sec. 8.4.3.5)
      dash: Dash pattern for stroking (sec 8.4.3.6)
      intent: Rendering intent (sec. 8.6.5.8)
      flatness: The precision with which curves shall be rendered on
        the output device (sec. 10.6.2)
      scolor: Colour used for stroking operations
      scs: Colour space used for stroking operations
      ncolor: Colour used for non-stroking operations
      ncs: Colour space used for non-stroking operations
    """

    linewidth: float = 0
    linecap: int = 0
    linejoin: int = 0
    miterlimit: float = 10
    # Default is an empty dash array with phase 0.
    dash: DashPattern = DashPattern((), 0)
    intent: PSLiteral = LITERAL_RELATIVE_COLORIMETRIC
    flatness: float = 1
    # stroking color
    scolor: Color = BASIC_BLACK
    # stroking color space
    scs: ColorSpace = PREDEFINED_COLORSPACE["DeviceGray"]
    # non stroking color
    ncolor: Color = BASIC_BLACK
    # non stroking color space
    ncs: ColorSpace = PREDEFINED_COLORSPACE["DeviceGray"]

ImageObject dataclass

Bases: ContentObject

An image (either inline or XObject).

Attributes:

Name Type Description
xobjid Union[str, None]

Name of XObject (or None for inline images).

srcsize Tuple[int, int]

Size of source image in pixels.

bits int

Number of bits per component, if required (otherwise 1).

imagemask bool

True if the image is a mask.

stream ContentStream

Content stream with image data.

colorspace Union[ColorSpace, None]

Colour space for this image, if required (otherwise None).

Source code in playa/page.py
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
@dataclass
class ImageObject(ContentObject):
    """An image (either inline or XObject).

    Membership tests (`name in image`) and item access (`image[name]`)
    are forwarded to the underlying `stream`.

    Attributes:
      xobjid: Name of XObject (or None for inline images).
      srcsize: Size of source image in pixels.
      bits: Number of bits per component, if required (otherwise 1).
      imagemask: True if the image is a mask.
      stream: Content stream with image data.
      colorspace: Colour space for this image, if required (otherwise
        None).
    """

    xobjid: Union[str, None]
    srcsize: Tuple[int, int]
    bits: int
    imagemask: bool
    stream: ContentStream
    colorspace: Union[ColorSpace, None]

    def __contains__(self, name: object) -> bool:
        """Return whether `name` is in the image's content stream."""
        return name in self.stream

    def __getitem__(self, name: str) -> PDFObject:
        """Look up `name` in the image's content stream."""
        return self.stream[name]

    @property
    def buffer(self) -> bytes:
        """Binary stream content for this image"""
        return self.stream.buffer

    @property
    def bbox(self) -> Rect:
        """Bounding box obtained by passing the CTM and the unit square
        `(0, 0, 1, 1)` to `get_transformed_bound`."""
        # PDF 1.7 sec 8.3.24: All images shall be 1 unit wide by 1
        # unit high in user space, regardless of the number of samples
        # in the image. To be painted, an image shall be mapped to a
        # region of the page by temporarily altering the CTM.
        return get_transformed_bound(self.ctm, (0, 0, 1, 1))

buffer: bytes property

Binary stream content for this image

object_type property

Type of this object as a string, e.g. "text", "path", "image".

LayoutDict

Bases: TypedDict

Dictionary-based layout objects.

These are somewhat like the T_obj dictionaries returned by pdfplumber. The type of coordinates returned are determined by the space argument passed to Page. By default, (0, 0) is the top-left corner of the page, with 72 units per inch.

All values can be converted to strings in some meaningful fashion, such that you can simply write one of these to a CSV. You can access the field names through the __annotations__ property:

writer = DictWriter(outfh, fieldnames=LayoutDict.__annotations__.keys())
writer.writerows(page.layout)

Attributes:

Name Type Description
object_type str

Type of object as a string.

mcid Union[int, None]

Containing marked content section ID (or None if marked content has no ID, such as artifacts or if there is no logical structure).

tag Union[str, None]

Containing marked content tag name (or None if not in a marked content section).

xobjid Union[str, None]

String name of containing Form XObject, if any.

cid int

Integer character ID of glyph, if object_type == "char".

text Union[str, None]

Unicode mapping for glyph, if any.

fontname str

Name of the font.

size float

Font size in device space.

glyph_offset_x float

Horizontal offset (in device space) of glyph from start of line.

glyph_offset_y float

Vertical offset (in device space) of glyph from start of line.

render_mode int

Text rendering mode.

upright bool

FIXME: Not really sure what this means. pdfminer.six?

x0 float

Minimum x coordinate of bounding box (left or right depending on device space).

x1 float

Maximum x coordinate of bounding box (left or right depending on device space).

y0 float

Minimum y coordinate of bounding box (top or bottom depending on device space).

y1 float

Maximum y coordinate of bounding box (top or bottom depending on device space).

stroking_colorspace str

String name of colour space for stroking operations.

stroking_color Tuple[float, ...]

Numeric parameters for stroking color.

stroking_pattern Union[str, None]

Name of stroking pattern, if any.

non_stroking_colorspace str

String name of colour space for non-stroking operations.

non_stroking_color Tuple[float, ...]

Numeric parameters for non-stroking color.

non_stroking_pattern Union[str, None]

Name of non-stroking pattern, if any.

path_ops str

Sequence of path operations (e.g. "mllh" for a triangle or "mlllh" for a quadrilateral)

dash_pattern Tuple[float, ...]

Sequence of user space units for alternating stroke and non-stroke segments of dash pattern, () for solid line. (Cannot be in device space because this would depend on which direction the line or curve is drawn).

dash_phase float

Initial position in dash_pattern in user space units. (see above for why it's in user space)

evenodd bool

Was this path filled with Even-Odd (if True) or Nonzero-Winding-Number rule (if False)? Note that this is meaningless for determining if a path is actually filled since subpaths have already been decomposed. If you really care then use the lazy API instead.

stroke bool

Is this path stroked?

fill bool

Is this path filled?

linewidth float

Line width in user space units (again, not possible to transform to device space).

pts_x List[float]

X coordinates of path endpoints, one for each character in path_ops. This is optimized for CSV/DataFrame output, if you care about the control points then use the lazy API.

pts_y List[float]

Y coordinates of path endpoints, one for each character in path_ops. This is optimized for CSV/DataFrame output, if you care about the control points then use the lazy API.

stream Union[Tuple[int, int], None]

Object number and generation number for the content stream associated with an image, or None for inline images. If you want image data then use the lazy API.

imagemask bool

Is this image a mask?

image_colorspace Union[ColorSpace, None]

String description of image colour space, or None if irrelevant/forbidden.

srcsize Tuple[int, int]

Source dimensions of image in pixels.

bits int

Number of bits per channel of image.

Source code in playa/page.py
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
class LayoutDict(TypedDict, total=False):
    """Dictionary-based layout objects.

    These are somewhat like the `T_obj` dictionaries returned by
    pdfplumber.  The type of coordinates returned are determined by
    the `space` argument passed to `Page`.  By default, `(0, 0)` is
    the top-left corner of the page, with 72 units per inch.

    All values can be converted to strings in some meaningful fashion,
    such that you can simply write one of these to a CSV.  You can access
    the field names through the `__annotations__` property:

        writer = DictWriter(outfh, fieldnames=LayoutDict.__annotations__.keys())
        writer.writerows(page.layout)

    Because the dictionary is declared with `total=False`, any given
    object may carry only a subset of these keys.

    Attributes:
      object_type: Type of object as a string.
      mcid: Containing marked content section ID (or None if marked
        content has no ID, such as artifacts or if there is no logical
        structure).
      tag: Containing marked content tag name (or None if not in a marked
        content section).
      xobjid: String name of containing Form XObject, if any.
      cid: Integer character ID of glyph, if `object_type == "char"`.
      text: Unicode mapping for glyph, if any.
      fontname: Name of the font.
      size: Font size in device space.
      glyph_offset_x: Horizontal offset (in device space) of glyph
        from start of line.
      glyph_offset_y: Vertical offset (in device space) of glyph from
        start of line.
      render_mode: Text rendering mode.
      upright: FIXME: Not really sure what this means.  pdfminer.six?
      x0: Minimum x coordinate of bounding box (left or right
        depending on device space).
      x1: Maximum x coordinate of bounding box (left or right
        depending on device space).
      y0: Minimum y coordinate of bounding box (top or bottom
        depending on device space).
      y1: Maximum y coordinate of bounding box (top or bottom
        depending on device space).
      stroking_colorspace: String name of colour space for stroking
        operations.
      stroking_color: Numeric parameters for stroking color.
      stroking_pattern: Name of stroking pattern, if any.
      non_stroking_colorspace: String name of colour space for non-stroking
        operations.
      non_stroking_color: Numeric parameters for non-stroking color.
      non_stroking_pattern: Name of non-stroking pattern, if any.
      path_ops: Sequence of path operations (e.g. `"mllh"` for a
        triangle or `"mlllh"` for a quadrilateral)
      dash_pattern: Sequence of user space units for alternating
        stroke and non-stroke segments of dash pattern, `()` for solid
        line. (Cannot be in device space because this would depend on
        which direction the line or curve is drawn).
      dash_phase: Initial position in `dash_pattern` in user space
        units.  (see above for why it's in user space)
      evenodd: Was this path filled with Even-Odd (if `True`) or
        Nonzero-Winding-Number rule (if `False`)?  Note that this is
        **meaningless** for determining if a path is actually filled
        since subpaths have already been decomposed.  If you really
        care then use the lazy API instead.
      stroke: Is this path stroked?
      fill: Is this path filled?
      linewidth: Line width in user space units (again, not possible
        to transform to device space).
      pts_x: X coordinates of path endpoints, one for each character
        in `path_ops`.  This is optimized for CSV/DataFrame output, if
        you care about the control points then use the lazy API.
      pts_y: Y coordinates of path endpoints, one for each character
        in `path_ops`.  This is optimized for CSV/DataFrame output, if
        you care about the control points then use the lazy API.
      stream: Object number and generation number for the content
        stream associated with an image, or `None` for inline images.
        If you want image data then use the lazy API.
      imagemask: Is this image a mask?
      image_colorspace: String description of image colour space, or
        `None` if irrelevant/forbidden.
      srcsize: Source dimensions of image in pixels.
      bits: Number of bits per channel of image.
    """

    object_type: str
    mcid: Union[int, None]
    tag: Union[str, None]
    xobjid: Union[str, None]
    cid: int
    text: Union[str, None]
    fontname: str
    size: float
    glyph_offset_x: float
    glyph_offset_y: float
    render_mode: int
    upright: bool
    x0: float
    y0: float
    x1: float
    y1: float
    stroking_colorspace: str
    stroking_color: Tuple[float, ...]
    stroking_pattern: Union[str, None]
    non_stroking_colorspace: str
    non_stroking_color: Tuple[float, ...]
    non_stroking_pattern: Union[str, None]
    path_ops: str
    dash_pattern: Tuple[float, ...]
    dash_phase: float
    evenodd: bool
    stroke: bool
    fill: bool
    linewidth: float
    pts_x: List[float]
    pts_y: List[float]
    stream: Union[Tuple[int, int], None]
    imagemask: bool
    # NOTE(review): annotated as a ColorSpace although the docstring
    # describes a string -- confirm which one is intended.
    image_colorspace: Union[ColorSpace, None]
    srcsize: Tuple[int, int]
    bits: int

MarkedContent

Bases: NamedTuple

Marked content point or section in a PDF page.

Attributes:

Name Type Description
mcid Union[int, None]

Marked content section ID, or None for a marked content point.

tag str

Name of tag for this marked content.

props Dict[str, PDFObject]

Marked content property dictionary.

Source code in playa/page.py
631
632
633
634
635
636
637
638
639
640
641
642
643
class MarkedContent(NamedTuple):
    """
    Marked content point or section in a PDF page.

    This is a `NamedTuple`, so instances are immutable and can be
    unpacked as `(mcid, tag, props)`.

    Attributes:
      mcid: Marked content section ID, or `None` for a marked content point.
      tag: Name of tag for this marked content.
      props: Marked content property dictionary.
    """

    mcid: Union[int, None]
    tag: str
    props: Dict[str, PDFObject]

Page

An object that holds the information about a page.

Parameters:

Name Type Description Default
doc Document

a Document object.

required
pageid int

the integer PDF object ID associated with the page in the page tree.

required
attrs Dict

a dictionary of page attributes.

required
label Optional[str]

page label string.

required
page_idx int

0-based index of the page in the document.

0
space DeviceSpace

the device space to use for interpreting content

'screen'

Attributes:

Name Type Description
pageid

the integer object ID associated with the page in the page tree

attrs

a dictionary of page attributes.

resources Dict[str, PDFObject]

a dictionary of resources used by the page.

mediabox

the physical size of the page.

cropbox

the crop rectangle of the page.

rotate

the page rotation (in degrees).

label

the page's label (typically, the logical page number).

page_number

the "physical" page number, indexed from 1.

Source code in playa/page.py
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
class Page:
    """An object that holds the information about a page.

    Args:
      doc: a Document object.
      pageid: the integer PDF object ID associated with the page in the page tree.
      attrs: a dictionary of page attributes.
      label: page label string.
      page_idx: 0-based index of the page in the document.
      space: the device space to use for interpreting content

    Attributes:
      doc: weak reference to the owning Document (call it to
        dereference; it returns None once the document is gone).
      pageid: the integer object ID associated with the page in the page tree
      attrs: a dictionary of page attributes.
      resources: a dictionary of resources used by the page.
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page.
      rotate: the page rotation (in degrees).
      label: the page's label (typically, the logical page number).
      page_idx: 0-based index of the page in the document.
      space: the device space to use for interpreting content.

    """

    def __init__(
        self,
        doc: "Document",
        pageid: int,
        attrs: Dict,
        label: Optional[str],
        page_idx: int = 0,
        space: DeviceSpace = "screen",
    ) -> None:
        # Only a weak reference to the document is kept; `structtree`
        # raises if the document has been collected.
        self.doc = weakref.ref(doc)
        self.pageid = pageid
        self.attrs = attrs
        self.label = label
        self.page_idx = page_idx
        self.space = space
        self.lastmod = resolve1(self.attrs.get("LastModified"))
        try:
            self.resources: Dict[str, PDFObject] = dict_value(
                self.attrs.get("Resources")
            )
        except TypeError:
            # Missing or malformed resources are tolerated: warn and
            # carry on with an empty dictionary.
            log.warning("Resources missing or invalid from Page id %d", pageid)
            self.resources = {}
        if "MediaBox" in self.attrs:
            self.mediabox = normalize_rect(parse_rect(self.attrs["MediaBox"]))
        else:
            log.warning(
                "MediaBox missing from Page id %d (and not inherited),"
                " defaulting to US Letter (612x792)",
                pageid,
            )
            self.mediabox = (0, 0, 612, 792)
        # The crop box falls back to the media box when it is absent
        # or cannot be parsed.
        self.cropbox = self.mediabox
        if "CropBox" in self.attrs:
            try:
                self.cropbox = normalize_rect(parse_rect(self.attrs["CropBox"]))
            except ValueError:
                log.warning("Invalid CropBox in /Page, defaulting to MediaBox")

        # Normalize the rotation so that it is never negative and
        # always less than 360.
        self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
        self.annots = self.attrs.get("Annots")
        self.beads = self.attrs.get("B")
        # `_contents` is always a list: empty when there is no
        # /Contents entry, a one-element list for a single object.
        contents = resolve1(self.attrs.get("Contents"))
        if contents is None:
            self._contents = []
        else:
            if isinstance(contents, list):
                self._contents = contents
            else:
                self._contents = [contents]

    @property
    def streams(self) -> Iterator[ContentStream]:
        """Return resolved content streams."""
        for obj in self._contents:
            yield stream_value(obj)

    @property
    def width(self) -> float:
        """Width of the page in default user space units."""
        x0, _, x1, _ = self.mediabox
        return x1 - x0

    @property
    def height(self) -> float:
        """Height of the page in default user space units."""
        _, y0, _, y1 = self.mediabox
        return y1 - y0

    @property
    def contents(self) -> Iterator[PDFObject]:
        """Iterator over PDF objects in the content streams."""
        # The parser yields (position, object) pairs; positions are dropped.
        for pos, obj in ContentParser(self._contents):
            yield obj

    def __iter__(self) -> Iterator["ContentObject"]:
        """Iterator over lazy layout objects."""
        return iter(LazyInterpreter(self, self._contents))

    @property
    def paths(self) -> Iterator["PathObject"]:
        """Iterator over lazy path objects."""
        return (obj for obj in self if isinstance(obj, PathObject))

    @property
    def images(self) -> Iterator["ImageObject"]:
        """Iterator over lazy image objects."""
        return (obj for obj in self if isinstance(obj, ImageObject))

    @property
    def texts(self) -> Iterator["TextObject"]:
        """Iterator over lazy text objects."""
        return (obj for obj in self if isinstance(obj, TextObject))

    @property
    def xobjects(self) -> Iterator["XObjectObject"]:
        """Return resolved and rendered Form XObjects.

        This does *not* return any image or PostScript XObjects.  You
        can get images via the `images` property.  Apparently you
        aren't supposed to use PostScript XObjects for anything, ever.

        Note that these are the XObjects as rendered on the page, so
        you may see the same named XObject multiple times.  If you
        need to access their actual definitions you'll have to look at
        `page.resources`.
        """
        return (obj for obj in self if isinstance(obj, XObjectObject))

    @property
    def layout(self) -> Iterator["LayoutDict"]:
        """Iterator over eager layout object dictionaries."""
        return iter(PageInterpreter(self, self._contents))

    @property
    def tokens(self) -> Iterator[Token]:
        """Iterator over tokens in the content streams."""
        parser = ContentParser(self._contents)
        while True:
            try:
                pos, tok = parser.nexttoken()
            except StopIteration:
                # The parser signals end of input with StopIteration,
                # which ends this generator.
                return
            yield tok

    @property
    def structtree(self) -> StructTree:
        """Return the PDF structure tree.

        Raises:
          RuntimeError: if the owning document no longer exists.
        """
        doc = self.doc()
        if doc is None:
            raise RuntimeError("Document no longer exists!")
        return StructTree(doc, (self,))

    def __repr__(self) -> str:
        return f"<Page: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

contents: Iterator[PDFObject] property

Iterator over PDF objects in the content streams.

height: float property

Height of the page in default user space units.

images: Iterator[ImageObject] property

Iterator over lazy image objects.

layout: Iterator[LayoutDict] property

Iterator over eager layout object dictionaries.

paths: Iterator[PathObject] property

Iterator over lazy path objects.

streams: Iterator[ContentStream] property

Return resolved content streams.

structtree: StructTree property

Return the PDF structure tree.

texts: Iterator[TextObject] property

Iterator over lazy text objects.

tokens: Iterator[Token] property

Iterator over tokens in the content streams.

width: float property

Width of the page in default user space units.

xobjects: Iterator[XObjectObject] property

Return resolved and rendered Form XObjects.

This does not return any image or PostScript XObjects. You can get images via the images property. Apparently you aren't supposed to use PostScript XObjects for anything, ever.

Note that these are the XObjects as rendered on the page, so you may see the same named XObject multiple times. If you need to access their actual definitions you'll have to look at page.resources.

__iter__()

Iterator over lazy layout objects.

Source code in playa/page.py
209
210
211
def __iter__(self) -> Iterator["ContentObject"]:
    """Iterate over the lazy layout objects on this page."""
    interpreter = LazyInterpreter(self, self._contents)
    return iter(interpreter)

PathObject dataclass

Bases: ContentObject

A path object.

Attributes:

Name Type Description
raw_segments List[PathSegment]

Segments in path (in user space).

stroke bool

True if the outline of the path is stroked.

fill bool

True if the path is filled.

evenodd bool

True if the filling of complex paths uses the even-odd winding rule, False if the non-zero winding number rule is used (PDF 1.7 section 8.5.3.3)

Source code in playa/page.py
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
@dataclass
class PathObject(ContentObject):
    """A path object.

    Attributes:
      raw_segments: Segments in path (in user space).
      stroke: True if the outline of the path is stroked.
      fill: True if the path is filled.
      evenodd: True if the filling of complex paths uses the even-odd
        winding rule, False if the non-zero winding number rule is
        used (PDF 1.7 section 8.5.3.3)
    """

    raw_segments: List[PathSegment]
    stroke: bool
    fill: bool
    evenodd: bool

    def __len__(self) -> int:
        """Number of subpaths, i.e. the number of objects yielded by
        iterating over this path."""
        count = sum(1 for seg in self.raw_segments if seg.operator == "m")
        # Segments that precede the first "m" are still yielded as a
        # subpath of their own by `__iter__`, so count them too.
        if self.raw_segments and self.raw_segments[0].operator != "m":
            count += 1
        return count

    def _subpath(self, segs: List[PathSegment]) -> "PathObject":
        """Build a path holding `segs` with this path's other attributes."""
        return PathObject(
            self.gstate,
            self.ctm,
            self.mcs,
            segs,
            self.stroke,
            self.fill,
            self.evenodd,
        )

    def __iter__(self) -> Iterator["PathObject"]:
        """Iterate over subpaths.

        A new subpath starts at every "m" operator; each yielded
        `PathObject` holds only the segments of its own subpath.

        Note: subpaths inherit the values of `fill` and `evenodd` from
        the parent path, but these values are no longer meaningful
        since the winding rules must be applied to the composite path
        as a whole (this is not a bug, just don't rely on them to know
        which regions are filled or not).
        """
        segs: List[PathSegment] = []
        for seg in self.raw_segments:
            if seg.operator == "m" and segs:
                yield self._subpath(segs)
                # Start a fresh list (rather than clearing in place) so
                # the subpath just yielded keeps its own segments.
                segs = []
            segs.append(seg)
        if segs:
            yield self._subpath(segs)

    @property
    def segments(self) -> Iterator[PathSegment]:
        """Get path segments in device space."""
        return (
            PathSegment(
                p.operator,
                tuple(apply_matrix_pt(self.ctm, point) for point in p.points),
            )
            for p in self.raw_segments
        )

    @property
    def bbox(self) -> Rect:
        """Get bounding box of path in device space as defined by its
        points and control points."""
        # First get the bounding box in user space (fast)
        bbox = get_bound(
            itertools.chain.from_iterable(seg.points for seg in self.raw_segments)
        )
        # Transform it and get the new bounding box
        return get_transformed_bound(self.ctm, bbox)

bbox: Rect property

Get bounding box of path in device space as defined by its points and control points.

object_type property

Type of this object as a string, e.g. "text", "path", "image".

segments: Iterator[PathSegment] property

Get path segments in device space.

__iter__()

Iterate over subpaths.

Note: subpaths inherit the values of fill and evenodd from the parent path, but these values are no longer meaningful since the winding rules must be applied to the composite path as a whole (this is not a bug, just don't rely on them to know which regions are filled or not).

Source code in playa/page.py
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
def __iter__(self):
    """Iterate over subpaths.

    A new subpath starts at every "m" operator; each yielded
    `PathObject` holds only the segments of its own subpath.

    Note: subpaths inherit the values of `fill` and `evenodd` from
    the parent path, but these values are no longer meaningful
    since the winding rules must be applied to the composite path
    as a whole (this is not a bug, just don't rely on them to know
    which regions are filled or not).
    """
    segs = []
    for seg in self.raw_segments:
        if seg.operator == "m" and segs:
            yield PathObject(
                self.gstate,
                self.ctm,
                self.mcs,
                segs,
                self.stroke,
                self.fill,
                self.evenodd,
            )
            # Start a fresh list (rather than clearing in place) so the
            # subpath just yielded keeps its own segments and the next
            # subpath does not accumulate the previous ones.
            segs = []
        segs.append(seg)
    if segs:
        yield PathObject(
            self.gstate,
            self.ctm,
            self.mcs,
            segs,
            self.stroke,
            self.fill,
            self.evenodd,
        )

__len__()

Number of subpaths.

Source code in playa/page.py
1877
1878
1879
def __len__(self):
    """Number of subpaths.

    A subpath starts at every "m" operator; segments preceding the
    first "m" (if any) also count as one subpath.  An empty path has
    no subpaths.  This is the number of objects produced by iterating
    over the path.
    """
    return sum(
        1
        for idx, seg in enumerate(self.raw_segments)
        if idx == 0 or seg.operator == "m"
    )

PathSegment

Bases: NamedTuple

Segment in a PDF graphics path.

Source code in playa/page.py
649
650
651
652
653
654
655
class PathSegment(NamedTuple):
    """
    Segment in a PDF graphics path.

    Attributes:
      operator: Path operator for this segment.  An operator of "m"
        is what `PathObject` treats as the start of a new subpath.
      points: Points (and control points, if any) belonging to this
        segment.
    """

    operator: PathOperator
    points: Tuple[Point, ...]

TagObject dataclass

Bases: ContentObject

A marked content point with no content.

Source code in playa/page.py
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
@dataclass
class TagObject(ContentObject):
    """A marked content point with no content."""

    @property
    def bbox(self) -> Rect:
        """A tag has no content and thus no bounding box.

        To avoid needlessly complicating user code this returns
        `BBOX_NONE` instead of `None` or throwing an exception.
        Because that is a specific object, you can reliably check for
        it with:

            if obj.bbox is BBOX_NONE:
                ...
        """
        return BBOX_NONE

bbox: Rect property

A tag has no content and thus no bounding box.

To avoid needlessly complicating user code this returns BBOX_NONE instead of None or throwing an exception. Because that is a specific object, you can reliably check for it with:

if obj.bbox is BBOX_NONE:
    ...

object_type property

Type of this object as a string, e.g. "text", "path", "image".

TextItem

Bases: NamedTuple

Semi-parsed item in a text object. Actual "rendering" is deferred, just like with paths.

Attributes:

Name Type Description
operator TextOperator

Text operator for this item. Many operators simply modify the TextState and do not actually output any text.

args Tuple[TextArgument, ...]

Arguments for the operator.

Source code in playa/page.py
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
class TextItem(NamedTuple):
    """Semi-parsed item in a text object.  Actual "rendering" is
    deferred, just like with paths.

    Attributes:
      operator: Text operator for this item. Many operators simply
        modify the `TextState` and do not actually output any text.
      args: Arguments for the operator.  For "TJ" these are the
        strings (as `bytes`) to show, possibly interleaved with
        numeric position adjustments.
    """

    operator: TextOperator
    args: Tuple[TextArgument, ...]

TextObject dataclass

Bases: ContentObject

Text object (contains one or more glyphs).

Attributes:

Name Type Description
textstate TextState

Text state for this object. This is a mutable object and you should not expect it to be valid outside the context of iteration over the parent TextObject.

items List[TextItem]

Raw text items (strings and operators) for this object.

Source code in playa/page.py
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
@dataclass
class TextObject(ContentObject):
    """Text object (contains one or more glyphs).

    Attributes:
      textstate: Text state for this object.  This is a **mutable**
        object and you should not expect it to be valid outside the
        context of iteration over the parent `TextObject`.
      items: Raw text items (strings and operators) for this object.
    """

    textstate: TextState
    items: List[TextItem]
    # Cache of decoded characters, filled in by the first successful
    # access to `chars`.
    _chars: Union[List[str], None] = None

    def _render_char(
        self,
        *,
        cid: int,
        matrix: Matrix,
        scaling: float,
    ) -> GlyphObject:
        """Create the glyph object for a single character.

        Args:
          cid: Character ID, as produced by the current font's `decode`.
          matrix: Matrix applied to the glyph's box to obtain its
            bounding box.
          scaling: Horizontal scaling as a plain factor (the text
            state's percentage multiplied by 0.01).

        Returns:
          A `GlyphObject` whose text is `None` if the font has no
          Unicode mapping for `cid`.
        """
        font = self.textstate.font
        assert font is not None
        fontsize = self.textstate.fontsize
        rise = self.textstate.rise
        try:
            text = font.to_unichr(cid)
            assert isinstance(text, str), f"Text {text!r} is not a str"
        except PDFUnicodeNotDefined:
            log.debug("undefined char: %r, %r", font, cid)
            text = None
        textwidth = font.char_width(cid)
        # Distance by which the position moves on after this glyph
        adv = textwidth * fontsize * scaling
        if font.vertical:
            textdisp = font.char_disp(cid)
            assert isinstance(textdisp, tuple)
            (vx, vy) = textdisp
            if vx is None:
                # No horizontal displacement given: use half the font size
                vx = fontsize * 0.5
            else:
                vx = vx * fontsize * 0.001
            vy = (1000 - vy) * fontsize * 0.001
            x0, y0 = (-vx, vy + rise + adv)
            x1, y1 = (-vx + fontsize, vy + rise)
        else:
            descent = font.get_descent() * fontsize
            x0, y0 = (0, descent + rise)
            x1, y1 = (adv, descent + rise + fontsize)
        bbox = get_transformed_bound(matrix, (x0, y0, x1, y1))
        return GlyphObject(
            self.gstate, self.ctm, self.mcs, self.textstate, cid, text, adv, bbox
        )

    def _render_string(self, item: TextItem) -> Iterator[GlyphObject]:
        """Generate the glyphs for one "TJ" item.

        Numeric arguments move the current position, `bytes`
        arguments are decoded with the current font and produce one
        glyph per character ID, and anything else is logged and
        skipped.  `textstate.glyph_offset` is updated as glyphs are
        produced and once more when the item is exhausted.
        """
        assert self.textstate.font is not None
        vert = self.textstate.font.vertical
        assert self.ctm is not None
        matrix = mult_matrix(self.textstate.line_matrix, self.ctm)
        scaling = self.textstate.scaling * 0.01
        charspace = self.textstate.charspace * scaling
        wordspace = self.textstate.wordspace * scaling
        # Word spacing is never applied for multibyte fonts
        if self.textstate.font.multibyte:
            wordspace = 0
        (x, y) = self.textstate.glyph_offset
        # Only one coordinate advances, depending on writing direction
        pos = y if vert else x
        # Character spacing goes *between* glyphs, so it is not added
        # before the very first glyph of the item
        needcharspace = False
        dxscale = 0.001 * self.textstate.fontsize * scaling
        for obj in item.args:
            if isinstance(obj, (int, float)):
                pos -= obj * dxscale
                needcharspace = True
            else:
                if not isinstance(obj, bytes):
                    log.warning("Found non-string %r in text object", obj)
                    continue
                for cid in self.textstate.font.decode(obj):
                    if needcharspace:
                        pos += charspace
                    self.textstate.glyph_offset = (x, pos) if vert else (pos, y)
                    glyph = self._render_char(
                        cid=cid,
                        matrix=translate_matrix(matrix, self.textstate.glyph_offset),
                        scaling=scaling,
                    )
                    pos += glyph.adv
                    yield glyph
                    if cid == 32 and wordspace:
                        pos += wordspace
                    needcharspace = True
        self.textstate.glyph_offset = (x, pos) if vert else (pos, y)

    @property
    def chars(self) -> str:
        """Get the Unicode characters (in stream order) for this object.

        Characters with no Unicode mapping in the current font are
        omitted.  The result is cached after the first successful
        access.  Note that computing it resets `textstate` and applies
        any "Tf" operators to it.
        """
        if self._chars is not None:
            return "".join(self._chars)
        # Accumulate locally and only publish the cache once decoding
        # has finished, so that a failure part-way through does not
        # leave a truncated result to be returned by later accesses.
        chars: List[str] = []
        # This is not strictly necessary since we don't care about
        # positioning, but perhaps we might in the future
        self.textstate.reset()
        for item in self.items:
            # Only TJ and Tf are relevant to Unicode output
            if item.operator == "TJ":
                font = self.textstate.font
                assert font is not None, "No font was selected"
                for obj in item.args:
                    if not isinstance(obj, bytes):
                        continue
                    for cid in font.decode(obj):
                        try:
                            text = font.to_unichr(cid)
                            assert isinstance(text, str), f"Text {text!r} is not a str"
                            chars.append(text)
                        except PDFUnicodeNotDefined:
                            log.debug("undefined char: %r, %r", font, cid)
            elif item.operator == "Tf":
                self.textstate.update(item.operator, *item.args)
        self._chars = chars
        return "".join(chars)

    def __iter__(self) -> Iterator[GlyphObject]:
        """Generate glyphs for this text object"""
        # This corresponds to a BT operator so reset the textstate
        self.textstate.reset()
        for item in self.items:
            if item.operator == "TJ":
                yield from self._render_string(item)
            else:
                # Every other operator only modifies the text state
                self.textstate.update(item.operator, *item.args)

chars: str property

Get the Unicode characters (in stream order) for this object.

object_type property

Type of this object as a string, e.g. "text", "path", "image".

__iter__()

Generate glyphs for this text object

Source code in playa/page.py
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
def __iter__(self) -> Iterator[GlyphObject]:
    """Yield the glyphs of this text object, one after another."""
    # A text object starts at a BT operator, so begin from a reset
    # text state
    self.textstate.reset()
    for item in self.items:
        if item.operator != "TJ":
            # Not a string-showing item: it only alters the text state
            self.textstate.update(item.operator, *item.args)
            continue
        yield from self._render_string(item)

TextState dataclass

PDF Text State (PDF 1.7 section 9.3.1).

Exceptionally, the line matrix and text matrix are represented more compactly with the line matrix itself in line_matrix, which gets translated by glyph_offset for the current glyph (note: expressed in user space), which pdfminer confusingly called linematrix, to produce the text matrix.

Attributes:

Name Type Description
line_matrix Matrix

The text line matrix, which defines (in user space) the start of the current line of text, which may or may not correspond to an actual line because PDF is a presentation format.

glyph_offset Point

The offset of the current glyph with relation to the line matrix (in user space). To get this in device space you may use playa.utils.apply_matrix_norm with TextObject.ctm.

font Optional[Font]

The current font.

fontsize float

The current font size, in text space units. This is often just 1.0 as it relies on the text matrix (you may use line_matrix here) to scale it to the actual size in user space.

charspace float

Extra spacing to add between each glyph, in text space units.

wordspace float

The width of a space, defined curiously as cid==32 (But PDF Is A prESeNTaTion fORmAT sO ThERe maY NOt Be aNY SpACeS!!), in text space units.

scaling float

The horizontal scaling factor as defined by the PDF standard.

leading float

The leading as defined by the PDF standard.

render_mode int

The PDF rendering mode. The really important one here is 3, which means "don't render the text". You might want to use this to detect invisible text.

rise float

The text rise (superscript or subscript position), in text space units.

Source code in playa/page.py
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
@dataclass
class TextState:
    """PDF Text State (PDF 1.7 section 9.3.1).

    Rather than storing a separate text matrix, this class keeps the
    line matrix in `line_matrix` together with a `glyph_offset` (in
    **user space**) for the current glyph; translating the former by
    the latter produces the text matrix.  (pdfminer confusingly used
    the name `linematrix` for the offset.)

    Attributes:
      line_matrix: The text line matrix, which defines (in user
        space) the start of the current line of text, which may or may
        not correspond to an actual line because PDF is a presentation
        format.
      glyph_offset: The offset of the current glyph with relation to
        the line matrix (in user space).  To get this in device space
        you may use `playa.utils.apply_matrix_norm` with
        `TextObject.ctm`.
      font: The current font.
      fontsize: The current font size, **in text space units**.
        This is often just 1.0 as it relies on the text matrix (you
        may use `line_matrix` here) to scale it to the actual size in
        user space.
      charspace: Extra spacing to add between each glyph, in
        text space units.
      wordspace: The width of a space, defined curiously as `cid==32`
        (But PDF Is A prESeNTaTion fORmAT sO ThERe maY NOt Be aNY
        SpACeS!!), in text space units.
      scaling: The horizontal scaling factor as defined by the PDF
        standard.
      leading: The leading as defined by the PDF standard.
      render_mode: The PDF rendering mode.  The really important one
        here is 3, which means "don't render the text".  You might
        want to use this to detect invisible text.
      rise: The text rise (superscript or subscript position), in text
        space units.
    """

    line_matrix: Matrix = MATRIX_IDENTITY
    glyph_offset: Point = (0, 0)
    font: Optional[Font] = None
    fontsize: float = 0
    charspace: float = 0
    wordspace: float = 0
    scaling: float = 100
    leading: float = 0
    render_mode: int = 0
    rise: float = 0

    def reset(self) -> None:
        """Put the line matrix and glyph offset back to their initial
        values, leaving every other parameter untouched."""
        self.line_matrix, self.glyph_offset = MATRIX_IDENTITY, (0, 0)

    def update(self, operator: TextOperator, *args: TextArgument):
        """Apply a text state operator.

        Operators not listed here are silently ignored.
        """
        # Operators which just store their single operand.
        # FIXME: it would be nice to have the operand types checked
        # rather than taken on trust.
        single_operand = {
            "Tc": "charspace",
            "Tw": "wordspace",
            "Tz": "scaling",
            "TL": "leading",
            "Tr": "render_mode",
            "Ts": "rise",
        }
        attr = single_operand.get(operator)
        if attr is not None:
            setattr(self, attr, args[0])
            return
        if operator == "Tf":
            self.font = cast(Font, args[0])
            self.fontsize = cast(float, args[1])
            return
        if operator == "Tm":
            a, b, c, d, e, f = (cast(float, arg) for arg in args)
            self.line_matrix = (a, b, c, d, e, f)
            self.glyph_offset = (0, 0)
            return
        if operator == "Td":
            tx = cast(float, args[0])
            ty = cast(float, args[1])
            a, b, c, d, e, f = self.line_matrix
            self.line_matrix = (
                a,
                b,
                c,
                d,
                tx * a + ty * c + e,
                tx * b + ty * d + f,
            )
            self.glyph_offset = (0, 0)
            return
        if operator == "T*":
            # PDF 1.7 table 108: equivalent to 0 -leading Td - but
            # since evaluation is lazy the leading is only known at
            # this point, so it cannot be expanded in advance.
            a, b, c, d, e, f = self.line_matrix
            dy = -self.leading
            self.line_matrix = (a, b, c, d, dy * c + e, dy * d + f)
            self.glyph_offset = (0, 0)

reset()

Reset the text state

Source code in playa/page.py
325
326
327
328
def reset(self) -> None:
    """Put the line matrix and glyph offset back to their initial
    values, leaving every other parameter untouched."""
    self.line_matrix, self.glyph_offset = MATRIX_IDENTITY, (0, 0)

update(operator, *args)

Apply a text state operator

Source code in playa/page.py
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
def update(self, operator: TextOperator, *args: TextArgument):
    """Apply a text state operator.

    Operators not listed here are silently ignored.
    """
    # Operators which just store their single operand.
    # FIXME: it would be nice to have the operand types checked
    # rather than taken on trust.
    single_operand = {
        "Tc": "charspace",
        "Tw": "wordspace",
        "Tz": "scaling",
        "TL": "leading",
        "Tr": "render_mode",
        "Ts": "rise",
    }
    attr = single_operand.get(operator)
    if attr is not None:
        setattr(self, attr, args[0])
        return
    if operator == "Tf":
        self.font = cast(Font, args[0])
        self.fontsize = cast(float, args[1])
        return
    if operator == "Tm":
        a, b, c, d, e, f = (cast(float, arg) for arg in args)
        self.line_matrix = (a, b, c, d, e, f)
        self.glyph_offset = (0, 0)
        return
    if operator == "Td":
        tx = cast(float, args[0])
        ty = cast(float, args[1])
        a, b, c, d, e, f = self.line_matrix
        self.line_matrix = (
            a,
            b,
            c,
            d,
            tx * a + ty * c + e,
            tx * b + ty * d + f,
        )
        self.glyph_offset = (0, 0)
        return
    if operator == "T*":
        # PDF 1.7 table 108: equivalent to 0 -leading Td - but
        # since evaluation is lazy the leading is only known at
        # this point, so it cannot be expanded in advance.
        a, b, c, d, e, f = self.line_matrix
        dy = -self.leading
        self.line_matrix = (a, b, c, d, dy * c + e, dy * d + f)
        self.glyph_offset = (0, 0)

XObjectObject dataclass

Bases: ContentObject

An eXternal Object, in the context of a page.

There are a couple of kinds of XObjects. Here we are only concerned with "Form XObjects" which, despite their name, have nothing at all to do with fillable forms. Instead they are like little embeddable PDF pages, possibly with their own resources, definitely with their own definition of "user space".

Image XObjects are handled by ImageObject.

Attributes:

Name Type Description
xobjid str

Name of this XObject (in the page resources).

page ReferenceType

Weak reference to containing page.

stream ContentStream

Content stream with PDF operators.

resources Union[None, Dict[str, PDFObject]]

Resources specific to this XObject, if any.

Source code in playa/page.py
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
@dataclass
class XObjectObject(ContentObject):
    """An eXternal Object, in the context of a page.

    Of the several kinds of XObject, only "Form XObjects" are
    represented here.  The name is misleading: they are unrelated to
    fillable forms and are better thought of as small embeddable PDF
    pages, which may carry their own resources and always have their
    own definition of "user space".

    Image XObjects are handled by `ImageObject`.

    Attributes:
      xobjid: Name of this XObject (in the page resources).
      page: Weak reference to containing page.
      stream: Content stream with PDF operators.
      resources: Resources specific to this XObject, if any.
    """

    xobjid: str
    page: weakref.ReferenceType
    stream: ContentStream
    resources: Union[None, Dict[str, PDFObject]]

    def _require_page(self):
        """Dereference the weak page reference, failing if it is dead."""
        page = self.page()
        if page is None:
            raise RuntimeError("Page no longer exists!")
        return page

    def __contains__(self, name: object) -> bool:
        # Membership is answered by the underlying stream
        return name in self.stream

    def __getitem__(self, name: str) -> PDFObject:
        # Item lookup is answered by the underlying stream
        return self.stream[name]

    @property
    def bbox(self) -> Rect:
        """Get the bounding box of this XObject in device space."""
        ctm = self.ctm
        # "BBox" is a required attribute, so no fallback is provided
        rect = parse_rect(self.stream["BBox"])
        return get_transformed_bound(ctm, rect)

    @property
    def buffer(self) -> bytes:
        """Raw stream content for this XObject"""
        return self.stream.buffer

    @property
    def tokens(self) -> Iterator[Token]:
        """Iterate over tokens in the XObject's content stream."""
        parser = ContentParser([self.stream])
        while True:
            try:
                _pos, token = parser.nexttoken()
            except StopIteration:
                # The parser has run out of tokens
                break
            yield token

    @property
    def layout(self) -> Iterator["LayoutDict"]:
        """Iterator over eager layout object dictionaries."""
        interpreter = PageInterpreter(
            self._require_page(), [self.stream], self.resources
        )
        return iter(interpreter)

    @property
    def contents(self) -> Iterator[PDFObject]:
        """Iterator over PDF objects in the content stream."""
        # The page itself is not used, but it must still be alive
        self._require_page()
        for _pos, pdfobj in ContentParser([self.stream]):
            yield pdfobj

    def __iter__(self) -> Iterator["ContentObject"]:
        interpreter = LazyInterpreter(
            self._require_page(), [self.stream], self.resources
        )
        return iter(interpreter)

bbox: Rect property

Get the bounding box of this XObject in device space.

buffer: bytes property

Raw stream content for this XObject

contents: Iterator[PDFObject] property

Iterator over PDF objects in the content stream.

layout: Iterator[LayoutDict] property

Iterator over eager layout object dictionaries.

object_type property

Type of this object as a string, e.g. "text", "path", "image".

tokens: Iterator[Token] property

Iterate over tokens in the XObject's content stream.

playa.structtree

PDF logical structure trees.

StructElement dataclass

Bases: Findable

Source code in playa/structtree.py
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
@dataclass
class StructElement(Findable):
    """Element in a structure tree.

    Iterating over an element iterates over its direct children.

    Attributes:
      page_idx: Index of the page associated with this element, if
        any.  It must be set whenever `mcids` is non-empty.
      mcids: Marked content IDs attached directly to this element.
      children: Child elements, in order.
    """

    type: str
    revision: Union[int, None]
    id: Union[str, None]
    lang: Union[str, None]
    alt_text: Union[str, None]
    actual_text: Union[str, None]
    title: Union[str, None]
    page_idx: Union[int, None]
    attributes: Dict[str, Any] = field(default_factory=dict)
    mcids: List[int] = field(default_factory=list)
    children: List["StructElement"] = field(default_factory=list)

    def __iter__(self) -> Iterator["StructElement"]:
        return iter(self.children)

    def all_mcids(self) -> Iterator[Tuple[int, int]]:
        """Yield `(page_idx, mcid)` for every MCID found on this
        element or anywhere beneath it.
        """
        # Depth-first traversal, starting from this element, so that
        # the MCIDs come out in order
        pending = deque([self])
        while pending:
            node = pending.popleft()
            if node.mcids:
                # MCIDs are meaningless without a page object
                assert node.page_idx is not None
                for mcid in node.mcids:
                    yield node.page_idx, mcid
            # Children go to the front of the queue, first child first
            pending.extendleft(reversed(node.children))

all_mcids()

Collect all MCIDs (with their page indices) inside a structure element.

Source code in playa/structtree.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def all_mcids(self) -> Iterator[Tuple[int, int]]:
    """Yield `(page_idx, mcid)` for every MCID found on this
    element or anywhere beneath it.
    """
    # Depth-first traversal, starting from this element, so that
    # the MCIDs come out in order
    pending = deque([self])
    while pending:
        node = pending.popleft()
        if node.mcids:
            # MCIDs are meaningless without a page object
            assert node.page_idx is not None
            for mcid in node.mcids:
                yield node.page_idx, mcid
        # Children go to the front of the queue, first child first
        pending.extendleft(reversed(node.children))

find(matcher)

Find the first matching element in subtree.

The matcher argument is either an element name, a regular expression, or a function taking a StructElement and returning True if the element matches.

Source code in playa/structtree.py
84
85
86
87
88
89
90
91
92
93
94
95
96
def find(
    self, matcher: Union[str, Pattern[str], MatchFunc]
) -> Union["StructElement", None]:
    """Return the first element in the subtree that matches, or
    `None` if there is no such element.

    `matcher` may be an element name, a regular expression, or a
    function which takes a `StructElement` and returns `True` when
    that element matches.
    """
    return next(_find_all(self.children, matcher), None)

find_all(matcher)

Iterate depth-first over matching elements in subtree.

The matcher argument is either an element name, a regular expression, or a function taking a StructElement and returning True if the element matches.

Source code in playa/structtree.py
73
74
75
76
77
78
79
80
81
82
def find_all(
    self, matcher: Union[str, Pattern[str], MatchFunc]
) -> Iterator["StructElement"]:
    """Iterate, depth-first, over the elements in the subtree that
    match.

    `matcher` may be an element name, a regular expression, or a
    function which takes a `StructElement` and returns `True` when
    that element matches.
    """
    subtree = self.children
    return _find_all(subtree, matcher)

StructTree

Bases: Findable

Parse the structure tree of a PDF.

This class creates a representation of the portion of the structure tree that reaches marked content sections for a document or a subset of its pages. Note that this is slightly different from the behaviour of other PDF libraries which will also include structure elements with no content.

Raises:

Type Description
KeyError

If the PDF has no structure tree.

Parameters:

Name Type Description Default
doc Document

Document from which to extract structure tree

required
pages Union[Iterable[Page], None]

Pages to include in the tree - each page's page_idx will be used to identify it in the tree through the page_idx attribute of StructElement.

None
Source code in playa/structtree.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
class StructTree(Findable):
    """Parse the structure tree of a PDF.

    This class creates a representation of the portion of the
    structure tree that reaches marked content sections for a document
    or a subset of its pages.  Note that this is slightly different
    from the behaviour of other PDF libraries which will also include
    structure elements with no content.

    Raises:
      KeyError: If the PDF has no structure tree.

    Args:
      doc: Document from which to extract structure tree
      pages: Pages to include in the tree (all pages of `doc` if
             `None`).  Each page's own `page_idx` is used to identify
             it in the tree through the `page_idx` attribute of
             `StructElement`.
    """

    page: Union["Page", None]

    def __init__(
        self,
        doc: "Document",
        pages: Union[Iterable["Page"], None] = None,
    ):
        if "StructTreeRoot" not in doc.catalog:
            raise KeyError("Catalog has no 'StructTreeRoot' entry")
        self.root = dict_value(doc.catalog["StructTreeRoot"])
        self.role_map = dict_value(self.root.get("RoleMap", {}))
        self.class_map = dict_value(self.root.get("ClassMap", {}))
        self.children: List[StructElement] = []
        # Maps page object IDs to page indices for the pages being parsed
        self.page_dict: Dict[Any, Union[int, None]]

        if pages is None:
            self.page_dict = {page.pageid: page.page_idx for page in doc.pages}
            self._parse_struct_tree()
        else:
            # Materialize the iterable since we need its length below
            pagelist = list(pages)
            self.page_dict = {page.pageid: page.page_idx for page in pagelist}
            parent_tree_obj = self.root.get("ParentTree")
            # If we have a single page then we will work backwards from
            # its ParentTree - this is because structure elements could
            # span multiple pages, and the "Pg" attribute is *optional*,
            # so this is the approved way to get a page's structure...
            if len(pagelist) == 1 and parent_tree_obj is not None:
                page = pagelist[0]
                parent_tree = NumberTree(parent_tree_obj)
                # If there is no marked content in the structure tree for
                # this page (which can happen even when there is a
                # structure tree) then there is no `StructParents`.
                # Note however that if there are XObjects in a page,
                # *they* may have `StructParent` (not `StructParents`)
                if "StructParents" not in page.attrs:
                    # Leave `self.children` empty for this page
                    return
                parent_id = page.attrs["StructParents"]
                parent_array = list_value(parent_tree[parent_id])
                assert isinstance(parent_array, list)  # srsly
                self._parse_parent_tree(parent_array)
            else:
                # ...EXCEPT that the ParentTree is sometimes missing, in which
                # case we fall back to the non-approved way.  This branch
                # is also taken whenever more than one page is requested.
                self._parse_struct_tree()

    def _make_attributes(
        self, attrs: Dict[str, Any], revision: Union[int, None]
    ) -> Dict[str, Any]:
        """Collect the attributes of a structure element into one dict.

        Attribute classes named in `/C` are expanded through the
        `ClassMap` of the structure tree root, then attribute objects
        in `/A` are merged on top of them.

        Args:
          attrs: Structure element dictionary (its `C` and `A` entries
                 are the only ones consulted).
          revision: Revision number of the element (its `R` entry), or
                    `None` if it has none.

        Returns:
          Merged attributes, with name values decoded to `str`.
        """
        attr_obj_list: List[PDFObject] = []
        for key in "C", "A":
            if key not in attrs:
                continue
            attr_obj = resolve1(attrs[key])
            # It could be a list of attribute objects (why?)
            if isinstance(attr_obj, list):
                attr_obj_list.extend(resolve1(val) for val in attr_obj)
            else:
                attr_obj_list.append(attr_obj)
        # Candidates are either attribute dictionaries or class names
        attr_objs: List[Union[PSLiteral, dict]] = []
        prev_obj: Union[PSLiteral, dict, None] = None
        for aobj in attr_obj_list:
            # If we find a revision number, which might "follow the
            # revision object" (the spec is not clear about what this
            # should look like but it implies they are simply adjacent
            # in a flat array), then use it to decide whether to take
            # the previous object...
            if isinstance(aobj, int):
                if aobj == revision and prev_obj is not None:
                    attr_objs.append(prev_obj)
                prev_obj = None
            elif isinstance(aobj, (dict, PSLiteral)):
                # Class names (from /C) must be kept here, otherwise
                # they never reach the class map lookup below.
                if prev_obj is not None:
                    attr_objs.append(prev_obj)
                prev_obj = aobj
            else:
                logger.warning("Structure attribute of unknown type: %r", aobj)
        if prev_obj is not None:
            attr_objs.append(prev_obj)
        # Now merge all the attribute objects in the collected to a
        # single set (again, the spec doesn't really explain this but
        # does say that attributes in /A supersede those in /C)
        attr: Dict[str, Any] = {}
        for obj in attr_objs:
            # They should all be resolved by now!
            assert not isinstance(obj, ObjRef)
            members: List[Any]
            if isinstance(obj, PSLiteral):
                # A class name: replace it with whatever the class map
                # holds for it (one attribute object or a list of them)
                key = decode_text(obj.name)
                if key not in self.class_map:
                    logger.warning("Unknown attribute class %s", key)
                    continue
                class_obj = resolve1(self.class_map[key])
                if isinstance(class_obj, list):
                    members = [resolve1(val) for val in class_obj]
                else:
                    members = [class_obj]
            else:
                members = [obj]
            for member in members:
                if not isinstance(member, dict):
                    logger.warning("Unexpected attribute object type: %r", member)
                    continue
                for k, v in member.items():
                    if isinstance(v, PSLiteral):
                        attr[k] = decode_text(v.name)
                    else:
                        attr[k] = v
        return attr

    def _make_element(self, obj: Any) -> Tuple[Union[StructElement, None], List[Any]]:
        """Create a `StructElement` from a structure element dictionary.

        Args:
          obj: Structure element dictionary (must not be a
               marked-content reference or an object reference).

        Returns:
          The new element and the list of its (not yet resolved)
          children taken from the `K` entry.  The children are not
          attached to the element here.
        """
        # We hopefully caught these earlier
        assert "MCID" not in obj, "Uncaught MCR: %s" % obj
        assert "Obj" not in obj, "Uncaught OBJR: %s" % obj
        # Get page index if necessary
        page_idx = None
        if "Pg" in obj:
            page_objid = obj["Pg"].objid
            assert page_objid in self.page_dict, "Object on unparsed page: %s" % obj
            page_idx = self.page_dict[page_objid]
        obj_tag = ""
        if "S" in obj:
            obj_tag = decode_text(obj["S"].name)
            # Translate the structure type through the role map if present
            if obj_tag in self.role_map:
                obj_tag = decode_text(self.role_map[obj_tag].name)
        children = resolve1(obj["K"]) if "K" in obj else []
        if isinstance(children, int):  # ugh... isinstance...
            children = [children]
        elif isinstance(children, dict):  # a single object.. ugh...
            # Keep the unresolved entry, since callers key on its repr()
            children = [obj["K"]]
        revision = obj.get("R")
        attributes = self._make_attributes(obj, revision)
        element_id = decode_text(resolve1(obj["ID"])) if "ID" in obj else None
        title = decode_text(resolve1(obj["T"])) if "T" in obj else None
        lang = decode_text(resolve1(obj["Lang"])) if "Lang" in obj else None
        alt_text = decode_text(resolve1(obj["Alt"])) if "Alt" in obj else None
        actual_text = (
            decode_text(resolve1(obj["ActualText"])) if "ActualText" in obj else None
        )
        element = StructElement(
            type=obj_tag,
            id=element_id,
            page_idx=page_idx,
            revision=revision,
            lang=lang,
            title=title,
            alt_text=alt_text,
            actual_text=actual_text,
            attributes=attributes,
        )
        return element, children

    def _parse_parent_tree(self, parent_array: List[Any]) -> None:
        """Populate the structure tree using the leaves of the parent tree for
        a given page."""
        # First walk backwards from the leaves to the root, tracking references
        d = deque(parent_array)
        # Maps repr(ref) to (element, children) for every element visited
        s = {}
        found_root = False
        while d:
            ref = d.popleft()
            # In the case where an MCID is not associated with any
            # structure, there will be None in the parent tree
            # (previously it was KWD("null") but we now parse that
            # properly as None)
            if ref is KEYWORD_NULL or ref is None:
                continue
            if repr(ref) in s:
                continue
            obj = dict_value(ref)
            assert obj is not None  # This means the XRef tables are borked
            # This is required! It's in the spec!
            if "Type" in obj and decode_text(obj["Type"].name) == "StructTreeRoot":
                found_root = True
            else:
                # We hope that these are actual elements and not
                # references or marked-content sections...
                element, children = self._make_element(obj)
                # We have no page tree so we assume this page was parsed
                assert element is not None
                s[repr(ref)] = element, children
                # Continue upwards through the parent entry
                d.append(obj["P"])
        # If we didn't reach the root something is quite wrong!
        assert found_root
        self._resolve_children(s)

    def on_parsed_page(self, obj: Dict[str, Any]) -> bool:
        """Tell whether an object belongs to one of the pages being parsed.

        Objects with no `Pg` entry are always accepted, otherwise the
        object ID of the page must be among those in `self.page_dict`.
        """
        if "Pg" not in obj:
            return True
        page_objid = obj["Pg"].objid
        return page_objid in self.page_dict

    def _parse_struct_tree(self) -> None:
        """Populate the structure tree starting from the root, skipping
        unparsed pages and empty elements."""
        root = resolve1(self.root["K"])

        # It could just be a single object ... it's in the spec (argh)
        if isinstance(root, dict):
            root = [self.root["K"]]
        d = deque(root)
        # Maps repr(ref) to (element, children) for every element visited
        s = {}
        while d:
            ref = d.popleft()
            # In case the tree is actually a DAG and not a tree...
            if repr(ref) in s:  # pragma: nocover (shouldn't happen)
                continue
            obj = resolve1(ref)
            # Deref top-level OBJR skipping refs to unparsed pages
            if isinstance(obj, dict) and "Obj" in obj:
                if not self.on_parsed_page(obj):
                    continue
                ref = obj["Obj"]
                obj = resolve1(ref)
            element, children = self._make_element(obj)
            # Similar to above, delay resolving the children to avoid
            # tree-recursion.
            s[repr(ref)] = element, children
            for child in children:
                obj = resolve1(child)
                if isinstance(obj, dict):
                    if not self.on_parsed_page(obj):
                        continue
                    if "Obj" in obj:
                        child = obj["Obj"]
                    elif "MCID" in obj:
                        continue
                # Only indirect references are queued for traversal
                if isinstance(child, ObjRef):
                    d.append(child)

        # Traverse depth-first, removing empty elements (unsure how to
        # do this non-recursively)
        def prune(elements: List[Any]) -> List[Any]:
            next_elements = []
            for ref in elements:
                obj = resolve1(ref)
                if isinstance(ref, int):
                    # A bare integer child is kept as-is (an MCID)
                    next_elements.append(ref)
                    continue
                elif isinstance(obj, dict):
                    if not self.on_parsed_page(obj):
                        continue
                    if "MCID" in obj:
                        next_elements.append(obj["MCID"])
                        continue
                    elif "Obj" in obj:
                        ref = obj["Obj"]
                element, children = s[repr(ref)]
                children = prune(children)
                # See assertions below
                if element is None or not children:
                    del s[repr(ref)]
                else:
                    s[repr(ref)] = element, children
                    next_elements.append(ref)
            return next_elements

        prune(root)
        self._resolve_children(s)

    def _resolve_children(self, seen: Dict[str, Any]) -> None:
        """Resolve children starting from the tree root based on references we
        saw when traversing the structure tree.

        Args:
          seen: Maps `repr()` of each reference to an
                `(element, children)` pair, as collected by
                `_parse_struct_tree` or `_parse_parent_tree`.
        """
        root = resolve1(self.root["K"])
        # It could just be a single object ... it's in the spec (argh)
        if isinstance(root, dict):
            root = [self.root["K"]]
        self.children = []
        # Create top-level self.children
        parsed_root = []
        for ref in root:
            obj = resolve1(ref)
            if isinstance(obj, dict) and "Obj" in obj:
                if not self.on_parsed_page(obj):
                    continue
                ref = obj["Obj"]
            if repr(ref) in seen:
                parsed_root.append(ref)
        # Breadth-first walk attaching MCIDs and child elements
        d = deque(parsed_root)
        while d:
            ref = d.popleft()
            element, children = seen[repr(ref)]
            assert element is not None, "Unparsed element"
            for child in children:
                obj = resolve1(child)
                if isinstance(obj, int):
                    # FIXME: This might fail! (but that indicates a
                    # programming failure as MCIDs should never occur
                    # without a page object)
                    element.mcids.append(obj)
                elif isinstance(obj, dict):
                    # Skip out-of-page MCIDS and OBJRs (FIXME: do we
                    # *really* want to do this? Perhaps we should
                    # store the page indices directly with the MCIDs?)
                    if not self.on_parsed_page(obj):
                        continue
                    if "MCID" in obj:
                        # FIXME: This might fail, for the same reasons!
                        assert element.page_idx is not None
                        element.mcids.append(obj["MCID"])
                    elif "Obj" in obj:
                        child = obj["Obj"]
                # NOTE: if, not elif, in case of OBJR above
                if isinstance(child, ObjRef):
                    # Children that were pruned or never visited are
                    # absent from `seen` and are silently skipped
                    child_element, _ = seen.get(repr(child), (None, None))
                    if child_element is not None:
                        element.children.append(child_element)
                        d.append(child)
        self.children = [seen[repr(ref)][0] for ref in parsed_root]

    def __iter__(self) -> Iterator[StructElement]:
        """Iterate over the top-level elements of the structure tree."""
        return iter(self.children)

find(matcher)

Find the first matching element in subtree.

The matcher argument is either an element name, a regular expression, or a function taking a StructElement and returning True if the element matches.

Source code in playa/structtree.py
84
85
86
87
88
89
90
91
92
93
94
95
96
def find(
    self, matcher: Union[str, Pattern[str], MatchFunc]
) -> Union["StructElement", None]:
    """Return the first element in the subtree accepted by `matcher`.

    `matcher` may be an element name, a regular expression, or a
    function which takes a `StructElement` and returns `True` when
    that element matches.  If nothing matches, `None` is returned.
    """
    candidates = _find_all(self.children, matcher)
    # Supplying a default makes exhaustion yield None instead of raising
    return next(candidates, None)

find_all(matcher)

Iterate depth-first over matching elements in subtree.

The matcher argument is either an element name, a regular expression, or a function taking a StructElement and returning True if the element matches.

Source code in playa/structtree.py
73
74
75
76
77
78
79
80
81
82
def find_all(
    self, matcher: Union[str, Pattern[str], MatchFunc]
) -> Iterator["StructElement"]:
    """Iterate, depth-first, over the elements in the subtree accepted
    by `matcher`.

    `matcher` may be an element name, a regular expression, or a
    function which takes a `StructElement` and returns `True` when
    that element matches.
    """
    subtree = self.children
    return _find_all(subtree, matcher)