Skip to content

Reference

playa

PLAYA ain't a LAYout Analyzer... but it can help you get stuff out of PDFs.

Basic usage:

# Open the document as a context manager so that it is closed on exit.
with playa.open(path) as pdf:
    for page in pdf.pages:
        print(f"page {page.label}:")
        # Iterating over a page yields its objects one at a time.
        for obj in page:
            print(f"    {obj.object_type} at {obj.bbox}")
            # Only objects whose type is "text" are asked for their characters.
            if obj.object_type == "text":
                print(f"        chars: {obj.chars}")

open(path, *, password='', space='screen', max_workers=1, mp_context=None)

Open a PDF document from a path on the filesystem.

Parameters:

Name Type Description Default
path Union[PathLike, str]

Path to the document to open.

required
space DeviceSpace

Device space to use ("screen" for screen-like coordinates, "page" for pdfminer.six-like coordinates, "default" for default user space with no rotation or translation)

'screen'
max_workers Union[int, None]

Number of worker processes to use for parallel processing of pages (if 1, no workers are spawned)

1
mp_context Union[BaseContext, None]

Multiprocessing context to use for worker processes, see Contexts and Start Methods for more information.

None
Source code in playa/__init__.py
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
def open(
    path: Union[PathLike, str],
    *,
    password: str = "",
    space: DeviceSpace = "screen",
    max_workers: Union[int, None] = 1,
    mp_context: Union[BaseContext, None] = None,
) -> Document:
    """Open a PDF document from a path on the filesystem.

    The returned document owns the underlying file handle and closes
    it in `Document.close` (or on leaving a `with` block).

    Args:
        path: Path to the document to open.
        password: Password for decryption, if needed.
        space: Device space to use ("screen" for screen-like
               coordinates, "page" for pdfminer.six-like coordinates, "default" for
               default user space with no rotation or translation)
        max_workers: Number of worker processes to use for parallel
                     processing of pages (if 1, no workers are spawned)
        mp_context: Multiprocessing context to use for worker
                    processes, see [Contexts and Start
                    Methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods)
                    for more information.

    Returns:
        The opened `Document`.

    Raises:
        OSError: If the file cannot be opened.
        Exception: Anything raised while constructing the `Document`
            or its process pool is propagated after the file handle
            has been closed.
    """
    # This function shadows the builtin, hence the explicit `builtins.open`.
    fp = builtins.open(path, "rb")
    try:
        pdf = Document(fp, password=password, space=space)
        pdf._fp = fp
        if max_workers is None or max_workers > 1:
            pdf._pool = ProcessPoolExecutor(
                max_workers=max_workers,
                mp_context=mp_context,
                initializer=_init_worker,  # type: ignore[arg-type]
                initargs=(id(pdf), path, password, space),  # type: ignore[arg-type]
            )
    except BaseException:
        # Nobody else holds the handle yet, so close it here rather
        # than leaking it when construction fails (e.g. wrong password).
        fp.close()
        raise
    return pdf

parse(buffer, *, password='', space='screen', max_workers=1, mp_context=None)

Read a PDF document from binary data.

Potential slowness

Currently, when using multiple processes, the entire buffer is copied to the worker processes, which may cause some overhead. It is preferable to use open on a filesystem path if possible, since that uses memory-mapped I/O.

Parameters:

Name Type Description Default
buffer bytes

Buffer containing PDF data.

required
space DeviceSpace

Device space to use ("screen" for screen-like coordinates, "page" for pdfminer.six-like coordinates, "default" for default user space with no rotation or translation)

'screen'
max_workers Union[int, None]

Number of worker processes to use for parallel processing of pages (if 1, no workers are spawned)

1
mp_context Union[BaseContext, None]

Multiprocessing context to use for worker processes, see Contexts and Start Methods for more information.

None
Source code in playa/__init__.py
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
def parse(
    buffer: bytes,
    *,
    password: str = "",
    space: DeviceSpace = "screen",
    max_workers: Union[int, None] = 1,
    mp_context: Union[BaseContext, None] = None,
) -> Document:
    """Build a `Document` from PDF data already held in memory.

    Note: Potential slowness
        With more than one worker, the whole buffer is handed to the
        worker initializer, so it gets copied to the worker
        processes.  Prefer `open` on a filesystem path when you can,
        since that uses memory-mapped I/O.

    Args:
        buffer: Buffer containing PDF data.
        password: Password for decryption, if needed.
        space: Device space to use ("screen" for screen-like
               coordinates, "page" for pdfminer.six-like coordinates, "default" for
               default user space with no rotation or translation)
        max_workers: Number of worker processes to use for parallel
                     processing of pages (if 1, no workers are spawned)
        mp_context: Multiprocessing context to use for worker
                    processes, see [Contexts and Start
                    Methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods)
                    for more information.

    Returns:
        The parsed `Document`.
    """
    document = Document(buffer, password=password, space=space)
    # `None` lets the executor pick its own worker count; any explicit
    # value of 1 or less means "stay in this process".
    wants_pool = max_workers is None or max_workers > 1
    if not wants_pool:
        return document
    document._pool = ProcessPoolExecutor(
        max_workers=max_workers,
        mp_context=mp_context,
        initializer=_init_worker_buffer,  # type: ignore[arg-type]
        initargs=(id(document), buffer, password, space),  # type: ignore[arg-type]
    )
    return document

playa.document

Basic classes for PDF document parsing.

Destinations

Mapping of named destinations.

These either come as a NameTree or a dict, depending on the version of the PDF standard, so this abstracts that away.

Source code in playa/document.py
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
class Destinations:
    """Mapping of named destinations.

    These either come as a NameTree or a dict, depending on the
    version of the PDF standard, so this abstracts that away.

    Parsed destinations are cached in `self.dests`, keyed by their
    decoded name, so each one is only built once.
    """

    # Exactly one of these is set when the catalog has destinations;
    # both stay None when it has none (or they are malformed).
    dests_dict: Union[Dict[str, PDFObject], None] = None
    dests_tree: Union[NameTree, None] = None

    def __init__(self, doc: Document) -> None:
        """Locate the destinations of `doc` without parsing them.

        Args:
            doc: The document whose catalog is inspected.
        """
        self._docref = _ref_document(doc)
        self.dests: Dict[str, Destination] = {}
        if "Dests" in doc.catalog:
            # PDF-1.1: dictionary
            dests_dict = resolve1(doc.catalog["Dests"])
            if isinstance(dests_dict, dict):
                self.dests_dict = dests_dict
            else:
                # Log the offending value itself, not the (still None)
                # attribute.
                log.warning(
                    "Dests entry in catalog is not dictionary: %r", dests_dict
                )
                self.dests_dict = None
        elif "Names" in doc.catalog:
            names = resolve1(doc.catalog["Names"])
            if not isinstance(names, dict):
                log.warning("Names entry in catalog is not dictionary: %r", names)
                return
            if "Dests" in names:
                dests = resolve1(names["Dests"])
                # Validate the tree root itself; `names` was already
                # checked above.
                if not isinstance(dests, dict):
                    log.warning("Dests entry in names is not dictionary: %r", dests)
                    return
                self.dests_tree = NameTree(dests)

    def __iter__(self) -> Iterator[str]:
        """Iterate over names of destinations.

        Danger: Beware of corrupted PDFs
            This simply iterates over the names listed in the PDF, and
            does not attempt to actually parse the destinations
            (because that's pretty slow).  If the PDF is broken, you
            may encounter exceptions when actually trying to access
            them by name.
        """
        if self.dests_dict is not None:
            yield from self.dests_dict
        elif self.dests_tree is not None:
            for kb, _ in self.dests_tree:
                yield decode_text(kb)

    def items(self) -> Iterator[Tuple[str, Destination]]:
        """Iterate over named destinations.

        Yields:
            `(name, destination)` pairs; each destination is parsed on
            first use and then served from the cache.

        Raises:
            TypeError: If a destination entry is malformed.
        """
        if self.dests_dict is not None:
            for name, raw in self.dests_dict.items():
                if name not in self.dests:
                    self.dests[name] = self._create_dest(resolve1(raw), name)
                yield name, self.dests[name]
        elif self.dests_tree is not None:
            for k, v in self.dests_tree:
                name = decode_text(k)
                if name not in self.dests:
                    self.dests[name] = self._create_dest(resolve1(v), name)
                yield name, self.dests[name]

    def __getitem__(self, name: Union[bytes, str, PSLiteral]) -> Destination:
        """Get a named destination.

        Args:
            name: The name of the destination.

        Raises:
            KeyError: If no such destination exists.
            TypeError: If the PDF is damaged and the destinations tree
                contains something unexpected or missing.
        """
        if isinstance(name, bytes):
            name = decode_text(name)
        elif isinstance(name, PSLiteral):
            name = literal_name(name)
        if name in self.dests:
            return self.dests[name]
        elif self.dests_dict is not None:
            # This will raise KeyError or TypeError if necessary, so
            # we don't have to do it explicitly
            dest = resolve1(self.dests_dict[name])
            self.dests[name] = self._create_dest(dest, name)
        elif self.dests_tree is not None:
            # This is not at all efficient, but we need to decode
            # the keys (and we cache the result...)
            for k, v in self.dests_tree:
                if decode_text(k) == name:
                    dest = resolve1(v)
                    self.dests[name] = self._create_dest(dest, name)
                    break
        # This will also raise KeyError if necessary
        return self.dests[name]

    def _create_dest(self, dest: PDFObject, name: str) -> Destination:
        """Build a `Destination` from a resolved destination entry.

        Args:
            dest: Either a destination list, or a dictionary holding
                one under its "D" key.
            name: Name of the destination, used in error messages.

        Raises:
            TypeError: If `dest` has neither of those shapes.
        """
        if isinstance(dest, list):
            return Destination.from_list(self.doc, dest)
        elif isinstance(dest, dict) and "D" in dest:
            destlist = resolve1(dest["D"])
            if not isinstance(destlist, list):
                raise TypeError(f"Invalid destination for {name}: {dest!r}")
            return Destination.from_list(self.doc, destlist)
        else:
            raise TypeError(f"Invalid destination for {name}: {dest!r}")

    @property
    def doc(self) -> "Document":
        """Get associated document if it exists."""
        return _deref_document(self._docref)

doc property

Get associated document if it exists.

__getitem__(name)

Get a named destination.

Parameters:

Name Type Description Default
name Union[bytes, str, PSLiteral]

The name of the destination.

required

Raises:

Type Description
KeyError

If no such destination exists.

TypeError

If the PDF is damaged and the destinations tree contains something unexpected or missing.

Source code in playa/document.py
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
def __getitem__(self, name: Union[bytes, str, PSLiteral]) -> Destination:
    """Look up one named destination, parsing and caching it on demand.

    Args:
        name: The name of the destination; byte strings and literal
            names are converted to `str` before the lookup.

    Raises:
        KeyError: If no such destination exists.
        TypeError: If the PDF is damaged and the destinations tree
            contains something unexpected or missing.
    """
    if isinstance(name, bytes):
        key = decode_text(name)
    elif isinstance(name, PSLiteral):
        key = literal_name(name)
    else:
        key = name
    if key not in self.dests:
        if self.dests_dict is not None:
            # A missing key raises KeyError right here, a malformed
            # entry raises TypeError from _create_dest.
            resolved = resolve1(self.dests_dict[key])
            self.dests[key] = self._create_dest(resolved, key)
        elif self.dests_tree is not None:
            # Linear scan: tree keys have to be decoded before they
            # can be compared (the hit is cached afterwards).
            for raw_key, value in self.dests_tree:
                if decode_text(raw_key) != key:
                    continue
                self.dests[key] = self._create_dest(resolve1(value), key)
                break
    # Still absent at this point means KeyError for the caller.
    return self.dests[key]

__iter__()

Iterate over names of destinations.

Beware of corrupted PDFs

This simply iterates over the names listed in the PDF, and does not attempt to actually parse the destinations (because that's pretty slow). If the PDF is broken, you may encounter exceptions when actually trying to access them by name.

Source code in playa/document.py
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
def __iter__(self) -> Iterator[str]:
    """Yield the name of every destination, without parsing any of them.

    Danger: Beware of corrupted PDFs
        Only the names listed in the PDF are produced; the
        destinations themselves are not parsed (that would be
        slow).  With a broken PDF, looking one of these names up
        afterwards may therefore still raise.
    """
    names = self.dests_dict
    if names is not None:
        yield from names
        return
    tree = self.dests_tree
    if tree is None:
        return
    for raw_name, _ in tree:
        yield decode_text(raw_name)

items()

Iterate over named destinations.

Source code in playa/document.py
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
def items(self) -> Iterator[Tuple[str, Destination]]:
    """Yield `(name, destination)` pairs, parsing each one at most once."""
    if self.dests_dict is not None:
        for name in self.dests_dict:
            if name not in self.dests:
                resolved = resolve1(self.dests_dict[name])
                self.dests[name] = self._create_dest(resolved, name)
            yield name, self.dests[name]
        return
    if self.dests_tree is None:
        return
    for raw_name, value in self.dests_tree:
        name = decode_text(raw_name)
        if name not in self.dests:
            resolved = resolve1(value)
            self.dests[name] = self._create_dest(resolved, name)
        yield name, self.dests[name]

Document

Representation of a PDF document.

Since PDF documents can be very large and complex, merely creating a Document does very little aside from verifying that the password is correct and getting a minimal amount of metadata. In general, PLAYA will try to open just about anything as a PDF, so you should not expect the constructor to fail here if you give it nonsense (something else may fail later on).

Some metadata, such as the structure tree and page tree, will be loaded lazily and cached. We do not handle modification of PDFs.

Parameters:

Name Type Description Default
fp Union[BinaryIO, bytes]

File-like object in binary mode, or a buffer with binary data. Files will be read using mmap if possible. They do not need to be seekable: if mmap fails, the entire file is simply read into memory (so a pipe or socket ought to work).

required
password str

Password for decryption, if needed.

''
space DeviceSpace

The device space to use for interpreting content ("screen", "page", or "default").

'screen'

Raises:

Type Description
TypeError

if fp is a file opened in text mode (don't do that!)

PDFEncryptionError

if the PDF has an unsupported encryption scheme

PDFPasswordIncorrect

if the password is incorrect

Source code in playa/document.py
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
class Document:
    """Representation of a PDF document.

    Since PDF documents can be very large and complex, merely creating
    a `Document` does very little aside from verifying that the
    password is correct and getting a minimal amount of metadata.  In
    general, PLAYA will try to open just about anything as a PDF, so
    you should not expect the constructor to fail here if you give it
    nonsense (something else may fail later on).

    Some metadata, such as the structure tree and page tree, will be
    loaded lazily and cached.  We do not handle modification of PDFs.

    A `Document` is also a context manager: leaving the `with` block
    calls `close`.

    Args:
      fp: File-like object in binary mode, or a buffer with binary data.
          Files will be read using `mmap` if possible.  They do not need
          to be seekable: if `mmap` fails, the entire file will simply
          be read into memory (so a pipe or socket ought to work).
      password: Password for decryption, if needed.
      space: the device space to use for interpreting content ("screen",
          "page" or "default")

    Raises:
      TypeError: if `fp` is a file opened in text mode (don't do that!)
      PDFEncryptionError: if the PDF has an unsupported encryption scheme
      PDFPasswordIncorrect: if the password is incorrect
    """

    # File handle owned by this document; closed and reset by `close()`.
    _fp: Union[BinaryIO, None] = None
    # NOTE(review): presumably a lazily built cache; its accessor is
    # not visible here.
    _pages: Union["PageList", None] = None
    # Worker pool, if any; shut down and reset by `close()`.
    _pool: Union[Executor, None] = None
    # NOTE(review): presumably lazily built caches; their accessors are
    # not visible here.
    _outline: Union["Outline", None] = None
    _destinations: Union["Destinations", None] = None
    # Deliberately has no default value: the `structure` property uses
    # `hasattr` to tell "not loaded yet" apart from "loaded, but None".
    _structure: Union["Tree", None]
    # NOTE(review): presumably a lazily built cache; its accessor is
    # not visible here.
    _fontmap: Union[Dict[str, Font], None] = None

    def __enter__(self) -> "Document":
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        self.close()

    def close(self) -> None:
        # If we were opened from a file then close it
        if self._fp:
            self._fp.close()
            self._fp = None
        # Shutdown process pool
        if self._pool:
            self._pool.shutdown()
            self._pool = None

    def __init__(
        self,
        fp: Union[BinaryIO, bytes],
        password: str = "",
        space: DeviceSpace = "screen",
        _boss_id: int = 0,
    ) -> None:
        """Read the cross-reference tables and trailer(s) of a PDF.

        Args:
          fp: Binary file-like object or buffer holding the PDF data.
          password: Password handed to the security handler when the
              trailer has an /Encrypt entry.
          space: Device space, stored as `self.space`.
          _boss_id: Internal; when non-zero the document registers
              itself under this id and must be running in a worker.

        Raises:
          TypeError: if `fp` is a text-mode file.
        """
        if _boss_id:
            # Set this **right away** because it is needed to get
            # indirect object references right.
            _set_document(self, _boss_id)
            assert in_worker()
        self.xrefs: List[XRef] = []
        self.space = space
        # One entry per trailer that carries a readable /Info dictionary.
        self.info = []
        self.catalog: Dict[str, Any] = {}
        self.encryption: Optional[Tuple[Any, Any]] = None
        self.decipher: Optional[DecipherCallable] = None
        # Caches keyed by object id.
        self._cached_objs: Dict[int, PDFObject] = {}
        self._parsed_objs: Dict[int, Tuple[List[PDFObject], int]] = {}
        self._cached_fonts: Dict[int, Font] = {}
        if isinstance(fp, io.TextIOBase):
            raise TypeError("fp is not a binary file")
        self.pdf_version, self.offset, self.buffer = _open_input(fp)
        # Permissions default to "everything allowed"; a security
        # handler may restrict them in _initialize_password.
        self.is_printable = self.is_modifiable = self.is_extractable = True
        # Getting the XRef table and trailer is done non-lazily
        # because they contain encryption information among other
        # things.  As noted above we don't try to look for the first
        # page cross-reference table (for linearized PDFs) after the
        # header, it will instead be loaded with all the rest.
        self.parser = IndirectObjectParser(self.buffer, self)
        self.parser.seek(self.offset)
        self._xrefpos: Set[int] = set()
        try:
            self._read_xrefs()
        except Exception as e:
            # Deliberately broad: any failure to read the xref tables
            # falls back to the fallback xref implementation.
            log.debug(
                "Failed to parse xref table, falling back to object parser: %s",
                e,
            )
            newxref = XRefFallback(self.parser)
            self.xrefs.append(newxref)
        # Now find the trailer
        for xref in self.xrefs:
            trailer = xref.trailer
            if not trailer:
                continue
            # If there's an encryption info, remember it.
            if "Encrypt" in trailer:
                if "ID" in trailer:
                    id_value = list_value(trailer["ID"])
                else:
                    # Some documents may not have a /ID, use two empty
                    # byte strings instead. Solves
                    # https://github.com/pdfminer/pdfminer.six/issues/594
                    id_value = (b"", b"")
                self.encryption = (id_value, dict_value(trailer["Encrypt"]))
                self._initialize_password(password)
            if "Info" in trailer:
                try:
                    self.info.append(dict_value(trailer["Info"]))
                except TypeError:
                    log.warning("Info is a broken reference (incorrect xref table?)")
            if "Root" in trailer:
                # Every PDF file must have exactly one /Root dictionary.
                try:
                    self.catalog = dict_value(trailer["Root"])
                except TypeError:
                    log.warning("Root is a broken reference (incorrect xref table?)")
                    self.catalog = {}
                # Stop at the first trailer that has a /Root entry.
                break
        else:
            # Only reached when the loop never hit `break`, i.e. no
            # trailer had a /Root entry.
            log.warning("No /Root object! - Is this really a PDF?")
        if self.catalog.get("Type") is not LITERAL_CATALOG:
            log.warning("Catalog not found!")
        if "Version" in self.catalog:
            log.debug(
                "Using PDF version %r from catalog instead of %r from header",
                self.catalog["Version"],
                self.pdf_version,
            )
            self.pdf_version = literal_name(self.catalog["Version"])
        # Tagged only if the catalog's MarkInfo dictionary has a truthy
        # "Marked" entry.
        self.is_tagged = False
        markinfo = resolve1(self.catalog.get("MarkInfo"))
        if isinstance(markinfo, dict):
            self.is_tagged = not not markinfo.get("Marked")

    def _read_xrefs(self) -> None:
        """Read the cross-reference tables into `self.xrefs`.

        If reading from the position found by `_find_xref` fails, one
        retry is made at an offset adjusted by
        `_detect_concatenation`.

        Raises:
          PDFSyntaxError: if no xref position can be found, or the
              tables cannot be read even at the adjusted offset.
        """
        try:
            xrefpos = self._find_xref()
        except Exception as e:
            raise PDFSyntaxError("No xref table found at end of file") from e
        try:
            self._read_xref_from(xrefpos, self.xrefs)
            return
        except (ValueError, IndexError, StopIteration, PDFSyntaxError) as e:
            log.warning("Checking for two PDFs in a trenchcoat: %s", e)
            # -1 means no adjusted position could be worked out.
            xrefpos = self._detect_concatenation(xrefpos)
            if xrefpos == -1:
                raise PDFSyntaxError("Failed to read xref table at end of file") from e
        # Second and last attempt, at the adjusted position.
        try:
            self._read_xref_from(xrefpos, self.xrefs)
        except (ValueError, IndexError, StopIteration, PDFSyntaxError) as e:
            raise PDFSyntaxError(
                "Failed to read xref table with adjusted offset"
            ) from e

    def _detect_concatenation(self, xrefpos: int) -> int:
        # Detect the case where two (or more) PDFs have been
        # concatenated, or where somebody tried an "incremental
        # update" without updating the xref table
        filestart = self.buffer.rfind(b"%%EOF")
        log.debug("Found ultimate %%EOF at %d", filestart)
        if filestart != -1:
            filestart = self.buffer.rfind(b"%%EOF", 0, filestart)
            log.debug("Found penultimate %%EOF at %d", filestart)
        if filestart != -1:
            filestart += 5
            while self.buffer[filestart] in (10, 13):
                filestart += 1
            parser = ObjectParser(self.buffer, self, filestart + xrefpos)
            try:
                (pos, token) = parser.nexttoken()
            except StopIteration:
                raise ValueError(f"Unexpected EOF at {filestart}")
            if token is KEYWORD_XREF:
                log.debug(
                    "Found two PDFs in a trenchcoat at %d "
                    "(second xref is at %d not %d)",
                    filestart,
                    pos,
                    xrefpos,
                )
                self.offset = filestart
                return pos
        return -1

    def _initialize_password(self, password: str = "") -> None:
        """Set up decryption for this document with the given password.

        Internal function: `self.encryption` must already hold the
        pair built from the trailer's ID and Encrypt entries.

        Args:
            password: Password passed on to the security handler.

        Raises:
            PDFEncryptionError: if the filter is not "Standard" or the
                algorithm version has no registered handler.
        """
        assert self.encryption is not None
        doc_id, enc_dict = self.encryption
        filter_name = literal_name(enc_dict.get("Filter"))
        if filter_name != "Standard":
            raise PDFEncryptionError("Unknown filter: param=%r" % enc_dict)
        version = int_value(enc_dict.get("V", 0))
        # 3 (PDF 1.4) An unpublished algorithm that permits encryption
        # key lengths ranging from 40 to 128 bits. This value shall
        # not appear in a conforming PDF file.
        if version == 3:
            raise PDFEncryptionError("Unpublished algorithm 3 not supported")
        make_handler = SECURITY_HANDLERS.get(version)
        # 0 An algorithm that is undocumented. This value shall not be used.
        if make_handler is None:
            raise PDFEncryptionError("Unknown algorithm: param=%r" % enc_dict)
        handler = make_handler(doc_id, enc_dict, password)
        self.decipher = handler.decrypt
        # Take the permission flags from the handler.
        self.is_printable = handler.is_printable
        self.is_modifiable = handler.is_modifiable
        self.is_extractable = handler.is_extractable
        assert self.parser is not None
        # Ensure that no extra data leaks into encrypted streams
        self.parser.strict = True
        self.parser.decipher = self.decipher

    def __iter__(self) -> Iterator[IndirectObject]:
        """Iterate over the top-level `IndirectObject`s of the file.

        Object streams are returned as they are, not expanded; see
        `objects` for that.
        """
        parser = IndirectObjectParser(
            self.buffer, self, pos=self.offset, strict=self.parser.strict
        )
        # The parser yields (position, object) pairs; callers only
        # want the object.
        return (indobj for _, indobj in parser)

    @property
    def objects(self) -> Iterator[IndirectObject]:
        """Iterate over every indirect object in the file.

        Each top-level object is yielded first; when it is an object
        stream, the objects stored inside it follow immediately.
        """
        top_level = IndirectObjectParser(
            self.buffer, self, pos=self.offset, strict=self.parser.strict
        )
        for _, indobj in top_level:
            yield indobj
            inner = indobj.obj
            if not isinstance(inner, ContentStream):
                continue
            if inner.get("Type") is not LITERAL_OBJSTM:
                continue
            # An object stream: yield the objects packed inside it too.
            for _, packed in ObjectStreamParser(inner, self):
                yield packed

    @property
    def tokens(self) -> Iterator[Token]:
        """Iterate over the tokens the lexer produces for the whole buffer."""
        lexer = Lexer(self.buffer)
        # Positions are dropped, only the tokens are passed on.
        return (token for _, token in lexer)

    @property
    def structure(self) -> Union[Tree, None]:
        """Logical structure of this document, if any.

        In the case where no logical structure tree exists, this will
        be `None`.  Otherwise you may iterate over it, search it, etc.
        We do this instead of simply returning an empty structure
        tree because the vast majority of PDFs have no logical
        structure.
        """
        if hasattr(self, "_structure"):
            return self._structure
        try:
            self._structure = Tree(self)
        except (TypeError, KeyError):
            self._structure = None
        return self._structure

    def _getobj_objstm(
        self, stream: ContentStream, index: int, objid: int
    ) -> PDFObject:
        if stream.objid in self._parsed_objs:
            (objs, n) = self._parsed_objs[stream.objid]
        else:
            (objs, n) = self._get_objects(stream)
            assert stream.objid is not None
            self._parsed_objs[stream.objid] = (objs, n)
        i = n * 2 + index
        try:
            obj = objs[i]
        except IndexError:
            raise PDFSyntaxError("index too big: %r" % index)
        return obj

    def _get_objects(self, stream: ContentStream) -> Tuple[List[PDFObject], int]:
        """Parse everything in an object stream.

        Args:
            stream: The stream to parse; a warning is logged if its
                Type is not /ObjStm.

        Returns:
            A pair of (all parsed objects, value of the stream's "N"
            entry).  The second item is 0 when "N" is missing or
            invalid.
        """
        if stream.get("Type") is not LITERAL_OBJSTM:
            log.warning("Content stream Type is not /ObjStm: %r" % stream)
        n = 0
        try:
            n = int_value(stream["N"])
        except KeyError:
            log.warning("N is not defined in content stream: %r" % stream)
        except TypeError:
            log.warning("N is invalid in content stream: %r" % stream)
        objs: List[PDFObject] = []
        # The parser yields (position, object) pairs.
        for _, parsed in ObjectParser(stream.buffer, self):
            objs.append(parsed)
        return objs, n

    def _getobj_parse(self, pos: int, objid: int) -> PDFObject:
        """Parse the indirect object `objid` expected at offset `pos`.

        If nothing valid is found there, or the object found has a
        different id, the whole buffer is searched for an
        "<objid> <generation> obj" header and the match with the
        highest generation number is parsed instead.

        Args:
            pos: Offset of the object according to the xref table.
            objid: Id of the object to fetch.

        Returns:
            The object itself (not the `IndirectObject` wrapper).

        Raises:
            PDFSyntaxError: if the object cannot be found anywhere, or
                the object found by the search has the wrong id.
        """
        assert self.parser is not None
        self.parser.seek(pos)
        try:
            _, obj = next(self.parser)
            if obj.objid != objid:
                raise PDFSyntaxError(f"objid mismatch: {obj.objid!r}={objid!r}")
        except (ValueError, IndexError, PDFSyntaxError) as e:
            log.warning(
                "Indirect object %d not found at position %d: %r", objid, pos, e
            )
            # In case of malformed pdf files where the offset in the
            # xref table doesn't point exactly at the object
            # definition (probably more frequent than you think), just
            # use a regular expression to find the object because we
            # can do that.
            realpos = -1
            lastgen = -1
            # The lookbehind stops the id from matching as the tail of
            # a longer number (e.g. id 2 inside "12 0 obj"), which
            # would otherwise parse a different object's body.
            for m in re.finditer(rb"(?<!\d)%d\s+(\d+)\s+obj" % objid, self.buffer):
                genno = int(m.group(1))
                if genno > lastgen:
                    lastgen = genno
                    realpos = m.start(0)
            if realpos == -1:
                raise PDFSyntaxError(
                    f"Indirect object {objid!r} not found in document"
                ) from e
            self.parser.seek(realpos)
            (_, obj) = next(self.parser)
        # Re-check, since the object may have come from the search.
        if obj.objid != objid:
            raise PDFSyntaxError(f"objid mismatch: {obj.objid!r}={objid!r}")
        return obj.obj

    def __getitem__(self, objid: int) -> PDFObject:
        """Look up an indirect object by its object ID.

        Successful lookups are cached, so each object is only located
        and parsed once.

        Raising `IndexError` for a missing object is the Pythonic
        choice, but not the PDFic one, since PDF 1.7 sec 7.3.10 says:

        > An indirect reference to an undefined object shall not be
        considered an error by a conforming reader; it shall be
        treated as a reference to the null object.

        Raises:
          ValueError: if Document is not initialized
          IndexError: if objid does not exist in PDF

        """
        if not self.xrefs:
            raise ValueError("Document is not initialized")
        if objid in self._cached_objs:
            return self._cached_objs[objid]

        found = None
        # Try each cross-reference section in order until one of them
        # both knows the object and lets us read it successfully.
        for xref in self.xrefs:
            try:
                strmid, index, _genno = xref.get_pos(objid)
            except KeyError:
                continue
            try:
                if strmid is None:
                    # Plain object: `index` is its position in the file
                    found = self._getobj_parse(index, objid)
                else:
                    # Compressed object: `index` is its slot in the
                    # object stream with ID `strmid`
                    container = stream_value(self[strmid])
                    found = self._getobj_objstm(container, index, objid)
            # FIXME: We might not actually want to catch these...
            except StopIteration:
                log.debug("EOF when searching for object %d", objid)
            except PDFSyntaxError as e:
                log.debug("Syntax error when searching for object %d: %s", objid, e)
            else:
                break

        if found is None:
            raise IndexError(f"Object with ID {objid} not found")
        self._cached_objs[objid] = found
        return self._cached_objs[objid]

    def get_font(
        self, objid: int = 0, spec: Union[Dict[str, PDFObject], None] = None
    ) -> Font:
        """Create the font described by `spec`, or fetch it from the cache.

        Args:
            objid: object ID used as the cache key.  A falsy value (the
                default, 0) means the cache is neither consulted nor
                updated.
            spec: font dictionary.  If it is `None` and nothing is
                cached under `objid`, an empty placeholder font is
                returned without being cached.

        Returns:
            A font object whose class is selected by the `Subtype`
            entry of `spec`.  If the subtype is unknown, or a Type0
            font has missing, empty or invalid `DescendantFonts`, a
            warning is logged and a placeholder `Font({}, {})` is
            returned instead.
        """
        if objid and objid in self._cached_fonts:
            return self._cached_fonts[objid]
        if spec is None:
            return Font({}, {})
        # Create a Font object, hopefully
        font: Union[Font, None] = None
        if spec.get("Type") is not LITERAL_FONT:
            log.warning("Font Type is not /Font: %r", spec)
        subtype = spec.get("Subtype")
        if subtype in (LITERAL_TYPE1, LITERAL_MMTYPE1):
            font = Type1Font(spec)
        elif subtype is LITERAL_TRUETYPE:
            font = TrueTypeFont(spec)
        elif subtype == LITERAL_TYPE3:
            font = Type3Font(spec)
        elif subtype == LITERAL_TYPE0:
            if "DescendantFonts" not in spec:
                log.warning("Type0 font has no DescendantFonts: %r", spec)
            else:
                dfonts = list_value(spec["DescendantFonts"])
                if not dfonts:
                    # Nothing to index into: fall through to the
                    # placeholder font rather than raising IndexError.
                    log.warning("Type0 font has empty DescendantFonts: %r", spec)
                else:
                    if len(dfonts) != 1:
                        log.debug(
                            "Type 0 font should have 1 descendant, has more: %r",
                            dfonts,
                        )
                    subspec = resolve1(dfonts[0])
                    if not isinstance(subspec, dict):
                        log.warning("Invalid descendant font: %r", subspec)
                    else:
                        subspec = subspec.copy()
                        # Merge the root and descendant font dictionaries
                        for k in ("Encoding", "ToUnicode"):
                            if k in spec:
                                subspec[k] = resolve1(spec[k])
                        font = CIDFont(subspec)
        else:
            log.warning("Unknown Subtype in font: %r", spec)
        if font is None:
            # We need a dummy font object to be able to do *something*
            # (even if it's the wrong thing) with text objects.
            font = Font({}, {})
        if objid:
            self._cached_fonts[objid] = font
        return font

    @property
    def fonts(self) -> Mapping[str, Font]:
        """Get the mapping of font names to fonts for this document.

        Note that this can be quite slow the first time it's accessed
        as it must scan every single page in the document.

        Note: Font names may collide.
            Font names are generally understood to be globally unique
            <del>in the neighbourhood</del> in the document, but there's no
            guarantee that this is the case.  In keeping with the
            "incremental update" philosophy dear to PDF, you get the
            last font with a given name.

        Danger: Do not rely on this being a `dict`.
            Currently this is implemented eagerly, but in the future it
            may return a lazy object which only loads fonts on demand.

        """
        if self._fontmap is not None:
            return self._fontmap
        # Build the map locally and publish it only once the scan has
        # completed, so that an exception raised part-way through does
        # not leave a partial map cached for later accesses.
        fontmap: Dict[str, Font] = {}
        for page in self.pages:
            for font in page.fonts.values():
                # Later pages win when two fonts share a name
                fontmap[font.fontname] = font
        self._fontmap = fontmap
        return fontmap

    @property
    def outline(self) -> Union[Outline, None]:
        """Outline of the document, or `None` if it has none.

        The outline is created on first access and then reused.  If the
        catalog has an `Outlines` entry from which no outline can be
        built (a `TypeError` is raised), a warning is logged and `None`
        is returned.
        """
        if "Outlines" not in self.catalog:
            return None
        if self._outline is not None:
            return self._outline
        try:
            self._outline = Outline(self)
        except TypeError:
            log.warning(
                "Invalid Outlines entry in catalog: %r", self.catalog["Outlines"]
            )
            return None
        return self._outline

    @property
    def page_labels(self) -> Iterator[str]:
        """Iterator over the page label strings of the document.

        Labels are produced one per page from the `PageLabels` entry
        of the catalog; if that entry is absent, `KeyError` is raised.

        Because the page label tree does not actually cover all the
        pages, the iterator never ends by itself.  Using `pages` is
        therefore recommended instead.

        Raises:
          KeyError: No page labels are present in the catalog

        """
        assert self.catalog is not None  # really it cannot be None

        return PageLabels(self.catalog["PageLabels"]).labels

    # Alias for the page dictionaries yielded (paired with their object
    # IDs) by `_get_pages_from_xrefs` and `_get_page_objects`.
    PageType = Dict[Any, Dict[Any, Any]]

    def _get_pages_from_xrefs(self) -> Iterator[Tuple[int, PageType]]:
        """Recover pages by scanning the cross-reference tables.

        This is the fallback for documents with no page tree (which
        only happens in invalid PDFs, but it does happen).  Every
        object ID listed by the xrefs is looked up, and dictionaries
        whose `Type` is `/Page` are yielded.  IDs that cannot be
        resolved are silently ignored.

        Returns:
          an iterator over (objid, dict) pairs.
        """
        all_ids = (oid for xref in self.xrefs for oid in xref.objids)
        for object_id in all_ids:
            try:
                candidate = self[object_id]
                if not isinstance(candidate, dict):
                    continue
                if candidate.get("Type") is LITERAL_PAGE:
                    yield object_id, candidate
            except IndexError:
                continue

    def _get_page_objects(self) -> Iterator[Tuple[int, PageType]]:
        """Iterate over the flattened page tree in reading order, propagating
        inheritable attributes.  Returns an iterator over (objid, dict) pairs.

        Page tree entries that are neither indirect object references
        nor bare integers are logged and skipped.

        Raises:
          KeyError: if there is no page tree.
        """
        if "Pages" not in self.catalog:
            raise KeyError("No 'Pages' entry in catalog")
        # Explicit stack of (node, properties of its parent) pairs, so
        # that deep trees are walked without recursion.
        stack = [(self.catalog["Pages"], self.catalog)]
        visited = set()
        while stack:
            (obj, parent) = stack.pop()
            if isinstance(obj, ObjRef):
                # The PDF specification *requires* both the Pages
                # element of the catalog and the entries in Kids in
                # the page tree to be indirect references.
                object_id = int(obj.objid)
            elif isinstance(obj, int):
                # Should not happen in a valid PDF, but probably does?
                log.warning("Page tree contains bare integer: %r in %r", obj, parent)
                object_id = obj
            else:
                log.warning("Page tree contains unknown object: %r", obj)
                # There is no object ID to look up here, so skip the
                # entry rather than carrying on with `object_id`
                # unbound (or left over from the previous iteration).
                continue
            page_object = dict_value(self[object_id])

            # Avoid recursion errors by keeping track of visited nodes
            # (again, this should never actually happen in a valid PDF)
            if object_id in visited:
                log.warning("Circular reference %r in page tree", obj)
                continue
            visited.add(object_id)

            # Propagate inheritable attributes
            object_properties = page_object.copy()
            for k, v in parent.items():
                if k in INHERITABLE_PAGE_ATTRS and k not in object_properties:
                    object_properties[k] = v

            # Recurse, depth-first
            object_type = object_properties.get("Type")
            if object_type is None:
                log.warning("Page has no Type, trying type: %r", object_properties)
                object_type = object_properties.get("type")
            if object_type is LITERAL_PAGES and "Kids" in object_properties:
                # Push in reverse so that the first kid is popped first
                for child in reversed(list_value(object_properties["Kids"])):
                    stack.append((child, object_properties))
            elif object_type is LITERAL_PAGE:
                yield object_id, object_properties

    @property
    def pages(self) -> "PageList":
        """The document's pages, as an iterable/addressable `PageList`
        which is created on first access and reused afterwards."""
        page_list = self._pages
        if page_list is None:
            page_list = self._pages = PageList(self)
        return page_list

    @property
    def names(self) -> Dict[str, Any]:
        """The name dictionary of the document (PDF 1.7 sec 7.7.4),
        taken from the `Names` entry of the catalog.

        Raises:
          KeyError: if nonexistent.
        """
        names_entry = self.catalog["Names"]
        return dict_value(names_entry)

    @property
    def destinations(self) -> "Destinations":
        """The document's named destinations, as an iterable/addressable
        `Destinations` object created on first access and reused afterwards."""
        dests = self._destinations
        if dests is None:
            dests = self._destinations = Destinations(self)
        return dests

    def _find_xref(self) -> int:
        """Internal function used to locate the first XRef.

        The buffer is scanned backwards, line by line, for either a
        `startxref` keyword (in which case the number on the line after
        it, plus the document offset, is returned) or an `xref` keyword
        (in which case its own position is returned).  Reaching an
        `endobj` line first ends the search.

        Raises:
          ValueError: if no xref position is found, or if the one
            following `startxref` is out of range.
        """
        # Most recent non-blank line seen, i.e. the one which *follows*
        # the current line in the file since we are going backwards.
        following = b""
        for pos, raw in reverse_iter_lines(self.buffer):
            stripped = raw.strip()
            if stripped == b"endobj":
                # Okay, we're probably not in Kansas anymore...
                break
            if stripped == b"xref":
                return pos
            if stripped != b"startxref":
                if stripped:
                    following = stripped
                continue
            if not following.isdigit():
                log.warning("Invalid startxref position: %r", following)
                continue
            start = int(following)
            if start < 0:
                raise ValueError("Invalid negative startxref position: %d" % start)
            if start > pos:
                raise ValueError(
                    "Invalid startxref position (> %d): %d" % (pos, start)
                )
            return start + self.offset
        raise ValueError("No xref table found at end of file")

    # read xref table
    def _read_xref_from(
        self,
        start: int,
        xrefs: List[XRef],
    ) -> None:
        """Read the XRef found at `start`, then those it refers to.

        The XRef is appended to `xrefs`, after which the `XRefStm` and
        `Prev` entries of its trailer (in that order) are followed
        recursively.  Positions which were already read are skipped
        with a warning, so circular chains terminate.

        Raises:
          ValueError: if there is nothing to read at `start`.
        """
        if start in self._xrefpos:
            log.warning("Detected circular xref chain at %d", start)
            return
        parser = ObjectParser(self.buffer, self, start)
        try:
            first_pos, first_token = parser.nexttoken()
        except StopIteration:
            raise ValueError(f"Unexpected EOF at {start}")
        table: XRef
        if first_token is KEYWORD_XREF:
            log.debug("Reading xref table at %d", first_pos)
            parser.nextline()
            table = XRefTable(parser, self.offset)
        else:
            # It might be an XRefStream, if this is an indirect object,
            # in which case the third token is the `obj` keyword.
            parser.nexttoken()
            _, third_token = parser.nexttoken()
            if third_token is KEYWORD_OBJ:
                # XRefStream: PDF-1.5
                self.parser.seek(first_pos)
                self.parser.reset()
                table = XRefStream(self.parser, self.offset)
            else:
                # Well, maybe it's an XRef table without "xref" (but
                # probably not)
                parser.seek(first_pos)
                table = XRefTable(parser, self.offset)
        self._xrefpos.add(start)
        xrefs.append(table)
        trailer = table.trailer
        # "XRefStm" is the additional set of xrefs, as a stream, found
        # in hybrid-reference files; "Prev" is the previous xref table
        # or stream.
        for key in ("XRefStm", "Prev"):
            if key in trailer:
                target = int_value(trailer[key])
                self._read_xref_from(target + self.offset, xrefs)

destinations property

Named destinations as an iterable/addressable Destinations object.

fonts property

Get the mapping of font names to fonts for this document.

Note that this can be quite slow the first time it's accessed as it must scan every single page in the document.

Font names may collide.

Font names are generally understood to be globally unique in the document, but there's no guarantee that this is the case. In keeping with the "incremental update" philosophy dear to PDF, you get the last font with a given name.

Do not rely on this being a dict.

Currently this is implemented eagerly, but in the future it may return a lazy object which only loads fonts on demand.

names property

PDF name dictionary (PDF 1.7 sec 7.7.4).

Raises:

Type Description
KeyError

if nonexistent.

objects property

Iterate over all indirect objects (including, then expanding object streams)

outline property

Document outline, if any.

page_labels property

Generate page label strings for the PDF document.

If the document includes page labels, generates strings, one per page. If not, raise KeyError.

The resulting iterator is unbounded (because the page label tree does not actually include all the pages), so it is recommended to use pages instead.

Raises:

Type Description
KeyError

No page labels are present in the catalog

pages property

Pages of the document as an iterable/addressable PageList object.

structure property

Logical structure of this document, if any.

In the case where no logical structure tree exists, this will be None. Otherwise you may iterate over it, search it, etc. We do this instead of simply returning an empty structure tree because the vast majority of PDFs have no logical structure.

tokens property

Iterate over tokens.

__getitem__(objid)

Get an indirect object from the PDF.

Note that the behaviour in the case of a non-existent object (raising IndexError), while Pythonic, is not PDFic, as PDF 1.7 sec 7.3.10 states:

An indirect reference to an undefined object shall not be considered an error by a conforming reader; it shall be treated as a reference to the null object.

Raises:

Type Description
ValueError

if Document is not initialized

IndexError

if objid does not exist in PDF

Source code in playa/document.py
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
def __getitem__(self, objid: int) -> PDFObject:
    """Look up an indirect object by its object ID.

    Successful lookups are cached, so each object is only located
    and parsed once.

    Raising `IndexError` for a missing object is the Pythonic
    choice, but not the PDFic one, since PDF 1.7 sec 7.3.10 says:

    > An indirect reference to an undefined object shall not be
    considered an error by a conforming reader; it shall be
    treated as a reference to the null object.

    Raises:
      ValueError: if Document is not initialized
      IndexError: if objid does not exist in PDF

    """
    if not self.xrefs:
        raise ValueError("Document is not initialized")
    if objid in self._cached_objs:
        return self._cached_objs[objid]

    found = None
    # Try each cross-reference section in order until one of them
    # both knows the object and lets us read it successfully.
    for xref in self.xrefs:
        try:
            strmid, index, _genno = xref.get_pos(objid)
        except KeyError:
            continue
        try:
            if strmid is None:
                # Plain object: `index` is its position in the file
                found = self._getobj_parse(index, objid)
            else:
                # Compressed object: `index` is its slot in the
                # object stream with ID `strmid`
                container = stream_value(self[strmid])
                found = self._getobj_objstm(container, index, objid)
        # FIXME: We might not actually want to catch these...
        except StopIteration:
            log.debug("EOF when searching for object %d", objid)
        except PDFSyntaxError as e:
            log.debug("Syntax error when searching for object %d: %s", objid, e)
        else:
            break

    if found is None:
        raise IndexError(f"Object with ID {objid} not found")
    self._cached_objs[objid] = found
    return self._cached_objs[objid]

__iter__()

Iterate over top-level IndirectObject (does not expand object streams)

Source code in playa/document.py
352
353
354
355
356
357
358
359
def __iter__(self) -> Iterator[IndirectObject]:
    """Iterate over top-level `IndirectObject` (does not expand object streams)"""
    # Parse the buffer from the document offset, with the same
    # strictness as the document's own parser.
    toplevel = IndirectObjectParser(
        self.buffer, self, pos=self.offset, strict=self.parser.strict
    )
    # The parser produces (position, object) pairs; keep the objects.
    return (indobj for _, indobj in toplevel)

_find_xref()

Internal function used to locate the first XRef.

Source code in playa/document.py
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
def _find_xref(self) -> int:
    """Internal function used to locate the first XRef.

    The buffer is scanned backwards, line by line, for either a
    `startxref` keyword (in which case the number on the line after
    it, plus the document offset, is returned) or an `xref` keyword
    (in which case its own position is returned).  Reaching an
    `endobj` line first ends the search.

    Raises:
      ValueError: if no xref position is found, or if the one
        following `startxref` is out of range.
    """
    # Most recent non-blank line seen, i.e. the one which *follows*
    # the current line in the file since we are going backwards.
    following = b""
    for pos, raw in reverse_iter_lines(self.buffer):
        stripped = raw.strip()
        if stripped == b"endobj":
            # Okay, we're probably not in Kansas anymore...
            break
        if stripped == b"xref":
            return pos
        if stripped != b"startxref":
            if stripped:
                following = stripped
            continue
        if not following.isdigit():
            log.warning("Invalid startxref position: %r", following)
            continue
        start = int(following)
        if start < 0:
            raise ValueError("Invalid negative startxref position: %d" % start)
        if start > pos:
            raise ValueError(
                "Invalid startxref position (> %d): %d" % (pos, start)
            )
        return start + self.offset
    raise ValueError("No xref table found at end of file")

_get_page_objects()

Iterate over the flattened page tree in reading order, propagating inheritable attributes. Returns an iterator over (objid, dict) pairs.

Raises:

Type Description
KeyError

if there is no page tree.

Source code in playa/document.py
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
def _get_page_objects(self) -> Iterator[Tuple[int, PageType]]:
    """Iterate over the flattened page tree in reading order, propagating
    inheritable attributes.  Returns an iterator over (objid, dict) pairs.

    Page tree entries that are neither indirect object references
    nor bare integers are logged and skipped.

    Raises:
      KeyError: if there is no page tree.
    """
    if "Pages" not in self.catalog:
        raise KeyError("No 'Pages' entry in catalog")
    # Explicit stack of (node, properties of its parent) pairs, so
    # that deep trees are walked without recursion.
    stack = [(self.catalog["Pages"], self.catalog)]
    visited = set()
    while stack:
        (obj, parent) = stack.pop()
        if isinstance(obj, ObjRef):
            # The PDF specification *requires* both the Pages
            # element of the catalog and the entries in Kids in
            # the page tree to be indirect references.
            object_id = int(obj.objid)
        elif isinstance(obj, int):
            # Should not happen in a valid PDF, but probably does?
            log.warning("Page tree contains bare integer: %r in %r", obj, parent)
            object_id = obj
        else:
            log.warning("Page tree contains unknown object: %r", obj)
            # There is no object ID to look up here, so skip the
            # entry rather than carrying on with `object_id`
            # unbound (or left over from the previous iteration).
            continue
        page_object = dict_value(self[object_id])

        # Avoid recursion errors by keeping track of visited nodes
        # (again, this should never actually happen in a valid PDF)
        if object_id in visited:
            log.warning("Circular reference %r in page tree", obj)
            continue
        visited.add(object_id)

        # Propagate inheritable attributes
        object_properties = page_object.copy()
        for k, v in parent.items():
            if k in INHERITABLE_PAGE_ATTRS and k not in object_properties:
                object_properties[k] = v

        # Recurse, depth-first
        object_type = object_properties.get("Type")
        if object_type is None:
            log.warning("Page has no Type, trying type: %r", object_properties)
            object_type = object_properties.get("type")
        if object_type is LITERAL_PAGES and "Kids" in object_properties:
            # Push in reverse so that the first kid is popped first
            for child in reversed(list_value(object_properties["Kids"])):
                stack.append((child, object_properties))
        elif object_type is LITERAL_PAGE:
            yield object_id, object_properties

_get_pages_from_xrefs()

Find pages from the cross-reference tables if the page tree is missing (note that this only happens in invalid PDFs, but it happens.)

Returns:

Type Description
Iterator[Tuple[int, PageType]]

an iterator over (objid, dict) pairs.

Source code in playa/document.py
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
def _get_pages_from_xrefs(self) -> Iterator[Tuple[int, PageType]]:
    """Recover pages by scanning the cross-reference tables.

    This is the fallback for documents with no page tree (which
    only happens in invalid PDFs, but it does happen).  Every
    object ID listed by the xrefs is looked up, and dictionaries
    whose `Type` is `/Page` are yielded.  IDs that cannot be
    resolved are silently ignored.

    Returns:
      an iterator over (objid, dict) pairs.
    """
    all_ids = (oid for xref in self.xrefs for oid in xref.objids)
    for object_id in all_ids:
        try:
            candidate = self[object_id]
            if not isinstance(candidate, dict):
                continue
            if candidate.get("Type") is LITERAL_PAGE:
                yield object_id, candidate
        except IndexError:
            continue

_initialize_password(password='')

Initialize the decryption handler with a given password, if any.

Internal function, requires the Encrypt dictionary to have been read from the trailer into self.encryption.

Source code in playa/document.py
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
def _initialize_password(self, password: str = "") -> None:
    """Set up the decryption handler, using `password` if one is given.

    Internal function: the Encrypt dictionary must already have
    been read from the trailer into self.encryption.

    Raises:
      PDFEncryptionError: if the filter is not "Standard" or no
        handler exists for the algorithm given by the `V` entry.
    """
    assert self.encryption is not None
    docid, param = self.encryption
    filter_name = literal_name(param.get("Filter"))
    if filter_name != "Standard":
        raise PDFEncryptionError("Unknown filter: param=%r" % param)
    algorithm = int_value(param.get("V", 0))
    # 3 (PDF 1.4) An unpublished algorithm that permits encryption
    # key lengths ranging from 40 to 128 bits. This value shall
    # not appear in a conforming PDF file.
    if algorithm == 3:
        raise PDFEncryptionError("Unpublished algorithm 3 not supported")
    # 0 An algorithm that is undocumented. This value shall not be used.
    make_handler = SECURITY_HANDLERS.get(algorithm)
    if make_handler is None:
        raise PDFEncryptionError("Unknown algorithm: param=%r" % param)
    handler = make_handler(docid, param, password)
    # Expose the handler's decryption function and permission flags
    self.decipher = handler.decrypt
    self.is_printable = handler.is_printable
    self.is_modifiable = handler.is_modifiable
    self.is_extractable = handler.is_extractable
    assert self.parser is not None
    # Ensure that no extra data leaks into encrypted streams
    self.parser.strict = True
    self.parser.decipher = self.decipher

_read_xref_from(start, xrefs)

Reads XRefs from the given location.

Source code in playa/document.py
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
def _read_xref_from(
    self,
    start: int,
    xrefs: List[XRef],
) -> None:
    """Read the XRef found at `start`, then those it refers to.

    The XRef is appended to `xrefs`, after which the `XRefStm` and
    `Prev` entries of its trailer (in that order) are followed
    recursively.  Positions which were already read are skipped
    with a warning, so circular chains terminate.

    Raises:
      ValueError: if there is nothing to read at `start`.
    """
    if start in self._xrefpos:
        log.warning("Detected circular xref chain at %d", start)
        return
    parser = ObjectParser(self.buffer, self, start)
    try:
        first_pos, first_token = parser.nexttoken()
    except StopIteration:
        raise ValueError(f"Unexpected EOF at {start}")
    table: XRef
    if first_token is KEYWORD_XREF:
        log.debug("Reading xref table at %d", first_pos)
        parser.nextline()
        table = XRefTable(parser, self.offset)
    else:
        # It might be an XRefStream, if this is an indirect object,
        # in which case the third token is the `obj` keyword.
        parser.nexttoken()
        _, third_token = parser.nexttoken()
        if third_token is KEYWORD_OBJ:
            # XRefStream: PDF-1.5
            self.parser.seek(first_pos)
            self.parser.reset()
            table = XRefStream(self.parser, self.offset)
        else:
            # Well, maybe it's an XRef table without "xref" (but
            # probably not)
            parser.seek(first_pos)
            table = XRefTable(parser, self.offset)
    self._xrefpos.add(start)
    xrefs.append(table)
    trailer = table.trailer
    # "XRefStm" is the additional set of xrefs, as a stream, found
    # in hybrid-reference files; "Prev" is the previous xref table
    # or stream.
    for key in ("XRefStm", "Prev"):
        if key in trailer:
            target = int_value(trailer[key])
            self._read_xref_from(target + self.offset, xrefs)

PageLabels

Bases: NumberTree

PageLabels from the document catalog.

See Section 12.4.2 in the PDF 1.7 Reference.

Source code in playa/document.py
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
class PageLabels(NumberTree):
    """Page label number tree taken from the document catalog.

    See Section 12.4.2 in the PDF 1.7 Reference.
    """

    @property
    def labels(self) -> Iterator[str]:
        """Generate page label strings, range by range.

        Each entry of the tree pairs a starting page index with a
        label dictionary giving a style (`S`), a prefix (`P`) and a
        first value (`St`, default 1).  The last range has no end, so
        this iterator never terminates by itself.
        """
        entries = iter(self)
        head = next(entries, None)
        if head is None:
            log.warning("PageLabels tree is empty")
            start = 0
            raw_dict = {}
        else:
            start, raw_dict = head
            # The tree must begin with page index 0
            if start != 0:
                log.warning("PageLabels tree is missing page index 0")
                # Try to cope, by treating the first range as if it
                # began at page index 0
                start = 0

        while True:  # forever!
            label_dict = dict_value(raw_dict)
            style = label_dict.get("S")
            prefix = decode_text(str_value(label_dict.get("P", b"")))
            first_value = int_value(label_dict.get("St", 1))

            values: Iterable[int]
            upcoming = next(entries, None)
            if upcoming is None:
                # This is the last specified range. It continues until the end
                # of the document.
                values = itertools.count(first_value)
            else:
                next_start, raw_dict = upcoming
                values = range(first_value, first_value + (next_start - start))
                start = next_start

            for value in values:
                yield prefix + self._format_page_label(value, style)

    @staticmethod
    def _format_page_label(value: int, style: Any) -> str:
        """Render `value` as a label in the given numbering style.

        A missing (`None`) style gives an empty label, as does an
        unknown one (with a warning).
        """
        if style is None:
            return ""
        if style is LIT("D"):  # Decimal arabic numerals
            return str(value)
        if style is LIT("R"):  # Uppercase roman numerals
            return format_int_roman(value).upper()
        if style is LIT("r"):  # Lowercase roman numerals
            return format_int_roman(value)
        if style is LIT("A"):  # Uppercase letters A-Z, AA-ZZ...
            return format_int_alpha(value).upper()
        if style is LIT("a"):  # Lowercase letters a-z, aa-zz...
            return format_int_alpha(value)
        log.warning("Unknown page label style: %r", style)
        return ""

_format_page_label(value, style) staticmethod

Format page label value in a specific style

Source code in playa/document.py
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
@staticmethod
def _format_page_label(value: int, style: Any) -> str:
    """Format page label value in a specific style"""
    if style is None:
        label = ""
    elif style is LIT("D"):  # Decimal arabic numerals
        label = str(value)
    elif style is LIT("R"):  # Uppercase roman numerals
        label = format_int_roman(value).upper()
    elif style is LIT("r"):  # Lowercase roman numerals
        label = format_int_roman(value)
    elif style is LIT("A"):  # Uppercase letters A-Z, AA-ZZ...
        label = format_int_alpha(value).upper()
    elif style is LIT("a"):  # Lowercase letters a-z, aa-zz...
        label = format_int_alpha(value)
    else:
        log.warning("Unknown page label style: %r", style)
        label = ""
    return label

PageList

List of pages indexable by 0-based index or string label.

Attributes:

Name Type Description
have_labels bool

If pages have explicit labels in the PDF.

Source code in playa/document.py
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
class PageList:
    """List of pages indexable by 0-based index or string label.

    Attributes:
        have_labels: If pages have explicit labels in the PDF.
    """

    have_labels: bool

    def __init__(
        self, doc: Document, pages: Union[Iterable[Page], None] = None
    ) -> None:
        """Create a page list for a document.

        Args:
            doc: Document the pages belong to.
            pages: Pages to wrap (for instance a selection made from
                another `PageList`).  If `None`, the pages are read
                from `doc` itself.
        """
        self.docref = _ref_document(doc)
        if pages is not None:
            # `pages` may be a one-shot iterator (indexing with an
            # iterable of keys passes a generator), so it is
            # materialized exactly once and every lookup table is
            # built from the resulting list.
            self._pages = list(pages)
            self._labels: Dict[str, Page] = {
                page.label: page for page in self._pages if page.label is not None
            }
            # Keep `by_id` usable on sub-lists as well.
            self._objids: Dict[int, Page] = {
                page.pageid: page for page in self._pages
            }
            self.have_labels = bool(self._labels)
        else:
            self._init_pages(doc)

    def _init_pages(self, doc: Document) -> None:
        """Build the page list and its lookup tables from `doc`."""
        try:
            page_labels: Iterable[Union[str, None]] = doc.page_labels
            self.have_labels = True
        except (KeyError, ValueError):
            # No usable labels: fall back on 1-based page numbers.
            page_labels = (str(idx) for idx in itertools.count(1))
            self.have_labels = False
        self._pages = []
        self._objids = {}
        self._labels = {}
        try:
            page_objects = list(doc._get_page_objects())
        except (KeyError, IndexError, TypeError):
            page_objects = list(doc._get_pages_from_xrefs())
        for page_idx, ((objid, properties), label) in enumerate(
            zip(page_objects, page_labels)
        ):
            page = Page(doc, objid, properties, label, page_idx, doc.space)
            self._pages.append(page)
            self._objids[objid] = page
            if label is not None:
                # The first page to use a label keeps it.
                if label in self._labels:
                    log.info("Duplicate page label %s at index %d", label, page_idx)
                else:
                    self._labels[label] = page

    @property
    def doc(self) -> "Document":
        """Get associated document if it exists."""
        return _deref_document(self.docref)

    def __len__(self) -> int:
        return len(self._pages)

    def __iter__(self) -> Iterator[Page]:
        return iter(self._pages)

    @overload
    def __getitem__(self, key: int) -> Page: ...

    @overload
    def __getitem__(self, key: str) -> Page: ...

    @overload
    def __getitem__(self, key: slice) -> "PageList": ...

    @overload
    def __getitem__(self, key: Iterable[int]) -> "PageList": ...

    @overload
    def __getitem__(self, key: Iterator[Union[int, str]]) -> "PageList": ...

    def __getitem__(self, key):
        """Get a page by index or label, or a sub-list of pages.

        An `int` selects by 0-based index and a `str` by page label;
        both return a single `Page`.  A `slice`, or any other iterable
        of keys, returns a new `PageList` with the selected pages.
        """
        if isinstance(key, int):
            return self._pages[key]
        elif isinstance(key, str):
            return self._labels[key]
        elif isinstance(key, slice):
            return PageList(_deref_document(self.docref), self._pages[key])
        else:
            return PageList(_deref_document(self.docref), (self[k] for k in key))

    def by_id(self, objid: int) -> Page:
        """Get a page by its indirect object ID.

        Args:
            objid: Indirect object ID for the page object.

        Returns:
            the page in question.
        """
        return self._objids[objid]

    def map(self, func: Callable[[Page], Any]) -> Iterator:
        """Apply a function over each page, iterating over its results.

        Args:
            func: The function to apply to each page.

        Note:
            This possibly runs `func` in a separate process.  If its
            return value is not serializable (by `pickle`) then you
            will encounter errors.
        """
        doc = _deref_document(self.docref)
        if doc._pool is not None:
            # Workers are handed (document id, page index) pairs, which
            # `call_page` turns back into pages, not the pages themselves.
            return doc._pool.map(
                call_page,
                itertools.repeat(func),
                ((id(doc), page.page_idx) for page in self),
            )
        else:
            return (func(page) for page in self)

doc property

Get associated document if it exists.

by_id(objid)

Get a page by its indirect object ID.

Parameters:

Name Type Description Default
objid int

Indirect object ID for the page object.

required

Returns:

Type Description
Page

the page in question.

Source code in playa/document.py
873
874
875
876
877
878
879
880
881
882
def by_id(self, objid: int) -> Page:
    """Look up a page by the ID of its indirect object.

    Args:
        objid: Indirect object ID of the page object.

    Returns:
        The page registered under `objid`.
    """
    page = self._objids[objid]
    return page

map(func)

Apply a function over each page, iterating over its results.

Parameters:

Name Type Description Default
func Callable[[Page], Any]

The function to apply to each page.

required
Note

This possibly runs func in a separate process. If its return value is not serializable (by pickle) then you will encounter errors.

Source code in playa/document.py
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
def map(self, func: Callable[[Page], Any]) -> Iterator:
    """Run `func` on every page and iterate over what it returns.

    Args:
        func: The function to apply to each page.

    Note:
        `func` may be run in a separate process.  If what it returns
        cannot be serialized (by `pickle`) then you will encounter
        errors.
    """
    document = _deref_document(self.docref)
    if document._pool is None:
        # No worker pool: evaluate lazily in this process.
        return (func(page) for page in self)
    page_refs = ((id(document), page.page_idx) for page in self)
    return document._pool.map(call_page, itertools.repeat(func), page_refs)

call_page(func, pageref)

Call a function on a page in a worker process.

Source code in playa/document.py
784
785
786
def call_page(func: Callable[[Page], Any], pageref: PageRef) -> Any:
    """Resolve a page reference in a worker process and apply `func` to it."""
    page = _deref_page(pageref)
    return func(page)

playa.page

Classes for looking at pages and their contents.

Annotation dataclass

PDF annotation (PDF 1.7 section 12.5).

Attributes:

Name Type Description
subtype str

Type of annotation.

rect Rect

Annotation rectangle (location on page) in default user space

bbox Rect

Annotation rectangle in device space

props Dict[str, PDFObject]

Annotation dictionary containing all other properties (PDF 1.7 sec. 12.5.2).

Source code in playa/page.py
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
@dataclass
class Annotation:
    """PDF annotation (PDF 1.7 section 12.5).

    Attributes:
      subtype: Type of annotation.
      rect: Annotation rectangle (location on page) in *default user space*
      bbox: Annotation rectangle in *device space*
      props: Annotation dictionary containing all other properties
             (PDF 1.7 sec. 12.5.2).
    """

    _pageref: PageRef
    subtype: str
    rect: Rect
    props: Dict[str, PDFObject]

    @classmethod
    def from_dict(cls, obj: PDFObject, page: Page) -> "Annotation":
        """Create an annotation from its dictionary.

        Args:
          obj: The annotation dictionary (or something `dict_value`
               accepts as one).
          page: The page containing the annotation.

        Raises:
          PDFSyntaxError: if `Subtype` is missing or is not a name.
        """
        annot = dict_value(obj)
        subtype = annot.get("Subtype")
        # A missing Subtype is None, which also fails this check.
        if not isinstance(subtype, PSLiteral):
            raise PDFSyntaxError("Invalid annotation Subtype %r" % (subtype,))
        rect = rect_value(annot.get("Rect"))
        # Construct through `cls` so that subclasses get instances of
        # themselves rather than of the base class.
        return cls(
            _pageref=page.pageref,
            subtype=literal_name(subtype),
            rect=rect,
            props=annot,
        )

    def _text_prop(self, key: str, invalid_msg: str) -> Union[str, None]:
        """Decode the text string stored under `key` in the properties.

        Returns `None` if the entry is absent.  If it is present but
        is not a string, `invalid_msg` (a `%r` format for the offending
        value) is logged as a warning and `None` is returned.
        """
        value = resolve1(self.props.get(key))
        if value is None:
            return None
        if not isinstance(value, (bytes, str)):
            log.warning(invalid_msg, value)
            return None
        return decode_text(value)

    @property
    def page(self) -> Page:
        """Containing page for this annotation."""
        return _deref_page(self._pageref)

    @property
    def bbox(self) -> Rect:
        """Bounding box for this annotation in device space."""
        return transform_bbox(self.page.ctm, self.rect)

    @property
    def contents(self) -> Union[str, None]:
        """Text contents of annotation."""
        return self._text_prop("Contents", "Invalid annotation contents: %r")

    @property
    def name(self) -> Union[str, None]:
        """Annotation name, uniquely identifying this annotation."""
        return self._text_prop("NM", "Invalid annotation name: %r")

    @property
    def mtime(self) -> Union[str, None]:
        """String describing date and time when annotation was most recently
        modified.

        The date *should* be in the format `D:YYYYMMDDHHmmSSOHH'mm`
        but this is in no way required (and unlikely to be implemented
        consistently, if history is any guide).
        """
        return self._text_prop("M", "Invalid annotation modification date: %r")

bbox property

Bounding box for this annotation in device space.

contents property

Text contents of annotation.

mtime property

String describing date and time when annotation was most recently modified.

The date should be in the format D:YYYYMMDDHHmmSSOHH'mm but this is in no way required (and unlikely to be implemented consistently, if history is any guide).

name property

Annotation name, uniquely identifying this annotation.

page property

Containing page for this annotation.

Page

An object that holds the information about a page.

Parameters:

Name Type Description Default
doc Document

a Document object.

required
pageid int

the integer PDF object ID associated with the page in the page tree.

required
attrs Dict

a dictionary of page attributes.

required
label Optional[str]

page label string.

required
page_idx int

0-based index of the page in the document.

0
space DeviceSpace

the device space to use for interpreting content

'screen'

Attributes:

Name Type Description
pageid

the integer object ID associated with the page in the page tree

attrs

a dictionary of page attributes.

resources Dict[str, PDFObject]

a dictionary of resources used by the page.

mediabox

the physical size of the page.

cropbox

the crop rectangle of the page.

rotate

the page rotation (in degrees).

label

the page's label (typically, the logical page number).

page_idx

0-based index of the page in the document.

ctm

coordinate transformation matrix from default user space to page's device space

Source code in playa/page.py
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
class Page:
    """An object that holds the information about a page.

    Args:
      doc: a Document object.
      pageid: the integer PDF object ID associated with the page in the page tree.
      attrs: a dictionary of page attributes.
      label: page label string.
      page_idx: 0-based index of the page in the document.
      space: the device space to use for interpreting content

    Attributes:
      pageid: the integer object ID associated with the page in the page tree
      attrs: a dictionary of page attributes.
      resources: a dictionary of resources used by the page.
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page.
      rotate: the page rotation (in degrees).
      label: the page's label (typically, the logical page number).
      page_idx: 0-based index of the page in the document.
      ctm: coordinate transformation matrix from default user space to
           page's device space
    """

    _fontmap: Union[Dict[str, Font], None] = None
    _structmap: Union[List[Union["Element", None]], None] = None

    def __init__(
        self,
        doc: "Document",
        pageid: int,
        attrs: Dict,
        label: Optional[str],
        page_idx: int = 0,
        space: DeviceSpace = "screen",
    ) -> None:
        self.docref = _ref_document(doc)
        self.pageid = pageid
        self.attrs = attrs
        self.label = label
        self.page_idx = page_idx
        self.space = space
        self.pageref = _ref_page(self)
        self.lastmod = resolve1(self.attrs.get("LastModified"))
        try:
            self.resources: Dict[str, PDFObject] = dict_value(
                self.attrs.get("Resources")
            )
        except TypeError:
            log.warning("Resources missing or invalid from Page id %d", pageid)
            self.resources = {}
        try:
            self.mediabox = normalize_rect(rect_value(self.attrs["MediaBox"]))
        except KeyError:
            log.warning(
                "MediaBox missing from Page id %d (and not inherited),"
                " defaulting to US Letter (612x792)",
                pageid,
            )
            self.mediabox = (0, 0, 612, 792)
        except (ValueError, PDFSyntaxError):
            log.warning(
                "MediaBox %r invalid in Page id %d,"
                " defaulting to US Letter (612x792)",
                self.attrs["MediaBox"],
                pageid,
            )
            self.mediabox = (0, 0, 612, 792)
        # The crop box is the media box unless a valid CropBox is given.
        self.cropbox = self.mediabox
        if "CropBox" in self.attrs:
            try:
                self.cropbox = normalize_rect(rect_value(self.attrs["CropBox"]))
            except (ValueError, PDFSyntaxError):
                log.warning(
                    "Invalid CropBox %r in /Page, defaulting to MediaBox",
                    self.attrs["CropBox"],
                )

        # Bring the rotation into the range [0, 360)
        self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
        (x0, y0, x1, y1) = self.mediabox
        width = x1 - x0
        height = y1 - y0
        # PDF 1.7 section 8.4.1: Initial value: a matrix that
        # transforms default user coordinates to device coordinates.
        #
        # We keep this as `self.ctm` in order to transform layout
        # attributes in tagged PDFs which are specified in default
        # user space (PDF 1.7 section 14.8.5.4.3, table 344)
        #
        # "screen" device space: origin is top left of MediaBox
        if self.space == "screen":
            self.ctm = (1.0, 0.0, 0.0, -1.0, -x0, y1)
        # "page" device space: origin is bottom left of MediaBox
        elif self.space == "page":
            self.ctm = (1.0, 0.0, 0.0, 1.0, -x0, -y0)
        # "default" device space: no transformation or rotation
        else:
            if self.space != "default":
                log.warning("Unknown device space: %r", self.space)
            self.ctm = MATRIX_IDENTITY
            # With a zero width and height the rotation matrices
            # below have no translation component
            width = height = 0
        # If rotation is requested, apply rotation to the initial ctm
        if self.rotate == 90:
            # x' = y
            # y' = width - x
            self.ctm = mult_matrix((0, -1, 1, 0, 0, width), self.ctm)
        elif self.rotate == 180:
            # x' = width - x
            # y' = height - y
            self.ctm = mult_matrix((-1, 0, 0, -1, width, height), self.ctm)
        elif self.rotate == 270:
            # x' = height - y
            # y' = x
            self.ctm = mult_matrix((0, 1, -1, 0, height, 0), self.ctm)
        elif self.rotate != 0:
            log.warning("Invalid /Rotate: %r", self.rotate)

        # Contents may be absent, a single object, or a list of them;
        # always store a list.
        contents = resolve1(self.attrs.get("Contents"))
        if contents is None:
            self._contents = []
        else:
            if isinstance(contents, list):
                self._contents = contents
            else:
                self._contents = [contents]

    @property
    def annotations(self) -> Iterator["Annotation"]:
        """Lazily iterate over page annotations."""
        alist = resolve1(self.attrs.get("Annots"))
        if alist is None:
            return
        if not isinstance(alist, list):
            log.warning("Invalid Annots list: %r", alist)
            return
        for obj in alist:
            try:
                yield Annotation.from_dict(obj, self)
            except (TypeError, ValueError, PDFSyntaxError) as e:
                # Skip annotations we cannot interpret
                log.warning("Invalid object %r in Annots: %s", obj, e)
                continue

    @property
    def doc(self) -> "Document":
        """Get associated document if it exists."""
        return _deref_document(self.docref)

    @property
    def streams(self) -> Iterator[ContentStream]:
        """Return resolved content streams."""
        for obj in self._contents:
            try:
                yield stream_value(obj)
            except TypeError:
                log.warning("Found non-stream in contents: %r", obj)

    @property
    def width(self) -> float:
        """Width of the page in default user space units."""
        x0, _, x1, _ = self.mediabox
        return x1 - x0

    @property
    def height(self) -> float:
        """Height of the page in default user space units."""
        _, y0, _, y1 = self.mediabox
        return y1 - y0

    @property
    def contents(self) -> Iterator[PDFObject]:
        """Iterator over PDF objects in the content streams."""
        for _, obj in ContentParser(self._contents):
            yield obj

    def __iter__(self) -> Iterator["ContentObject"]:
        """Iterator over lazy layout objects."""
        return iter(LazyInterpreter(self, self._contents))

    @property
    def paths(self) -> Iterator["PathObject"]:
        """Iterator over lazy path objects."""
        return self.flatten(PathObject)

    @property
    def images(self) -> Iterator["ImageObject"]:
        """Iterator over lazy image objects."""
        return self.flatten(ImageObject)

    @property
    def texts(self) -> Iterator["TextObject"]:
        """Iterator over lazy text objects."""
        return self.flatten(TextObject)

    @property
    def glyphs(self) -> Iterator["GlyphObject"]:
        """Iterator over lazy glyph objects."""
        for text in self.flatten(TextObject):
            yield from text

    @property
    def xobjects(self) -> Iterator["XObjectObject"]:
        """Return resolved and rendered Form XObjects.

        This does *not* return any image or PostScript XObjects.  You
        can get images via the `images` property.  Apparently you
        aren't supposed to use PostScript XObjects for anything, ever.

        Note that these are the XObjects as rendered on the page, so
        you may see the same named XObject multiple times.  If you
        need to access their actual definitions you'll have to look at
        `page.resources`.

        This will also return Form XObjects within Form XObjects,
        except in the case of circular reference chains.
        """

        from typing import Set

        def xobjects_one(
            itor: Iterable["ContentObject"], parents: Set[str]
        ) -> Iterator["XObjectObject"]:
            for obj in itor:
                # `parents` holds the XObjects we are currently inside
                # of, so that reference cycles are not followed
                if isinstance(obj, XObjectObject) and obj.xobjid not in parents:
                    yield obj
                    yield from xobjects_one(obj, parents | {obj.xobjid})

        # The helper only ever yields XObjectObject instances, so
        # there is nothing left to filter here.
        yield from xobjects_one(self, set())

    @property
    def tokens(self) -> Iterator[Token]:
        """Iterator over tokens in the content streams."""
        parser = ContentParser(self._contents)
        while True:
            try:
                _, tok = parser.nexttoken()
            except StopIteration:
                return
            yield tok

    @property
    def parent_key(self) -> Union[int, None]:
        """Parent tree key for this page, if any."""
        if "StructParents" in self.attrs:
            return int_value(self.attrs["StructParents"])
        return None

    @property
    def structure(self) -> Sequence[Union["Element", None]]:
        """Mapping of marked content IDs to logical structure elements.

        This is actually a list of logical structure elements
        corresponding to marked content IDs, or `None` for indices
        which do not correspond to a marked content ID.  Note that
        because structure elements may contain multiple marked content
        sections, the same element may occur multiple times in this
        list.

        Note: This is not the same as `playa.Document.structure`.
            PDF documents have logical structure, but PDF pages **do
            not**, and it is dishonest to pretend otherwise (as some
            code I once wrote unfortunately does).  What they do have
            is marked content sections which correspond to content
            items in the logical structure tree.

        Danger: Do not rely on this being a `list`.
            Currently this is implemented eagerly, but in the future it
            may return a lazy object.

        """
        from playa.structure import Element

        # Computed once and cached on the instance
        if self._structmap is not None:
            return self._structmap
        self._structmap = []
        if self.doc.structure is None:
            return self._structmap
        parent_key = self.parent_key
        if parent_key is None:
            return self._structmap
        try:
            parents = list_value(self.doc.structure.parent_tree[parent_key])
        except IndexError:
            return self._structmap
        # Elements can contain multiple marked content sections, so
        # don't create redundant Element objects for these
        elements: Dict[int, Element] = {}
        for obj in parents:
            objid = obj.objid if isinstance(obj, ObjRef) else id(obj)
            if objid not in elements:
                elements[objid] = Element.from_dict(self.doc, dict_value(obj))
            self._structmap.append(elements[objid])
        return self._structmap

    @property
    def fonts(self) -> Mapping[str, Font]:
        """Mapping of resource names to fonts for this page.

        Note: This is not the same as `playa.Document.fonts`.
            The resource names (e.g. `F1`, `F42`, `FooBar`) here are
            specific to a page (or Form XObject) resource dictionary
            and have no relation to the font name as commonly
            understood (e.g. `Helvetica`,
            `WQERQE+Arial-SuperBold-HJRE-UTF-8`).  Since font names are
            generally considered to be globally unique, it may be
            possible to access fonts by them in the future.

        Danger: Do not rely on this being a `dict`.
            Currently this is implemented eagerly, but in the future it
            may return a lazy object which only loads fonts on demand.

        """
        if self._fontmap is not None:
            return self._fontmap
        self._fontmap = _make_fontmap(self.resources.get("Font"), self.doc)
        return self._fontmap

    def __repr__(self) -> str:
        return f"<Page: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

    @overload
    def flatten(self) -> Iterator["ContentObject"]: ...

    @overload
    def flatten(self, filter_class: Type[CO]) -> Iterator[CO]: ...

    def flatten(
        self, filter_class: Union[None, Type[CO]] = None
    ) -> Iterator[Union[CO, "ContentObject"]]:
        """Iterate over content objects, recursing into form XObjects.

        Args:
          filter_class: if given, only yield objects which are
                        instances of this class.
        """

        from typing import Set

        def flatten_one(
            itor: Iterable["ContentObject"], parents: Set[str]
        ) -> Iterator["ContentObject"]:
            for obj in itor:
                # An XObject which is already one of its own parents
                # is yielded as-is instead of being expanded again
                if isinstance(obj, XObjectObject) and obj.xobjid not in parents:
                    yield from flatten_one(obj, parents | {obj.xobjid})
                else:
                    yield obj

        if filter_class is None:
            yield from flatten_one(self, set())
        else:
            for obj in flatten_one(self, set()):
                if isinstance(obj, filter_class):
                    yield obj

    @property
    def mcid_texts(self) -> Mapping[int, List[str]]:
        """Mapping of marked content IDs to Unicode text strings.

        For use in text extraction from tagged PDFs.

        Danger: Do not rely on this being a `dict`.
            Currently this is implemented eagerly, but in the future it
            may return a lazy object.
        """
        if hasattr(self, "_textmap"):
            return self._textmap
        self._textmap: Mapping[int, List[str]] = _extract_mcid_texts(self)
        return self._textmap

    def extract_text(self) -> str:
        """Do some best-effort text extraction.

        This necessarily involves a few heuristics, so don't get your
        hopes up.  It will attempt to use marked content information
        for a tagged PDF, otherwise it will fall back on the character
        displacement and line matrix to determine word and line breaks.
        """
        if self.doc.is_tagged:
            return self.extract_text_tagged()
        else:
            return self.extract_text_untagged()

    def extract_text_untagged(self) -> str:
        """Get text from a page of an untagged PDF."""

        def _extract_text_from_obj(
            obj: "TextObject", vertical: bool, prev_end: float
        ) -> Tuple[str, float]:
            """Try to get text from a text object."""
            chars: List[str] = []
            for glyph in obj:
                x, y = glyph.origin
                off = y if vertical else x
                # 0.5 here is a heuristic!!!
                if prev_end and off - prev_end > 0.5:
                    if chars and chars[-1] != " ":
                        chars.append(" ")
                if glyph.text is not None:
                    chars.append(glyph.text)
                dx, dy = glyph.displacement
                prev_end = off + (dy if vertical else dx)
            return "".join(chars), prev_end

        prev_end = 0.0
        prev_origin: Union[Point, None] = None
        lines = []
        strings: List[str] = []
        for text in self.texts:
            if text.gstate.font is None:
                continue
            vertical = text.gstate.font.vertical
            # Track changes to the translation component of text
            # rendering matrix to (yes, heuristically) detect newlines
            # and spaces between text objects
            dx, dy = text.origin
            off = dy if vertical else dx
            if strings and self._next_line(text, prev_origin):
                lines.append("".join(strings))
                strings.clear()
            # 0.5 here is a heuristic!!!
            if strings and off - prev_end > 0.5 and not strings[-1].endswith(" "):
                strings.append(" ")
            textstr, prev_end = _extract_text_from_obj(text, vertical, off)
            strings.append(textstr)
            prev_origin = dx, dy
        if strings:
            lines.append("".join(strings))
        return "\n".join(lines)

    def _next_line(
        self, text: Union[TextObject, None], prev_offset: Union[Point, None]
    ) -> bool:
        """Guess whether a text object begins a new line.

        Args:
          text: the text object in question, if any.
          prev_offset: origin of the previous text object, if any.

        Returns:
          `True` if the origin of `text` has moved, relative to
          `prev_offset`, in the direction taken to mean a new line.
          Always `False` if there is no text object, no font, or no
          previous origin to compare with.
        """
        if text is None:
            return False
        if text.gstate.font is None:
            return False
        if prev_offset is None:
            return False
        offset = text.origin

        # Vertical text (usually) means right-to-left lines
        if text.gstate.font.vertical:
            line_offset = offset[0] - prev_offset[0]
        else:
            # The CTM isn't useful here because we actually do care
            # about the final device space, and we just want to know
            # which way is up and which way is down.
            dy = offset[1] - prev_offset[1]
            if self.space == "screen":
                line_offset = -dy
            else:
                line_offset = dy
        return line_offset < 0

    def extract_text_tagged(self) -> str:
        """Get text from a page of a tagged PDF."""
        lines: List[str] = []
        strings: List[str] = []
        prev_mcid: Union[int, None] = None
        prev_origin: Union[Point, None] = None
        # TODO: Iteration over marked content sections and getting
        # their text, origin, and displacement, will be refactored
        for mcs, texts in itertools.groupby(self.texts, operator.attrgetter("mcs")):
            text: Union[TextObject, None] = None
            # TODO: Artifact can also be a structure element, but
            # also, any content outside the structure tree is
            # considered an artifact
            if mcs is None or mcs.tag == "Artifact":
                for text in texts:
                    prev_origin = text.origin
                continue
            actual_text = resolve1(mcs.props.get("ActualText"))
            if actual_text is not None and not isinstance(actual_text, (bytes, str)):
                # Unusable replacement text: fall back on the
                # characters of the text objects themselves
                log.warning("Invalid ActualText: %r", actual_text)
                actual_text = None
            if actual_text is None:
                is_reversed = mcs.tag == "ReversedChars"
                c = []
                for text in texts:  # noqa: B031
                    c.append(text.chars[::-1] if is_reversed else text.chars)
                chars = "".join(c)
            else:
                # ActualText is a PDF text string, which is not
                # necessarily UTF-16, so decode it the same way as
                # the other text strings in this module
                chars = decode_text(actual_text)
                # Consume all text objects to ensure correct graphicstate
                for _ in texts:  # noqa: B031
                    pass

            # Remove soft hyphens
            chars = chars.replace("\xad", "")
            # There *might* be a line break, determine based on origin
            if mcs.mcid != prev_mcid:
                if self._next_line(text, prev_origin):
                    lines.extend(textwrap.wrap("".join(strings)))
                    strings.clear()
                prev_mcid = mcs.mcid
            strings.append(chars)
            if text is not None:
                prev_origin = text.origin
        if strings:
            lines.extend(textwrap.wrap("".join(strings)))
        return "\n".join(lines)

annotations property

Lazily iterate over page annotations.

contents property

Iterator over PDF objects in the content streams.

doc property

Get associated document if it exists.

fonts property

Mapping of resource names to fonts for this page.

This is not the same as playa.Document.fonts.

The resource names (e.g. F1, F42, FooBar) here are specific to a page (or Form XObject) resource dictionary and have no relation to the font name as commonly understood (e.g. Helvetica, WQERQE+Arial-SuperBold-HJRE-UTF-8). Since font names are generally considered to be globally unique, it may be possible to access fonts by them in the future.

Do not rely on this being a dict.

Currently this is implemented eagerly, but in the future it may return a lazy object which only loads fonts on demand.

glyphs property

Iterator over lazy glyph objects.

height property

Height of the page in default user space units.

images property

Iterator over lazy image objects.

mcid_texts property

Mapping of marked content IDs to Unicode text strings.

For use in text extraction from tagged PDFs.

Do not rely on this being a dict.

Currently this is implemented eagerly, but in the future it may return a lazy object.

parent_key property

Parent tree key for this page, if any.

paths property

Iterator over lazy path objects.

streams property

Return resolved content streams.

structure property

Mapping of marked content IDs to logical structure elements.

This is actually a list of logical structure elements corresponding to marked content IDs, or None for indices which do not correspond to a marked content ID. Note that because structure elements may contain multiple marked content sections, the same element may occur multiple times in this list.

This is not the same as playa.Document.structure.

PDF documents have logical structure, but PDF pages do not, and it is dishonest to pretend otherwise (as some code I once wrote unfortunately does). What they do have is marked content sections which correspond to content items in the logical structure tree.

Do not rely on this being a list.

Currently this is implemented eagerly, but in the future it may return a lazy object.

texts property

Iterator over lazy text objects.

tokens property

Iterator over tokens in the content streams.

width property

Width of the page in default user space units.

xobjects property

Return resolved and rendered Form XObjects.

This does not return any image or PostScript XObjects. You can get images via the images property. Apparently you aren't supposed to use PostScript XObjects for anything, ever.

Note that these are the XObjects as rendered on the page, so you may see the same named XObject multiple times. If you need to access their actual definitions you'll have to look at page.resources.

This will also return Form XObjects within Form XObjects, except in the case of circular reference chains.

__iter__()

Iterator over lazy layout objects.

Source code in playa/page.py
243
244
245
def __iter__(self) -> Iterator["ContentObject"]:
    """Iterate lazily over layout objects.

    A new ``LazyInterpreter`` is constructed from this object and its
    ``_contents`` on every call, and an iterator over it is returned.
    """
    interpreter = LazyInterpreter(self, self._contents)
    return iter(interpreter)

extract_text()

Do some best-effort text extraction.

This necessarily involves a few heuristics, so don't get your hopes up. It will attempt to use marked content information for a tagged PDF, otherwise it will fall back on the character displacement and line matrix to determine word and line breaks.

Source code in playa/page.py
434
435
436
437
438
439
440
441
442
443
444
445
def extract_text(self) -> str:
    """Extract text on a best-effort basis.

    Heuristics are unavoidable here, so do not expect miracles.  For a
    tagged PDF the marked content information is used; for anything
    else, word and line breaks are guessed from the character
    displacement and the line matrix.
    """
    # Pick the strategy first, then run it
    extractor = (
        self.extract_text_tagged if self.doc.is_tagged else self.extract_text_untagged
    )
    return extractor()

extract_text_tagged()

Get text from a page of a tagged PDF.

Source code in playa/page.py
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
def extract_text_tagged(self) -> str:
    """Extract the text of a page that belongs to a tagged PDF.

    Text objects are grouped by their immediately enclosing marked
    content section.  Groups with no marked content section, or one
    tagged ``Artifact``, contribute no text.  An ``ActualText``
    property, when present, replaces the characters of its section.
    A possible line break is only tested for when the marked content
    ID changes from one group to the next.
    """
    finished: List[str] = []
    pending: List[str] = []
    last_mcid: Union[int, None] = None
    last_origin: Union[Point, None] = None
    # TODO: Iteration over marked content sections and getting
    # their text, origin, and displacement, will be refactored
    for mcs, group in itertools.groupby(self.texts, operator.attrgetter("mcs")):
        text: Union[TextObject, None] = None
        # TODO: Artifact can also be a structure element, but
        # also, any content outside the structure tree is
        # considered an artifact
        if mcs is None or mcs.tag == "Artifact":
            # Skipped content still moves the reference origin
            for text in group:
                last_origin = text.origin
            continue
        actual_text = mcs.props.get("ActualText")
        if actual_text is not None:
            assert isinstance(actual_text, bytes)
            chars = actual_text.decode("UTF-16")
            # Consume all text objects to ensure correct graphicstate
            for _ in group:  # noqa: B031
                pass
        else:
            flip = mcs.tag == "ReversedChars"
            pieces = []
            # A plain loop is used so that `text` ends up bound to the
            # last text object of the section
            for text in group:  # noqa: B031
                pieces.append(text.chars[::-1] if flip else text.chars)
            chars = "".join(pieces)

        # Remove soft hyphens
        chars = chars.replace("\xad", "")
        # There *might* be a line break, determine based on origin
        if mcs.mcid != last_mcid:
            if self._next_line(text, last_origin):
                finished.extend(textwrap.wrap("".join(pending)))
                pending.clear()
            last_mcid = mcs.mcid
        pending.append(chars)
        if text is not None:
            last_origin = text.origin
    # Flush whatever is left after the last section
    if pending:
        finished.extend(textwrap.wrap("".join(pending)))
    return "\n".join(finished)

extract_text_untagged()

Get text from a page of an untagged PDF.

Source code in playa/page.py
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
def extract_text_untagged(self) -> str:
    """Extract the text of a page that belongs to an untagged PDF.

    Text objects without a font are ignored.  Line breaks are decided
    by ``self._next_line``; spaces are inserted, between and within
    text objects, wherever the gap along the writing direction exceeds
    a fixed heuristic threshold.
    """

    def _glyph_run(
        obj: "TextObject", vertical: bool, prev_end: float
    ) -> Tuple[str, float]:
        """Collect the text of one text object, returning it together
        with the offset at which its last glyph ends."""
        pieces: List[str] = []
        for glyph in obj:
            gx, gy = glyph.origin
            start = gy if vertical else gx
            # 0.5 here is a heuristic!!!
            if (
                prev_end
                and start - prev_end > 0.5
                and pieces
                and pieces[-1] != " "
            ):
                pieces.append(" ")
            if glyph.text is not None:
                pieces.append(glyph.text)
            adv_x, adv_y = glyph.displacement
            prev_end = start + (adv_y if vertical else adv_x)
        return "".join(pieces), prev_end

    cursor = 0.0
    last_origin: Union[Point, None] = None
    finished = []
    pending: List[str] = []
    for text in self.texts:
        if text.gstate.font is None:
            continue
        vertical = text.gstate.font.vertical
        # Track changes to the translation component of text
        # rendering matrix to (yes, heuristically) detect newlines
        # and spaces between text objects
        tx, ty = text.origin
        start = ty if vertical else tx
        if pending and self._next_line(text, last_origin):
            finished.append("".join(pending))
            pending.clear()
        # 0.5 here is a heuristic!!!
        if pending and start - cursor > 0.5 and not pending[-1].endswith(" "):
            pending.append(" ")
        run, cursor = _glyph_run(text, vertical, start)
        pending.append(run)
        last_origin = tx, ty
    # Flush the final line
    if pending:
        finished.append("".join(pending))
    return "\n".join(finished)

flatten(filter_class=None)

flatten() -> Iterator[ContentObject]
flatten(filter_class: Type[CO]) -> Iterator[CO]

Iterate over content objects, recursing into form XObjects.

Source code in playa/page.py
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
def flatten(
    self, filter_class: Union[None, Type[CO]] = None
) -> Iterator[Union[CO, "ContentObject"]]:
    """Yield content objects in order, descending into form XObjects.

    A form XObject is expanded in place rather than yielded, unless
    its ``xobjid`` is already among the XObjects currently being
    expanded; in that case it is yielded unexpanded, which stops
    circular reference chains.  When `filter_class` is given, only
    instances of that class are yielded.
    """

    from typing import Set

    def walk(
        objects: Iterable["ContentObject"], ancestors: Set[str]
    ) -> Iterator["ContentObject"]:
        for item in objects:
            if not isinstance(item, XObjectObject) or item.xobjid in ancestors:
                yield item
                continue
            # Recurse with this XObject added to the ancestor set
            yield from walk(item, ancestors | {item.xobjid})

    for item in walk(self, set()):
        if filter_class is None or isinstance(item, filter_class):
            yield item

playa.content

PDF content objects created by the interpreter.

ContentObject dataclass

Any sort of content object.

Attributes:

Name Type Description
gstate GraphicState

Graphics state.

ctm Matrix

Coordinate transformation matrix (PDF 1.7 section 8.3.2).

mcstack Tuple[MarkedContent, ...]

Stack of enclosing marked content sections.

Source code in playa/content.py
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
@dataclass
class ContentObject:
    """Any sort of content object.

    This is the generic base: it has no children of its own, and its
    `bbox` and `__len__` are derived purely from iteration, so
    subclasses are expected to override `__iter__` or those members.

    Attributes:
      gstate: Graphics state.
      ctm: Coordinate transformation matrix (PDF 1.7 section 8.3.2).
      mcstack: Stack of enclosing marked content sections.
    """

    # Reference from which the `page` and `doc` properties recover the
    # owning page and document.
    _pageref: PageRef
    # Key used by `parent` to index the structure's parent tree, or None.
    _parentkey: Union[int, None]
    gstate: GraphicState
    ctm: Matrix
    mcstack: Tuple[MarkedContent, ...]

    def __iter__(self) -> Iterator["ContentObject"]:
        """Iterate over child objects (the generic object has none)."""
        yield from ()

    def __len__(self) -> int:
        """Return the number of children of this object (generic implementation)."""
        # Counts by exhausting a fresh iteration over this object.
        return sum(1 for _ in self)

    @property
    def object_type(self) -> str:
        """Type of this object as a string, e.g. "text", "path", "image"."""
        # Drops the last len("Object") characters of the class name and
        # lowercases the rest, e.g. "TextObject" -> "text".
        name = self.__class__.__name__
        return name[: -len("Object")].lower()

    @property
    def bbox(self) -> Rect:
        """The bounding box in device space of this object."""
        # These bboxes have already been computed in device space so
        # we don't need all 4 corners!
        points = itertools.chain.from_iterable(
            ((x0, y0), (x1, y1)) for x0, y0, x1, y1 in (item.bbox for item in self)
        )
        return get_bound(points)

    @property
    def mcs(self) -> Union[MarkedContent, None]:
        """The immediately enclosing marked content section."""
        # The innermost section is the last entry of the stack.
        return self.mcstack[-1] if self.mcstack else None

    @property
    def mcid(self) -> Union[int, None]:
        """The marked content ID of the nearest enclosing marked
        content section with an ID.

        This is notably what you should use (and what `parent` uses)
        to find the parent logical structure element, because (PDF
        14.7.5.1.1):

        > A marked-content sequence corresponding to a structure
        content item shall not have another marked-content sequence
        for a structure content item nested within it though
        non-structural marked-content shall be allowed.
        """
        # The result (including None) is cached on the instance as `_mcid`.
        if hasattr(self, "_mcid"):
            return self._mcid
        # Search from the innermost section outwards.
        for mcs in self.mcstack[::-1]:
            if mcs.mcid is not None:
                self._mcid: Union[int, None] = mcs.mcid
                break
        else:
            # No enclosing section carries an ID.
            self._mcid = None
        return self._mcid

    @property
    def parent(self) -> Union["Element", None]:
        """The enclosing logical structure element, if any."""
        from playa.structure import Element

        # Use `mcid` and not `mcs` here (see docs for `mcid`)
        # The result (including None) is cached on the instance as
        # `_parent`; every early return below returns that cached None.
        if hasattr(self, "_parent"):
            return self._parent
        self._parent: Union["Element", None] = None
        parent_key = self._parentkey
        if parent_key is None:
            return self._parent
        structure = self.doc.structure
        if structure is None:
            return self._parent
        mcid = self.mcid
        if mcid is None:
            return self._parent
        # The parent tree entry for this key is a list indexed by MCID.
        parents = list_value(structure.parent_tree[parent_key])
        if mcid >= len(parents):
            log.warning(
                "Invalid marked content ID: %d (page has %d MCIDs)", mcid, len(parents)
            )
            return self._parent
        self._parent = Element.from_dict(self.doc, dict_value(parents[mcid]))
        return self._parent

    @property
    def page(self) -> "Page":
        """The page containing this content object."""
        return _deref_page(self._pageref)

    @property
    def doc(self) -> "Document":
        """The document containing this content object."""
        # The first component of the page reference is the document reference.
        docref, _ = self._pageref
        return _deref_document(docref)

bbox property

The bounding box in device space of this object.

doc property

The document containing this content object.

mcid property

The marked content ID of the nearest enclosing marked content section with an ID.

This is notably what you should use (and what parent uses) to find the parent logical structure element, because (PDF 14.7.5.1.1):

A marked-content sequence corresponding to a structure content item shall not have another marked-content sequence for a structure content item nested within it though non-structural marked-content shall be allowed.

mcs property

The immediately enclosing marked content section.

object_type property

Type of this object as a string, e.g. "text", "path", "image".

page property

The page containing this content object.

parent property

The enclosing logical structure element, if any.

__len__()

Return the number of children of this object (generic implementation).

Source code in playa/content.py
247
248
249
def __len__(self) -> int:
    """Count the children of this object by iterating over it (generic
    implementation)."""
    count = 0
    for _ in self:
        count += 1
    return count

DashPattern

Bases: NamedTuple

Line dash pattern in PDF graphics state (PDF 1.7 section 8.4.3.6).

Attributes:

Name Type Description
dash Tuple[float, ...]

lengths of dashes and gaps in user space units

phase float

starting position in the dash pattern

Source code in playa/content.py
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
class DashPattern(NamedTuple):
    """
    Line dash pattern from the PDF graphics state (PDF 1.7 section
    8.4.3.6).

    Converting to a string gives an empty string when there are no
    dashes, otherwise the dash tuple followed by the phase.

    Attributes:
      dash: lengths of dashes and gaps in user space units
      phase: starting position in the dash pattern
    """

    dash: Tuple[float, ...]
    phase: float

    def __str__(self):
        # No dash entries: nothing to print
        if not len(self.dash):
            return ""
        return f"{self.dash} {self.phase}"

GlyphObject dataclass

Bases: ContentObject

Individual glyph on the page.

Attributes:

Name Type Description
font Font

Font for this glyph.

size float

Effective font size for this glyph.

cid int

Character ID for this glyph.

text Union[str, None]

Unicode mapping of this glyph, if any.

matrix Matrix

Rendering matrix T_rm for this glyph, which transforms text space coordinates to device space (PDF 2.0 section 9.4.4).

origin Point

Origin of this glyph in device space.

displacement Point

Vector to the origin of the next glyph in device space.

bbox Rect

Glyph bounding box in device space.

Source code in playa/content.py
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
@dataclass
class GlyphObject(ContentObject):
    """A single glyph as placed on the page.

    Attributes:
      font: Font for this glyph.
      size: Effective font size for this glyph.
      cid: Character ID for this glyph.
      text: Unicode mapping of this glyph, if any.
      matrix: Rendering matrix `T_rm` for this glyph, which transforms
              text space coordinates to device space (PDF 2.0 section
              9.4.4).
      origin: Origin of this glyph in device space.
      displacement: Vector to the origin of the next glyph in device space.
      bbox: glyph bounding box in device space.

    """

    cid: int
    text: Union[str, None]
    matrix: Matrix
    _displacement: float
    _corners: bool

    def __iter__(self) -> Iterator[ContentObject]:
        """Iterate over the contents of a Type3 glyph, if there are any.

        A glyph from a Type3 font is drawn by a content stream, so its
        paths (or whatever else it contains) can be iterated over, in
        the coordinate space defined by the text rendering matrix.

        For any other font, or when the glyph procedure cannot be
        found, the iterator is empty.
        """
        from playa.interp import Type3Interpreter

        font = self.font
        if not isinstance(font, Type3Font):
            return iter(())
        gid = font.encoding.get(self.cid)
        if gid is None:
            log.warning("Unknown CID %d in Type3 font %r", self.cid, font)
            return iter(())
        charproc = resolve1(font.charprocs.get(gid))
        if not isinstance(charproc, ContentStream):
            log.warning("CharProc %s not found in font %r ", gid, font)
            return iter(())

        # TODO: We *could* try to get and use the d1 information here
        # but if we do that, we need to do it everywhere the glyph is
        # used so that the bbox will be consistent
        return iter(
            Type3Interpreter(
                self.page,
                [charproc],
                font.resources,
                ctm=mult_matrix(font.matrix, self.matrix),
                # NOTE: no copy here because an interpreter always creates
                # a new graphics state.
                gstate=self.gstate,
            )
        )

    @property
    def font(self) -> Font:
        current = self.gstate.font
        assert current is not None
        return current

    @property
    def size(self) -> float:
        # A missing font is treated as horizontal writing
        if self.gstate.font is None:
            vertical = False
        else:
            vertical = self.gstate.font.vertical
        return _font_size(self.matrix, vertical)

    @property
    def origin(self) -> Point:
        # The translation components of the rendering matrix
        (_a, _b, _c, _d, tx, ty) = self.matrix
        return tx, ty

    @property
    def displacement(self) -> Point:
        # Equivalent to:
        # apply_matrix_norm(self.matrix,
        #                   (0, self._displacement)
        #                   if font.vertical else
        #                   (self._displacement, 0))
        a, b, c, d, _, _ = self.matrix
        # Vertical fonts advance along the second matrix row,
        # horizontal ones along the first
        ux, uy = (c, d) if self.font.vertical else (a, b)
        return ux * self._displacement, uy * self._displacement

    @property
    def bbox(self) -> Rect:
        left, bottom, right, top = self.font.char_bbox(self.cid)
        if not self._corners:
            # Two opposite corners are enough; just put them in order
            ax, ay = apply_matrix_pt(self.matrix, (left, bottom))
            bx, by = apply_matrix_pt(self.matrix, (right, top))
            if bx < ax:
                ax, bx = bx, ax
            if by < ay:
                ay, by = by, ay
            return (ax, ay, bx, by)
        # Otherwise transform all four corners and bound them
        corners = ((left, bottom), (left, top), (right, top), (right, bottom))
        return get_bound(tuple(apply_matrix_pt(self.matrix, pt) for pt in corners))

__iter__()

Possibly iterate over paths in a glyph.

For Type3 fonts, you can iterate over paths (or anything else) inside a glyph, in the coordinate space defined by the text rendering matrix.

Otherwise, you can't do that, and you get nothing.

Source code in playa/content.py
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
def __iter__(self) -> Iterator[ContentObject]:
    """Iterate over the contents of a Type3 glyph, if there are any.

    A glyph from a Type3 font is drawn by a content stream, so its
    paths (or whatever else it contains) can be iterated over, in
    the coordinate space defined by the text rendering matrix.

    For any other font, or when the glyph procedure cannot be found,
    the iterator is empty.
    """
    from playa.interp import Type3Interpreter

    font = self.font
    if not isinstance(font, Type3Font):
        return iter(())
    gid = font.encoding.get(self.cid)
    if gid is None:
        log.warning("Unknown CID %d in Type3 font %r", self.cid, font)
        return iter(())
    charproc = resolve1(font.charprocs.get(gid))
    if not isinstance(charproc, ContentStream):
        log.warning("CharProc %s not found in font %r ", gid, font)
        return iter(())

    # TODO: We *could* try to get and use the d1 information here
    # but if we do that, we need to do it everywhere the glyph is
    # used so that the bbox will be consistent
    return iter(
        Type3Interpreter(
            self.page,
            [charproc],
            font.resources,
            ctm=mult_matrix(font.matrix, self.matrix),
            # NOTE: no copy here because an interpreter always creates
            # a new graphics state.
            gstate=self.gstate,
        )
    )

GraphicState dataclass

PDF graphics state (PDF 1.7 section 8.4) including text state (PDF 1.7 section 9.3.1), but excluding coordinate transformations.

Contrary to the pretensions of pdfminer.six, the text state is for the most part not at all separate from the graphics state, and can be updated outside the confines of BT and ET operators, thus there is no advantage and only confusion that comes from treating it separately.

The only state that does not persist outside BT / ET pairs is the text coordinate space (line matrix and text rendering matrix), and it is also the only part that is updated during iteration over a TextObject.

For historical reasons the main coordinate transformation matrix, though it is also part of the graphics state, is also stored separately.

Attributes:

Name Type Description
clipping_path None

The current clipping path (sec. 8.5.4)

linewidth float

Line width in user space units (sec. 8.4.3.2)

linecap int

Line cap style (sec. 8.4.3.3)

linejoin int

Line join style (sec. 8.4.3.4)

miterlimit float

Maximum length of mitered line joins (sec. 8.4.3.5)

dash DashPattern

Dash pattern for stroking (sec 8.4.3.6)

intent PSLiteral

Rendering intent (sec. 8.6.5.8)

stroke_adjustment bool

A flag specifying whether to compensate for possible rasterization effects when stroking a path with a line width that is small relative to the pixel resolution of the output device (sec. 10.7.5)

blend_mode Union[PSLiteral, List[PSLiteral]]

The current blend mode that shall be used in the transparent imaging model (sec. 11.3.5)

smask Union[None, Dict[str, PDFObject]]

A soft-mask dictionary (sec. 11.6.5.1) or None

salpha float

The constant shape or constant opacity value used for stroking operations (sec. 11.3.7.2 & 11.6.4.4)

nalpha float

The constant shape or constant opacity value used for non-stroking operations

alpha_source bool

A flag specifying whether the current soft mask and alpha constant parameters shall be interpreted as shape values (true) or opacity values (false). This flag also governs the interpretation of the SMask entry, if any, in an image dictionary

black_pt_comp PSLiteral

The black point compensation algorithm that shall be used when converting CIE-based colours (sec. 8.6.5.9)

flatness float

The precision with which curves shall be rendered on the output device (sec. 10.6.2)

scolor Color

Colour used for stroking operations

scs ColorSpace

Colour space used for stroking operations

ncolor Color

Colour used for non-stroking operations

ncs ColorSpace

Colour space used for non-stroking operations

font Union[Font, None]

The current font.

fontsize float

The "font size" parameter, which is not the font size in points as you might understand it, but rather a scaling factor applied to text space (so, it affects not only text size but position as well). Since most reasonable people find that behaviour rather confusing, this is often just 1.0, and PDFs rely on the text matrix to set the size of text.

charspace float

Extra spacing to add after each glyph, expressed in unscaled text space units, meaning it is not affected by fontsize. BUT it will be modified by scaling for horizontal writing mode (so, most of the time).

wordspace float

Extra spacing to add after a space glyph, defined very specifically as the glyph encoded by the single-byte character code 32 (SPOILER: it is probably a space). Also expressed in unscaled text space units, but modified by scaling.

scaling float

The horizontal scaling factor as defined by the PDF standard (that is, divided by 100).

leading float

The leading as defined by the PDF standard, in unscaled text space units.

render_mode int

The PDF rendering mode. The really important one here is 3, which means "don't render the text". You might want to use this to detect invisible text.

rise float

The text rise (superscript or subscript position), in unscaled text space units.

knockout bool

The text knockout flag, shall determine the behaviour of overlapping glyphs within a text object in the transparent imaging model (sec. 9.3.8)

Source code in playa/content.py
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
@dataclass
class GraphicState:
    """PDF graphics state (PDF 1.7 section 8.4) including text state
    (PDF 1.7 section 9.3.1), but excluding coordinate transformations.

    Contrary to the pretensions of pdfminer.six, the text state is for
    the most part not at all separate from the graphics state, and can
    be updated outside the confines of `BT` and `ET` operators, thus
    there is no advantage and only confusion that comes from treating
    it separately.

    The only state that does not persist outside `BT` / `ET` pairs is
    the text coordinate space (line matrix and text rendering matrix),
    and it is also the only part that is updated during iteration over
    a `TextObject`.

    For historical reasons the main coordinate transformation matrix,
    though it is also part of the graphics state, is also stored
    separately.

    Attributes:
      clipping_path: The current clipping path (sec. 8.5.4)
      linewidth: Line width in user space units (sec. 8.4.3.2)
      linecap: Line cap style (sec. 8.4.3.3)
      linejoin: Line join style (sec. 8.4.3.4)
      miterlimit: Maximum length of mitered line joins (sec. 8.4.3.5)
      dash: Dash pattern for stroking (sec 8.4.3.6)
      intent: Rendering intent (sec. 8.6.5.8)
      stroke_adjustment: A flag specifying whether to compensate for
        possible rasterization effects when stroking a path with a line
        width that is small relative to the pixel resolution of the output
        device (sec. 10.7.5)
      blend_mode: The current blend mode that shall be used in the
        transparent imaging model (sec. 11.3.5)
      smask: A soft-mask dictionary (sec. 11.6.5.1) or None
      salpha: The constant shape or constant opacity value used for
        stroking operations (sec. 11.3.7.2 & 11.6.4.4)
      nalpha: The constant shape or constant opacity value used for
        non-stroking operations
      alpha_source: A flag specifying whether the current soft mask and
        alpha constant parameters shall be interpreted as shape values
        (true) or opacity values (false). This flag also governs the
        interpretation of the SMask entry, if any, in an image dictionary
      black_pt_comp: The black point compensation algorithm that shall be
        used when converting CIE-based colours (sec. 8.6.5.9)
      flatness: The precision with which curves shall be rendered on
        the output device (sec. 10.6.2)
      scolor: Colour used for stroking operations
      scs: Colour space used for stroking operations
      ncolor: Colour used for non-stroking operations
      ncs: Colour space used for non-stroking operations
      font: The current font.
      fontsize: The "font size" parameter, which is **not** the font
        size in points as you might understand it, but rather a
        scaling factor applied to text space (so, it affects not only
        text size but position as well).  Since most reasonable people
        find that behaviour rather confusing, this is often just 1.0,
        and PDFs rely on the text matrix to set the size of text.
      charspace: Extra spacing to add after each glyph, expressed in
        unscaled text space units, meaning it is not affected by
        `fontsize`.  **BUT** it will be modified by `scaling` for
        horizontal writing mode (so, most of the time).
      wordspace: Extra spacing to add after a space glyph, defined
        very specifically as the glyph encoded by the single-byte
        character code 32 (SPOILER: it is probably a space).  Also
        expressed in unscaled text space units, but modified by
        `scaling`.
      scaling: The horizontal scaling factor as defined by the PDF
        standard (that is, divided by 100).
      leading: The leading as defined by the PDF standard, in unscaled
        text space units.
      render_mode: The PDF rendering mode.  The really important one
        here is 3, which means "don't render the text".  You might
        want to use this to detect invisible text.
      rise: The text rise (superscript or subscript position), in
        unscaled text space units.
      knockout: The text knockout flag, shall determine the behaviour of
        overlapping glyphs within a text object in the transparent imaging
        model (sec. 9.3.8)

    """

    # Every field has a default, so a bare GraphicState() can be
    # constructed.  Field order is part of the positional constructor
    # signature generated by @dataclass, so do not reorder.

    # --- Path, stroking and compositing state ---
    clipping_path: None = None  # TODO
    linewidth: float = 1
    linecap: int = 0
    linejoin: int = 0
    miterlimit: float = 10
    dash: DashPattern = SOLID_LINE
    intent: PSLiteral = LITERAL_RELATIVE_COLORIMETRIC
    stroke_adjustment: bool = False
    blend_mode: Union[PSLiteral, List[PSLiteral]] = LITERAL_NORMAL
    smask: Union[None, Dict[str, PDFObject]] = None
    salpha: float = 1
    nalpha: float = 1
    alpha_source: bool = False
    black_pt_comp: PSLiteral = LITERAL_DEFAULT
    flatness: float = 1
    # --- Colours and colour spaces ("s" = stroking, "n" = non-stroking) ---
    scolor: Color = BASIC_BLACK
    scs: ColorSpace = PREDEFINED_COLORSPACE["DeviceGray"]
    ncolor: Color = BASIC_BLACK
    ncs: ColorSpace = PREDEFINED_COLORSPACE["DeviceGray"]
    # --- Text state ---
    font: Union[Font, None] = None
    fontsize: float = 0
    charspace: float = 0
    wordspace: float = 0
    # NOTE(review): the docstring describes `scaling` as "divided by 100",
    # yet the default here is 100, which looks like a percentage -- confirm
    # which unit consumers of this field expect.
    scaling: float = 100
    leading: float = 0
    render_mode: int = 0
    rise: float = 0
    knockout: bool = True

ImageObject dataclass

Bases: ContentObject

An image (either inline or XObject).

Attributes:

Name Type Description
xobjid Union[str, None]

Name of XObject (or None for inline images).

srcsize Tuple[int, int]

Size of source image in pixels.

bits int

Number of bits per component, if required (otherwise 1).

imagemask bool

True if the image is a mask.

stream ContentStream

Content stream with image data.

colorspace Union[ColorSpace, None]

Colour space for this image, if required (otherwise None).

Source code in playa/content.py
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
@dataclass
class ImageObject(ContentObject):
    """An image (either inline or XObject).

    The entries of the underlying stream can be read through this
    object with `in`, indexing and `get`, all of which delegate to
    `stream`.

    Attributes:
      xobjid: Name of XObject (or None for inline images).
      srcsize: Size of source image in pixels.
      bits: Number of bits per component, if required (otherwise 1).
      imagemask: True if the image is a mask.
      stream: Content stream with image data.
      colorspace: Colour space for this image, if required (otherwise
        None).
    """

    xobjid: Union[str, None]
    srcsize: Tuple[int, int]
    bits: int
    imagemask: bool
    stream: ContentStream
    colorspace: Union[ColorSpace, None]

    def __contains__(self, name: str) -> bool:
        """Return whether `name` is in the underlying stream."""
        return name in self.stream

    def __getitem__(self, name: str) -> PDFObject:
        """Look up `name` in the underlying stream."""
        return self.stream[name]

    def get(self, name: str, default: PDFObject = None) -> PDFObject:
        """Look up `name` in the underlying stream, with a default."""
        return self.stream.get(name, default)

    def __len__(self) -> int:
        """Even though you can __getitem__ from an image you cannot iterate
        over its keys, sorry about that.  Returns zero."""
        return 0

    @property
    def parent(self) -> Union["Element", None]:
        """The enclosing logical structure element, if any."""
        from playa.structure import Element

        # The result (including None) is cached on the instance as `_parent`.
        if hasattr(self, "_parent"):
            return self._parent
        self._parent = None
        if self._parentkey is None:
            return self._parent
        # No structure, no parent!
        if self.doc.structure is None:
            return self._parent
        try:
            parent = resolve1(self.doc.structure.parent_tree[self._parentkey])
            if isinstance(parent, dict):
                # The parent tree entry is itself the element dictionary.
                self._parent = Element.from_dict(self.doc, parent)
            else:
                # Anything else: discard the cached None so that the
                # base-class lookup (by marked content ID) runs from
                # scratch and populates the cache itself.
                del self._parent
                return super().parent
        except IndexError:
            # Best effort: an IndexError raised anywhere in the block
            # above (the fallback included) is swallowed, and whatever
            # is cached at that point is returned.
            pass
        return self._parent

    @property
    def buffer(self) -> bytes:
        """Binary stream content for this image"""
        return self.stream.buffer

    @property
    def bbox(self) -> Rect:
        """The bounding box of this image: the unit square (0, 0, 1, 1)
        transformed by `ctm`."""
        # NOTE(review): "8.3.24" below may be a typo for 8.3.2.4 -- confirm
        # against the specification.
        # PDF 1.7 sec 8.3.24: All images shall be 1 unit wide by 1
        # unit high in user space, regardless of the number of samples
        # in the image. To be painted, an image shall be mapped to a
        # region of the page by temporarily altering the CTM.
        return transform_bbox(self.ctm, (0, 0, 1, 1))

buffer property

Binary stream content for this image

parent property

The enclosing logical structure element, if any.

__len__()

Even though you can use `__getitem__` on an image, you cannot iterate over its keys, sorry about that. Returns zero.

Source code in playa/content.py
403
404
405
406
def __len__(self) -> int:
    """Report a length of zero.

    Items can be looked up by name with `__getitem__`, but the keys
    cannot be iterated over, so this always returns zero.
    """
    return 0

MarkedContent

Bases: NamedTuple

Marked content point or section in a PDF page or Form XObject.

Attributes:

Name Type Description
mcid Union[int, None]

Marked content section ID, or None for a marked content point.

tag str

Name of tag for this marked content.

props Dict[str, PDFObject]

Marked content property dictionary.

Source code in playa/content.py
201
202
203
204
205
206
207
208
209
210
211
212
213
class MarkedContent(NamedTuple):
    """A marked content point or section in a PDF page or Form XObject.

    Attributes:
      mcid: ID of the marked content section; `None` when this is a
        marked content point rather than a section.
      tag: Tag name of this marked content.
      props: Property dictionary of this marked content.

    """

    mcid: Union[int, None]
    tag: str
    props: Dict[str, PDFObject]

PathObject dataclass

Bases: ContentObject

A path object.

Attributes:

Name Type Description
raw_segments List[PathSegment]

Segments in path (in user space).

stroke bool

True if the outline of the path is stroked.

fill bool

True if the path is filled.

evenodd bool

True if the filling of complex paths uses the even-odd winding rule, False if the non-zero winding number rule is used (PDF 1.7 section 8.5.3.3)

Source code in playa/content.py
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
@dataclass
class PathObject(ContentObject):
    """A path object.

    Attributes:
      raw_segments: The path's segments, expressed in user space.
      stroke: True when the path's outline is stroked.
      fill: True when the path is filled.
      evenodd: True when complex paths are filled using the even-odd
        winding rule, False when the non-zero winding number rule
        applies (PDF 1.7 section 8.5.3.3)
    """

    raw_segments: List[PathSegment]
    stroke: bool
    fill: bool
    evenodd: bool

    @property
    def segments(self) -> Iterator[PathSegment]:
        """Lazily produce the path segments mapped into device space.

        Each raw segment keeps its operator; its points are passed
        through the CTM with `apply_matrix_pt`.
        """
        return (
            PathSegment(
                raw.operator,
                tuple(apply_matrix_pt(self.ctm, pt) for pt in raw.points),
            )
            for raw in self.raw_segments
        )

    @property
    def bbox(self) -> Rect:
        """Bounding box of the path in device space, as spanned by all
        of its points and control points."""
        # Cheap step first: bound every point of every segment while
        # still in user space...
        user_space_points = (
            pt for raw in self.raw_segments for pt in raw.points
        )
        user_space_bbox = get_bound(user_space_points)
        # ...then transform that single box and take its new bounds.
        return transform_bbox(self.ctm, user_space_bbox)

bbox property

Get bounding box of path in device space as defined by its points and control points.

segments property

Get path segments in device space.

PathSegment

Bases: NamedTuple

Segment in a PDF graphics path.

Source code in playa/content.py
219
220
221
222
223
224
225
class PathSegment(NamedTuple):
    """One segment of a PDF graphics path: an operator and its points."""

    operator: PathOperator
    points: Tuple[Point, ...]

TagObject dataclass

Bases: ContentObject

A marked content tag.

Source code in playa/content.py
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
@dataclass
class TagObject(ContentObject):
    """A marked content tag."""

    _mcs: MarkedContent

    def __len__(self) -> int:
        """Always zero: a tag has no contents, so iterating over it
        yields nothing."""
        return 0

    @property
    def mcs(self) -> MarkedContent:
        """The marked content tag carried by this object."""
        return self._mcs

    @property
    def mcid(self) -> Union[int, None]:
        """The marked content ID of the nearest enclosing marked
        content section that has one.

        The tag's own ID wins when it is set; otherwise the lookup is
        delegated to the superclass.
        """
        own_id = self._mcs.mcid
        return super().mcid if own_id is None else own_id

    @property
    def bbox(self) -> Rect:
        """A tag has no content and therefore no bounding box.

        To keep user code simple this returns `BBOX_NONE` rather than
        `None` or raising an exception.  Since that is one specific
        object, it can be reliably detected with an identity check:

            if obj.bbox is BBOX_NONE:
                ...
        """
        return BBOX_NONE

bbox property

A tag has no content and thus no bounding box.

To avoid needlessly complicating user code this returns BBOX_NONE instead of None or throwing an exception. Because that is a specific object, you can reliably check for it with:

if obj.bbox is BBOX_NONE:
    ...

mcid property

The marked content ID of the nearest enclosing marked content section with an ID.

mcs property

The marked content tag for this object.

__len__()

A tag has no contents, iterating over it returns nothing.

Source code in playa/content.py
341
342
343
def __len__(self) -> int:
    """A tag has no contents, iterating over it returns nothing."""
    return 0

TextObject dataclass

Bases: ContentObject

Text object (contains one or more glyphs).

Attributes:

matrix: Initial rendering matrix T_rm for this text object, which transforms text space coordinates to device space (PDF 2.0 section 9.4.4).

origin: Origin of this text object in device space.

displacement: Vector to the origin of the next text object in device space.

size: Effective font size for this text object.

text_matrix: Text matrix T_m for this text object, which transforms text space coordinates to user space.

line_matrix: Text line matrix T_lm for this text object, which is the text matrix at the beginning of the "current line" (PDF 2.0 section 9.4.1). Note that this is not reliable for detecting line breaks.

scaling_matrix: The anonymous but rather important matrix which applies font size, horizontal scaling and rise to obtain the rendering matrix (PDF 2.0 sec 9.4.4).

args: Strings or position adjustments.

bbox: Text bounding box in device space.

Source code in playa/content.py
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
@dataclass
class TextObject(ContentObject):
    """Text object (contains one or more glyphs).

    Attributes:
      matrix: Initial rendering matrix `T_rm` for this text object,
        which transforms text space coordinates to device space
        (PDF 2.0 section 9.4.4).
      origin: Origin of this text object in device space.
      displacement: Vector to the origin of the next text object in
        device space.
      size: Effective font size for this text object.
      text_matrix: Text matrix `T_m` for this text object, which
        transforms text space coordinates to user space.
      line_matrix: Text line matrix `T_lm` for this text object, which
        is the text matrix at the beginning of the "current line"
        (PDF 2.0 section 9.4.1).  Note that this is **not** reliable
        for detecting line breaks.
      scaling_matrix: The anonymous but rather important matrix which
        applies font size, horizontal scaling and rise to obtain the
        rendering matrix (PDF 2.0 sec 9.4.4).
      args: Strings or position adjustments.
      bbox: Text bounding box in device space.
    """

    args: List[Union[bytes, float]]
    line_matrix: Matrix
    _glyph_offset: Point

    # Lazily computed caches, filled in on first use.
    _matrix: Union[Matrix, None] = None
    _chars: Union[List[str], None] = None
    _bbox: Union[Rect, None] = None
    _next_glyph_offset: Union[Point, None] = None

    def __iter__(self) -> Iterator[GlyphObject]:
        """Generate glyphs for this text object.

        Numeric entries in `args` shift the current position; every
        other entry is decoded with the current font and yields one
        `GlyphObject` per `(cid, text)` pair.  If no font is set, a
        warning is logged and nothing is yielded.

        Side effect: once the generator has run to completion (or
        bailed out for lack of a font), `_next_glyph_offset` is
        recorded unless it was already known.
        """
        glyph_offset = self._glyph_offset
        font = self.gstate.font
        # If no font is set, we cannot do anything, since even calling
        # TJ with a displacement and no text effects requires us at
        # least to know the fontsize.
        if font is None:
            log.warning(
                "No font is set, will not update text state or output text: %r TJ",
                self.args,
            )
            self._next_glyph_offset = glyph_offset
            return
        assert self.ctm is not None

        tlm_ctm = mult_matrix(self.line_matrix, self.ctm)
        # Pre-determine if we need to recompute the bound for rotated glyphs
        a, b, c, d, _, _ = tlm_ctm
        corners = b * d < 0 or a * c < 0
        fontsize = self.gstate.fontsize
        horizontal_scaling = self.gstate.scaling * 0.01
        # PDF 2.0 section 9.3.2: The character-spacing parameter, Tc,
        # shall be a number specified in unscaled text space units
        # (although it shall be subject to scaling by the Th parameter
        # if the writing mode is horizontal).
        #
        # The division by fontsize is undone below, where the whole
        # displacement is multiplied by fontsize again.
        # NOTE(review): a font size of zero would raise
        # ZeroDivisionError here -- confirm whether that can occur.
        scaled_charspace = self.gstate.charspace / fontsize
        # Section 9.3.3: Word spacing "works the same way"
        scaled_wordspace = self.gstate.wordspace / fontsize

        # PDF 2.0 section 9.4.4: Conceptually, the entire
        # transformation from text space to device space can be
        # represented by a text rendering matrix, T_rm:
        #
        # (scaling_matrix @ text_matrix @ glyph.ctm)
        #
        # Note that scaling_matrix and text_matrix are constant across
        # glyphs in a TextObject, and scaling_matrix is always
        # diagonal (thus the mult_matrix call below can be optimized)
        scaling_matrix = (
            fontsize * horizontal_scaling,
            0,
            0,
            fontsize,
            0,
            self.gstate.rise,
        )
        vert = font.vertical
        # FIXME: THIS IS NOT TRUE!!!  We need a test for it though.
        if font.multibyte:
            scaled_wordspace = 0
        (x, y) = glyph_offset
        # Only one coordinate advances: y for vertical writing, x otherwise.
        pos = y if vert else x
        for obj in self.args:
            if isinstance(obj, (int, float)):
                # Position adjustment, given in thousandths of a unit.
                pos -= obj * 0.001 * fontsize * horizontal_scaling
            else:
                for cid, text in font.decode(obj):
                    glyph_offset = (x, pos) if vert else (pos, y)
                    disp = font.vdisp(cid) if vert else font.hdisp(cid)
                    disp += scaled_charspace
                    # Word spacing is added for character code 32 only.
                    if cid == 32:
                        disp += scaled_wordspace
                    matrix = mult_matrix(
                        scaling_matrix, translate_matrix(tlm_ctm, glyph_offset)
                    )
                    glyph = GlyphObject(
                        _pageref=self._pageref,
                        _parentkey=self._parentkey,
                        gstate=self.gstate,
                        ctm=self.ctm,
                        mcstack=self.mcstack,
                        cid=cid,
                        text=text,
                        matrix=matrix,
                        _displacement=disp,
                        _corners=corners,
                    )
                    yield glyph
                    # This implements the proper scaling of charspace/wordspace
                    if vert:
                        pos += disp * fontsize
                    else:
                        pos += disp * fontsize * horizontal_scaling
        glyph_offset = (x, pos) if vert else (pos, y)
        if self._next_glyph_offset is None:
            self._next_glyph_offset = glyph_offset

    def _get_next_glyph_offset(self) -> Point:
        """Update only the glyph offset without calculating anything else.

        Returns the cached offset when there is one; otherwise walks
        `args` to find where the next text object starts, caches that
        and returns it.  With no font set (a warning is logged) or no
        `args`, the offset is this object's own `_glyph_offset`.
        """
        if self._next_glyph_offset is not None:
            return self._next_glyph_offset
        font = self.gstate.font
        fontsize = self.gstate.fontsize
        if font is None:
            log.warning(
                "No font is set, will not update text state or output text: %r TJ",
                self.args,
            )
            self._next_glyph_offset = self._glyph_offset
            return self._next_glyph_offset
        if len(self.args) == 0:
            self._next_glyph_offset = self._glyph_offset
            return self._next_glyph_offset

        horizontal_scaling = self.gstate.scaling * 0.01
        charspace = self.gstate.charspace
        wordspace = self.gstate.wordspace
        vert = font.vertical
        if font.multibyte:
            wordspace = 0
        (x, y) = self._glyph_offset
        pos = y if vert else x
        if not vert:
            # Scale charspace and wordspace, PDF 2.0 section 9.3.2
            charspace *= horizontal_scaling
            wordspace *= horizontal_scaling
        for obj in self.args:
            if isinstance(obj, (int, float)):
                # Position adjustment, given in thousandths of a unit.
                pos -= obj * 0.001 * fontsize * horizontal_scaling
            else:
                for cid, _ in font.decode(obj):
                    x, y = (x, pos) if vert else (pos, y)
                    if vert:
                        assert isinstance(font, CIDFont)
                        pos += font.vdisp(cid) * fontsize
                    else:
                        hdisp = font.hdisp(cid)
                        pos += hdisp * fontsize * horizontal_scaling
                    pos += charspace
                    # Word spacing is added for character code 32 only.
                    if cid == 32:
                        pos += wordspace
        self._next_glyph_offset = (x, pos) if vert else (pos, y)
        return self._next_glyph_offset

    @property
    def matrix(self) -> Matrix:
        """Initial rendering matrix for this text object (cached).

        Computed as `scaling_matrix @ text_matrix @ ctm`.
        """
        if self._matrix is not None:
            return self._matrix
        self._matrix = mult_matrix(
            self.scaling_matrix, mult_matrix(self.text_matrix, self.ctm)
        )
        return self._matrix

    @property
    def size(self) -> float:
        """Effective font size, derived from the rendering matrix.

        Treated as horizontal writing when no font is set.
        """
        vert = False if self.gstate.font is None else self.gstate.font.vertical
        return _font_size(self.matrix, vert)

    @property
    def scaling_matrix(self) -> Matrix:
        """Diagonal matrix applying font size, horizontal scaling and
        rise (PDF 2.0 sec 9.4.4)."""
        horizontal_scaling = self.gstate.scaling * 0.01
        fontsize = self.gstate.fontsize
        return (
            fontsize * horizontal_scaling,
            0,
            0,
            fontsize,
            0,
            self.gstate.rise,
        )

    @property
    def text_matrix(self) -> Matrix:
        """Text matrix: the line matrix translated by this object's
        glyph offset."""
        return translate_matrix(self.line_matrix, self._glyph_offset)

    @property
    def origin(self) -> Point:
        """Origin of this text object: the translation part of `matrix`."""
        _, _, _, _, dx, dy = self.matrix
        return dx, dy

    @property
    def displacement(self) -> Point:
        """Vector from this object's origin to that of the next text
        object."""
        matrix = self.matrix
        # FIXME: This should be either cached or optimized
        next_matrix = mult_matrix(
            self.scaling_matrix,
            mult_matrix(
                translate_matrix(self.line_matrix, self._get_next_glyph_offset()),
                self.ctm,
            ),
        )
        # Only the translation components (last two entries) differ.
        return next_matrix[-2] - matrix[-2], next_matrix[-1] - matrix[-1]

    @property
    def bbox(self) -> Rect:
        """Text bounding box in device space (cached).

        Returns `BBOX_NONE` when no font is set (a warning is logged)
        or when `args` is empty.  As a side effect the next glyph
        offset is recorded if it was not already known.
        """
        # We specialize this to avoid it having side effects on the
        # text state (already it's a bit of a footgun that __iter__
        # does that...), but also because we know all glyphs have the
        # same text matrix and thus we can avoid a lot of multiply
        if self._bbox is not None:
            return self._bbox
        matrix = mult_matrix(self.line_matrix, self.ctm)
        font = self.gstate.font
        fontsize = self.gstate.fontsize
        rise = self.gstate.rise
        if font is None:
            log.warning(
                "No font is set, will not update text state or output text: %r TJ",
                self.args,
            )
            self._bbox = BBOX_NONE
            self._next_glyph_offset = self._glyph_offset
            return self._bbox
        if len(self.args) == 0:
            self._bbox = BBOX_NONE
            self._next_glyph_offset = self._glyph_offset
            return self._bbox

        horizontal_scaling = self.gstate.scaling * 0.01
        charspace = self.gstate.charspace
        wordspace = self.gstate.wordspace
        vert = font.vertical
        if font.multibyte:
            wordspace = 0
        (x, y) = self._glyph_offset
        pos = y if vert else x
        # Start from a degenerate box at the (rise-adjusted) offset and
        # grow it as glyphs are visited.
        x0 = x1 = x
        y0 = y1 = y + rise
        fast_path = False
        if not vert:
            # Scale charspace and wordspace, PDF 2.0 section 9.3.2
            charspace *= horizontal_scaling
            wordspace *= horizontal_scaling
            # Detect the most frequent case, horizontal writing with
            # diagonal font.matrix
            _, b, c, d, _, _ = font.matrix
            if b == 0 and c == 0:
                fast_path = True
                # Vertical extent is the same for every glyph here, so
                # it is fixed once from the font's descent and ascent.
                y0 += d * font.descent * fontsize
                y1 += d * font.ascent * fontsize
        for obj in self.args:
            if isinstance(obj, (int, float)):
                # Position adjustment, given in thousandths of a unit.
                pos -= obj * 0.001 * fontsize * horizontal_scaling
            else:
                for cid, _ in font.decode(obj):
                    x, y = (x, pos) if vert else (pos, y)
                    if vert:
                        assert isinstance(font, CIDFont)
                        pos += font.vdisp(cid) * fontsize
                    else:
                        hdisp = font.hdisp(cid)
                        pos += hdisp * fontsize * horizontal_scaling
                    if fast_path:
                        # Right edge simply follows the advancing position.
                        x1 = pos
                    else:
                        gx0, gy0, gx1, gy1 = font.char_bbox(cid)
                        gx0 *= fontsize * horizontal_scaling
                        gx1 *= fontsize * horizontal_scaling
                        gy0 *= fontsize
                        gy0 += rise
                        gy1 *= fontsize
                        gy1 += rise
                        x0 = min(x0, x + gx0)
                        y0 = min(y0, y + gy0)
                        x1 = max(x1, x + gx1)
                        y1 = max(y1, y + gy1)
                    pos += charspace
                    # Word spacing is added for character code 32 only.
                    if cid == 32:
                        pos += wordspace
        # Update this because we can!
        if self._next_glyph_offset is None:
            self._next_glyph_offset = (x, pos) if vert else (pos, y)
        self._bbox = transform_bbox(matrix, (x0, y0, x1, y1))
        return self._bbox

    @property
    def chars(self) -> str:
        """Get the Unicode characters (in stream order) for this object.

        The decoded characters are cached.  If no font is set the
        result is the empty string, in line with `__iter__` (which
        yields no glyphs in that case).
        """
        if self._chars is not None:
            return "".join(self._chars)
        font = self.gstate.font
        if font is None:
            # Previously guarded by an `assert`, which is stripped under
            # `python -O` and left a stale empty cache behind when it
            # fired.  No font simply means no text.
            self._chars = []
            return ""
        # Only byte strings carry text; numbers are position adjustments.
        self._chars = [
            text
            for obj in self.args
            if isinstance(obj, bytes)
            for _, text in font.decode(obj)
        ]
        return "".join(self._chars)

    def __len__(self) -> int:
        """Return the number of glyphs that would result from iterating over
        this object.

        Important: this is the number of glyphs, *not* the number of
        Unicode characters.  If no font is set the length is zero,
        matching `__iter__`, which yields nothing in that case.
        """
        font = self.gstate.font
        if font is None:
            return 0
        # Only byte strings produce glyphs; numbers are position adjustments.
        return sum(
            1
            for obj in self.args
            if isinstance(obj, bytes)
            for _ in font.decode(obj)
        )

chars property

Get the Unicode characters (in stream order) for this object.

__iter__()

Generate glyphs for this text object

Source code in playa/content.py
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
def __iter__(self) -> Iterator[GlyphObject]:
    """Generate glyphs for this text object

    Numeric entries in `self.args` shift the current position; every
    other entry is decoded with the current font and yields one
    `GlyphObject` per `(cid, text)` pair.  If no font is set, a
    warning is logged and nothing is yielded.

    Side effect: once the generator has run to completion (or bailed
    out for lack of a font), `self._next_glyph_offset` is recorded
    unless it was already set.
    """
    glyph_offset = self._glyph_offset
    font = self.gstate.font
    # If no font is set, we cannot do anything, since even calling
    # TJ with a displacement and no text effects requires us at
    # least to know the fontsize.
    if font is None:
        log.warning(
            "No font is set, will not update text state or output text: %r TJ",
            self.args,
        )
        self._next_glyph_offset = glyph_offset
        return
    assert self.ctm is not None

    tlm_ctm = mult_matrix(self.line_matrix, self.ctm)
    # Pre-determine if we need to recompute the bound for rotated glyphs
    a, b, c, d, _, _ = tlm_ctm
    corners = b * d < 0 or a * c < 0
    fontsize = self.gstate.fontsize
    horizontal_scaling = self.gstate.scaling * 0.01
    # PDF 2.0 section 9.3.2: The character-spacing parameter, Tc,
    # shall be a number specified in unscaled text space units
    # (although it shall be subject to scaling by the Th parameter
    # if the writing mode is horizontal).
    #
    # The division by fontsize is undone below, where the whole
    # displacement is multiplied by fontsize again.
    # NOTE(review): a font size of zero would raise ZeroDivisionError
    # here -- confirm whether that can occur.
    scaled_charspace = self.gstate.charspace / fontsize
    # Section 9.3.3: Word spacing "works the same way"
    scaled_wordspace = self.gstate.wordspace / fontsize

    # PDF 2.0 section 9.4.4: Conceptually, the entire
    # transformation from text space to device space can be
    # represented by a text rendering matrix, T_rm:
    #
    # (scaling_matrix @ text_matrix @ glyph.ctm)
    #
    # Note that scaling_matrix and text_matrix are constant across
    # glyphs in a TextObject, and scaling_matrix is always
    # diagonal (thus the mult_matrix call below can be optimized)
    scaling_matrix = (
        fontsize * horizontal_scaling,
        0,
        0,
        fontsize,
        0,
        self.gstate.rise,
    )
    vert = font.vertical
    # FIXME: THIS IS NOT TRUE!!!  We need a test for it though.
    if font.multibyte:
        scaled_wordspace = 0
    (x, y) = glyph_offset
    # Only one coordinate advances: y for vertical writing, x otherwise.
    pos = y if vert else x
    for obj in self.args:
        if isinstance(obj, (int, float)):
            # Position adjustment, given in thousandths of a unit.
            pos -= obj * 0.001 * fontsize * horizontal_scaling
        else:
            for cid, text in font.decode(obj):
                glyph_offset = (x, pos) if vert else (pos, y)
                disp = font.vdisp(cid) if vert else font.hdisp(cid)
                disp += scaled_charspace
                # Word spacing is added for character code 32 only.
                if cid == 32:
                    disp += scaled_wordspace
                matrix = mult_matrix(
                    scaling_matrix, translate_matrix(tlm_ctm, glyph_offset)
                )
                glyph = GlyphObject(
                    _pageref=self._pageref,
                    _parentkey=self._parentkey,
                    gstate=self.gstate,
                    ctm=self.ctm,
                    mcstack=self.mcstack,
                    cid=cid,
                    text=text,
                    matrix=matrix,
                    _displacement=disp,
                    _corners=corners,
                )
                yield glyph
                # This implements the proper scaling of charspace/wordspace
                if vert:
                    pos += disp * fontsize
                else:
                    pos += disp * fontsize * horizontal_scaling
    # Where the next text object would start.
    glyph_offset = (x, pos) if vert else (pos, y)
    if self._next_glyph_offset is None:
        self._next_glyph_offset = glyph_offset

__len__()

Return the number of glyphs that would result from iterating over this object.

Important: this is the number of glyphs, not the number of Unicode characters.

Source code in playa/content.py
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
def __len__(self) -> int:
    """Return the number of glyphs that would result from iterating over
    this object.

    Important: this is the number of glyphs, *not* the number of
    Unicode characters.  If no font is set the length is zero, since
    iterating over the object yields nothing in that case.
    """
    font = self.gstate.font
    if font is None:
        # Previously guarded by an `assert`, which is stripped under
        # `python -O` and would then fail on `None.decode` instead.
        return 0
    # Only byte strings produce glyphs; numbers are position adjustments.
    return sum(
        1
        for obj in self.args
        if isinstance(obj, bytes)
        for _ in font.decode(obj)
    )

XObjectObject dataclass

Bases: ContentObject

An eXternal Object, in the context of a page.

There are a couple of kinds of XObjects. Here we are only concerned with "Form XObjects" which, despite their name, have nothing at all to do with fillable forms. Instead they are like little embeddable PDF pages, possibly with their own resources, definitely with their own definition of "user space".

Image XObjects are handled by ImageObject.

Attributes:

Name Type Description
xobjid str

Name of this XObject (in the page resources).

stream ContentStream

Content stream with PDF operators.

resources Union[None, Dict[str, PDFObject]]

Resources specific to this XObject, if any.

group Union[None, Dict[str, PDFObject]]

Transparency group, if any.

Source code in playa/content.py
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
@dataclass
class XObjectObject(ContentObject):
    """An eXternal Object, in the context of a page.

    There are a couple of kinds of XObjects.  Here we are only
    concerned with "Form XObjects" which, despite their name, have
    nothing at all to do with fillable forms.  Instead they are like
    little embeddable PDF pages, possibly with their own resources,
    definitely with their own definition of "user space".

    Image XObjects are handled by `ImageObject`.

    Membership tests (`name in xobj`) and item access (`xobj[name]`)
    are forwarded to the underlying `stream`.  Iterating over the
    object runs a `LazyInterpreter` over its content stream.

    Attributes:
      xobjid: Name of this XObject (in the page resources).
      stream: Content stream with PDF operators.
      resources: Resources specific to this XObject, if any.
      group: Transparency group, if any.
    """

    xobjid: str
    stream: ContentStream
    resources: Union[None, Dict[str, PDFObject]]
    group: Union[None, Dict[str, PDFObject]]

    def __contains__(self, name: str) -> bool:
        """Test whether `name` is present in the underlying stream."""
        return name in self.stream

    def __getitem__(self, name: str) -> PDFObject:
        """Look up `name` in the underlying stream."""
        return self.stream[name]

    @property
    def bbox(self) -> Rect:
        """Get the bounding box of this XObject in device space.

        This is the stream's `BBox` entry transformed by this
        object's `ctm`.  If the stream has no `BBox` entry, a debug
        message is logged and the page's `cropbox` is returned
        instead.
        """
        # It is a required attribute!
        if "BBox" not in self.stream:
            log.debug("XObject %r has no BBox: %r", self.xobjid, self.stream)
            return self.page.cropbox
        return transform_bbox(self.ctm, rect_value(self.stream["BBox"]))

    @property
    def buffer(self) -> bytes:
        """Raw stream content for this XObject"""
        return self.stream.buffer

    @property
    def tokens(self) -> Iterator[Token]:
        """Iterate over tokens in the XObject's content stream.

        Each iterator obtained from this property uses its own
        `ContentParser`; the positions reported by the parser are
        discarded and only the tokens are yielded.
        """
        parser = ContentParser([self.stream])
        while True:
            try:
                pos, tok = parser.nexttoken()
            except StopIteration:
                # The parser has no more tokens: end the iteration.
                return
            yield tok

    @property
    def contents(self) -> Iterator[PDFObject]:
        """Iterator over PDF objects in the content stream.

        Positions reported by the parser are discarded.
        """
        for pos, obj in ContentParser([self.stream]):
            yield obj

    def __iter__(self) -> Iterator["ContentObject"]:
        """Interpret the content stream of this XObject.

        A `LazyInterpreter` is created with this object's page,
        stream, resources, CTM, graphics state and parent key, and an
        iterator over it is returned.
        """
        # Imported here rather than at module level (presumably to
        # avoid a circular import -- confirm before moving it).
        from playa.interp import LazyInterpreter

        interp = LazyInterpreter(
            self.page,
            [self.stream],
            self.resources,
            ctm=self.ctm,
            gstate=self.gstate,
            # This is not really correct if this XObject has a
            # StructParent, *but* in that case the standard forbids
            # there to be any marked content sections inside it so
            # this should never get accessed anyway.
            parent_key=self._parentkey,
        )
        return iter(interp)

    @property
    def parent(self) -> Union["Element", None]:
        """The enclosing logical structure element, if any.

        The result is looked up in the document's parent tree using
        this object's parent key and stored in `_parent`.  If the
        parent tree entry is not a dictionary, the stored value is
        removed again and the superclass `parent` is returned
        instead.
        """
        from playa.structure import Element

        # The mere presence of `_parent` marks the lookup as already
        # done; its value may legitimately be None.
        if hasattr(self, "_parent"):
            return self._parent
        self._parent = None
        if self._parentkey is None:
            return self._parent
        # No structure, no parent!
        if self.doc.structure is None:
            return self._parent
        try:
            parent = resolve1(self.doc.structure.parent_tree[self._parentkey])
            if isinstance(parent, dict):
                self._parent = Element.from_dict(self.doc, parent)
            else:
                # Not a single structure element dictionary: drop the
                # cached value and defer to the superclass.
                del self._parent
                return super().parent
        except IndexError:
            # Presumably raised by the parent tree when the key is
            # absent (confirm); the result then stays None.
            pass
        return self._parent

    @property
    def structure(self) -> Sequence[Union["Element", None]]:
        """Mapping of marked content IDs to logical structure elements.

        As with pages, Form XObjects can also contain their own
        mapping of marked content IDs to structure elements.

        The result is empty if the document has no structure tree,
        if this object has no parent key, if the parent tree entry
        for that key is not a list, or if looking it up raises
        `IndexError`.  It is computed once and stored in
        `_structmap`.

        Danger: Do not rely on this being a `list`.
            Currently this is implemented eagerly, but in the future it
            may return a lazy object.

        """
        from playa.structure import Element

        # Presence of `_structmap` marks the mapping as already built.
        if hasattr(self, "_structmap"):
            return self._structmap
        self._structmap: List[Union["Element", None]] = []
        if self.doc.structure is None:
            return self._structmap
        if self._parentkey is None:
            return self._structmap
        try:
            parents = resolve1(self.doc.structure.parent_tree[self._parentkey])
            if not isinstance(parents, list):
                # This means that there is a single StructParent, and thus
                # no internal structure to this Form XObject
                return self._structmap
        except IndexError:
            return self._structmap
        # Elements can contain multiple marked content sections, so
        # don't create redundant Element objects for these
        elements: Dict[int, Element] = {}
        for obj in parents:
            # Indirect references are keyed by their object ID; anything
            # else is keyed by the identity of the Python object.
            objid = obj.objid if isinstance(obj, ObjRef) else id(obj)
            if objid not in elements:
                elements[objid] = Element.from_dict(self.doc, dict_value(obj))
            self._structmap.append(elements[objid])
        return self._structmap

    @property
    def mcid_texts(self) -> Mapping[int, List[str]]:
        """Mapping of marked content IDs to Unicode text strings.

        For use in text extraction from tagged PDFs.  The mapping is
        computed on first access and stored in `_textmap`.

        Danger: Do not rely on this being a `dict`.
            Currently this is implemented eagerly, but in the future it
            may return a lazy object.
        """
        if hasattr(self, "_textmap"):
            return self._textmap
        self._textmap: Mapping[int, List[str]] = _extract_mcid_texts(self)
        return self._textmap

    @classmethod
    def from_stream(
        cls,
        stream: ContentStream,
        page: "Page",
        xobjid: str,
        gstate: GraphicState,
        ctm: Matrix,
        mcstack: Tuple[MarkedContent, ...],
    ) -> "XObjectObject":
        """Create a new XObjectObject from a content stream.

        Args:
          stream: Content stream of the Form XObject.  Its `Matrix`,
                  `Resources`, `Group`, `StructParent` and
                  `StructParents` entries are consulted if present.
          page: Page on which the XObject appears (only its
                `pageref` is stored).
          xobjid: Name of the XObject (in the page resources).
          gstate: Graphics state in effect.  It is not modified; a
                  copy is made if it needs to be reset for a
                  transparency group.
          ctm: Current transformation matrix, which is combined with
               the stream's `Matrix` entry if there is one.
          mcstack: Stack of enclosing marked content sections.

        Returns:
          The newly constructed object.
        """
        if "Matrix" in stream:
            ctm = mult_matrix(matrix_value(stream["Matrix"]), ctm)
        # According to PDF reference 1.7 section 4.9.1, XObjects in
        # earlier PDFs (prior to v1.2) use the page's Resources entry
        # instead of having their own Resources entry.  So, this could
        # be None, in which case LazyInterpreter will fall back to
        # page.resources.
        xobjres = stream.get("Resources")
        resources = None if xobjres is None else dict_value(xobjres)
        xobjgrp = stream.get("Group")
        group = None if xobjgrp is None else dict_value(xobjgrp)
        # PDF 2.0, sec 11.6.6
        # Initial blend mode: Before execution of the transparency group
        # XObject’s content stream, the current blend mode in the graphics
        # state shall be initialised to Normal, the current stroking and
        # nonstroking alpha constants to 1.0, and the current soft mask to None
        if group and group.get("S") == LITERAL_TRANSPARENCY:
            # Need to copy here so as not to modify existing gstate,
            # unfortunately it will get copied again later...
            gstate = copy(gstate)
            gstate.blend_mode = LITERAL_NORMAL
            gstate.salpha = gstate.nalpha = 1
            gstate.smask = None
        # PDF 2.0, Table 359
        # At most one of [StructParent and StructParents] shall be
        # present in a given object. An object may be either a content
        # item in its entirety or a container for marked-content
        # sequences that are content items, but not both.
        if "StructParent" in stream:
            parent_key = int_value(stream["StructParent"])
        elif "StructParents" in stream:
            parent_key = int_value(stream["StructParents"])
        else:
            parent_key = None
        return cls(
            _pageref=page.pageref,
            _parentkey=parent_key,
            gstate=gstate,
            ctm=ctm,
            mcstack=mcstack,
            xobjid=xobjid,
            stream=stream,
            resources=resources,
            group=group,
        )

bbox property

Get the bounding box of this XObject in device space.

buffer property

Raw stream content for this XObject

contents property

Iterator over PDF objects in the content stream.

mcid_texts property

Mapping of marked content IDs to Unicode text strings.

For use in text extraction from tagged PDFs.

Do not rely on this being a dict.

Currently this is implemented eagerly, but in the future it may return a lazy object.

parent property

The enclosing logical structure element, if any.

structure property

Mapping of marked content IDs to logical structure elements.

As with pages, Form XObjects can also contain their own mapping of marked content IDs to structure elements.

Do not rely on this being a list.

Currently this is implemented eagerly, but in the future it may return a lazy object.

tokens property

Iterate over tokens in the XObject's content stream.

from_stream(stream, page, xobjid, gstate, ctm, mcstack) classmethod

Create a new XObjectObject from a content stream.

Source code in playa/content.py
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
@classmethod
def from_stream(
    cls,
    stream: ContentStream,
    page: "Page",
    xobjid: str,
    gstate: GraphicState,
    ctm: Matrix,
    mcstack: Tuple[MarkedContent, ...],
) -> "XObjectObject":
    """Build an XObjectObject for a Form XObject content stream.

    Args:
      stream: Content stream of the Form XObject.  Its `Matrix`,
              `Resources`, `Group`, `StructParent` and
              `StructParents` entries are consulted if present.
      page: Page on which the XObject appears (only its `pageref`
            is stored).
      xobjid: Name of the XObject (in the page resources).
      gstate: Graphics state in effect.  It is never modified in
              place; a copy is made when it has to be reset.
      ctm: Current transformation matrix, combined with the
           stream's `Matrix` entry if there is one.
      mcstack: Stack of enclosing marked content sections.

    Returns:
      The newly constructed object.
    """
    if "Matrix" in stream:
        form_matrix = matrix_value(stream["Matrix"])
        ctm = mult_matrix(form_matrix, ctm)

    # PDF reference 1.7 section 4.9.1: XObjects in PDFs prior to v1.2
    # rely on the page's Resources entry rather than their own, so
    # `resources` may well end up as None here.  LazyInterpreter
    # falls back to page.resources in that case.
    raw_resources = stream.get("Resources")
    if raw_resources is None:
        resources = None
    else:
        resources = dict_value(raw_resources)

    raw_group = stream.get("Group")
    if raw_group is None:
        group = None
    else:
        group = dict_value(raw_group)

    # PDF 2.0, sec 11.6.6 (initial blend mode): before a transparency
    # group XObject's content stream is executed, the blend mode is
    # initialised to Normal, the stroking and nonstroking alpha
    # constants to 1.0, and the soft mask to None.
    if group:
        if group.get("S") == LITERAL_TRANSPARENCY:
            # Work on a copy so that the caller's gstate is left
            # untouched (it will unfortunately be copied again later).
            gstate = copy(gstate)
            gstate.blend_mode = LITERAL_NORMAL
            gstate.salpha = 1
            gstate.nalpha = 1
            gstate.smask = None

    # PDF 2.0, Table 359: at most one of StructParent and
    # StructParents shall be present in a given object, since an
    # object is either a content item in its entirety or a container
    # for marked-content sequences that are content items, not both.
    # StructParent is checked first.
    parent_key: Union[int, None] = None
    for entry in ("StructParent", "StructParents"):
        if entry in stream:
            parent_key = int_value(stream[entry])
            break

    return cls(
        _pageref=page.pageref,
        _parentkey=parent_key,
        gstate=gstate,
        ctm=ctm,
        mcstack=mcstack,
        xobjid=xobjid,
        stream=stream,
        resources=resources,
        group=group,
    )

playa.structure

Lazy interface to PDF logical structure (PDF 1.7 sect 14.7).

ContentItem dataclass

Content item in logical structure tree.

This corresponds to an individual marked content section on a specific page, and can be used to (lazily) find that section if desired.

Source code in playa/structure.py
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
@dataclass
class ContentItem:
    """A marked content section referenced from the logical structure tree.

    Each item identifies one marked content section (by its `mcid`)
    on a specific page, and can be used to locate that section
    lazily when its bounding box or text is requested.
    """

    _pageref: PageRef
    mcid: int
    stream: Union[ContentStream, None]
    _bbox: Union[Rect, None] = None

    def _container(self, page: "Page") -> Union["Page", "XObjectObject"]:
        """Select the object whose content holds this item.

        This is `page` itself, unless `stream` is set and one of the
        page's XObjects is drawn from that same stream.
        """
        container: Union["Page", "XObjectObject"] = page
        if self.stream is None:
            return container
        # FIXME: Potentially quite slow, but we hope this never happens
        for xobj in page.xobjects:
            if xobj.stream.objid == self.stream.objid:
                # No early exit: the last matching XObject is used.
                container = xobj
        # FIXME: if we don't find it... then what? (probably None, but...)
        return container

    @property
    def page(self) -> Union["Page", None]:
        """Page holding this content item, or `None` if it has none."""
        return None if self._pageref is None else _deref_page(self._pageref)

    @property
    def doc(self) -> "Document":
        """Document to which this content item belongs."""
        docref, _pageidx = self._pageref
        return _deref_document(docref)

    @property
    def bbox(self) -> Rect:
        """Smallest rectangle enclosing every object in this item's
        marked content section.

        This is currently quite inefficient, since the whole page
        (or containing XObject) has to be interpreted.  The result
        is stored in `_bbox` and reused on later accesses.

        If the `page` attribute is `None`, then `bbox` will be
        `BBOX_NONE`.

        """
        cached = self._bbox
        if cached is not None:
            return cached
        page = self.page
        if page is None:
            self._bbox = BBOX_NONE
            return self._bbox
        container = self._container(page)
        self._bbox = get_bound_rects(
            item.bbox for item in container if item.mcid == self.mcid
        )
        return self._bbox

    @property
    def text(self) -> Union[str, None]:
        """Unicode text of this item's marked content section.

        This is `None` when there is no page, or when the containing
        page or XObject has no text recorded for this `mcid`.
        """
        page = self.page
        if page is None:
            return None
        pieces = self._container(page).mcid_texts.get(self.mcid)
        return None if pieces is None else "".join(pieces)

bbox property

Find the bounding box, if any, of this item, which is the smallest rectangle enclosing all objects in its marked content section.

Note that this is currently quite inefficient as it involves interpreting the entire page.

If the page attribute is None, then bbox will be BBOX_NONE.

doc property

The document containing this content object.

page property

Specific page for this structure tree, if any.

text property

Unicode text contained in this structure element.

ContentObject dataclass

Content object in logical structure tree.

This corresponds to a content item that is an entire PDF (X)Object (PDF 1.7 section 14.7.4.3), and can be used to (lazily) get that object.

The standard is very unclear on what this could be aside from an Annotation or an XObject (presumably either a Form XObject or an image). An XObject must be a content stream, so that's clear enough... Otherwise, since the Type key is not required in an annotation dictionary we presume that this is an annotation if it's not present.

Not to be confused with playa.page.ContentObject. While you can get there from here with the obj property, it may not be a great idea, because the only way to do that correctly in the case of an XObject (or image) is to interpret the containing page.

Sometimes, but not always, you can nonetheless rapidly access the bbox, so this is also provided as a property here.

Source code in playa/structure.py
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
@dataclass
class ContentObject:
    """A whole PDF object acting as a content item in the structure tree.

    This is a content item that is an entire PDF (X)Object (PDF 1.7
    section 14.7.4.3), and it can be used to (lazily) get that
    object.

    The standard is very unclear on what this could be aside from an
    `Annotation` or an `XObject` (presumably either a Form XObject or
    an image).  An XObject must be a content stream, so that much is
    clear.  Otherwise, since the `Type` key is not required in an
    annotation dictionary, anything lacking it is presumed to be an
    annotation.

    Not to be confused with `playa.page.ContentObject`.  While you
    *can* get there from here with the `obj` property, it may not be a
    great idea, because the only way to do that correctly in the case
    of an `XObject` (or image) is to interpret the containing page.

    Sometimes, but not always, the `bbox` can nonetheless be obtained
    rapidly, so it is provided as a property here too.

    """

    _pageref: PageRef
    props: Union[ContentStream, Dict[str, PDFObject]]

    @property
    def obj(self) -> Union["XObjectObject", "ImageObject", "Annotation", None]:
        """The instantiated object, or `None` if it cannot be produced."""
        kind = self.type
        if kind is LITERAL_ANNOT:
            from playa.page import Annotation

            return Annotation.from_dict(self.props, self.page)

        if kind is not LITERAL_XOBJECT:
            return None

        assert isinstance(self.props, ContentStream)
        # Images and Form XObjects are listed separately on the page.
        if self.props.get("Subtype") is LITERAL_IMAGE:
            candidates = self.page.images
        else:
            candidates = self.page.xobjects
        for candidate in candidates:
            if candidate.stream.objid == self.props.objid:
                return candidate
        return None

    @property
    def type(self) -> PSLiteral:
        """Type of this object, usually LITERAL_ANNOT or LITERAL_XOBJECT.

        A content stream is always an XObject.  For a dictionary, the
        `Type` entry is returned if it is a literal, and
        LITERAL_ANNOT otherwise.
        """
        props = self.props
        if isinstance(props, ContentStream):
            return LITERAL_XOBJECT
        declared = props.get("Type")
        return declared if isinstance(declared, PSLiteral) else LITERAL_ANNOT

    @property
    def page(self) -> "Page":
        """Page that contains this content object."""
        return _deref_page(self._pageref)

    @property
    def doc(self) -> "Document":
        """Document to which this content object belongs."""
        docref, _pageidx = self._pageref
        return _deref_document(docref)

    @property
    def bbox(self) -> Rect:
        """Bounding box of this object, if it has one.

        A `BBox` entry is preferred, then a `Rect` entry, either one
        being transformed by the page's CTM.  Failing that, the
        bounding box of the instantiated object is used.  If there is
        no bounding box at all (very unlikely) this will be
        `BBOX_NONE`.
        """
        for key in ("BBox", "Rect"):
            if key in self.props:
                rawbox = rect_value(self.props[key])
                return transform_bbox(self.page.ctm, rawbox)

        target = self.obj
        return BBOX_NONE if target is None else target.bbox

bbox property

Find the bounding box, if any, of this object.

If there is no bounding box (very unlikely) this will be BBOX_NONE.

doc property

The document containing this content object.

obj property

Return an instantiated object, if possible.

page property

Containing page for this content object.

type property

Type of this object, usually LITERAL_ANNOT or LITERAL_XOBJECT.

Element dataclass

Bases: Findable

Logical structure element.

Attributes:

Name Type Description
props Dict[str, PDFObject]

Structure element dictionary (PDF 1.7 table 323).

Source code in playa/structure.py
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
@dataclass
class Element(Findable):
    """A node of the logical structure tree.

    Iterating over an element yields its children, produced from the
    `K` entry of its dictionary.

    Attributes:
      props: Structure element dictionary (PDF 1.7 table 323).
    """

    _docref: DocumentRef
    props: Dict[str, PDFObject]
    _role: Union[str, None] = None

    @classmethod
    def from_dict(cls, doc: "Document", obj: Dict[str, PDFObject]) -> "Element":
        """Wrap a PDF structure element dictionary belonging to `doc`."""
        docref = _ref_document(doc)
        return cls(_docref=docref, props=obj)

    @property
    def type(self) -> str:
        """Raw structure type of this element (its `S` entry).

        Note: Raw and standard structure types
            This type is quite likely idiosyncratic and defined by
            whatever style sheets the author used in their word
            processor.  Standard structure types (PDF 1.7 section
            14.8.4) are accessible through the `role_map` attribute of
            the structure root, or, for convenience (this is slow) via
            the `role` attribute on elements.

        """
        raw_type = self.props["S"]
        return literal_name(raw_type)

    @property
    def role(self) -> str:
        """Standard structure type for this element.

        Note: Roles are always mapped
            Since it is common for documents to use standard types
            directly for some of their structure elements (typically
            ones with no content) and thus to omit them from the role
            map, `role` will always return a string in order to
            facilitate processing.  If you must absolutely know
            whether an element's type has no entry in the role map
            then you will need to consult it directly.
        """
        preset = self._role
        if preset is not None:
            return preset
        tree = self.doc.structure
        if tree is None:  # it could happen!
            return self.type
        role_map = tree.role_map
        # Types missing from the role map stand for themselves.
        return role_map.get(self.type, self.type)

    @property
    def doc(self) -> "Document":
        """Document to which this element belongs."""
        return _deref_document(self._docref)

    @property
    def page(self) -> Union["Page", None]:
        """Page containing this element, if any.

        This is `None` if there is no `Pg` entry, and also (with a
        warning logged) if that entry is not an indirect object
        reference or does not refer to a page of the document.
        """
        pgref = self.props.get("Pg")
        if pgref is None:
            return None
        if not isinstance(pgref, ObjRef):
            LOG.warning(
                "'Pg' entry is not an indirect object reference: %r", self.props
            )
            return None
        try:
            return self.doc.pages.by_id(pgref.objid)
        except KeyError:
            LOG.warning("'Pg' entry not found in document: %r", self.props)
            return None

    @property
    def parent(self) -> Union["Element", "Tree", None]:
        """Parent of this element in the structure tree.

        This is the document's structure tree when the `P` entry is
        the structure tree root, another `Element` otherwise, or
        `None` if the `P` entry is absent or resolves to nothing.
        """
        resolved = resolve1(self.props.get("P"))
        if resolved is None:
            return None
        parent_dict = dict_value(resolved)
        if parent_dict.get("Type") is LITERAL_STRUCTTREEROOT:
            return self.doc.structure
        return Element.from_dict(self.doc, parent_dict)

    @property
    def contents(self) -> Iterator[Union[ContentItem, ContentObject]]:
        """Iterate over every content item beneath this element.

        Child elements are descended into recursively; children of
        any other kind are skipped.
        """
        for child in self:
            if isinstance(child, Element):
                yield from child.contents
                continue
            if isinstance(child, (ContentItem, ContentObject)):
                yield child

    @property
    def bbox(self) -> Rect:
        """The bounding box, if any, of this element.

        Elements may explicitly define a `BBox` in default user space,
        in which case this is used.  Otherwise, the bounding box is
        the smallest rectangle enclosing all of the content items
        contained by this element (which may take some time to compute).

        Note that this is currently quite inefficient as it involves
        interpreting the entire page.

        Note: Elements may span multiple pages!
            In the case of an element (such as a `Document` for
            instance) that spans multiple pages, the bounding box
            cannot exist, and `BBOX_NONE` will be returned.  If the
            `page` attribute is `None`, then `bbox` will be
            `BBOX_NONE`.

        """
        page = self.page
        if page is None:
            return BBOX_NONE
        if "BBox" not in self.props:
            # NOTE: This is quite slow
            return get_bound_rects(
                item.bbox
                for item in self.contents
                if item.page is page and item.bbox is not BBOX_NONE
            )
        explicit = rect_value(self.props["BBox"])
        return transform_bbox(page.ctm, explicit)

    def __iter__(self) -> Iterator[Union["Element", ContentItem, ContentObject]]:
        """Iterate over the children listed in the `K` entry, if any."""
        if "K" not in self.props:
            return
        kids = resolve1(self.props["K"])
        yield from _make_kids(kids, self.page, self._docref)

bbox property

The bounding box, if any, of this element.

Elements may explicitly define a BBox in default user space, in which case this is used. Otherwise, the bounding box is the smallest rectangle enclosing all of the content items contained by this element (which may take some time to compute).

Note that this is currently quite inefficient as it involves interpreting the entire page.

Elements may span multiple pages!

In the case of an element (such as a Document for instance) that spans multiple pages, the bounding box cannot exist, and BBOX_NONE will be returned. If the page attribute is None, then bbox will be BBOX_NONE.

contents property

Iterate over all content items contained in an element.

doc property

Containing document for this element.

page property

Containing page for this element, if any.

role property

Standardized structure type.

Roles are always mapped

Since it is common for documents to use standard types directly for some of their structure elements (typically ones with no content) and thus to omit them from the role map, role will always return a string in order to facilitate processing. If you must absolutely know whether an element's type has no entry in the role map then you will need to consult it directly.

type property

Structure type for this element.

Raw and standard structure types

This type is quite likely idiosyncratic and defined by whatever style sheets the author used in their word processor. Standard structure types (PDF 1.7 section 14.8.4) are accessible through the role_map attribute of the structure root, or, for convenience (this is slow) via the role attribute on elements.

from_dict(doc, obj) classmethod

Construct from PDF structure element dictionary.

Source code in playa/structure.py
311
312
313
314
@classmethod
def from_dict(cls, doc: "Document", obj: Dict[str, PDFObject]) -> "Element":
    """Wrap a PDF structure element dictionary belonging to `doc`."""
    docref = _ref_document(doc)
    return cls(_docref=docref, props=obj)

Findable

Bases: Iterable

find() and find_all() methods that can be inherited to avoid repeating oneself

Source code in playa/structure.py
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
class Findable(Iterable):
    """Mixin supplying `find()` and `find_all()` to anything that can
    be iterated over to obtain its children."""

    def find_all(
        self, matcher: Union[str, Pattern[str], MatchFunc, None] = None
    ) -> Iterator["Element"]:
        """Iterate depth-first over matching elements in subtree.

        `matcher` may be a string, a regular expression, a function
        taking an `Element` and returning `True` if the element
        matches, or `None` (the default) to return all descendants in
        depth-first order.

        For compatibility with `pdfplumber` and consistent behaviour
        across documents, names and regular expressions are matched
        against the `role` attribute.  To match the "raw" structure
        type from the `type` attribute instead, pass a matching
        function.

        """
        children = list(self)
        return _find_all(children, matcher)

    def find(
        self, matcher: Union[str, Pattern[str], MatchFunc, None] = None
    ) -> Union["Element", None]:
        """Find the first matching element in subtree.

        `matcher` may be a string or a regular expression to be
        matched against the `role` attribute, a function taking an
        `Element` and returning `True` if the element matches, or
        `None` (the default) to just get the first child element.
        `None` is returned when nothing matches.

        """
        try:
            matches = _find_all(list(self), matcher)
            first_match = next(matches)
        except StopIteration:
            return None
        return first_match

find(matcher=None)

Find the first matching element in subtree.

The matcher argument is either a string or a regular expression to be matched against the role attribute, or a function taking an Element and returning True if the element matches, or None (default) to just get the first child element.

Source code in playa/structure.py
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
def find(
    self, matcher: Union[str, Pattern[str], MatchFunc, None] = None
) -> Union["Element", None]:
    """Find the first matching element in subtree.

    `matcher` may be a string or a regular expression to be matched
    against the `role` attribute, a function taking an `Element` and
    returning `True` if the element matches, or `None` (the default)
    to just get the first child element.  `None` is returned when
    nothing matches.

    """
    try:
        matches = _find_all(list(self), matcher)
        first_match = next(matches)
    except StopIteration:
        return None
    return first_match

find_all(matcher=None)

Iterate depth-first over matching elements in subtree.

The matcher argument is either a string, a regular expression, or a function taking an Element and returning True if the element matches, or None (default) to return all descendants in depth-first order.

For compatibility with pdfplumber and consistent behaviour across documents, names and regular expressions are matched against the role attribute. If you wish to match the "raw" structure type from the type attribute, you can do this with a matching function.

Source code in playa/structure.py
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
def find_all(
    self, matcher: Union[str, Pattern[str], MatchFunc, None] = None
) -> Iterator["Element"]:
    """Iterate depth-first over the elements in the subtree accepted
    by `matcher`.

    `matcher` may be a string, a regular expression, or a function
    that takes an `Element` and returns `True` for a match.  With
    `None` (the default) every descendant is returned, in depth-first
    order.

    Strings and regular expressions are matched against the `role`
    attribute, for compatibility with `pdfplumber` and for consistent
    behaviour across documents.  To match the "raw" structure type
    held in the `type` attribute instead, pass a matching function.
    """
    # The direct children are materialized before the search starts.
    children = list(self)
    return _find_all(children, matcher)

Tree

Bases: Findable

Logical structure tree.

A structure tree can be iterated over in the same fashion as its elements. Note that even though it is forbidden for the structure tree root to contain content items, PLAYA is robust to this possibility, so you should not presume that iterating over it will only yield Element instances.

The various attributes (role map, class map, pronunciation dictionary, etc, etc) are accessible through props but currently have no particular interpretation aside from the role map which is accessible in normalized form through role_map.

Attributes:

Name Type Description
props Dict[str, PDFObject]

Structure tree root dictionary (PDF 1.7 table 322).

role_map Dict[str, str]

Mapping of structure element types (as strings) to standard structure types (as strings) (PDF 1.7 section 14.8.4)

parent_tree NumberTree

Parent tree linking marked content sections to structure elements (PDF 1.7 section 14.7.4.4)

Source code in playa/structure.py
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
class Tree(Findable):
    """Logical structure tree of a document.

    Iterating over a structure tree works the same way as iterating
    over one of its elements.  The structure tree root is not allowed
    to contain content items, but PLAYA tolerates it if it does, so do
    not assume that iteration yields only `Element` instances.

    The entries of the root dictionary (role map, class map,
    pronunciation dictionary, and so on) are available uninterpreted
    through `props`.  The only one given an interpretation is the role
    map, which is exposed in normalized form as `role_map`.

    Attributes:
      props: Structure tree root dictionary (PDF 1.7 table 322).
      role_map: Mapping of structure element types (as strings) to
          standard structure types (as strings) (PDF 1.7 section 14.8.4)
      parent_tree: Parent tree linking marked content sections to
          structure elements (PDF 1.7 section 14.7.4.4)
    """

    _docref: DocumentRef
    props: Dict[str, PDFObject]
    # The two attributes below are lazily-filled caches: they are only
    # declared here and do not exist on the instance until first use.
    _role_map: Dict[str, str]
    _parent_tree: NumberTree

    def __init__(self, doc: "Document") -> None:
        self._docref = _ref_document(doc)
        self.props = dict_value(doc.catalog["StructTreeRoot"])

    def __iter__(self) -> Iterator[Union["Element", ContentItem, ContentObject]]:
        return _iter_structure(_deref_document(self._docref))

    @property
    def role_map(self) -> Dict[str, str]:
        """Dictionary mapping some (not necessarily all) element types
        to their standard equivalents."""
        if not hasattr(self, "_role_map"):
            mapping: Dict[str, str] = {}
            self._role_map = mapping
            raw_map = resolve1(self.props.get("RoleMap"))  # It is optional
            if isinstance(raw_map, dict):
                for name, target in raw_map.items():
                    # Name objects are unwrapped, anything else is stringified
                    mapping[name] = (
                        literal_name(target)
                        if isinstance(target, PSLiteral)
                        else str(target)
                    )
        return self._role_map

    @property
    def parent_tree(self) -> NumberTree:
        """Parent tree for this document.

        This rather obscure data structure links marked content
        sections to the structure elements they belong to.  If that
        means nothing to you, you probably do not need it.

        If the document has no parent tree, this is an empty
        NumberTree (unlike the structure tree itself).  The reason is
        that the spec requires a parent tree whenever structure
        elements contain marked content, which is nearly always.

        """
        if not hasattr(self, "_parent_tree"):
            if "ParentTree" in self.props:
                self._parent_tree = NumberTree(self.props["ParentTree"])
            else:
                self._parent_tree = NumberTree({})
        return self._parent_tree

    @property
    def contents(self) -> Iterator[Union[ContentItem, ContentObject]]:
        """Iterate over all content items in the tree."""
        for child in self:
            if isinstance(child, Element):
                yield from child.contents
                continue
            # Content directly under the root is not supposed to
            # happen, but it is supported anyway
            if isinstance(child, (ContentItem, ContentObject)):
                yield child

    @property
    def doc(self) -> "Document":
        """Document with which this structure tree is associated."""
        return _deref_document(self._docref)

contents property

Iterate over all content items in the tree.

doc property

Document with which this structure tree is associated.

parent_tree property

Parent tree for this document.

This is a somewhat obscure data structure that links marked content sections to their corresponding structure elements. If you don't know what that means, you probably don't need it, but if you do, here it is.

Unlike the structure tree itself, if there is no parent tree, this will be an empty NumberTree. This is because the parent tree is required by the spec in the case where structure elements contain marked content, which is nearly all the time.

role_map property

Dictionary mapping some (not necessarily all) element types to their standard equivalents.

playa.outline

Lazy interface to PDF document outline (PDF 1.7 sect 12.3.3).

Action dataclass

PDF actions (PDF 1.7 sect 12.6)

Source code in playa/outline.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
@dataclass
class Action:
    """A PDF action dictionary (PDF 1.7 sect 12.6)."""

    _docref: DocumentRef
    props: Dict[str, PDFObject]

    @property
    def type(self) -> PSLiteral:
        """Action type, read from the `S` entry of the action dictionary."""
        kind = self.props["S"]
        assert isinstance(kind, PSLiteral)
        return kind

    @property
    def doc(self) -> "Document":
        """Get associated document if it exists."""
        return _deref_document(self._docref)

    @property
    def destination(self) -> Union[Destination, None]:
        """Destination of this action, if any.

        Returns `None` when there is no `D` entry, or (with a warning)
        when that entry is not a name, a string or an array.
        """
        target = resolve1(self.props.get("D"))
        if target is None:
            return None
        if isinstance(target, (PSLiteral, bytes, list)):
            return Destination.from_dest(self.doc, target)
        LOG.warning("Unrecognized destination: %r", target)
        return None

destination property

Destination of this action, if any.

doc property

Get associated document if it exists.

Destination dataclass

PDF destinations (PDF 1.7 sect 12.3.2)

Source code in playa/outline.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
@dataclass
class Destination:
    """PDF destinations (PDF 1.7 sect 12.3.2)

    Attributes:
      page_idx: Index of the target page, or `None` if it could not
          be determined.
      display: Display mode (the second element of the destination
          array), or `None` if it was not a name object.
      coords: Remaining elements of the destination array; entries
          that are not numbers are replaced by `None`.
    """

    _docref: DocumentRef
    page_idx: Union[int, None]
    display: Union[PSLiteral, None]
    coords: Tuple[Union[float, None], ...]

    @classmethod
    def from_dest(
        cls, doc: "Document", dest: Union[PSLiteral, bytes, list]
    ) -> "Destination":
        """Create a destination from a destination object.

        Args:
            doc: Document in which the destination is resolved.
            dest: Either a named destination (name object or byte
                string), which is looked up in `doc.destinations`, or
                an explicit destination array.

        Raises:
            TypeError: if `dest` is none of the supported types.
        """
        if isinstance(dest, (bytes, PSLiteral)):
            # Named destination.  NOTE(review): this lookup presumably
            # raises KeyError for unknown names (callers in this module
            # catch that) - confirm against `doc.destinations`.
            return doc.destinations[dest]
        elif isinstance(dest, list):
            return cls.from_list(doc, dest)
        else:
            # Exceptions do not do %-interpolation of their arguments
            # the way logging calls do, so format the message here.
            raise TypeError(f"Unknown destination type: {dest!r}")

    @classmethod
    def from_list(cls, doc: "Document", dest: Sequence) -> "Destination":
        """Create a destination from an explicit destination array.

        The array is expected to hold a page, a display mode and any
        number of coordinates.  Unrecognized parts are logged and
        replaced by `None` rather than raising.

        Args:
            doc: Document used to look up the target page.
            dest: Destination array with at least two elements.
        """
        pageobj, display, *args = dest
        page_idx: Union[int, None] = None
        if isinstance(pageobj, int):
            # Not really sure if this is page number or page index...
            page_idx = pageobj - 1
        elif isinstance(pageobj, ObjRef):
            try:
                page_idx = doc.pages.by_id(pageobj.objid).page_idx
            except KeyError:
                LOG.warning("Invalid page object in destination: %r", pageobj)
        else:
            LOG.warning("Unrecognized page in destination object: %r", pageobj)
        if not isinstance(display, PSLiteral):
            LOG.warning("Unknown display type: %r", display)
            display = None
        coords = tuple(x if isinstance(x, (int, float)) else None for x in args)
        return Destination(
            _docref=_ref_document(doc),
            page_idx=page_idx,
            display=display,
            coords=coords,
        )

Outline

PDF document outline (PDF 1.7 sect 12.3.3)

Source code in playa/outline.py
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
class Outline:
    """PDF document outline (PDF 1.7 sect 12.3.3)

    An instance represents either the outline root or a single outline
    item; iterating over it yields its immediate children.

    Attributes:
      props: Outline (or outline item) dictionary.
    """

    _docref: DocumentRef
    props: Dict[str, PDFObject]

    def __init__(self, doc: "Document") -> None:
        self._docref = _ref_document(doc)
        self.props = dict_value(doc.catalog["Outlines"])

    def __iter__(self) -> Iterator["Outline"]:
        """Iterate over the immediate children of this outline item.

        The children are found by starting at `First` and following
        the `Next` links.  Iteration stops at the end of the chain,
        after the item referenced by `Last`, at anything that is not
        an indirect object reference, or when the chain loops back on
        itself.  Nothing is yielded unless both `First` and `Last`
        are present.
        """
        if "First" in self.props and "Last" in self.props:
            ref = self.props["First"]
            last = self.props["Last"]
            # Object IDs already visited, so that a malformed file
            # with a cyclic `Next` chain cannot make us loop forever.
            seen = set()
            while ref is not None:
                if not isinstance(ref, ObjRef):
                    LOG.warning("Not an indirect object reference: %r", ref)
                    break
                if ref.objid in seen:
                    LOG.warning("Cycle detected in outline items at: %r", ref)
                    break
                seen.add(ref.objid)
                out = self._from_ref(ref)
                yield out
                # Compare the item just yielded (not the upcoming
                # one) against `Last`, otherwise the last item itself
                # would be dropped.
                if ref is last:
                    break
                ref = out.props.get("Next")

    def _from_ref(self, ref: ObjRef) -> "Outline":
        """Build an outline item from a reference to its dictionary,
        bypassing `__init__` (which reads the document's outline root)."""
        out = Outline.__new__(Outline)
        out._docref = self._docref
        out.props = dict_value(ref)
        return out

    @property
    def doc(self) -> "Document":
        """Get associated document if it exists."""
        return _deref_document(self._docref)

    @property
    def title(self) -> Union[str, None]:
        """Title of this outline item as text, or `None` if there is
        no `Title` entry or it is not a string."""
        raw = resolve1(self.props.get("Title"))
        if raw is None:
            return None
        if not isinstance(raw, bytes):
            LOG.warning("Title is not a string: %r", raw)
            return None
        return decode_text(raw)

    @property
    def destination(self) -> Union[Destination, None]:
        """Destination for this outline item.

        Note: Special case of `GoTo` actions.
            Since internal `GoTo` actions (PDF 1.7 sect 12.6.4.2) in
            outlines and links are entirely equivalent to
            destinations, if one exists, it will be returned here as
            well.

        Returns:
            destination, if one exists.
        """
        dest = resolve1(self.props.get("Dest"))
        if dest is not None:
            try:
                if isinstance(dest, (PSLiteral, bytes, list)):
                    return Destination.from_dest(self.doc, dest)
            except KeyError:
                LOG.warning("Unknown named destination: %r", dest)
        # Fall through to try an Action instead
        action = self.action
        if action is None or action.type is not ACTION_GOTO:
            return None
        return action.destination

    @property
    def action(self) -> Union[Action, None]:
        """Action attached to this outline item through its `A` entry.

        Returns:
            action, or `None` if there is no `A` entry or reading it
            as a dictionary fails with a `TypeError`.
        """
        try:
            return Action(self._docref, dict_value(self.props["A"]))
        except (KeyError, TypeError):
            return None

    @property
    def element(self) -> Union[Element, None]:
        """The structure element associated with this outline item, if
        any.

        Returns:
            structure element, if one exists.
        """
        try:
            return Element.from_dict(self.doc, dict_value(self.props["SE"]))
        except (KeyError, TypeError):
            return None

    @property
    def parent(self) -> Union["Outline", None]:
        """Parent of this outline item, or `None` if there is no
        `Parent` entry or it is not an indirect object reference."""
        ref = self.props.get("Parent")
        if ref is None:
            return None
        if not isinstance(ref, ObjRef):
            LOG.warning("Parent is not indirect object reference: %r", ref)
            return None
        return self._from_ref(ref)

destination property

Destination for this outline item.

Special case of GoTo actions.

Since internal GoTo actions (PDF 1.7 sect 12.6.4.2) in outlines and links are entirely equivalent to destinations, if one exists, it will be returned here as well.

Returns:

Type Description
Union[Destination, None]

destination, if one exists.

doc property

Get associated document if it exists.

element property

The structure element associated with this outline item, if any.

Returns:

Type Description
Union[Element, None]

structure element, if one exists.

playa.font

Font metrics and descriptors

API subject to change.

These APIs are unstable and subject to revision before PLAYA 1.0.

CIDFont

Bases: Font

Source code in playa/font.py
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
class CIDFont(Font):
    """CID-keyed font.

    Character codes are mapped to CIDs through a CMap (`cmap`).  CIDs
    are mapped to Unicode, in order of preference, through an explicit
    `ToUnicode` map or one built from an embedded TrueType font
    program (`tounicode`), through a predefined Unicode map for the
    font's character collection (`unicode_map`), or failing all of
    those, through an identity mapping of the character code.
    """

    default_vdisp: float

    def __init__(
        self,
        spec: Dict[str, PDFObject],
    ) -> None:
        """Create the font from its font dictionary.

        Args:
            spec: Font dictionary.  The entries read here are
                `CIDSystemInfo`, `Encoding`, `FontDescriptor`,
                `ToUnicode`, `W`, `DW`, `W2` and `DW2`.
        """
        self.basefont = get_basefont(spec)
        self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
        # These are *supposed* to be ASCII (PDF 1.7 section 9.7.3),
        # but for whatever reason they are sometimes UTF-16BE
        cid_registry = resolve1(self.cidsysteminfo.get("Registry"))
        if isinstance(cid_registry, (str, bytes)):
            cid_registry = decode_text(cid_registry)
        else:
            cid_registry = "unknown"
        cid_ordering = resolve1(self.cidsysteminfo.get("Ordering"))
        if isinstance(cid_ordering, (str, bytes)):
            cid_ordering = decode_text(cid_ordering)
        else:
            cid_ordering = "unknown"
        self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
        self.cmap: CMapBase = self.get_cmap_from_spec(spec)

        try:
            descriptor = dict_value(spec["FontDescriptor"])
        except KeyError:
            log.warning("Font spec is missing FontDescriptor: %r", spec)
            descriptor = {}
        self.tounicode: Optional[ToUnicodeMap] = None
        self.unicode_map: Optional[UnicodeMap] = None
        # Since None is equivalent to an identity map, avoid warning
        # in the case where there was some kind of explicit Identity
        # mapping (even though this is absolutely not standards compliant)
        identity_map = False
        # First try to use an explicit ToUnicode Map
        if "ToUnicode" in spec:
            if "Encoding" in spec and spec["ToUnicode"] == spec["Encoding"]:
                log.debug(
                    "ToUnicode and Encoding point to the same object, using an "
                    "identity mapping for Unicode instead of this nonsense: %r",
                    spec["ToUnicode"],
                )
                identity_map = True
            elif isinstance(spec["ToUnicode"], ContentStream):
                strm = stream_value(spec["ToUnicode"])
                log.debug("Parsing ToUnicode from stream %r", strm)
                self.tounicode = parse_tounicode(strm.buffer)
            # If there is no stream, consider it an Identity mapping
            elif (
                isinstance(spec["ToUnicode"], PSLiteral)
                and "Identity" in spec["ToUnicode"].name
            ):
                log.debug("Using identity mapping for ToUnicode %r", spec["ToUnicode"])
                identity_map = True
            else:
                log.warning("Unparseable ToUnicode in %r", spec)
        # If there is no ToUnicode, then try TrueType font tables
        elif "FontFile2" in descriptor:
            self.fontfile = stream_value(descriptor.get("FontFile2"))
            log.debug("Parsing ToUnicode from TrueType font %r", self.fontfile)
            # FIXME: Utterly gratuitous use of BytesIO
            ttf = TrueTypeFontProgram(self.basefont, BytesIO(self.fontfile.buffer))
            self.tounicode = ttf.create_tounicode()
        # Or try to get a predefined UnicodeMap (not to be confused
        # with a ToUnicodeMap)
        if self.tounicode is None:
            try:
                self.unicode_map = CMapDB.get_unicode_map(
                    self.cidcoding,
                    self.cmap.is_vertical(),
                )
            except KeyError:
                pass
        if self.unicode_map is None and self.tounicode is None and not identity_map:
            log.debug(
                "Unable to find/create/guess unicode mapping for CIDFont, "
                "using identity mapping: %r",
                spec,
            )
        # FIXME: Verify that self.tounicode's code space corresponds
        # to self.cmap (this is actually quite hard because the code
        # spaces have been lost in the precompiled CMaps...)

        widths = _get_widths(list_value(spec.get("W", [])))
        if "DW" in spec:
            default_width = num_value(spec["DW"])
        else:
            default_width = 1000
        self.vertical = self.cmap.is_vertical()
        if self.vertical:
            if "DW2" in spec:
                (vy, w1) = point_value(spec["DW2"])
            else:
                # seemingly arbitrary values, but found in PDF 2.0 Table 115
                vy = 880  # vertical component of position vector
                w1 = -1000  # default vertical displacement
            self.default_position = (default_width / 2, vy)
            # The horizontal displacement is *always* zero (PDF 2.0
            # sec 9.7.4.3) so we only store the vertical.
            self.default_vdisp = w1
            # Glyph-specific vertical displacement and position vectors if any
            self.positions = {}
            self.vdisps = {}
            if "W2" in spec:
                for cid, (w1, (vx, vy)) in _get_widths2(list_value(spec["W2"])).items():
                    self.positions[cid] = (vx, vy)
                    self.vdisps[cid] = w1
        else:
            self.default_position = (0, 0)
            self.default_vdisp = 0
            self.positions = {}
            self.vdisps = {}
        Font.__init__(self, descriptor, widths, default_width=default_width)

    def get_cmap_from_spec(self, spec: Dict[str, PDFObject]) -> CMapBase:
        """Get cmap from font specification

        For certain PDFs, Encoding Type isn't mentioned as an attribute of
        Encoding but as an attribute of CMapName, where CMapName is an
        attribute of spec['Encoding'].
        The horizontal/vertical modes are referred to by different names,
        such as 'DLIdent-H/V', 'OneByteIdentityH/V', 'Identity-H/V'.

        If the name is not that of a known CMap, an embedded CMap
        stream in `Encoding` is parsed instead; if there is none, a
        warning is logged and an empty `CMap` is returned.
        """
        cmap_name = self._get_cmap_name(spec)

        try:
            return CMapDB.get_cmap(cmap_name)
        except KeyError as e:
            # Parse an embedded CMap if necessary.  Encoding may be
            # missing altogether (_get_cmap_name only warns about
            # that), so it must not be indexed directly here, and it
            # may be an indirect reference, so resolve it first.
            encoding = resolve1(spec.get("Encoding"))
            if isinstance(encoding, ContentStream):
                strm = stream_value(encoding)
                return parse_encoding(strm.buffer)
            else:
                log.warning("Failed to get cmap %s: %s", cmap_name, e)
                return CMap()

    @staticmethod
    def _get_cmap_name(spec: Dict[str, PDFObject]) -> str:
        """Get cmap name from font specification

        Returns "unknown" (after logging a warning) if the name cannot
        be read from the font dictionary.  Names found in
        `IDENTITY_ENCODER` are replaced by the value stored there.
        """
        cmap_name = "unknown"  # default value
        try:
            spec_encoding = resolve1(spec["Encoding"])
            if spec_encoding is not None:
                cmap_name = literal_name(spec_encoding)
            else:
                spec_encoding = resolve1(spec["CMapName"])
                if spec_encoding is not None:
                    cmap_name = literal_name(spec_encoding)
        except KeyError:
            log.warning("Font spec is missing Encoding: %r", spec)
        except TypeError:
            log.warning("Font spec has invalid Encoding: %r", spec)
        return IDENTITY_ENCODER.get(cmap_name, cmap_name)

    def decode(self, data: bytes) -> Iterable[Tuple[int, str]]:
        """Decode a string of character codes.

        Returns:
            an iterable of (CID, text) pairs, where the text comes
            from the `ToUnicode` map, else from the predefined Unicode
            map, else from the character code itself.
        """
        if self.tounicode is not None:
            log.debug("decode with ToUnicodeMap: %r", data)
            # FIXME: Should verify that the codes are actually the
            # same (or just trust the codes that come from the cmap)
            return zip(
                (cid for _, cid in self.cmap.decode(data)), self.tounicode.decode(data)
            )
        elif self.unicode_map is not None:
            log.debug("decode with UnicodeMap: %r", data)
            return (
                (cid, self.unicode_map.get_unichr(cid))
                for (_, cid) in self.cmap.decode(data)
            )
        else:
            log.debug("decode with identity unicode map: %r", data)
            return (
                (cid, chr(int.from_bytes(substr, "big")))
                for substr, cid in self.cmap.decode(data)
            )

    def __repr__(self) -> str:
        return f"<CIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"

    def vdisp(self, cid: int) -> float:
        """Get vertical displacement for vertical writing mode, in
        text space units.

        Returns 0 for horizontal writing, for obvious reasons.
        """
        return self.matrix[3] * self.vdisps.get(cid, self.default_vdisp)

    def position(self, cid: int) -> Tuple[float, float]:
        """Get position vector for vertical writing mode, in text
        space units.

        This is quite ill-defined in the PDF standard (PDF 2.0 Figure
        55), but basically it specifies a translation of the glyph
        with respect to the origin.  It is *subtracted* from that
        origin to give the glyph position.  So if your text matrix is
        `[1 0 0 1 100 100]`, and your font size is `10`, a position
        vector of `[500 200]` will place the origin of the glyph in
        glyph space at `[-500 -200]`, which becomes `[-.5 -.2]` in
        text space, then `[-5 -2]` after applying the font size, thus
        the glyph is painted with its origin at `[95 98]`.

        Yes, the horizontal scaling factor **does** apply to the
        horizontal component of the position vector, even if some PDF
        viewers don't think so.

        For horizontal writing, it is obviously (0, 0).

        """
        vx, vy = self.positions.get(cid, self.default_position)
        # We know that the matrix is diagonal here
        a, _, _, d, _, _ = self.matrix
        return a * vx, d * vy

    def char_bbox(self, cid: int) -> Rect:
        """Get the standard bounding box for a character from its CID.

        This is the standard bounding box in text space units based on
        width, descent and ascent, translated by the position vector.

        Danger: Not the actual bounding box of the glyph.
            This is a standardized bounding box for use in text
            extraction and layout analysis.  It does not correspond to
            the actual bounding box of an individual glyph as
            specified by the font program.

        """
        width = self.widths.get(cid, self.default_width)
        # We know that the matrix is diagonal here
        a, _, _, d, _, _ = self.matrix
        if self.vertical:
            vx, vy = self.positions.get(cid, self.default_position)
            # Horizontal offset for glyph origin vs. text
            # space origin.
            vx = -vx
            # Vertical offset for glyph origin
            vy = -vy
            # Find glyph bbox
            return (
                a * vx,
                d * (vy + self.descent),
                a * (vx + width),
                d * (vy + self.ascent),
            )
        return (0, d * self.descent, a * width, d * self.ascent)

char_bbox(cid)

Get the standard bounding box for a character from its CID.

This is the standard bounding box in text space units based on width, descent and ascent, translated by the position vector.

Not the actual bounding box of the glyph.

This is a standardized bounding box for use in text extraction and layout analysis. It does not correspond to the actual bounding box of an individual glyph as specified by the font program.

Source code in playa/font.py
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
def char_bbox(self, cid: int) -> Rect:
    """Return the standardized bounding box of the character with
    the given CID.

    The box is expressed in text space units and is built from the
    width, descent and ascent, translated by the position vector.

    Danger: Not the actual bounding box of the glyph.
        This box is a standardized one meant for text extraction and
        layout analysis.  It is not the bounding box of the
        individual glyph as specified by the font program.

    """
    # Only the diagonal of the matrix is used: we know that the
    # matrix is diagonal here
    scale_x, _, _, scale_y, _, _ = self.matrix
    advance = self.widths.get(cid, self.default_width)
    if not self.vertical:
        return (0, scale_y * self.descent, scale_x * advance, scale_y * self.ascent)
    pos_x, pos_y = self.positions.get(cid, self.default_position)
    # The position vector is subtracted from the text space origin to
    # obtain the glyph origin, horizontally and vertically.
    left = -pos_x
    base = -pos_y
    return (
        scale_x * left,
        scale_y * (base + self.descent),
        scale_x * (left + advance),
        scale_y * (base + self.ascent),
    )

get_cmap_from_spec(spec)

Get cmap from font specification

For certain PDFs, Encoding Type isn't mentioned as an attribute of Encoding but as an attribute of CMapName, where CMapName is an attribute of spec['Encoding']. The horizontal/vertical modes are referred to by different names, such as 'DLIdent-H/V', 'OneByteIdentityH/V', 'Identity-H/V'.

Source code in playa/font.py
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
def get_cmap_from_spec(self, spec: Dict[str, PDFObject]) -> CMapBase:
    """Get cmap from font specification

    For certain PDFs, Encoding Type isn't mentioned as an attribute of
    Encoding but as an attribute of CMapName, where CMapName is an
    attribute of spec['Encoding'].
    The horizontal/vertical modes are referred to by different names,
    such as 'DLIdent-H/V', 'OneByteIdentityH/V', 'Identity-H/V'.

    If the name is not that of a known CMap, an embedded CMap stream
    in `Encoding` is parsed instead; if there is none, a warning is
    logged and an empty `CMap` is returned.
    """
    cmap_name = self._get_cmap_name(spec)

    try:
        return CMapDB.get_cmap(cmap_name)
    except KeyError as e:
        # Parse an embedded CMap if necessary.  Encoding may be
        # missing altogether, so it must not be indexed directly here
        # (that would raise a second KeyError from inside this
        # handler), and it may be an indirect reference, so resolve
        # it first.
        encoding = resolve1(spec.get("Encoding"))
        if isinstance(encoding, ContentStream):
            strm = stream_value(encoding)
            return parse_encoding(strm.buffer)
        else:
            log.warning("Failed to get cmap %s: %s", cmap_name, e)
            return CMap()

position(cid)

Get position vector for vertical writing mode, in text space units.

This is quite ill-defined in the PDF standard (PDF 2.0 Figure 55), but basically it specifies a translation of the glyph with respect to the origin. It is subtracted from that origin to give the glyph position. So if your text matrix is [1 0 0 1 100 100], and your font size is 10, a position vector of [500 200] will place the origin of the glyph in glyph space at [-500 -200], which becomes [-.5 -.2] in text space, then [-5 -2] after applying the font size, thus the glyph is painted with its origin at [95 98].

Yes, the horizontal scaling factor does apply to the horizontal component of the position vector, even if some PDF viewers don't think so.

For horizontal writing, it is obviously (0, 0).

Source code in playa/font.py
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
def position(self, cid: int) -> Tuple[float, float]:
    """Return the position vector used in vertical writing mode, in
    text space units.

    The PDF standard is rather vague about this (PDF 2.0 Figure 55).
    In essence the vector translates the glyph relative to the
    origin: it is *subtracted* from the origin to obtain the glyph
    position.  For example, with a text matrix of
    `[1 0 0 1 100 100]` and a font size of `10`, a position vector of
    `[500 200]` puts the glyph origin at `[-500 -200]` in glyph
    space, that is `[-.5 -.2]` in text space and `[-5 -2]` once the
    font size is applied, so the glyph is painted with its origin at
    `[95 98]`.

    The horizontal scaling factor **does** apply to the horizontal
    component of the position vector, whatever some PDF viewers may
    think.

    In horizontal writing mode the result is simply (0, 0).

    """
    # Only the diagonal of the matrix is used: we know that the
    # matrix is diagonal here
    scale_x, _, _, scale_y, _, _ = self.matrix
    glyph_x, glyph_y = self.positions.get(cid, self.default_position)
    return (scale_x * glyph_x, scale_y * glyph_y)

vdisp(cid)

Get vertical displacement for vertical writing mode, in text space units.

Returns 0 for horizontal writing, for obvious reasons.

Source code in playa/font.py
676
677
678
679
680
681
682
def vdisp(self, cid: int) -> float:
    """Return the vertical displacement used in vertical writing
    mode, in text space units.

    In horizontal writing mode this is 0, for obvious reasons.
    """
    scale_y = self.matrix[3]
    # Fall back to the font-wide default when the glyph has no
    # displacement of its own.
    displacement = self.vdisps.get(cid, self.default_vdisp)
    return scale_y * displacement

Font

Source code in playa/font.py
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
class Font:
    """Common base for fonts.

    Stores the font descriptor, a CID-to-width table and the metrics
    read from the descriptor, and offers glyph metric lookups
    (displacements and a standardized bounding box) expressed in
    text space units.
    """

    # Class-level defaults.
    vertical: bool = False
    multibyte: bool = False
    encoding: Dict[int, str]

    def __init__(
        self,
        descriptor: Dict[str, PDFObject],
        widths: Dict[int, float],
        default_width: Optional[float] = None,
    ) -> None:
        """Initialize metrics from a font descriptor.

        Args:
            descriptor: Font descriptor dictionary.
            widths: Glyph widths indexed by CID, in glyph space.
            default_width: Width used for CIDs missing from `widths`;
                when `None`, the descriptor's `MissingWidth` (or
                1000 if absent) is used.
        """
        self.descriptor = descriptor
        self.widths = widths

        name_obj = resolve1(descriptor.get("FontName"))
        if isinstance(name_obj, PSLiteral):
            self.fontname = literal_name(name_obj)
        elif isinstance(name_obj, (bytes, str)):
            self.fontname = decode_text(name_obj)
        else:
            self.fontname = "unknown"
        self.basefont = self.fontname

        self.flags = int_value(descriptor.get("Flags", 0))
        # The fallbacks for Ascent/Descent follow the default DW2
        # metrics.
        self.ascent = num_value(descriptor.get("Ascent", 880))
        self.descent = num_value(descriptor.get("Descent", -120))
        self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))
        self.default_width = (
            num_value(descriptor.get("MissingWidth", 1000))
            if default_width is None
            else default_width
        )
        self.leading = num_value(descriptor.get("Leading", 0))
        self.bbox = (
            rect_value(descriptor["FontBBox"])
            if "FontBBox" in descriptor
            else (0, 0, 0, 0)
        )
        self.matrix: Matrix = (0.001, 0, 0, 0.001, 0, 0)

        # According to PDF RM 9.8.1, /Descent should always be
        # negative.  PScript5.dll appears to emit a positive Descent,
        # and taking that at face value breaks text analysis, so the
        # sign is forced to negative here.
        if self.descent > 0:
            self.descent = -self.descent
        # NOTE: A Type3 font *can* legitimately have a positive
        # descent since its FontMatrix may be flipped; the subclass
        # deals with that (and ascent/descent are ignored for Type3
        # fonts anyway).

        # For reasons unknown, Ascent and Descent are sometimes both
        # zero; fall back on the bbox in that case.
        if self.ascent == 0 == self.descent:
            _, self.descent, _, self.ascent = self.bbox

    def __repr__(self) -> str:
        """Return a fixed, anonymous representation."""
        return "<Font>"

    def decode(self, data: bytes) -> Iterable[Tuple[int, str]]:
        """Lazily pair each byte of `data` with the character that
        has the same code point (an identity mapping)."""
        log.debug("decode with identity: %r", data)
        return ((code, chr(code)) for code in data)

    def hdisp(self, cid: int) -> float:
        """Return the horizontal displacement (the so-called "width")
        of the character with the given CID."""
        glyph_width = self.widths.get(cid, self.default_width)
        return self.matrix[0] * glyph_width

    def vdisp(self, cid: int) -> float:
        """Return the vertical displacement for vertical writing
        mode, in text space units.

        Simple fonts have no vertical writing mode, so this is
        always 0.

        """
        return 0

    def position(self, cid: int) -> Tuple[float, float]:
        """Return the position vector for vertical writing mode, in
        text space units.

        Simple fonts have no vertical writing mode, so this is
        always `[0 0]`.
        """
        return 0, 0

    def char_bbox(self, cid: int) -> Rect:
        """Return the standard bounding box of the character with
        the given CID.

        Very specifically, this is `[0 descent width ascent]` in
        text space units.

        Danger: Not the actual bounding box of the glyph.
            The box is standardized for the benefit of text
            extraction and layout analysis.  It is not the bounding
            box that the font program specifies for the individual
            glyph.
        """
        glyph_width = self.widths.get(cid, self.default_width)
        # The matrix is known to be diagonal, so only the two scale
        # factors matter.
        x_scale, y_scale = self.matrix[0], self.matrix[3]
        return (
            0,
            y_scale * self.descent,
            x_scale * glyph_width,
            y_scale * self.ascent,
        )

char_bbox(cid)

Get the standard bounding box for a character from its CID.

This is, very specifically, [0 descent width ascent] in text space units.

Not the actual bounding box of the glyph.

This is a standardized bounding box for use in text extraction and layout analysis. It does not correspond to the actual bounding box of an individual glyph as specified by the font program.

Source code in playa/font.py
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
def char_bbox(self, cid: int) -> Rect:
    """Return the standard bounding box of the character with the
    given CID.

    Very specifically, this is `[0 descent width ascent]` in text
    space units.

    Danger: Not the actual bounding box of the glyph.
        The box is standardized for the benefit of text extraction
        and layout analysis.  It is not the bounding box that the
        font program specifies for the individual glyph.
    """
    glyph_width = self.widths.get(cid, self.default_width)
    # The matrix is known to be diagonal, so only the two scale
    # factors matter.
    x_scale, y_scale = self.matrix[0], self.matrix[3]
    return (
        0,
        y_scale * self.descent,
        x_scale * glyph_width,
        y_scale * self.ascent,
    )

hdisp(cid)

Get the horizontal displacement (so-called "width") of a character from its CID.

Source code in playa/font.py
129
130
131
132
133
def hdisp(self, cid: int) -> float:
    """Return the horizontal displacement (the so-called "width") of
    the character with the given CID.

    CIDs missing from `self.widths` use `self.default_width`; the
    result is scaled by the horizontal component of the font matrix.
    """
    glyph_width = self.widths.get(cid, self.default_width)
    x_scale = self.matrix[0]
    return x_scale * glyph_width

position(cid)

Get position vector for vertical writing mode, in text space units.

This is always [0 0] for simple fonts as they have no vertical writing mode.

Source code in playa/font.py
145
146
147
148
149
150
151
152
def position(self, cid: int) -> Tuple[float, float]:
    """Return the position vector for vertical writing mode, in text
    space units.

    Simple fonts have no vertical writing mode, so the result is
    always `[0 0]`, whatever the CID.
    """
    return 0, 0

vdisp(cid)

Get vertical displacement for vertical writing mode, in text space units.

This is always 0 for simple fonts as they have no vertical writing mode.

Source code in playa/font.py
135
136
137
138
139
140
141
142
143
def vdisp(self, cid: int) -> float:
    """Return the vertical displacement for vertical writing mode,
    in text space units.

    Simple fonts have no vertical writing mode, so the result is
    always 0, whatever the CID.
    """
    return 0

Type1Font

Bases: SimpleFont

Source code in playa/font.py
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
class Type1Font(SimpleFont):
    """Simple font built from a Type 1 font dictionary.

    When the base font has an entry in `FONT_METRICS`, the
    descriptor and the per-glyph-name widths come from there;
    otherwise they are read from the font dictionary itself.
    """

    # Widths indexed by glyph name; only set when the base font is
    # found in FONT_METRICS.
    char_widths: Union[Dict[str, int], None] = None

    def __init__(self, spec: Dict[str, PDFObject]) -> None:
        """Build the font from its font dictionary `spec`."""
        self.basefont = get_basefont(spec)
        widths: Dict[int, float] = {}
        if self.basefont in FONT_METRICS:
            descriptor, self.char_widths = FONT_METRICS[self.basefont]
        else:
            descriptor = dict_value(spec.get("FontDescriptor", {}))
            first_code = int_value(spec.get("FirstChar", 0))
            # LastChar is not consulted; the length of Widths decides
            # how many codes get an entry.
            raw_widths = list_value(spec.get("Widths", [0] * 256))
            widths = {
                code: num_value(raw)
                for code, raw in enumerate(raw_widths, start=first_code)
            }
        SimpleFont.__init__(self, descriptor, widths, spec)

    def get_implicit_encoding(
        self, descriptor: Dict[str, PDFObject]
    ) -> Union[PSLiteral, Dict[int, str], None]:
        """Determine the encoding to use when the font dictionary
        does not fully specify one.

        Args:
            descriptor: Font descriptor dictionary.

        Returns:
            Either an encoding name or a mapping from character code
            to glyph name.
        """
        # PDF 1.7 Table 114: For a font program that is embedded in
        # the PDF file, the implicit base encoding shall be the font
        # program’s built-in encoding.
        if "FontFile" in descriptor:
            self.fontfile = stream_value(descriptor.get("FontFile"))
            prefix_len = int_value(self.fontfile["Length1"])
            prefix = self.fontfile.buffer[:prefix_len]
            return Type1FontHeaderParser(prefix).get_encoding()
        if "FontFile3" in descriptor:
            self.fontfile3 = stream_value(descriptor.get("FontFile3"))
            try:
                program = CFFFontProgram(self.basefont, BytesIO(self.fontfile3.buffer))
                self.cfffont = program
                return {
                    cid: program.gid2name[gid]
                    for cid, gid in program.code2gid.items()
                    if gid in program.gid2name
                }
            except Exception:
                # Best effort: an unparseable font program falls back
                # on the standard encoding.
                log.debug("Failed to parse CFFFont %r", self.fontfile3, exc_info=True)
                return LITERAL_STANDARD_ENCODING
        if self.basefont == "Symbol":
            # FIXME: This (and zapf) can be obtained from the AFM files
            return SYMBOL_BUILTIN_ENCODING
        if self.basefont == "ZapfDingbats":
            return ZAPFDINGBATS_BUILTIN_ENCODING
        # PDF 1.7 Table 114: Otherwise, for a nonsymbolic font, it
        # shall be StandardEncoding, and for a symbolic font, it
        # shall be the font's built-in encoding (see FIXME above)
        return LITERAL_STANDARD_ENCODING

    def _glyph_space_width(self, cid: int) -> float:
        """Return the width of `cid` in glyph space units, falling
        back on `self.default_width` when it is unknown."""
        # Commit 6e4f36d <- what's the purpose of this? seems very cursed
        # reverting this would make #76 easy to fix since cid2unicode would only be
        # needed when ToUnicode is absent
        #
        # Answer: this is here solely so that core fonts with a
        # custom Encoding defined over them (accented characters,
        # for instance) work.  The proper fix is to rework the AFM
        # parsing so that it:
        #
        # - Gets the implicit encoding (usually LITERAL_STANDARD_ENCODING)
        # - Indexes the widths by glyph name rather than encoding value
        # - As a treat, also yields the encodings for Symbol and ZapfDingbats
        #
        # After that `self.widths` can be built directly from
        # `self.encoding`.
        if self.char_widths is None:
            return self.widths.get(cid, self.default_width)
        if cid not in self._cid2unicode:
            return self.default_width
        glyph = self._cid2unicode[cid]
        return self.char_widths.get(glyph, self.default_width)

    def hdisp(self, cid: int) -> float:
        """Return the horizontal displacement (the so-called "width")
        of the character with the given CID."""
        x_scale = self.matrix[0]
        return x_scale * self._glyph_space_width(cid)

    def char_bbox(self, cid: int) -> Rect:
        """Return the standard bounding box of the character with
        the given CID.

        Very specifically, this is `[0 descent width ascent]` in
        text space units.

        Danger: Not the actual bounding box of the glyph.
            The box is standardized for the benefit of text
            extraction and layout analysis.  It is not the bounding
            box that the font program specifies for the individual
            glyph.
        """
        glyph_width = self._glyph_space_width(cid)
        # The matrix is known to be diagonal, so only the two scale
        # factors matter.
        x_scale, y_scale = self.matrix[0], self.matrix[3]
        return (
            0,
            y_scale * self.descent,
            x_scale * glyph_width,
            y_scale * self.ascent,
        )

    def __repr__(self) -> str:
        """Return a short representation naming the base font."""
        return f"<Type1Font: basefont={self.basefont!r}>"

char_bbox(cid)

Get the standard bounding box for a character from its CID.

This is, very specifically, [0 descent width ascent] in text space units.

Not the actual bounding box of the glyph.

This is a standardized bounding box for use in text extraction and layout analysis. It does not correspond to the actual bounding box of an individual glyph as specified by the font program.

Source code in playa/font.py
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
def char_bbox(self, cid: int) -> Rect:
    """Return the standard bounding box of the character with the
    given CID.

    Very specifically, this is `[0 descent width ascent]` in text
    space units.

    Danger: Not the actual bounding box of the glyph.
        The box is standardized for the benefit of text extraction
        and layout analysis.  It is not the bounding box that the
        font program specifies for the individual glyph.
    """
    glyph_width = self._glyph_space_width(cid)
    # The matrix is known to be diagonal, so only the two scale
    # factors matter.
    x_scale, y_scale = self.matrix[0], self.matrix[3]
    return (
        0,
        y_scale * self.descent,
        x_scale * glyph_width,
        y_scale * self.ascent,
    )

hdisp(cid)

Get the horizontal displacement (so-called "width") of a character from its CID.

Source code in playa/font.py
321
322
323
324
def hdisp(self, cid: int) -> float:
    """Return the horizontal displacement (the so-called "width") of
    the character with the given CID.

    The glyph-space width comes from `self._glyph_space_width` and
    is scaled by the horizontal component of the font matrix.
    """
    x_scale = self.matrix[0]
    return x_scale * self._glyph_space_width(cid)

Type3Font

Bases: SimpleFont

Source code in playa/font.py
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
class Type3Font(SimpleFont):
    """Simple font built from a Type 3 font dictionary.

    Widths, the font matrix, the font bounding box, the glyph
    procedures and the resources are all taken from the font
    dictionary.
    """

    def __init__(self, spec: Dict[str, PDFObject]) -> None:
        """Build the font from its font dictionary `spec`."""
        first_code = int_value(spec.get("FirstChar", 0))
        # LastChar is not consulted; the length of Widths decides how
        # many codes get an entry.
        raw_widths = list_value(spec.get("Widths", [0] * 256))
        widths = {
            code: num_value(raw)
            for code, raw in enumerate(raw_widths, start=first_code)
        }
        descriptor = dict_value(spec.get("FontDescriptor", {}))
        SimpleFont.__init__(self, descriptor, widths, spec)
        # A Type 3 font dictionary has no BaseFont, and its
        # descriptor generally has no FontName either
        # (https://github.com/pdf-association/pdf-issues/issues/11)
        # since such fonts are not considered subsettable.  So the
        # name is taken from Name, disregarding whatever
        # SimpleFont.__init__ came up with.
        name_obj = resolve1(descriptor.get("FontName", spec.get("Name")))
        if isinstance(name_obj, PSLiteral):
            self.fontname = name_obj.name
        elif isinstance(name_obj, bytes):
            self.fontname = decode_text(name_obj)
        else:
            self.fontname = "unknown"
        self.basefont = self.fontname
        # Character definitions, kept so they can be interpreted
        self.charprocs = dict_value(spec.get("CharProcs", {}))
        # Font-specific resources (FIXME: There is a huge amount
        # of ambiguity surrounding resources in Type3 fonts, see
        # https://github.com/pdf-association/pdf-issues/issues/128)
        resources = resolve1(spec.get("Resources"))
        self.resources: Union[None, Dict[str, PDFObject]] = (
            dict_value(resources) if resources is not None else None
        )
        # FontMatrix is actually required, but tolerate its absence
        self.matrix = (
            matrix_value(spec["FontMatrix"])
            if "FontMatrix" in spec
            else (0.001, 0, 0, 0.001, 0, 0)
        )
        # For Type 3 fonts the FontBBox lives in the font dictionary
        # (it is required too); when it is missing, the value set by
        # SimpleFont.__init__ stays in place.
        if "FontBBox" in spec:
            self.bbox = rect_value(spec["FontBBox"])

        # Ascent/descent come from the bbox (they *could* be in the
        # descriptor, but that is very unlikely, and even then they
        # might both be zero, which is bad)
        _, self.descent, _, self.ascent = self.bbox

    def get_implicit_encoding(
        self, descriptor: Dict[str, PDFObject]
    ) -> Union[PSLiteral, Dict[int, str], None]:
        """Return an empty mapping: there is no implicit encoding."""
        # PDF 1.7 sec 9.6.6.3: A Type 3 font’s mapping from character
        # codes to glyph names shall be entirely defined by its
        # Encoding entry, which is required in this case.
        return {}

    def char_bbox(self, cid: int) -> Rect:
        """Return the standard bounding box of the character with
        the given CID.

        This is the smallest rectangle that encloses [0 descent
        width ascent] once the font matrix has been applied.

        Danger: Not the actual bounding box of the glyph (but almost).
            Descent and ascent are those of the **font**, not of the
            individual **glyph**, so the box is somewhat larger than
            the actual bounding box.
        """
        glyph_width = self.widths.get(cid, self.default_width)
        glyph_box = (0, self.descent, glyph_width, self.ascent)
        return transform_bbox(self.matrix, glyph_box)

    def __repr__(self) -> str:
        """Return a fixed, anonymous representation."""
        return "<Type3Font>"

char_bbox(cid)

Get the standard bounding box for a character from its CID.

This is the smallest rectangle enclosing [0 descent width ascent] after the font matrix has been applied.

Not the actual bounding box of the glyph (but almost).

The descent and ascent here are from the font and not from the individual glyph so this will be somewhat larger than the actual bounding box.

Source code in playa/font.py
423
424
425
426
427
428
429
430
431
432
433
434
435
def char_bbox(self, cid: int) -> Rect:
    """Return the standard bounding box of the character with the
    given CID.

    This is the smallest rectangle that encloses [0 descent width
    ascent] once the font matrix has been applied.

    Danger: Not the actual bounding box of the glyph (but almost).
        Descent and ascent are those of the **font**, not of the
        individual **glyph**, so the box is somewhat larger than
        the actual bounding box.
    """
    glyph_width = self.widths.get(cid, self.default_width)
    glyph_box = (0, self.descent, glyph_width, self.ascent)
    return transform_bbox(self.matrix, glyph_box)

playa.parser

PDF lexer and parser

API subject to change.

These APIs are unstable and subject to revision before PLAYA 1.0.

ContentParser

Bases: ObjectParser

Parse the concatenation of multiple content streams, as described in the spec (PDF 1.7, p.86):

...the effect shall be as if all of the streams in the array were concatenated, in order, to form a single stream. Conforming writers can create image objects and other resources as they occur, even though they interrupt the content stream. The division between streams may occur only at the boundaries between lexical tokens (see 7.2, "Lexical Conventions") but shall be unrelated to the page’s logical content or organization.

Source code in playa/parser.py
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
class ContentParser(ObjectParser):
    """Parser for the concatenation of several content streams, as
    the spec describes it (PDF 1.7, p.86):

    ...the effect shall be as if all of the streams in the array were
    concatenated, in order, to form a single stream.  Conforming
    writers can create image objects and other resources as they
    occur, even though they interrupt the content stream. The division
    between streams may occur only at the boundaries between lexical
    tokens (see 7.2, "Lexical Conventions") but shall be unrelated to
    the page’s logical content or organization.
    """

    def __init__(self, streams: Iterable[PDFObject]) -> None:
        """Start parsing with the first of `streams`.

        When there are no streams at all, or the first item is not a
        stream (a warning is logged in that case), parsing starts on
        an empty buffer.
        """
        self.streamiter = iter(streams)
        try:
            first = stream_value(next(self.streamiter))
            super().__init__(first.buffer)
        except (StopIteration, TypeError) as exc:
            if isinstance(exc, TypeError):
                log.warning("Found non-stream in contents: %r", streams)
            super().__init__(b"")

    def nexttoken(self) -> Tuple[int, Token]:
        """Return the next token, moving on to the following stream
        once the current one runs out.

        TODO: If we want to avoid evil implementation inheritance, we
        should do this in the lexer instead.
        """
        while True:
            try:
                return super().nexttoken()
            except StopIteration:
                # When no streams are left, next() raises
                # StopIteration itself, which is precisely the
                # behaviour wanted here.
                try:
                    candidate = next(self.streamiter)
                    following = stream_value(candidate)
                    self.newstream(following.buffer)
                except TypeError:
                    # Not a stream: warn and try the one after it
                    log.warning("Found non-stream in contents: %r", candidate)

nexttoken()

Override nexttoken() to continue parsing in subsequent streams.

TODO: If we want to avoid evil implementation inheritance, we should do this in the lexer instead.

Source code in playa/parser.py
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
def nexttoken(self) -> Tuple[int, Token]:
    """Return the next token, moving on to the following stream once
    the current one runs out.

    TODO: If we want to avoid evil implementation inheritance, we
    should do this in the lexer instead.
    """
    while True:
        try:
            return super().nexttoken()
        except StopIteration:
            # When no streams are left, next() raises StopIteration
            # itself, which is precisely the behaviour wanted here.
            try:
                candidate = next(self.streamiter)
                following = stream_value(candidate)
                self.newstream(following.buffer)
            except TypeError:
                # Not a stream: warn and try the one after it
                log.warning("Found non-stream in contents: %r", candidate)

IndirectObjectParser

IndirectObjectParser fetches indirect objects from a data stream. It holds a weak reference to the document in order to resolve indirect references. If the document is deleted then this will obviously no longer work.

Note that according to PDF 1.7 sec 7.5.3, "The body of a PDF file shall consist of a sequence of indirect objects representing the contents of a document." Therefore unlike the base ObjectParser, IndirectObjectParser returns only indirect objects and not bare keywords, strings, numbers, etc.

However, unlike ObjectParser, it will also read and return ContentStreams, as these must be indirect objects by definition.

Typical usage

parser = IndirectObjectParser(fp, doc) for object in parser: ...

Source code in playa/parser.py
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
class IndirectObjectParser:
    """IndirectObjectParser fetches indirect objects from a data
    stream.  It holds a weak reference to the document in order to
    resolve indirect references.  If the document is deleted then this
    will obviously no longer work.

    Note that according to PDF 1.7 sec 7.5.3, "The body of a PDF file
    shall consist of a sequence of indirect objects representing the
    contents of a document."  Therefore unlike the base `ObjectParser`,
    `IndirectObjectParser` returns *only* indirect objects and not bare
    keywords, strings, numbers, etc.

    However, unlike `ObjectParser`, it will also read and return
    `ContentStream`s, as these *must* be indirect objects by definition.

    Typical usage:
      parser = IndirectObjectParser(fp, doc)
      for object in parser:
          ...

    """

    def __init__(
        self,
        data: Union[bytes, mmap.mmap],
        doc: Union["Document", None] = None,
        pos: int = 0,
        strict: bool = False,
    ) -> None:
        """Create a parser over `data`.

        Args:
            data: Buffer to read indirect objects from.
            doc: Document used to resolve references and to decrypt
                objects, if any.
            pos: Position in `data` at which to start parsing.
            strict: If true, syntax errors raise `PDFSyntaxError`
                instead of being logged and skipped.
        """
        self._parser = ObjectParser(data, doc, pos=pos, strict=strict)
        self.buffer = data
        self.objstack: List[Tuple[int, Union[PDFObject, ContentStream]]] = []
        self.docref = None if doc is None else _ref_document(doc)
        self.strict = strict
        self.decipher = None if doc is None else doc.decipher

    @property
    def doc(self) -> Union["Document", None]:
        """Get associated document if it exists."""
        if self.docref is None:
            return None
        return _deref_document(self.docref)

    def __iter__(self) -> Iterator[Tuple[int, IndirectObject]]:
        """Return the parser itself, which is its own iterator."""
        return self

    def __next__(self) -> Tuple[int, IndirectObject]:
        """Return the position and contents of the next indirect
        object.

        Raises:
            StopIteration: When the underlying parser is exhausted.
            PDFSyntaxError: In strict mode, on any error encountered
                while parsing.  In non-strict mode such errors are
                logged as warnings and parsing carries on.
        """
        obj: Union[PDFObject, ContentStream]
        # Bind `pos` up front: if the very first call to the
        # underlying parser fails, the error handler below still
        # needs a position to report.
        pos = self._parser.tell()
        while True:
            try:
                pos, obj = next(self._parser)
                if isinstance(obj, PSKeyword) and obj.name.startswith(b"endobj"):
                    return self._endobj(pos, obj)
                elif obj is KEYWORD_STREAM:
                    stream = self._stream(pos, obj)
                    self.objstack.append((pos, stream))
                elif obj is KEYWORD_ENDSTREAM:
                    if not isinstance(self.objstack[-1][1], ContentStream):
                        log.warning("Got endstream without a stream, ignoring!")
                elif isinstance(obj, PSKeyword) and obj.name.startswith(b"endstream"):
                    # Some broken PDFs have junk after "endstream"
                    errmsg = "Expected 'endstream', got %r" % (obj,)
                    raise PDFSyntaxError(errmsg)
                else:
                    self.objstack.append((pos, obj))
            except StopIteration:
                raise
            except Exception as e:
                errmsg = "Syntax error near position %d: %s" % (pos, e)
                if self.strict:
                    raise PDFSyntaxError(errmsg) from e
                else:
                    log.warning(errmsg)
                    continue

    def _endobj(self, pos: int, obj: PDFObject) -> Tuple[int, IndirectObject]:
        """Assemble an indirect object from the top of the stack once
        `endobj` has been seen.

        Args:
            pos: Position of the `endobj` keyword.
            obj: The keyword itself (possibly with trailing junk).

        Returns:
            The position of the start of the indirect object and the
            object itself.

        Raises:
            PDFSyntaxError: If the stack does not hold
                `objid genno obj <object>`.
        """
        # Some broken PDFs omit the space after `endobj`...
        if obj is not KEYWORD_ENDOBJ:
            self._parser.seek(pos + len(b"endobj"))
        # objid genno "obj" (skipped) ... and the object
        (_, obj) = self.objstack.pop()
        (kpos, kwd) = self.objstack.pop()
        if kwd is not KEYWORD_OBJ:
            errmsg = "Expected 'obj' at %d, got %r" % (kpos, kwd)
            raise PDFSyntaxError(errmsg)
        (_, genno) = self.objstack.pop()
        # Update pos to be the beginning of the indirect object
        (pos, objid) = self.objstack.pop()
        try:
            objid = int_value(objid)
            genno = int_value(genno)
        except TypeError as e:
            objs = " ".join(
                repr(obj)
                for obj in itertools.chain(
                    (x[1] for x in self.objstack), (objid, genno, obj)
                )
            )
            errmsg = (
                f"Failed to parse indirect object at {pos}: "
                f"got: {objs} "
                f"before 'endobj'"
            )
            raise PDFSyntaxError(errmsg) from e
        # ContentStream is *special* and needs these
        # internally for decryption.
        if isinstance(obj, ContentStream):
            obj.objid = objid
            obj.genno = genno
        # Decrypt indirect objects at top level (inside object streams
        # they are handled by ObjectStreamParser)
        if self.decipher:
            return pos, IndirectObject(
                objid,
                genno,
                decipher_all(self.decipher, objid, genno, obj),
            )
        else:
            return pos, IndirectObject(objid, genno, obj)

    def _stream(self, pos: int, obj: PDFObject) -> ContentStream:
        """Read the stream whose `stream` keyword is at `pos`, using
        the dictionary on top of the stack.

        Args:
            pos: Position of the `stream` keyword.
            obj: The keyword itself (unused).

        Returns:
            The stream built from the dictionary and the data read.

        Raises:
            PDFSyntaxError: If the top of the stack is not a
                dictionary, or, in strict mode, if the data is not
                followed by an end-of-line marker or `endstream`.
        """
        # PDF 1.7 sec 7.3.8.1: A stream shall consist of a
        # dictionary followed by zero or more bytes bracketed
        # between the keywords `stream` (followed by newline)
        # and `endstream`
        (_, dic) = self.objstack.pop()
        if not isinstance(dic, dict):
            # sec 7.3.8.1: the stream dictionary shall be a
            # direct object.
            # NOTE: exceptions do not interpolate logging-style
            # arguments, so the message is formatted here.
            raise PDFSyntaxError("Incorrect type for stream dictionary %r" % (dic,))
        try:
            # sec 7.3.8.2: Every stream dictionary shall have
            # a Length entry that indicates how many bytes of
            # the PDF file are used for the stream’s data
            # FIXME: This call is **not** thread-safe as we currently
            # reuse the same IndirectObjectParser to resolve references
            objlen = int_value(dic["Length"])
        except KeyError:
            log.warning("/Length is undefined in stream dictionary %r", dic)
            objlen = 0
        except ValueError:
            # FIXME: This warning should be suppressed in fallback
            # xref parsing, since we obviously can't resolve any
            # references yet.  Either that or fallback xref parsing
            # should just run a regex over the PDF and not try to
            # actually parse the objects (probably a better solution)
            log.warning("/Length reference cannot be resolved in %r", dic)
            objlen = 0
        except TypeError:
            # FIXME: This may happen with incremental updates
            log.warning("/Length reference resolves to non-integer in %r", dic)
            objlen = 0
        # sec 7.3.8.1: The keyword `stream` that follows the stream
        # dictionary shall be followed by an end-of-line
        # marker consisting of either a CARRIAGE RETURN and a
        # LINE FEED or just a LINE FEED, and not by a CARRIAGE
        # RETURN alone.
        self._parser.seek(pos)
        _, line = self._parser.nextline()
        assert line.strip() == b"stream"
        # Because PDFs do not follow the spec, we will read
        # *at least* the specified number of bytes, which
        # could be zero (particularly if not specified!), up
        # until the "endstream" tag.  In most cases it is
        # expected that this extra data will be included in
        # the stream anyway, but for encrypted streams you
        # probably don't want that (LOL @ PDF "security")
        data = self._parser.read(objlen)
        doc = self.doc
        decipher = None if doc is None else doc.decipher
        # sec 7.3.8.1: There should be an end-of-line marker after the
        # data and before endstream; this marker shall not be included
        # in the stream length.
        #
        # TRANSLATION: We expect either one of PDF's many end-of-line
        # markers, endstream, or EOL + endstream.  If we get something
        # else, it's an error in strict mode, otherwise, we throw it
        # on the pile and keep going.
        pos = self._parser.tell()
        m = ENDSTREAMR.match(self._parser._lexer.data, pos)
        if m is not None:
            return ContentStream(dic, bytes(data), decipher)
        # We already know it's an error in strict mode, but read the
        # line anyway to show the user what's wrong
        pos, line = self._parser.nextline()
        if self.strict:
            raise PDFSyntaxError(
                "Expected newline or 'endstream', got %r" % (line,)
            )
        # Now glom on all the data until we see endstream
        while True:
            if b"endstream" in line:
                idx = line.index(b"endstream")
                data += line[:idx]
                self._parser.seek(pos + idx)
                break
            data += line
            pos, line = self._parser.nextline()
            if line == b"":  # Means EOF
                log.warning("Incorrect length for stream, no 'endstream' found")
                break
        return ContentStream(dic, bytes(data), decipher)

    # Delegation follows
    def seek(self, pos: int) -> None:
        """Seek to a position."""
        self._parser.seek(pos)

    def reset(self) -> None:
        """Clear internal parser state."""
        self._parser.reset()

doc property

Get associated document if it exists.

reset()

Clear internal parser state.

Source code in playa/parser.py
842
843
844
def reset(self) -> None:
    """Reset the underlying parser by delegating to its `reset()`."""
    delegate = self._parser
    delegate.reset()

seek(pos)

Seek to a position.

Source code in playa/parser.py
838
839
840
def seek(self, pos: int) -> None:
    """Move the underlying parser to position `pos`."""
    delegate = self._parser
    delegate.seek(pos)

Lexer

Lexer for PDF data.

Source code in playa/parser.py
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
class Lexer:
    """Lexer for PDF data.

    Iterating over a lexer yields `(position, token)` tuples, where
    `position` is the offset in `data` at which the token starts.
    """

    def __init__(self, data: Union[bytes, mmap.mmap], pos: int = 0) -> None:
        """Create a lexer over `data`, starting at offset `pos`."""
        self.data = data
        self.pos = pos
        self.end = len(data)
        # Token state is also reset by `seek`; set it here as well so
        # that the attributes exist on a freshly constructed lexer.
        self._curtoken = b""
        self._curtokenpos = 0
        self._tokens: Deque[Tuple[int, Token]] = deque()

    def seek(self, pos: int) -> None:
        """Seek to a position and reinitialize parser state."""
        self.pos = pos
        self._curtoken = b""
        self._curtokenpos = 0
        self._tokens.clear()

    def tell(self) -> int:
        """Get the current position in the buffer."""
        return self.pos

    def read(self, objlen: int) -> bytes:
        """Read data from current position, advancing to the end of
        this data.

        Args:
            objlen: Number of bytes to read.  The read is truncated at
                    the end of the buffer, and a negative length is
                    treated as zero.

        Returns:
            The bytes read (possibly fewer than `objlen`).
        """
        pos = self.pos
        # Clamp the length at zero: a negative value (e.g. from a
        # corrupt length entry) must never move the position backwards.
        self.pos = min(pos + max(objlen, 0), len(self.data))
        return self.data[pos : self.pos]

    def nextline(self) -> Tuple[int, bytes]:
        r"""Get the next line ending either with \r, \n, or \r\n,
        starting at the current position.

        Returns:
            A tuple of the position where the line starts and the
            line itself, including its end-of-line marker.  If no
            end-of-line marker is found, the rest of the buffer is
            returned (which is empty at the end of the data).
        """
        linepos = self.pos
        m = EOLR.search(self.data, self.pos)
        if m is None:
            self.pos = self.end
        else:
            self.pos = m.end()
        return (linepos, self.data[linepos : self.pos])

    def __iter__(self) -> Iterator[Tuple[int, Token]]:
        """Iterate over tokens."""
        return self

    def __next__(self) -> Tuple[int, Token]:
        """Get the next token in iteration, raising StopIteration when
        done."""
        # Skip over whitespace and comments until a real token is matched
        while True:
            m = LEXER.match(self.data, self.pos)
            if m is None:  # can only happen at EOS
                raise StopIteration
            self._curtokenpos = m.start()
            self.pos = m.end()
            if m.lastgroup not in ("whitespace", "comment"):  # type: ignore
                # Okay, we got a token or something
                break
        self._curtoken = m[0]
        if m.lastgroup == "name":  # type: ignore
            # Drop the leading character of the match, then decode
            # whatever HEXDIGIT matches into single bytes
            self._curtoken = m[0][1:]
            self._curtoken = HEXDIGIT.sub(
                lambda x: bytes((int(x[1], 16),)), self._curtoken
            )
            tok = LIT(name_str(self._curtoken))
            return (self._curtokenpos, tok)
        if m.lastgroup == "number":  # type: ignore
            if b"." in self._curtoken:
                return (self._curtokenpos, float(self._curtoken))
            else:
                return (self._curtokenpos, int(self._curtoken))
        if m.lastgroup == "startdict":  # type: ignore
            return (self._curtokenpos, KEYWORD_DICT_BEGIN)
        if m.lastgroup == "enddict":  # type: ignore
            return (self._curtokenpos, KEYWORD_DICT_END)
        if m.lastgroup == "startstr":  # type: ignore
            return self._parse_endstr(self.data[m.start() + 1 : m.end()], m.end())
        if m.lastgroup == "hexstr":  # type: ignore
            # Strip the delimiters and embedded whitespace, and pad an
            # odd number of digits with a trailing zero
            self._curtoken = SPC.sub(b"", self._curtoken[1:-1])
            if len(self._curtoken) % 2 == 1:
                self._curtoken += b"0"
            return (self._curtokenpos, unhexlify(self._curtoken))
        # Anything else is treated as a keyword (whether explicitly matched or not)
        if self._curtoken == b"true":
            return (self._curtokenpos, True)
        elif self._curtoken == b"false":
            return (self._curtokenpos, False)
        else:
            return (self._curtokenpos, KWD(self._curtoken))

    def _parse_endstr(self, start: bytes, pos: int) -> Tuple[int, Token]:
        """Parse the remainder of a string.

        Args:
            start: The part of the string already consumed by the
                   match that opened it (without its first character).
            pos: Position in the data at which to continue scanning.

        Returns:
            A tuple of the position of the string token and its
            decoded contents.

        Raises:
            StopIteration: If the string is not terminated, which ends
                           the iteration in `__next__`.
        """
        # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15)
        parts = [EOLR.sub(b"\n", start)]
        # Nesting depth of unescaped parentheses; the opening one counts
        paren = 1
        for m in STRLEXER.finditer(self.data, pos):
            self.pos = m.end()
            if m.lastgroup == "parenright":  # type: ignore
                paren -= 1
                if paren == 0:
                    # By far the most common situation!
                    break
                parts.append(m[0])
            elif m.lastgroup == "parenleft":  # type: ignore
                parts.append(m[0])
                paren += 1
            elif m.lastgroup == "escape":  # type: ignore
                char = m[0][1:2]
                if char not in ESC_STRING:
                    # PDF 1.7 sec 7.3.4.2: If the character following
                    # the REVERSE SOLIDUS is not one of those shown in
                    # Table 3, the REVERSE SOLIDUS shall be ignored.
                    parts.append(char)
                else:
                    parts.append(bytes((ESC_STRING[char],)))
            elif m.lastgroup == "octal":  # type: ignore
                chrcode = int(m[0][1:], 8)
                if chrcode >= 256:
                    # PDF1.7 p.16: "high-order overflow shall be
                    # ignored."
                    log.warning("Invalid octal %r (%d)", m[0][1:], chrcode)
                else:
                    parts.append(bytes((chrcode,)))
            elif m.lastgroup == "newline":  # type: ignore
                # Handle nonsense CRLF conversion in strings (PDF 1.7, p.15)
                parts.append(b"\n")
            elif m.lastgroup == "linebreak":  # type: ignore
                pass
            else:
                parts.append(m[0])
        if paren != 0:
            log.warning("Unterminated string at %d", pos)
            raise StopIteration
        return (self._curtokenpos, b"".join(parts))

__iter__()

Iterate over tokens.

Source code in playa/parser.py
185
186
187
def __iter__(self) -> Iterator[Tuple[int, Token]]:
    """Return this object itself, which serves as its own token iterator."""
    return self

__next__()

Get the next token in iteration, raising StopIteration when done.

Source code in playa/parser.py
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
def __next__(self) -> Tuple[int, Token]:
    """Produce the next `(position, token)` pair.

    Whitespace and comments are skipped.  StopIteration is raised
    once nothing more can be matched at the current position.
    """
    while True:
        m = LEXER.match(self.data, self.pos)
        if m is None:  # can only happen at EOS
            raise StopIteration
        self._curtokenpos = m.start()
        self.pos = m.end()
        kind = m.lastgroup
        if kind != "whitespace" and kind != "comment":
            # Okay, we got a token or something
            break

    start = self._curtokenpos
    raw = m[0]
    self._curtoken = raw

    if kind == "name":
        # Drop the leading character, then decode whatever HEXDIGIT
        # matches into single bytes
        self._curtoken = raw[1:]
        self._curtoken = HEXDIGIT.sub(
            lambda x: bytes((int(x[1], 16),)), self._curtoken
        )
        return (start, LIT(name_str(self._curtoken)))

    if kind == "number":
        convert = float if b"." in raw else int
        return (start, convert(raw))

    if kind == "startdict":
        return (start, KEYWORD_DICT_BEGIN)

    if kind == "enddict":
        return (start, KEYWORD_DICT_END)

    if kind == "startstr":
        # Hand the part of the match after its first character, plus
        # the position where it ended, to the string parser
        return self._parse_endstr(self.data[start + 1 : self.pos], self.pos)

    if kind == "hexstr":
        # Strip delimiters and embedded whitespace; pad odd-length
        # input with a trailing zero
        hexdigits = SPC.sub(b"", raw[1:-1])
        if len(hexdigits) % 2 == 1:
            hexdigits += b"0"
        self._curtoken = hexdigits
        return (start, unhexlify(hexdigits))

    # Anything else is treated as a keyword (whether explicitly matched or not)
    if raw == b"true":
        return (start, True)
    if raw == b"false":
        return (start, False)
    return (start, KWD(raw))

nextline()

Get the next line ending either with \r, \n, or \r\n, starting at the current position.

Source code in playa/parser.py
174
175
176
177
178
179
180
181
182
183
def nextline(self) -> Tuple[int, bytes]:
    r"""Return the line beginning at the current position.

    The line ends with \r, \n, or \r\n (included in the result); if
    no such marker is found, everything up to the end of the data is
    returned.  The result is a tuple of the line's starting position
    and its bytes, and the current position moves past the line.
    """
    start = self.pos
    eol = EOLR.search(self.data, start)
    self.pos = self.end if eol is None else eol.end()
    return (start, self.data[start : self.pos])

read(objlen)

Read data from current position, advancing to the end of this data.

Source code in playa/parser.py
167
168
169
170
171
172
def read(self, objlen: int) -> bytes:
    """Read data from current position, advancing to the end of
    this data.

    Args:
        objlen: Number of bytes to read.  The read is truncated at
                the end of the buffer, and a negative length is
                treated as zero.

    Returns:
        The bytes read (possibly fewer than `objlen`).
    """
    pos = self.pos
    # Clamp the length at zero: a negative value (e.g. from a corrupt
    # length entry) must never move the position backwards.
    self.pos = min(pos + max(objlen, 0), len(self.data))
    return self.data[pos : self.pos]

seek(pos)

Seek to a position and reinitialize parser state.

Source code in playa/parser.py
156
157
158
159
160
161
def seek(self, pos: int) -> None:
    """Jump to `pos`, forgetting the current token and any queued tokens."""
    self.pos, self._curtoken, self._curtokenpos = pos, b"", 0
    pending = self._tokens
    pending.clear()

tell()

Get the current position in the buffer.

Source code in playa/parser.py
163
164
165
def tell(self) -> int:
    """Report the current offset into the buffer."""
    current = self.pos
    return current

ObjectParser

ObjectParser is used to parse PDF object streams (and content streams, which have the same syntax). Notably these consist of, well, a stream of objects without the surrounding obj and endobj tokens (which cannot occur in an object stream).

They can contain indirect object references (and so must be initialized with a Document to resolve these) but for perhaps obvious reasons (how would you parse that?) these cannot occur at the top level of the stream, only inside an array or dictionary.

Source code in playa/parser.py
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
class ObjectParser:
    """ObjectParser is used to parse PDF object streams (and
    content streams, which have the same syntax).  Notably these
    consist of, well, a stream of objects without the surrounding
    `obj` and `endobj` tokens (which cannot occur in an object
    stream).

    They can contain indirect object references (so, must be
    initialized with a `Document` to resolve these) but for perhaps
    obvious reasons (how would you parse that?) these cannot occur at
    the top level of the stream, only inside an array or dictionary.
    """

    def __init__(
        self,
        data: Union[bytes, mmap.mmap],
        doc: Union["Document", None] = None,
        pos: int = 0,
        strict: bool = False,
    ) -> None:
        """Create a parser over `data`.

        Args:
            data: Buffer to parse.
            doc: Document used for indirect object references, if any.
            pos: Position in `data` at which to start.
            strict: If true, raise on syntax errors instead of
                    logging a warning and carrying on.
        """
        self._lexer = Lexer(data, pos)
        self.stack: List[StackEntry] = []
        self.docref = None if doc is None else _ref_document(doc)
        self.strict = strict

    @property
    def doc(self) -> Union["Document", None]:
        """Get associated document if it exists."""
        if self.docref is None:
            return None
        return _deref_document(self.docref)

    def newstream(self, data: Union[bytes, mmap.mmap]) -> None:
        """Continue parsing from a new data stream."""
        # Only the lexer is replaced: the object stack is kept as is
        self._lexer = Lexer(data)

    def reset(self) -> None:
        """Clear internal parser state."""
        # This empties the object stack; the lexer is not touched
        del self.stack[:]

    def __iter__(self) -> Iterator[StackEntry]:
        """Iterate over (position, object) tuples."""
        return self

    def __next__(self) -> StackEntry:
        """Get next PDF object from stream (raises StopIteration at EOF)."""
        # `top` is the position of the outermost array, dictionary,
        # proc or inline image currently being built, or None when we
        # are at the top level of the stream.
        top: Union[int, None] = None
        obj: Union[Dict[Any, Any], List[PDFObject], PDFObject] = None
        while True:
            if self.stack and top is None:
                return self.stack.pop()
            (pos, token) = self.nexttoken()
            if token is KEYWORD_ARRAY_BEGIN:
                if top is None:
                    top = pos
                self.stack.append((pos, token))
            elif token is KEYWORD_ARRAY_END:
                try:
                    pos, obj = self.pop_to(KEYWORD_ARRAY_BEGIN)
                except (TypeError, PDFSyntaxError) as e:
                    if self.strict:
                        raise e
                    log.warning("When constructing array from %r: %s", obj, e)
                if pos == top:
                    top = None
                    return pos, obj
                self.stack.append((pos, obj))
            elif token is KEYWORD_DICT_BEGIN:
                if top is None:
                    top = pos
                self.stack.append((pos, token))
            elif token is KEYWORD_DICT_END:
                try:
                    (pos, objs) = self.pop_to(KEYWORD_DICT_BEGIN)
                    if len(objs) % 2 != 0:
                        error_msg = (
                            "Dictionary contains odd number of objects: %r" % objs
                        )
                        raise PDFSyntaxError(error_msg)
                    # Entries whose value is None are left out
                    obj = {
                        literal_name(k): v
                        for (k, v) in choplist(2, objs)
                        if v is not None
                    }
                except (TypeError, PDFSyntaxError) as e:
                    if self.strict:
                        raise e
                    log.warning("When constructing dict from %r: %s", self.stack, e)
                if pos == top:
                    top = None
                    return pos, obj
                self.stack.append((pos, obj))
            elif token is KEYWORD_PROC_BEGIN:
                if top is None:
                    top = pos
                self.stack.append((pos, token))
            elif token is KEYWORD_PROC_END:
                try:
                    pos, obj = self.pop_to(KEYWORD_PROC_BEGIN)
                except (TypeError, PDFSyntaxError) as e:
                    if self.strict:
                        raise e
                    log.warning("When constructing proc from %r: %s", obj, e)
                if pos == top:
                    top = None
                    return pos, obj
                self.stack.append((pos, obj))
            elif token is KEYWORD_NULL:
                self.stack.append((pos, None))
            elif token is KEYWORD_R:
                # reference to indirect object (only allowed inside another object)
                if top is None:
                    log.warning("Ignoring indirect object reference at top level")
                    self.stack.append((pos, token))
                else:
                    obj = self.get_object_reference(pos, token)
                    if obj is not None:
                        self.stack.append((pos, obj))
            elif token is KEYWORD_BI:
                # Inline images must occur at the top level, otherwise
                # something is wrong (probably a corrupt file)
                if top is not None:
                    raise PDFSyntaxError(
                        "Inline image not at top level of stream "
                        f"({pos} != {top}, {self.stack})"
                    )
                top = pos
                self.stack.append((pos, token))
            elif token is KEYWORD_ID:
                obj = self.get_inline_image(pos, token)
                if obj is not None:
                    top = None
                    return pos, obj
            else:
                # Literally anything else, including any other keyword
                # (will be returned above if top is None, or later if
                # we are inside some object)
                self.stack.append((pos, token))

    def pop_to(self, token: PSKeyword) -> Tuple[int, List[PDFObject]]:
        """Pop everything from the stack back to token.

        Returns:
            A tuple of the position of `token` on the stack and the
            objects that were above it, in their original order.

        Raises:
            PDFSyntaxError: If `token` is not on the stack (in which
                            case the stack has been emptied).
        """
        context: List[PDFObject] = []
        while self.stack:
            pos, last = self.stack.pop()
            if last is token:
                context.reverse()
                return pos, context
            context.append(last)
        raise PDFSyntaxError(f"Unmatched end token {token!r}")

    def get_object_reference(self, pos: int, token: Token) -> Union[ObjRef, None]:
        """Get an indirect object reference upon finding an "R" token.

        The generation number and object ID are popped from the stack.

        Returns:
            The object reference, or `None` (in non-strict mode) if
            it is invalid.

        Raises:
            PDFSyntaxError: In strict mode, if the stack does not hold
                            a generation number and object ID, or if
                            the object ID is 0.
        """
        try:
            _pos, _genno = self.stack.pop()
            _pos, objid = self.stack.pop()
        except (IndexError, ValueError) as e:
            # Popping from an empty stack raises IndexError (not
            # ValueError), which happens when fewer than two entries
            # precede the "R" token.
            if self.strict:
                raise PDFSyntaxError(
                    "Expected generation and object id in indirect object reference"
                ) from e
            else:
                log.warning(
                    "Expected generation and object id in indirect object reference: %s",
                    e,
                )
            return None
        objid = int_value(objid)
        if objid == 0:
            if self.strict:
                raise PDFSyntaxError(
                    "Object ID in reference at pos %d cannot be 0" % (pos,)
                )
            log.warning("Ignoring indirect object reference to 0 at %s", pos)
            return None
        return ObjRef(self.docref, objid)

    def get_inline_image(self, pos: int, token: Token) -> Union[InlineImage, None]:
        """Get an inline image upon finding an "ID" token.

        Returns the inline image, or `None` if no `EI` token ending
        it could be found at all.  Advances the file pointer to a
        position after the "EI" token that (we hope) ends the image.

        Note: WELCOME IN THE HELL!!!
            If you're lucky enough to have PDF 2.0 documents, then you
            can skip this, you aren't actually in (the) hell.  Otherwise
            read on to know why you might be missing images or reading
            a lot of garbage in your logs:

            - The PDF 1.7 standard only specifies that image data must be
              delimited by `ID` and `EI`, and that "The bytes between
              the `ID` and `EI` operators shall be treated the same as a
              stream object’s data, even though they do not follow the
              standard stream syntax."  What does that even mean?
            - And, that must be "a single whitespace character"
              following `ID` (floating in perfume, served in a man's
              hat), except in the case of `ASCIIHexDecode` or
              `ASCII85Decode` (in which case there can just be any
              whitespace you like, or none at all).
            - It's obviously impossible to determine what a "conforming
              implementation" of this should do.

            In the easiest case, if it's `ASCIIHexDecode` data then we
            can just look for the first instance of `b"EI"`, ignoring all
            whitespace, since `b"EI"` is thankfully not a valid hex
            sequence.

            Otherwise, the stream data can, and inevitably will,
            contain the literal bytes `b"EI"`, so no, we can't just
            search for that.  In the case of `ASCII85Decode`, however,
            you can look for the `b"~>"` end-of-data sequence but note
            that sometimes it... contains whitespace!

            So, we try for `b"\\sEI\\b"`, which is not foolproof since
            you could have the pixel values `(32, 69, 73)` in your
            image followed by some other byte... so in that case,
            expect a bunch of nonsense in the logs and possible data
            loss.  Also in the rare case where `b"EI"` was preceded by
            `b"\\r\\n"`, there will be an extra `\\r` in the image
            data.  Too bad.

            And finally if that doesn't work then we will try to salvage
            something by just looking for "EI", somewhere, anywhere.  We
            take the most distant one, and if this causes you to lose
            data, well, it's definitely Adobe's fault.

            There **is** an absolutely foolproof way to parse inline
            images, but it's ridiculous so we won't do it:

            1. Find the very first instance of `b"EI"`.
            2. Extract the image itself (which could be in various formats).
            3. If it's a valid image, congratulations!  Otherwise try again.

            The moral of the story is that the author of this part of
            the PDF specification should have considered a career in
            literally anything else.

        """
        assert (
            isinstance(token, PSKeyword) and token is KEYWORD_ID
        ), f"Not ID: {token!r}"
        idpos = pos
        # Everything between BI and ID is the image dictionary
        (pos, objs) = self.pop_to(KEYWORD_BI)
        if len(objs) % 2 != 0:
            error_msg = f"Invalid dictionary construct: {objs!r}"
            if self.strict:
                raise TypeError(error_msg)
            else:
                log.warning(error_msg)
        dic = {literal_name(k): v for (k, v) in choplist(2, objs) if v is not None}

        target_re = EIR
        whitespace_re = SPC
        # Final filter is actually the *first* in the list
        final_filter = dic.get("F", dic.get("Filter"))
        if isinstance(final_filter, list) and final_filter:
            final_filter = final_filter[0]
        if final_filter in LITERALS_ASCII85_DECODE:
            # ASCII85: look for ~>EI, ignoring all whitespace
            whitespace_re = WSR
            target_re = A85R
        elif final_filter in LITERALS_ASCIIHEX_DECODE:
            # ASCIIHex: just look for EI
            whitespace_re = WSR
            target_re = EIEIR

        # Find the start of the image data by skipping the appropriate
        # amount of whitespace.  In the case of ASCII filters, we need
        # to skip any extra whitespace before we use a possible Length
        # value (this is very dumb but the standard says...)
        pos = idpos + len(token.name)
        data = self._lexer.data
        m = whitespace_re.match(data, pos)
        if m is None:  # Note that WSR will also match nothing
            errmsg = f"ID token at {pos} not followed by whitespace"
            if self.strict:
                raise PDFSyntaxError(errmsg)
            else:
                log.warning(errmsg)
        else:
            pos = m.end(0)

        # If you have Length, you have everything
        length = dic.get("L", dic.get("Length"))
        if length is not None:
            end = pos + int_value(length)
            self.seek(end)
            (_, token) = self.nexttoken()
            if token is not KEYWORD_EI:
                errmsg = f"EI not found after Length {length!r}"
                if self.strict:
                    raise PDFSyntaxError(errmsg)
                else:
                    log.warning(errmsg)
            return InlineImage(dic, data[pos:end])

        m = target_re.search(data, pos)
        if m is not None:
            self.seek(m.end(0))
            return InlineImage(dic, data[pos : m.start(0)])
        errmsg = f"Inline image at {pos} not terminated with {target_re}"
        if self.strict:
            raise PDFSyntaxError(errmsg)
        else:
            log.warning(errmsg)

        # Last resort (non-strict mode only)
        m = FURTHESTEIR.match(data, pos)
        if m is not None:
            log.warning(
                "Inline image at %d has no whitespace before EI, "
                "expect horrible data loss!!!",
                pos,
            )
            self.seek(m.end(0))
            # The image data excludes the last two bytes of the match
            return InlineImage(dic, data[pos : m.end(0) - 2])
        return None

    # Delegation follows
    def seek(self, pos: int) -> None:
        """Seek to a position."""
        self._lexer.seek(pos)

    def tell(self) -> int:
        """Get the current position in the file."""
        return self._lexer.tell()

    def read(self, objlen: int) -> bytes:
        """Read `objlen` bytes from the current position, moving the
        current position to the end of this data."""
        return self._lexer.read(objlen)

    def nextline(self) -> Tuple[int, bytes]:
        """Read (and do not parse) next line from underlying data."""
        return self._lexer.nextline()

    def nexttoken(self) -> Tuple[int, Token]:
        """Get the next token in iteration, raising StopIteration when
        done."""
        return next(self._lexer)

doc property

Get associated document if it exists.

__iter__()

Iterate over (position, object) tuples.

Source code in playa/parser.py
326
327
328
def __iter__(self) -> Iterator[StackEntry]:
    """Return this object itself, which iterates over (position, object) tuples."""
    return self

__next__()

Get next PDF object from stream (raises StopIteration at EOF).

Source code in playa/parser.py
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
def __next__(self) -> StackEntry:
    """Get next PDF object from stream (raises StopIteration at EOF).

    Tokens are read one at a time with `nexttoken()` and pushed on
    `self.stack`.  Arrays, dictionaries, procedures and inline images
    are assembled from the entries pushed since their opening token;
    anything else is returned as-is once no composite object is open.

    Returns:
        A `(position, object)` tuple.

    Raises:
        StopIteration: Propagated from `nexttoken()` when no tokens
            remain.
        PDFSyntaxError: If a `BI` token is found inside another
            object, or (in strict mode) if an array, dictionary or
            procedure cannot be constructed.
        TypeError: Re-raised in strict mode if it occurs while
            constructing an array, dictionary or procedure.
    """
    # Position of the outermost composite object (array, dict, proc
    # or inline image) currently being built, or None when nothing is
    # open.
    top: Union[int, None] = None
    obj: Union[Dict[Any, Any], List[PDFObject], PDFObject] = None
    while True:
        # Outside of any composite object, whatever is on the stack is
        # a complete object: hand it back before reading more tokens.
        if self.stack and top is None:
            return self.stack.pop()
        (pos, token) = self.nexttoken()
        if token is KEYWORD_ARRAY_BEGIN:
            if top is None:
                top = pos
            self.stack.append((pos, token))
        elif token is KEYWORD_ARRAY_END:
            try:
                pos, obj = self.pop_to(KEYWORD_ARRAY_BEGIN)
            except (TypeError, PDFSyntaxError) as e:
                if self.strict:
                    raise e
                # NOTE(review): in non-strict mode `pos` is still the
                # position of the end token and `obj` keeps whatever
                # value it had before; that pair is used below.
                log.warning("When constructing array from %r: %s", obj, e)
            # Closing the outermost object: return it.  Otherwise it
            # is an element of an enclosing object, so push it back.
            if pos == top:
                top = None
                return pos, obj
            self.stack.append((pos, obj))
        elif token is KEYWORD_DICT_BEGIN:
            if top is None:
                top = pos
            self.stack.append((pos, token))
        elif token is KEYWORD_DICT_END:
            try:
                (pos, objs) = self.pop_to(KEYWORD_DICT_BEGIN)
                # Keys and values alternate, so the count must be even
                if len(objs) % 2 != 0:
                    error_msg = (
                        "Dictionary contains odd number of objects: %r" % objs
                    )
                    raise PDFSyntaxError(error_msg)
                # Entries whose value is None are dropped
                obj = {
                    literal_name(k): v
                    for (k, v) in choplist(2, objs)
                    if v is not None
                }
            except (TypeError, PDFSyntaxError) as e:
                if self.strict:
                    raise e
                # NOTE(review): as for arrays, `obj` (and possibly
                # `pos`) are left at their previous values here.
                log.warning("When constructing dict from %r: %s", self.stack, e)
            if pos == top:
                top = None
                return pos, obj
            self.stack.append((pos, obj))
        elif token is KEYWORD_PROC_BEGIN:
            if top is None:
                top = pos
            self.stack.append((pos, token))
        elif token is KEYWORD_PROC_END:
            try:
                pos, obj = self.pop_to(KEYWORD_PROC_BEGIN)
            except (TypeError, PDFSyntaxError) as e:
                if self.strict:
                    raise e
                log.warning("When constructing proc from %r: %s", obj, e)
            if pos == top:
                top = None
                return pos, obj
            self.stack.append((pos, obj))
        elif token is KEYWORD_NULL:
            # The null keyword is represented as None
            self.stack.append((pos, None))
        elif token is KEYWORD_R:
            # reference to indirect object (only allowed inside another object)
            if top is None:
                # The "R" token itself is pushed and will be returned
                # like any other keyword.
                log.warning("Ignoring indirect object reference at top level")
                self.stack.append((pos, token))
            else:
                obj = self.get_object_reference(pos, token)
                # Nothing is pushed when the reference is rejected
                if obj is not None:
                    self.stack.append((pos, obj))
        elif token is KEYWORD_BI:
            # Inline images must occur at the top level, otherwise
            # something is wrong (probably a corrupt file)
            if top is not None:
                # Raised regardless of strict mode
                raise PDFSyntaxError(
                    "Inline image not at top level of stream "
                    f"({pos} != {top}, {self.stack})"
                )
            top = pos
            self.stack.append((pos, token))
        elif token is KEYWORD_ID:
            obj = self.get_inline_image(pos, token)
            # If no image could be extracted, keep reading tokens
            # (`top` is left unchanged in that case).
            if obj is not None:
                top = None
                return pos, obj
        else:
            # Literally anything else, including any other keyword
            # (will be returned above if top is None, or later if
            # we are inside some object)
            self.stack.append((pos, token))

get_inline_image(pos, token)

Get an inline image upon finding an "ID" token.

Returns the inline image (an InlineImage built from the image dictionary and the raw image data), or None if no terminating "EI" could be found. Advances the file pointer to a position after the "EI" token that (we hope) ends the image.

WELCOME IN THE HELL!!!

If you're lucky enough to have PDF 2.0 documents, then you can skip this, you aren't actually in (the) hell. Otherwise read on to know why you might be missing images or reading a lot of garbage in your logs:

  • The PDF 1.7 standard only specifies that image data must be delimited by ID and EI, and that "The bytes between the ID and EI operators shall be treated the same as a stream object’s data, even though they do not follow the standard stream syntax." What does that even mean?
  • And, that there must be "a single whitespace character" following ID (floating in perfume, served in a man's hat), except in the case of ASCIIHexDecode or ASCII85Decode (in which case there can just be any whitespace you like, or none at all).
  • It's obviously impossible to determine what a "conforming implementation" of this should do.

In the easiest case, if it's ASCIIHexDecode data then we can just look for the first instance of b"EI", ignoring all whitespace, since b"EI" is thankfully not a valid hex sequence.

Otherwise, the stream data can, and inevitably will, contain the literal bytes b"EI", so no, we can't just search for that. In the case of ASCII85Decode, however, you can look for the b"~>" end-of-data sequence but note that sometimes it... contains whitespace!

So, we try for b"\sEI\b", which is not foolproof since you could have the pixel values (32, 69, 73) in your image followed by some other byte... so in that case, expect a bunch of nonsense in the logs and possible data loss. Also in the rare case where b"EI" was preceded by b"\r\n", there will be an extra \r in the image data. Too bad.

And finally if that doesn't work then we will try to salvage something by just looking for "EI", somewhere, anywhere. We take the most distant one, and if this causes you to lose data, well, it's definitely Adobe's fault.

There is an absolutely foolproof way to parse inline images, but it's ridiculous so we won't do it:

  1. Find the very first instance of b"EI".
  2. Extract the image itself (which could be in various formats).
  3. If it's a valid image, congratulations! Otherwise try again.

The moral of the story is that the author of this part of the PDF specification should have considered a career in literally anything else.

Source code in playa/parser.py
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
def get_inline_image(self, pos: int, token: Token) -> Union[InlineImage, None]:
    """Get an inline image upon finding an "ID" token.

    Pops the image dictionary entries pushed since the "BI" token,
    locates the image data following "ID", and advances the file
    pointer to a position after the "EI" token that (we hope) ends
    the image.

    Args:
        pos: Position of the "ID" token.
        token: The "ID" token itself.

    Returns:
        The inline image (dictionary plus raw data), or `None` if no
        "EI" could be found at all (non-strict mode only).

    Raises:
        TypeError: In strict mode, if the image dictionary has an odd
            number of entries.
        PDFSyntaxError: If there is no "BI" on the stack, or, in
            strict mode, if "ID" is not followed by whitespace, if
            "EI" does not follow the declared length, or if the image
            is not terminated as expected.

    Note: WELCOME IN THE HELL!!!
        If you're lucky enough to have PDF 2.0 documents, then you
        can skip this, you aren't actually in (the) hell.  Otherwise
        read on to know why you might be missing images or reading
        a lot of garbage in your logs:

        - The PDF 1.7 standard only specifies that image data must be
          delimited by `ID` and `EI`, and that "The bytes between
          the `ID` and `EI` operators shall be treated the same as a
          stream object’s data, even though they do not follow the
          standard stream syntax."  What does that even mean?
        - And, that there must be "a single whitespace character"
          following `ID` (floating in perfume, served in a man's
          hat), except in the case of `ASCIIHexDecode` or
          `ASCII85Decode` (in which case there can just be any
          whitespace you like, or none at all).
        - It's obviously impossible to determine what a "conforming
          implementation" of this should do.

        In the easiest case, if it's `ASCIIHexDecode` data then we
        can just look for the first instance of `b"EI"`, ignoring all
        whitespace, since `b"EI"` is thankfully not a valid hex
        sequence.

        Otherwise, the stream data can, and inevitably will,
        contain the literal bytes `b"EI"`, so no, we can't just
        search for that.  In the case of `ASCII85Decode`, however,
        you can look for the `b"~>"` end-of-data sequence but note
        that sometimes it... contains whitespace!

        So, we try for `b"\\sEI\\b"`, which is not foolproof since
        you could have the pixel values `(32, 69, 73)` in your
        image followed by some other byte... so in that case,
        expect a bunch of nonsense in the logs and possible data
        loss.  Also in the rare case where `b"EI"` was preceded by
        `b"\\r\\n"`, there will be an extra `\\r` in the image
        data.  Too bad.

        And finally if that doesn't work then we will try to salvage
        something by just looking for "EI", somewhere, anywhere.  We
        take the most distant one, and if this causes you to lose
        data, well, it's definitely Adobe's fault.

        There **is** an absolutely foolproof way to parse inline
        images, but it's ridiculous so we won't do it:

        1. Find the very first instance of `b"EI"`.
        2. Extract the image itself (which could be in various formats).
        3. If it's a valid image, congratulations!  Otherwise try again.

        The moral of the story is that the author of this part of
        the PDF specification should have considered a career in
        literally anything else.

    """
    assert (
        isinstance(token, PSKeyword) and token is KEYWORD_ID
    ), f"Not ID: {token!r}"
    # Remember where "ID" was; `pos` is reused for the "BI" position
    # and then for the start of the image data.
    idpos = pos
    (pos, objs) = self.pop_to(KEYWORD_BI)
    # The entries between BI and ID are alternating keys and values
    if len(objs) % 2 != 0:
        error_msg = f"Invalid dictionary construct: {objs!r}"
        if self.strict:
            raise TypeError(error_msg)
        else:
            log.warning(error_msg)
    dic = {literal_name(k): v for (k, v) in choplist(2, objs) if v is not None}

    # Defaults for binary data; overridden below for ASCII filters
    target_re = EIR
    whitespace_re = SPC
    # Final filter is actually the *first* in the list
    final_filter = dic.get("F", dic.get("Filter"))
    if isinstance(final_filter, list) and final_filter:
        final_filter = final_filter[0]
    if final_filter in LITERALS_ASCII85_DECODE:
        # ASCII85: look for ~>EI, ignoring all whitespace
        whitespace_re = WSR
        target_re = A85R
    elif final_filter in LITERALS_ASCIIHEX_DECODE:
        # ASCIIHex: just look for EI
        whitespace_re = WSR
        target_re = EIEIR

    # Find the start of the image data by skipping the appropriate
    # amount of whitespace.  In the case of ASCII filters, we need
    # to skip any extra whitespace before we use a possible Length
    # value (this is very dumb but the standard says...)
    pos = idpos + len(token.name)
    data = self._lexer.data
    m = whitespace_re.match(data, pos)
    if m is None:  # Note that WSR will also match nothing
        errmsg = f"ID token at {pos} not followed by whitespace"
        if self.strict:
            raise PDFSyntaxError(errmsg)
        else:
            log.warning(errmsg)
    else:
        pos = m.end(0)

    # If you have Length, you have everything
    length = dic.get("L", dic.get("Length"))
    if length is not None:
        end = pos + int_value(length)
        self.seek(end)
        # The token right after the declared data should be EI; the
        # image is returned either way in non-strict mode.
        (_, token) = self.nexttoken()
        if token is not KEYWORD_EI:
            errmsg = f"EI not found after Length {length!r}"
            if self.strict:
                raise PDFSyntaxError(errmsg)
            else:
                log.warning(errmsg)
        return InlineImage(dic, data[pos:end])

    # No Length: search for the terminator chosen above; the image
    # data is everything up to the start of the match.
    m = target_re.search(data, pos)
    if m is not None:
        self.seek(m.end(0))
        return InlineImage(dic, data[pos : m.start(0)])
    errmsg = f"Inline image at {pos} not terminated with {target_re}"
    if self.strict:
        raise PDFSyntaxError(errmsg)
    else:
        log.warning(errmsg)

    # Last resort (non-strict only): accept a bare "EI" and strip
    # those two bytes from the end of the data.
    m = FURTHESTEIR.match(data, pos)
    if m is not None:
        log.warning(
            "Inline image at %d has no whitespace before EI, "
            "expect horrible data loss!!!",
            pos,
        )
        self.seek(m.end(0))
        return InlineImage(dic, data[pos : m.end(0) - 2])
    return None

get_object_reference(pos, token)

Get an indirect object reference upon finding an "R" token.

Source code in playa/parser.py
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
def get_object_reference(self, pos: int, token: Token) -> Union[ObjRef, None]:
    """Get an indirect object reference upon finding an "R" token.

    Pops the generation number and then the object ID from the
    stack (the generation number is discarded) and builds a
    reference to that object.

    Args:
        pos: Position of the "R" token (used in diagnostics only).
        token: The "R" token itself (unused).

    Returns:
        An `ObjRef` for the object ID, or `None` (non-strict mode) if
        the operands are missing or the object ID is 0.

    Raises:
        PDFSyntaxError: In strict mode, if the stack does not hold a
            generation number and object ID, or if the object ID is 0.
    """
    try:
        _pos, _genno = self.stack.pop()
        _pos, objid = self.stack.pop()
    except (IndexError, ValueError) as e:
        # list.pop() on an exhausted stack raises IndexError, while a
        # malformed stack entry fails to unpack with ValueError; both
        # mean the operands of "R" are missing.
        if self.strict:
            raise PDFSyntaxError(
                "Expected generation and object id in indirect object reference"
            ) from e
        else:
            log.warning(
                "Expected generation and object id in indirect object reference: %s",
                e,
            )
        return None
    objid = int_value(objid)
    # Object number 0 is never a valid target for a reference
    if objid == 0:
        if self.strict:
            raise PDFSyntaxError(
                "Object ID in reference at pos %d cannot be 0" % (pos,)
            )
        log.warning("Ignoring indirect object reference to 0 at %s", pos)
        return None
    return ObjRef(self.docref, objid)

newstream(data)

Continue parsing from a new data stream.

Source code in playa/parser.py
318
319
320
def newstream(self, data: Union[bytes, mmap.mmap]) -> None:
    """Switch parsing over to `data` by installing a fresh lexer on it.

    Only the lexer is replaced; the object stack is left as it is.
    """
    lexer = Lexer(data)
    self._lexer = lexer

nextline()

Read (and do not parse) next line from underlying data.

Source code in playa/parser.py
617
618
619
def nextline(self) -> Tuple[int, bytes]:
    """Fetch the next raw (unparsed) line from the underlying lexer."""
    lexer = self._lexer
    return lexer.nextline()

nexttoken()

Get the next token in iteration, raising StopIteration when done.

Source code in playa/parser.py
621
622
623
624
def nexttoken(self) -> Tuple[int, Token]:
    """Advance the underlying lexer by one token and return it.

    StopIteration from the lexer is propagated once it is exhausted.
    """
    lexer = self._lexer
    return next(lexer)

pop_to(token)

Pop everything from the stack back to token.

Source code in playa/parser.py
425
426
427
428
429
430
431
432
433
434
def pop_to(self, token: PSKeyword) -> Tuple[int, List[PDFObject]]:
    """Remove and return everything stacked above the nearest `token`.

    The stack is scanned from the top for an entry that *is* `token`.
    That entry and everything above it are removed; the result is the
    position of the `token` entry together with the objects that
    were above it, in the order they were pushed.

    If `token` is nowhere on the stack, the stack ends up empty and
    PDFSyntaxError is raised.
    """
    stack = self.stack
    for depth in range(len(stack) - 1, -1, -1):
        start, entry = stack[depth]
        if entry is token:
            popped: List[PDFObject] = [item for _, item in stack[depth + 1 :]]
            del stack[depth:]
            return start, popped
    # No opening token found: everything has been consumed
    del stack[:]
    raise PDFSyntaxError(f"Unmatched end token {token!r}")

read(objlen)

Read data from a specified position, moving the current position to the end of this data.

Source code in playa/parser.py
612
613
614
615
def read(self, objlen: int) -> bytes:
    """Read `objlen` bytes through the underlying lexer.

    The lexer's position is moved to the end of the data that was
    read.
    """
    lexer = self._lexer
    return lexer.read(objlen)

reset()

Clear internal parser state.

Source code in playa/parser.py
322
323
324
def reset(self) -> None:
    """Drop every pending entry from the parser's object stack.

    The stack list is emptied in place rather than replaced.
    """
    self.stack.clear()

seek(pos)

Seek to a position.

Source code in playa/parser.py
604
605
606
def seek(self, pos: int) -> None:
    """Move the underlying lexer to offset `pos`."""
    lexer = self._lexer
    lexer.seek(pos)

tell()

Get the current position in the file.

Source code in playa/parser.py
608
609
610
def tell(self) -> int:
    """Report the offset at which the underlying lexer currently sits."""
    lexer = self._lexer
    return lexer.tell()

ObjectStreamParser

Parse indirect objects from an object stream.

Source code in playa/parser.py
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
class ObjectStreamParser:
    """
    Parse indirect objects from an object stream.

    On construction, the table of (object ID, offset) pairs at the
    start of the stream is read; iterating then yields the objects
    themselves, starting at the stream's `First` offset.

    Attributes:
        buffer: The stream's data buffer.
        nobj: Number of objects declared by the stream (`N`).
        first: Offset of the first object in the buffer (`First`).
        offsets: `(object ID, offset)` pairs read from the table;
            offsets are relative to `first`.
    """

    def __init__(
        self,
        stream: ContentStream,
        doc: Union["Document", None] = None,
    ) -> None:
        self._parser = ObjectParser(stream.buffer, doc)
        self.buffer = stream.buffer
        self.nobj = int_value(stream["N"])
        self.first = int_value(stream["First"])
        self.offsets: List[Tuple[int, int]] = []
        # Check the count *before* reading each pair, so that a stream
        # declaring no objects (N <= 0) does not consume tokens or
        # trigger a spurious EOF warning.
        while len(self.offsets) < self.nobj:
            try:
                _, objid = next(self._parser)
                _, pos = next(self._parser)
                objid = int_value(objid)
                pos = int_value(pos)
            except StopIteration:
                # Fewer pairs than declared: keep what we have
                log.warning("Unexpected EOF in object stream")
                break
            self.offsets.append((objid, pos))

    def __iter__(self) -> Iterator[Tuple[int, IndirectObject]]:
        """Iterate over `(position, indirect object)` tuples.

        Objects are paired, in order, with the entries of the offset
        table; iteration stops when either runs out.  The position
        yielded is where the parser actually found the object, and a
        warning is logged if it disagrees with the offset table.  The
        generation number of every object is 0.
        """
        self._parser.seek(self.first)
        for (objid, opos), (pos, obj) in zip(self.offsets, self._parser):
            if pos != self.first + opos:
                log.warning(
                    "Invalid object stream: object %d is at %d, should be at %d",
                    objid,
                    pos,
                    self.first + opos,
                )
            yield pos, IndirectObject(objid=objid, genno=0, obj=obj)

reverse_iter_lines(buffer)

Iterate backwards over lines, starting from the end of the buffer.

This is used to locate the trailers at the end of a file.

Source code in playa/parser.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
def reverse_iter_lines(buffer: Union[bytes, mmap.mmap]) -> Iterator[Tuple[int, bytes]]:
    """Iterate over the lines of `buffer` from last to first.

    This is used to locate the trailers at the end of a file.

    Lines end with `\\n`, `\\r` or `\\r\\n`, and each yielded line keeps
    its terminator.  If the buffer ends with a line terminator, the
    first item yielded is an empty line positioned at the end of the
    buffer.

    Yields:
        `(start, line)` tuples, where `start` is the offset of the
        line within `buffer`.
    """
    line_end = len(buffer)  # exclusive end of the line about to be yielded
    limit = line_end  # exclusive upper bound for the terminator search
    while True:
        eol = max(buffer.rfind(b"\n", 0, limit), buffer.rfind(b"\r", 0, limit))
        line_start = eol + 1
        yield line_start, buffer[line_start:line_end]
        if eol < 0:
            # No terminator left: that was the first line of the buffer
            return
        line_end = line_start
        limit = eol
        # A CRLF pair is a single terminator, so step over the CR too
        if eol > 0 and buffer[eol - 1 : line_start] == b"\r\n":
            limit = eol - 1

playa.worker

Worker subprocess related functions and data.

in_worker()

Are we currently in a worker process?

Source code in playa/worker.py
25
26
27
def in_worker() -> bool:
    """Tell whether this code is running inside a worker process.

    This is the case exactly when the module-level `__pdf` is not
    `None`.
    """
    if __pdf is None:
        return False
    return True