Skip to content

Reference

playa

PLAYA ain't a LAYout Analyzer... but it can help you get stuff out of PDFs.

Basic usage:

# Open the document; leaving the `with` block closes it again.
with playa.open(path) as pdf:
    # Walk the pages of the document one by one.
    for page in pdf.pages:
        print(f"page {page.label}:")
        # Iterating over a page yields the objects found on it.
        for obj in page:
            print(f"    {obj.object_type} at {obj.bbox}")
            # Only text objects are asked for their characters here.
            if obj.object_type == "text":
                print(f"        chars: {obj.chars}")

open(path, *, password='', space='screen', max_workers=1, mp_context=None)

Open a PDF document from a path on the filesystem.

Parameters:

Name Type Description Default
path Union[PathLike, str]

Path to the document to open.

required
space DeviceSpace

Device space to use ("screen" for screen-like coordinates, "page" for pdfminer.six-like coordinates, "default" for default user space with no rotation or translation)

'screen'
max_workers Union[int, None]

Number of worker processes to use for parallel processing of pages (if 1, no workers are spawned)

1
mp_context Union[BaseContext, None]

Multiprocessing context to use for worker processes, see Contexts and Start Methods for more information.

None
Source code in playa/__init__.py
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def open(
    path: Union[PathLike, str],
    *,
    password: str = "",
    space: DeviceSpace = "screen",
    max_workers: Union[int, None] = 1,
    mp_context: Union[BaseContext, None] = None,
) -> Document:
    """Open a PDF document from a path on the filesystem.

    Args:
        path: Path to the document to open.
        password: Password for decryption, if needed.
        space: Device space to use ("screen" for screen-like
               coordinates, "page" for pdfminer.six-like coordinates, "default" for
               default user space with no rotation or translation)
        max_workers: Number of worker processes to use for parallel
                     processing of pages (if 1, no workers are spawned)
        mp_context: Multiprocessing context to use for worker
                    processes, see [Contexts and Start
                    Methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods)
                    for more information.

    Returns:
        The opened document.  It keeps the underlying file open until
        it is closed.

    Raises:
        OSError: If the file cannot be opened.
    """
    fp = builtins.open(path, "rb")
    try:
        pdf = Document(fp, password=password, space=space)
        pdf._fp = fp
        if max_workers is None or max_workers > 1:
            pdf._pool = ProcessPoolExecutor(
                max_workers=max_workers,
                mp_context=mp_context,
                initializer=_init_worker,  # type: ignore[arg-type]
                initargs=(id(pdf), path, password, space),  # type: ignore[arg-type]
            )
    except BaseException:
        # The caller never receives a document it could close, so the
        # file handle has to be released here before re-raising.
        fp.close()
        raise
    return pdf

parse(buffer, *, password='', space='screen', max_workers=1, mp_context=None)

Read a PDF document from binary data.

Potential slowness

When using multiple processes, the entire buffer is currently copied to each worker process, which may cause some overhead. It is preferable to use open on a filesystem path if possible, since that uses memory-mapped I/O.

Parameters:

Name Type Description Default
buffer bytes

Buffer containing PDF data.

required
space DeviceSpace

Device space to use ("screen" for screen-like coordinates, "page" for pdfminer.six-like coordinates, "default" for default user space with no rotation or translation)

'screen'
max_workers Union[int, None]

Number of worker processes to use for parallel processing of pages (if 1, no workers are spawned)

1
mp_context Union[BaseContext, None]

Multiprocessing context to use for worker processes, see Contexts and Start Methods for more information.

None
Source code in playa/__init__.py
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
def parse(
    buffer: bytes,
    *,
    password: str = "",
    space: DeviceSpace = "screen",
    max_workers: Union[int, None] = 1,
    mp_context: Union[BaseContext, None] = None,
) -> Document:
    """Build a PDF document from binary data held in memory.

    Note: Potential slowness
        With more than one process, the whole buffer is currently
        handed to the worker processes, which may add overhead.
        Prefer `open` on a filesystem path where possible, since
        that uses memory-mapped I/O.

    Args:
        buffer: Buffer containing PDF data.
        password: Password for decryption, if needed.
        space: Device space to use ("screen" for screen-like
               coordinates, "page" for pdfminer.six-like coordinates, "default" for
               default user space with no rotation or translation)
        max_workers: Number of worker processes to use for parallel
                     processing of pages (if 1, no workers are spawned)
        mp_context: Multiprocessing context to use for worker
                    processes, see [Contexts and Start
                    Methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods)
                    for more information.

    Returns:
        The parsed document.
    """
    document = Document(buffer, password=password, space=space)
    # A single worker means everything runs in this process: no pool.
    if max_workers is not None and not max_workers > 1:
        return document
    document._pool = ProcessPoolExecutor(
        max_workers=max_workers,
        mp_context=mp_context,
        initializer=_init_worker_buffer,  # type: ignore[arg-type]
        initargs=(id(document), buffer, password, space),  # type: ignore[arg-type]
    )
    return document

playa.document

Basic classes for PDF document parsing.

Destinations

Mapping of named destinations.

These either come as a NameTree or a dict, depending on the version of the PDF standard, so this abstracts that away.

Source code in playa/document.py
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
class Destinations:
    """Mapping of named destinations.

    These either come as a NameTree or a dict, depending on the
    version of the PDF standard, so this abstracts that away.

    Destinations are parsed on first access and cached in `dests`.
    """

    # Raw destination dictionary from the catalog's "Dests" entry, if any
    dests_dict: Union[Dict[str, PDFObject], None] = None
    # Name tree from the "Dests" entry of the catalog's "Names", if any
    dests_tree: Union[NameTree, None] = None

    def __init__(self, doc: Document) -> None:
        """Locate the destinations of a document without parsing them.

        Args:
            doc: Document whose catalog is searched.  A malformed
                entry is logged and ignored rather than raised.
        """
        self._docref = _ref_document(doc)
        # Cache of destinations that have already been parsed
        self.dests: Dict[str, Destination] = {}
        if "Dests" in doc.catalog:
            # PDF-1.1: dictionary
            self.dests_dict = resolve1(doc.catalog["Dests"])
            if not isinstance(self.dests_dict, dict):
                log.warning(
                    "Dests entry in catalog is not dictionary: %r", self.dests_dict
                )
                self.dests_dict = None
        elif "Names" in doc.catalog:
            names = resolve1(doc.catalog["Names"])
            if not isinstance(names, dict):
                log.warning("Names entry in catalog is not dictionary: %r", names)
                return
            if "Dests" in names:
                dests = resolve1(names["Dests"])
                # Validate the entry itself (not `names`, which was
                # already checked above)
                if not isinstance(dests, dict):
                    log.warning("Dests entry in names is not dictionary: %r", dests)
                    return
                self.dests_tree = NameTree(dests)

    def __iter__(self) -> Iterator[str]:
        """Iterate over names of destinations.

        Danger: Beware of corrupted PDFs
            This simply iterates over the names listed in the PDF, and
            does not attempt to actually parse the destinations
            (because that's pretty slow).  If the PDF is broken, you
            may encounter exceptions when actually trying to access
            them by name.
        """
        if self.dests_dict is not None:
            yield from self.dests_dict
        elif self.dests_tree is not None:
            for kb, _ in self.dests_tree:
                yield decode_text(kb)

    def items(self) -> Iterator[Tuple[str, Destination]]:
        """Iterate over named destinations.

        Yields:
            Pairs of name and parsed destination.  Each destination is
            parsed at most once and then served from the cache.

        Raises:
            TypeError: If a destination in the PDF is malformed.
        """
        if self.dests_dict is not None:
            for name, dest in self.dests_dict.items():
                if name not in self.dests:
                    self.dests[name] = self._create_dest(resolve1(dest), name)
                yield name, self.dests[name]
        elif self.dests_tree is not None:
            for k, v in self.dests_tree:
                name = decode_text(k)
                if name not in self.dests:
                    self.dests[name] = self._create_dest(resolve1(v), name)
                yield name, self.dests[name]

    def __getitem__(self, name: Union[bytes, str, PSLiteral]) -> Destination:
        """Get a named destination.

        Args:
            name: The name of the destination.

        Raises:
            KeyError: If no such destination exists.
            TypeError: If the PDF is damaged and the destinations tree
                contains something unexpected or missing.
        """
        if isinstance(name, bytes):
            name = decode_text(name)
        elif isinstance(name, PSLiteral):
            name = literal_name(name)
        if name in self.dests:
            return self.dests[name]
        elif self.dests_dict is not None:
            # This will raise KeyError or TypeError if necessary, so
            # we don't have to do it explicitly
            dest = resolve1(self.dests_dict[name])
            self.dests[name] = self._create_dest(dest, name)
        elif self.dests_tree is not None:
            # This is not at all efficient, but we need to decode
            # the keys (and we cache the result...)
            for k, v in self.dests_tree:
                if decode_text(k) == name:
                    dest = resolve1(v)
                    self.dests[name] = self._create_dest(dest, name)
                    break
        # This will also raise KeyError if necessary
        return self.dests[name]

    def _create_dest(self, dest: PDFObject, name: str) -> Destination:
        """Build a `Destination` from its raw PDF representation.

        Args:
            dest: Either the destination list itself, or a dictionary
                holding that list under its "D" key.
            name: Name of the destination, used in error messages.

        Raises:
            TypeError: If `dest` has neither of those two shapes.
        """
        if isinstance(dest, list):
            return Destination.from_list(self.doc, dest)
        elif isinstance(dest, dict) and "D" in dest:
            destlist = resolve1(dest["D"])
            if not isinstance(destlist, list):
                raise TypeError(f"Invalid destination for {name}: {dest!r}")
            return Destination.from_list(self.doc, destlist)
        else:
            raise TypeError(f"Invalid destination for {name}: {dest!r}")

    @property
    def doc(self) -> "Document":
        """Get associated document if it exists."""
        return _deref_document(self._docref)

doc property

Get associated document if it exists.

__getitem__(name)

Get a named destination.

Parameters:

Name Type Description Default
name Union[bytes, str, PSLiteral]

The name of the destination.

required

Raises:

Type Description
KeyError

If no such destination exists.

TypeError

If the PDF is damaged and the destinations tree contains something unexpected or missing.

Source code in playa/document.py
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
def __getitem__(self, name: Union[bytes, str, PSLiteral]) -> Destination:
    """Look up a destination by name, parsing and caching it on demand.

    Args:
        name: The name of the destination, as text, as raw bytes or
            as a PostScript literal.

    Raises:
        KeyError: If no such destination exists.
        TypeError: If the PDF is damaged and the destinations tree
            contains something unexpected or missing.
    """
    # Normalize every accepted spelling of the name to `str`
    if isinstance(name, bytes):
        name = decode_text(name)
    elif isinstance(name, PSLiteral):
        name = literal_name(name)
    if name not in self.dests:
        if self.dests_dict is not None:
            # A missing or malformed entry raises KeyError or
            # TypeError right here
            raw = resolve1(self.dests_dict[name])
            self.dests[name] = self._create_dest(raw, name)
        elif self.dests_tree is not None:
            # Linear scan: the keys have to be decoded before they
            # can be compared (the result is cached afterwards)
            for key, value in self.dests_tree:
                if decode_text(key) != name:
                    continue
                raw = resolve1(value)
                self.dests[name] = self._create_dest(raw, name)
                break
    # Raises KeyError if nothing was found above
    return self.dests[name]

__iter__()

Iterate over names of destinations.

Beware of corrupted PDFs

This simply iterates over the names listed in the PDF, and does not attempt to actually parse the destinations (because that's pretty slow). If the PDF is broken, you may encounter exceptions when actually trying to access them by name.

Source code in playa/document.py
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
def __iter__(self) -> Iterator[str]:
    """Yield the name of every destination listed in the PDF.

    Danger: Beware of corrupted PDFs
        Only the names are read here; the destinations themselves
        are not parsed (because that's pretty slow).  In a broken
        PDF, looking one of these names up afterwards may still
        raise an exception.
    """
    if self.dests_dict is not None:
        yield from self.dests_dict
        return
    if self.dests_tree is None:
        return
    # Name tree keys are decoded to text before being handed out
    for raw_key, _value in self.dests_tree:
        yield decode_text(raw_key)

items()

Iterate over named destinations.

Source code in playa/document.py
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
def items(self) -> Iterator[Tuple[str, Destination]]:
    """Yield `(name, destination)` pairs for all named destinations.

    Destinations are parsed the first time they are reached and are
    served from the `dests` cache after that.
    """
    if self.dests_dict is not None:
        for name in self.dests_dict:
            if name not in self.dests:
                raw = resolve1(self.dests_dict[name])
                self.dests[name] = self._create_dest(raw, name)
            yield name, self.dests[name]
    elif self.dests_tree is not None:
        for raw_key, raw_value in self.dests_tree:
            name = decode_text(raw_key)
            if name not in self.dests:
                raw = resolve1(raw_value)
                self.dests[name] = self._create_dest(raw, name)
            yield name, self.dests[name]

Document

Representation of a PDF document.

Since PDF documents can be very large and complex, merely creating a Document does very little aside from verifying that the password is correct and getting a minimal amount of metadata. In general, PLAYA will try to open just about anything as a PDF, so you should not expect the constructor to fail here if you give it nonsense (something else may fail later on).

Some metadata, such as the structure tree and page tree, will be loaded lazily and cached. We do not handle modification of PDFs.

Parameters:

Name Type Description Default
fp Union[BinaryIO, bytes]

File-like object in binary mode, or a buffer with binary data. Files will be read using mmap if possible. They do not need to be seekable, as if mmap fails the entire file will simply be read into memory (so a pipe or socket ought to work).

required
password str

Password for decryption, if needed.

''
space DeviceSpace

The device space to use for interpreting content ("screen", "page" or "default")

'screen'

Raises:

Type Description
TypeError

if fp is a file opened in text mode (don't do that!)

PDFEncryptionError

if the PDF has an unsupported encryption scheme

PDFPasswordIncorrect

if the password is incorrect

Source code in playa/document.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
class Document:
    """Representation of a PDF document.

    Since PDF documents can be very large and complex, merely creating
    a `Document` does very little aside from verifying that the
    password is correct and getting a minimal amount of metadata.  In
    general, PLAYA will try to open just about anything as a PDF, so
    you should not expect the constructor to fail here if you give it
    nonsense (something else may fail later on).

    Some metadata, such as the structure tree and page tree, will be
    loaded lazily and cached.  We do not handle modification of PDFs.

    Args:
      fp: File-like object in binary mode, or a buffer with binary data.
          Files will be read using `mmap` if possible.  They do not need
          to be seekable, as if `mmap` fails the entire file will simply
          be read into memory (so a pipe or socket ought to work).
      password: Password for decryption, if needed.
      space: the device space to use for interpreting content ("screen"
          or "page")

    Raises:
      TypeError: if `fp` is a file opened in text mode (don't do that!)
      PDFEncryptionError: if the PDF has an unsupported encryption scheme
      PDFPasswordIncorrect: if the password is incorrect
    """

    _fp: Union[BinaryIO, None] = None
    _pages: Union["PageList", None] = None
    _pool: Union[Executor, None] = None
    _outline: Union["Outline", None] = None
    _destinations: Union["Destinations", None] = None

    def __enter__(self) -> "Document":
        """Enter a `with` block, handing back this document itself."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Leave a `with` block by closing the document.

        The exception details are not inspected and nothing is
        returned, so an exception raised inside the block propagates.
        """
        self.close()

    def close(self) -> None:
        # If we were opened from a file then close it
        if self._fp:
            self._fp.close()
            self._fp = None
        # Shutdown process pool
        if self._pool:
            self._pool.shutdown()
            self._pool = None

    def __init__(
        self,
        fp: Union[BinaryIO, bytes],
        password: str = "",
        space: DeviceSpace = "screen",
        _boss_id: int = 0,
    ) -> None:
        """Read the cross-reference data, trailer and catalog of a PDF.

        Args:
            fp: Binary file object or buffer holding the PDF data.
            password: Password handed to the security handler when a
                trailer contains an `Encrypt` entry.
            space: Device space, stored as `self.space`.
            _boss_id: Internal.  When non-zero, this document is
                registered under that identifier through
                `_set_document` and must be running in a worker.

        Raises:
            TypeError: If `fp` is a file opened in text mode.
        """
        if _boss_id:
            # Set this **right away** because it is needed to get
            # indirect object references right.
            _set_document(self, _boss_id)
            assert in_worker()
        self.xrefs: List[XRef] = []
        self.space = space
        self.info = []
        self.catalog: Dict[str, Any] = {}
        self.encryption: Optional[Tuple[Any, Any]] = None
        self.decipher: Optional[DecipherCallable] = None
        # Caches, all empty until objects and fonts are requested
        self._cached_objs: Dict[int, PDFObject] = {}
        self._parsed_objs: Dict[int, Tuple[List[PDFObject], int]] = {}
        self._cached_fonts: Dict[object, Font] = {}
        if isinstance(fp, io.TextIOBase):
            raise TypeError("fp is not a binary file")
        self.pdf_version, self.offset, self.buffer = _open_input(fp)
        # Everything is permitted unless a security handler says
        # otherwise (see _initialize_password)
        self.is_printable = self.is_modifiable = self.is_extractable = True
        # Getting the XRef table and trailer is done non-lazily
        # because they contain encryption information among other
        # things.  As noted above we don't try to look for the first
        # page cross-reference table (for linearized PDFs) after the
        # header, it will instead be loaded with all the rest.
        self.parser = IndirectObjectParser(self.buffer, self)
        self.parser.seek(self.offset)
        self._xrefpos: Set[int] = set()
        try:
            self._read_xrefs()
        except Exception as e:
            # Deliberately broad: any failure to read the xref tables
            # is answered with the fallback built from the parser
            log.debug(
                "Failed to parse xref table, falling back to object parser: %s",
                e,
            )
            newxref = XRefFallback(self.parser)
            self.xrefs.append(newxref)
        # Now find the trailer
        for xref in self.xrefs:
            trailer = xref.trailer
            if not trailer:
                continue
            # If there's an encryption info, remember it.
            if "Encrypt" in trailer:
                if "ID" in trailer:
                    id_value = list_value(trailer["ID"])
                else:
                    # Some documents may not have a /ID, use two empty
                    # byte strings instead. Solves
                    # https://github.com/pdfminer/pdfminer.six/issues/594
                    id_value = (b"", b"")
                self.encryption = (id_value, dict_value(trailer["Encrypt"]))
                self._initialize_password(password)
            if "Info" in trailer:
                try:
                    self.info.append(dict_value(trailer["Info"]))
                except TypeError:
                    log.warning("Info is a broken reference (incorrect xref table?)")
            if "Root" in trailer:
                # Every PDF file must have exactly one /Root dictionary.
                try:
                    self.catalog = dict_value(trailer["Root"])
                except TypeError:
                    log.warning("Root is a broken reference (incorrect xref table?)")
                    self.catalog = {}
                # The first trailer with a /Root ends the search
                break
        else:
            # Only reached when the loop above never hit `break`
            log.warning("No /Root object! - Is this really a PDF?")
        # NOTE(review): identity comparison, presumably relying on
        # literals being interned - confirm
        if self.catalog.get("Type") is not LITERAL_CATALOG:
            log.warning("Catalog not found!")
        if "Version" in self.catalog:
            log.debug(
                "Using PDF version %r from catalog instead of %r from header",
                self.catalog["Version"],
                self.pdf_version,
            )
            self.pdf_version = literal_name(self.catalog["Version"])
        self.is_tagged = False
        markinfo = resolve1(self.catalog.get("MarkInfo"))
        if isinstance(markinfo, dict):
            # Double negation coerces the entry to a plain bool
            self.is_tagged = not not markinfo.get("Marked")

    def _read_xrefs(self):
        try:
            xrefpos = self._find_xref()
        except Exception as e:
            raise PDFSyntaxError("No xref table found at end of file") from e
        try:
            self._read_xref_from(xrefpos, self.xrefs)
            return
        except (ValueError, IndexError, StopIteration, PDFSyntaxError) as e:
            xrefpos = self._detect_concatenation(xrefpos)
            if xrefpos == -1:
                raise PDFSyntaxError("Failed to read xref table at end of file") from e
        try:
            self._read_xref_from(xrefpos, self.xrefs)
        except (ValueError, IndexError, StopIteration, PDFSyntaxError) as e:
            raise PDFSyntaxError(
                "Failed to read xref table with adjusted offset"
            ) from e

    def _detect_concatenation(self, xrefpos: int) -> int:
        """Look for an xref table whose offset is relative to a later start.

        Detects the case where two (or more) PDFs have been
        concatenated, or where somebody tried an "incremental update"
        without updating the xref table.  The data following the
        penultimate `%%EOF` marker is treated as the start of the
        file, and `xrefpos` is interpreted relative to it.

        Args:
            xrefpos: Offset of the xref table as given in the file.

        Returns:
            Position of the `xref` keyword found at the adjusted
            offset (in which case `self.offset` is moved to the new
            start of file), or -1 if there is none.

        Raises:
            ValueError: If the data ends at the adjusted offset.
        """
        filestart = self.buffer.rfind(b"%%EOF")
        log.debug("Found ultimate %%EOF at %d", filestart)
        if filestart != -1:
            filestart = self.buffer.rfind(b"%%EOF", 0, filestart)
            log.debug("Found penultimate %%EOF at %d", filestart)
        if filestart == -1:
            return -1
        # Skip the 5 bytes of the marker, then any LF (10) or CR (13)
        filestart += 5
        while self.buffer[filestart] in (10, 13):
            filestart += 1
        start = filestart + xrefpos
        parser = ObjectParser(self.buffer, self, start)
        try:
            (pos, token) = parser.nexttoken()
        except StopIteration as e:
            raise ValueError(f"Unexpected EOF at {start}") from e
        if token is KEYWORD_XREF:
            log.debug(
                "Found two PDFs in a trenchcoat at %d (second xref is at %d not %d)",
                filestart,
                pos,
                xrefpos,
            )
            self.offset = filestart
            return pos
        return -1

    def _initialize_password(self, password: str = "") -> None:
        """Set up decryption for this document with the given password.

        Internal function: the Encrypt dictionary must already have
        been read from the trailer into `self.encryption`.

        Raises:
            PDFEncryptionError: If the filter is not "Standard", or if
                no security handler exists for the algorithm.
        """
        assert self.encryption is not None
        docid, param = self.encryption
        filter_name = literal_name(param.get("Filter"))
        if filter_name != "Standard":
            raise PDFEncryptionError("Unknown filter: param=%r" % param)
        version = int_value(param.get("V", 0))
        # 3 (PDF 1.4) An unpublished algorithm that permits encryption
        # key lengths ranging from 40 to 128 bits. This value shall
        # not appear in a conforming PDF file.
        if version == 3:
            raise PDFEncryptionError("Unpublished algorithm 3 not supported")
        # 0 An algorithm that is undocumented. This value shall not be used.
        make_handler = SECURITY_HANDLERS.get(version)
        if make_handler is None:
            raise PDFEncryptionError("Unknown algorithm: param=%r" % param)
        handler = make_handler(docid, param, password)
        # Take the decryption routine and the permissions from the handler
        self.decipher = handler.decrypt
        self.is_printable = handler.is_printable
        self.is_modifiable = handler.is_modifiable
        self.is_extractable = handler.is_extractable
        assert self.parser is not None
        # Ensure that no extra data leaks into encrypted streams
        self.parser.strict = True
        self.parser.decipher = self.decipher

    def __iter__(self) -> Iterator[IndirectObject]:
        """Iterate over top-level `IndirectObject` (object streams are
        not expanded)"""
        # A fresh parser is used, starting at the document offset
        top_level = IndirectObjectParser(
            self.buffer, self, pos=self.offset, strict=self.parser.strict
        )
        return (indirect for _, indirect in top_level)

    @property
    def objects(self) -> Iterator[IndirectObject]:
        """Iterate over all indirect objects.

        Every top-level object is yielded; when it is an object
        stream, the objects inside it are yielded right after it.
        """
        top_level = IndirectObjectParser(
            self.buffer, self, pos=self.offset, strict=self.parser.strict
        )
        for _, indirect in top_level:
            yield indirect
            body = indirect.obj
            if not isinstance(body, ContentStream):
                continue
            if body.get("Type") is not LITERAL_OBJSTM:
                continue
            # An object stream: hand out what it contains as well
            for _, inner in ObjectStreamParser(body, self):
                yield inner

    @property
    def tokens(self) -> Iterator[Token]:
        """Iterate over the tokens produced by lexing the whole buffer."""
        lexer = Lexer(self.buffer)
        return (token for _, token in lexer)

    @property
    def structure(self) -> Union[Tree, None]:
        """Logical structure of this document, if any.

        In the case where no logical structure tree exists, this will
        be `None`.  Otherwise you may iterate over it, search it, etc.
        """
        if "StructTreeRoot" not in self.catalog:
            return None
        return Tree(self)

    @property
    def parent_tree(self) -> Union[NumberTree, None]:
        """Parent tree of this document.

        This is a somewhat obscure data structure that links marked
        content sections to their corresponding structure elements.
        If you don't know what that means, you probably don't need it,
        but if you do, here it is.
        """
        if "StructTreeRoot" not in self.catalog:
            return None
        st = dict_value(self.catalog["StructTreeRoot"])
        if "ParentTree" not in st:
            return None
        return NumberTree(st["ParentTree"])

    def _getobj_objstm(
        self, stream: ContentStream, index: int, objid: int
    ) -> PDFObject:
        if stream.objid in self._parsed_objs:
            (objs, n) = self._parsed_objs[stream.objid]
        else:
            (objs, n) = self._get_objects(stream)
            assert stream.objid is not None
            self._parsed_objs[stream.objid] = (objs, n)
        i = n * 2 + index
        try:
            obj = objs[i]
        except IndexError:
            raise PDFSyntaxError("index too big: %r" % index)
        return obj

    def _get_objects(self, stream: ContentStream) -> Tuple[List[PDFObject], int]:
        """Parse everything contained in an object stream.

        Problems with the stream dictionary are logged and tolerated
        rather than raised.

        Args:
            stream: The stream to parse, expected to be of type /ObjStm.

        Returns:
            The list of all objects parsed from the stream data, and
            the value of the stream's `N` entry (0 if it is missing
            or invalid).
        """
        if stream.get("Type") is not LITERAL_OBJSTM:
            log.warning("Content stream Type is not /ObjStm: %r", stream)
        try:
            n = int_value(stream["N"])
        except KeyError:
            log.warning("N is not defined in content stream: %r", stream)
            n = 0
        except TypeError:
            log.warning("N is invalid in content stream: %r", stream)
            n = 0
        parser = ObjectParser(stream.buffer, self)
        objs: List[PDFObject] = [obj for _, obj in parser]
        return (objs, n)

    def _getobj_parse(self, pos: int, objid: int) -> PDFObject:
        """Parse the indirect object `objid` expected at offset `pos`.

        If no matching object can be parsed at `pos`, the whole buffer
        is searched for an `<objid> <genno> obj` header and the match
        with the highest generation number is parsed instead.

        Args:
          pos: position in the buffer where the object should start.
          objid: ID of the indirect object to read.

        Returns:
          the object contained in the indirect object definition.

        Raises:
          PDFSyntaxError: if the object cannot be found anywhere in
            the document, or the object finally parsed has a
            different ID.
        """
        assert self.parser is not None
        self.parser.seek(pos)
        try:
            _, obj = next(self.parser)
            if obj.objid != objid:
                raise PDFSyntaxError(f"objid mismatch: {obj.objid!r}={objid!r}")
        except (ValueError, IndexError, PDFSyntaxError) as e:
            log.warning(
                "Indirect object %d not found at position %d: %r", objid, pos, e
            )
            # In case of malformed pdf files where the offset in the
            # xref table doesn't point exactly at the object
            # definition (probably more frequent than you think), just
            # use a regular expression to find the object because we
            # can do that.
            realpos = -1
            lastgen = -1
            # The negative lookbehind stops the object ID from matching
            # the tail of a longer number (e.g. "12 0 obj" when looking
            # for object 2), which would otherwise parse the wrong object.
            pattern = rb"(?<!\d)%d\s+(\d+)\s+obj" % objid
            for m in re.finditer(pattern, self.buffer):
                genno = int(m.group(1))
                # Keep the first occurrence of the highest generation.
                if genno > lastgen:
                    lastgen = genno
                    realpos = m.start(0)
            if realpos == -1:
                raise PDFSyntaxError(
                    f"Indirect object {objid!r} not found in document"
                ) from e
            self.parser.seek(realpos)
            (_, obj) = next(self.parser)
        if obj.objid != objid:
            raise PDFSyntaxError(f"objid mismatch: {obj.objid!r}={objid!r}")
        return obj.obj

    def __getitem__(self, objid: int) -> PDFObject:
        """Look up an indirect object by its object ID.

        Results are cached, so each object is only located and parsed
        once.  Every xref section is tried in order until one of them
        yields the object.

        Raising `IndexError` for a missing object is Pythonic but not
        what PDF 1.7 sec 7.3.10 prescribes:

        > An indirect reference to an undefined object shall not be
        considered an error by a conforming reader; it shall be
        treated as a reference to the null object.

        Raises:
          ValueError: if Document is not initialized
          IndexError: if objid does not exist in PDF

        """
        if not self.xrefs:
            raise ValueError("Document is not initialized")
        if objid in self._cached_objs:
            return self._cached_objs[objid]
        found = None
        for xref in self.xrefs:
            try:
                strmid, index, _genno = xref.get_pos(objid)
            except KeyError:
                # This xref section does not know the object.
                continue
            try:
                if strmid is None:
                    found = self._getobj_parse(index, objid)
                else:
                    # The object lives inside an object stream.
                    stream = stream_value(self[strmid])
                    found = self._getobj_objstm(stream, index, objid)
            # FIXME: We might not actually want to catch these...
            except StopIteration:
                log.debug("EOF when searching for object %d", objid)
                continue
            except PDFSyntaxError as e:
                log.debug("Syntax error when searching for object %d: %s", objid, e)
                continue
            break
        if found is None:
            raise IndexError(f"Object with ID {objid} not found")
        self._cached_objs[objid] = found
        return found

    def get_font(self, objid: object, spec: Mapping[str, object]) -> Font:
        """Create the font described by `spec`, caching it under `objid`.

        A cached font is returned directly when `objid` is truthy and
        already known.  The font class is chosen from the `Subtype`
        entry of `spec`; a missing or unrecognized subtype produces a
        dummy `Font`.  The result is only cached when `objid` is truthy.
        """
        if objid and objid in self._cached_fonts:
            return self._cached_fonts[objid]
        if spec.get("Type") is not LITERAL_FONT:
            log.warning("Font specification Type is not /Font: %r", spec)
        if "Subtype" not in spec:
            log.warning("Font specification Subtype is not specified: %r", spec)
            subtype = ""
        else:
            subtype = literal_name(spec["Subtype"])
        font: Font
        if subtype in ("Type1", "MMType1"):
            font = Type1Font(spec)
        elif subtype == "TrueType":
            font = TrueTypeFont(spec)
        elif subtype == "Type3":
            font = Type3Font(spec)
        elif subtype == "Type0":
            descendants = list_value(spec["DescendantFonts"])
            assert descendants
            if len(descendants) != 1:
                log.debug(
                    "Type 0 font should have 1 descendant, has more: %r", descendants
                )
            merged = dict_value(descendants[0]).copy()
            # Entries of the root font dictionary take precedence over
            # those of the descendant font.
            for key in ("Encoding", "ToUnicode"):
                if key in spec:
                    merged[key] = resolve1(spec[key])
            font = CIDFont(merged)
        else:
            log.warning("Invalid Font spec, creating dummy font: %r" % spec)
            # A placeholder font lets text objects still be handled
            # (even if incorrectly) instead of failing outright.
            font = Font({}, {})
        if objid:
            self._cached_fonts[objid] = font
        return font

    @property
    def outline(self) -> Union[Outline, None]:
        """The document outline, or None if there is none.

        None is also returned (after a warning) when constructing the
        outline raises `TypeError`.  A successfully built outline is
        cached for later accesses.
        """
        if "Outlines" not in self.catalog:
            return None
        if self._outline is not None:
            return self._outline
        try:
            self._outline = Outline(self)
        except TypeError:
            log.warning(
                "Invalid Outlines entry in catalog: %r", self.catalog["Outlines"]
            )
            return None
        return self._outline

    @property
    def page_labels(self) -> Iterator[str]:
        """Iterator over the page label strings of the document.

        One string is generated per page from the catalog's
        `PageLabels` entry.  Because the label tree does not say how
        many pages exist, the iterator never ends by itself; prefer
        `pages` for ordinary use.

        Raises:
          KeyError: No page labels are present in the catalog

        """
        assert self.catalog is not None  # really it cannot be None

        return PageLabels(self.catalog["PageLabels"]).labels

    # Type alias for the attribute dictionaries yielded (together with
    # their object IDs) by the page iteration helpers of this class.
    PageType = Dict[Any, Dict[Any, Any]]

    def _get_pages_from_xrefs(self) -> Iterator[Tuple[int, PageType]]:
        """Scan the cross-reference tables for page objects.

        Fallback for documents whose page tree is missing (which only
        happens in invalid PDFs, but it happens).  Objects that cannot
        be loaded are silently skipped.

        Returns:
          an iterator over (objid, dict) pairs.
        """
        for xref in self.xrefs:
            for object_id in xref.objids:
                try:
                    candidate = self[object_id]
                    if not isinstance(candidate, dict):
                        continue
                    if candidate.get("Type") is LITERAL_PAGE:
                        yield object_id, candidate
                except IndexError:
                    continue

    def _get_page_objects(self) -> Iterator[Tuple[int, PageType]]:
        """Iterate over the flattened page tree in reading order, propagating
        inheritable attributes.  Returns an iterator over (objid, dict) pairs.

        Entries that are neither indirect references nor bare integers
        are logged and skipped, as are nodes that were already visited.

        Raises:
          KeyError: if there is no page tree.
        """
        if "Pages" not in self.catalog:
            raise KeyError("No 'Pages' entry in catalog")
        stack = [(self.catalog["Pages"], self.catalog)]
        visited = set()
        while stack:
            (obj, parent) = stack.pop()
            if isinstance(obj, ObjRef):
                # The PDF specification *requires* both the Pages
                # element of the catalog and the entries in Kids in
                # the page tree to be indirect references.
                object_id = int(obj.objid)
            elif isinstance(obj, int):
                # Should not happen in a valid PDF, but probably does?
                log.warning("Page tree contains bare integer: %r in %r", obj, parent)
                object_id = obj
            else:
                # There is no object ID to look up here, so skip the
                # entry instead of falling through with `object_id`
                # unbound (or left over from the previous iteration).
                log.warning("Page tree contains unknown object: %r", obj)
                continue
            page_object = dict_value(self[object_id])

            # Avoid recursion errors by keeping track of visited nodes
            # (again, this should never actually happen in a valid PDF)
            if object_id in visited:
                log.warning("Circular reference %r in page tree", obj)
                continue
            visited.add(object_id)

            # Propagate inheritable attributes
            object_properties = page_object.copy()
            for k, v in parent.items():
                if k in INHERITABLE_PAGE_ATTRS and k not in object_properties:
                    object_properties[k] = v

            # Recurse, depth-first
            object_type = object_properties.get("Type")
            if object_type is None:
                log.warning("Page has no Type, trying type: %r", object_properties)
                object_type = object_properties.get("type")
            if object_type is LITERAL_PAGES and "Kids" in object_properties:
                # Push children in reverse so they are popped (and thus
                # visited) in their original order.
                for child in reversed(list_value(object_properties["Kids"])):
                    stack.append((child, object_properties))
            elif object_type is LITERAL_PAGE:
                yield object_id, object_properties

    @property
    def pages(self) -> "PageList":
        """The document's pages as an iterable/addressable `PageList`,
        created on first access and reused afterwards."""
        page_list = self._pages
        if page_list is None:
            page_list = self._pages = PageList(self)
        return page_list

    @property
    def names(self) -> Dict[str, Any]:
        """The name dictionary of the catalog (PDF 1.7 sec 7.7.4).

        Raises:
          KeyError: if nonexistent.
        """
        names_entry = self.catalog["Names"]
        return dict_value(names_entry)

    @property
    def destinations(self) -> "Destinations":
        """The named destinations as an iterable/addressable
        `Destinations` object, created on first access and reused
        afterwards."""
        dests = self._destinations
        if dests is None:
            dests = self._destinations = Destinations(self)
        return dests

    def _find_xref(self) -> int:
        """Locate the first XRef by reading the buffer backwards.

        Returns the position given after a `startxref` line (plus the
        document offset), or the position of a bare `xref` line if
        that is met first.

        Raises:
          ValueError: if the `startxref` position is out of range, or
            no xref is found before an `endobj` line or the start of
            the buffer.
        """
        # The most recent non-empty line seen, i.e. the line that
        # follows the current one in the file.
        prev = b""
        for pos, raw_line in reverse_iter_lines(self.buffer):
            line = raw_line.strip()
            if line == b"endobj":
                # Okay, we're probably not in Kansas anymore...
                break
            if line == b"xref":
                return pos
            if line == b"startxref":
                if not prev.isdigit():
                    log.warning("Invalid startxref position: %r", prev)
                    continue
                start = int(prev)
                if start < 0:
                    raise ValueError("Invalid negative startxref position: %d" % start)
                if start > pos:
                    raise ValueError(
                        "Invalid startxref position (> %d): %d" % (pos, start)
                    )
                return start + self.offset
            if line:
                prev = line
        raise ValueError("No xref table found at end of file")

    # read xref table
    def _read_xref_from(
        self,
        start: int,
        xrefs: List[XRef],
    ) -> None:
        """Reads XRefs from the given location.

        The cross-reference table or stream found at `start` is
        appended to `xrefs`, then any `XRefStm` and `Prev` entries of
        its trailer are followed recursively.  Positions that were
        already read are skipped with a warning.

        Args:
          start: position in the buffer to read from.
          xrefs: list to which the XRefs found are appended.

        Raises:
          ValueError: if there is nothing to read at `start`.
        """
        if start in self._xrefpos:
            log.warning("Detected circular xref chain at %d", start)
            return
        parser = ObjectParser(self.buffer, self, start)
        try:
            (pos, token) = parser.nexttoken()
        except StopIteration:
            # The f prefix is required for the position to be reported.
            raise ValueError(f"Unexpected EOF at {start}")
        if token is KEYWORD_XREF:
            parser.nextline()
            xref: XRef = XRefTable(parser, self.offset)
        else:
            # It might be an XRefStream, if this is an indirect object...
            _, token = parser.nexttoken()
            _, token = parser.nexttoken()
            if token is KEYWORD_OBJ:
                # XRefStream: PDF-1.5
                self.parser.seek(pos)
                self.parser.reset()
                xref = XRefStream(self.parser, self.offset)
            else:
                # Well, maybe it's an XRef table without "xref" (but
                # probably not)
                parser.seek(pos)
                xref = XRefTable(parser, self.offset)
        self._xrefpos.add(start)
        xrefs.append(xref)
        trailer = xref.trailer
        # For hybrid-reference files, an additional set of xrefs as a
        # stream.
        if "XRefStm" in trailer:
            pos = int_value(trailer["XRefStm"])
            self._read_xref_from(pos + self.offset, xrefs)
        # Recurse into any previous xref tables or streams
        if "Prev" in trailer:
            # find previous xref
            pos = int_value(trailer["Prev"])
            self._read_xref_from(pos + self.offset, xrefs)

destinations property

Named destinations as an iterable/addressable Destinations object.

names property

PDF name dictionary (PDF 1.7 sec 7.7.4).

Raises:

Type Description
KeyError

if nonexistent.

objects property

Iterate over all indirect objects (including, then expanding object streams)

outline property

Document outline, if any.

page_labels property

Generate page label strings for the PDF document.

If the document includes page labels, generates strings, one per page. If not, raises KeyError.

The resulting iterator is unbounded (because the page label tree does not actually include all the pages), so it is recommended to use pages instead.

Raises:

Type Description
KeyError

No page labels are present in the catalog

pages property

Pages of the document as an iterable/addressable PageList object.

parent_tree property

Parent tree of this document.

This is a somewhat obscure data structure that links marked content sections to their corresponding structure elements. If you don't know what that means, you probably don't need it, but if you do, here it is.

structure property

Logical structure of this document, if any.

In the case where no logical structure tree exists, this will be None. Otherwise you may iterate over it, search it, etc.

tokens property

Iterate over tokens.

__getitem__(objid)

Get an indirect object from the PDF.

Note that the behaviour in the case of a non-existent object (raising IndexError), while Pythonic, is not PDFic, as PDF 1.7 sec 7.3.10 states:

An indirect reference to an undefined object shall not be considered an error by a conforming reader; it shall be treated as a reference to the null object.

Raises:

Type Description
ValueError

if Document is not initialized

IndexError

if objid does not exist in PDF

Source code in playa/document.py
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
def __getitem__(self, objid: int) -> PDFObject:
    """Look up an indirect object by its object ID.

    Results are cached, so each object is only located and parsed
    once.  Every xref section is tried in order until one of them
    yields the object.

    Raising `IndexError` for a missing object is Pythonic but not
    what PDF 1.7 sec 7.3.10 prescribes:

    > An indirect reference to an undefined object shall not be
    considered an error by a conforming reader; it shall be
    treated as a reference to the null object.

    Raises:
      ValueError: if Document is not initialized
      IndexError: if objid does not exist in PDF

    """
    if not self.xrefs:
        raise ValueError("Document is not initialized")
    if objid in self._cached_objs:
        return self._cached_objs[objid]
    found = None
    for xref in self.xrefs:
        try:
            strmid, index, _genno = xref.get_pos(objid)
        except KeyError:
            # This xref section does not know the object.
            continue
        try:
            if strmid is None:
                found = self._getobj_parse(index, objid)
            else:
                # The object lives inside an object stream.
                stream = stream_value(self[strmid])
                found = self._getobj_objstm(stream, index, objid)
        # FIXME: We might not actually want to catch these...
        except StopIteration:
            log.debug("EOF when searching for object %d", objid)
            continue
        except PDFSyntaxError as e:
            log.debug("Syntax error when searching for object %d: %s", objid, e)
            continue
        break
    if found is None:
        raise IndexError(f"Object with ID {objid} not found")
    self._cached_objs[objid] = found
    return found

__iter__()

Iterate over top-level IndirectObject (does not expand object streams)

Source code in playa/document.py
343
344
345
346
347
348
349
350
def __iter__(self) -> Iterator[IndirectObject]:
    """Iterate over top-level `IndirectObject` (does not expand object streams)"""
    # Parsing starts at the document offset and uses the same
    # strictness as the document's own parser.
    parser = IndirectObjectParser(
        self.buffer, self, pos=self.offset, strict=self.parser.strict
    )
    return (indirect for _pos, indirect in parser)

_find_xref()

Internal function used to locate the first XRef.

Source code in playa/document.py
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
def _find_xref(self) -> int:
    """Locate the first XRef by reading the buffer backwards.

    Returns the position given after a `startxref` line (plus the
    document offset), or the position of a bare `xref` line if
    that is met first.

    Raises:
      ValueError: if the `startxref` position is out of range, or
        no xref is found before an `endobj` line or the start of
        the buffer.
    """
    # The most recent non-empty line seen, i.e. the line that
    # follows the current one in the file.
    prev = b""
    for pos, raw_line in reverse_iter_lines(self.buffer):
        line = raw_line.strip()
        if line == b"endobj":
            # Okay, we're probably not in Kansas anymore...
            break
        if line == b"xref":
            return pos
        if line == b"startxref":
            if not prev.isdigit():
                log.warning("Invalid startxref position: %r", prev)
                continue
            start = int(prev)
            if start < 0:
                raise ValueError("Invalid negative startxref position: %d" % start)
            if start > pos:
                raise ValueError(
                    "Invalid startxref position (> %d): %d" % (pos, start)
                )
            return start + self.offset
        if line:
            prev = line
    raise ValueError("No xref table found at end of file")

_get_page_objects()

Iterate over the flattened page tree in reading order, propagating inheritable attributes. Returns an iterator over (objid, dict) pairs.

Raises:

Type Description
KeyError

if there is no page tree.

Source code in playa/document.py
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
def _get_page_objects(self) -> Iterator[Tuple[int, PageType]]:
    """Iterate over the flattened page tree in reading order, propagating
    inheritable attributes.  Returns an iterator over (objid, dict) pairs.

    Entries that are neither indirect references nor bare integers
    are logged and skipped, as are nodes that were already visited.

    Raises:
      KeyError: if there is no page tree.
    """
    if "Pages" not in self.catalog:
        raise KeyError("No 'Pages' entry in catalog")
    stack = [(self.catalog["Pages"], self.catalog)]
    visited = set()
    while stack:
        (obj, parent) = stack.pop()
        if isinstance(obj, ObjRef):
            # The PDF specification *requires* both the Pages
            # element of the catalog and the entries in Kids in
            # the page tree to be indirect references.
            object_id = int(obj.objid)
        elif isinstance(obj, int):
            # Should not happen in a valid PDF, but probably does?
            log.warning("Page tree contains bare integer: %r in %r", obj, parent)
            object_id = obj
        else:
            # There is no object ID to look up here, so skip the
            # entry instead of falling through with `object_id`
            # unbound (or left over from the previous iteration).
            log.warning("Page tree contains unknown object: %r", obj)
            continue
        page_object = dict_value(self[object_id])

        # Avoid recursion errors by keeping track of visited nodes
        # (again, this should never actually happen in a valid PDF)
        if object_id in visited:
            log.warning("Circular reference %r in page tree", obj)
            continue
        visited.add(object_id)

        # Propagate inheritable attributes
        object_properties = page_object.copy()
        for k, v in parent.items():
            if k in INHERITABLE_PAGE_ATTRS and k not in object_properties:
                object_properties[k] = v

        # Recurse, depth-first
        object_type = object_properties.get("Type")
        if object_type is None:
            log.warning("Page has no Type, trying type: %r", object_properties)
            object_type = object_properties.get("type")
        if object_type is LITERAL_PAGES and "Kids" in object_properties:
            # Push children in reverse so they are popped (and thus
            # visited) in their original order.
            for child in reversed(list_value(object_properties["Kids"])):
                stack.append((child, object_properties))
        elif object_type is LITERAL_PAGE:
            yield object_id, object_properties

_get_pages_from_xrefs()

Find pages from the cross-reference tables if the page tree is missing (note that this only happens in invalid PDFs, but it happens.)

Returns:

Type Description
Iterator[Tuple[int, PageType]]

an iterator over (objid, dict) pairs.

Source code in playa/document.py
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
def _get_pages_from_xrefs(self) -> Iterator[Tuple[int, PageType]]:
    """Scan the cross-reference tables for page objects.

    Fallback for documents whose page tree is missing (which only
    happens in invalid PDFs, but it happens).  Objects that cannot
    be loaded are silently skipped.

    Returns:
      an iterator over (objid, dict) pairs.
    """
    for xref in self.xrefs:
        for object_id in xref.objids:
            try:
                candidate = self[object_id]
                if not isinstance(candidate, dict):
                    continue
                if candidate.get("Type") is LITERAL_PAGE:
                    yield object_id, candidate
            except IndexError:
                continue

_initialize_password(password='')

Initialize the decryption handler with a given password, if any.

Internal function, requires the Encrypt dictionary to have been read from the trailer into self.encryption.

Source code in playa/document.py
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
def _initialize_password(self, password: str = "") -> None:
    """Set up decryption for this document using `password`.

    Internal function: `self.encryption` must already hold the
    (document ID, Encrypt dictionary) pair read from the trailer.

    Raises:
      PDFEncryptionError: if the filter is not "Standard" or the
        algorithm version is unsupported or unknown.
    """
    assert self.encryption is not None
    docid, param = self.encryption
    if literal_name(param.get("Filter")) != "Standard":
        raise PDFEncryptionError("Unknown filter: param=%r" % param)
    version = int_value(param.get("V", 0))
    # 3 (PDF 1.4) An unpublished algorithm that permits encryption
    # key lengths ranging from 40 to 128 bits. This value shall
    # not appear in a conforming PDF file.
    if version == 3:
        raise PDFEncryptionError("Unpublished algorithm 3 not supported")
    # 0 An algorithm that is undocumented. This value shall not be used.
    make_handler = SECURITY_HANDLERS.get(version)
    if make_handler is None:
        raise PDFEncryptionError("Unknown algorithm: param=%r" % param)
    handler = make_handler(docid, param, password)
    # Expose the handler's decryption routine and permission flags
    # on the document itself.
    self.decipher = handler.decrypt
    self.is_printable = handler.is_printable
    self.is_modifiable = handler.is_modifiable
    self.is_extractable = handler.is_extractable
    assert self.parser is not None
    # Ensure that no extra data leaks into encrypted streams
    self.parser.strict = True
    self.parser.decipher = self.decipher

_read_xref_from(start, xrefs)

Reads XRefs from the given location.

Source code in playa/document.py
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
def _read_xref_from(
    self,
    start: int,
    xrefs: List[XRef],
) -> None:
    """Reads XRefs from the given location.

    The cross-reference table or stream found at `start` is
    appended to `xrefs`, then any `XRefStm` and `Prev` entries of
    its trailer are followed recursively.  Positions that were
    already read are skipped with a warning.

    Args:
      start: position in the buffer to read from.
      xrefs: list to which the XRefs found are appended.

    Raises:
      ValueError: if there is nothing to read at `start`.
    """
    if start in self._xrefpos:
        log.warning("Detected circular xref chain at %d", start)
        return
    parser = ObjectParser(self.buffer, self, start)
    try:
        (pos, token) = parser.nexttoken()
    except StopIteration:
        # The f prefix is required for the position to be reported.
        raise ValueError(f"Unexpected EOF at {start}")
    if token is KEYWORD_XREF:
        parser.nextline()
        xref: XRef = XRefTable(parser, self.offset)
    else:
        # It might be an XRefStream, if this is an indirect object...
        _, token = parser.nexttoken()
        _, token = parser.nexttoken()
        if token is KEYWORD_OBJ:
            # XRefStream: PDF-1.5
            self.parser.seek(pos)
            self.parser.reset()
            xref = XRefStream(self.parser, self.offset)
        else:
            # Well, maybe it's an XRef table without "xref" (but
            # probably not)
            parser.seek(pos)
            xref = XRefTable(parser, self.offset)
    self._xrefpos.add(start)
    xrefs.append(xref)
    trailer = xref.trailer
    # For hybrid-reference files, an additional set of xrefs as a
    # stream.
    if "XRefStm" in trailer:
        pos = int_value(trailer["XRefStm"])
        self._read_xref_from(pos + self.offset, xrefs)
    # Recurse into any previous xref tables or streams
    if "Prev" in trailer:
        # find previous xref
        pos = int_value(trailer["Prev"])
        self._read_xref_from(pos + self.offset, xrefs)

PageLabels

Bases: NumberTree

PageLabels from the document catalog.

See Section 12.4.2 in the PDF 1.7 Reference.

Source code in playa/document.py
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
class PageLabels(NumberTree):
    """Page labels read from the document catalog.

    See Section 12.4.2 in the PDF 1.7 Reference.
    """

    @property
    def labels(self) -> Iterator[str]:
        """Generate label strings, one per page, without ever stopping.

        Each entry of the tree starts a labelling range; the last
        range is continued indefinitely.
        """
        entries = iter(self)
        try:
            start, pending = next(entries)
            # The tree must begin with page index 0
            if start != 0:
                log.warning("PageLabels tree is missing page index 0")
                # Try to cope by letting the first range begin at page 0
                start = 0
        except StopIteration:
            log.warning("PageLabels tree is empty")
            start = 0
            pending = {}

        while True:  # forever!
            label_dict = dict_value(pending)
            style = label_dict.get("S")
            prefix = decode_text(str_value(label_dict.get("P", b"")))
            first_value = int_value(label_dict.get("St", 1))

            values: Iterable[int]
            try:
                next_start, pending = next(entries)
            except StopIteration:
                # This is the last specified range. It continues until the end
                # of the document.
                values = itertools.count(first_value)
            else:
                values = range(first_value, first_value + (next_start - start))
                start = next_start

            for value in values:
                yield prefix + self._format_page_label(value, style)

    @staticmethod
    def _format_page_label(value: int, style: Any) -> str:
        """Render `value` in the numbering style `style`.

        An absent (None) or unknown style gives an empty string; an
        unknown style is additionally logged.
        """
        if style is None:
            return ""
        if style is LIT("D"):  # Decimal arabic numerals
            return str(value)
        if style is LIT("R"):  # Uppercase roman numerals
            return format_int_roman(value).upper()
        if style is LIT("r"):  # Lowercase roman numerals
            return format_int_roman(value)
        if style is LIT("A"):  # Uppercase letters A-Z, AA-ZZ...
            return format_int_alpha(value).upper()
        if style is LIT("a"):  # Lowercase letters a-z, aa-zz...
            return format_int_alpha(value)
        log.warning("Unknown page label style: %r", style)
        return ""

_format_page_label(value, style) staticmethod

Format page label value in a specific style

Source code in playa/document.py
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
@staticmethod
def _format_page_label(value: int, style: Any) -> str:
    """Render `value` in the numbering style `style`.

    An absent (None) or unknown style gives an empty string; an
    unknown style is additionally logged.
    """
    if style is None:
        return ""
    if style is LIT("D"):  # Decimal arabic numerals
        return str(value)
    if style is LIT("R"):  # Uppercase roman numerals
        return format_int_roman(value).upper()
    if style is LIT("r"):  # Lowercase roman numerals
        return format_int_roman(value)
    if style is LIT("A"):  # Uppercase letters A-Z, AA-ZZ...
        return format_int_alpha(value).upper()
    if style is LIT("a"):  # Lowercase letters a-z, aa-zz...
        return format_int_alpha(value)
    log.warning("Unknown page label style: %r", style)
    return ""

PageList

List of pages indexable by 0-based index or string label.

Attributes:

Name Type Description
have_labels bool

If pages have explicit labels in the PDF.

Source code in playa/document.py
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
class PageList:
    """List of pages indexable by 0-based index or string label.

    Attributes:
        have_labels: If pages have explicit labels in the PDF.
    """

    have_labels: bool

    def __init__(
        self, doc: Document, pages: Union[Iterable[Page], None] = None
    ) -> None:
        """Create a page list for `doc`.

        Args:
            doc: The document the pages belong to.
            pages: Pages to hold.  If None, all pages of the document
                are loaded.
        """
        self.docref = _ref_document(doc)
        if pages is not None:
            self._pages = list(pages)
            # Build the label index from the materialized list: `pages`
            # may be a one-shot iterator (as passed by `__getitem__`)
            # which `list()` above has already consumed.
            self._labels: Dict[str, Page] = {
                page.label: page for page in self._pages if page.label is not None
            }
            self.have_labels = bool(self._labels)
            # NOTE(review): `_objids` is not populated on this path, so
            # `by_id` cannot be used on a list built from explicit
            # pages -- confirm whether that is intended.
        else:
            self._init_pages(doc)

    def _init_pages(self, doc: Document) -> None:
        """Load every page of `doc`, indexing pages by object ID and label."""
        try:
            page_labels: Iterable[Union[str, None]] = doc.page_labels
            self.have_labels = True
        except (KeyError, ValueError):
            # No usable labels: number the pages from 1 instead.
            page_labels = (str(idx) for idx in itertools.count(1))
            self.have_labels = False
        self._pages = []
        self._objids = {}
        self._labels = {}
        try:
            page_objects = list(doc._get_page_objects())
        except (KeyError, IndexError, TypeError):
            # The page tree is unusable, look for pages in the xrefs.
            page_objects = list(doc._get_pages_from_xrefs())
        for page_idx, ((objid, properties), label) in enumerate(
            zip(page_objects, page_labels)
        ):
            page = Page(doc, objid, properties, label, page_idx, doc.space)
            self._pages.append(page)
            self._objids[objid] = page
            if label is not None:
                # The first page carrying a given label wins.
                if label in self._labels:
                    log.info("Duplicate page label %s at index %d", label, page_idx)
                else:
                    self._labels[label] = page

    @property
    def doc(self) -> "Document":
        """Get associated document if it exists."""
        return _deref_document(self.docref)

    def __len__(self) -> int:
        return len(self._pages)

    def __iter__(self) -> Iterator[Page]:
        return iter(self._pages)

    @overload
    def __getitem__(self, key: int) -> Page: ...

    @overload
    def __getitem__(self, key: str) -> Page: ...

    @overload
    def __getitem__(self, key: slice) -> "PageList": ...

    @overload
    def __getitem__(self, key: Iterable[int]) -> "PageList": ...

    @overload
    def __getitem__(self, key: Iterator[Union[int, str]]) -> "PageList": ...

    def __getitem__(self, key):
        """Get a page by index or label, or a new `PageList` for a
        slice or an iterable of indices/labels."""
        if isinstance(key, int):
            return self._pages[key]
        elif isinstance(key, str):
            return self._labels[key]
        elif isinstance(key, slice):
            return PageList(_deref_document(self.docref), self._pages[key])
        else:
            return PageList(_deref_document(self.docref), (self[k] for k in key))

    def by_id(self, objid: int) -> Page:
        """Get a page by its indirect object ID.

        Args:
            objid: Indirect object ID for the page object.

        Returns:
            the page in question.
        """
        return self._objids[objid]

    def map(self, func: Callable[[Page], Any]) -> Iterator:
        """Apply a function over each page, iterating over its results.

        Args:
            func: The function to apply to each page.

        Note:
            This possibly runs `func` in a separate process.  If its
            return value is not serializable (by `pickle`) then you
            will encounter errors.
        """
        doc = _deref_document(self.docref)
        if doc._pool is not None:
            # Pages are handed to the pool as (document id, page index)
            # pairs rather than as page objects.
            return doc._pool.map(
                call_page,
                itertools.repeat(func),
                ((id(doc), page.page_idx) for page in self),
            )
        else:
            return (func(page) for page in self)

doc property

Get associated document if it exists.

by_id(objid)

Get a page by its indirect object ID.

Parameters:

Name Type Description Default
objid int

Indirect object ID for the page object.

required

Returns:

Type Description
Page

the page in question.

Source code in playa/document.py
839
840
841
842
843
844
845
846
847
848
def by_id(self, objid: int) -> Page:
    """Look up a page from the indirect object ID of its page object.

    Args:
        objid: Indirect object ID for the page object.

    Returns:
        the page registered under `objid`.
    """
    page = self._objids[objid]
    return page

map(func)

Apply a function over each page, iterating over its results.

Parameters:

Name Type Description Default
func Callable[[Page], Any]

The function to apply to each page.

required
Note

This possibly runs func in a separate process. If its return value is not serializable (by pickle) then you will encounter errors.

Source code in playa/document.py
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
def map(self, func: Callable[[Page], Any]) -> Iterator:
    """Run a function on every page and iterate over what it returns.

    Args:
        func: The function to apply to each page.

    Note:
        If the document has a worker pool, `func` is dispatched to
        it and so possibly runs in a separate process.  If its
        return value is not serializable (by `pickle`) then you
        will encounter errors.
    """
    doc = _deref_document(self.docref)
    pool = doc._pool
    if pool is None:
        # No pool: evaluate lazily in the current process.
        return (func(page) for page in self)
    # Pages are sent to the pool as (document id, page index) pairs
    # rather than as page objects.
    pagerefs = ((id(doc), page.page_idx) for page in self)
    return pool.map(call_page, itertools.repeat(func), pagerefs)

call_page(func, pageref)

Call a function on a page in a worker process.

Source code in playa/document.py
750
751
752
def call_page(func: Callable[[Page], Any], pageref: PageRef) -> Any:
    """Resolve a page reference and call a function on that page.

    Intended to be run in a worker process.
    """
    page = _deref_page(pageref)
    return func(page)

playa.page

Classes for looking at pages and their contents.

Annotation dataclass

PDF annotation (PDF 1.7 section 12.5).

Attributes:

Name Type Description
subtype str

Type of annotation.

rect Rect

Annotation rectangle (location on page) in default user space

bbox Rect

Annotation rectangle in device space

props Dict[str, PDFObject]

Annotation dictionary containing all other properties (PDF 1.7 sec. 12.5.2).

Source code in playa/page.py
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
@dataclass
class Annotation:
    """PDF annotation (PDF 1.7 section 12.5).

    Attributes:
      subtype: Type of annotation.
      rect: Annotation rectangle (location on page) in *default user space*
      bbox: Annotation rectangle in *device space*
      props: Annotation dictionary containing all other properties
             (PDF 1.7 sec. 12.5.2).
    """

    _pageref: PageRef
    subtype: str
    rect: Rect
    props: Dict[str, PDFObject]

    @property
    def page(self) -> Page:
        """Containing page for this annotation."""
        return _deref_page(self._pageref)

    @property
    def bbox(self) -> Rect:
        """Annotation rectangle in device space.

        Obtained by transforming `rect` with the containing page's
        `ctm`, which maps default user space to the page's device
        space.
        """
        return get_transformed_bound(self.page.ctm, self.rect)

    @property
    def contents(self) -> Union[str, None]:
        """Text contents of annotation."""
        contents = resolve1(self.props.get("Contents"))
        if contents is None:
            return None
        try:
            return decode_text(contents)
        except TypeError:
            log.warning("Invalid annotation contents: %r", contents)
            return None

    @property
    def name(self) -> Union[str, None]:
        """Annotation name, uniquely identifying this annotation."""
        name = resolve1(self.props.get("NM"))
        if name is None:
            return None
        # Same best-effort policy as `contents`: a value that cannot
        # be decoded is reported and treated as absent.
        try:
            return decode_text(name)
        except TypeError:
            log.warning("Invalid annotation name: %r", name)
            return None

    @property
    def mtime(self) -> Union[str, None]:
        """String describing date and time when annotation was most recently
        modified.

        The date *should* be in the format `D:YYYYMMDDHHmmSSOHH'mm`
        but this is in no way required (and unlikely to be implemented
        consistently, if history is any guide).
        """
        mtime = resolve1(self.props.get("M"))
        if mtime is None:
            return None
        # Same best-effort policy as `contents`: a value that cannot
        # be decoded is reported and treated as absent.
        try:
            return decode_text(mtime)
        except TypeError:
            log.warning("Invalid annotation modification time: %r", mtime)
            return None

contents property

Text contents of annotation.

mtime property

String describing date and time when annotation was most recently modified.

The date should be in the format D:YYYYMMDDHHmmSSOHH'mm but this is in no way required (and unlikely to be implemented consistently, if history is any guide).

name property

Annotation name, uniquely identifying this annotation.

page property

Containing page for this annotation.

ContentObject dataclass

Any sort of content object.

Attributes:

Name Type Description
gstate GraphicState

Graphics state.

ctm Matrix

Coordinate transformation matrix (PDF 1.7 section 8.3.2).

mcstack Tuple[MarkedContent, ...]

Stack of enclosing marked content sections.

Source code in playa/page.py
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
@dataclass
class ContentObject:
    """Base class for every kind of content object.

    Attributes:
      gstate: Graphics state.
      ctm: Coordinate transformation matrix (PDF 1.7 section 8.3.2).
      mcstack: Stack of enclosing marked content sections.
    """

    _pageref: PageRef
    gstate: GraphicState
    ctm: Matrix
    mcstack: Tuple[MarkedContent, ...]

    def __iter__(self) -> Iterator["ContentObject"]:
        """Iterate over child objects (none in this generic implementation)."""
        yield from ()

    def __len__(self) -> int:
        """Return the number of children of this object (generic implementation)."""
        count = 0
        for _ in self:
            count += 1
        return count

    @property
    def object_type(self):
        """Type of this object as a string, e.g. "text", "path", "image"."""
        # Drop the trailing "Object" from the class name.
        suffix_len = len("Object")
        cls_name = self.__class__.__name__
        return cls_name[:-suffix_len].lower()

    @property
    def bbox(self) -> Rect:
        """The bounding box in device space of this object."""
        # Child bboxes are already in device space, so two opposite
        # corners of each one are enough to compute the overall bound.
        points = (
            corner
            for x0, y0, x1, y1 in (child.bbox for child in self)
            for corner in ((x0, y0), (x1, y1))
        )
        return get_bound(points)

    @property
    def mcs(self) -> Union[MarkedContent, None]:
        """The immediately enclosing marked content section."""
        if not self.mcstack:
            return None
        return self.mcstack[-1]

    @property
    def mcid(self) -> Union[int, None]:
        """The marked content ID of the nearest enclosing marked
        content section with an ID."""
        innermost_first = reversed(self.mcstack)
        return next(
            (section.mcid for section in innermost_first if section.mcid is not None),
            None,
        )

    @property
    def page(self) -> Page:
        """The page containing this content object."""
        return _deref_page(self._pageref)

bbox property

The bounding box in device space of this object.

mcid property

The marked content ID of the nearest enclosing marked content section with an ID.

mcs property

The immediately enclosing marked content section.

object_type property

Type of this object as a string, e.g. "text", "path", "image".

page property

The page containing this content object.

__len__()

Return the number of children of this object (generic implementation).

Source code in playa/page.py
721
722
723
def __len__(self) -> int:
    """Count this object's children by iterating over it (generic implementation)."""
    count = 0
    for _ in self:
        count += 1
    return count

DashPattern

Bases: NamedTuple

Line dash pattern in PDF graphics state (PDF 1.7 section 8.4.3.6).

Attributes:

Name Type Description
dash Tuple[float, ...]

lengths of dashes and gaps in user space units

phase float

starting position in the dash pattern

Source code in playa/page.py
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
class DashPattern(NamedTuple):
    """
    Line dash pattern in PDF graphics state (PDF 1.7 section 8.4.3.6).

    Converting to `str` gives an empty string when `dash` is empty,
    otherwise the dash tuple followed by the phase.

    Attributes:
      dash: lengths of dashes and gaps in user space units
      phase: starting position in the dash pattern
    """

    dash: Tuple[float, ...]
    phase: float

    def __str__(self):
        # An empty dash array is rendered as nothing at all.
        if not self.dash:
            return ""
        return f"{self.dash} {self.phase}"

GlyphObject dataclass

Bases: ContentObject

Individual glyph on the page.

Attributes:

Name Type Description
textstate TextState

Text state for this glyph. This is a mutable object and you should not expect it to be valid outside the context of iteration over the parent TextObject.

cid int

Character ID for this glyph.

text Union[str, None]

Unicode mapping of this glyph, if any.

adv float

glyph displacement in text space units (horizontal or vertical, depending on the writing direction).

matrix Matrix

rendering matrix for this glyph, which transforms text space (not glyph space!) coordinates to device space.

bbox Rect

glyph bounding box in device space.

text_space_bbox

glyph bounding box in text space (i.e. before any possible coordinate transformation)

corners bool

Is the transformed bounding box rotated or skewed such that all four corners need to be calculated (derived from matrix but precomputed for speed)

Source code in playa/page.py
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
@dataclass
class GlyphObject(ContentObject):
    """A single glyph drawn on the page.

    Attributes:
      textstate: Text state for this glyph.  This is a **mutable**
        object and you should not expect it to be valid outside the
        context of iteration over the parent `TextObject`.
      cid: Character ID for this glyph.
      text: Unicode mapping of this glyph, if any.
      adv: glyph displacement in text space units (horizontal or vertical,
           depending on the writing direction).
      matrix: rendering matrix for this glyph, which transforms text
              space (*not glyph space!*) coordinates to device space.
      bbox: glyph bounding box in device space.
      text_space_bbox: glyph bounding box in text space (i.e. before
                       any possible coordinate transformation)
      corners: Is the transformed bounding box rotated or skewed such
               that all four corners need to be calculated (derived
               from matrix but precomputed for speed)

    """

    textstate: TextState
    cid: int
    text: Union[str, None]
    matrix: Matrix
    adv: float
    corners: bool

    def __len__(self) -> int:
        """A glyph has no children to iterate over, so this is always zero."""
        return 0

    @property
    def bbox(self) -> Rect:
        """Glyph bounding box in device space."""
        tx0, ty0, tx1, ty1 = self.text_space_bbox
        if not self.corners:
            # Two opposite corners are enough here, but the transform
            # may have swapped their order on either axis.
            ax, ay = apply_matrix_pt(self.matrix, (tx0, ty0))
            bx, by = apply_matrix_pt(self.matrix, (tx1, ty1))
            if bx < ax:
                ax, bx = bx, ax
            if by < ay:
                ay, by = by, ay
            return (ax, ay, bx, by)
        # Rotated or skewed: transform all four corners and take
        # their bound.
        quad = tuple(
            apply_matrix_pt(self.matrix, corner)
            for corner in ((tx0, ty0), (tx0, ty1), (tx1, ty1), (tx1, ty0))
        )
        return get_bound(quad)

    @property
    def text_space_bbox(self):
        """Glyph bounding box in text space, before any transformation."""
        state = self.textstate
        font = state.font
        assert font is not None
        if not font.vertical:
            # Horizontal writing: the box spans the advance in x and
            # one font size upward from the descent (shifted by rise).
            bottom = state.descent + state.rise
            return (0, bottom, self.adv, bottom + state.fontsize)
        disp = font.char_disp(self.cid)
        assert isinstance(disp, tuple)
        (vx, vy) = disp
        if vx is None:
            vx = state.fontsize * 0.5
        else:
            vx = vx * state.fontsize * 0.001
        vy = (1000 - vy) * state.fontsize * 0.001
        origin_y = vy + state.rise
        return (-vx, origin_y + self.adv, -vx + state.fontsize, origin_y)

__len__()

Fool! You cannot iterate over a GlyphObject!

Source code in playa/page.py
1030
1031
1032
def __len__(self) -> int:
    """Return zero: a GlyphObject has no children to iterate over."""
    return 0

GraphicState dataclass

PDF Graphics state (PDF 1.7 section 8.4)

Attributes:

Name Type Description
linewidth float

Line width in user space units (sec. 8.4.3.2)

linecap int

Line cap style (sec. 8.4.3.3)

linejoin int

Line join style (sec. 8.4.3.4)

miterlimit float

Maximum length of mitered line joins (sec. 8.4.3.5)

dash DashPattern

Dash pattern for stroking (sec 8.4.3.6)

intent PSLiteral

Rendering intent (sec. 8.6.5.8)

flatness float

The precision with which curves shall be rendered on the output device (sec. 10.6.2)

scolor Color

Colour used for stroking operations

scs ColorSpace

Colour space used for stroking operations

ncolor Color

Colour used for non-stroking operations

ncs ColorSpace

Colour space used for non-stroking operations

Source code in playa/page.py
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
@dataclass
class GraphicState:
    """PDF Graphics state (PDF 1.7 section 8.4)

    Every field has a default value, so `GraphicState()` can be
    constructed without arguments.

    Attributes:
      linewidth: Line width in user space units (sec. 8.4.3.2)
      linecap: Line cap style (sec. 8.4.3.3)
      linejoin: Line join style (sec. 8.4.3.4)
      miterlimit: Maximum length of mitered line joins (sec. 8.4.3.5)
      dash: Dash pattern for stroking (sec. 8.4.3.6)
      intent: Rendering intent (sec. 8.6.5.8)
      flatness: The precision with which curves shall be rendered on
        the output device (sec. 10.6.2)
      scolor: Colour used for stroking operations
      scs: Colour space used for stroking operations
      ncolor: Colour used for non-stroking operations
      ncs: Colour space used for non-stroking operations
    """

    linewidth: float = 1
    linecap: int = 0
    linejoin: int = 0
    miterlimit: float = 10
    dash: DashPattern = SOLID_LINE
    intent: PSLiteral = LITERAL_RELATIVE_COLORIMETRIC
    flatness: float = 1
    # Stroking and non-stroking colours both default to BASIC_BLACK
    # in the predefined "DeviceGray" colour space.
    scolor: Color = BASIC_BLACK
    scs: ColorSpace = PREDEFINED_COLORSPACE["DeviceGray"]
    ncolor: Color = BASIC_BLACK
    ncs: ColorSpace = PREDEFINED_COLORSPACE["DeviceGray"]

ImageObject dataclass

Bases: ContentObject

An image (either inline or XObject).

Attributes:

Name Type Description
xobjid Union[str, None]

Name of XObject (or None for inline images).

srcsize Tuple[int, int]

Size of source image in pixels.

bits int

Number of bits per component, if required (otherwise 1).

imagemask bool

True if the image is a mask.

stream ContentStream

Content stream with image data.

colorspace Union[ColorSpace, None]

Colour space for this image, if required (otherwise None).

Source code in playa/page.py
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
@dataclass
class ImageObject(ContentObject):
    """An image (either inline or XObject).

    Membership tests and item lookup (`name in image`, `image[name]`)
    are forwarded to the underlying content stream.

    Attributes:
      xobjid: Name of XObject (or None for inline images).
      srcsize: Size of source image in pixels.
      bits: Number of bits per component, if required (otherwise 1).
      imagemask: True if the image is a mask.
      stream: Content stream with image data.
      colorspace: Colour space for this image, if required (otherwise
        None).
    """

    xobjid: Union[str, None]
    srcsize: Tuple[int, int]
    bits: int
    imagemask: bool
    stream: ContentStream
    colorspace: Union[ColorSpace, None]

    def __contains__(self, name: object) -> bool:
        """Test whether `name` is present in the image's content stream."""
        return name in self.stream

    def __getitem__(self, name: str) -> PDFObject:
        """Look up `name` in the image's content stream."""
        return self.stream[name]

    def __len__(self) -> int:
        """Return zero: although `__getitem__` is supported, the keys
        of an image cannot be iterated over."""
        return 0

    @property
    def buffer(self) -> bytes:
        """Binary stream content for this image"""
        return self.stream.buffer

    @property
    def bbox(self) -> Rect:
        """Bounding box of the image: the unit square transformed by `ctm`."""
        # PDF 1.7 sec 8.3.24: All images shall be 1 unit wide by 1
        # unit high in user space, regardless of the number of samples
        # in the image. To be painted, an image shall be mapped to a
        # region of the page by temporarily altering the CTM.
        return get_transformed_bound(self.ctm, (0, 0, 1, 1))

buffer property

Binary stream content for this image

__len__()

Even though you can `__getitem__` from an image you cannot iterate over its keys, sorry about that. Returns zero.

Source code in playa/page.py
826
827
828
829
def __len__(self) -> int:
    """Return zero: although `__getitem__` is supported, the keys
    of an image cannot be iterated over."""
    return 0

MarkedContent

Bases: NamedTuple

Marked content information for a point or section in a PDF page.

Attributes:

Name Type Description
mcid Union[int, None]

Marked content section ID, or None for a marked content point.

tag str

Name of tag for this marked content.

props Dict[str, PDFObject]

Marked content property dictionary.

Source code in playa/page.py
676
677
678
679
680
681
682
683
684
685
686
687
688
class MarkedContent(NamedTuple):
    """
    Marked content information for a point or section in a PDF page.

    Attributes:
      mcid: Marked content section ID, or `None` for a marked content point.
      tag: Name of tag for this marked content.
      props: Marked content property dictionary.
    """

    # Fields are positional, in this order: (mcid, tag, props).
    mcid: Union[int, None]
    tag: str
    props: Dict[str, PDFObject]

Page

An object that holds the information about a page.

Parameters:

Name Type Description Default
doc Document

a Document object.

required
pageid int

the integer PDF object ID associated with the page in the page tree.

required
attrs Dict

a dictionary of page attributes.

required
label Optional[str]

page label string.

required
page_idx int

0-based index of the page in the document.

0
space DeviceSpace

the device space to use for interpreting content

'screen'

Attributes:

Name Type Description
pageid

the integer object ID associated with the page in the page tree

attrs

a dictionary of page attributes.

resources Dict[str, PDFObject]

a dictionary of resources used by the page.

mediabox

the physical size of the page.

cropbox

the crop rectangle of the page.

rotate

the page rotation (in degree).

label

the page's label (typically, the logical page number).

page_idx

0-based index of the page in the document.

ctm

coordinate transformation matrix from default user space to page's device space

Source code in playa/page.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
class Page:
    """An object that holds the information about a page.

    Args:
      doc: a Document object.
      pageid: the integer PDF object ID associated with the page in the page tree.
      attrs: a dictionary of page attributes.
      label: page label string.
      page_idx: 0-based index of the page in the document.
      space: the device space to use for interpreting content

    Attributes:
      pageid: the integer object ID associated with the page in the page tree
      attrs: a dictionary of page attributes.
      resources: a dictionary of resources used by the page.
      mediabox: the physical size of the page.
      cropbox: the crop rectangle of the page.
      rotate: the page rotation (in degree).
      label: the page's label (typically, the logical page number).
      page_idx: 0-based index of the page in the document.
      ctm: coordinate transformation matrix from default user space to
           page's device space
    """

    def __init__(
        self,
        doc: "Document",
        pageid: int,
        attrs: Dict,
        label: Optional[str],
        page_idx: int = 0,
        space: DeviceSpace = "screen",
    ) -> None:
        self.docref = _ref_document(doc)
        self.pageid = pageid
        self.attrs = attrs
        self.label = label
        self.page_idx = page_idx
        self.space = space
        self.pageref = _ref_page(self)
        self.lastmod = resolve1(self.attrs.get("LastModified"))
        # Missing or invalid /Resources is tolerated: warn and use an
        # empty dictionary.
        try:
            self.resources: Dict[str, PDFObject] = dict_value(
                self.attrs.get("Resources")
            )
        except TypeError:
            log.warning("Resources missing or invalid from Page id %d", pageid)
            self.resources = {}
        # Missing or invalid /MediaBox is tolerated: warn and fall
        # back to US Letter.
        try:
            self.mediabox = normalize_rect(parse_rect(self.attrs["MediaBox"]))
        except KeyError:
            log.warning(
                "MediaBox missing from Page id %d (and not inherited),"
                " defaulting to US Letter (612x792)",
                pageid,
            )
            self.mediabox = (0, 0, 612, 792)
        except (ValueError, PDFSyntaxError):
            log.warning(
                "MediaBox %r invalid in Page id %d,"
                " defaulting to US Letter (612x792)",
                self.attrs["MediaBox"],
                pageid,
            )
            self.mediabox = (0, 0, 612, 792)
        # CropBox defaults to MediaBox, and stays that way if the
        # page's own /CropBox is absent or cannot be parsed.
        self.cropbox = self.mediabox
        if "CropBox" in self.attrs:
            try:
                self.cropbox = normalize_rect(parse_rect(self.attrs["CropBox"]))
            except (ValueError, PDFSyntaxError):
                log.warning(
                    "Invalid CropBox %r in /Page, defaulting to MediaBox",
                    self.attrs["CropBox"],
                )

        # Fold negative rotations into the range [0, 360).
        self.rotate = (int_value(self.attrs.get("Rotate", 0)) + 360) % 360
        (x0, y0, x1, y1) = self.mediabox
        width = x1 - x0
        height = y1 - y0
        # PDF 1.7 section 8.4.1: Initial value: a matrix that
        # transforms default user coordinates to device coordinates.
        #
        # We keep this as `self.ctm` in order to transform layout
        # attributes in tagged PDFs which are specified in default
        # user space (PDF 1.7 section 14.8.5.4.3, table 344)
        #
        # "screen" device space: origin is top left of MediaBox
        if self.space == "screen":
            self.ctm = (1.0, 0.0, 0.0, -1.0, -x0, y1)
        # "page" device space: origin is bottom left of MediaBox
        elif self.space == "page":
            self.ctm = (1.0, 0.0, 0.0, 1.0, -x0, -y0)
        # "default" device space: no transformation or rotation
        else:
            if self.space != "default":
                log.warning("Unknown device space: %r", self.space)
            self.ctm = MATRIX_IDENTITY
            width = height = 0
        # If rotation is requested, apply rotation to the initial ctm
        if self.rotate == 90:
            # x' = y
            # y' = width - x
            self.ctm = mult_matrix((0, -1, 1, 0, 0, width), self.ctm)
        elif self.rotate == 180:
            # x' = width - x
            # y' = height - y
            self.ctm = mult_matrix((-1, 0, 0, -1, width, height), self.ctm)
        elif self.rotate == 270:
            # x' = height - y
            # y' = x
            self.ctm = mult_matrix((0, 1, -1, 0, height, 0), self.ctm)
        elif self.rotate != 0:
            log.warning("Invalid /Rotate: %r", self.rotate)

        # Store /Contents as a list whether it is absent, a single
        # object, or already a list.
        contents = resolve1(self.attrs.get("Contents"))
        if contents is None:
            self._contents = []
        else:
            if isinstance(contents, list):
                self._contents = contents
            else:
                self._contents = [contents]

    @property
    def annotations(self) -> Iterator["Annotation"]:
        """Lazily iterate over page annotations.

        Entries that are not dictionaries, or whose Subtype or Rect is
        missing or invalid, are skipped with a warning.
        """
        alist = resolve1(self.attrs.get("Annots"))
        if alist is None:
            return
        for annot in alist:
            annot = resolve1(annot)
            if not isinstance(annot, dict):
                log.warning("Invalid object in Annots: %r", annot)
                continue
            subtype = annot.get("Subtype")
            if subtype is None or not isinstance(subtype, PSLiteral):
                log.warning("Invalid Subtype in annotation: %r", annot)
                continue
            try:
                rect = parse_rect(annot.get("Rect"))
            except (TypeError, ValueError, PDFSyntaxError):
                log.warning("Invalid Rect in annotation: %r", annot)
                continue
            yield Annotation(
                _pageref=self.pageref,
                subtype=literal_name(subtype),
                rect=rect,
                props=annot,
            )

    @property
    def doc(self) -> "Document":
        """Get associated document if it exists."""
        return _deref_document(self.docref)

    @property
    def streams(self) -> Iterator[ContentStream]:
        """Return resolved content streams.

        Entries that are not streams are skipped with a warning.
        """
        for obj in self._contents:
            try:
                yield stream_value(obj)
            except TypeError:
                log.warning("Found non-stream in contents: %r", obj)

    @property
    def width(self) -> float:
        """Width of the page in default user space units."""
        x0, _, x1, _ = self.mediabox
        return x1 - x0

    @property
    def height(self) -> float:
        """Height of the page in default user space units."""
        _, y0, _, y1 = self.mediabox
        return y1 - y0

    @property
    def contents(self) -> Iterator[PDFObject]:
        """Iterator over PDF objects in the content streams."""
        # The parser yields (position, object) pairs; only the object
        # is passed on.
        for pos, obj in ContentParser(self._contents):
            yield obj

    def __iter__(self) -> Iterator["ContentObject"]:
        """Iterator over lazy layout objects."""
        return iter(LazyInterpreter(self, self._contents))

    @property
    def paths(self) -> Iterator["PathObject"]:
        """Iterator over lazy path objects."""
        return self.flatten(PathObject)

    @property
    def images(self) -> Iterator["ImageObject"]:
        """Iterator over lazy image objects."""
        return self.flatten(ImageObject)

    @property
    def texts(self) -> Iterator["TextObject"]:
        """Iterator over lazy text objects."""
        return self.flatten(TextObject)

    @property
    def xobjects(self) -> Iterator["XObjectObject"]:
        """Return resolved and rendered Form XObjects.

        This does *not* return any image or PostScript XObjects.  You
        can get images via the `images` property.  Apparently you
        aren't supposed to use PostScript XObjects for anything, ever.

        Note that these are the XObjects as rendered on the page, so
        you may see the same named XObject multiple times.  If you
        need to access their actual definitions you'll have to look at
        `page.resources`.
        """
        return cast(
            Iterator["XObjectObject"],
            iter(LazyInterpreter(self, self._contents, filter_class=XObjectObject)),
        )

    @property
    def tokens(self) -> Iterator[Token]:
        """Iterator over tokens in the content streams."""
        parser = ContentParser(self._contents)
        while True:
            # The parser signals the end of input with StopIteration.
            try:
                pos, tok = parser.nexttoken()
            except StopIteration:
                return
            yield tok

    def __repr__(self) -> str:
        """Debugging representation showing the resources and MediaBox."""
        return f"<Page: Resources={self.resources!r}, MediaBox={self.mediabox!r}>"

    @overload
    def flatten(self) -> Iterator["ContentObject"]: ...

    @overload
    def flatten(self, filter_class: Type[CO]) -> Iterator[CO]: ...

    def flatten(
        self, filter_class: Union[None, Type[CO]] = None
    ) -> Iterator[Union[CO, "ContentObject"]]:
        """Iterate over content objects, recursing into form XObjects.

        Args:
          filter_class: if given, only objects that are instances of
            this class are yielded.
        """

        def flatten_one(itor: Iterable["ContentObject"]) -> Iterator["ContentObject"]:
            # Form XObjects are replaced by their (recursively
            # flattened) contents and are never yielded themselves.
            for obj in itor:
                if isinstance(obj, XObjectObject):
                    yield from flatten_one(obj)
                else:
                    yield obj

        if filter_class is None:
            yield from flatten_one(self)
        else:
            for obj in flatten_one(self):
                if isinstance(obj, filter_class):
                    yield obj

    def extract_text(self) -> str:
        """Do some best-effort text extraction.

        This necessarily involves a few heuristics, so don't get your
        hopes up.  It will attempt to use marked content information
        for a tagged PDF, otherwise it will fall back on the character
        displacement and line matrix to determine word and line breaks.
        """
        if self.doc.is_tagged:
            return self.extract_text_tagged()
        else:
            return self.extract_text_untagged()

    def extract_text_untagged(self) -> str:
        """Get text from a page of an untagged PDF."""
        prev_line_matrix = None
        prev_end = 0.0
        lines = []
        strings = []
        for text in self.flatten(TextObject):
            line_matrix = text.textstate.line_matrix
            vertical = (
                False if text.textstate.font is None else text.textstate.font.vertical
            )
            # Index of the line matrix element compared to detect a
            # line break (depends on the writing direction).
            lpos = -2 if vertical else -1
            if (
                prev_line_matrix is not None
                and line_matrix[lpos] < prev_line_matrix[lpos]
            ):
                lines.append("".join(strings))
                strings.clear()
            # Index of the line matrix element compared to detect a
            # gap between words.
            wpos = -1 if vertical else -2
            if (
                prev_line_matrix is not None
                and prev_end + prev_line_matrix[wpos] < line_matrix[wpos]
            ):
                strings.append(" ")
            textstr, end = _extract_text_from_obj(text, vertical)
            strings.append(textstr)
            prev_line_matrix = line_matrix
            prev_end = end
        # Flush whatever is left as the final line.
        if strings:
            lines.append("".join(strings))
        return "\n".join(lines)

    def extract_text_tagged(self) -> str:
        """Get text from a page of a tagged PDF."""
        lines: List[str] = []
        strings: List[str] = []
        at_mcs: Union[MarkedContent, None] = None
        prev_mcid: Union[int, None] = None
        for text in self.flatten(TextObject):
            in_artifact = same_actual_text = reversed_chars = False
            actual_text = None
            # Walk the marked content stack from the innermost section
            # outward, stopping at the first one that decides how this
            # text is handled.
            for mcs in reversed(text.mcstack):
                if mcs.tag == "Artifact":
                    in_artifact = True
                    break
                actual_text = mcs.props.get("ActualText")
                if actual_text is not None:
                    # Emit the ActualText of a given section only once,
                    # for the first text object found inside it.
                    if mcs is at_mcs:
                        same_actual_text = True
                    at_mcs = mcs
                    break
                if mcs.tag == "ReversedChars":
                    reversed_chars = True
                    break
            if in_artifact or same_actual_text:
                continue
            if actual_text is None:
                chars = text.chars
                if reversed_chars:
                    chars = chars[::-1]
            else:
                assert isinstance(actual_text, bytes)
                chars = actual_text.decode("UTF-16")
            # Remove soft hyphens
            chars = chars.replace("\xad", "")
            # Insert a line break (FIXME: not really correct)
            if text.mcid != prev_mcid:
                lines.extend(textwrap.wrap("".join(strings)))
                strings.clear()
                prev_mcid = text.mcid
            strings.append(chars)
        if strings:
            lines.extend(textwrap.wrap("".join(strings)))
        return "\n".join(lines)

annotations property

Lazily iterate over page annotations.

contents property

Iterator over PDF objects in the content streams.

doc property

Get associated document if it exists.

height property

Height of the page in default user space units.

images property

Iterator over lazy image objects.

paths property

Iterator over lazy path objects.

streams property

Return resolved content streams.

texts property

Iterator over lazy text objects.

tokens property

Iterator over tokens in the content streams.

width property

Width of the page in default user space units.

xobjects property

Return resolved and rendered Form XObjects.

This does not return any image or PostScript XObjects. You can get images via the images property. Apparently you aren't supposed to use PostScript XObjects for anything, ever.

Note that these are the XObjects as rendered on the page, so you may see the same named XObject multiple times. If you need to access their actual definitions you'll have to look at page.resources.

__iter__()

Iterator over lazy layout objects.

Source code in playa/page.py
295
296
297
def __iter__(self) -> Iterator["ContentObject"]:
    """Iterate lazily over the layout objects of this page.

    A `LazyInterpreter` is created over this page's `_contents` and
    an iterator over it is returned.
    """
    interpreter = LazyInterpreter(self, self._contents)
    return iter(interpreter)

extract_text()

Do some best-effort text extraction.

This necessarily involves a few heuristics, so don't get your hopes up. It will attempt to use marked content information for a tagged PDF, otherwise it will fall back on the character displacement and line matrix to determine word and line breaks.

Source code in playa/page.py
371
372
373
374
375
376
377
378
379
380
381
382
def extract_text(self) -> str:
    """Extract text from this page on a best-effort basis.

    Heuristics are unavoidable here, so treat the result as
    approximate.  When the document is tagged, the marked content
    information is used (`extract_text_tagged`); otherwise word and
    line breaks are guessed from character displacement and the line
    matrix (`extract_text_untagged`).

    Returns:
      The extracted text of the page.
    """
    extractor = (
        self.extract_text_tagged
        if self.doc.is_tagged
        else self.extract_text_untagged
    )
    return extractor()

extract_text_tagged()

Get text from a page of a tagged PDF.

Source code in playa/page.py
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
def extract_text_tagged(self) -> str:
    """Get text from a page of a tagged PDF.

    For every text object (form XObjects are flattened) the marked
    content stack is searched from the innermost section outwards:

    - text inside an "Artifact" section is dropped;
    - a section with an "ActualText" property supplies the text, and
      is emitted only once for consecutive text objects that share
      that same section;
    - text inside a "ReversedChars" section is reversed.

    Soft hyphens are removed, and the text gathered so far is wrapped
    into lines each time the marked content ID changes.
    """
    wrapped: List[str] = []
    pending: List[str] = []
    last_at_section: Union[MarkedContent, None] = None
    last_mcid: Union[int, None] = None
    for textobj in self.flatten(TextObject):
        skip = False
        flipped = False
        replacement = None
        # Innermost marked content section first
        for section in reversed(textobj.mcstack):
            if section.tag == "Artifact":
                skip = True
                break
            replacement = section.props.get("ActualText")
            if replacement is not None:
                # The same section already supplied its ActualText
                if section is last_at_section:
                    skip = True
                last_at_section = section
                break
            if section.tag == "ReversedChars":
                flipped = True
                break
        if skip:
            continue
        if replacement is not None:
            assert isinstance(replacement, bytes)
            content = replacement.decode("UTF-16")
        elif flipped:
            content = textobj.chars[::-1]
        else:
            content = textobj.chars
        # Remove soft hyphens
        content = content.replace("\xad", "")
        # Insert a line break (FIXME: not really correct)
        if textobj.mcid != last_mcid:
            wrapped.extend(textwrap.wrap("".join(pending)))
            pending.clear()
            last_mcid = textobj.mcid
        pending.append(content)
    if pending:
        wrapped.extend(textwrap.wrap("".join(pending)))
    return "\n".join(wrapped)

extract_text_untagged()

Get text from a page of an untagged PDF.

Source code in playa/page.py
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
def extract_text_untagged(self) -> str:
    """Get text from a page of an untagged PDF.

    Breaks are guessed by comparing the line matrix of each text
    object (form XObjects are flattened) with that of the previous
    one: a new line starts when the line-direction component
    decreases, and a space is inserted when the new object starts
    beyond the end of the previous one.
    """
    out_lines = []
    pieces = []
    last_matrix = None
    last_end = 0.0
    for textobj in self.flatten(TextObject):
        matrix = textobj.textstate.line_matrix
        font = textobj.textstate.font
        vertical = False if font is None else font.vertical
        # Matrix components used to detect line breaks and word gaps;
        # they are swapped for vertical writing.
        line_idx, word_idx = (-2, -1) if vertical else (-1, -2)
        if last_matrix is not None:
            if matrix[line_idx] < last_matrix[line_idx]:
                out_lines.append("".join(pieces))
                pieces.clear()
            if last_end + last_matrix[word_idx] < matrix[word_idx]:
                pieces.append(" ")
        chunk, last_end = _extract_text_from_obj(textobj, vertical)
        pieces.append(chunk)
        last_matrix = matrix
    if pieces:
        out_lines.append("".join(pieces))
    return "\n".join(out_lines)

flatten(filter_class=None)

flatten() -> Iterator[ContentObject]
flatten(filter_class: Type[CO]) -> Iterator[CO]

Iterate over content objects, recursing into form XObjects.

Source code in playa/page.py
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
def flatten(
    self, filter_class: Union[None, Type[CO]] = None
) -> Iterator[Union[CO, "ContentObject"]]:
    """Iterate over content objects, recursing into form XObjects.

    Args:
      filter_class: If given, only objects that are instances of this
        class are yielded; otherwise every non-XObject content object
        is yielded.
    """

    def walk(objects: Iterable["ContentObject"]) -> Iterator["ContentObject"]:
        # An XObject is never yielded itself, only what it contains.
        for item in objects:
            if not isinstance(item, XObjectObject):
                yield item
                continue
            yield from walk(item)

    everything = walk(self)
    if filter_class is None:
        yield from everything
        return
    yield from (item for item in everything if isinstance(item, filter_class))

PathObject dataclass

Bases: ContentObject

A path object.

Attributes:

Name Type Description
raw_segments List[PathSegment]

Segments in path (in user space).

stroke bool

True if the outline of the path is stroked.

fill bool

True if the path is filled.

evenodd bool

True if the filling of complex paths uses the even-odd winding rule, False if the non-zero winding number rule is used (PDF 1.7 section 8.5.3.3)

Source code in playa/page.py
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
@dataclass
class PathObject(ContentObject):
    """A path object.

    Attributes:
      raw_segments: Segments in path (in user space).
      stroke: True if the outline of the path is stroked.
      fill: True if the path is filled.
      evenodd: True if the filling of complex paths uses the even-odd
        winding rule, False if the non-zero winding number rule is
        used (PDF 1.7 section 8.5.3.3)
    """

    raw_segments: List[PathSegment]
    stroke: bool
    fill: bool
    evenodd: bool

    def __len__(self):
        """Number of subpaths.

        This is exactly the number of objects produced by iterating
        over the path: a subpath starts at the first segment and at
        every subsequent "m" operator.  An empty path has length 0.
        """
        # Count subpath starts rather than clamping with min(), which
        # could never report more than one subpath.
        return sum(
            1
            for idx, seg in enumerate(self.raw_segments)
            if idx == 0 or seg.operator == "m"
        )

    def _subpath(self, segs: List[PathSegment]) -> "PathObject":
        """Build a path holding only `segs`, sharing this path's state."""
        return PathObject(
            _pageref=self._pageref,
            gstate=self.gstate,
            ctm=self.ctm,
            mcstack=self.mcstack,
            raw_segments=segs,
            stroke=self.stroke,
            fill=self.fill,
            evenodd=self.evenodd,
        )

    def __iter__(self):
        """Iterate over subpaths.

        If there is only a single subpath, it will still be iterated
        over.  This means that some care must be taken (for example,
        checking if `len(path) == 1`) to avoid endless recursion.

        Note: subpaths inherit the values of `fill` and `evenodd` from
        the parent path, but these values are no longer meaningful
        since the winding rules must be applied to the composite path
        as a whole (this is not a bug, just don't rely on them to know
        which regions are filled or not).

        """
        segs = []
        for seg in self.raw_segments:
            # An "m" operator closes off the segments collected so far
            # and begins a new subpath.
            if seg.operator == "m" and segs:
                yield self._subpath(segs)
                segs = []
            segs.append(seg)
        if segs:
            yield self._subpath(segs)

    @property
    def segments(self) -> Iterator[PathSegment]:
        """Get path segments in device space."""
        return (
            PathSegment(
                p.operator,
                tuple(apply_matrix_pt(self.ctm, point) for point in p.points),
            )
            for p in self.raw_segments
        )

    @property
    def bbox(self) -> Rect:
        """Get bounding box of path in device space as defined by its
        points and control points."""
        # First get the bounding box in user space (fast)
        bbox = get_bound(
            itertools.chain.from_iterable(seg.points for seg in self.raw_segments)
        )
        # Transform it and get the new bounding box
        return get_transformed_bound(self.ctm, bbox)

bbox property

Get bounding box of path in device space as defined by its points and control points.

segments property

Get path segments in device space.

__iter__()

Iterate over subpaths.

If there is only a single subpath, it will still be iterated over. This means that some care must be taken (for example, checking if len(path) == 1) to avoid endless recursion.

Note: subpaths inherit the values of fill and evenodd from the parent path, but these values are no longer meaningful since the winding rules must be applied to the composite path as a whole (this is not a bug, just don't rely on them to know which regions are filled or not).

Source code in playa/page.py
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
def __iter__(self):
    """Iterate over subpaths.

    A path with only one subpath still yields that subpath, so callers
    that recurse into subpaths must guard against endless recursion
    (for example by checking if `len(path) == 1`).

    Note: each subpath carries the parent's `fill` and `evenodd`
    values, but they are no longer meaningful on their own, because
    winding rules apply to the composite path as a whole (this is not
    a bug, just don't rely on them to know which regions are filled).

    """

    def subpath(segments):
        # Same state as the parent path, restricted to `segments`.
        return PathObject(
            _pageref=self._pageref,
            gstate=self.gstate,
            ctm=self.ctm,
            mcstack=self.mcstack,
            raw_segments=segments,
            stroke=self.stroke,
            fill=self.fill,
            evenodd=self.evenodd,
        )

    current = []
    for segment in self.raw_segments:
        starts_new = segment.operator == "m"
        if starts_new and current:
            yield subpath(current)
            current = []
        current.append(segment)
    if current:
        yield subpath(current)

__len__()

Number of subpaths.

Source code in playa/page.py
931
932
933
def __len__(self):
    """Number of subpaths.

    This is exactly the number of objects produced by iterating over
    the path: a subpath starts at the first segment and at every
    subsequent "m" operator.  An empty path has length 0.
    """
    # Count subpath starts rather than clamping with min(), which
    # could never report more than one subpath.
    return sum(
        1
        for idx, seg in enumerate(self.raw_segments)
        if idx == 0 or seg.operator == "m"
    )

PathSegment

Bases: NamedTuple

Segment in a PDF graphics path.

Source code in playa/page.py
694
695
696
697
698
699
700
class PathSegment(NamedTuple):
    """
    Segment in a PDF graphics path.

    Attributes:
      operator: The path operator of this segment.
      points: The points that go with the operator.
    """

    operator: PathOperator
    points: Tuple[Point, ...]

TagObject dataclass

Bases: ContentObject

A marked content tag.

Source code in playa/page.py
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
@dataclass
class TagObject(ContentObject):
    """A marked content tag."""

    _mcs: MarkedContent

    def __len__(self) -> int:
        """Always zero: a tag has no contents, so iterating over it
        returns nothing."""
        return 0

    @property
    def mcs(self) -> MarkedContent:
        """The marked content tag for this object."""
        return self._mcs

    @property
    def mcid(self) -> Union[int, None]:
        """The marked content ID of the nearest enclosing marked
        content section with an ID.

        The tag's own ID is used when it has one; otherwise the
        lookup is delegated to the parent class.
        """
        own_mcid = self._mcs.mcid
        return super().mcid if own_mcid is None else own_mcid

    @property
    def bbox(self) -> Rect:
        """A tag has no content and thus no bounding box.

        Rather than returning `None` or throwing an exception, which
        would needlessly complicate user code, this returns
        `BBOX_NONE`.  Because that is a specific object, you can
        reliably check for it with:

            if obj.bbox is BBOX_NONE:
                ...
        """
        return BBOX_NONE

bbox property

A tag has no content and thus no bounding box.

To avoid needlessly complicating user code this returns BBOX_NONE instead of None or throwing an exception. Because that is a specific object, you can reliably check for it with:

if obj.bbox is BBOX_NONE:
    ...

mcid property

The marked content ID of the nearest enclosing marked content section with an ID.

mcs property

The marked content tag for this object.

__len__()

A tag has no contents, iterating over it returns nothing.

Source code in playa/page.py
767
768
769
def __len__(self) -> int:
    """A tag has no contents, iterating over it returns nothing.

    Returns:
      Always 0.
    """
    return 0

TextObject dataclass

Bases: ContentObject

Text object (contains one or more glyphs).

Attributes:

Name Type Description
textstate TextState

Text state for this object.

args List[Union[bytes, float]]

Strings or position adjustments

bbox Rect

Text bounding box in device space.

text_space_bbox

Text bounding box in text space (i.e. before any possible coordinate transformation)

Source code in playa/page.py
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
@dataclass
class TextObject(ContentObject):
    """Text object (contains one or more glyphs).

    Attributes:
      textstate: Text state for this object.
      args: Strings or position adjustments
      bbox: Text bounding box in device space.
      text_space_bbox: Text bounding box in text space (i.e. before
                       any possible coordinate transformation)
    """

    textstate: TextState
    args: List[Union[bytes, float]]
    # Lazily computed caches, filled in by the properties below
    _chars: Union[List[str], None] = None
    _bbox: Union[Rect, None] = None
    _text_space_bbox: Union[Rect, None] = None
    _next_tstate: Union[TextState, None] = None

    def __iter__(self) -> Iterator[GlyphObject]:
        """Generate glyphs for this text object.

        If no font is set, a warning is logged and nothing is yielded.

        Note that all glyphs share one copy of the text state, whose
        `glyph_offset` is updated as iteration proceeds.
        """
        tstate = copy(self.textstate)
        font = tstate.font
        # If no font is set, we cannot do anything, since even calling
        # TJ with a displacement and no text effects requires us at
        # least to know the fontsize.
        if font is None:
            log.warning(
                "No font is set, will not update text state or output text: %r TJ",
                self.args,
            )
            self._next_tstate = tstate
            return
        assert self.ctm is not None
        # Extract all the elements so we can translate efficiently
        a, b, c, d, e, f = mult_matrix(tstate.line_matrix, self.ctm)
        # Pre-determine if we need to recompute the bound for rotated glyphs
        corners = b * d < 0 or a * c < 0
        # Apply horizontal scaling
        scaling = tstate.scaling * 0.01
        charspace = tstate.charspace * scaling
        wordspace = tstate.wordspace * scaling
        vert = font.vertical
        # Word spacing is not applied for multibyte fonts
        if font.multibyte:
            wordspace = 0
        (x, y) = tstate.glyph_offset
        # Only the coordinate along the writing direction advances
        pos = y if vert else x
        needcharspace = False  # Only for first glyph
        for obj in self.args:
            if isinstance(obj, (int, float)):
                # Numeric argument: position adjustment in thousandths
                # of the (scaled) font size, subtracted from the position
                dxscale = 0.001 * tstate.fontsize * scaling
                pos -= obj * dxscale
                needcharspace = True
            else:
                for cid, text in font.decode(obj):
                    if needcharspace:
                        pos += charspace
                    textwidth = font.char_width(cid)
                    adv = textwidth * tstate.fontsize * scaling
                    x, y = tstate.glyph_offset = (x, pos) if vert else (pos, y)
                    glyph = GlyphObject(
                        _pageref=self._pageref,
                        gstate=self.gstate,
                        ctm=self.ctm,
                        mcstack=self.mcstack,
                        textstate=tstate,
                        cid=cid,
                        text=text,
                        # Do pre-translation internally (taking rotation into account)
                        matrix=(a, b, c, d, x * a + y * c + e, x * b + y * d + f),
                        adv=adv,
                        corners=corners,
                    )
                    yield glyph
                    pos += adv
                    if cid == 32 and wordspace:
                        pos += wordspace
                    needcharspace = True
        tstate.glyph_offset = (x, pos) if vert else (pos, y)
        if self._next_tstate is None:
            self._next_tstate = tstate

    @property
    def text_space_bbox(self) -> Rect:
        """Text bounding box in text space (i.e. before any possible
        coordinate transformation).

        The result is cached.  If no font is set (a warning is logged)
        or there are no arguments, this is `BBOX_NONE`.  As a side
        effect the text state following this object is recorded, if it
        was not already known.
        """
        if self._text_space_bbox is not None:
            return self._text_space_bbox
        # No need to save tstate as we do not update it below
        tstate = self.textstate
        font = tstate.font
        if font is None:
            log.warning(
                "No font is set, will not update text state or output text: %r TJ",
                self.args,
            )
            self._text_space_bbox = BBOX_NONE
            self._next_tstate = tstate
            return self._text_space_bbox
        if len(self.args) == 0:
            self._text_space_bbox = BBOX_NONE
            self._next_tstate = tstate
            return self._text_space_bbox
        scaling = tstate.scaling * 0.01
        charspace = tstate.charspace * scaling
        wordspace = tstate.wordspace * scaling
        vert = font.vertical
        # Word spacing is not applied for multibyte fonts
        if font.multibyte:
            wordspace = 0
        (x, y) = tstate.glyph_offset
        pos = y if vert else x
        needcharspace = False  # Only for first glyph
        if vert:
            x0 = x1 = x
            y0 = y1 = y
        else:
            # These do not change!
            x0 = x1 = x
            y0 = y + tstate.descent + tstate.rise
            y1 = y0 + tstate.fontsize
        for obj in self.args:
            if isinstance(obj, (int, float)):
                # Numeric argument: position adjustment in thousandths
                # of the (scaled) font size, subtracted from the position
                dxscale = 0.001 * tstate.fontsize * scaling
                pos -= obj * dxscale
                needcharspace = True
            else:
                for cid, _ in font.decode(obj):
                    if needcharspace:
                        pos += charspace
                    textwidth = font.char_width(cid)
                    adv = textwidth * tstate.fontsize * scaling
                    x, y = (x, pos) if vert else (pos, y)
                    if vert:
                        textdisp = font.char_disp(cid)
                        assert isinstance(textdisp, tuple)
                        (vx, vy) = textdisp
                        if vx is None:
                            vx = tstate.fontsize * 0.5
                        else:
                            vx = vx * tstate.fontsize * 0.001
                        vy = (1000 - vy) * tstate.fontsize * 0.001
                        x0 = min(x0, x - vx)
                        y0 = min(y0, y + vy + tstate.rise + adv)
                        x1 = max(x1, x - vx + tstate.fontsize)
                        y1 = max(y1, y + vy + tstate.rise)
                    else:
                        # Horizontal text only ever grows to the right
                        x1 = x + adv
                    pos += adv
                    if cid == 32 and wordspace:
                        pos += wordspace
                    needcharspace = True
        if self._next_tstate is None:
            self._next_tstate = copy(tstate)
            self._next_tstate.glyph_offset = (x, pos) if vert else (pos, y)
        self._text_space_bbox = (x0, y0, x1, y1)
        return self._text_space_bbox

    @property
    def next_textstate(self) -> TextState:
        """Text state in effect after this object.

        If it is not yet known, it is obtained by computing
        `text_space_bbox`, which records it as a side effect.
        """
        if self._next_tstate is not None:
            return self._next_tstate
        _ = self.text_space_bbox
        assert self._next_tstate is not None
        return self._next_tstate

    @property
    def bbox(self) -> Rect:
        """Text bounding box in device space (cached)."""
        # We specialize this to avoid it having side effects on the
        # text state (already it's a bit of a footgun that __iter__
        # does that...), but also because we know all glyphs have the
        # same text matrix and thus we can avoid a lot of multiply
        if self._bbox is not None:
            return self._bbox
        matrix = mult_matrix(self.textstate.line_matrix, self.ctm)
        self._bbox = get_transformed_bound(matrix, self.text_space_bbox)
        return self._bbox

    @property
    def chars(self) -> str:
        """Get the Unicode characters (in stream order) for this object.

        The decoded characters are cached after the first successful
        call.

        Raises:
          AssertionError: if no font is set.
        """
        if self._chars is not None:
            return "".join(self._chars)
        font = self.textstate.font
        assert font is not None, "No font was selected"
        # Collect into a local list and cache it only once decoding has
        # succeeded, so that a failed call cannot leave an empty cache
        # behind that later calls would silently return as "".
        chars: List[str] = []
        for obj in self.args:
            if not isinstance(obj, bytes):
                continue
            chars.extend(text for _, text in font.decode(obj))
        self._chars = chars
        return "".join(chars)

    def __len__(self) -> int:
        """Return the number of glyphs that would result from iterating over
        this object.

        Important: this is the number of glyphs, *not* the number of
        Unicode characters.

        If no font is set, iteration produces no glyphs, so this is 0.
        """
        font = self.textstate.font
        # Must not raise here: `list(obj)` consults `__len__` as a
        # length hint, so an exception would break iteration even
        # though `__iter__` copes with a missing font.
        if font is None:
            return 0
        nglyphs = 0
        for obj in self.args:
            if not isinstance(obj, bytes):
                continue
            nglyphs += sum(1 for _ in font.decode(obj))
        return nglyphs

chars property

Get the Unicode characters (in stream order) for this object.

__iter__()

Generate glyphs for this text object

Source code in playa/page.py
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
def __iter__(self) -> Iterator[GlyphObject]:
    """Generate glyphs for this text object.

    If no font is set, a warning is logged and nothing is yielded.

    Note that all glyphs share one copy of the text state, whose
    `glyph_offset` is updated as iteration proceeds.
    """
    # Work on a copy so this object's own text state is left untouched
    tstate = copy(self.textstate)
    font = tstate.font
    # If no font is set, we cannot do anything, since even calling
    # TJ with a displacement and no text effects requires us at
    # least to know the fontsize.
    if font is None:
        log.warning(
            "No font is set, will not update text state or output text: %r TJ",
            self.args,
        )
        self._next_tstate = tstate
        return
    assert self.ctm is not None
    # Extract all the elements so we can translate efficiently
    a, b, c, d, e, f = mult_matrix(tstate.line_matrix, self.ctm)
    # Pre-determine if we need to recompute the bound for rotated glyphs
    corners = b * d < 0 or a * c < 0
    # Apply horizontal scaling
    scaling = tstate.scaling * 0.01
    charspace = tstate.charspace * scaling
    wordspace = tstate.wordspace * scaling
    vert = font.vertical
    # Word spacing is not applied for multibyte fonts
    if font.multibyte:
        wordspace = 0
    (x, y) = tstate.glyph_offset
    # Only the coordinate along the writing direction advances
    pos = y if vert else x
    needcharspace = False  # Only for first glyph
    for obj in self.args:
        if isinstance(obj, (int, float)):
            # Numeric argument: position adjustment in thousandths of
            # the (scaled) font size, subtracted from the position
            dxscale = 0.001 * tstate.fontsize * scaling
            pos -= obj * dxscale
            needcharspace = True
        else:
            for cid, text in font.decode(obj):
                if needcharspace:
                    pos += charspace
                textwidth = font.char_width(cid)
                adv = textwidth * tstate.fontsize * scaling
                x, y = tstate.glyph_offset = (x, pos) if vert else (pos, y)
                glyph = GlyphObject(
                    _pageref=self._pageref,
                    gstate=self.gstate,
                    ctm=self.ctm,
                    mcstack=self.mcstack,
                    textstate=tstate,
                    cid=cid,
                    text=text,
                    # Do pre-translation internally (taking rotation into account)
                    matrix=(a, b, c, d, x * a + y * c + e, x * b + y * d + f),
                    adv=adv,
                    corners=corners,
                )
                yield glyph
                pos += adv
                # Extra word spacing follows character code 32 only
                if cid == 32 and wordspace:
                    pos += wordspace
                needcharspace = True
    # Record where the next glyph would go; this becomes the following
    # text state unless one was already recorded.
    tstate.glyph_offset = (x, pos) if vert else (pos, y)
    if self._next_tstate is None:
        self._next_tstate = tstate

__len__()

Return the number of glyphs that would result from iterating over this object.

Important: this is the number of glyphs, not the number of Unicode characters.

Source code in playa/page.py
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
def __len__(self) -> int:
    """Return the number of glyphs that would result from iterating over
    this object.

    Important: this is the number of glyphs, *not* the number of
    Unicode characters.

    If no font is set, iteration produces no glyphs, so this is 0.
    """
    font = self.textstate.font
    # Must not raise here: `list(obj)` consults `__len__` as a length
    # hint, so an exception would break iteration even though
    # `__iter__` copes with a missing font.
    if font is None:
        return 0
    nglyphs = 0
    for obj in self.args:
        # Numeric arguments are position adjustments, not glyphs
        if not isinstance(obj, bytes):
            continue
        nglyphs += sum(1 for _ in font.decode(obj))
    return nglyphs

TextState dataclass

PDF Text State (PDF 1.7 section 9.3.1).

Exceptionally, the line matrix and text matrix are represented more compactly with the line matrix itself in line_matrix, which gets translated by glyph_offset for the current glyph (note: expressed in text space units), which pdfminer confusingly called linematrix, to produce the text matrix.

Attributes:

Name Type Description
line_matrix Matrix

The text line matrix, which defines (in user space) the start of the current line of text, which may or may not correspond to an actual line because PDF is a presentation format.

glyph_offset Point

The offset of the current glyph with relation to the line matrix, in text space units.

font Optional[Font]

The current font.

fontsize float

The current font size, in text space units. This is often just 1.0 as it relies on the text matrix (you may use line_matrix here) to scale it to the actual size in user space.

charspace float

Extra spacing to add between each glyph, in text space units.

wordspace float

The width of a space, defined curiously as cid==32 (But PDF Is A prESeNTaTion fORmAT sO ThERe maY NOt Be aNY SpACeS!!), in text space units.

scaling float

The horizontal scaling factor as defined by the PDF standard.

leading float

The leading as defined by the PDF standard.

render_mode int

The PDF rendering mode. The really important one here is 3, which means "don't render the text". You might want to use this to detect invisible text.

rise float

The text rise (superscript or subscript position), in text space units.

descent float

The font's descent (scaled by the font size), in text space units (this is not really part of the text state but is kept here to avoid recomputing it on every glyph)

Source code in playa/page.py
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
@dataclass
class TextState:
    """PDF Text State (PDF 1.7 section 9.3.1).

    Exceptionally, the line matrix and text matrix are represented
    more compactly with the line matrix itself in `line_matrix`, which
    gets translated by `glyph_offset` for the current glyph (note:
    expressed in **text space** units), which pdfminer confusingly
    called `linematrix`, to produce the text matrix.

    Attributes:
      line_matrix: The text line matrix, which defines (in user
        space) the start of the current line of text, which may or may
        not correspond to an actual line because PDF is a presentation
        format.
      glyph_offset: The offset of the current glyph with relation to
        the line matrix, in text space units.
      font: The current font.
      fontsize: The current font size, **in text space units**.
        This is often just 1.0 as it relies on the text matrix (you
        may use `line_matrix` here) to scale it to the actual size in
        user space.
      charspace: Extra spacing to add between each glyph, in
        text space units.
      wordspace: The width of a space, defined curiously as `cid==32`
        (But PDF Is A prESeNTaTion fORmAT sO ThERe maY NOt Be aNY
        SpACeS!!), in text space units.
      scaling: The horizontal scaling factor as defined by the PDF
        standard.
      leading: The leading as defined by the PDF standard.
      render_mode: The PDF rendering mode.  The really important one
        here is 3, which means "don't render the text".  You might
        want to use this to detect invisible text.
      rise: The text rise (superscript or subscript position), in text
        space units.
      descent: The font's descent (scaled by the font size), in text
        space units (this is not really part of the text state but is
        kept here to avoid recomputing it on every glyph)
    """

    line_matrix: Matrix = MATRIX_IDENTITY
    glyph_offset: Point = (0, 0)
    font: Optional[Font] = None
    fontsize: float = 0
    charspace: float = 0
    wordspace: float = 0
    # Horizontal scaling defaults to 100
    scaling: float = 100
    leading: float = 0
    render_mode: int = 0
    rise: float = 0
    descent: float = 0

    def reset(self) -> None:
        """Reset the text state.

        Only the position is reset: `line_matrix` goes back to the
        identity matrix and `glyph_offset` to the origin.  All other
        fields are left as they are.
        """
        self.line_matrix = MATRIX_IDENTITY
        self.glyph_offset = (0, 0)

reset()

Reset the text state

Source code in playa/page.py
569
570
571
572
def reset(self) -> None:
    """Reset the text state.

    Only the position is reset: `glyph_offset` goes back to the origin
    and `line_matrix` to the identity matrix.
    """
    self.glyph_offset = (0, 0)
    self.line_matrix = MATRIX_IDENTITY

XObjectObject dataclass

Bases: ContentObject

An eXternal Object, in the context of a page.

There are a couple of kinds of XObjects. Here we are only concerned with "Form XObjects" which, despite their name, have nothing at all to do with fillable forms. Instead they are like little embeddable PDF pages, possibly with their own resources, definitely with their own definition of "user space".

Image XObjects are handled by ImageObject.

Attributes:

Name Type Description
xobjid str

Name of this XObject (in the page resources).

page Page

Weak reference to containing page.

stream ContentStream

Content stream with PDF operators.

resources Union[None, Dict[str, PDFObject]]

Resources specific to this XObject, if any.

Source code in playa/page.py
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
@dataclass
class XObjectObject(ContentObject):
    """An eXternal Object, in the context of a page.

    Of the several kinds of XObject, only "Form XObjects" are
    represented here.  Despite the name they are unrelated to
    fillable forms: they behave like small embeddable PDF pages,
    possibly with their own resources, and always with their own
    definition of "user space".

    Image XObjects are handled by `ImageObject`.

    Attributes:
      xobjid: Name of this XObject (in the page resources).
      page: Weak reference to containing page.
      stream: Content stream with PDF operators.
      resources: Resources specific to this XObject, if any.
    """

    xobjid: str
    stream: ContentStream
    resources: Union[None, Dict[str, PDFObject]]

    def __contains__(self, name: object) -> bool:
        # Membership is delegated to the underlying stream.
        return name in self.stream

    def __getitem__(self, name: str) -> PDFObject:
        # Item lookup is delegated to the underlying stream.
        return self.stream[name]

    @property
    def page(self) -> Page:
        """Get the page (if it exists, raising RuntimeError if not)."""
        return _deref_page(self._pageref)

    @property
    def bbox(self) -> Rect:
        """Get the bounding box of this XObject in device space."""
        ctm = self.ctm
        # "BBox" is a required entry, so a plain lookup is used.
        form_rect = parse_rect(self.stream["BBox"])
        return get_transformed_bound(ctm, form_rect)

    @property
    def buffer(self) -> bytes:
        """Raw stream content for this XObject"""
        return self.stream.buffer

    @property
    def tokens(self) -> Iterator[Token]:
        """Iterate over tokens in the XObject's content stream."""
        lexer = ContentParser([self.stream])
        while True:
            try:
                _, token = lexer.nexttoken()
            except StopIteration:
                # The parser signals end of input with StopIteration.
                return
            yield token

    @property
    def contents(self) -> Iterator[PDFObject]:
        """Iterator over PDF objects in the content stream."""
        yield from (item for _, item in ContentParser([self.stream]))

    def __iter__(self) -> Iterator["ContentObject"]:
        """Lazily interpret the stream, starting from this object's CTM."""
        interpreter = LazyInterpreter(self.page, [self.stream], self.resources)
        interpreter.ctm = self.ctm
        return iter(interpreter)

bbox property

Get the bounding box of this XObject in device space.

buffer property

Raw stream content for this XObject

contents property

Iterator over PDF objects in the content stream.

page property

Get the page (if it exists, raising RuntimeError if not).

tokens property

Iterate over tokens in the XObject's content stream.

_extract_text_from_obj(obj, vertical)

Try to get text from a text object.

Source code in playa/page.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def _extract_text_from_obj(obj: "TextObject", vertical: bool) -> Tuple[str, float]:
    """Try to get text from a text object."""
    chars = []
    prev_end = 0.0
    for glyph in obj:
        x, y = glyph.textstate.glyph_offset
        off = y if vertical else x
        # FIXME: This is a heuristic!!!
        if prev_end and off - prev_end > 0.5:
            chars.append(" ")
        if glyph.text is not None:
            chars.append(glyph.text)
        prev_end = off + glyph.adv
    return "".join(chars), prev_end

playa.structure

Lazy interface to PDF logical structure (PDF 1.7 sect 14.7).

ContentItem dataclass

Content item in logical structure tree.

This corresponds to an individual marked content section on a specific page, and can be used to (lazily) find that section if desired.

Source code in playa/structure.py
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
@dataclass
class ContentItem:
    """Content item in logical structure tree.

    Each instance stands for one marked content section on a
    particular page, and can be used to (lazily) locate that section
    when needed.
    """

    # Reference to the page holding the marked content section.
    _pageref: PageRef
    # Marked-content ID of the section.
    mcid: int
    # Content stream associated with the item, if any.
    stream: Union[ContentStream, None]

    @property
    def page(self) -> Union["Page", None]:
        """Page referenced by this content item, or None if no page
        reference is set."""
        ref = self._pageref
        return None if ref is None else _deref_page(ref)

page property

Specific page for this structure tree, if any.

ContentObject dataclass

Content object in logical structure tree.

This corresponds to a content item that is an entire PDF (X)Object, and can be used to (lazily) get that object.

Not to be confused with playa.page.ContentObject.

Source code in playa/structure.py
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
@dataclass
class ContentObject:
    """Content object in logical structure tree.

    Each instance stands for a content item that is a whole PDF
    (X)Object, and can be used to (lazily) get that object.

    Not to be confused with `playa.page.ContentObject`.
    """

    # Reference to the page associated with the object.
    _pageref: PageRef
    # Dictionary of the referenced PDF object.
    props: Dict[str, PDFObject]

    @property
    def page(self) -> Union["Page", None]:
        """Page referenced by this content object, or None if no page
        reference is set."""
        ref = self._pageref
        return None if ref is None else _deref_page(ref)

page property

Specific page for this structure tree, if any.

Element dataclass

Bases: Findable

Logical structure element.

Attributes:

Name Type Description
props Dict[str, PDFObject]

Structure element dictionary (PDF 1.7 table 323).

Source code in playa/structure.py
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
@dataclass
class Element(Findable):
    """Logical structure element.

    Iterating over an element yields its children, built lazily from
    the "K" entry of the structure element dictionary.

    Attributes:
      props: Structure element dictionary (PDF 1.7 table 323).
    """

    # Reference to the owning document, dereferenced by the `doc` property.
    _docref: DocumentRef
    props: Dict[str, PDFObject]

    @classmethod
    def from_dict(cls, doc: "Document", obj: Dict[str, PDFObject]) -> "Element":
        """Construct from PDF structure element dictionary."""
        return cls(_docref=_ref_document(doc), props=obj)

    @property
    def type(self) -> str:
        """The "S" entry of the element dictionary, converted with
        `literal_name`.  A missing "S" entry raises KeyError."""
        return literal_name(self.props["S"])

    @property
    def doc(self) -> "Document":
        """Containing document for this element."""
        return _deref_document(self._docref)

    @property
    def page(self) -> Union["Page", None]:
        """Containing page for this element, if any.

        Returns None (after logging a warning where applicable) when
        "Pg" is absent, is not an indirect object reference, or does
        not match a page of the document.
        """
        pg = self.props.get("Pg")
        if pg is None:
            return None
        elif isinstance(pg, ObjRef):
            try:
                return self.doc.pages.by_id(pg.objid)
            except KeyError:
                LOG.warning("'Pg' entry not found in document: %r", self.props)
        else:
            LOG.warning(
                "'Pg' entry is not an indirect object reference: %r", self.props
            )
        # Both warning paths fall through to here.
        return None

    @property
    def parent(self) -> Union["Element", "Tree", None]:
        """Parent of this element, taken from the "P" entry.

        Returns a `Tree` when the parent's "Type" is the structure
        tree root literal, an `Element` otherwise, or None if there is
        no "P" entry.
        """
        p = resolve1(self.props.get("P"))
        if p is None:
            return None
        # NOTE(review): assumes the resolved "P" entry is a dictionary;
        # any other type would fail on `.get` -- confirm.
        if p.get("Type") is LITERAL_STRUCTTREEROOT:
            return Tree(self.doc)
        return Element.from_dict(self.doc, p)

    def __iter__(self) -> Iterator[Union["Element", ContentItem, ContentObject]]:
        """Iterate over children described by the "K" entry (nothing
        is yielded when "K" is absent)."""
        if "K" in self.props:
            kids = resolve1(self.props["K"])
            yield from self._make_kids(kids)

    @functools.singledispatchmethod
    def _make_kids(
        self, k: PDFObject
    ) -> Iterator[Union["Element", ContentItem, ContentObject]]:
        """
        Make a child for this element from its K array.

        K in Element can be (PDF 1.7 Table 323):
        - a structure element (not a content item)
        - an integer marked-content ID
        - a marked-content reference dictionary
        - an object reference dictionary
        - an array of one or more of the above

        This base implementation is the fallback for types with no
        registered handler: it logs a warning and yields nothing.
        """
        LOG.warning("Unrecognized 'K' element: %r", k)
        yield from ()

    @_make_kids.register(list)
    def _make_kids_list(
        self, k: list
    ) -> Iterator[Union["Element", ContentItem, ContentObject]]:
        """Array of kids: resolve each entry and dispatch on it again."""
        for el in k:
            yield from self._make_kids(resolve1(el))

    @_make_kids.register(int)
    def _make_kids_int(self, k: int) -> Iterator[ContentItem]:
        """Integer marked-content ID: yield a `ContentItem` on this
        element's own page, or nothing (with a warning) if it has none."""
        page = self.page
        if page is None:
            LOG.warning("No page found for marked-content reference: %r", k)
            return
        yield ContentItem(_pageref=page.pageref, mcid=k, stream=None)

    @_make_kids.register(dict)
    def _make_kids_dict(
        self, k: Dict[str, PDFObject]
    ) -> Iterator[Union[ContentItem, ContentObject, "Element"]]:
        """Dictionary kid: dispatch on its "Type" entry.

        Marked-content references and object references get their own
        handlers; any other dictionary is wrapped as a child `Element`.
        """
        ktype = k.get("Type")
        if ktype is LITERAL_MCR:
            yield from self._make_kids_mcr(k)
        elif ktype is LITERAL_OBJR:
            yield from self._make_kids_objr(k)
        else:
            # The child shares this element's document reference.
            yield Element(_docref=self._docref, props=k)

    def _make_kids_mcr(self, k: Dict[str, PDFObject]) -> Iterator[ContentItem]:
        """Marked-content reference dictionary: yield a `ContentItem`.

        Nothing is yielded if "MCID" is missing or not an int, or if
        no page can be determined for the reference.
        """
        mcid = resolve1(k.get("MCID"))
        if mcid is None or not isinstance(mcid, int):
            LOG.warning("'MCID' entry is not an int: %r", k)
            return
        stream: Union[ContentStream, None] = None
        pageref = self._get_kid_pageref(k)
        if pageref is None:
            return
        try:
            stream = stream_value(k["Stm"])
        except KeyError:
            # No "Stm" entry: leave `stream` as None.
            pass
        except TypeError:
            LOG.warning("'Stm' entry is not a content stream: %r", k)
        # Do not care about StmOwn, we don't do appearances
        yield ContentItem(_pageref=pageref, mcid=mcid, stream=stream)

    def _make_kids_objr(self, k: Dict[str, PDFObject]) -> Iterator[ContentObject]:
        """Object reference dictionary: yield a `ContentObject`.

        Nothing is yielded if "Obj" is not an indirect reference, does
        not resolve to a dict, or no page can be determined.
        """
        ref = k.get("Obj")
        if not isinstance(ref, ObjRef):
            LOG.warning("'Obj' entry is not an indirect object reference: %r", k)
            return
        obj = ref.resolve()
        if not isinstance(obj, dict):
            LOG.warning("'Obj' entry does not point to a dict: %r", obj)
            return
        pageref = self._get_kid_pageref(k)
        if pageref is None:
            return
        yield ContentObject(_pageref=pageref, props=obj)

    def _get_kid_pageref(self, k: Dict[str, PDFObject]) -> Union[PageRef, None]:
        """Find the page reference for a kid dictionary.

        The kid's own "Pg" entry is preferred; if it is absent or
        unusable, this element's page is used instead.  Returns None
        (with a warning) when neither yields a page.
        """
        pg = k.get("Pg")
        page: Union[Page, None] = None
        if pg is not None:
            if isinstance(pg, ObjRef):
                try:
                    page = self.doc.pages.by_id(pg.objid)
                except KeyError:
                    LOG.warning("'Pg' entry not found in document: %r", k)
                    page = None
            else:
                LOG.warning("'Pg' entry is not an indirect object reference: %r", k)
        if page is None:
            # Fall back to the page of the enclosing element.
            page = self.page
            if page is None:
                LOG.warning("No page found for marked-content reference: %r", k)
                return None
        return page.pageref

doc property

Containing document for this element.

page property

Containing page for this element, if any.

from_dict(doc, obj) classmethod

Construct from PDF structure element dictionary.

Source code in playa/structure.py
156
157
158
159
@classmethod
def from_dict(cls, doc: "Document", obj: Dict[str, PDFObject]) -> "Element":
    """Build an element from a PDF structure element dictionary.

    Args:
        doc: Document the element belongs to.
        obj: Structure element dictionary, stored as `props`.
    """
    docref = _ref_document(doc)
    return cls(_docref=docref, props=obj)

Findable

Bases: Iterable

find() and find_all() methods that can be inherited to avoid repeating oneself

Source code in playa/structure.py
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
class Findable(Iterable):
    """Mixin providing find() and find_all(), so that classes which
    are iterable over their children need not repeat them."""

    def find_all(
        self, matcher: Union[str, Pattern[str], MatchFunc]
    ) -> Iterator["Element"]:
        """Iterate depth-first over matching elements in subtree.

        The `matcher` argument is either an element name, a regular
        expression, or a function taking an `Element` and
        returning `True` if the element matches.
        """
        # The direct children are materialized before the search starts.
        children = list(self)
        return _find_all(children, matcher)

    def find(
        self, matcher: Union[str, Pattern[str], MatchFunc]
    ) -> Union["Element", None]:
        """Find the first matching element in subtree.

        The `matcher` argument is either an element name, a regular
        expression, or a function taking an `Element` and
        returning `True` if the element matches.

        Returns None when nothing matches.
        """
        matches = _find_all(list(self), matcher)
        return next(matches, None)

find(matcher)

Find the first matching element in subtree.

The matcher argument is either an element name, a regular expression, or a function taking an Element and returning True if the element matches.

Source code in playa/structure.py
130
131
132
133
134
135
136
137
138
139
140
141
142
def find(
    self, matcher: Union[str, Pattern[str], MatchFunc]
) -> Union["Element", None]:
    """Find the first matching element in subtree.

    The `matcher` argument is either an element name, a regular
    expression, or a function taking an `Element` and
    returning `True` if the element matches.

    Returns None when nothing matches.
    """
    matches = _find_all(list(self), matcher)
    return next(matches, None)

find_all(matcher)

Iterate depth-first over matching elements in subtree.

The matcher argument is either an element name, a regular expression, or a function taking an Element and returning True if the element matches.

Source code in playa/structure.py
119
120
121
122
123
124
125
126
127
128
def find_all(
    self, matcher: Union[str, Pattern[str], MatchFunc]
) -> Iterator["Element"]:
    """Iterate depth-first over matching elements in subtree.

    The `matcher` argument is either an element name, a regular
    expression, or a function taking an `Element` and
    returning `True` if the element matches.
    """
    # The direct children are materialized before the search starts.
    children = list(self)
    return _find_all(children, matcher)

Tree

Bases: Findable

Source code in playa/structure.py
334
335
336
337
338
339
340
341
342
343
344
345
346
347
class Tree(Findable):
    """Structure tree of a document, iterable over the elements
    produced by `_iter_structure`."""

    # Reference to the owning document, dereferenced by the `doc` property.
    _docref: DocumentRef

    def __init__(self, doc: "Document") -> None:
        self._docref = _ref_document(doc)

    def __iter__(self) -> Iterator[Element]:
        return _iter_structure(self.doc)

    @property
    def doc(self) -> "Document":
        """Document with which this structure tree is associated."""
        return _deref_document(self._docref)

doc property

Document with which this structure tree is associated.

playa.outline

Lazy interface to PDF document outline (PDF 1.7 sect 12.3.3).

Action dataclass

PDF actions (PDF 1.7 sect 12.6)

Source code in playa/outline.py
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
@dataclass
class Action:
    """PDF actions (PDF 1.7 sect 12.6)"""

    # Reference to the owning document, dereferenced by the `doc` property.
    _docref: DocumentRef
    # Action dictionary.
    props: Dict[str, PDFObject]

    @property
    def type(self) -> PSLiteral:
        """The "S" entry of the action dictionary, which must be a
        `PSLiteral`."""
        subtype = self.props["S"]
        assert isinstance(subtype, PSLiteral)
        return subtype

    @property
    def doc(self) -> "Document":
        """Get associated document if it exists."""
        return _deref_document(self._docref)

    @property
    def destination(self) -> Union[Destination, None]:
        """Destination of this action (from its "D" entry), if any."""
        if "D" in self.props:
            return Destination.from_dest(self.doc, resolve1(self.props["D"]))
        return None

destination property

Destination of this action, if any.

doc property

Get associated document if it exists.

Destination dataclass

PDF destinations (PDF 1.7 sect 12.3.2)

Source code in playa/outline.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
@dataclass
class Destination:
    """PDF destinations (PDF 1.7 sect 12.3.2)

    Attributes:
      page_idx: Index of the target page, or None if the page could
        not be identified.
      display: Display type (second entry of the destination array),
        or None if that entry was not a `PSLiteral`.
      coords: Remaining entries of the destination array; entries
        that are not numbers are stored as None.
    """

    # Reference to the owning document.
    _docref: DocumentRef
    page_idx: Union[int, None]
    display: Union[PSLiteral, None]
    coords: Tuple[Union[float, None], ...]

    @classmethod
    def from_dest(
        cls, doc: "Document", dest: Union[PSLiteral, bytes, list]
    ) -> "Destination":
        """Create a destination from a name or an explicit array.

        Args:
            doc: Document containing the destination.
            dest: Either a name (`bytes` or `PSLiteral`), which is
                looked up in `doc.destinations`, or a destination
                array, which is handed to `from_list`.

        Raises:
            TypeError: If `dest` is neither a name nor a list.
        """
        # NOTE(review): the name lookup presumably raises KeyError for
        # unknown names (callers in this file catch it) -- confirm.
        if isinstance(dest, (bytes, PSLiteral)):
            return doc.destinations[dest]
        elif isinstance(dest, list):
            return cls.from_list(doc, dest)
        else:
            # Exceptions do not apply %-style arguments the way logging
            # calls do, so the message is formatted here.
            raise TypeError(f"Unknown destination type: {dest!r}")

    @classmethod
    def from_list(cls, doc: "Document", dest: Sequence) -> "Destination":
        """Create a destination from an explicit destination array.

        Args:
            doc: Document containing the destination.
            dest: Sequence of the form `[page, display, *coords]`; it
                must have at least two entries.

        Problems with individual entries are logged as warnings and
        the corresponding attribute is set to None.
        """
        pageobj, display, *args = dest
        page_idx: Union[int, None] = None
        if isinstance(pageobj, int):
            # Not really sure if this is page number or page index...
            page_idx = pageobj - 1
        elif isinstance(pageobj, ObjRef):
            try:
                page_idx = doc.pages.by_id(pageobj.objid).page_idx
            except KeyError:
                LOG.warning("Invalid page object in destination: %r", pageobj)
        else:
            LOG.warning("Unrecognized page in destination object: %r", pageobj)
        if not isinstance(display, PSLiteral):
            LOG.warning("Unknown display type: %r", display)
            display = None
        coords = tuple(x if isinstance(x, (int, float)) else None for x in args)
        return Destination(
            _docref=_ref_document(doc),
            page_idx=page_idx,
            display=display,
            coords=coords,
        )

Outline

PDF document outline (PDF 1.7 sect 12.3.3)

Source code in playa/outline.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
class Outline:
    """PDF document outline (PDF 1.7 sect 12.3.3)

    Iterating over an outline yields its child items, following the
    "First" / "Next" links of the outline dictionaries.
    """

    # Reference to the owning document, dereferenced by the `doc` property.
    _docref: DocumentRef
    # Outline (or outline item) dictionary.
    props: Dict[str, PDFObject]

    def __init__(self, doc: "Document") -> None:
        self._docref = _ref_document(doc)
        self.props = dict_value(doc.catalog["Outlines"])

    def __iter__(self) -> Iterator["Outline"]:
        """Iterate over child items; nothing is yielded unless both
        "First" and "Last" are present."""
        if "First" not in self.props or "Last" not in self.props:
            return
        ref = self.props["First"]
        while ref is not None:
            if not isinstance(ref, ObjRef):
                LOG.warning("Not an indirect object reference: %r", ref)
                return
            item = self._from_ref(ref)
            ref = item.props.get("Next")
            yield item
            # NOTE(review): this is an identity test on the *next*
            # reference, made before that item is yielded -- confirm
            # whether the "Last" item can be skipped this way.
            if ref is self.props["Last"]:
                return

    def _from_ref(self, ref: ObjRef) -> "Outline":
        """Wrap the dictionary behind `ref` without calling `__init__`."""
        node = Outline.__new__(Outline)
        node._docref = self._docref
        node.props = dict_value(ref)
        return node

    @property
    def doc(self) -> "Document":
        """Get associated document if it exists."""
        return _deref_document(self._docref)

    @property
    def title(self) -> Union[str, None]:
        """Decoded "Title" entry, or None if it is absent or not a
        byte string."""
        raw = resolve1(self.props.get("Title"))
        if raw is None:
            return None
        if isinstance(raw, bytes):
            return decode_text(raw)
        LOG.warning("Title is not a string: %r", raw)
        return None

    @property
    def destination(self) -> Union[Destination, None]:
        """Destination for this outline item.

        Note: Special case of `GoTo` actions.
            Since internal `GoTo` actions (PDF 1.7 sect 12.6.4.2) in
            outlines and links are entirely equivalent to
            destinations, if one exists, it will be returned here as
            well.

        Returns:
            destination, if one exists.
        """
        dest = resolve1(self.props.get("Dest"))
        if dest is None:
            # No explicit destination: fall back to a GoTo action.
            action = self.action
            if action is not None and action.type is ACTION_GOTO:
                return action.destination
            return None
        try:
            return Destination.from_dest(self.doc, dest)
        except KeyError:
            return None

    @property
    def action(self) -> Union[Action, None]:
        """Action from the "A" entry, or None if it is absent or not
        a dictionary."""
        try:
            action_props = dict_value(self.props["A"])
            return Action(self._docref, action_props)
        except (KeyError, TypeError):
            return None

    @property
    def element(self) -> Union[Element, None]:
        """The structure element associated with this outline item, if
        any.

        Returns:
            structure element, if one exists.
        """
        try:
            element_props = dict_value(self.props["SE"])
            return Element.from_dict(self.doc, element_props)
        except (KeyError, TypeError):
            return None

    @property
    def parent(self) -> Union["Outline", None]:
        """Parent outline item from the "Parent" entry, or None if it
        is absent or not an indirect object reference."""
        ref = self.props.get("Parent")
        if ref is None:
            return None
        if isinstance(ref, ObjRef):
            return self._from_ref(ref)
        LOG.warning("Parent is not indirect object reference: %r", ref)
        return None

destination property

Destination for this outline item.

Special case of GoTo actions.

Since internal GoTo actions (PDF 1.7 sect 12.6.4.2) in outlines and links are entirely equivalent to destinations, if one exists, it will be returned here as well.

Returns:

Type Description
Union[Destination, None]

destination, if one exists.

doc property

Get associated document if it exists.

element property

The structure element associated with this outline item, if any.

Returns:

Type Description
Union[Element, None]

structure element, if one exists.

playa.worker

Worker subprocess related functions and data.

in_worker()

Are we currently in a worker process?

Source code in playa/worker.py
25
26
27
def in_worker() -> bool:
    """Report whether the module-level `__pdf` global is set.

    NOTE(review): presumably `__pdf` is only assigned inside worker
    processes -- confirm against the worker initialisation code.
    """
    if __pdf is None:
        return False
    return True