Skip to content

Commit 0b64266

Browse files
BUG: Fix XMP handling dropping indirect references (#3392)
Closes #3391. According to Table 29 of the PDF 2.0 specification, the Metadata entry in the catalog dictionary should be an indirect reference to the metadata stream. The old code inlined the stream into the catalog dictionary, which made the file unreadable in some applications, such as certain versions of Adobe Acrobat.
1 parent c17f03a commit 0b64266

File tree

4 files changed: 113 additions (+113) and 9 deletions (-9).

pypdf/_writer.py

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -361,10 +361,23 @@ def xmp_metadata(self, value: Optional[XmpInformation]) -> None:
361361
if value is None:
362362
if "/Metadata" in self.root_object:
363363
del self.root_object["/Metadata"]
364+
return
365+
366+
metadata = self.root_object.get("/Metadata", None)
367+
if not isinstance(metadata, IndirectObject):
368+
if metadata is not None:
369+
del self.root_object["/Metadata"]
370+
metadata_stream = StreamObject()
371+
stream_reference = self._add_object(metadata_stream)
372+
self.root_object[NameObject("/Metadata")] = stream_reference
364373
else:
365-
self.root_object[NameObject("/Metadata")] = value
374+
metadata_stream = cast(StreamObject, metadata.get_object())
366375

367-
return self.root_object.xmp_metadata # type: ignore
376+
if isinstance(value, XmpInformation):
377+
bytes_data = value.stream.get_data()
378+
else:
379+
bytes_data = value
380+
metadata_stream.set_data(bytes_data)
368381

369382
@property
370383
def with_as_usage(self) -> bool:

pypdf/generic/_data_structures.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -479,7 +479,7 @@ def __getitem__(self, key: Any) -> PdfObject:
479479
@property
480480
def xmp_metadata(self) -> Optional[XmpInformationProtocol]:
481481
"""
482-
Retrieve XMP (Extensible Metadata Platform) data relevant to the this
482+
Retrieve XMP (Extensible Metadata Platform) data relevant to this
483483
object, if available.
484484
485485
See Table 347 — Additional entries in a metadata stream dictionary.
@@ -497,11 +497,7 @@ def xmp_metadata(self) -> Optional[XmpInformationProtocol]:
497497
return None
498498
assert metadata is not None, "mypy"
499499
metadata = metadata.get_object()
500-
501-
if not isinstance(metadata, XmpInformation):
502-
metadata = XmpInformation(metadata)
503-
self[NameObject("/Metadata")] = metadata
504-
return metadata
500+
return XmpInformation(metadata)
505501

506502
def write_to_stream(
507503
self, stream: StreamType, encryption_key: Union[None, str, bytes] = None

pypdf/xmp.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from xml.parsers.expat import ExpatError
2323

2424
from ._protocols import XmpInformationProtocol
25-
from ._utils import StreamType, deprecate_no_replacement
25+
from ._utils import StreamType, deprecate_no_replacement, deprecate_with_replacement
2626
from .errors import PdfReadError
2727
from .generic import ContentStream, PdfObject
2828

@@ -247,6 +247,11 @@ def __init__(self, stream: ContentStream) -> None:
247247
def write_to_stream(
248248
self, stream: StreamType, encryption_key: Union[None, str, bytes] = None
249249
) -> None:
250+
deprecate_with_replacement(
251+
"XmpInformation.write_to_stream",
252+
"PdfWriter.xmp_metadata",
253+
"6.0.0"
254+
)
250255
if encryption_key is not None: # deprecated
251256
deprecate_no_replacement(
252257
"the encryption_key parameter of write_to_stream", "5.0.0"

tests/test_xmp.py

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99
import pypdf.xmp
1010
from pypdf import PdfReader, PdfWriter
1111
from pypdf.errors import PdfReadError
12+
from pypdf.generic import NameObject, StreamObject
13+
from pypdf.xmp import XmpInformation
1214

1315
from . import get_data_from_url
1416

@@ -325,3 +327,91 @@ def test_dc_language__no_bag_container():
325327

326328
assert reader.xmp_metadata is not None
327329
assert reader.xmp_metadata.dc_language == ["x-unknown"]
330+
331+
332+
def test_reading_does_not_destroy_root_object():
333+
"""Test for #3391."""
334+
writer = PdfWriter(clone_from=RESOURCE_ROOT / "commented-xmp.pdf")
335+
xmp = writer.xmp_metadata
336+
assert xmp is not None
337+
assert not isinstance(writer.root_object["/Metadata"], XmpInformation)
338+
assert isinstance(writer.root_object["/Metadata"].get_object(), StreamObject)
339+
340+
output = BytesIO()
341+
writer.write(output)
342+
output_bytes = output.getvalue()
343+
assert b"\n/Metadata 27 0 R\n" in output_bytes
344+
345+
346+
def test_xmp_information__write_to_stream():
347+
writer = PdfWriter(clone_from=RESOURCE_ROOT / "commented-xmp.pdf")
348+
xmp = writer.xmp_metadata
349+
350+
output = BytesIO()
351+
with pytest.warns(
352+
DeprecationWarning,
353+
match=(
354+
r"^XmpInformation\.write_to_stream is deprecated and will be removed in pypdf 6\.0\.0\. "
355+
r"Use PdfWriter\.xmp_metadata instead\.$"
356+
)
357+
):
358+
xmp.write_to_stream(output)
359+
output_bytes = output.getvalue()
360+
assert output_bytes.startswith(b"<<\n/Type /Metadata\n/Subtype /XML\n/Length 2786\n>>\nstream\n<?xpacket begin")
361+
362+
363+
def test_pdf_writer__xmp_metadata_setter():
364+
# Clear existing metadata.
365+
writer = PdfWriter(clone_from=RESOURCE_ROOT / "commented-xmp.pdf")
366+
assert writer.xmp_metadata is not None
367+
original_metadata = writer.xmp_metadata.stream.get_data()
368+
writer.xmp_metadata = None
369+
output = BytesIO()
370+
writer.write(output)
371+
output_bytes = output.getvalue()
372+
reader = PdfReader(BytesIO(output_bytes))
373+
assert reader.xmp_metadata is None
374+
375+
# Attempt to clear again.
376+
writer = PdfWriter(clone_from=reader)
377+
assert writer.xmp_metadata is None
378+
writer.xmp_metadata = None
379+
output = BytesIO()
380+
writer.write(output)
381+
output_bytes = output.getvalue()
382+
reader = PdfReader(BytesIO(output_bytes))
383+
assert reader.xmp_metadata is None
384+
385+
# Set new metadata from bytes.
386+
writer = PdfWriter(clone_from=reader)
387+
assert writer.xmp_metadata is None
388+
writer.xmp_metadata = original_metadata
389+
output = BytesIO()
390+
writer.write(output)
391+
output_bytes = output.getvalue()
392+
reader = PdfReader(BytesIO(output_bytes))
393+
assert get_all_tiff(reader.xmp_metadata) == {"tiff:Artist": ["me"]}
394+
395+
# Set metadata from XmpInformation.
396+
writer = PdfWriter(clone_from=reader)
397+
xmp_metadata = writer.xmp_metadata
398+
assert get_all_tiff(xmp_metadata) == {"tiff:Artist": ["me"]}
399+
new_metadata = original_metadata.replace(b"<tiff:Artist>me</tiff:Artist>", b"<tiff:Artist>Foo Bar</tiff:Artist>")
400+
xmp_metadata.stream.set_data(new_metadata)
401+
output = BytesIO()
402+
writer.write(output)
403+
output_bytes = output.getvalue()
404+
reader = PdfReader(BytesIO(output_bytes))
405+
assert get_all_tiff(reader.xmp_metadata) == {"tiff:Artist": ["Foo Bar"]}
406+
407+
# Fix metadata not being an IndirectObject before.
408+
writer = PdfWriter(clone_from=RESOURCE_ROOT / "commented-xmp.pdf")
409+
writer.root_object[NameObject("/Metadata")] = writer.root_object["/Metadata"].get_object()
410+
assert "/XML" in str(writer.root_object)
411+
writer.xmp_metadata = new_metadata
412+
output = BytesIO()
413+
writer.write(output)
414+
output_bytes = output.getvalue()
415+
reader = PdfReader(BytesIO(output_bytes))
416+
assert get_all_tiff(reader.xmp_metadata) == {"tiff:Artist": ["Foo Bar"]}
417+
assert "/XML" not in str(writer.root_object)

0 commit comments

Comments
 (0)