From 828390ab3a7dd153815f12c184045d4371da23c4 Mon Sep 17 00:00:00 2001 From: David Williams Date: Mon, 24 Jul 2023 16:59:08 +0100 Subject: [PATCH 1/2] Third-party tools can write blocks before the header block of the PDF, this causes validation to fail as the version number cannot be read. FindTrueHeaderPosition() determines the header offset whose value is used to locate XREF entries. --- .../src/org/pdfclown/tokens/FileParser.cs | 58 ++++++++++++++----- .../src/org/pdfclown/tokens/PlainWriter.cs | 5 +- .../src/org/pdfclown/tokens/Reader.cs | 26 ++++----- 3 files changed, 58 insertions(+), 31 deletions(-) diff --git a/dotNET/pdfclown.lib/src/org/pdfclown/tokens/FileParser.cs b/dotNET/pdfclown.lib/src/org/pdfclown/tokens/FileParser.cs index c1f9a69..fe008cc 100644 --- a/dotNET/pdfclown.lib/src/org/pdfclown/tokens/FileParser.cs +++ b/dotNET/pdfclown.lib/src/org/pdfclown/tokens/FileParser.cs @@ -24,22 +24,17 @@ this list of conditions. */ using org.pdfclown.bytes; -using org.pdfclown.documents; -using org.pdfclown.files; using org.pdfclown.objects; using org.pdfclown.util.parsers; using System; -using System.Globalization; -using System.IO; -using System.Text; namespace org.pdfclown.tokens -{ - /** - PDF file parser [PDF:1.7:3.2,3.4]. - */ - public sealed class FileParser +{ + /** + PDF file parser [PDF:1.7:3.2,3.4]. + */ + public sealed class FileParser : BaseParser { #region types @@ -214,11 +209,16 @@ XRefEntry xrefEntry /** Retrieves the PDF version of the file [PDF:1.6:3.4.1]. 
*/ - public string RetrieveVersion( + public string RetrieveVersion(out long headerOffset ) - { - IInputStream stream = Stream; - stream.Seek(0); + { + IInputStream stream = Stream; + stream.Seek(0); + + headerOffset = FindTrueHeaderPosition(); + + stream.Seek(headerOffset); + string header = stream.ReadString(10); if(!header.StartsWith(Keyword.BOF)) throw new PostScriptParseException("PDF header not found.", this); @@ -226,6 +226,36 @@ public string RetrieveVersion( return header.Substring(Keyword.BOF.Length,3); } + /** + Some third-party tools insert a block before the header. + The header should be the first entry in the document, + therefore find the true header here. + Header position within stream. + */ + internal long FindTrueHeaderPosition( + ) + { + IInputStream stream = Stream; + long position = stream.Position; + stream.Seek(0); + + do + { + int read = stream.ReadByte(); + switch (read) + { + case Symbol.Percent: // Comment + long headerOffset = stream.Position - 1; + stream.Seek(position); + return headerOffset; + case -1: // EOF + stream.Seek(position); + return 0; + } + } + while (true); + } + /** Retrieves the starting position of the last xref-table section [PDF:1.6:3.4.4]. */ diff --git a/dotNET/pdfclown.lib/src/org/pdfclown/tokens/PlainWriter.cs b/dotNET/pdfclown.lib/src/org/pdfclown/tokens/PlainWriter.cs index fdf820b..1cc70c2 100644 --- a/dotNET/pdfclown.lib/src/org/pdfclown/tokens/PlainWriter.cs +++ b/dotNET/pdfclown.lib/src/org/pdfclown/tokens/PlainWriter.cs @@ -67,6 +67,7 @@ protected override void WriteIncremental( { // 1. Original content (head, body and previous trailer). FileParser parser = file.Reader.Parser; + long headerOffset = parser.FindTrueHeaderPosition(); stream.Write(parser.Stream); // 2. Body update (modified indirect objects insertion). 
@@ -118,7 +119,7 @@ in file.IndirectObjects.ModifiedObjects AppendXRefEntry( xrefSubBuilder, indirectObjectEntry.Value.Reference, - stream.Length + stream.Length - headerOffset ); // Add in-use entry content! indirectObjectEntry.Value.WriteTo(stream, file); @@ -148,7 +149,7 @@ in file.IndirectObjects.ModifiedObjects } // 3. XRef-table last section. - long startxref = stream.Length; + long startxref = stream.Length - headerOffset; stream.Write(xrefBuilder.ToString()); // 4. Trailer. diff --git a/dotNET/pdfclown.lib/src/org/pdfclown/tokens/Reader.cs b/dotNET/pdfclown.lib/src/org/pdfclown/tokens/Reader.cs index 0194680..435d37c 100644 --- a/dotNET/pdfclown.lib/src/org/pdfclown/tokens/Reader.cs +++ b/dotNET/pdfclown.lib/src/org/pdfclown/tokens/Reader.cs @@ -24,23 +24,18 @@ this list of conditions. */ using org.pdfclown.bytes; -using org.pdfclown.documents; -using org.pdfclown.files; using org.pdfclown.objects; -using org.pdfclown.util.collections.generic; using org.pdfclown.util.parsers; using System; using System.Collections.Generic; -using System.IO; -using System.Linq; namespace org.pdfclown.tokens -{ - /** - PDF file reader. - */ - public sealed class Reader +{ + /** + PDF file reader. + */ + public sealed class Reader : IDisposable { #region types @@ -116,12 +111,13 @@ public FileParser Parser public FileInfo ReadInfo( ) { - //TODO:hybrid xref table/stream - Version version = Version.Get(parser.RetrieveVersion()); + //TODO:hybrid xref table/stream + + Version version = Version.Get(parser.RetrieveVersion(out long headerOffset)); PdfDictionary trailer = null; SortedDictionary xrefEntries = new SortedDictionary(); { - long sectionOffset = parser.RetrieveXRefOffset(); + long sectionOffset = parser.RetrieveXRefOffset() + headerOffset; while(sectionOffset > -1) { // Move to the start of the xref section! @@ -172,7 +168,7 @@ its entries. } // Get the indirect object offset! 
- int offset = (int)parser.GetToken(1); + int offset = (int)parser.GetToken(1) + (int)headerOffset; // Get the object generation number! int generation = (int)parser.GetToken(1); // Get the usage tag! @@ -222,7 +218,7 @@ its entries. // Get the previous xref-table section's offset! PdfInteger prevXRefOffset = (PdfInteger)sectionTrailer[PdfName.Prev]; - sectionOffset = (prevXRefOffset != null ? prevXRefOffset.IntValue : -1); + sectionOffset = (prevXRefOffset != null ? prevXRefOffset.IntValue + headerOffset : -1); } } return new FileInfo(version, trailer, xrefEntries); From edb73900b1e8863dca5216e28a1e9a112f1dcb43 Mon Sep 17 00:00:00 2001 From: David Williams Date: Tue, 25 Jul 2023 10:39:57 +0100 Subject: [PATCH 2/2] Read whole BOF when finding header. --- .../src/org/pdfclown/tokens/FileParser.cs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dotNET/pdfclown.lib/src/org/pdfclown/tokens/FileParser.cs b/dotNET/pdfclown.lib/src/org/pdfclown/tokens/FileParser.cs index fe008cc..33dc5d6 100644 --- a/dotNET/pdfclown.lib/src/org/pdfclown/tokens/FileParser.cs +++ b/dotNET/pdfclown.lib/src/org/pdfclown/tokens/FileParser.cs @@ -28,6 +28,7 @@ this list of conditions. using org.pdfclown.util.parsers; using System; +using System.Text; namespace org.pdfclown.tokens { @@ -246,8 +247,19 @@ internal long FindTrueHeaderPosition( { case Symbol.Percent: // Comment long headerOffset = stream.Position - 1; + + StringBuilder header = new StringBuilder(Keyword.BOF.Length); + header.Append(Convert.ToChar(read)); + header.Append(stream.ReadString(4)); + if (!header.ToString().StartsWith(Keyword.BOF)) + { + continue; + } + + // Header found stream.Seek(position); return headerOffset; + case -1: // EOF stream.Seek(position); return 0;