Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

FindTrueHeaderPosition: Determine header offset to locate XREF entries. #11

Merged
merged 2 commits into from
Jul 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
70 changes: 56 additions & 14 deletions dotNET/pdfclown.lib/src/org/pdfclown/tokens/FileParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,22 +24,18 @@ this list of conditions.
*/

using org.pdfclown.bytes;
using org.pdfclown.documents;
using org.pdfclown.files;
using org.pdfclown.objects;
using org.pdfclown.util.parsers;

using System;
using System.Globalization;
using System.IO;
using System.Text;
using System.Text;

namespace org.pdfclown.tokens
{
/**
<summary>PDF file parser [PDF:1.7:3.2,3.4].</summary>
*/
public sealed class FileParser
{
/**
<summary>PDF file parser [PDF:1.7:3.2,3.4].</summary>
*/
public sealed class FileParser
: BaseParser
{
#region types
Expand Down Expand Up @@ -214,18 +210,64 @@ XRefEntry xrefEntry
/**
<summary>Retrieves the PDF version of the file [PDF:1.6:3.4.1].</summary>
*/
public string RetrieveVersion(
public string RetrieveVersion(out long headerOffset
)
{
IInputStream stream = Stream;
stream.Seek(0);
{
IInputStream stream = Stream;
stream.Seek(0);

headerOffset = FindTrueHeaderPosition();

stream.Seek(headerOffset);

string header = stream.ReadString(10);
if(!header.StartsWith(Keyword.BOF))
throw new PostScriptParseException("PDF header not found.", this);

return header.Substring(Keyword.BOF.Length,3);
}

/**
  <summary>Locates the true PDF header within the stream.</summary>
  <remarks>Some third-party tools insert a block of data before the header. The header should
  be the first entry in the document, so the position found here can be used by callers to
  compensate for the extra leading bytes. The stream position held on entry is restored
  before returning.</remarks>
  <returns>Header position within the stream, or 0 if no header could be found.</returns>
*/
internal long FindTrueHeaderPosition(
  )
{
  IInputStream stream = Stream;
  long position = stream.Position;
  stream.Seek(0);

  do
  {
    int read = stream.ReadByte();
    switch (read)
    {
      case Symbol.Percent: // Candidate header start (also the comment marker).
        long headerOffset = stream.Position - 1;

        // Look ahead for the remainder of the header keyword.
        // NOTE(review): behavior of ReadString when fewer bytes remain than requested is
        // implementation-specific -- confirm it doesn't throw on a trailing '%'.
        StringBuilder header = new StringBuilder(Keyword.BOF.Length);
        header.Append(Convert.ToChar(read));
        header.Append(stream.ReadString(Keyword.BOF.Length - 1));
        if (!header.ToString().StartsWith(Keyword.BOF, StringComparison.Ordinal))
        {
          // The look-ahead consumed bytes that may themselves begin the real header
          // (e.g. "%%PDF-"): resume scanning right after the rejected '%' so that no
          // candidate position is skipped.
          stream.Seek(headerOffset + 1);
          continue;
        }

        // Header found.
        stream.Seek(position);
        return headerOffset;

      case -1: // EOF: no header in the stream.
        stream.Seek(position);
        return 0;
    }
  }
  while (true);
}

/**
<summary>Retrieves the starting position of the last xref-table section [PDF:1.6:3.4.4].</summary>
*/
Expand Down
5 changes: 3 additions & 2 deletions dotNET/pdfclown.lib/src/org/pdfclown/tokens/PlainWriter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ protected override void WriteIncremental(
{
// 1. Original content (head, body and previous trailer).
FileParser parser = file.Reader.Parser;
long headerOffset = parser.FindTrueHeaderPosition();
stream.Write(parser.Stream);

// 2. Body update (modified indirect objects insertion).
Expand Down Expand Up @@ -118,7 +119,7 @@ in file.IndirectObjects.ModifiedObjects
AppendXRefEntry(
xrefSubBuilder,
indirectObjectEntry.Value.Reference,
stream.Length
stream.Length - headerOffset
);
// Add in-use entry content!
indirectObjectEntry.Value.WriteTo(stream, file);
Expand Down Expand Up @@ -148,7 +149,7 @@ in file.IndirectObjects.ModifiedObjects
}

// 3. XRef-table last section.
long startxref = stream.Length;
long startxref = stream.Length - headerOffset;
stream.Write(xrefBuilder.ToString());

// 4. Trailer.
Expand Down
26 changes: 11 additions & 15 deletions dotNET/pdfclown.lib/src/org/pdfclown/tokens/Reader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -24,23 +24,18 @@ this list of conditions.
*/

using org.pdfclown.bytes;
using org.pdfclown.documents;
using org.pdfclown.files;
using org.pdfclown.objects;
using org.pdfclown.util.collections.generic;
using org.pdfclown.util.parsers;

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace org.pdfclown.tokens
{
/**
<summary>PDF file reader.</summary>
*/
public sealed class Reader
{
/**
<summary>PDF file reader.</summary>
*/
public sealed class Reader
: IDisposable
{
#region types
Expand Down Expand Up @@ -116,12 +111,13 @@ public FileParser Parser
public FileInfo ReadInfo(
)
{
//TODO:hybrid xref table/stream
Version version = Version.Get(parser.RetrieveVersion());
//TODO:hybrid xref table/stream

Version version = Version.Get(parser.RetrieveVersion(out long headerOffset));
PdfDictionary trailer = null;
SortedDictionary<int,XRefEntry> xrefEntries = new SortedDictionary<int,XRefEntry>();
{
long sectionOffset = parser.RetrieveXRefOffset();
long sectionOffset = parser.RetrieveXRefOffset() + headerOffset;
while(sectionOffset > -1)
{
// Move to the start of the xref section!
Expand Down Expand Up @@ -172,7 +168,7 @@ its entries.
}

// Get the indirect object offset!
int offset = (int)parser.GetToken(1);
int offset = (int)parser.GetToken(1) + (int)headerOffset;
// Get the object generation number!
int generation = (int)parser.GetToken(1);
// Get the usage tag!
Expand Down Expand Up @@ -222,7 +218,7 @@ its entries.

// Get the previous xref-table section's offset!
PdfInteger prevXRefOffset = (PdfInteger)sectionTrailer[PdfName.Prev];
sectionOffset = (prevXRefOffset != null ? prevXRefOffset.IntValue : -1);
sectionOffset = (prevXRefOffset != null ? prevXRefOffset.IntValue + headerOffset : -1);
}
}
return new FileInfo(version, trailer, xrefEntries);
Expand Down