Skip to content

Commit

Permalink
PDFBOX-5895: support Markdown extraction from the command line
Browse files: browse the repository at this point in the history
git-svn-id: https://svn.apache.org/repos/asf/pdfbox/trunk@1921715 13f79535-47bb-0310-9956-ffa450edef68
  • Loading branch information
THausherr committed Oct 31, 2024
1 parent 25e3359 commit 0c11548
Showing 1 changed file with 42 additions and 5 deletions.
47 changes: 42 additions & 5 deletions tools/src/main/java/org/apache/pdfbox/tools/ExtractText.java
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,9 @@ public final class ExtractText implements Callable<Integer>
@Option(names = "-html", description = "Output in HTML format instead of raw text")
private boolean toHTML = false;

@Option(names = "-md", description = "Output in Markdown format instead of raw text")
private boolean toMD = false;

@Option(names = "-ignoreBeads", description = "Disables the separation by beads")
private boolean ignoreBeads = false;

Expand Down Expand Up @@ -148,7 +151,13 @@ public static void main( String[] args )
public Integer call()
{
// set file extension
if (toHTML && toMD)
{
SYSERR.println( "You can't set md and html at the same time");
return 1;
}
String ext = toHTML ? ".html" : ".txt";
ext = toMD ? ".md" : ext;

if (outfile == null)
{
Expand All @@ -175,7 +184,8 @@ public Integer call()
AccessPermission ap = document.getCurrentAccessPermission();
if( ! ap.canExtractContent() )
{
SYSERR.println( "You do not have permission to extract text");
System.out.println( "You do not have permission to extract text" );
//SYSERR.println( "You do not have permission to extract text");
return 1;
}

Expand Down Expand Up @@ -209,13 +219,27 @@ public Integer call()
}
else
{
if (rotationMagic)
if (toMD)
{
stripper = new FilteredTextStripper();
if (rotationMagic)
{
stripper = new FilteredText2Markdown();
}
else
{
stripper = new PDFText2Markdown();
}
}
else
{
stripper = new PDFTextStripper();
if (rotationMagic)
{
stripper = new FilteredTextStripper();
}
else
{
stripper = new PDFTextStripper();
}
}
stripper.setSortByPosition(sort);
stripper.setShouldSeparateByBeads(!ignoreBeads);
Expand Down Expand Up @@ -306,6 +330,7 @@ private void extractPages(int startPage, int endPage,
{
for (int p = startPage; p <= endPage; ++p)
{
//System.err.println("page " + p);
stripper.setStartPage(p);
stripper.setEndPage(p);
try
Expand Down Expand Up @@ -414,10 +439,22 @@ protected void processTextPosition(TextPosition text)
*/
class FilteredTextStripper extends PDFTextStripper
{
FilteredTextStripper() throws IOException
@Override
protected void processTextPosition(TextPosition text)
{
int angle = ExtractText.getAngle(text);
if (angle == 0)
{
super.processTextPosition(text);
}
}
}

/**
* PDFText2Markdown that only processes glyphs that have angle 0.
*/
class FilteredText2Markdown extends PDFText2Markdown
{
@Override
protected void processTextPosition(TextPosition text)
{
Expand Down

0 comments on commit 0c11548

Please sign in to comment.