Merge pull request #255 from kellemNegasi/udd-35-concurrent-extraction-example

[UDD-35] Concurrent Document Extraction Example
Showing 2 changed files with 149 additions and 0 deletions.
@@ -0,0 +1,7 @@
# Concurrent processing

UniPDF supports document-level concurrent processing, meaning each document is processed separately in its own goroutine. For now, concurrency is supported at the document level only; page-level concurrency in UniPDF is not yet safe. A minimal sketch of this pattern is shown after the example list below.

## Examples

- [concurrent_extraction.go](concurrent_extraction.go) concurrently extracts text from multiple documents passed as command-line arguments and saves each result to a text file.
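The following is a minimal, self-contained sketch of that pattern: one goroutine per input file, with results collected over a buffered channel. The file names and the `extractText` helper are placeholders, not part of the example; the real extraction logic lives in [concurrent_extraction.go](concurrent_extraction.go).

```go
package main

import "fmt"

// extractText stands in for the UniPDF extraction performed in concurrent_extraction.go.
func extractText(path string) string {
	return "text of " + path
}

func main() {
	files := []string{"a.pdf", "b.pdf", "c.pdf"}

	// Buffer the channel so every goroutine can send its result without blocking.
	results := make(chan string, len(files))

	for _, f := range files {
		go func(p string) {
			// Each document is processed in its own goroutine.
			results <- extractText(p)
		}(f)
	}

	// Receive exactly one result per input file.
	for range files {
		fmt.Println(<-results)
	}
}
```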
@@ -0,0 +1,142 @@
/*
 * This example demonstrates how to extract content from multiple documents concurrently,
 * with each document extraction running in its own goroutine.
 * N.B. Concurrency is currently supported at the document level, which means each goroutine
 * extracts exactly one document.
 *
 * Run as: go run concurrent_extraction.go <input1.pdf> <input2.pdf> <input3.pdf>... <output_dir>
 */

package main

import (
	"fmt"
	"io/fs"
	"os"
	"path/filepath"
	"strings"
	"time"

	"github.com/unidoc/unipdf/v3/common/license"
	"github.com/unidoc/unipdf/v3/extractor"
	"github.com/unidoc/unipdf/v3/model"
)

func init() {
	// Make sure to load your metered License API key prior to using the library.
	// If you need a key, you can sign up and create a free one at https://cloud.unidoc.io
	err := license.SetMeteredKey(os.Getenv(`UNIDOC_LICENSE_API_KEY`))
	if err != nil {
		panic(err)
	}
}

func main() {
	if len(os.Args) < 3 {
		fmt.Printf("Usage: go run concurrent_extraction.go input1.pdf input2.pdf input3.pdf... output_dir\n")
		os.Exit(1)
	}

	// The last argument is the output directory; everything before it is an input document.
	inputDocuments := []string{}
	args := os.Args
	var outputDir string
	for i := 1; i < len(args); i++ {
		if i == len(args)-1 {
			outputDir = args[i]
		} else {
			inputDocuments = append(inputDocuments, args[i])
		}
	}

	start := time.Now()
	runConcurrent(inputDocuments, outputDir)
	duration := time.Since(start)
	fmt.Println("time taken for concurrent extraction", duration)
}

// runConcurrent takes the list of input documents and the destination output directory
// and runs the extraction concurrently, writing one text file per input document.
func runConcurrent(documents []string, outputDir string) {
	// Buffer the channel so every extraction goroutine can send without blocking.
	res := make(chan map[string]string, len(documents))

	err := concurrentExtraction(documents, res)
	if err != nil {
		fmt.Printf("Error. extraction failed %v.\n", err)
	}

	outputPath := outputDir
	if _, err := os.Stat(outputPath); err != nil {
		if os.IsNotExist(err) {
			err := os.Mkdir(outputPath, fs.ModePerm)
			if err != nil {
				fmt.Printf("Error. failed to create directory %s\n", outputPath)
			}
		}
	}

	// Receive exactly one result per input document and write it to disk.
	for i := 0; i < len(documents); i++ {
		result := <-res
		for path, content := range result {
			basename := filepath.Base(path)
			fileName := strings.TrimSuffix(basename, filepath.Ext(basename)) + ".txt"
			filePath := filepath.Join(outputPath, fileName)
			file, err := os.Create(filePath)
			if err != nil {
				fmt.Printf("Error. failed to create file. %v\n", err)
				continue
			}
			_, err = file.WriteString(content)
			if err != nil {
				fmt.Printf("Error. failed to write content. %v\n", err)
			}
			file.Close()
		}
	}
}

// concurrentExtraction launches a goroutine for each document in `documents` and writes the
// result of each extraction to the channel `res`, keyed by the document path.
func concurrentExtraction(documents []string, res chan map[string]string) error {
	for _, docPath := range documents {
		go func(path string, res chan map[string]string) {
			result, err := extractSingleDoc(path)
			if err != nil {
				fmt.Printf("Error. Failed to extract file %v. %v\n", path, err)
			}
			// Always send a result so the receiver gets one message per document.
			res <- map[string]string{
				path: result,
			}
		}(docPath, res)
	}

	return nil
}

// extractSingleDoc takes a single file specified by `filePath` and returns the extracted text.
func extractSingleDoc(filePath string) (string, error) {
	pdfReader, f, err := model.NewPdfReaderFromFile(filePath, nil)
	if err != nil {
		return "", fmt.Errorf("failed to create pdf reader: %w", err)
	}
	// Close the underlying file once extraction for this document is done.
	defer f.Close()

	pages, err := pdfReader.GetNumPages()
	if err != nil {
		return "", err
	}

	var result string
	for i := 0; i < pages; i++ {
		pageNum := i + 1
		page, err := pdfReader.GetPage(pageNum)
		if err != nil {
			return "", fmt.Errorf("failed to get page %d: %w", pageNum, err)
		}

		ex, err := extractor.New(page)
		if err != nil {
			return "", fmt.Errorf("failed to create extractor: %w", err)
		}

		text, err := ex.ExtractText()
		if err != nil {
			return "", fmt.Errorf("failed to extract text: %w", err)
		}
		result += text
	}

	return result, nil
}