-
Notifications
You must be signed in to change notification settings - Fork 101
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
added concurrent document extraction example
- Loading branch information
1 parent
a16419c
commit 2396e39
Showing
1 changed file
with
130 additions
and
0 deletions.
There are no files selected for viewing
130 changes: 130 additions & 0 deletions
130
concurrent-processing/extract_from_multiple_documents.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
package main | ||
|
||
import ( | ||
"fmt" | ||
"io/fs" | ||
"os" | ||
"path/filepath" | ||
"strings" | ||
"time" | ||
|
||
"github.com/unidoc/unipdf/v3/common/license" | ||
"github.com/unidoc/unipdf/v3/extractor" | ||
"github.com/unidoc/unipdf/v3/model" | ||
) | ||
|
||
func init() { | ||
// Make sure to load your metered License API key prior to using the library. | ||
// If you need a key, you can sign up and create a free one at https://cloud.unidoc.io | ||
err := license.SetMeteredKey(os.Getenv(`UNIDOC_LICENSE_API_KEY`)) | ||
if err != nil { | ||
panic(err) | ||
} | ||
} | ||
|
||
func main() { | ||
if len(os.Args) < 2 { | ||
fmt.Printf("Usage: go run pdf_extract_text.go input1.pdf input2.pdf input3.pdf ... output_dir\n") | ||
os.Exit(1) | ||
} | ||
inputDocuments := []string{} | ||
args := os.Args | ||
var outputDir string | ||
for i := 1; i < len(args); i++ { | ||
if i == len(args)-1 { | ||
outputDir = args[i] | ||
} else { | ||
inputDocuments[i] = args[i] | ||
} | ||
} | ||
|
||
start := time.Now() | ||
runConcurrent(inputDocuments, outputDir) | ||
duration := time.Since(start) | ||
fmt.Println("time taken for concurrent extraction", duration) | ||
} | ||
|
||
func runConcurrent(documents []string, outputDir string) { | ||
res := make(chan map[string]string, 3) | ||
|
||
err := concurrentExtraction(documents, res) | ||
if err != nil { | ||
panic(err) | ||
} | ||
outputPath := outputDir | ||
if _, err := os.Stat(outputPath); err != nil { | ||
if os.IsNotExist(err) { | ||
err := os.Mkdir(outputPath, fs.ModePerm) | ||
if err != nil { | ||
fmt.Printf("Error. failed to create directory %s", outputPath) | ||
} | ||
} | ||
} | ||
for i := 0; i < len(documents); i++ { | ||
result := <-res | ||
for path, content := range result { | ||
basename := filepath.Base(path) | ||
fileName := strings.TrimSuffix(basename, filepath.Ext(basename)) + ".txt" | ||
|
||
file, err := os.Create(outputPath + fileName) | ||
if err != nil { | ||
fmt.Printf("Error. failed to create file. %v", err) | ||
} | ||
_, err = file.WriteString(content) | ||
if err != nil { | ||
fmt.Printf("Error. failed to write content. %v", err) | ||
} | ||
} | ||
} | ||
|
||
} | ||
|
||
func concurrentExtraction(documents []string, res chan map[string]string) error { | ||
|
||
for _, docPath := range documents { | ||
filePath := docPath | ||
go func(path string, res chan map[string]string) { | ||
result, err := extractSingleDoc(path) | ||
if err != nil { | ||
fmt.Printf("Error. Failed to extract file %v due to %v", filePath, err) | ||
} | ||
temp := map[string]string{ | ||
filePath: result, | ||
} | ||
res <- temp | ||
}(filePath, res) | ||
} | ||
return nil | ||
} | ||
|
||
func extractSingleDoc(filePath string) (string, error) { | ||
pdfReader, _, err := model.NewPdfReaderFromFile(filePath, nil) | ||
if err != nil { | ||
return "", fmt.Errorf("failed to create pdf reader: %w", err) | ||
} | ||
pages, err := pdfReader.GetNumPages() | ||
if err != nil { | ||
return "", err | ||
} | ||
var result string | ||
for i := 0; i < pages; i++ { | ||
pageNum := i + 1 | ||
page, err := pdfReader.GetPage(pageNum) | ||
if err != nil { | ||
return "", fmt.Errorf("failed to get page %d: %w", pageNum, err) | ||
} | ||
|
||
ex, err := extractor.New(page) | ||
if err != nil { | ||
return "", fmt.Errorf("failed to create extractor: %w", err) | ||
} | ||
|
||
text, err := ex.ExtractText() | ||
if err != nil { | ||
return "", fmt.Errorf("failed to extract text: %w", err) | ||
} | ||
result += text | ||
} | ||
|
||
return result, nil | ||
} |