Skip to content
This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

Commit

Permalink
fix: use a lock for extracting text from pdf via mupdf to avoid cgo p…
Browse files Browse the repository at this point in the history
…anic (issue #135) (#136)
  • Loading branch information
iwilltry42 authored Oct 9, 2024
1 parent 41c1e0b commit 678f38a
Showing 1 changed file with 9 additions and 1 deletion.
10 changes: 9 additions & 1 deletion pkg/datastore/documentloader/pdf/mupdf/pdf.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ import (
// Compile time check to ensure PDF satisfies the DocumentLoader interface.
var _ types.DocumentLoader = (*PDF)(nil)

// mupdfLock is a package-level mutex that (*PDF).Load acquires before it
// starts extracting pages and releases after all page workers have finished,
// so only one Load call uses MuPDF at a time. MuPDF is not thread-safe and
// concurrent use can cause a CGO panic; see
// https://github.com/gptscript-ai/knowledge/issues/135
// NOTE(review): the page workers inside a single Load still run concurrently
// (bounded by NumThread) while the lock is held — confirm that is safe.
var mupdfLock sync.Mutex

type PDFOptions struct {
// Password for encrypted PDF files.
Password string
Expand Down Expand Up @@ -88,6 +90,9 @@ func (l *PDF) Load(ctx context.Context) ([]vs.Document, error) {
docs := make([]vs.Document, 0, l.document.NumPage())
numPages := l.document.NumPage()

// Hold the package-level mupdfLock for the whole extraction: MuPDF is not thread-safe, and in some
// edge cases concurrent Load calls can cause a CGO panic. The lock is released after g.Wait() returns.
// See https://github.com/gptscript-ai/knowledge/issues/135
mupdfLock.Lock()
g, childCtx := errgroup.WithContext(ctx)
g.SetLimit(l.opts.NumThread)
for pageNum := 0; pageNum < numPages; pageNum++ {
Expand Down Expand Up @@ -131,7 +136,10 @@ func (l *PDF) Load(ctx context.Context) ([]vs.Document, error) {
})
}

return docs, g.Wait()
err := g.Wait()
mupdfLock.Unlock()

return docs, err
}

// LoadAndSplit loads PDF documents from the provided reader and splits them using the specified text splitter.
Expand Down

0 comments on commit 678f38a

Please sign in to comment.