Skip to content
This repository has been archived by the owner on Oct 30, 2024. It is now read-only.

Feat: Improve ingestion speed by running more routinues #25

Merged
merged 1 commit into from
Jun 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 14 additions & 14 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ go 1.22.0

replace (
github.com/philippgille/chromem-go => github.com/iwilltry42/chromem-go v0.0.0-20240513080122-88f1efa639f5 // Azure OpenAI support
github.com/tmc/langchaingo => github.com/iwilltry42/langchaingo v0.0.0-20240516095223-8cf46ac74799 // Context-Aware Markdown Splitting
github.com/tmc/langchaingo => github.com/StrongMonkey/langchaingo v0.0.0-20240617180437-9af4bee04c8b // Context-Aware Markdown Splitting
)

require (
Expand Down Expand Up @@ -36,7 +36,7 @@ require (
)

require (
cloud.google.com/go/ai v0.4.0 // indirect
cloud.google.com/go/ai v0.5.0 // indirect
github.com/AssemblyAI/assemblyai-go-sdk v1.3.0 // indirect
github.com/EndFirstCorp/peekingReader v0.0.0-20171012052444-257fb6f1a1a6 // indirect
github.com/JalfResi/justext v0.0.0-20170829062021-c0282dea7198 // indirect
Expand All @@ -47,13 +47,13 @@ require (
github.com/andybalholm/cascadia v1.3.2 // indirect
github.com/araddon/dateparse v0.0.0-20200409225146-d820a6159ab1 // indirect
github.com/avast/retry-go v3.0.0+incompatible // indirect
github.com/aws/aws-sdk-go-v2 v1.26.0 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.1 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.4 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.4 // indirect
github.com/aws/aws-sdk-go-v2/service/bedrockruntime v1.7.3 // indirect
github.com/aws/aws-sdk-go-v2 v1.26.1 // indirect
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.2 // indirect
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.5 // indirect
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.5 // indirect
github.com/aws/aws-sdk-go-v2/service/bedrockruntime v1.8.1 // indirect
github.com/aws/aws-sdk-go-v2/service/textract v1.30.4 // indirect
github.com/aws/smithy-go v1.20.1 // indirect
github.com/aws/smithy-go v1.20.2 // indirect
github.com/aymerick/douceur v0.2.0 // indirect
github.com/bytedance/sonic v1.11.3 // indirect
github.com/cenkalti/backoff v2.2.1+incompatible // indirect
Expand Down Expand Up @@ -98,7 +98,7 @@ require (
github.com/josharian/intern v1.0.0 // indirect
github.com/json-iterator/go v1.1.12 // indirect
github.com/kevinburke/ssh_config v1.2.0 // indirect
github.com/klauspost/compress v1.17.2 // indirect
github.com/klauspost/compress v1.17.6 // indirect
github.com/klauspost/cpuid/v2 v2.2.7 // indirect
github.com/ledongthuc/pdf v0.0.0-20240201131950-da5b75280b06 // indirect
github.com/leodido/go-urn v1.4.0 // indirect
Expand Down Expand Up @@ -144,11 +144,11 @@ require (
golang.org/x/sys v0.20.0 // indirect
golang.org/x/text v0.15.0 // indirect
golang.org/x/tools v0.20.0 // indirect
google.golang.org/api v0.176.1 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240415180920-8c6c420018be // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240415180920-8c6c420018be // indirect
google.golang.org/grpc v1.63.2 // indirect
google.golang.org/protobuf v1.33.0 // indirect
google.golang.org/api v0.180.0 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20240506185236-b8a5c65736ae // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20240506185236-b8a5c65736ae // indirect
google.golang.org/grpc v1.64.0 // indirect
google.golang.org/protobuf v1.34.1 // indirect
gopkg.in/warnings.v0 v0.1.2 // indirect
gopkg.in/yaml.v3 v3.0.1 // indirect
modernc.org/libc v1.22.5 // indirect
Expand Down
70 changes: 35 additions & 35 deletions go.sum
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
cloud.google.com/go v0.112.1 h1:uJSeirPke5UNZHIb4SxfZklVSiWWVqW4oXlETwZziwM=
cloud.google.com/go/ai v0.4.0 h1:hoF8+joXKfW2Ug7MKssoffXCVUSxUqMUJL0hJxVtO1Q=
cloud.google.com/go/ai v0.4.0/go.mod h1:iX72tmUodGXVDxRDCGUZEPiB9HaMeERXkOdgCkUi8sA=
cloud.google.com/go/aiplatform v1.66.0 h1:bbFYY4JInclG10czRFUYj2rjD+obhh3Gi9zVlyoMgEc=
cloud.google.com/go/aiplatform v1.66.0/go.mod h1:bPQS0UjaXaTAq57UgP3XWDCtYFOIbXXpkMsl6uP4JAc=
cloud.google.com/go/longrunning v0.5.6 h1:xAe8+0YaWoCKr9t1+aWe+OeQgN/iJK1fEgZSXmjuEaE=
cloud.google.com/go/longrunning v0.5.6/go.mod h1:vUaDrWYOMKRuhiv6JBnn49YxCPz2Ayn9GqyjaBT8/mA=
cloud.google.com/go v0.113.0 h1:g3C70mn3lWfckKBiCVsAshabrDg01pQ0pnX1MNtnMkA=
cloud.google.com/go/ai v0.5.0 h1:x8s4rDn5t9OVZvBCgtr5bZTH5X0O7JdE6zYo+O+MpRw=
cloud.google.com/go/ai v0.5.0/go.mod h1:96VBphk70e0zdXZrbtgPuKYRZsQ3UktSUXhuojwiKA8=
cloud.google.com/go/aiplatform v1.67.0 h1:YWeqD4BjYwrmY4fa+isGcw0P81lJ3dKVxbWxdBchoiU=
cloud.google.com/go/aiplatform v1.67.0/go.mod h1:s/sJ6btBEr6bKnrNWdK9ZgHCvwbZNdP90b3DDtxxw+Y=
cloud.google.com/go/longrunning v0.5.7 h1:WLbHekDbjK1fVFD3ibpFFVoyizlLRl73I7YKuAKilhU=
cloud.google.com/go/longrunning v0.5.7/go.mod h1:8GClkudohy1Fxm3owmBGid8W0pSgodEMwEAztp38Xng=
code.sajari.com/docconv/v2 v2.0.0-pre.4 h1:1yQrSTah9rMSC/s1T9bq2H2j1NuRTppeApqZf2A8Zbc=
code.sajari.com/docconv/v2 v2.0.0-pre.4/go.mod h1:+pfeEYCOA46E5fq44sh1OKEkO9hsptg8XRioeP1vvPg=
dario.cat/mergo v1.0.0 h1:AGCNq9Evsj31mOgNPcLyXc+4PNABt905YmuqPYYpBWk=
Expand All @@ -27,6 +27,8 @@ github.com/ProtonMail/go-crypto v1.0.0/go.mod h1:EjAoLdwvbIOoOQr3ihjnSoLZRtE8azu
github.com/PuerkitoBio/goquery v1.4.1/go.mod h1:T9ezsOHcCrDCgA8aF1Cqr3sSYbO/xgdy8/R/XiIMAhA=
github.com/PuerkitoBio/goquery v1.9.2 h1:4/wZksC3KgkQw7SQgkKotmKljk0M6V8TUvA8Wb4yPeE=
github.com/PuerkitoBio/goquery v1.9.2/go.mod h1:GHPCaP0ODyyxqcNoFGYlAprUFH81NuRPd0GX3Zu2Mvk=
github.com/StrongMonkey/langchaingo v0.0.0-20240617180437-9af4bee04c8b h1:xY/6vmYx7H3ZM2Apa926VvmYrzL228unIElCDYrhjEo=
github.com/StrongMonkey/langchaingo v0.0.0-20240617180437-9af4bee04c8b/go.mod h1:MFQg4CUOwjT5VTYjorSmXHhQju6XqdJkSbdpBcZf7SQ=
github.com/acorn-io/cmd v0.0.0-20240404013709-34f690bde37b h1:VzGEGrJn54UcsEvTqcpJj9USv7vc6TIxQGZVS0Ff304=
github.com/acorn-io/cmd v0.0.0-20240404013709-34f690bde37b/go.mod h1:9jrYuzTJCv6QgGKl5gbhKqhG3kke31PmUE2KruBHzpg=
github.com/acorn-io/z v0.0.0-20231104012607-4cab1b3ec5e5 h1:oQnpRt5KoANqwwUNzWFu+5I12Unfu/WZ330QHefxNc8=
Expand All @@ -47,22 +49,22 @@ github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPd
github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5/go.mod h1:wHh0iHkYZB8zMSxRWpUBQtwG5a7fFgvEO+odwuTv2gs=
github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0=
github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
github.com/aws/aws-sdk-go-v2 v1.26.0 h1:/Ce4OCiM3EkpW7Y+xUnfAFpchU78K7/Ug01sZni9PgA=
github.com/aws/aws-sdk-go-v2 v1.26.0/go.mod h1:35hUlJVYd+M++iLI3ALmVwMOyRYMmRqUXpTtRGW+K9I=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.1 h1:gTK2uhtAPtFcdRRJilZPx8uJLL2J85xK11nKtWL0wfU=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.1/go.mod h1:sxpLb+nZk7tIfCWChfd+h4QwHNUR57d8hA1cleTkjJo=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.4 h1:0ScVK/4qZ8CIW0k8jOeFVsyS/sAiXpYxRBLolMkuLQM=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.4/go.mod h1:84KyjNZdHC6QZW08nfHI6yZgPd+qRgaWcYsyLUo3QY8=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.4 h1:sHmMWWX5E7guWEFQ9SVo6A3S4xpPrWnd77a6y4WM6PU=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.4/go.mod h1:WjpDrhWisWOIoS9n3nk67A3Ll1vfULJ9Kq6h29HTD48=
github.com/aws/aws-sdk-go-v2/service/bedrockruntime v1.7.3 h1:Ch31Jl96ULFDPPe7nLHnxmImjO9I8bjlPAECb1Wrx6E=
github.com/aws/aws-sdk-go-v2/service/bedrockruntime v1.7.3/go.mod h1:yHTz9jyvT5dT3sUHovWTWjv332k7FvD8MPk7xWEkbnQ=
github.com/aws/aws-sdk-go-v2 v1.26.1 h1:5554eUqIYVWpU0YmeeYZ0wU64H2VLBs8TlhRB2L+EkA=
github.com/aws/aws-sdk-go-v2 v1.26.1/go.mod h1:ffIFB97e2yNsv4aTSGkqtHnppsIJzw7G7BReUZ3jCXM=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.2 h1:x6xsQXGSmW6frevwDA+vi/wqhp1ct18mVXYN08/93to=
github.com/aws/aws-sdk-go-v2/aws/protocol/eventstream v1.6.2/go.mod h1:lPprDr1e6cJdyYeGXnRaJoP4Md+cDBvi2eOj00BlGmg=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.5 h1:aw39xVGeRWlWx9EzGVnhOR4yOjQDHPQ6o6NmBlscyQg=
github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.5/go.mod h1:FSaRudD0dXiMPK2UjknVwwTYyZMRsHv3TtkabsZih5I=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.5 h1:PG1F3OD1szkuQPzDw3CIQsRIrtTlUC3lP84taWzHlq0=
github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.5/go.mod h1:jU1li6RFryMz+so64PpKtudI+QzbKoIEivqdf6LNpOc=
github.com/aws/aws-sdk-go-v2/service/bedrockruntime v1.8.1 h1:vTHgBjsGhgKWWIgioxd7MkBH5Ekr8C6Cb+/8iWf1dpc=
github.com/aws/aws-sdk-go-v2/service/bedrockruntime v1.8.1/go.mod h1:nZspkhg+9p8iApLFoyAqfyuMP0F38acy2Hm3r5r95Cg=
github.com/aws/aws-sdk-go-v2/service/sagemakerruntime v1.27.3 h1:bvRsyhuEYCvyg3i89U0gRqdoJqtRoH5RK1hwOmmCCsM=
github.com/aws/aws-sdk-go-v2/service/sagemakerruntime v1.27.3/go.mod h1:ljK0mx70pj1+eBVh2tyJM+VzZooiZZgQLMIT+PHJSho=
github.com/aws/aws-sdk-go-v2/service/textract v1.30.4 h1:rz8lRkt7F0iKCGRbpL3NwbqGKUyX6rPrw1Nk1bO9tKc=
github.com/aws/aws-sdk-go-v2/service/textract v1.30.4/go.mod h1:PFUGOzwfe10atRNGR9abVqiKPAPSfKgu/LF0oDBflsY=
github.com/aws/smithy-go v1.20.1 h1:4SZlSlMr36UEqC7XOyRVb27XMeZubNcBNN+9IgEPIQw=
github.com/aws/smithy-go v1.20.1/go.mod h1:krry+ya/rV9RDcV/Q16kpu6ypI4K2czasz0NC3qS14E=
github.com/aws/smithy-go v1.20.2 h1:tbp628ireGtzcHDDmLT/6ADHidqnwgF57XOXZe6tp4Q=
github.com/aws/smithy-go v1.20.2/go.mod h1:krry+ya/rV9RDcV/Q16kpu6ypI4K2czasz0NC3qS14E=
github.com/aymerick/douceur v0.2.0 h1:Mv+mAeH1Q+n9Fr+oyamOlAkUNPWPlA8PPGR0QAaYuPk=
github.com/aymerick/douceur v0.2.0/go.mod h1:wlT5vV2O3h55X9m7iVYN0TBM0NH/MmbLnd30/FjWUq4=
github.com/bwesterb/go-ristretto v1.2.3/go.mod h1:fUIoIZaG73pV5biE2Blr2xEzDoMj7NFEuV9ekS419A0=
Expand Down Expand Up @@ -200,8 +202,6 @@ github.com/inconshreveable/mousetrap v1.1.0 h1:wN+x4NVGpMsO7ErUn/mUI3vEoE6Jt13X2
github.com/inconshreveable/mousetrap v1.1.0/go.mod h1:vpF70FUmC8bwa3OWnCshd2FqLfsEA9PFc4w1p2J65bw=
github.com/iwilltry42/chromem-go v0.0.0-20240513080122-88f1efa639f5 h1:g3mhp9K4ujkjne0JDvtmP8i36uQlibwzbWKYzZbzcKU=
github.com/iwilltry42/chromem-go v0.0.0-20240513080122-88f1efa639f5/go.mod h1:hTd+wGEm/fFPQl7ilfCwQXkgEUxceYh86iIdoKMolPo=
github.com/iwilltry42/langchaingo v0.0.0-20240516095223-8cf46ac74799 h1:b5x7HwpU5UzA/qlde8bFWHYfdpRxglHiZKy9KRbll8o=
github.com/iwilltry42/langchaingo v0.0.0-20240516095223-8cf46ac74799/go.mod h1:QZupRrRBW3fhNUlvll1U4YUHatKZd5h7C0UiHBEWJYc=
github.com/jaytaylor/html2text v0.0.0-20180606194806-57d518f124b0/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk=
github.com/jaytaylor/html2text v0.0.0-20200412013138-3577fbdbcff7 h1:g0fAGBisHaEQ0TRq1iBvemFRf+8AEWEmBESSiWB3Vsc=
github.com/jaytaylor/html2text v0.0.0-20200412013138-3577fbdbcff7/go.mod h1:CVKlgaMiht+LXvHG173ujK6JUhZXKb2u/BQtjPDIvyk=
Expand All @@ -219,8 +219,8 @@ github.com/json-iterator/go v1.1.12/go.mod h1:e30LSqwooZae/UwlEbR2852Gd8hjQvJoHm
github.com/kevinburke/ssh_config v1.2.0 h1:x584FjTGwHzMwvHx18PXxbBVzfnxogHaAReU4gf13a4=
github.com/kevinburke/ssh_config v1.2.0/go.mod h1:CT57kijsi8u/K/BOFA39wgDQJ9CxiF4nAY/ojJ6r6mM=
github.com/klauspost/compress v1.10.3/go.mod h1:aoV0uJVorq1K+umq18yTdKaF57EivdYsUV+/s2qKfXs=
github.com/klauspost/compress v1.17.2 h1:RlWWUY/Dr4fL8qk9YG7DTZ7PDgME2V4csBXA8L/ixi4=
github.com/klauspost/compress v1.17.2/go.mod h1:ntbaceVETuRiXiv4DpjP66DpAtAGkEQskQzEyD//IeE=
github.com/klauspost/compress v1.17.6 h1:60eq2E/jlfwQXtvZEeBUYADs+BwKBWURIY+Gj2eRGjI=
github.com/klauspost/compress v1.17.6/go.mod h1:/dCuZOvVtNoHsyb+cuJD3itjs3NbnF6KH9zAO4BDxPM=
github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM=
github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
Expand Down Expand Up @@ -455,18 +455,18 @@ golang.org/x/tools v0.20.0 h1:hz/CVckiOxybQvFw6h7b/q80NTr9IUQb4s1IIzW7KNY=
golang.org/x/tools v0.20.0/go.mod h1:WvitBU7JJf6A4jOdg4S1tviW9bhUxkgeCui/0JHctQg=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
google.golang.org/api v0.176.1 h1:DJSXnV6An+NhJ1J+GWtoF2nHEuqB1VNoTfnIbjNvwD4=
google.golang.org/api v0.176.1/go.mod h1:j2MaSDYcvYV1lkZ1+SMW4IeF90SrEyFA+tluDYWRrFg=
google.golang.org/genproto v0.0.0-20240325203815-454cdb8f5daa h1:ePqxpG3LVx+feAUOx8YmR5T7rc0rdzK8DyxM8cQ9zq0=
google.golang.org/genproto v0.0.0-20240325203815-454cdb8f5daa/go.mod h1:CnZenrTdRJb7jc+jOm0Rkywq+9wh0QC4U8tyiRbEPPM=
google.golang.org/genproto/googleapis/api v0.0.0-20240415180920-8c6c420018be h1:Zz7rLWqp0ApfsR/l7+zSHhY3PMiH2xqgxlfYfAfNpoU=
google.golang.org/genproto/googleapis/api v0.0.0-20240415180920-8c6c420018be/go.mod h1:dvdCTIoAGbkWbcIKBniID56/7XHTt6WfxXNMxuziJ+w=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240415180920-8c6c420018be h1:LG9vZxsWGOmUKieR8wPAUR3u3MpnYFQZROPIMaXh7/A=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240415180920-8c6c420018be/go.mod h1:WtryC6hu0hhx87FDGxWCDptyssuo68sk10vYjF+T9fY=
google.golang.org/grpc v1.63.2 h1:MUeiw1B2maTVZthpU5xvASfTh3LDbxHd6IJ6QQVU+xM=
google.golang.org/grpc v1.63.2/go.mod h1:WAX/8DgncnokcFUldAxq7GeB5DXHDbMF+lLvDomNkRA=
google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI=
google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
google.golang.org/api v0.180.0 h1:M2D87Yo0rGBPWpo1orwfCLehUUL6E7/TYe5gvMQWDh4=
google.golang.org/api v0.180.0/go.mod h1:51AiyoEg1MJPSZ9zvklA8VnRILPXxn1iVen9v25XHAE=
google.golang.org/genproto v0.0.0-20240401170217-c3f982113cda h1:wu/KJm9KJwpfHWhkkZGohVC6KRrc1oJNr4jwtQMOQXw=
google.golang.org/genproto v0.0.0-20240401170217-c3f982113cda/go.mod h1:g2LLCvCeCSir/JJSWosk19BR4NVxGqHUC6rxIRsd7Aw=
google.golang.org/genproto/googleapis/api v0.0.0-20240506185236-b8a5c65736ae h1:AH34z6WAGVNkllnKs5raNq3yRq93VnjBG6rpfub/jYk=
google.golang.org/genproto/googleapis/api v0.0.0-20240506185236-b8a5c65736ae/go.mod h1:FfiGhwUm6CJviekPrc0oJ+7h29e+DmWU6UtjX0ZvI7Y=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240506185236-b8a5c65736ae h1:c55+MER4zkBS14uJhSZMGGmya0yJx5iHV4x/fpOSNRk=
google.golang.org/genproto/googleapis/rpc v0.0.0-20240506185236-b8a5c65736ae/go.mod h1:I7Y+G38R2bu5j1aLzfFmQfTcU/WnFuqDwLZAbvKTKpM=
google.golang.org/grpc v1.64.0 h1:KH3VH9y/MgNQg1dE7b3XfVK0GsPSIzJwdF617gUSbvY=
google.golang.org/grpc v1.64.0/go.mod h1:oxjF8E3FBnjp+/gVFYdWacaLDx9na1aqy9oovLpxQYg=
google.golang.org/protobuf v1.34.1 h1:9ddQBjfCyZPOHPUiPxpYESBLc+T8P3E+Vo4IbKZgFWg=
google.golang.org/protobuf v1.34.1/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
Expand Down
78 changes: 50 additions & 28 deletions pkg/datastore/documentloader/pdf.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,14 @@ import (
"context"
"io"
"strings"
"sync"

md "github.com/JohannesKaufmann/html-to-markdown"
"github.com/PuerkitoBio/goquery"
"github.com/gen2brain/go-fitz"
"github.com/gptscript-ai/knowledge/pkg/datastore/types"
vs "github.com/gptscript-ai/knowledge/pkg/vectorstore"
"golang.org/x/sync/errgroup"
)

// Compile time check to ensure PDF satisfies the DocumentLoader interface.
Expand All @@ -27,6 +29,9 @@ type PDFOptions struct {

// Source is the name of the pdf document
Source string

// Number of goroutines to load pdf documents
NumThread int
}

// WithConfig sets the PDF loader configuration.
Expand All @@ -41,6 +46,7 @@ type PDF struct {
opts PDFOptions
document *fitz.Document
converter *md.Converter
lock *sync.Mutex
}

// NewPDFFromFile creates a new PDF loader with the given options.
Expand All @@ -63,51 +69,67 @@ func NewPDF(r io.Reader, optFns ...func(o *PDFOptions)) (*PDF, error) {

converter := md.NewConverter("", true, nil)

if opts.NumThread == 0 {
opts.NumThread = 100
}

return &PDF{
opts: opts,
document: doc,
converter: converter,
lock: &sync.Mutex{},
}, nil
}

// Load loads the PDF document and returns a slice of vs.Document containing the page contents and metadata.
func (l *PDF) Load(ctx context.Context) ([]vs.Document, error) {
docs := make([]vs.Document, 0, l.document.NumPage())
numPages := l.document.NumPage()

for pageNum := 0; pageNum < l.document.NumPage(); pageNum++ {
g, childCtx := errgroup.WithContext(ctx)
g.SetLimit(l.opts.NumThread)
for pageNum := 0; pageNum < numPages; pageNum++ {
html, err := l.document.HTML(pageNum, true)
if err != nil {
return nil, err
}

htmlDoc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
if err != nil {
return nil, err
}
htmlDoc.Find("img").Remove()

ret, err := htmlDoc.First().Html()
if err != nil {
return nil, err
}

markdown, err := l.converter.ConvertString(ret)
if err != nil {
return nil, err
}

doc := vs.Document{
Content: strings.TrimSpace(markdown),
Metadata: map[string]any{
"page": pageNum + 1,
"totalPages": l.document.NumPage(),
},
}

docs = append(docs, doc)
g.Go(func() error {
select {
case <-childCtx.Done():
return context.Canceled
default:
htmlDoc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
if err != nil {
return err
}
htmlDoc.Find("img").Remove()

ret, err := htmlDoc.First().Html()
if err != nil {
return err
}

markdown, err := l.converter.ConvertString(ret)
if err != nil {
return err
}

doc := vs.Document{
Content: strings.TrimSpace(markdown),
Metadata: map[string]any{
"page": pageNum + 1,
"totalPages": numPages,
},
}
l.lock.Lock()
docs = append(docs, doc)
l.lock.Unlock()
return nil
}
})
}

return docs, nil
return docs, g.Wait()
}

// LoadAndSplit loads PDF documents from the provided reader and splits them using the specified text splitter.
Expand Down
15 changes: 15 additions & 0 deletions pkg/env/env.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package env

import (
"os"
"strconv"
)

func GetIntFromEnvOrDefault(env string, def int) int {
v, _ := strconv.Atoi(os.Getenv(env))
if v != 0 {
return v
}

return def
}
7 changes: 5 additions & 2 deletions pkg/vectorstore/chromem/chromem.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,17 @@ import (
"context"
"log/slog"
"maps"
"runtime"
"strconv"

"github.com/google/uuid"
"github.com/gptscript-ai/knowledge/pkg/env"
vs "github.com/gptscript-ai/knowledge/pkg/vectorstore"
"github.com/philippgille/chromem-go"
)

// EmbeddingParallelThread can be set as an environment variable to control the number of parallel API calls to create embedding for documents. Default is 100
const VsChromemEmbeddingParallelThread = "VS_CHROMEM_EMBEDDING_PARALLEL_THREAD"

type Store struct {
db *chromem.DB
embeddingFunc chromem.EmbeddingFunc
Expand Down Expand Up @@ -58,7 +61,7 @@ func (s *Store) AddDocuments(ctx context.Context, docs []vs.Document, collection
return nil, vs.ErrCollectionNotFound{Collection: collection}
}

err := col.AddDocuments(ctx, chromemDocs, runtime.NumCPU()/2)
err := col.AddDocuments(ctx, chromemDocs, env.GetIntFromEnvOrDefault(VsChromemEmbeddingParallelThread, 100))
if err != nil {
return nil, err
}
Expand Down