From 0747ed90e1bfb4b1c1eba605ed01ddcab63632e2 Mon Sep 17 00:00:00 2001 From: Alexander Alemayhu Date: Sun, 20 Oct 2024 19:41:53 +0200 Subject: [PATCH] feat: experimental pdf support We are using the [Google Vertex AI API](https://cloud.google.com/vertex-ai/docs/reference/rest). --- package-lock.json | 146 +++++++++++++++++- package.json | 1 + .../SettingsController/supportedOptions.ts | 6 + src/lib/anki/zip.tsx | 10 +- src/lib/parser/PrepareDeck.ts | 13 ++ src/lib/parser/Settings/Settings.ts | 3 + .../VertexAPI/convertPDFToHTML.ts | 90 +++++++++++ src/lib/storage/checks.ts | 2 + src/usecases/uploads/getPackagesFromZip.ts | 4 +- 9 files changed, 269 insertions(+), 6 deletions(-) create mode 100644 src/lib/parser/experimental/VertexAPI/convertPDFToHTML.ts diff --git a/package-lock.json b/package-lock.json index 8b9cf1b36..9f2b0fc85 100644 --- a/package-lock.json +++ b/package-lock.json @@ -10,6 +10,7 @@ "license": "MIT", "dependencies": { "@2anki/csv-to-apkg": "^1.4.4", + "@google-cloud/vertexai": "^1.9.0", "@notionhq/client": "^2.2.13", "@sendgrid/mail": "^8.1.3", "aws-sdk": "^2.1502.0", @@ -1717,6 +1718,18 @@ "node": "^12.22.0 || ^14.17.0 || >=16.0.0" } }, + "node_modules/@google-cloud/vertexai": { + "version": "1.9.0", + "resolved": "https://registry.npmjs.org/@google-cloud/vertexai/-/vertexai-1.9.0.tgz", + "integrity": "sha512-8brlcJwFXI4fPuBtsDNQqCdWZmz8gV9jeEKOU0vc5H2SjehCQpXK/NwuSEr916zbhlBHtg/sU37qQQdgvh5BRA==", + "license": "Apache-2.0", + "dependencies": { + "google-auth-library": "^9.1.0" + }, + "engines": { + "node": ">=18.0.0" + } + }, "node_modules/@humanwhocodes/config-array": { "version": "0.11.14", "resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.11.14.tgz", @@ -4458,6 +4471,14 @@ "resolved": "https://registry.npmjs.org/bcryptjs/-/bcryptjs-2.4.3.tgz", "integrity": "sha1-mrVie5PmBiH/fNrF2pczAn3x0Ms=" }, + "node_modules/bignumber.js": { + "version": "9.1.2", + "resolved": "https://registry.npmjs.org/bignumber.js/-/bignumber.js-9.1.2.tgz", + "integrity": "sha512-2/mKyZH9K85bzOEfhXDBFZTGd1CTs+5IHpeFQo9luiBG7hghdC851Pj2WAhb6E3R6b9tZj/XKhbg4fum+Kepug==", + "engines": { + "node": "*" + } + }, "node_modules/binary-extensions": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.2.0.tgz", @@ -6419,6 +6440,11 @@ "node": ">= 0.8" } }, + "node_modules/extend": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/extend/-/extend-3.0.2.tgz", + "integrity": "sha512-fjquC59cD7CyW6urNXK0FBufkZcoiGG80wTuPujX590cB5Ttln20E2UB4S/WARVqhXffZl2LNgS+gQdPIIim/g==" + }, "node_modules/fast-deep-equal": { "version": "3.1.3", "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz", @@ -6751,6 +6777,45 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/gaxios": { + "version": "6.7.1", + "resolved": "https://registry.npmjs.org/gaxios/-/gaxios-6.7.1.tgz", + "integrity": "sha512-LDODD4TMYx7XXdpwxAVRAIAuB0bzv0s+ywFonY46k126qzQHT9ygyoa9tncmOiQmmDrik65UYsEkv3lbfqQ3yQ==", + "dependencies": { + "extend": "^3.0.2", + "https-proxy-agent": "^7.0.1", + "is-stream": "^2.0.0", + "node-fetch": "^2.6.9", + "uuid": "^9.0.1" + }, + "engines": { + "node": ">=14" + } + }, + "node_modules/gaxios/node_modules/uuid": { + "version": "9.0.1", + "resolved": "https://registry.npmjs.org/uuid/-/uuid-9.0.1.tgz", + "integrity": "sha512-b+1eJOlsR9K8HJpow9Ok3fiWOWSIcIzXodvv0rQjVoOVNpWMpxf1wZNpt4y9h10odCNrqnYp1OBzRktckBe3sA==", + "funding": [ + "https://github.com/sponsors/broofa", + "https://github.com/sponsors/ctavan" + ], + "bin": { + "uuid": "dist/bin/uuid" + } + }, + "node_modules/gcp-metadata": { + "version": "6.1.0", + "resolved": "https://registry.npmjs.org/gcp-metadata/-/gcp-metadata-6.1.0.tgz", + "integrity": "sha512-Jh/AIwwgaxan+7ZUUmRLCjtchyDiqh4KjBJ5tW3plBZb5iL/BPcso8A5DlzeD9qlw0duCamnNdpFjxwaT0KyKg==", + "dependencies": { + "gaxios": "^6.0.0", + "json-bigint": "^1.0.0" + }, + "engines": { + "node": ">=14" + } + }, "node_modules/gensync": { "version": "1.0.0-beta.2", "resolved": "https://registry.npmjs.org/gensync/-/gensync-1.0.0-beta.2.tgz", @@ -6916,6 +6981,41 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/google-auth-library": { + "version": "9.14.2", + "resolved": "https://registry.npmjs.org/google-auth-library/-/google-auth-library-9.14.2.tgz", + "integrity": "sha512-R+FRIfk1GBo3RdlRYWPdwk8nmtVUOn6+BkDomAC46KoU8kzXzE1HLmOasSCbWUByMMAGkknVF0G5kQ69Vj7dlA==", + "dependencies": { + "base64-js": "^1.3.0", + "ecdsa-sig-formatter": "^1.0.11", + "gaxios": "^6.1.1", + "gcp-metadata": "^6.1.0", + "gtoken": "^7.0.0", + "jws": "^4.0.0" + }, + "engines": { + "node": ">=14" + } + }, + "node_modules/google-auth-library/node_modules/jwa": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.0.tgz", + "integrity": "sha512-jrZ2Qx916EA+fq9cEAeCROWPTfCwi1IVHqT2tapuqLEVVDKFDENFw1oL+MwrTvH6msKxsd1YTDVw6uKEcsrLEA==", + "dependencies": { + "buffer-equal-constant-time": "1.0.1", + "ecdsa-sig-formatter": "1.0.11", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/google-auth-library/node_modules/jws": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/jws/-/jws-4.0.0.tgz", + "integrity": "sha512-KDncfTmOZoOMTFG4mBlG0qUIOlc03fmzH+ru6RgYVZhPkyiy/92Owlt/8UEN+a4TXR1FQetfIpJE8ApdvdVxTg==", + "dependencies": { + "jwa": "^2.0.0", + "safe-buffer": "^5.0.1" + } + }, "node_modules/gopd": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.0.1.tgz", @@ -6962,6 +7062,37 @@ "integrity": "sha512-EtKwoO6kxCL9WO5xipiHTZlSzBm7WLT627TqC/uVRd0HKmq8NXyebnNYxDoBi7wt8eTWrUrKXCOVaFq9x1kgag==", "dev": true }, + "node_modules/gtoken": { + "version": "7.1.0", + "resolved": "https://registry.npmjs.org/gtoken/-/gtoken-7.1.0.tgz", + "integrity": "sha512-pCcEwRi+TKpMlxAQObHDQ56KawURgyAf6jtIY046fJ5tIv3zDe/LEIubckAO8fj6JnAxLdmWkUfNyulQ2iKdEw==", + "dependencies": { + "gaxios": "^6.0.0", + "jws": "^4.0.0" + }, + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/gtoken/node_modules/jwa": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.0.tgz", + "integrity": "sha512-jrZ2Qx916EA+fq9cEAeCROWPTfCwi1IVHqT2tapuqLEVVDKFDENFw1oL+MwrTvH6msKxsd1YTDVw6uKEcsrLEA==", + "dependencies": { + "buffer-equal-constant-time": "1.0.1", + "ecdsa-sig-formatter": "1.0.11", + "safe-buffer": "^5.0.1" + } + }, + "node_modules/gtoken/node_modules/jws": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/jws/-/jws-4.0.0.tgz", + "integrity": "sha512-KDncfTmOZoOMTFG4mBlG0qUIOlc03fmzH+ru6RgYVZhPkyiy/92Owlt/8UEN+a4TXR1FQetfIpJE8ApdvdVxTg==", + "dependencies": { + "jwa": "^2.0.0", + "safe-buffer": "^5.0.1" + } + }, "node_modules/has": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/has/-/has-1.0.3.tgz", @@ -7636,7 +7767,6 @@ "version": "2.0.1", "resolved": "https://registry.npmjs.org/is-stream/-/is-stream-2.0.1.tgz", "integrity": "sha512-hFoiJiTl63nn+kstHGBtewWSKnQLpyb155KHheA1l39uvtO9nWIop1p3udqPcUd/xbF1VLMO4n7OI6p7RbngDg==", - "dev": true, "engines": { "node": ">=8" }, @@ -8569,6 +8699,14 @@ "node": ">=4" } }, + "node_modules/json-bigint": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/json-bigint/-/json-bigint-1.0.0.tgz", + "integrity": "sha512-SiPv/8VpZuWbvLSMtTDU8hEfrZWg/mH/nV/b4o0CYbSxu1UIQPLdwKOCIyLQX+VIPO5vrLX3i8qtqFyhdPSUSQ==", + "dependencies": { + "bignumber.js": "^9.0.0" + } + }, "node_modules/json-buffer": { "version": "3.0.1", "resolved": "https://registry.npmjs.org/json-buffer/-/json-buffer-3.0.1.tgz", @@ -9751,9 +9889,9 @@ } }, "node_modules/node-fetch": { - "version": "2.6.7", - "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.6.7.tgz", - "integrity": "sha512-ZjMPFEfVx5j+y2yF35Kzx5sF7kDzxuDj6ziH4FFbOp87zKDZNx8yExJIb05OGF4Nlt9IHFIMBkRl41VdvcNdbQ==", + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", "dependencies": { "whatwg-url": "^5.0.0" }, diff --git a/package.json b/package.json index af9794aaf..4fc70da1c 100644 --- a/package.json +++ b/package.json @@ -31,6 +31,7 @@ "license": "MIT", "dependencies": { "@2anki/csv-to-apkg": "^1.4.4", + "@google-cloud/vertexai": "^1.9.0", "@notionhq/client": "^2.2.13", "@sendgrid/mail": "^8.1.3", "aws-sdk": "^2.1502.0", diff --git a/src/controllers/SettingsController/supportedOptions.ts b/src/controllers/SettingsController/supportedOptions.ts index 093a4d168..80e65751e 100644 --- a/src/controllers/SettingsController/supportedOptions.ts +++ b/src/controllers/SettingsController/supportedOptions.ts @@ -118,6 +118,12 @@ const supportedOptions = (): CardOption[] => { 'Enable conversion of bullet and sub bullet points in Markdown. If you are a Obsidian user, enable this', false ), + new CardOption( + 'vertex-ai-pdf-questions', + 'Generate Questions from PDFs', + 'Use Vertex AI API to generate questions from PDFs. This is a paid feature and if enabled will send your notes to Google Cloud.', + false + ), ]; return v.filter(Boolean); diff --git a/src/lib/anki/zip.tsx b/src/lib/anki/zip.tsx index f2223f93c..14a9989e7 100644 --- a/src/lib/anki/zip.tsx +++ b/src/lib/anki/zip.tsx @@ -2,7 +2,7 @@ import { strFromU8, unzipSync } from 'fflate'; import { Body } from 'aws-sdk/clients/s3'; import { renderToStaticMarkup } from 'react-dom/server'; import { getUploadLimits } from '../misc/getUploadLimits'; -import { isHTMLFile, isMarkdownFile } from '../storage/checks'; +import { isHTMLFile, isMarkdownFile, isPDFFile } from '../storage/checks'; interface File { name: string; @@ -46,6 +46,14 @@ class ZipHandler { for (const name of this.fileNames) { const file = loadedZip[name]; let contents = file; + + /** + * For now disable batch processing of PDF files. We only want single uploads to avoid creating too many requests. + */ + if (name.includes('__MACOSX/') || isPDFFile(name)) { + continue; + } + if ((isHTMLFile(name) || isMarkdownFile(name)) && contents) { this.files.push({ name, contents: strFromU8(file) }); } else if (contents) { diff --git a/src/lib/parser/PrepareDeck.ts b/src/lib/parser/PrepareDeck.ts index 65d57de2a..d0ab5a272 100644 --- a/src/lib/parser/PrepareDeck.ts +++ b/src/lib/parser/PrepareDeck.ts @@ -1,6 +1,8 @@ import getDeckFilename from '../anki/getDeckFilename'; import { DeckParser, DeckParserInput } from './DeckParser'; import Deck from './Deck'; +import { isPDFFile } from '../storage/checks'; +import { convertPDFToHTML } from './experimental/VertexAPI/convertPDFToHTML'; interface PrepareDeckResult { name: string; @@ -11,6 +13,17 @@ interface PrepareDeckResult { export async function PrepareDeck( input: DeckParserInput ): Promise { + if (input.noLimits) { + // Check for PDF files and convert their contents to HTML + for (const file of input.files) { + if (isPDFFile(file.name) && file.contents) { + file.contents = await convertPDFToHTML( + file.contents.toString('base64') + ); + } + } + } + const parser = new DeckParser(input); if (parser.totalCardCount() === 0) { diff --git a/src/lib/parser/Settings/Settings.ts b/src/lib/parser/Settings/Settings.ts index 43df421db..2e8c93a87 100644 --- a/src/lib/parser/Settings/Settings.ts +++ b/src/lib/parser/Settings/Settings.ts @@ -66,6 +66,8 @@ export class Settings { readonly nestedBulletPoints: boolean; + readonly vertexAIPDFQuestions: boolean; + constructor(input: { [key: string]: string }) { this.deckName = input.deckName; if (this.deckName && !this.deckName.trim()) { @@ -97,6 +99,7 @@ export class Settings { this.parentBlockId = input.parentBlockId; this.pageEmoji = input['page-emoji'] || 'first_emoji'; this.addNotionLink = input['add-notion-link'] === 'true'; + this.vertexAIPDFQuestions = input['vertex-ai-pdf-questions'] === 'true'; /* Is this really needed? */ if (this.parentBlockId) { this.addNotionLink = true; diff --git a/src/lib/parser/experimental/VertexAPI/convertPDFToHTML.ts b/src/lib/parser/experimental/VertexAPI/convertPDFToHTML.ts new file mode 100644 index 000000000..aba9d3b52 --- /dev/null +++ b/src/lib/parser/experimental/VertexAPI/convertPDFToHTML.ts @@ -0,0 +1,90 @@ +import path from 'path'; +import fs from 'fs'; + +import { + GenerateContentRequest, + HarmBlockThreshold, + HarmCategory, + VertexAI, +} from '@google-cloud/vertexai'; + +export const convertPDFToHTML = async (pdf: string): Promise => { + const vertexAI = new VertexAI({ + project: 'notion-to-anki', + location: 'europe-west3', + }); + const model = 'gemini-1.5-flash-002'; + const generativeModel = vertexAI.preview.getGenerativeModel({ + model: model, + generationConfig: { + maxOutputTokens: 8192, + temperature: 1, + topP: 0.95, + }, + safetySettings: [ + { + category: HarmCategory.HARM_CATEGORY_HATE_SPEECH, + threshold: HarmBlockThreshold.BLOCK_NONE, + }, + { + category: HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT, + threshold: HarmBlockThreshold.BLOCK_NONE, + }, + { + category: HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT, + threshold: HarmBlockThreshold.BLOCK_NONE, + }, + { + category: HarmCategory.HARM_CATEGORY_HARASSMENT, + threshold: HarmBlockThreshold.BLOCK_NONE, + }, + ], + }); + + const document1 = { + inlineData: { + mimeType: 'application/pdf', + data: pdf, + }, + }; + + const text1 = { + text: fs + .readFileSync( + path.join( + __dirname, + '../../../../../../pdf-to-html-api', + 'instructions.txt' + ) + ) + .toString(), + }; + + const req: GenerateContentRequest = { + contents: [{ role: 'user', parts: [document1, text1] }], + }; + + let htmlContent = ''; + try { + const streamingResp = await generativeModel.generateContentStream(req); + for await (const item of streamingResp.stream) { + if ( + item.candidates && + item.candidates[0].content && + item.candidates[0].content.parts + ) { + htmlContent += item.candidates[0].content.parts + .map((part) => part.text) + .join(''); + } + } + } catch (error) { + console.error('Error generating content stream:', error); + + // const workSpace = process.cwd(); + // const outputPath = path.join(workSpace, 'output.html'); + // fs.writeFileSync(outputPath, htmlContent); + // console.log(outputPath); + } + return htmlContent; +}; diff --git a/src/lib/storage/checks.ts b/src/lib/storage/checks.ts index 735b14e24..cb178b8a9 100644 --- a/src/lib/storage/checks.ts +++ b/src/lib/storage/checks.ts @@ -20,3 +20,5 @@ export const isImageFileEmbedable = (url: string) => !url.startsWith('http') && !url.startsWith('data:image'); export const isCSVFile = (fileName: string) => /.csv$/i.exec(fileName); + +export const isPDFFile = (fileName: string) => /.pdf$/i.exec(fileName); diff --git a/src/usecases/uploads/getPackagesFromZip.ts b/src/usecases/uploads/getPackagesFromZip.ts index d0adb715d..163783ca1 100644 --- a/src/usecases/uploads/getPackagesFromZip.ts +++ b/src/usecases/uploads/getPackagesFromZip.ts @@ -9,6 +9,7 @@ import { isCSVFile, isHTMLFile, isMarkdownFile, + isPDFFile, isPlainText, } from '../../lib/storage/checks'; import Workspace from '../../lib/parser/WorkSpace'; @@ -17,7 +18,8 @@ export const isFileSupported = (filename: string) => isHTMLFile(filename) ?? isMarkdownFile(filename) ?? isPlainText(filename) ?? - isCSVFile(filename); + isCSVFile(filename) ?? + isPDFFile(filename); export const getPackagesFromZip = async ( fileContents: Body | undefined,