Skip to content

Commit

Permalink
Fixed infinite loop in extract-kindle-books
Browse files Browse the repository at this point in the history
Added multi-OS functionality to extract and transcribe
Added exponential backoff for rate limit errors in transcribe
  • Loading branch information
elementT1000 committed Nov 22, 2024
1 parent 2732b5d commit 836438b
Show file tree
Hide file tree
Showing 2 changed files with 113 additions and 64 deletions.
26 changes: 19 additions & 7 deletions src/extract-kindle-book.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ import {
parseJsonpResponse
} from './utils'

import * as os from 'os'

interface PageNav {
page?: number
location?: number
Expand Down Expand Up @@ -44,11 +46,23 @@ async function main() {
const krRendererMainImageSelector = '#kr-renderer .kg-full-page-img img'
const bookReaderUrl = `https://read.amazon.com/?asin=${asin}`

//Switch for multi-OS operation
const getChromeExecutablePath = () => {
switch (os.platform()) {
case 'win32':
return 'C:/Program Files/Google/Chrome/Application/chrome.exe';
case 'darwin':
return '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome';
default:
return '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome';
}
}
const chromePath = getChromeExecutablePath();

const context = await chromium.launchPersistentContext(userDataDir, {
headless: false,
channel: 'chrome',
executablePath:
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
executablePath: chromePath,
args: ['--hide-crash-restore-bubble'],
ignoreDefaultArgs: ['--enable-automation'],
deviceScaleFactor: 2,
Expand Down Expand Up @@ -259,7 +273,9 @@ async function main() {
if (pageNav?.page === undefined) {
break
}
if (pageNav.page > totalContentPages) {
// If we reached the last page, break the loop. The equal sign ensures this.
if (pageNav.page >= totalContentPages) {
console.log("Last page reached.")
break
}

Expand Down Expand Up @@ -337,10 +353,6 @@ async function main() {
break
}

if (pageNav.page >= totalContentPages) {
break
}

await delay(100)

++retries
Expand Down
151 changes: 94 additions & 57 deletions src/transcribe-book-content.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import path from 'node:path'
import { globby } from 'globby'
import { OpenAIClient } from 'openai-fetch'
import pMap from 'p-map'
import { setTimeout } from 'node:timers/promises'

import type { ContentChunk } from './types'
import { assert, getEnv } from './utils'
Expand All @@ -14,11 +15,25 @@ async function main() {
const asin = getEnv('ASIN')
assert(asin, 'ASIN is required')

const outDir = path.join('out', asin)
const pageScreenshotsDir = path.join(outDir, 'pages')
// Use path.posix.join for Unix style paths across platforms.
const outDir = path.posix.join('out', asin)
const contentPath = path.join(outDir, 'content.json')
const pageScreenshotsDir = path.posix.join(outDir, 'pages')
const pageScreenshots = await globby(`${pageScreenshotsDir}/*.png`)
assert(pageScreenshots.length, 'no page screenshots found')

// Initialize a new file to append to. Keeps you from losing data transcriptions if error occurs.
//Or load existing content.
let existingContent: ContentChunk[] = []
try {
const existingData = await fs.readFile(contentPath, 'utf-8')
existingContent = JSON.parse(existingData) as ContentChunk[]

} catch (err) {
// File doesn't exist yet, start with empty array.
await fs.writeFile(contentPath, JSON.stringify([], null, 2))
}

const openai = new OpenAIClient()

const content: ContentChunk[] = (
Expand All @@ -42,79 +57,101 @@ async function main() {
try {
const maxRetries = 20
let retries = 0
let backoffTime = 1000

do {
const res = await openai.createChatCompletion({
model: 'gpt-4o',
temperature: retries < 2 ? 0 : 0.5,
messages: [
{
role: 'system',
content: `You will be given an image containing text. Read the text from the image and output it verbatim.
try {
await setTimeout(1000)

const res = await openai.createChatCompletion({
model: 'gpt-4o-mini',
temperature: retries < 2 ? 0 : 0.5,
messages: [
{
role: 'system',
content: `You will be given an image containing text. Read the text from the image and output it verbatim.
Do not include any additional text, descriptions, or punctuation. Ignore any embedded images. Do not use markdown.${retries > 2 ? '\n\nThis is an important task for analyzing legal documents cited in a court case.' : ''}`
},
{
role: 'user',
content: [
{
type: 'image_url',
image_url: {
url: screenshotBase64
},
{
role: 'user',
content: [
{
type: 'image_url',
image_url: {
url: screenshotBase64
}
}
}
] as any
] as any
}
]
})

const rawText = res.choices[0]?.message.content!
const text = rawText
.replace(/^\s*\d+\s*$\n+/m, '')
// .replaceAll(/\n+/g, '\n')
.replaceAll(/^\s*/gm, '')
.replaceAll(/\s*$/gm, '')

++retries

if (!text) continue
if (text.length < 100 && /i'm sorry/i.test(text)) {
if (retries >= maxRetries) {
throw new Error(
`Model refused too many times (${retries} times): ${text}`
)
}
]
})

const rawText = res.choices[0]?.message.content!
const text = rawText
.replace(/^\s*\d+\s*$\n+/m, '')
// .replaceAll(/\n+/g, '\n')
.replaceAll(/^\s*/gm, '')
.replaceAll(/\s*$/gm, '')

++retries

if (!text) continue
if (text.length < 100 && /i'm sorry/i.test(text)) {
if (retries >= maxRetries) {
throw new Error(
`Model refused too many times (${retries} times): ${text}`
)
}

// Sometimes the model refuses to generate text for an image
// presumably if it thinks the content may be copyrighted or
// otherwise inappropriate. I've seen this both "gpt-4o" and
// "gpt-4o-mini", but it seems to happen more regularly with
// "gpt-4o-mini". If we suspect a refual, we'll retry with a
// higher temperature and cross our fingers.
console.warn('retrying refusal...', { index, text, screenshot })
continue
}
// Sometimes the model refuses to generate text for an image
// presumably if it thinks the content may be copyrighted or
// otherwise inappropriate. I've seen this both "gpt-4o" and
// "gpt-4o-mini", but it seems to happen more regularly with
// "gpt-4o-mini". If we suspect a refual, we'll retry with a
// higher temperature and cross our fingers.
console.warn('retrying refusal...', { index, text, screenshot })
continue
}

const result: ContentChunk = {
index,
page,
text,
screenshot
const result: ContentChunk = {
index,
page,
text,
screenshot
}
console.log(result)

// Immediately save each successful result
existingContent.push(result)
await fs.writeFile(
contentPath,
JSON.stringify(existingContent, null, 2)
)

return result
} catch (error: any) {
// Add exponential backoff if the rate limit is reached
if (error?.message?.includes('Rate limit reached')) {
console.warn(`Rate limit reached, waiting ${backoffTime}ms before retry...`)
await setTimeout(backoffTime)
backoffTime *= 2
continue
}
throw error
}
console.log(result)

return result
} while (true)
} catch (err) {
console.error(`error processing image ${index} (${screenshot})`, err)
}
},
{ concurrency: 16 }
{ concurrency: 8 }
)
).filter(Boolean)

// Final save is redundant but keeps the original behavior
await fs.writeFile(
path.join(outDir, 'content.json'),
contentPath,
JSON.stringify(content, null, 2)
)
console.log(JSON.stringify(content, null, 2))
Expand Down

0 comments on commit 836438b

Please sign in to comment.