Skip to content

Commit

Permalink
🍫
Browse files Browse the repository at this point in the history
  • Loading branch information
transitive-bullshit committed Oct 7, 2024
1 parent ab5c84b commit 760642a
Show file tree
Hide file tree
Showing 4 changed files with 32 additions and 27 deletions.
18 changes: 8 additions & 10 deletions src/export-book-pdf.ts
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
#!/usr/bin/env node
/* eslint-disable no-process-env */
import 'dotenv/config'

import fs from 'node:fs'
import fsp from 'node:fs/promises'
import path from 'node:path'

import type { Metadata } from 'playwright/test'
import PDFDocument from 'pdfkit'

import type { ContentChunk } from './types'
import { assert } from './utils'
import type { BookMetadata, ContentChunk } from './types'
import { assert, getEnv } from './utils'

async function main() {
const asin = process.env.ASIN
const asin = getEnv('ASIN')
assert(asin, 'ASIN is required')

const outDir = path.join('out', asin)
Expand All @@ -23,20 +21,20 @@ async function main() {
) as ContentChunk[]
const metadata = JSON.parse(
await fsp.readFile(path.join(outDir, 'metadata.json'), 'utf8')
) as Metadata
) as BookMetadata
assert(content.length, 'no book content found')
assert(metadata.meta, 'invalid book metadata: missing meta')
assert(metadata.toc?.length, 'invalid book metadata: missing toc')

const title = metadata.meta.title
const author = metadata.meta.authorList.join('\n')
const authors = metadata.meta.authorList

const doc = new PDFDocument({
autoFirstPage: true,
displayTitle: true,
info: {
Title: title,
Author: author
Author: authors.join(', ')
}
})
const stream = doc.pipe(fs.createWriteStream(path.join(outDir, 'book.pdf')))
Expand All @@ -50,10 +48,10 @@ async function main() {
doc.text(title, { align: 'center' })
const w = doc.widthOfString(title)

const byline = `By ${author}`
const byline = `By ${authors.join(',\n')}`

doc.fontSize(20)
doc.y -= 10
doc.y -= doc.heightOfString(byline) / 2
doc.text(byline, {
align: 'center',
indent: w - doc.widthOfString(byline)
Expand Down
8 changes: 4 additions & 4 deletions src/extract-kindle-book.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import { input } from '@inquirer/prompts'
import delay from 'delay'
import { chromium, type Locator } from 'playwright'

import type { Info, Meta, Metadata, PageChunk } from './types'
import type { BookInfo, BookMeta, BookMetadata, PageChunk } from './types'
import {
assert,
deromanize,
Expand Down Expand Up @@ -55,8 +55,8 @@ async function main() {
})
const page = await context.newPage()

let info: Info | undefined
let meta: Meta | undefined
let info: BookInfo | undefined
let meta: BookMeta | undefined

page.on('response', async (response) => {
try {
Expand Down Expand Up @@ -342,7 +342,7 @@ async function main() {
} while (true)
} while (true)

const result: Metadata = { info: info!, meta: meta!, toc, pages }
const result: BookMetadata = { info: info!, meta: meta!, toc, pages }
await fs.writeFile(
path.join(outDir, 'metadata.json'),
JSON.stringify(result, null, 2)
Expand Down
23 changes: 15 additions & 8 deletions src/transcribe-book-content.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ async function main() {

const openai = new OpenAIClient()

const results: ContentChunk[] = (
const content: ContentChunk[] = (
await pMap(
pageScreenshots,
async (screenshot) => {
Expand All @@ -40,6 +40,7 @@ async function main() {
)

try {
const maxRetries = 20
let retries = 0

do {
Expand Down Expand Up @@ -78,13 +79,19 @@ Do not include any additional text, descriptions, or punctuation. Ignore any emb

if (!text) continue
if (text.length < 100 && /i'm sorry/i.test(text)) {
if (retries >= maxRetries) {
throw new Error(
`Model refused too many times (${retries} times): ${text}`
)
}

// Sometimes the model refuses to generate text for an image
// presumably if it thinks the content may be copyrighted or
// otherwise inappropriate. I haven't seen this from "gpt-4o",
// but I have seen it more regularly from "gpt-4o-mini", so in
// this case we'll retry with a higher temperature and cross our
// fingers.
console.warn(`retrying refusal...`, { index, text, screenshot })
// otherwise inappropriate. I've seen this with both "gpt-4o" and
// "gpt-4o-mini", but it seems to happen more regularly with
// "gpt-4o-mini". If we suspect a refusal, we'll retry with a
// higher temperature and cross our fingers.
console.warn('retrying refusal...', { index, text, screenshot })
continue
}

Expand All @@ -108,9 +115,9 @@ Do not include any additional text, descriptions, or punctuation. Ignore any emb

await fs.writeFile(
path.join(outDir, 'content.json'),
JSON.stringify(results, null, 2)
JSON.stringify(content, null, 2)
)
console.log(JSON.stringify(results, null, 2))
console.log(JSON.stringify(content, null, 2))
}

await main()
10 changes: 5 additions & 5 deletions src/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ export interface PageChunk {
screenshot: string
}

export interface Meta {
export interface BookMeta {
ACR: string
asin: string
authorList: Array<string>
Expand All @@ -42,7 +42,7 @@ export interface Meta {
endPosition: number
}

export interface Info {
export interface BookInfo {
clippingLimit: number
contentChecksum: any
contentType: string
Expand All @@ -69,9 +69,9 @@ export interface Info {
srl: number
}

export interface Metadata {
info: Info
meta: Meta
export interface BookMetadata {
info: BookInfo
meta: BookMeta
toc: TocItem[]
pages: PageChunk[]
}

0 comments on commit 760642a

Please sign in to comment.