diff --git a/.changeset/poor-planets-pay.md b/.changeset/poor-planets-pay.md new file mode 100644 index 0000000..c843e57 --- /dev/null +++ b/.changeset/poor-planets-pay.md @@ -0,0 +1,5 @@ +--- +"osrs-web-scraper": minor +--- + +Add support for parsing audio files in news posts diff --git a/src/scrapers/news/news.ts b/src/scrapers/news/news.ts index 1544070..b0165e8 100644 --- a/src/scrapers/news/news.ts +++ b/src/scrapers/news/news.ts @@ -7,7 +7,7 @@ import { NewsHeaderTransformer, NewsImageCaptionTransformer, } from "./transformers"; -import { formatFileName } from "../../utils/images"; +import { formatFileName } from "../../utils/file"; import { MediaWikiBuilder } from "../../utils/mediawiki"; import { ScrapingService } from "../types"; diff --git a/src/scrapers/news/sections/newsContent/newsContent.ts b/src/scrapers/news/sections/newsContent/newsContent.ts index c9f7342..cbf9b2f 100644 --- a/src/scrapers/news/sections/newsContent/newsContent.ts +++ b/src/scrapers/news/sections/newsContent/newsContent.ts @@ -3,10 +3,10 @@ import { parse } from "node-html-parser"; import { nodeParser } from "./nodes"; import { - downloadImage, + downloadFile, formatFileName, - getImageExtension, -} from "../../../../utils/images"; + getFileExtension, +} from "../../../../utils/file"; import { MediaWikiContent } from "../../../../utils/mediawiki"; import { getNodeTagName } from "../../../../utils/nodes"; import { NewsSection } from "../types"; @@ -48,9 +48,9 @@ const newsContent: NewsSection = { const imageName = `${formattedTitle} (${++downloadedImages})`; downloadQueue.push( - downloadImage( + downloadFile( imageLink, - `${imageDirectory}/${imageName}.${getImageExtension(imageLink)}` + `${imageDirectory}/${imageName}.${getFileExtension(imageLink)}` ) ); } diff --git a/src/scrapers/news/sections/newsContent/nodes/audio.ts b/src/scrapers/news/sections/newsContent/nodes/audio.ts new file mode 100644 index 0000000..93b50f7 --- /dev/null +++ b/src/scrapers/news/sections/newsContent/nodes/audio.ts @@ -0,0 +1,39 @@ +import fs from "fs"; +import { HTMLElement } from "node-html-parser"; + +import { + downloadFile, + formatFileName, + getFileExtension, +} from "../../../../../utils/file"; +import { + ListenTemplate, + MediaWikiComment, +} from "../../../../../utils/mediawiki"; +import { ContentNodeParser } from "../types"; + +export const audioParser: ContentNodeParser = (node, { title }) => { + if (node instanceof HTMLElement && node.firstChild instanceof HTMLElement) { + const source = node.firstChild as HTMLElement; + const audioLink = source.attributes.src; + + const formattedTitle = formatFileName(title as string); + const audioDirectory = `./out/news/${formattedTitle}`; + if (!fs.existsSync(audioDirectory)) { + fs.mkdirSync(audioDirectory, { recursive: true }); + } + + const audioExtension = getFileExtension(audioLink); + const outputFileName = `${formattedTitle} narration.${audioExtension}`; + + downloadFile(audioLink, `${audioDirectory}/${outputFileName}`); + + return new ListenTemplate(outputFileName, { + align: "center", + title: "Audio reading", + }).build(); + } + return new MediaWikiComment("Invalid audio node"); +}; + +export default audioParser; diff --git a/src/scrapers/news/sections/newsContent/nodes/div/gallery.ts b/src/scrapers/news/sections/newsContent/nodes/div/gallery.ts index 511b759..7fa365f 100644 --- a/src/scrapers/news/sections/newsContent/nodes/div/gallery.ts +++ b/src/scrapers/news/sections/newsContent/nodes/div/gallery.ts @@ -1,10 +1,7 @@ import fs from "fs"; import { HTMLElement } from "node-html-parser"; -import { - formatFileName, - getImageExtension, -} from "../../../../../../utils/images"; +import { formatFileName, getFileExtension } from "../../../../../../utils/file"; import { MediaWikiHTML, MediaWikiText, @@ -27,7 +24,7 @@ export const galleryParser: ContentNodeParser = (node, options) => { } const imageName = `${formattedTitle} (${++ContentContext.imageCount})`; - const imageExtension = getImageExtension(imageLink); + const imageExtension = getFileExtension(imageLink); return new MediaWikiText(`\n${imageName}.${imageExtension}`); }); diff --git a/src/scrapers/news/sections/newsContent/nodes/image.ts b/src/scrapers/news/sections/newsContent/nodes/image.ts index 54b64c5..bafedbb 100644 --- a/src/scrapers/news/sections/newsContent/nodes/image.ts +++ b/src/scrapers/news/sections/newsContent/nodes/image.ts @@ -2,7 +2,7 @@ import fs from "fs"; import sizeOf from "image-size"; import { HTMLElement } from "node-html-parser"; -import { formatFileName, getImageExtension } from "../../../../../utils/images"; +import { formatFileName, getFileExtension } from "../../../../../utils/file"; import { MediaWikiBreak, MediaWikiComment, @@ -32,7 +32,7 @@ export const imageParser: ContentNodeParser = (node, { title, center }) => { } const imageName = `${formattedTitle} (${++ContentContext.imageCount})`; - const imageExtension = getImageExtension(imageLink); + const imageExtension = getFileExtension(imageLink); const dimensions = sizeOf( `${imageDirectory}/${imageName}.${imageExtension}` ); diff --git a/src/scrapers/news/sections/newsContent/nodes/parser.ts b/src/scrapers/news/sections/newsContent/nodes/parser.ts index cfc55bd..e6a3ef9 100644 --- a/src/scrapers/news/sections/newsContent/nodes/parser.ts +++ b/src/scrapers/news/sections/newsContent/nodes/parser.ts @@ -1,3 +1,4 @@ +import audioParser from "./audio"; import boldParser from "./bold"; import breakParser from "./break"; import centerParser from "./center"; @@ -20,6 +21,7 @@ const ignoredTags = ["script"]; const nodeParserMap: { [key: string]: ContentNodeParser } = { a: linkParser, + audio: audioParser, b: boldParser, details: detailsParser, div: divParser, diff --git a/src/scrapers/news/sections/newsHeader/newsHeader.ts b/src/scrapers/news/sections/newsHeader/newsHeader.ts index 8f2956c..ce8dfd6 100644 --- a/src/scrapers/news/sections/newsHeader/newsHeader.ts +++ b/src/scrapers/news/sections/newsHeader/newsHeader.ts @@ -8,10 +8,10 @@ import { getNewsUrlIdentifier, } from "./newsHeader.utils"; import { - downloadImage, + downloadFile, formatFileName, - getImageExtension, -} from "../../../../utils/images"; + getFileExtension, +} from "../../../../utils/file"; import { MediaWikiBreak, MediaWikiContent, @@ -42,13 +42,10 @@ const newsHeader: NewsSection = { fs.mkdirSync(newsDirectory, { recursive: true }); } - const newspostImageName = `${formattedTitle} newspost.${getImageExtension( + const newspostImageName = `${formattedTitle} newspost.${getFileExtension( image.attributes.src )}`; - downloadImage( - image.attributes.src, - `${newsDirectory}/${newspostImageName}` - ); + downloadFile(image.attributes.src, `${newsDirectory}/${newspostImageName}`); const content: MediaWikiContent[] = []; diff --git a/src/scrapers/polls/polls.ts b/src/scrapers/polls/polls.ts index f64fac4..f7a9839 100644 --- a/src/scrapers/polls/polls.ts +++ b/src/scrapers/polls/polls.ts @@ -2,7 +2,7 @@ import fs from "fs"; import parse from "node-html-parser"; import { pollHeader, pollQuestions } from "./sections"; -import { formatFileName } from "../../utils/images"; +import { formatFileName } from "../../utils/file"; import { MediaWikiBuilder, MediaWikiTemplate, diff --git a/src/utils/images.ts b/src/utils/file.ts similarity index 56% rename from src/utils/images.ts rename to src/utils/file.ts index 3949056..8cff4df 100644 --- a/src/utils/images.ts +++ b/src/utils/file.ts @@ -5,13 +5,13 @@ import path from "path"; import { formatText } from "./text"; /** - * Download an image from a url - * @param url The url of the image to download - * @param filepath The filepath of the downloaded image + * Download a file from a url + * @param url The url of the file to download + * @param filepath The filepath of the downloaded file * @returns */ -export const downloadImage = async (url: string, filepath: string) => { - console.info(`Attempting image download: ${url}`); +export const downloadFile = async (url: string, filepath: string) => { + console.info(`Attempting file download: ${url}`); return new Promise((resolve, reject) => { client.get(url, (res) => { if (res.statusCode === 200) { @@ -19,7 +19,7 @@ export const downloadImage = async (url: string, filepath: string) => { .pipe(fs.createWriteStream(filepath)) .on("error", reject) .once("close", () => resolve(filepath)); - console.info(`Downloaded image ${filepath}`); + console.info(`Downloaded file ${filepath}`); } else { res.resume(); reject( @@ -31,22 +31,22 @@ export const downloadImage = async (url: string, filepath: string) => { }; /** - * Get an image name from a url - * @param imageUrl The image url + * Get a file from a url + * @param fileUrl The file url * @returns */ -export const getImageName = (imageUrl: string) => { - const parsed = new URL(imageUrl); +export const getFileName = (fileUrl: string) => { + const parsed = new URL(fileUrl); return path.basename(parsed.pathname); }; /** - * Get the extension of an image from a url link. - * @param imageUrl The image url + * Get the extension of a file from a url link. + * @param fileUrl The file url * @returns */ -export const getImageExtension = (imageUrl: string) => { - return imageUrl.split(/[#?]/)[0].split(".").pop().trim(); +export const getFileExtension = (fileUrl: string) => { + return fileUrl.split(/[#?]/)[0].split(".").pop().trim(); }; /** diff --git a/src/utils/mediawiki/contents/templates/__tests__/__snapshots__/listen.test.ts.snap b/src/utils/mediawiki/contents/templates/__tests__/__snapshots__/listen.test.ts.snap new file mode 100644 index 0000000..5f66095 --- /dev/null +++ b/src/utils/mediawiki/contents/templates/__tests__/__snapshots__/listen.test.ts.snap @@ -0,0 +1,65 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`ListenTemplate it should render with only a file 1`] = ` +MediaWikiTemplate { + "name": "Listen", + "params": Array [ + Object { + "key": "filename", + "value": "test.mp3", + }, + ], +} +`; + +exports[`ListenTemplate it should render with options: {"align":"left","title":"test title"} 1`] = ` +MediaWikiTemplate { + "name": "Listen", + "params": Array [ + Object { + "key": "filename", + "value": "test.mp3", + }, + Object { + "key": "align", + "value": "left", + }, + Object { + "key": "title", + "value": "test title", + }, + ], +} +`; + +exports[`ListenTemplate it should render with options: {"align":"left"} 1`] = ` +MediaWikiTemplate { + "name": "Listen", + "params": Array [ + Object { + "key": "filename", + "value": "test.mp3", + }, + Object { + "key": "align", + "value": "left", + }, + ], +} +`; + +exports[`ListenTemplate it should render with options: {"title":"test title"} 1`] = ` +MediaWikiTemplate { + "name": "Listen", + "params": Array [ + Object { + "key": "filename", + "value": "test.mp3", + }, + Object { + "key": "title", + "value": "test title", + }, + ], +} +`; diff --git a/src/utils/mediawiki/contents/templates/__tests__/listen.test.ts b/src/utils/mediawiki/contents/templates/__tests__/listen.test.ts new file mode 100644 index 0000000..756b3ee --- /dev/null +++ b/src/utils/mediawiki/contents/templates/__tests__/listen.test.ts @@ -0,0 +1,15 @@ +import ListenTemplate, { ListeTemplateOptions } from "../listen"; + +describe("ListenTemplate", () => { + test("it should render with only a file", () => { + expect(new ListenTemplate("test.mp3").build()).toMatchSnapshot(); + }); + + test.each([ + { align: "left" }, + { title: "test title" }, + { align: "left", title: "test title" }, + ])("it should render with options: %j", (options) => { + expect(new ListenTemplate("test.mp3", options).build()).toMatchSnapshot(); + }); +}); diff --git a/src/utils/mediawiki/contents/templates/index.ts b/src/utils/mediawiki/contents/templates/index.ts index 398abe7..7fe41b6 100644 --- a/src/utils/mediawiki/contents/templates/index.ts +++ b/src/utils/mediawiki/contents/templates/index.ts @@ -1,8 +1,10 @@ export { default as CollapedSectionTemplate } from "./collapsedSection"; +export { default as ListenTemplate } from "./listen"; export { default as NewsPollTemplate } from "./newsPoll"; export { default as PollTemplate } from "./poll"; export { default as PollNoticeTemplate } from "./pollNotice"; export { default as PollWrapperTemplate } from "./pollWrapper"; export { default as UpdateTemplate } from "./update"; +export * from "./listen"; export * from "./poll"; diff --git a/src/utils/mediawiki/contents/templates/listen.ts b/src/utils/mediawiki/contents/templates/listen.ts new file mode 100644 index 0000000..8df9423 --- /dev/null +++ b/src/utils/mediawiki/contents/templates/listen.ts @@ -0,0 +1,36 @@ +import { Template } from "./types"; +import MediaWikiTemplate from "../template"; + +export type ListenAlignment = "left" | "right" | "center"; + +export type ListeTemplateOptions = { + align?: ListenAlignment; + title?: string; +}; + +class ListenTemplate extends Template { + align?: ListenAlignment; + fileName: string; + title?: string; + + constructor(fileName: string, options?: ListeTemplateOptions) { + super("Listen"); + this.fileName = fileName; + this.align = options?.align; + this.title = options?.title; + } + + build() { + const listenTemplate = new MediaWikiTemplate(this.name); + listenTemplate.add("filename", this.fileName); + if (this.align) { + listenTemplate.add("align", this.align); + } + if (this.title) { + listenTemplate.add("title", this.title); + } + return listenTemplate; + } +} + +export default ListenTemplate;