Skip to content

Commit

Permalink
[#47] Add support for parsing audio files in news posts (#49)
Browse files Browse the repository at this point in the history
* [#47] Add support for parsing audio files in news posts

* Rename image utils to file utils
  • Loading branch information
allenkinzalow authored Sep 22, 2023
1 parent 4185a05 commit c7dee3c
Show file tree
Hide file tree
Showing 14 changed files with 194 additions and 36 deletions.
5 changes: 5 additions & 0 deletions .changeset/poor-planets-pay.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"osrs-web-scraper": minor
---

Add support for parsing audio files in news posts
2 changes: 1 addition & 1 deletion src/scrapers/news/news.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ import {
NewsHeaderTransformer,
NewsImageCaptionTransformer,
} from "./transformers";
import { formatFileName } from "../../utils/images";
import { formatFileName } from "../../utils/file";
import { MediaWikiBuilder } from "../../utils/mediawiki";
import { ScrapingService } from "../types";

Expand Down
10 changes: 5 additions & 5 deletions src/scrapers/news/sections/newsContent/newsContent.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@ import { parse } from "node-html-parser";

import { nodeParser } from "./nodes";
import {
downloadImage,
downloadFile,
formatFileName,
getImageExtension,
} from "../../../../utils/images";
getFileExtension,
} from "../../../../utils/file";
import { MediaWikiContent } from "../../../../utils/mediawiki";
import { getNodeTagName } from "../../../../utils/nodes";
import { NewsSection } from "../types";
Expand Down Expand Up @@ -48,9 +48,9 @@ const newsContent: NewsSection = {

const imageName = `${formattedTitle} (${++downloadedImages})`;
downloadQueue.push(
downloadImage(
downloadFile(
imageLink,
`${imageDirectory}/${imageName}.${getImageExtension(imageLink)}`
`${imageDirectory}/${imageName}.${getFileExtension(imageLink)}`
)
);
}
Expand Down
39 changes: 39 additions & 0 deletions src/scrapers/news/sections/newsContent/nodes/audio.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
import fs from "fs";
import { HTMLElement } from "node-html-parser";

import {
downloadFile,
formatFileName,
getFileExtension,
} from "../../../../../utils/file";
import {
ListenTemplate,
MediaWikiComment,
} from "../../../../../utils/mediawiki";
import { ContentNodeParser } from "../types";

/**
 * Parse an `<audio>` node into a Listen template and start downloading the
 * audio file it references.
 *
 * The audio url is read from the `src` attribute of the node's first child
 * element (presumably the `<source>` tag — confirm against the scraped
 * markup). The file is written to
 * `./out/news/<formatted title>/<formatted title> narration.<extension>`.
 *
 * @param node The node to parse
 * @param options Parser options; `title` names the output directory and file
 * @returns The built Listen template, or an "Invalid audio node" comment when
 * the node has no first child element or that element has no `src`
 */
export const audioParser: ContentNodeParser = (node, { title }) => {
  const source = node instanceof HTMLElement ? node.firstChild : undefined;
  const audioLink =
    source instanceof HTMLElement ? source.attributes.src : undefined;

  // Without a url there is nothing to download or reference, and the
  // extension lookup below would throw on an undefined link.
  if (!audioLink) {
    return new MediaWikiComment("Invalid audio node");
  }

  const formattedTitle = formatFileName(title as string);
  const audioDirectory = `./out/news/${formattedTitle}`;
  if (!fs.existsSync(audioDirectory)) {
    fs.mkdirSync(audioDirectory, { recursive: true });
  }

  const audioExtension = getFileExtension(audioLink);
  const outputFileName = `${formattedTitle} narration.${audioExtension}`;

  // The download is intentionally not awaited (the parser is synchronous), so
  // failures are logged here instead of surfacing as an unhandled rejection.
  downloadFile(audioLink, `${audioDirectory}/${outputFileName}`).catch(
    (error: unknown) => {
      console.error(`Failed to download audio file: ${audioLink}`, error);
    }
  );

  return new ListenTemplate(outputFileName, {
    align: "center",
    title: "Audio reading",
  }).build();
};

export default audioParser;
7 changes: 2 additions & 5 deletions src/scrapers/news/sections/newsContent/nodes/div/gallery.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import fs from "fs";
import { HTMLElement } from "node-html-parser";

import {
formatFileName,
getImageExtension,
} from "../../../../../../utils/images";
import { formatFileName, getFileExtension } from "../../../../../../utils/file";
import {
MediaWikiHTML,
MediaWikiText,
Expand All @@ -27,7 +24,7 @@ export const galleryParser: ContentNodeParser = (node, options) => {
}

const imageName = `${formattedTitle} (${++ContentContext.imageCount})`;
const imageExtension = getImageExtension(imageLink);
const imageExtension = getFileExtension(imageLink);

return new MediaWikiText(`\n${imageName}.${imageExtension}`);
});
Expand Down
4 changes: 2 additions & 2 deletions src/scrapers/news/sections/newsContent/nodes/image.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import fs from "fs";
import sizeOf from "image-size";
import { HTMLElement } from "node-html-parser";

import { formatFileName, getImageExtension } from "../../../../../utils/images";
import { formatFileName, getFileExtension } from "../../../../../utils/file";
import {
MediaWikiBreak,
MediaWikiComment,
Expand Down Expand Up @@ -32,7 +32,7 @@ export const imageParser: ContentNodeParser = (node, { title, center }) => {
}

const imageName = `${formattedTitle} (${++ContentContext.imageCount})`;
const imageExtension = getImageExtension(imageLink);
const imageExtension = getFileExtension(imageLink);
const dimensions = sizeOf(
`${imageDirectory}/${imageName}.${imageExtension}`
);
Expand Down
2 changes: 2 additions & 0 deletions src/scrapers/news/sections/newsContent/nodes/parser.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import audioParser from "./audio";
import boldParser from "./bold";
import breakParser from "./break";
import centerParser from "./center";
Expand All @@ -20,6 +21,7 @@ const ignoredTags = ["script"];

const nodeParserMap: { [key: string]: ContentNodeParser } = {
a: linkParser,
audio: audioParser,
b: boldParser,
details: detailsParser,
div: divParser,
Expand Down
13 changes: 5 additions & 8 deletions src/scrapers/news/sections/newsHeader/newsHeader.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,10 @@ import {
getNewsUrlIdentifier,
} from "./newsHeader.utils";
import {
downloadImage,
downloadFile,
formatFileName,
getImageExtension,
} from "../../../../utils/images";
getFileExtension,
} from "../../../../utils/file";
import {
MediaWikiBreak,
MediaWikiContent,
Expand Down Expand Up @@ -42,13 +42,10 @@ const newsHeader: NewsSection = {
fs.mkdirSync(newsDirectory, { recursive: true });
}

const newspostImageName = `${formattedTitle} newspost.${getImageExtension(
const newspostImageName = `${formattedTitle} newspost.${getFileExtension(
image.attributes.src
)}`;
downloadImage(
image.attributes.src,
`${newsDirectory}/${newspostImageName}`
);
downloadFile(image.attributes.src, `${newsDirectory}/${newspostImageName}`);

const content: MediaWikiContent[] = [];

Expand Down
2 changes: 1 addition & 1 deletion src/scrapers/polls/polls.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import fs from "fs";
import parse from "node-html-parser";

import { pollHeader, pollQuestions } from "./sections";
import { formatFileName } from "../../utils/images";
import { formatFileName } from "../../utils/file";
import {
MediaWikiBuilder,
MediaWikiTemplate,
Expand Down
28 changes: 14 additions & 14 deletions src/utils/images.ts → src/utils/file.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,21 +5,21 @@ import path from "path";
import { formatText } from "./text";

/**
* Download an image from a url
* @param url The url of the image to download
* @param filepath The filepath of the downloaded image
* Download a file from a url
* @param url The url of the file to download
* @param filepath The filepath of the downloaded file
* @returns
*/
export const downloadImage = async (url: string, filepath: string) => {
console.info(`Attempting image download: ${url}`);
export const downloadFile = async (url: string, filepath: string) => {
console.info(`Attempting file download: ${url}`);
return new Promise((resolve, reject) => {
client.get(url, (res) => {
if (res.statusCode === 200) {
res
.pipe(fs.createWriteStream(filepath))
.on("error", reject)
.once("close", () => resolve(filepath));
console.info(`Downloaded image ${filepath}`);
console.info(`Downloaded file ${filepath}`);
} else {
res.resume();
reject(
Expand All @@ -31,22 +31,22 @@ export const downloadImage = async (url: string, filepath: string) => {
};

/**
* Get an image name from a url
* @param imageUrl The image url
* Get a file name from a url
* @param fileUrl The file url
* @returns
*/
export const getImageName = (imageUrl: string) => {
const parsed = new URL(imageUrl);
export const getFileName = (fileUrl: string) => {
  // Only the path portion matters; the query string and fragment are dropped
  // by reading `pathname` from the parsed url.
  const { pathname } = new URL(fileUrl);
  // The final path segment is the file name.
  return path.basename(pathname);
};

/**
* Get the extension of an image from a url link.
* @param imageUrl The image url
* Get the extension of a file from a url link.
* @param fileUrl The file url
* @returns
*/
export const getImageExtension = (imageUrl: string) => {
return imageUrl.split(/[#?]/)[0].split(".").pop().trim();
/**
 * Get the extension of a file from a url link.
 *
 * The query string and fragment are ignored, and only the last path segment
 * is inspected, so dots in the host name or in parent directories are never
 * mistaken for an extension separator.
 *
 * @param fileUrl The file url
 * @returns The extension without the leading dot, or an empty string when the
 * last path segment contains no dot
 */
export const getFileExtension = (fileUrl: string) => {
  // Drop everything from the first `?` or `#` onwards.
  const [withoutQuery = ""] = fileUrl.split(/[#?]/);
  // Restrict the search to the final path segment.
  const lastSegment = withoutQuery.slice(withoutQuery.lastIndexOf("/") + 1);
  const dotIndex = lastSegment.lastIndexOf(".");
  if (dotIndex === -1) {
    return "";
  }
  return lastSegment.slice(dotIndex + 1).trim();
};

/**
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP

exports[`ListenTemplate it should render with only a file 1`] = `
MediaWikiTemplate {
"name": "Listen",
"params": Array [
Object {
"key": "filename",
"value": "test.mp3",
},
],
}
`;

exports[`ListenTemplate it should render with options: {"align":"left","title":"test title"} 1`] = `
MediaWikiTemplate {
"name": "Listen",
"params": Array [
Object {
"key": "filename",
"value": "test.mp3",
},
Object {
"key": "align",
"value": "left",
},
Object {
"key": "title",
"value": "test title",
},
],
}
`;

exports[`ListenTemplate it should render with options: {"align":"left"} 1`] = `
MediaWikiTemplate {
"name": "Listen",
"params": Array [
Object {
"key": "filename",
"value": "test.mp3",
},
Object {
"key": "align",
"value": "left",
},
],
}
`;

exports[`ListenTemplate it should render with options: {"title":"test title"} 1`] = `
MediaWikiTemplate {
"name": "Listen",
"params": Array [
Object {
"key": "filename",
"value": "test.mp3",
},
Object {
"key": "title",
"value": "test title",
},
],
}
`;
15 changes: 15 additions & 0 deletions src/utils/mediawiki/contents/templates/__tests__/listen.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import ListenTemplate, { ListeTemplateOptions } from "../listen";

// Snapshot tests for the MediaWikiTemplate produced by ListenTemplate.build().
describe("ListenTemplate", () => {
// Baseline: no options, only the file name is passed to the constructor.
test("it should render with only a file", () => {
expect(new ListenTemplate("test.mp3").build()).toMatchSnapshot();
});

// One case per combination of the optional `align` and `title` options;
// `%j` puts the JSON form of the options object into each test title.
test.each<ListeTemplateOptions>([
{ align: "left" },
{ title: "test title" },
{ align: "left", title: "test title" },
])("it should render with options: %j", (options) => {
expect(new ListenTemplate("test.mp3", options).build()).toMatchSnapshot();
});
});
2 changes: 2 additions & 0 deletions src/utils/mediawiki/contents/templates/index.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
export { default as CollapedSectionTemplate } from "./collapsedSection";
export { default as ListenTemplate } from "./listen";
export { default as NewsPollTemplate } from "./newsPoll";
export { default as PollTemplate } from "./poll";
export { default as PollNoticeTemplate } from "./pollNotice";
export { default as PollWrapperTemplate } from "./pollWrapper";
export { default as UpdateTemplate } from "./update";

export * from "./listen";
export * from "./poll";
36 changes: 36 additions & 0 deletions src/utils/mediawiki/contents/templates/listen.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import { Template } from "./types";
import MediaWikiTemplate from "../template";

/** Values accepted for the Listen template's `align` parameter. */
export type ListenAlignment = "left" | "right" | "center";

/**
 * Optional settings accepted by the ListenTemplate constructor.
 * NOTE(review): the name looks like a typo for `ListenTemplateOptions`;
 * renaming it would change the exported API, so confirm with importers first.
 */
export type ListeTemplateOptions = {
// Passed through as the template's `align` parameter when set.
align?: ListenAlignment;
// Passed through as the template's `title` parameter when set.
title?: string;
};

/**
 * Builder for the "Listen" MediaWiki template.
 *
 * The template always carries a `filename` parameter; `align` and `title`
 * are appended, in that order, only when they hold a truthy value.
 */
class ListenTemplate extends Template {
  align?: ListenAlignment;
  fileName: string;
  title?: string;

  /**
   * @param fileName Value emitted as the template's `filename` parameter
   * @param options Optional `align` and `title` values
   */
  constructor(fileName: string, options?: ListeTemplateOptions) {
    super("Listen");
    const { align, title } = options ?? {};
    this.fileName = fileName;
    this.align = align;
    this.title = title;
  }

  /**
   * Assemble the template contents.
   * @returns A MediaWikiTemplate named after this template with its parameters added
   */
  build() {
    const template = new MediaWikiTemplate(this.name);
    template.add("filename", this.fileName);

    // Optional parameters, listed in the order they must appear in the output.
    const optionalParams: Array<[string, string | undefined]> = [
      ["align", this.align],
      ["title", this.title],
    ];
    for (const [key, value] of optionalParams) {
      if (value) {
        template.add(key, value);
      }
    }

    return template;
  }
}

export default ListenTemplate;

0 comments on commit c7dee3c

Please sign in to comment.