From bc845dd847531247d0c4151139fa74d313289eda Mon Sep 17 00:00:00 2001 From: Allen Kinzalow Date: Fri, 19 Jul 2024 21:55:50 -0400 Subject: [PATCH] [#125] Add support for scraping the world list (#126) * [#125] Add support for scraping the world list * workflow node versions * README --- .changeset/curly-pumpkins-deny.md | 5 +++ .changeset/giant-seals-fly.md | 5 +++ .changeset/olive-ghosts-count.md | 5 +++ .changeset/poor-rules-sleep.md | 5 +++ .github/workflows/pull-request.yml | 2 +- .github/workflows/push-main.yml | 6 +-- .github/workflows/workflow-dispatch.yml | 37 +++++++-------- README.md | 25 +++++++---- package.json | 3 +- src/index.ts | 39 ++++++++++++---- src/scrapers/index.ts | 1 + .../__snapshots__/worlds.utils.test.ts.snap | 6 +++ .../worlds/__tests__/worlds.utils.test.ts | 15 +++++++ src/scrapers/worlds/index.ts | 3 ++ src/scrapers/worlds/worlds.ts | 45 +++++++++++++++++++ src/scrapers/worlds/worlds.utils.ts | 31 +++++++++++++ tsconfig.json | 1 + yarn.lock | 13 ++++-- 18 files changed, 201 insertions(+), 46 deletions(-) create mode 100644 .changeset/curly-pumpkins-deny.md create mode 100644 .changeset/giant-seals-fly.md create mode 100644 .changeset/olive-ghosts-count.md create mode 100644 .changeset/poor-rules-sleep.md create mode 100644 src/scrapers/worlds/__tests__/__snapshots__/worlds.utils.test.ts.snap create mode 100644 src/scrapers/worlds/__tests__/worlds.utils.test.ts create mode 100644 src/scrapers/worlds/index.ts create mode 100644 src/scrapers/worlds/worlds.ts create mode 100644 src/scrapers/worlds/worlds.utils.ts diff --git a/.changeset/curly-pumpkins-deny.md b/.changeset/curly-pumpkins-deny.md new file mode 100644 index 0000000..a549b10 --- /dev/null +++ b/.changeset/curly-pumpkins-deny.md @@ -0,0 +1,5 @@ +--- +"osrs-web-scraper": minor +--- + +Update workflow dispatch to allow choosing which task to run diff --git a/.changeset/giant-seals-fly.md b/.changeset/giant-seals-fly.md new file mode 100644 index 0000000..1226d1d --- /dev/null +++ 
b/.changeset/giant-seals-fly.md @@ -0,0 +1,5 @@ +--- +"osrs-web-scraper": patch +--- + +Update all workflows to use Node 21 diff --git a/.changeset/olive-ghosts-count.md b/.changeset/olive-ghosts-count.md new file mode 100644 index 0000000..96fa3a8 --- /dev/null +++ b/.changeset/olive-ghosts-count.md @@ -0,0 +1,5 @@ +--- +"osrs-web-scraper": minor +--- + +Add commander for cli parsing diff --git a/.changeset/poor-rules-sleep.md b/.changeset/poor-rules-sleep.md new file mode 100644 index 0000000..fb58793 --- /dev/null +++ b/.changeset/poor-rules-sleep.md @@ -0,0 +1,5 @@ +--- +"osrs-web-scraper": minor +--- + +Add world list scraping diff --git a/.github/workflows/pull-request.yml b/.github/workflows/pull-request.yml index a9ce10b..0068db2 100644 --- a/.github/workflows/pull-request.yml +++ b/.github/workflows/pull-request.yml @@ -34,7 +34,7 @@ jobs: with: always-auth: true cache: yarn - node-version: 16 + node-version: 21 - name: Install yarn packages run: yarn install --frozen-lockfile diff --git a/.github/workflows/push-main.yml b/.github/workflows/push-main.yml index 5e96ecd..f6a1401 100644 --- a/.github/workflows/push-main.yml +++ b/.github/workflows/push-main.yml @@ -18,7 +18,7 @@ jobs: uses: actions/setup-node@v3 with: cache: yarn - node-version: 16.x + node-version: 21.x - name: Install dependencies run: yarn install --frozen-lockfile @@ -41,12 +41,12 @@ jobs: with: fetch-depth: 0 - - name: Use Node.js 16 + - name: Use Node.js uses: actions/setup-node@v3 with: always-auth: true cache: yarn - node-version: 16 + node-version: 21 - name: Install yarn packages run: yarn install --frozen-lockfile diff --git a/.github/workflows/workflow-dispatch.yml b/.github/workflows/workflow-dispatch.yml index ed0426a..b81e7aa 100644 --- a/.github/workflows/workflow-dispatch.yml +++ b/.github/workflows/workflow-dispatch.yml @@ -3,12 +3,17 @@ name: Run News Scraper on: workflow_dispatch: inputs: - newsLink: - description: "News Link" - required: false - - pollLink: - description: 
"Poll Link" + task: + description: "Task to run" + required: true + default: "news" + type: choice + options: + - news + - poll + - worlds + link: + description: "Link" required: false jobs: @@ -22,7 +27,7 @@ jobs: uses: actions/setup-node@v3 with: cache: yarn - node-version: 16.x + node-version: 21.x - name: Install dependencies run: yarn install --frozen-lockfile @@ -30,19 +35,11 @@ jobs: - name: Build run: yarn build - - name: Scrape News Post - run: export NEWS_LINK=${{ github.event.inputs.newsLink }} && export POLL_LINK=${{ github.event.inputs.pollLink }} && yarn start:node - - - name: Upload News Results - if: "${{ github.event.inputs.newsLink != '' }}" - uses: actions/upload-artifact@v3 - with: - name: newspost-${{ github.run_id }}-${{ github.run_attempt }} - path: out/news/ + - name: OSRS Web Scraper + run: yarn start:node ${{ github.event.inputs.task }} "${{ github.event.inputs.link }}" - - name: Upload Poll Results - if: "${{ github.event.inputs.pollLink != '' }}" + - name: Upload Output uses: actions/upload-artifact@v3 with: - name: poll-${{ github.run_id }}-${{ github.run_attempt }} - path: out/polls/ + name: ${{ github.event.inputs.task }}-${{ github.run_id }}-${{ github.run_attempt }} + path: out/${{ github.event.inputs.task }}/ diff --git a/README.md b/README.md index 0e4ca5c..4c2c96b 100644 --- a/README.md +++ b/README.md @@ -1,28 +1,37 @@ # osrs-web-scraper -Scrape information from the Oldschool Runescape website. + +Scrape information from the Oldschool Runescape website and convert it to MediaWiki format. ## Setup ### Install dependencies + ``` yarn install ``` -### Setup environment -``` -NODE_ENV=development -NEWS_LINK=https://secure.runescape.com/m=news/a=1/desert-treasure-ii---the-fallen-empire?oldschool=1 -``` +### Usage -### Run the scraper ``` -yarn start +Usage: OSRS Web Scraper [options] [command] + +Options: + -V, --version output the version number + -h, --help display help for command + +Commands: + news Scrape an OSRS news posts. 
+ poll Scrape an OSRS poll. + worlds Scrape the OSRS world list. + help [command] display help for command ``` ## Releasing ### Create a changeset + Create a changeset file by running the following command: + ``` yarn changeset ``` diff --git a/package.json b/package.json index e0ed08b..d5d90e0 100644 --- a/package.json +++ b/package.json @@ -26,7 +26,8 @@ }, "homepage": "https://github.com/allenkinzalow/osrs-web-scraper#readme", "dependencies": { - "@osrs-wiki/mediawiki-builder": "^1.3.1", + "@osrs-wiki/mediawiki-builder": "^1.4.0", + "commander": "^12.1.0", "date-fns": "^2.30.0", "dotenv": "^16.0.3", "image-size": "^1.1.1", diff --git a/src/index.ts b/src/index.ts index e530bc7..8a7c50d 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,17 +1,38 @@ import config from "@config"; +import { Command } from "commander"; import scraper from "./scraper"; -import { news, polls } from "./scrapers"; +import { news, polls, worlds } from "./scrapers"; +import { WORLD_LIST_URL } from "./scrapers/worlds/worlds.utils"; +import packageJson from "../package.json"; console.log(`Running ${config.environment}`); -const newsLink = process.env.NEWS_LINK; -const pollLink = process.env.POLL_LINK; +const program = new Command(); -if (newsLink) { - scraper.scrape(newsLink, news); -} +program.name("OSRS Web Scraper").description("").version(packageJson.version); -if (pollLink) { - scraper.scrape(pollLink, polls); -} +program + .command("news") + .description("Scrape an OSRS news posts.") + .argument("", "The news post to scrape.") + .action((newsLink) => { + scraper.scrape(newsLink, news); + }); + +program + .command("poll") + .description("Scrape an OSRS poll.") + .argument("", "The poll to scrape.") + .action((pollLink) => { + scraper.scrape(pollLink, polls); + }); + +program + .command("worlds") + .description("Scrape the OSRS world list.") + .action(() => { + scraper.scrape(WORLD_LIST_URL, worlds); + }); + +program.parse(); diff --git a/src/scrapers/index.ts b/src/scrapers/index.ts 
index 06b779e..14c22d6 100644 --- a/src/scrapers/index.ts +++ b/src/scrapers/index.ts @@ -1,4 +1,5 @@ export { default as news } from "./news"; export { default as polls } from "./polls"; +export { default as worlds } from "./worlds"; export * from "./types"; diff --git a/src/scrapers/worlds/__tests__/__snapshots__/worlds.utils.test.ts.snap b/src/scrapers/worlds/__tests__/__snapshots__/worlds.utils.test.ts.snap new file mode 100644 index 0000000..739a47b --- /dev/null +++ b/src/scrapers/worlds/__tests__/__snapshots__/worlds.utils.test.ts.snap @@ -0,0 +1,6 @@ +// Jest Snapshot v1, https://goo.gl/fbAQLP + +exports[`world scraper utils getWorldLines 1`] = ` +"{{WorldLine|101|United States|mems=deadman|111-126 Deadman}} +" +`; diff --git a/src/scrapers/worlds/__tests__/worlds.utils.test.ts b/src/scrapers/worlds/__tests__/worlds.utils.test.ts new file mode 100644 index 0000000..78fbb51 --- /dev/null +++ b/src/scrapers/worlds/__tests__/worlds.utils.test.ts @@ -0,0 +1,15 @@ +import { MediaWikiBuilder } from "@osrs-wiki/mediawiki-builder"; +import parse from "node-html-parser"; + +import { getWorldLines } from "../worlds.utils"; + +describe("world scraper utils", () => { + test("getWorldLines", () => { + const worldNodes = parse( + '\nOldSchool 1010 playersUnited StatesMembers111-126 Deadman' + ); + const builder = new MediaWikiBuilder(); + builder.addContents(getWorldLines(worldNodes)); + expect(builder.build()).toMatchSnapshot(); + }); +}); diff --git a/src/scrapers/worlds/index.ts b/src/scrapers/worlds/index.ts new file mode 100644 index 0000000..de720bb --- /dev/null +++ b/src/scrapers/worlds/index.ts @@ -0,0 +1,3 @@ +import worlds from "./worlds"; + +export default worlds; diff --git a/src/scrapers/worlds/worlds.ts b/src/scrapers/worlds/worlds.ts new file mode 100644 index 0000000..3048b9f --- /dev/null +++ b/src/scrapers/worlds/worlds.ts @@ -0,0 +1,45 @@ +import { MediaWikiBuilder } from "@osrs-wiki/mediawiki-builder"; +import fs from "fs"; +import parse from 
"node-html-parser"; + +import { getWorldLines } from "./worlds.utils"; +import { ScrapingService } from "../types"; + +const worlds: ScrapingService = { + scrape: async (page): Promise => { + try { + const results = await page.evaluate(() => { + // eslint-disable-next-line @typescript-eslint/ban-ts-comment + // @ts-ignore Ignore window typing + const $ = window.$; + const worldRows = $(".server-list__body").html(); + + return { + worldRows, + }; + }); + + const worldNodes = parse(results.worldRows); + + const builder = new MediaWikiBuilder(); + builder.addContents(getWorldLines(worldNodes)); + + console.info("Writing world list results to file..."); + try { + if (!fs.existsSync("out/worlds")) { + fs.mkdirSync("out/worlds", { recursive: true }); + } + await fs.writeFileSync(`out/worlds/worlds.txt`, builder.build()); + console.info("Successfully created worlds file"); + } catch (err) { + console.error(err); + } + + return builder; + } catch (error) { + console.error(error); + } + }, +}; + +export default worlds; diff --git a/src/scrapers/worlds/worlds.utils.ts b/src/scrapers/worlds/worlds.utils.ts new file mode 100644 index 0000000..b1235ee --- /dev/null +++ b/src/scrapers/worlds/worlds.utils.ts @@ -0,0 +1,31 @@ +import { MediaWikiTemplate } from "@osrs-wiki/mediawiki-builder"; +import { HTMLElement } from "node-html-parser"; + +export const WORLD_LIST_URL = "https://oldschool.runescape.com/a=13/slu"; + +export const getWorldLines = (worldRows: HTMLElement) => { + const worldRowNodes = worldRows.childNodes.filter( + (node) => node instanceof HTMLElement && node.tagName === "TR" + ); + const worldLines = worldRowNodes.map((node) => { + const tdNodes = node.childNodes.filter( + (node) => node instanceof HTMLElement && node.tagName === "TD" + ); + const worldLine = new MediaWikiTemplate("WorldLine", { collapsed: true }); + const worldNumber = + tdNodes[0].childNodes?.[1].textContent?.replaceAll(/^\D+/g, "") ?? 
""; + const region = tdNodes[2].textContent; + const activity = tdNodes[4].textContent; + const members = activity.includes("Deadman") + ? "deadman" + : tdNodes[3].textContent === "Members" + ? "yes" + : "no"; + worldLine.add("", worldNumber); + worldLine.add("", region); + worldLine.add("mems", members); + worldLine.add("", activity); + return worldLine; + }); + return worldLines; +}; diff --git a/tsconfig.json b/tsconfig.json index 59ed849..fad877b 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -6,6 +6,7 @@ "noImplicitAny": true, "moduleResolution": "node", "sourceMap": true, + "resolveJsonModule": true, "outDir": "dist", "baseUrl": "./", "paths": { diff --git a/yarn.lock b/yarn.lock index 98cbb5d..b4b830f 100644 --- a/yarn.lock +++ b/yarn.lock @@ -929,10 +929,10 @@ "@nodelib/fs.scandir" "2.1.5" fastq "^1.6.0" -"@osrs-wiki/mediawiki-builder@^1.3.1": - version "1.3.1" - resolved "https://registry.yarnpkg.com/@osrs-wiki/mediawiki-builder/-/mediawiki-builder-1.3.1.tgz#1337ba8783b3bad81cab5a4530afe2dcb1bcd026" - integrity sha512-TLEIGA2UM9C7Rxp0ol1/UyBU8xZBUC0ESdLdSIG6IbFpLeu93LnGOq/TZk1LjP00I0v944nNXA9XgAEXdDh9Yg== +"@osrs-wiki/mediawiki-builder@^1.4.0": + version "1.4.0" + resolved "https://registry.yarnpkg.com/@osrs-wiki/mediawiki-builder/-/mediawiki-builder-1.4.0.tgz#9293d1db16c4e6d4cb758bd55c2d44f5de86d000" + integrity sha512-40yguMcZRvrMykDtcFWts2j2fnf3Ib2nDPhv7yGiIiOQ6gVMsJr68EmGAQTWFbV0CEI+vsERuuQhC/0DIqmZEQ== dependencies: tslib "^2.6.2" @@ -1814,6 +1814,11 @@ colorette@^2.0.16: resolved "https://registry.yarnpkg.com/colorette/-/colorette-2.0.16.tgz#713b9af84fdb000139f04546bd4a93f62a5085da" integrity sha512-hUewv7oMjCp+wkBv5Rm0v87eJhq4woh5rSR+42YSQJKecCqgIqNkZ6lAlQms/BwHPJA5NKMRlpxPRv0n8HQW6g== +commander@^12.1.0: + version "12.1.0" + resolved "https://registry.yarnpkg.com/commander/-/commander-12.1.0.tgz#01423b36f501259fdaac4d0e4d60c96c991585d3" + integrity sha512-Vw8qHK3bZM9y/P10u3Vib8o/DdkvA2OtPtZvD871QKjy74Wj1WSKFILMPRPSdUSx5RFK1arlJzEtA4PkFgnbuA== + 
commander@^8.3.0: version "8.3.0" resolved "https://registry.yarnpkg.com/commander/-/commander-8.3.0.tgz#4837ea1b2da67b9c616a67afbb0fafee567bca66"