Skip to content

Commit

Permalink
[#125] Add support for scraping the world list (#126)
Browse files Browse the repository at this point in the history
* [#125] Add support for scraping the world list

* workflow node versions

* README
  • Loading branch information
allenkinzalow authored Jul 20, 2024
1 parent c3f94ed commit bc845dd
Show file tree
Hide file tree
Showing 18 changed files with 201 additions and 46 deletions.
5 changes: 5 additions & 0 deletions .changeset/curly-pumpkins-deny.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"osrs-web-scraper": minor
---

Update workflow dispatch to select the task to run from a list of choices
5 changes: 5 additions & 0 deletions .changeset/giant-seals-fly.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"osrs-web-scraper": patch
---

Update all workflows to use Node 21
5 changes: 5 additions & 0 deletions .changeset/olive-ghosts-count.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"osrs-web-scraper": minor
---

Add commander for cli parsing
5 changes: 5 additions & 0 deletions .changeset/poor-rules-sleep.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"osrs-web-scraper": minor
---

Add world list scraping
2 changes: 1 addition & 1 deletion .github/workflows/pull-request.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ jobs:
with:
always-auth: true
cache: yarn
node-version: 16
node-version: 21

- name: Install yarn packages
run: yarn install --frozen-lockfile
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/push-main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ jobs:
uses: actions/setup-node@v3
with:
cache: yarn
node-version: 16.x
node-version: 21.x

- name: Install dependencies
run: yarn install --frozen-lockfile
Expand All @@ -41,12 +41,12 @@ jobs:
with:
fetch-depth: 0

- name: Use Node.js 16
- name: Use Node.js
uses: actions/setup-node@v3
with:
always-auth: true
cache: yarn
node-version: 16
node-version: 21

- name: Install yarn packages
run: yarn install --frozen-lockfile
Expand Down
37 changes: 17 additions & 20 deletions .github/workflows/workflow-dispatch.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,17 @@ name: Run News Scraper
on:
workflow_dispatch:
inputs:
newsLink:
description: "News Link"
required: false

pollLink:
description: "Poll Link"
task:
description: "Task to run"
required: true
default: "news"
type: choice
options:
- news
- poll
- worlds
link:
description: "Link"
required: false

jobs:
Expand All @@ -22,27 +27,19 @@ jobs:
uses: actions/setup-node@v3
with:
cache: yarn
node-version: 16.x
node-version: 21.x

- name: Install dependencies
run: yarn install --frozen-lockfile

- name: Build
run: yarn build

- name: Scrape News Post
run: export NEWS_LINK=${{ github.event.inputs.newsLink }} && export POLL_LINK=${{ github.event.inputs.pollLink }} && yarn start:node

- name: Upload News Results
if: "${{ github.event.inputs.newsLink != '' }}"
uses: actions/upload-artifact@v3
with:
name: newspost-${{ github.run_id }}-${{ github.run_attempt }}
path: out/news/
- name: OSRS Web Scraper
run: yarn start:node ${{ github.event.inputs.task }} "${{ github.event.inputs.link }}"

- name: Upload Poll Results
if: "${{ github.event.inputs.pollLink != '' }}"
- name: Upload Output
uses: actions/upload-artifact@v3
with:
name: poll-${{ github.run_id }}-${{ github.run_attempt }}
path: out/polls/
name: ${{ github.event.inputs.task }}-${{ github.run_id }}-${{ github.run_attempt }}
# The task name is "poll", but poll results were previously uploaded from out/polls/ (plural).
path: out/${{ github.event.inputs.task == 'poll' && 'polls' || github.event.inputs.task }}/
25 changes: 17 additions & 8 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,28 +1,37 @@
# osrs-web-scraper
Scrape information from the Oldschool Runescape website.

Scrape information from the Oldschool Runescape website and convert it to MediaWiki format.

## Setup

### Install dependencies

```
yarn install
```

### Setup environment
```
NODE_ENV=development
NEWS_LINK=https://secure.runescape.com/m=news/a=1/desert-treasure-ii---the-fallen-empire?oldschool=1
```
### Usage

### Run the scraper
```
yarn start
Usage: OSRS Web Scraper [options] [command]
Options:
-V, --version output the version number
-h, --help display help for command
Commands:
news <string> Scrape an OSRS news post.
poll <string> Scrape an OSRS poll.
worlds Scrape the OSRS world list.
help [command] display help for command
```

## Releasing

### Create a changeset

Create a changeset file by running the following command:

```
yarn changeset
```
Expand Down
3 changes: 2 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@
},
"homepage": "https://github.com/allenkinzalow/osrs-web-scraper#readme",
"dependencies": {
"@osrs-wiki/mediawiki-builder": "^1.3.1",
"@osrs-wiki/mediawiki-builder": "^1.4.0",
"commander": "^12.1.0",
"date-fns": "^2.30.0",
"dotenv": "^16.0.3",
"image-size": "^1.1.1",
Expand Down
39 changes: 30 additions & 9 deletions src/index.ts
Original file line number Diff line number Diff line change
@@ -1,17 +1,38 @@
import config from "@config";
import { Command } from "commander";

import scraper from "./scraper";
import { news, polls } from "./scrapers";
import { news, polls, worlds } from "./scrapers";
import { WORLD_LIST_URL } from "./scrapers/worlds/worlds.utils";
import packageJson from "../package.json";

console.log(`Running ${config.environment}`);

const newsLink = process.env.NEWS_LINK;
const pollLink = process.env.POLL_LINK;
const program = new Command();

if (newsLink) {
scraper.scrape(newsLink, news);
}
program.name("OSRS Web Scraper").description("").version(packageJson.version);

if (pollLink) {
scraper.scrape(pollLink, polls);
}
// `news <string>`: scrape the news post at the given link using the news
// scraping service.
program
  .command("news")
  .description("Scrape an OSRS news post.")
  .argument("<string>", "The news post to scrape.")
  .action((newsLink) => {
    scraper.scrape(newsLink, news);
  });

// `poll <string>`: scrape the poll at the given link using the polls scraping
// service.
program
  .command("poll")
  .description("Scrape an OSRS poll.")
  .argument("<string>", "The poll to scrape.")
  .action((pollLink) => {
    scraper.scrape(pollLink, polls);
  });

// `worlds`: takes no argument; always scrapes the fixed WORLD_LIST_URL using
// the worlds scraping service.
program
  .command("worlds")
  .description("Scrape the OSRS world list.")
  .action(() => {
    scraper.scrape(WORLD_LIST_URL, worlds);
  });

// Parse the command line and run the action of the matching command.
program.parse();
1 change: 1 addition & 0 deletions src/scrapers/index.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
export { default as news } from "./news";
export { default as polls } from "./polls";
export { default as worlds } from "./worlds";

export * from "./types";
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Jest Snapshot v1, https://goo.gl/fbAQLP

exports[`world scraper utils getWorldLines 1`] = `
"{{WorldLine|101|United States|mems=deadman|111-126 Deadman}}
"
`;
15 changes: 15 additions & 0 deletions src/scrapers/worlds/__tests__/worlds.utils.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import { MediaWikiBuilder } from "@osrs-wiki/mediawiki-builder";
import parse from "node-html-parser";

import { getWorldLines } from "../worlds.utils";

describe("world scraper utils", () => {
  test("getWorldLines", () => {
    // One table row: world link, player count, region, membership, activity.
    const rowHtml =
      '<tr><td>\n<a href="">OldSchool 101</a></td><td>0 players</td><td>United States</td><td>Members</td><td>111-126 Deadman</td></tr>';
    const wiki = new MediaWikiBuilder();

    wiki.addContents(getWorldLines(parse(rowHtml)));

    expect(wiki.build()).toMatchSnapshot();
  });
});
3 changes: 3 additions & 0 deletions src/scrapers/worlds/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
import worlds from "./worlds";

export default worlds;
45 changes: 45 additions & 0 deletions src/scrapers/worlds/worlds.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import { MediaWikiBuilder } from "@osrs-wiki/mediawiki-builder";
import fs from "fs";
import parse from "node-html-parser";

import { getWorldLines } from "./worlds.utils";
import { ScrapingService } from "../types";

/**
 * Scraping service for the OSRS world list.
 *
 * Reads the HTML of the `.server-list__body` element from the page, converts
 * its rows into `WorldLine` templates and writes the built output to
 * `out/worlds/worlds.txt`.
 */
const worlds: ScrapingService<MediaWikiBuilder> = {
  /**
   * @param page The page to read the world list from.
   * @returns The builder holding the world lines. If scraping fails, the error
   *   is logged and the promise resolves without a value.
   */
  scrape: async (page): Promise<MediaWikiBuilder> => {
    try {
      // Evaluated against the page; relies on the page exposing `window.$`.
      const results = await page.evaluate(() => {
        // eslint-disable-next-line @typescript-eslint/ban-ts-comment
        // @ts-ignore Ignore window typing
        const $ = window.$;
        const worldRows = $(".server-list__body").html();

        return {
          worldRows,
        };
      });

      const worldNodes = parse(results.worldRows);

      const builder = new MediaWikiBuilder();
      builder.addContents(getWorldLines(worldNodes));

      console.info("Writing world list results to file...");
      try {
        // With `recursive: true`, mkdirSync does not fail when the directory
        // already exists, so no separate existence check is needed.
        fs.mkdirSync("out/worlds", { recursive: true });
        // writeFileSync is synchronous; there is nothing to await.
        fs.writeFileSync(`out/worlds/worlds.txt`, builder.build());
        console.info("Successfully created worlds file");
      } catch (err) {
        // A failed write is logged only; the builder is still returned.
        console.error(err);
      }

      return builder;
    } catch (error) {
      console.error(error);
    }
  },
};

export default worlds;
31 changes: 31 additions & 0 deletions src/scrapers/worlds/worlds.utils.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import { MediaWikiTemplate } from "@osrs-wiki/mediawiki-builder";
import { HTMLElement } from "node-html-parser";

export const WORLD_LIST_URL = "https://oldschool.runescape.com/a=13/slu";

/**
 * Convert the rows of the world list table into `WorldLine` templates.
 *
 * Each `<tr>` directly under `worldRows` is expected to hold at least five
 * `<td>` cells: world link, player count, region, membership and activity.
 * Rows with fewer cells are skipped instead of throwing.
 *
 * @param worldRows Parsed element whose direct children are the table rows.
 * @returns One `WorldLine` template per well-formed row, in document order.
 */
export const getWorldLines = (worldRows: HTMLElement) => {
  // Keeps only element nodes with the given (upper-case) tag name; text nodes
  // such as whitespace between rows or cells are dropped.
  const isTag =
    (tagName: string) =>
    (node: unknown): node is HTMLElement =>
      node instanceof HTMLElement && node.tagName === tagName;

  return worldRows.childNodes
    .filter(isTag("TR"))
    .flatMap((row): MediaWikiTemplate[] => {
      const cells = row.childNodes.filter(isTag("TD"));
      if (cells.length < 5) {
        return [];
      }

      // Take the first run of digits from the whole cell, so the result does
      // not depend on the anchor sitting at a particular child index.
      const worldNumber = cells[0].textContent.match(/\d+/)?.[0] ?? "";
      const region = cells[2].textContent;
      const activity = cells[4].textContent;

      // Deadman worlds get their own `mems` value regardless of membership.
      let members = "no";
      if (activity.includes("Deadman")) {
        members = "deadman";
      } else if (cells[3].textContent.trim() === "Members") {
        members = "yes";
      }

      const worldLine = new MediaWikiTemplate("WorldLine", { collapsed: true });
      worldLine.add("", worldNumber);
      worldLine.add("", region);
      worldLine.add("mems", members);
      worldLine.add("", activity);
      return [worldLine];
    });
};
1 change: 1 addition & 0 deletions tsconfig.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"noImplicitAny": true,
"moduleResolution": "node",
"sourceMap": true,
"resolveJsonModule": true,
"outDir": "dist",
"baseUrl": "./",
"paths": {
Expand Down
13 changes: 9 additions & 4 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -929,10 +929,10 @@
"@nodelib/fs.scandir" "2.1.5"
fastq "^1.6.0"

"@osrs-wiki/mediawiki-builder@^1.3.1":
version "1.3.1"
resolved "https://registry.yarnpkg.com/@osrs-wiki/mediawiki-builder/-/mediawiki-builder-1.3.1.tgz#1337ba8783b3bad81cab5a4530afe2dcb1bcd026"
integrity sha512-TLEIGA2UM9C7Rxp0ol1/UyBU8xZBUC0ESdLdSIG6IbFpLeu93LnGOq/TZk1LjP00I0v944nNXA9XgAEXdDh9Yg==
"@osrs-wiki/mediawiki-builder@^1.4.0":
version "1.4.0"
resolved "https://registry.yarnpkg.com/@osrs-wiki/mediawiki-builder/-/mediawiki-builder-1.4.0.tgz#9293d1db16c4e6d4cb758bd55c2d44f5de86d000"
integrity sha512-40yguMcZRvrMykDtcFWts2j2fnf3Ib2nDPhv7yGiIiOQ6gVMsJr68EmGAQTWFbV0CEI+vsERuuQhC/0DIqmZEQ==
dependencies:
tslib "^2.6.2"

Expand Down Expand Up @@ -1814,6 +1814,11 @@ colorette@^2.0.16:
resolved "https://registry.yarnpkg.com/colorette/-/colorette-2.0.16.tgz#713b9af84fdb000139f04546bd4a93f62a5085da"
integrity sha512-hUewv7oMjCp+wkBv5Rm0v87eJhq4woh5rSR+42YSQJKecCqgIqNkZ6lAlQms/BwHPJA5NKMRlpxPRv0n8HQW6g==

commander@^12.1.0:
version "12.1.0"
resolved "https://registry.yarnpkg.com/commander/-/commander-12.1.0.tgz#01423b36f501259fdaac4d0e4d60c96c991585d3"
integrity sha512-Vw8qHK3bZM9y/P10u3Vib8o/DdkvA2OtPtZvD871QKjy74Wj1WSKFILMPRPSdUSx5RFK1arlJzEtA4PkFgnbuA==

commander@^8.3.0:
version "8.3.0"
resolved "https://registry.yarnpkg.com/commander/-/commander-8.3.0.tgz#4837ea1b2da67b9c616a67afbb0fafee567bca66"
Expand Down

0 comments on commit bc845dd

Please sign in to comment.