feat: adds scraper script #59

Merged · 7 commits · Dec 20, 2024
9 changes: 8 additions & 1 deletion .github/knip.ts
@@ -3,7 +3,14 @@ import type { KnipConfig } from "knip";
const config: KnipConfig = {
entry: ["src/main.ts"],
project: ["src/**/*.ts"],
ignore: ["src/types/config.ts", "**/__mocks__/**", "**/__fixtures__/**", "src/types/database.ts"],
ignore: [
"src/types/config.ts",
"**/__mocks__/**",
"**/__fixtures__/**",
"src/types/database.ts",
"src/handlers/user-issue-scraper.ts",
"src/handlers/issue-scraper.ts",
],
ignoreExportsUsedInFile: true,
// eslint can also be safely ignored as per the docs: https://knip.dev/guides/handling-issues#eslint--jest
ignoreDependencies: ["eslint-config-prettier", "eslint-plugin-prettier", "ts-node"],
1 change: 1 addition & 0 deletions .gitignore
@@ -18,3 +18,4 @@ cypress/screenshots
script.ts
.wrangler
test-dashboard.md
auth.users.json
2 changes: 1 addition & 1 deletion .husky/commit-msg
@@ -1,4 +1,4 @@
#!/usr/bin/env sh
. "$(dirname -- "$0")/_/husky.sh"

yarn commitlint --edit "$1"
bun commitlint --edit "$1"
2 changes: 1 addition & 1 deletion .husky/pre-commit
@@ -1,4 +1,4 @@
#!/usr/bin/env sh
. "$(dirname -- "$0")/_/husky.sh"

yarn lint-staged
bun lint-staged
6 changes: 3 additions & 3 deletions README.md
@@ -24,8 +24,8 @@ This is a plugin for [Ubiquibot](https://github.com/ubiquity-os/ubiquity-os-kern

## Testing Locally

- Run `yarn install` to install the dependencies.
- Run `yarn worker` to start the server.
- Run `bun install` to install the dependencies.
- Run `bun worker` to start the server.
- Make HTTP requests to the server to test the plugin with content type `Application/JSON`

```
@@ -62,4 +62,4 @@ This is a plugin for [Ubiquibot](https://github.com/ubiquity-os/ubiquity-os-kern

## Testing

- Run `yarn test` to run the tests.
- Run `bun test` to run the tests.
2 changes: 1 addition & 1 deletion package.json
@@ -76,7 +76,7 @@
},
"lint-staged": {
"*.ts": [
"yarn prettier --write",
"bun prettier --write",
"eslint --fix"
],
"src/**.{ts,json}": [
304 changes: 304 additions & 0 deletions src/handlers/issue-scraper.ts
@@ -0,0 +1,304 @@
import { createClient } from "@supabase/supabase-js";
import { VoyageAIClient } from "voyageai";
import { customOctokit as Octokit } from "@ubiquity-os/plugin-sdk/octokit";
import markdownit from "markdown-it";
import plainTextPlugin from "markdown-it-plain-text";
import "dotenv/config";
import { createAdapters } from "../adapters";
import { Context } from "../types/context";

interface MarkdownItWithPlainText extends markdownit {
plainText: string;
}

function markdownToPlainText(markdown: string | null): string | null {
if (!markdown) return markdown;
const md = markdownit() as MarkdownItWithPlainText;
md.use(plainTextPlugin);
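// Rendering populates md.plainText as a side effect of the plain-text plugin.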
md.render(markdown);
return md.plainText;
}

interface IssueMetadata {
nodeId: string;
number: number;
title: string;
body: string;
state: string;
repositoryName: string;
repositoryId: number;
assignees: string[];
authorId: number;
createdAt: string;
closedAt: string | null;
stateReason: string | null;
updatedAt: string;
}

interface IssueNode {
id: string;
number: number;
title: string;
body: string;
state: string;
stateReason: string | null;
createdAt: string;
updatedAt: string;
closedAt: string | null;
author: {
login: string;
} | null;
assignees: {
nodes: Array<{
login: string;
}>;
};
repository: {
id: string;
name: string;
owner: {
login: string;
};
};
}

interface GraphQlSearchResponse {
search: {
pageInfo: {
hasNextPage: boolean;
endCursor: string | null;
};
nodes: Array<IssueNode>;
};
}

const SEARCH_ISSUES_QUERY = `
query SearchIssues($searchText: String!, $after: String) {
search(
query: $searchText,
type: ISSUE,
first: 100,
after: $after
) {
pageInfo {
hasNextPage
endCursor
}
nodes {
... on Issue {
id
number
title
body
state
stateReason
createdAt
updatedAt
closedAt
author {
login
}
assignees(first: 10) {
nodes {
login
}
}
repository {
id
name
owner {
login
}
}
}
}
}
}
`;

async function fetchAuthorId(octokit: InstanceType<typeof Octokit>, login: string): Promise<number> {
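// Resolve a GitHub login to its numeric user id via the REST API; -1 signals a failed lookup.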
try {
const response = await octokit.rest.users.getByUsername({ username: login });
return response.data.id;
} catch (error) {
console.error(`Error fetching author ID for ${login}:`, error);
return -1;
}
}

async function fetchUserIssues(octokit: InstanceType<typeof Octokit>, username: string): Promise<IssueNode[]> {
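// Page through the GraphQL search API (100 issues per page) and keep only issues whose stateReason is COMPLETED.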
const allIssues: IssueNode[] = [];
let hasNextPage = true;
let cursor: string | null = null;

const searchText = `assignee:${username} is:issue is:closed`;

while (hasNextPage) {
const variables: { searchText: string; after?: string } = {
searchText,
};
if (cursor) {
variables.after = cursor;
}

const response: GraphQlSearchResponse = await octokit.graphql<GraphQlSearchResponse>(SEARCH_ISSUES_QUERY, variables);

const completedIssues = response.search.nodes.filter((issue) => issue.stateReason === "COMPLETED");
allIssues.push(...completedIssues);

hasNextPage = response.search.pageInfo.hasNextPage;
cursor = response.search.pageInfo.endCursor;

if (!cursor) break;
}

return allIssues;
}

// Pulls issues from GitHub and stores them in Supabase
export async function issueScraper(username: string, token?: string): Promise<string> {
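// Returns a pretty-printed JSON report: counts of stored vs. failed issues, plus per-issue errors.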
try {
if (!username) {
throw new Error("Username is required");
}

const required = ["GITHUB_TOKEN", "SUPABASE_URL", "SUPABASE_KEY", "VOYAGEAI_API_KEY"];
const missing = required.filter((key) => !process.env[key]);
if (missing.length > 0) {
throw new Error(`Missing required environment variables: ${missing.join(", ")}`);
}

const context = {
adapters: {},
logger: {
info: (message: string, data: Record<string, unknown>) => console.log("INFO:", message + ":", data),
error: (message: string, data: Record<string, unknown>) => console.error("ERROR:", message + ":", data),
},
octokit: new Octokit({ auth: token || process.env.GITHUB_TOKEN }),
} as unknown as Context;

const supabaseUrl = process.env.SUPABASE_URL;
const supabaseKey = process.env.SUPABASE_KEY;
const voyageApiKey = process.env.VOYAGEAI_API_KEY;

if (!supabaseUrl || !supabaseKey || !voyageApiKey) {
throw new Error("Required environment variables are missing");
}

const supabase = createClient(supabaseUrl, supabaseKey);
const voyageClient = new VoyageAIClient({ apiKey: voyageApiKey });
const adapters = createAdapters(supabase, voyageClient, context);

const issues = await fetchUserIssues(context.octokit, username);
const processedIssues: Array<{ issue: IssueMetadata; error?: string }> = [];

for (const issue of issues) {
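// Resolve the author id, build the issue metadata, embed title + body, and upsert the row into Supabase.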
try {
const authorId = issue.author?.login ? await fetchAuthorId(context.octokit, issue.author.login) : -1;
const repoOwner = issue.repository.owner.login;

const metadata: IssueMetadata = {
nodeId: issue.id,
number: issue.number,
title: issue.title || "",
body: issue.body || "",
state: issue.state,
stateReason: issue.stateReason,
repositoryName: issue.repository.name,
repositoryId: parseInt(issue.repository.id),
assignees: (issue.assignees?.nodes || []).map((assignee) => assignee.login),
authorId,
createdAt: issue.createdAt,
closedAt: issue.closedAt,
updatedAt: issue.updatedAt,
};

const markdown = metadata.body + " " + metadata.title;
const plaintext = markdownToPlainText(markdown);
const embedding = await adapters.voyage.embedding.createEmbedding(plaintext);

const payload = {
issue: metadata,
action: "created",
sender: {
login: username,
},
repository: {
id: parseInt(issue.repository.id),
node_id: issue.repository.id,
name: issue.repository.name,
full_name: `${repoOwner}/${issue.repository.name}`,
owner: {
login: repoOwner,
id: authorId,
type: "User",
site_admin: false,
},
},
};

const { error } = await supabase.from("issues").upsert({
id: metadata.nodeId,
markdown,
plaintext,
embedding: JSON.stringify(embedding),
author_id: metadata.authorId,
modified_at: metadata.updatedAt,
payload: payload,
});

processedIssues.push({
issue: metadata,
error: error ? `Error storing issue: ${error.message}` : undefined,
});
} catch (error) {
processedIssues.push({
issue: {
nodeId: issue.id,
number: issue.number,
title: issue.title || "",
body: issue.body || "",
state: issue.state,
stateReason: issue.stateReason,
repositoryName: issue.repository.name,
repositoryId: parseInt(issue.repository.id),
assignees: [],
authorId: -1,
createdAt: issue.createdAt,
closedAt: issue.closedAt,
updatedAt: issue.updatedAt,
},
error: `Error processing issue: ${error instanceof Error ? error.message : "Unknown error"}`,
});
}
}

return JSON.stringify(
{
success: true,
stats: {
storageSuccessful: processedIssues.filter((p) => !p.error).length,
storageFailed: processedIssues.filter((p) => p.error).length,
},
errors: processedIssues
.filter((p) => p.error)
.map((p) => ({
type: "storage",
name: `${p.issue.repositoryName}#${p.issue.number}`,
error: p.error,
})),
issues: processedIssues.map((p) => ({
number: p.issue.number,
title: p.issue.title,
repo: p.issue.repositoryName,
error: p.error,
})),
},
null,
2
);
} catch (error) {
console.error("Error in issueScraper:", error);
throw error;
}
}
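
For context, here is a minimal sketch of how the exported `issueScraper` could be driven from a standalone script. The file name and CLI handling are hypothetical (not part of this diff); it assumes the same environment variables the handler itself checks for (GITHUB_TOKEN, SUPABASE_URL, SUPABASE_KEY, VOYAGEAI_API_KEY).

```typescript
// scrape.ts: hypothetical runner, not part of this PR's diff.
// Assumes GITHUB_TOKEN, SUPABASE_URL, SUPABASE_KEY, and VOYAGEAI_API_KEY are set.
import { issueScraper } from "./src/handlers/issue-scraper";

const username = process.argv[2];
if (!username) {
  console.error("Usage: bun run scrape.ts <github-username>");
  process.exit(1);
}

issueScraper(username)
  .then((report) => console.log(report)) // pretty-printed JSON report from the handler
  .catch((err) => {
    console.error("Scrape failed:", err);
    process.exit(1);
  });
```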