Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(openchallenges): update data in the local OC database #2968

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apps/openchallenges/data-lambda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ RUN poetry export --without-hashes --format=requirements.txt > requirements.txt
FROM public.ecr.aws/lambda/python:3.13

COPY --from=builder /app/requirements.txt ${LAMBDA_TASK_ROOT}/
COPY openchallenges_data_lambda/oc_data_sheet.py ${LAMBDA_TASK_ROOT}/
COPY openchallenges_data_lambda/app.py ${LAMBDA_TASK_ROOT}/

RUN python3.13 -m pip install --no-cache-dir -r requirements.txt -t .
Expand Down
16 changes: 16 additions & 0 deletions apps/openchallenges/data-lambda/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,22 @@ nx build openchallenges-data-lambda
nx build-image openchallenges-data-lambda
```

## Update .env with credentials to utilize Google Sheets API

Before running the Lambda function locally (see next section), update the `.env` file and replace
all "UPDATE_ME" values with real credentials.

Failing to update `.env` will result in the following output during invocation:

```console
{
"statusCode": 401,
"body": {
"message": "Private key not found in the credentials file. Please try again."
}
}
```

## Start the Lambda function locally with Docker Compose

Starts the Lambda function in the foreground, allowing you to view logs and interact with it
Expand Down
219 changes: 22 additions & 197 deletions apps/openchallenges/data-lambda/openchallenges_data_lambda/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import json

import gspread
import numpy as np
import pandas as pd

import oc_data_sheet


GOOGLE_SHEET_CREDENTIALS_FILE = "service_account.json"
Expand Down Expand Up @@ -37,39 +37,42 @@ def lambda_handler(event, context):
try:
google_client = gspread.service_account(filename=GOOGLE_SHEET_CREDENTIALS_FILE)
except Exception as err:
status_code = 401
message = "Private key not found in the credentials file. Please try again."
else:
try:
wks = google_client.open(GOOGLE_SHEET_TITLE)

platforms = get_platform_data(wks)
platforms = oc_data_sheet.get_platform_data(wks)
print(platforms.head())

roles = get_roles(wks)
print(roles.head())
# roles = oc_data_sheet.get_roles(wks)
# print(roles.head())

categories = get_challenge_categories(wks)
print(categories.head())
# categories = oc_data_sheet.get_challenge_categories(wks)
# print(categories.head())

organizations = get_organization_data(wks)
print(organizations.head())
# organizations = oc_data_sheet.get_organization_data(wks)
# print(organizations.head())

edam_data_annotations = get_edam_annotations(wks)
print(edam_data_annotations.head())
# edam_data_annotations = oc_data_sheet.get_edam_annotations(wks)
# print(edam_data_annotations.head())

challenges, incentives, sub_types = get_challenge_data(wks)
print(challenges.head())
print(incentives.head())
print(sub_types.head())
# challenges, incentives, sub_types = oc_data_sheet.get_challenge_data(wks)
# print(challenges.head())
# print(incentives.head())
# print(sub_types.head())

status_code = 200
message = "Data successfully pulled from OC Data google sheet."

except Exception as err:
status_code = 400
message = f"Something went wrong with pulling the data: {err}."

data = {"message": message}
return {
"statusCode": 200,
"statusCode": status_code,
"body": json.dumps(data),
}

Expand All @@ -81,7 +84,9 @@ def write_credentials_file(output_json):
"type": os.getenv("TYPE"),
"project_id": os.getenv("PROJECT_ID"),
"private_key_id": os.getenv("PRIVATE_KEY_ID"),
"private_key": os.getenv("PRIVATE_KEY").encode().decode("unicode_escape"),
"private_key": os.getenv("PRIVATE_KEY", "")
.encode()
.decode("unicode_escape"),
"client_email": os.getenv("CLIENT_EMAIL"),
"client_id": os.getenv("CLIENT_ID"),
"auth_uri": os.getenv("AUTH_URI"),
Expand All @@ -93,185 +98,5 @@ def write_credentials_file(output_json):
out.write(json.dumps(credentials))


def get_challenge_data(wks, sheet_name="challenges"):
    """Get challenges data and clean up as needed.

    Args:
        wks: gspread spreadsheet handle; ``wks.worksheet(sheet_name)`` must
            return a worksheet supporting ``get_all_records()``.
        sheet_name: worksheet holding one row per challenge.

    Output (3-tuple of DataFrames):
        - challenges: one row per challenge, text normalized/truncated
        - challenge incentives: long format (incentives, challenge_id, created_at)
        - challenge submission types: long format (submission_types, challenge_id, created_at)
    """
    df = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
    # "\\N" is presumably the NULL marker for a downstream SQL bulk load —
    # TODO(review): confirm against whatever consumes these frames.
    # NOTE(review): the mask reads helper column `_platform` but writes
    # `platform` — looks like a hidden-vs-public column pair; confirm.
    df.loc[df._platform == "Other", "platform"] = "\\N"

    # Public challenge columns, in the order the DB expects.
    challenges = df[
        [
            "id",
            "slug",
            "name",
            "headline",
            "description",
            "avatar_url",
            "website_url",
            "status",
            "platform",
            "doi",
            "start_date",
            "end_date",
            "operation_id",
            "created_at",
            "updated_at",
        ]
    ]
    # Normalize whitespace and problem characters across all text cells.
    challenges = (
        challenges.replace({r"\s+$": "", r"^\s+": ""}, regex=True)
        .replace(r"\n", " ", regex=True)
        # NOTE(review): this non-regex replace only matches cells that are
        # exactly "'" (whole-cell match), unlike the substring replaces
        # around it — confirm whether substring quote-escaping was intended.
        .replace("'", "''")
        .replace("\u2019", "''", regex=True)  # replace curly right-quote
        .replace("\u202f", " ", regex=True)  # replace narrow no-break space
        .replace("\u2060", "", regex=True)  # remove word joiner
    )
    # Truncate long text: headlines over 80 chars keep 76 chars + "...",
    # descriptions over 1000 chars keep 995 chars + "...".
    challenges["headline"] = (
        challenges["headline"]
        .astype(str)
        .apply(lambda x: x[:76] + "..." if len(x) > 80 else x)
    )
    challenges["description"] = (
        challenges["description"]
        .astype(str)
        .apply(lambda x: x[:995] + "..." if len(x) > 1000 else x)
    )
    # Empty dates/ids become the "\\N" NULL marker (see note above).
    challenges.loc[challenges.start_date == "", "start_date"] = "\\N"
    challenges.loc[challenges.end_date == "", "end_date"] = "\\N"
    challenges.loc[challenges.operation_id == "", "operation_id"] = "\\N"

    # Explode per-challenge boolean incentive flags into long format,
    # one row per (challenge, incentive).
    incentives = pd.concat(
        [
            df[df.monetary_incentive == "TRUE"][["id", "created_at"]].assign(
                incentives="monetary"
            ),
            df[df.publication_incentive == "TRUE"][["id", "created_at"]].assign(
                incentives="publication"
            ),
            df[df.speaking_incentive == "TRUE"][["id", "created_at"]].assign(
                incentives="speaking_engagement"
            ),
            df[df.other_incentive == "TRUE"][["id", "created_at"]].assign(
                incentives="other"
            ),
        ]
    ).rename(columns={"id": "challenge_id"})
    # Categorical fixes the sort order of the incentive labels below.
    incentives["incentives"] = pd.Categorical(
        incentives["incentives"],
        categories=["monetary", "publication", "speaking_engagement", "other"],
    )
    incentives = incentives.sort_values(["challenge_id", "incentives"])
    # 1-based index — presumably matches the DB's auto-increment id; confirm.
    incentives.index = np.arange(1, len(incentives) + 1)

    # Same long-format explosion for submission-type flags.
    sub_types = pd.concat(
        [
            df[df.file_submission == "TRUE"][["id", "created_at"]].assign(
                submission_types="prediction_file"
            ),
            df[df.container_submission == "TRUE"][["id", "created_at"]].assign(
                submission_types="container_image"
            ),
            df[df.notebook_submission == "TRUE"][["id", "created_at"]].assign(
                submission_types="notebook"
            ),
            df[df.mlcube_submission == "TRUE"][["id", "created_at"]].assign(
                submission_types="mlcube"
            ),
            df[df.other_submission == "TRUE"][["id", "created_at"]].assign(
                submission_types="other"
            ),
        ]
    ).rename(columns={"id": "challenge_id"})
    sub_types["submission_types"] = pd.Categorical(
        sub_types["submission_types"],
        categories=[
            "prediction_file",
            "container_image",
            "notebook",
            "mlcube",
            "other",
        ],
    )
    sub_types = sub_types.sort_values(["challenge_id", "submission_types"])
    sub_types.index = np.arange(1, len(sub_types) + 1)

    return (
        challenges,
        incentives[["incentives", "challenge_id", "created_at"]],
        sub_types[["submission_types", "challenge_id", "created_at"]],
    )


def get_challenge_categories(wks, sheet_name="challenge_category"):
    """Return the challenge-category mapping as a DataFrame.

    Only the three columns the database needs are kept; blank cells
    replace any missing values.
    """
    records = wks.worksheet(sheet_name).get_all_records()
    categories = pd.DataFrame(records).fillna("")
    return categories[["id", "challenge_id", "category"]]


def get_platform_data(wks, sheet_name="platforms"):
    """Return the public platforms with only the columns the DB needs."""
    wanted = [
        "id",
        "slug",
        "name",
        "avatar_url",
        "website_url",
        "created_at",
        "updated_at",
    ]
    records = wks.worksheet(sheet_name).get_all_records()
    df = pd.DataFrame(records).fillna("")
    # Rows flagged non-public in the sheet are excluded from the DB load.
    is_public = df._public == "TRUE"
    return df[is_public][wanted]


def get_organization_data(wks, sheet_name="organizations"):
    """Return the public organizations, text-normalized, for the DB load."""
    wanted = [
        "id",
        "name",
        "login",
        "avatar_url",
        "website_url",
        "description",
        "challenge_count",
        "created_at",
        "updated_at",
        "acronym",
    ]
    records = wks.worksheet(sheet_name).get_all_records()
    orgs = pd.DataFrame(records).fillna("")
    # Keep public rows only, in the column order the DB expects.
    orgs = orgs[orgs._public == "TRUE"][wanted]
    # Normalize whitespace and problem characters across all text cells.
    orgs = (
        orgs.replace({r"\s+$": "", r"^\s+": ""}, regex=True)
        .replace(r"\n", " ", regex=True)
        .replace("'", "''")
        .replace("\u2019", "''", regex=True)  # replace curly right-quote
        .replace("\u202f", " ", regex=True)  # replace narrow no-break space
        .replace("\u2060", "", regex=True)  # remove word joiner
    )
    # Descriptions over 1000 chars are cut to 995 chars plus an ellipsis.
    as_text = orgs["description"].astype(str)
    orgs["description"] = as_text.apply(
        lambda text: text[:995] + "..." if len(text) > 1000 else text
    )
    return orgs


def get_roles(wks, sheet_name="contribution_role"):
    """Get data on organization's role(s) in challenges."""
    roles = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
    # The sheet's human-readable helper columns are not loaded into the DB.
    return roles.drop(columns=["_challenge", "_organization"])


def get_edam_annotations(wks, sheet_name="challenge_data"):
    """Get data on challenge's EDAM annotations."""
    annotations = pd.DataFrame(wks.worksheet(sheet_name).get_all_records())
    # Blank out missing cells, then drop the sheet's helper columns.
    annotations = annotations.fillna("")
    return annotations.drop(columns=["_challenge", "_edam_name"])


# Manual entry point: invoke the handler once with an empty event and a
# placeholder context when the module is run directly (outside AWS Lambda).
if __name__ == "__main__":
    lambda_handler({}, "")
Loading
Loading