Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(openchallenges): update data in the local OC database #2968

Draft
wants to merge 6 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions apps/openchallenges/data-lambda/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ RUN poetry export --without-hashes --format=requirements.txt > requirements.txt
FROM public.ecr.aws/lambda/python:3.13

COPY --from=builder /app/requirements.txt ${LAMBDA_TASK_ROOT}/
COPY openchallenges_data_lambda/oc_data_sheet.py ${LAMBDA_TASK_ROOT}/
COPY openchallenges_data_lambda/app.py ${LAMBDA_TASK_ROOT}/

RUN python3.13 -m pip install --no-cache-dir -r requirements.txt -t .
Expand Down
16 changes: 16 additions & 0 deletions apps/openchallenges/data-lambda/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,22 @@ nx build openchallenges-data-lambda
nx build-image openchallenges-data-lambda
```

## Update .env with credentials to utilize Google Sheets API

Before running the Lambda function locally (see next section), update the `.env` file and replace
all "UPDATE_ME" values with real credentials.

Failing to update `.env` will result in the following output during invocation:

```console
{
"statusCode": 401,
"body": {
"message": "Private key not found in the credentials file. Please try again."
}
}
```

## Start the Lambda function locally with Docker Compose

Starts the Lambda function in the foreground, allowing you to view logs and interact with it
Expand Down
219 changes: 22 additions & 197 deletions apps/openchallenges/data-lambda/openchallenges_data_lambda/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
import json

import gspread
import numpy as np
import pandas as pd

import oc_data_sheet


GOOGLE_SHEET_CREDENTIALS_FILE = "service_account.json"
Expand Down Expand Up @@ -37,39 +37,42 @@ def lambda_handler(event, context):
try:
google_client = gspread.service_account(filename=GOOGLE_SHEET_CREDENTIALS_FILE)
except Exception as err:
status_code = 401
message = "Private key not found in the credentials file. Please try again."
else:
try:
wks = google_client.open(GOOGLE_SHEET_TITLE)

platforms = get_platform_data(wks)
platforms = oc_data_sheet.get_platform_data(wks)
print(platforms.head())

roles = get_roles(wks)
print(roles.head())
# roles = oc_data_sheet.get_roles(wks)
# print(roles.head())

categories = get_challenge_categories(wks)
print(categories.head())
# categories = oc_data_sheet.get_challenge_categories(wks)
# print(categories.head())

organizations = get_organization_data(wks)
print(organizations.head())
# organizations = oc_data_sheet.get_organization_data(wks)
# print(organizations.head())

edam_data_annotations = get_edam_annotations(wks)
print(edam_data_annotations.head())
# edam_data_annotations = oc_data_sheet.get_edam_annotations(wks)
# print(edam_data_annotations.head())

challenges, incentives, sub_types = get_challenge_data(wks)
print(challenges.head())
print(incentives.head())
print(sub_types.head())
# challenges, incentives, sub_types = oc_data_sheet.get_challenge_data(wks)
# print(challenges.head())
# print(incentives.head())
# print(sub_types.head())

status_code = 200
message = "Data successfully pulled from OC Data google sheet."

except Exception as err:
status_code = 400
message = f"Something went wrong with pulling the data: {err}."

data = {"message": message}
return {
"statusCode": 200,
"statusCode": status_code,
"body": json.dumps(data),
}

Expand All @@ -81,7 +84,9 @@ def write_credentials_file(output_json):
"type": os.getenv("TYPE"),
"project_id": os.getenv("PROJECT_ID"),
"private_key_id": os.getenv("PRIVATE_KEY_ID"),
"private_key": os.getenv("PRIVATE_KEY").encode().decode("unicode_escape"),
"private_key": os.getenv("PRIVATE_KEY", "")
.encode()
.decode("unicode_escape"),
"client_email": os.getenv("CLIENT_EMAIL"),
"client_id": os.getenv("CLIENT_ID"),
"auth_uri": os.getenv("AUTH_URI"),
Expand All @@ -93,185 +98,5 @@ def write_credentials_file(output_json):
out.write(json.dumps(credentials))


def get_challenge_data(wks, sheet_name="challenges"):
    """Get challenges data and clean up as needed.

    Args:
        wks: gspread spreadsheet handle; ``wks.worksheet(sheet_name)`` must
            return a worksheet supporting ``get_all_records()``.
        sheet_name: worksheet holding one row per challenge.

    Output (3-tuple of DataFrames):
        - challenges: one row per challenge, text normalized/truncated
        - challenge incentives: long format (incentives, challenge_id, created_at)
        - challenge submission types: long format (submission_types, challenge_id, created_at)
    """
    df = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
    # "\\N" is presumably the NULL marker for a downstream SQL bulk load —
    # TODO(review): confirm against whatever consumes these frames.
    # NOTE(review): the mask reads helper column `_platform` but writes
    # `platform` — looks like a hidden-vs-public column pair; confirm.
    df.loc[df._platform == "Other", "platform"] = "\\N"

    # Public challenge columns, in the order the DB expects.
    challenges = df[
        [
            "id",
            "slug",
            "name",
            "headline",
            "description",
            "avatar_url",
            "website_url",
            "status",
            "platform",
            "doi",
            "start_date",
            "end_date",
            "operation_id",
            "created_at",
            "updated_at",
        ]
    ]
    # Normalize whitespace and problem characters across all text cells.
    challenges = (
        challenges.replace({r"\s+$": "", r"^\s+": ""}, regex=True)
        .replace(r"\n", " ", regex=True)
        # NOTE(review): this non-regex replace only matches cells that are
        # exactly "'" (whole-cell match), unlike the substring replaces
        # around it — confirm whether substring quote-escaping was intended.
        .replace("'", "''")
        .replace("\u2019", "''", regex=True)  # replace curly right-quote
        .replace("\u202f", " ", regex=True)  # replace narrow no-break space
        .replace("\u2060", "", regex=True)  # remove word joiner
    )
    # Truncate long text: headlines over 80 chars keep 76 chars + "...",
    # descriptions over 1000 chars keep 995 chars + "...".
    challenges["headline"] = (
        challenges["headline"]
        .astype(str)
        .apply(lambda x: x[:76] + "..." if len(x) > 80 else x)
    )
    challenges["description"] = (
        challenges["description"]
        .astype(str)
        .apply(lambda x: x[:995] + "..." if len(x) > 1000 else x)
    )
    # Empty dates/ids become the "\\N" NULL marker (see note above).
    challenges.loc[challenges.start_date == "", "start_date"] = "\\N"
    challenges.loc[challenges.end_date == "", "end_date"] = "\\N"
    challenges.loc[challenges.operation_id == "", "operation_id"] = "\\N"

    # Explode per-challenge boolean incentive flags into long format,
    # one row per (challenge, incentive).
    incentives = pd.concat(
        [
            df[df.monetary_incentive == "TRUE"][["id", "created_at"]].assign(
                incentives="monetary"
            ),
            df[df.publication_incentive == "TRUE"][["id", "created_at"]].assign(
                incentives="publication"
            ),
            df[df.speaking_incentive == "TRUE"][["id", "created_at"]].assign(
                incentives="speaking_engagement"
            ),
            df[df.other_incentive == "TRUE"][["id", "created_at"]].assign(
                incentives="other"
            ),
        ]
    ).rename(columns={"id": "challenge_id"})
    # Categorical fixes the sort order of the incentive labels below.
    incentives["incentives"] = pd.Categorical(
        incentives["incentives"],
        categories=["monetary", "publication", "speaking_engagement", "other"],
    )
    incentives = incentives.sort_values(["challenge_id", "incentives"])
    # 1-based index — presumably matches the DB's auto-increment id; confirm.
    incentives.index = np.arange(1, len(incentives) + 1)

    # Same long-format explosion for submission-type flags.
    sub_types = pd.concat(
        [
            df[df.file_submission == "TRUE"][["id", "created_at"]].assign(
                submission_types="prediction_file"
            ),
            df[df.container_submission == "TRUE"][["id", "created_at"]].assign(
                submission_types="container_image"
            ),
            df[df.notebook_submission == "TRUE"][["id", "created_at"]].assign(
                submission_types="notebook"
            ),
            df[df.mlcube_submission == "TRUE"][["id", "created_at"]].assign(
                submission_types="mlcube"
            ),
            df[df.other_submission == "TRUE"][["id", "created_at"]].assign(
                submission_types="other"
            ),
        ]
    ).rename(columns={"id": "challenge_id"})
    sub_types["submission_types"] = pd.Categorical(
        sub_types["submission_types"],
        categories=[
            "prediction_file",
            "container_image",
            "notebook",
            "mlcube",
            "other",
        ],
    )
    sub_types = sub_types.sort_values(["challenge_id", "submission_types"])
    sub_types.index = np.arange(1, len(sub_types) + 1)

    return (
        challenges,
        incentives[["incentives", "challenge_id", "created_at"]],
        sub_types[["submission_types", "challenge_id", "created_at"]],
    )


def get_challenge_categories(wks, sheet_name="challenge_category"):
    """Return the challenge-category mapping as a DataFrame.

    Only the three columns the database needs are kept; blank cells
    replace any missing values.
    """
    records = wks.worksheet(sheet_name).get_all_records()
    categories = pd.DataFrame(records).fillna("")
    return categories[["id", "challenge_id", "category"]]


def get_platform_data(wks, sheet_name="platforms"):
    """Return the public platforms with only the columns the DB needs."""
    wanted = [
        "id",
        "slug",
        "name",
        "avatar_url",
        "website_url",
        "created_at",
        "updated_at",
    ]
    records = wks.worksheet(sheet_name).get_all_records()
    df = pd.DataFrame(records).fillna("")
    # Rows flagged non-public in the sheet are excluded from the DB load.
    is_public = df._public == "TRUE"
    return df[is_public][wanted]


def get_organization_data(wks, sheet_name="organizations"):
    """Return the public organizations, text-normalized, for the DB load."""
    wanted = [
        "id",
        "name",
        "login",
        "avatar_url",
        "website_url",
        "description",
        "challenge_count",
        "created_at",
        "updated_at",
        "acronym",
    ]
    records = wks.worksheet(sheet_name).get_all_records()
    orgs = pd.DataFrame(records).fillna("")
    # Keep public rows only, in the column order the DB expects.
    orgs = orgs[orgs._public == "TRUE"][wanted]
    # Normalize whitespace and problem characters across all text cells.
    orgs = (
        orgs.replace({r"\s+$": "", r"^\s+": ""}, regex=True)
        .replace(r"\n", " ", regex=True)
        .replace("'", "''")
        .replace("\u2019", "''", regex=True)  # replace curly right-quote
        .replace("\u202f", " ", regex=True)  # replace narrow no-break space
        .replace("\u2060", "", regex=True)  # remove word joiner
    )
    # Descriptions over 1000 chars are cut to 995 chars plus an ellipsis.
    as_text = orgs["description"].astype(str)
    orgs["description"] = as_text.apply(
        lambda text: text[:995] + "..." if len(text) > 1000 else text
    )
    return orgs


def get_roles(wks, sheet_name="contribution_role"):
    """Get data on organization's role(s) in challenges."""
    roles = pd.DataFrame(wks.worksheet(sheet_name).get_all_records()).fillna("")
    # The sheet's human-readable helper columns are not loaded into the DB.
    return roles.drop(columns=["_challenge", "_organization"])


def get_edam_annotations(wks, sheet_name="challenge_data"):
    """Get data on challenge's EDAM annotations."""
    annotations = pd.DataFrame(wks.worksheet(sheet_name).get_all_records())
    # Blank out missing cells, then drop the sheet's helper columns.
    annotations = annotations.fillna("")
    return annotations.drop(columns=["_challenge", "_edam_name"])


# Manual entry point: invoke the handler once with an empty event and a
# placeholder context when the module is run directly (outside AWS Lambda).
if __name__ == "__main__":
    lambda_handler({}, "")
Loading
Loading