feat: refactor and add tests
nichmor committed May 2, 2024
1 parent c7a2256 commit df12997
Showing 11 changed files with 1,254 additions and 217 deletions.
27 changes: 27 additions & 0 deletions .github/workflows/run_tests.yml
@@ -0,0 +1,27 @@
name: Run tests for parselmouth

on:
push:
workflow_dispatch:
pull_request:

jobs:
run_tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.9'

- uses: prefix-dev/[email protected]
with:
pixi-version: "latest"
manifest-path: pyproject.toml

- name: run tests
run: |
pixi run run_tests
1,120 changes: 1,058 additions & 62 deletions pixi.lock

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion pyproject.toml
@@ -10,7 +10,7 @@ authors = [
]
description = "Mapper of conda to pypi"
readme = "README.md"
requires-python = "==3.9"
requires-python = ">=3.9"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
@@ -37,3 +37,13 @@ requests = ">=2.31.0,<2.32"
boto3 = ">=1.34,<1.35"
python-dotenv = ">=1.0.1,<1.1"
packaging = "*"

[tool.pixi.feature.test.dependencies]
pytest = "*"

[tool.pixi.feature.test.tasks]
run_tests = "pytest tests"


[tool.pixi.environments]
test = {features = ["test"], solve-group = "default"}
40 changes: 40 additions & 0 deletions src/parselmouth/conda_forge.py
@@ -0,0 +1,40 @@
import requests
import logging
from conda_forge_metadata.artifact_info.info_json import get_artifact_info_as_json


def get_all_archs_available() -> list[str]:
response = requests.get("https://conda.anaconda.org/conda-forge/channeldata.json")
channel_json = response.json()
# Collect all subdirectories
subdirs: list[str] = []
for package in channel_json["packages"].values():
subdirs.extend(package.get("subdirs", []))

return list(set(subdirs))


def get_subdir_repodata(subdir: str) -> dict:
url = f"https://conda.anaconda.org/conda-forge/{subdir}/repodata.json"
response = requests.get(url)
if not response.ok:
logging.error(f"Requst for repodata to {url} failed. {response.reason}")

response.raise_for_status()

return response.json()


def get_all_packages_by_subdir(subdir: str) -> dict[str, dict]:
repodatas: dict[str, dict] = {}

repodata = get_subdir_repodata(subdir)

repodatas.update(repodata["packages"])
repodatas.update(repodata["packages.conda"])

return repodatas


def get_artifact_info(subdir, artifact, backend, channel="conda-forge"):
return get_artifact_info_as_json(channel, subdir, artifact, backend)
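
The new conda_forge module centralizes the channel queries that updater.py and updater_producer.py previously duplicated. A minimal usage sketch of its public helpers (the "noarch" subdir is just an illustrative value):

import logging

from parselmouth.conda_forge import (
    get_all_archs_available,
    get_all_packages_by_subdir,
)

logging.basicConfig(level=logging.INFO)

# Every subdir (platform) published on the conda-forge channel.
subdirs = get_all_archs_available()

# Merged "packages" and "packages.conda" records for one subdir.
noarch_packages = get_all_packages_by_subdir("noarch")

print(f"{len(subdirs)} subdirs, {len(noarch_packages)} noarch artifacts")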
82 changes: 13 additions & 69 deletions src/parselmouth/updater.py
@@ -1,17 +1,18 @@
import io
import json
import sys
import os
import re
from typing import Optional
import requests
from conda_forge_metadata.artifact_info.info_json import get_artifact_info_as_json
from conda_forge_metadata.types import ArtifactData
import concurrent.futures
import logging
import boto3
from dotenv import load_dotenv
from packaging.version import parse
from parselmouth.conda_forge import (
get_all_packages_by_subdir,
get_artifact_info,
)
from parselmouth.s3 import s3_client
from parselmouth.utils import normalize


names_mapping: dict[str, dict] = {}
@@ -23,20 +24,6 @@
egg_pattern_compiled = re.compile(egg_info_pattern)


load_dotenv()

account_id = os.getenv("R2_PREFIX_ACCOUNT_ID", "")
access_key_id = os.getenv("R2_PREFIX_ACCESS_KEY_ID", "")
access_key_secret = os.getenv("R2_PREFIX_SECRET_ACCESS_KEY", "")
bucket_name = os.getenv("R2_PREFIX_BUCKET", "conda")


def normalize(name: Optional[str]) -> Optional[str]:
if not name:
return None
return re.sub(r"[-_.]+", "-", name).lower()


def check_if_is_direct_url(package_name: str, url: Optional[str]) -> bool:
if not url:
return False
@@ -58,26 +45,6 @@ def check_if_is_direct_url(package_name: str, url: Optional[str]) -> bool:
return True


def get_all_archs_available() -> set[str]:
response = requests.get("https://conda.anaconda.org/conda-forge/channeldata.json")
channel_json = response.json()
# Collect all subdirectories
subdirs: list[str] = []
for package in channel_json["packages"].values():
subdirs.extend(package.get("subdirs", []))

return list(set(subdirs))


def get_subdir_repodata(subdir: str) -> dict:
url = f"https://conda.anaconda.org/conda-forge/{subdir}/repodata.json"
response = requests.get(url)
if response.ok:
return response.json()

logging.error(f"Requst for repodata to {url} failed. {response.reason}")


def get_pypi_names_and_version(files: list[str]) -> dict[str, str]:
"""
Return a dictionary of normalized names to their versions
@@ -89,6 +56,9 @@ def get_pypi_names_and_version(files: list[str]) -> dict[str, str]:
)
if match:
package_name = match.group(1)
if not package_name:
continue

version = match.group(2)
if "-py" in version:
index_of_py = version.index("-py")
@@ -111,38 +81,15 @@ def get_pypi_names_and_version(files: list[str]) -> dict[str, str]:
return package_names


def upload(file_name: str, bucket_name: str, file_body: dict, s3_client):
output = json.dumps(file_body)
output_as_file = io.BytesIO(output.encode("utf-8"))

s3_client.upload_fileobj(output_as_file, bucket_name, f"hash-v0/{file_name}")


if __name__ == "__main__":
subdir, letter = sys.argv[1].split("@")

all_packages: list[tuple[str, str]] = []

s3_client = boto3.client(
service_name="s3",
endpoint_url=f"https://{account_id}.r2.cloudflarestorage.com",
aws_access_key_id=f"{access_key_id}",
aws_secret_access_key=f"{access_key_secret}",
region_name="eeur", # Must be one of: wnam, enam, weur, eeur, apac, auto
)

# index_obj_key = "hash-v0/index.json"
# response = s3_client.get_object(Bucket=bucket_name, Key=index_obj_key)

with open("output_index/index.json") as index_file:
existing_mapping_data: dict = json.load(index_file)

repodatas: dict[str, dict] = {}

repodata = get_subdir_repodata(subdir)

repodatas.update(repodata["packages"])
repodatas.update(repodata["packages.conda"])
repodatas = get_all_packages_by_subdir(subdir)

for idx, package_name in enumerate(repodatas):
if not package_name.startswith(letter):
Expand All @@ -162,8 +109,7 @@ def upload(file_name: str, bucket_name: str, file_body: dict, s3_client):
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {
executor.submit(
get_artifact_info_as_json,
channel="conda-forge",
get_artifact_info,
subdir=subdir,
artifact=package_name,
backend=backend_type,
@@ -229,11 +175,9 @@ def upload(file_name: str, bucket_name: str, file_body: dict, s3_client):
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {
executor.submit(
upload,
file_name=package_hash,
s3_client.upload_mapping,
file_body=pkg_body,
bucket_name=bucket_name,
s3_client=s3_client,
file_name=package_hash,
): package_hash
for package_hash, pkg_body in names_mapping.items()
}
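For reference, get_pypi_names_and_version (unchanged above except for the new empty-name guard) maps an artifact's file listing to normalized PyPI names and versions. A hedged sketch of the expected behavior, with an illustrative dist-info path; the exact output depends on the regexes defined at the top of the module:

from parselmouth.updater import get_pypi_names_and_version

files = [
    "lib/python3.9/site-packages/requests-2.31.0.dist-info/METADATA",
]

# Expected shape: {"requests": "2.31.0"} (normalized name -> version).
print(get_pypi_names_and_version(files))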
44 changes: 12 additions & 32 deletions src/parselmouth/updater_merger.py
@@ -1,46 +1,26 @@
import io
import json
import boto3
import os
from dotenv import load_dotenv
import logging

load_dotenv()
from parselmouth.s3 import s3_client

account_id = os.environ["R2_PREFIX_ACCOUNT_ID"]
access_key_id = os.environ["R2_PREFIX_ACCESS_KEY_ID"]
access_key_secret = os.environ["R2_PREFIX_SECRET_ACCESS_KEY"]
bucket_name = os.environ["R2_PREFIX_BUCKET"]


def upload(file_name: str, bucket_name: str, file_body: dict, s3_client):
output = json.dumps(file_body)
output_as_file = io.BytesIO(output.encode("utf-8"))

s3_client.upload_fileobj(output_as_file, bucket_name, f"hash-v0/{file_name}")


if __name__ == "__main__":
s3_client = boto3.client(
service_name="s3",
endpoint_url=f"https://{account_id}.r2.cloudflarestorage.com",
aws_access_key_id=f"{access_key_id}",
aws_secret_access_key=f"{access_key_secret}",
region_name="eeur", # Must be one of: wnam, enam, weur, eeur, apac, auto
)

obj_key = "hash-v0/index.json"
response = s3_client.get_object(Bucket=bucket_name, Key=obj_key)
existing_mapping_data: dict = json.loads(response["Body"].read().decode("utf-8"))
def main(output_dir: str = "output"):
existing_mapping_data = s3_client.get_mapping()

total_new_files = 0

for filename in os.listdir("output"):
filepath = os.path.join("output", filename)
for filename in os.listdir(output_dir):
filepath = os.path.join(output_dir, filename)
with open(filepath) as partial_file:
partial_json = json.load(partial_file)
existing_mapping_data.update(partial_json)
total_new_files += 1

print(f"Total new files {total_new_files}")
logging.info(f"Total new files {total_new_files}")

s3_client.upload_mapping(existing_mapping_data, "index.json")

upload("index.json", bucket_name, existing_mapping_data, s3_client)

if __name__ == "__main__":
main()
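
The parselmouth.s3 module that both scripts now import is one of the 11 changed files but is not rendered above. Based on the call sites (get_mapping, upload_mapping(file_body=..., file_name=...)) and on the boto3/R2 setup this commit removes from each script, it plausibly looks roughly like the following reconstruction (an assumption, not the committed code):

import io
import json
import os

import boto3
from dotenv import load_dotenv

load_dotenv()

account_id = os.getenv("R2_PREFIX_ACCOUNT_ID", "")
access_key_id = os.getenv("R2_PREFIX_ACCESS_KEY_ID", "")
access_key_secret = os.getenv("R2_PREFIX_SECRET_ACCESS_KEY", "")
bucket_name = os.getenv("R2_PREFIX_BUCKET", "conda")


class S3Client:
    def __init__(self):
        # Same Cloudflare R2 endpoint and credentials the scripts used before.
        self._client = boto3.client(
            service_name="s3",
            endpoint_url=f"https://{account_id}.r2.cloudflarestorage.com",
            aws_access_key_id=access_key_id,
            aws_secret_access_key=access_key_secret,
            region_name="eeur",  # Must be one of: wnam, enam, weur, eeur, apac, auto
        )

    def get_mapping(self) -> dict:
        # Download and decode the existing hash-v0/index.json mapping.
        response = self._client.get_object(Bucket=bucket_name, Key="hash-v0/index.json")
        return json.loads(response["Body"].read().decode("utf-8"))

    def upload_mapping(self, file_body: dict, file_name: str):
        # Serialize and upload one mapping file under the hash-v0/ prefix.
        output = io.BytesIO(json.dumps(file_body).encode("utf-8"))
        self._client.upload_fileobj(output, bucket_name, f"hash-v0/{file_name}")


s3_client = S3Client()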
67 changes: 14 additions & 53 deletions src/parselmouth/updater_producer.py
@@ -1,11 +1,9 @@
import json
import os
import re
from typing import Any
import requests
import logging
import boto3
from dotenv import load_dotenv

from parselmouth.conda_forge import get_all_archs_available, get_subdir_repodata
from parselmouth.s3 import s3_client


dist_info_pattern = r"([^/]+)-(\d+[^/]*)\.dist-info\/METADATA"
@@ -15,52 +13,12 @@
egg_pattern_compiled = re.compile(egg_info_pattern)


load_dotenv()

account_id = os.environ["R2_PREFIX_ACCOUNT_ID"]
access_key_id = os.environ["R2_PREFIX_ACCESS_KEY_ID"]
access_key_secret = os.environ["R2_PREFIX_SECRET_ACCESS_KEY"]
bucket_name = os.environ["R2_PREFIX_BUCKET"]


def get_all_archs_available() -> list[str]:
response = requests.get("https://conda.anaconda.org/conda-forge/channeldata.json")
channel_json = response.json()
# Collect all subdirectories
subdirs: list[str] = []
for package in channel_json["packages"].values():
subdirs.extend(package.get("subdirs", []))

return list(set(subdirs))


def get_subdir_repodata(subdir: str) -> dict[Any, Any]:
url = f"https://conda.anaconda.org/conda-forge/{subdir}/repodata.json"
response = requests.get(url)
if response.ok:
return response.json()

logging.error(f"Requst for repodata to {url} failed. {response.reason}")

raise Exception(f"Requst for repodata to {url} failed. {response.reason}")


if __name__ == "__main__":
def main(output_dir: str = "output_index"):
subdirs = get_all_archs_available()

all_packages: list[tuple[str, str]] = []

s3_client = boto3.client(
service_name="s3",
endpoint_url=f"https://{account_id}.r2.cloudflarestorage.com",
aws_access_key_id=f"{access_key_id}",
aws_secret_access_key=f"{access_key_secret}",
region_name="eeur", # Must be one of: wnam, enam, weur, eeur, apac, auto
)

index_obj_key = "hash-v0/index.json"
response = s3_client.get_object(Bucket=bucket_name, Key=index_obj_key)
existing_mapping_data = json.loads(response["Body"].read().decode("utf-8"))
existing_mapping_data = s3_client.get_mapping()

letters = set()

@@ -79,11 +37,14 @@ def get_subdir_repodata(subdir: str) -> dict[Any, Any]:
all_packages.append(package_name)
letters.add(f"{subdir}@{package_name[0]}")

total = 0
log_once = False

os.makedirs("output_index", exist_ok=True)
with open("output_index/index.json", mode="w") as mapping_file:
os.makedirs(output_dir, exist_ok=True)
with open(f"{output_dir}/index.json", mode="w") as mapping_file:
json.dump(existing_mapping_data, mapping_file)

print(json.dumps(list(letters)))
json_letters = json.dumps(list(letters))

print(json_letters)


if __name__ == "__main__":
main()
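
The producer's printed JSON list of subdir@letter pairs is what the updater's __main__ consumes via sys.argv[1].split("@"). A hedged sketch of a driver loop connecting the two (in the real pipeline this fan-out is presumably handled by CI, e.g. a job matrix):

import json
import subprocess

# Illustrative output captured from updater_producer's final print().
letters_json = '["noarch@p", "linux-64@n"]'

for item in json.loads(letters_json):
    # One updater invocation per "subdir@letter" work item.
    subprocess.run(["python", "-m", "parselmouth.updater", item], check=True)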
5 changes: 5 additions & 0 deletions src/parselmouth/utils.py
@@ -0,0 +1,5 @@
import re


def normalize(name: str) -> str:
return re.sub(r"[-_.]+", "-", name).lower()
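
normalize applies the usual PyPI-style name normalization: runs of -, _, and . collapse to a single hyphen and the result is lowercased. For example:

from parselmouth.utils import normalize

assert normalize("Typing_Extensions") == "typing-extensions"
assert normalize("ruamel.yaml") == "ruamel-yaml"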
