feat: refactor and add tests
nichmor committed May 2, 2024
1 parent c7a2256 commit df12997
Showing 11 changed files with 1,254 additions and 217 deletions.
27 changes: 27 additions & 0 deletions .github/workflows/run_tests.yml
@@ -0,0 +1,27 @@
name: Run tests for parselmouth

on:
push:
workflow_dispatch:
pull_request:

jobs:
run_tests:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: '3.9'

- uses: prefix-dev/[email protected]
with:
pixi-version: "latest"
manifest-path: pyproject.toml

- name: run tests
run: |
pixi run run_tests
1,120 changes: 1,058 additions & 62 deletions pixi.lock

Large diffs are not rendered by default.

12 changes: 11 additions & 1 deletion pyproject.toml
@@ -10,7 +10,7 @@ authors = [
]
description = "Mapper of conda to pypi"
readme = "README.md"
requires-python = "==3.9"
requires-python = ">=3.9"
classifiers = [
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
@@ -37,3 +37,13 @@ requests = ">=2.31.0,<2.32"
boto3 = ">=1.34,<1.35"
python-dotenv = ">=1.0.1,<1.1"
packaging = "*"

[tool.pixi.feature.test.dependencies]
pytest = "*"

[tool.pixi.feature.test.tasks]
run_tests = "pytest tests"


[tool.pixi.environments]
test = {features = ["test"], solve-group = "default"}
40 changes: 40 additions & 0 deletions src/parselmouth/conda_forge.py
@@ -0,0 +1,40 @@
import requests
import logging
from conda_forge_metadata.artifact_info.info_json import get_artifact_info_as_json


def get_all_archs_available() -> list[str]:
response = requests.get("https://conda.anaconda.org/conda-forge/channeldata.json")
channel_json = response.json()
# Collect all subdirectories
subdirs: list[str] = []
for package in channel_json["packages"].values():
subdirs.extend(package.get("subdirs", []))

return list(set(subdirs))


def get_subdir_repodata(subdir: str) -> dict:
url = f"https://conda.anaconda.org/conda-forge/{subdir}/repodata.json"
response = requests.get(url)
if not response.ok:
logging.error(f"Requst for repodata to {url} failed. {response.reason}")

response.raise_for_status()

return response.json()


def get_all_packages_by_subdir(subdir: str) -> dict[str, dict]:
repodatas: dict[str, dict] = {}

repodata = get_subdir_repodata(subdir)

repodatas.update(repodata["packages"])
repodatas.update(repodata["packages.conda"])

return repodatas


def get_artifact_info(subdir, artifact, backend, channel="conda-forge"):
return get_artifact_info_as_json(channel, subdir, artifact, backend)
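
The new conda_forge module centralizes the channel queries that updater.py and updater_producer.py previously duplicated. A minimal usage sketch of its public helpers (the "noarch" subdir is just an illustrative value):

import logging

from parselmouth.conda_forge import (
    get_all_archs_available,
    get_all_packages_by_subdir,
)

logging.basicConfig(level=logging.INFO)

# Every subdir (platform) published on the conda-forge channel.
subdirs = get_all_archs_available()

# Merged "packages" and "packages.conda" records for one subdir.
noarch_packages = get_all_packages_by_subdir("noarch")

print(f"{len(subdirs)} subdirs, {len(noarch_packages)} noarch artifacts")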
82 changes: 13 additions & 69 deletions src/parselmouth/updater.py
@@ -1,17 +1,18 @@
import io
import json
import sys
import os
import re
from typing import Optional
import requests
from conda_forge_metadata.artifact_info.info_json import get_artifact_info_as_json
from conda_forge_metadata.types import ArtifactData
import concurrent.futures
import logging
import boto3
from dotenv import load_dotenv
from packaging.version import parse
from parselmouth.conda_forge import (
get_all_packages_by_subdir,
get_artifact_info,
)
from parselmouth.s3 import s3_client
from parselmouth.utils import normalize


names_mapping: dict[str, dict] = {}
@@ -23,20 +24,6 @@
egg_pattern_compiled = re.compile(egg_info_pattern)


load_dotenv()

account_id = os.getenv("R2_PREFIX_ACCOUNT_ID", "")
access_key_id = os.getenv("R2_PREFIX_ACCESS_KEY_ID", "")
access_key_secret = os.getenv("R2_PREFIX_SECRET_ACCESS_KEY", "")
bucket_name = os.getenv("R2_PREFIX_BUCKET", "conda")


def normalize(name: Optional[str]) -> Optional[str]:
if not name:
return None
return re.sub(r"[-_.]+", "-", name).lower()


def check_if_is_direct_url(package_name: str, url: Optional[str]) -> bool:
if not url:
return False
@@ -58,26 +45,6 @@ def check_if_is_direct_url(package_name: str, url: Optional[str]) -> bool:
return True


def get_all_archs_available() -> set[str]:
response = requests.get("https://conda.anaconda.org/conda-forge/channeldata.json")
channel_json = response.json()
# Collect all subdirectories
subdirs: list[str] = []
for package in channel_json["packages"].values():
subdirs.extend(package.get("subdirs", []))

return list(set(subdirs))


def get_subdir_repodata(subdir: str) -> dict:
url = f"https://conda.anaconda.org/conda-forge/{subdir}/repodata.json"
response = requests.get(url)
if response.ok:
return response.json()

logging.error(f"Requst for repodata to {url} failed. {response.reason}")


def get_pypi_names_and_version(files: list[str]) -> dict[str, str]:
"""
Return a dictionary of normalized names to their versions
@@ -89,6 +56,9 @@ def get_pypi_names_and_version(files: list[str]) -> dict[str, str]:
)
if match:
package_name = match.group(1)
if not package_name:
continue

version = match.group(2)
if "-py" in version:
index_of_py = version.index("-py")
@@ -111,38 +81,15 @@ def get_pypi_names_and_version(files: list[str]) -> dict[str, str]:
return package_names


def upload(file_name: str, bucket_name: str, file_body: dict, s3_client):
output = json.dumps(file_body)
output_as_file = io.BytesIO(output.encode("utf-8"))

s3_client.upload_fileobj(output_as_file, bucket_name, f"hash-v0/{file_name}")


if __name__ == "__main__":
subdir, letter = sys.argv[1].split("@")

all_packages: list[tuple[str, str]] = []

s3_client = boto3.client(
service_name="s3",
endpoint_url=f"https://{account_id}.r2.cloudflarestorage.com",
aws_access_key_id=f"{access_key_id}",
aws_secret_access_key=f"{access_key_secret}",
region_name="eeur", # Must be one of: wnam, enam, weur, eeur, apac, auto
)

# index_obj_key = "hash-v0/index.json"
# response = s3_client.get_object(Bucket=bucket_name, Key=index_obj_key)

with open("output_index/index.json") as index_file:
existing_mapping_data: dict = json.load(index_file)

repodatas: dict[str, dict] = {}

repodata = get_subdir_repodata(subdir)

repodatas.update(repodata["packages"])
repodatas.update(repodata["packages.conda"])
repodatas = get_all_packages_by_subdir(subdir)

for idx, package_name in enumerate(repodatas):
if not package_name.startswith(letter):
Expand All @@ -162,8 +109,7 @@ def upload(file_name: str, bucket_name: str, file_body: dict, s3_client):
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {
executor.submit(
get_artifact_info_as_json,
channel="conda-forge",
get_artifact_info,
subdir=subdir,
artifact=package_name,
backend=backend_type,
@@ -229,11 +175,9 @@ def upload(file_name: str, bucket_name: str, file_body: dict, s3_client):
with concurrent.futures.ThreadPoolExecutor() as executor:
futures = {
executor.submit(
upload,
file_name=package_hash,
s3_client.upload_mapping,
file_body=pkg_body,
bucket_name=bucket_name,
s3_client=s3_client,
file_name=package_hash,
): package_hash
for package_hash, pkg_body in names_mapping.items()
}
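For reference, get_pypi_names_and_version (unchanged above except for the new empty-name guard) maps an artifact's file listing to normalized PyPI names and versions. A hedged sketch of the expected behavior, with an illustrative dist-info path; the exact output depends on the regexes defined at the top of the module:

from parselmouth.updater import get_pypi_names_and_version

files = [
    "lib/python3.9/site-packages/requests-2.31.0.dist-info/METADATA",
]

# Expected shape: {"requests": "2.31.0"} (normalized name -> version).
print(get_pypi_names_and_version(files))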
44 changes: 12 additions & 32 deletions src/parselmouth/updater_merger.py
@@ -1,46 +1,26 @@
import io
import json
import boto3
import os
from dotenv import load_dotenv
import logging

load_dotenv()
from parselmouth.s3 import s3_client

account_id = os.environ["R2_PREFIX_ACCOUNT_ID"]
access_key_id = os.environ["R2_PREFIX_ACCESS_KEY_ID"]
access_key_secret = os.environ["R2_PREFIX_SECRET_ACCESS_KEY"]
bucket_name = os.environ["R2_PREFIX_BUCKET"]


def upload(file_name: str, bucket_name: str, file_body: dict, s3_client):
output = json.dumps(file_body)
output_as_file = io.BytesIO(output.encode("utf-8"))

s3_client.upload_fileobj(output_as_file, bucket_name, f"hash-v0/{file_name}")


if __name__ == "__main__":
s3_client = boto3.client(
service_name="s3",
endpoint_url=f"https://{account_id}.r2.cloudflarestorage.com",
aws_access_key_id=f"{access_key_id}",
aws_secret_access_key=f"{access_key_secret}",
region_name="eeur", # Must be one of: wnam, enam, weur, eeur, apac, auto
)

obj_key = "hash-v0/index.json"
response = s3_client.get_object(Bucket=bucket_name, Key=obj_key)
existing_mapping_data: dict = json.loads(response["Body"].read().decode("utf-8"))
def main(output_dir: str = "output"):
existing_mapping_data = s3_client.get_mapping()

total_new_files = 0

for filename in os.listdir("output"):
filepath = os.path.join("output", filename)
for filename in os.listdir(output_dir):
filepath = os.path.join(output_dir, filename)
with open(filepath) as partial_file:
partial_json = json.load(partial_file)
existing_mapping_data.update(partial_json)
total_new_files += 1

print(f"Total new files {total_new_files}")
logging.info(f"Total new files {total_new_files}")

s3_client.upload_mapping(existing_mapping_data, "index.json")

upload("index.json", bucket_name, existing_mapping_data, s3_client)

if __name__ == "__main__":
main()
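
The parselmouth.s3 module that both scripts now import is one of the 11 changed files but is not rendered above. Based on the call sites (get_mapping, upload_mapping(file_body=..., file_name=...)) and on the boto3/R2 setup this commit removes from each script, it plausibly looks roughly like the following reconstruction (an assumption, not the committed code):

import io
import json
import os

import boto3
from dotenv import load_dotenv

load_dotenv()

account_id = os.getenv("R2_PREFIX_ACCOUNT_ID", "")
access_key_id = os.getenv("R2_PREFIX_ACCESS_KEY_ID", "")
access_key_secret = os.getenv("R2_PREFIX_SECRET_ACCESS_KEY", "")
bucket_name = os.getenv("R2_PREFIX_BUCKET", "conda")


class S3Client:
    def __init__(self):
        # Same Cloudflare R2 endpoint and credentials the scripts used before.
        self._client = boto3.client(
            service_name="s3",
            endpoint_url=f"https://{account_id}.r2.cloudflarestorage.com",
            aws_access_key_id=access_key_id,
            aws_secret_access_key=access_key_secret,
            region_name="eeur",  # Must be one of: wnam, enam, weur, eeur, apac, auto
        )

    def get_mapping(self) -> dict:
        # Download and decode the existing hash-v0/index.json mapping.
        response = self._client.get_object(Bucket=bucket_name, Key="hash-v0/index.json")
        return json.loads(response["Body"].read().decode("utf-8"))

    def upload_mapping(self, file_body: dict, file_name: str):
        # Serialize and upload one mapping file under the hash-v0/ prefix.
        output = io.BytesIO(json.dumps(file_body).encode("utf-8"))
        self._client.upload_fileobj(output, bucket_name, f"hash-v0/{file_name}")


s3_client = S3Client()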
67 changes: 14 additions & 53 deletions src/parselmouth/updater_producer.py
@@ -1,11 +1,9 @@
import json
import os
import re
from typing import Any
import requests
import logging
import boto3
from dotenv import load_dotenv

from parselmouth.conda_forge import get_all_archs_available, get_subdir_repodata
from parselmouth.s3 import s3_client


dist_info_pattern = r"([^/]+)-(\d+[^/]*)\.dist-info\/METADATA"
@@ -15,52 +13,12 @@
egg_pattern_compiled = re.compile(egg_info_pattern)


load_dotenv()

account_id = os.environ["R2_PREFIX_ACCOUNT_ID"]
access_key_id = os.environ["R2_PREFIX_ACCESS_KEY_ID"]
access_key_secret = os.environ["R2_PREFIX_SECRET_ACCESS_KEY"]
bucket_name = os.environ["R2_PREFIX_BUCKET"]


def get_all_archs_available() -> list[str]:
response = requests.get("https://conda.anaconda.org/conda-forge/channeldata.json")
channel_json = response.json()
# Collect all subdirectories
subdirs: list[str] = []
for package in channel_json["packages"].values():
subdirs.extend(package.get("subdirs", []))

return list(set(subdirs))


def get_subdir_repodata(subdir: str) -> dict[Any, Any]:
url = f"https://conda.anaconda.org/conda-forge/{subdir}/repodata.json"
response = requests.get(url)
if response.ok:
return response.json()

logging.error(f"Requst for repodata to {url} failed. {response.reason}")

raise Exception(f"Requst for repodata to {url} failed. {response.reason}")


if __name__ == "__main__":
def main(output_dir: str = "output_index"):
subdirs = get_all_archs_available()

all_packages: list[tuple[str, str]] = []

s3_client = boto3.client(
service_name="s3",
endpoint_url=f"https://{account_id}.r2.cloudflarestorage.com",
aws_access_key_id=f"{access_key_id}",
aws_secret_access_key=f"{access_key_secret}",
region_name="eeur", # Must be one of: wnam, enam, weur, eeur, apac, auto
)

index_obj_key = "hash-v0/index.json"
response = s3_client.get_object(Bucket=bucket_name, Key=index_obj_key)
existing_mapping_data = json.loads(response["Body"].read().decode("utf-8"))
existing_mapping_data = s3_client.get_mapping()

letters = set()

@@ -79,11 +37,14 @@ def get_subdir_repodata(subdir: str) -> dict[Any, Any]:
all_packages.append(package_name)
letters.add(f"{subdir}@{package_name[0]}")

total = 0
log_once = False

os.makedirs("output_index", exist_ok=True)
with open("output_index/index.json", mode="w") as mapping_file:
os.makedirs(output_dir, exist_ok=True)
with open(f"{output_dir}/index.json", mode="w") as mapping_file:
json.dump(existing_mapping_data, mapping_file)

print(json.dumps(list(letters)))
json_letters = json.dumps(list(letters))

print(json_letters)


if __name__ == "__main__":
main()
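
The producer's printed JSON list of subdir@letter pairs is what the updater's __main__ consumes via sys.argv[1].split("@"). A hedged sketch of a driver loop connecting the two (in the real pipeline this fan-out is presumably handled by CI, e.g. a job matrix):

import json
import subprocess

# Illustrative output captured from updater_producer's final print().
letters_json = '["noarch@p", "linux-64@n"]'

for item in json.loads(letters_json):
    # One updater invocation per "subdir@letter" work item.
    subprocess.run(["python", "-m", "parselmouth.updater", item], check=True)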
5 changes: 5 additions & 0 deletions src/parselmouth/utils.py
@@ -0,0 +1,5 @@
import re


def normalize(name: str) -> str:
return re.sub(r"[-_.]+", "-", name).lower()
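
normalize applies the usual PyPI-style name normalization: runs of -, _, and . collapse to a single hyphen and the result is lowercased. For example:

from parselmouth.utils import normalize

assert normalize("Typing_Extensions") == "typing-extensions"
assert normalize("ruamel.yaml") == "ruamel-yaml"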
