feat: extract common code in artifact.py module (#31)
nichmor authored Sep 11, 2024
1 parent 390027c commit ef9fdda
Showing 8 changed files with 1,751 additions and 155 deletions.
4 changes: 4 additions & 0 deletions .github/workflows/update_pytorch.yml
@@ -38,6 +38,10 @@ jobs:
# Run your script here to produce a list of missing subdirs and letters
# example: linux-64@p linux-64@d
message=$(pixi run parselmouth updater-producer --channel pytorch)
if [ "$message" = "[]" ]; then
echo "Skipping job as no new packages found"
exit 0
fi
echo $message
echo "string_list=$message" >> $GITHUB_OUTPUT
env:
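
The new guard works because the producer prints a JSON-encoded list of work items, and an empty list serializes to the literal two-character string "[]". A minimal sketch of that contract, assuming updater-producer emits its list via json.dumps (the sample items below are hypothetical):

import json

# Hypothetical work items in the "subdir@letter" format the workflow comment
# describes; an empty list means there is nothing for downstream jobs to do.
work_items: list[str] = []  # e.g. ["linux-64@p", "linux-64@d"] when work exists

message = json.dumps(work_items)
print(message)  # prints "[]" for an empty list

# This is exactly the string the workflow's `[ "$message" = "[]" ]` test
# compares against before exiting early.
assert message == "[]"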
1,614 changes: 1,609 additions & 5 deletions pixi.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
@@ -34,6 +34,7 @@ platforms = ["osx-arm64", "linux-64", "win-64"]

[tool.pixi.pypi-dependencies]
parselmouth = { path = ".", editable = true }
types-aioboto3 = "*"

[tool.pixi.dependencies]
requests = ">=2.31.0,<2.32"
114 changes: 114 additions & 0 deletions src/parselmouth/internals/artifact.py
@@ -0,0 +1,114 @@
from pathlib import Path
import re
from packaging.version import parse
from parselmouth.internals.utils import normalize
from typing import Optional
from conda_forge_metadata.types import ArtifactData
import logging

from parselmouth.internals.s3 import MappingEntry


dist_info_pattern = r"([^/]+)-(\d+[^/]*)\.dist-info\/METADATA"
egg_info_pattern = r"([^/]+?)-(\d+[^/]*)\.egg-info\/PKG-INFO"

dist_pattern_compiled = re.compile(dist_info_pattern)
egg_pattern_compiled = re.compile(egg_info_pattern)


def check_if_is_direct_url(package_name: str, url: Optional[str | list[str]]) -> bool:
if not url:
return False
urls = None
if not isinstance(url, str):
logging.warning(f"{package_name} contains multiple urls")
urls = url
else:
        urls = [url]

if all(
url.startswith("https://pypi.io/packages/")
or url.startswith("https://pypi.org/packages/")
or url.startswith("https://pypi.python.org/packages/")
for url in urls
):
return False

return True


def get_pypi_names_and_version(files: list[str]) -> dict[str, str]:
"""
Return a dictionary of normalized names to it's version
"""
package_names: dict[str, str] = {}
for file_name in files:
file_path = Path(file_name)
# sometimes, packages like setuptools have some stuff vendored
# that our regex will catch:
# site-packages/setuptools/_vendor/zipp-3.19.2.dist-info/RECORD
        # but in reality we don't want to include these vendored packages
if "_vendor" in file_path.parts or "_vendored" in file_path.parts:
continue
match = dist_pattern_compiled.search(file_name) or egg_pattern_compiled.search(
file_name
)
if match:
package_name = match.group(1)
if not package_name:
continue

version = match.group(2)
if "-py" in version:
index_of_py = version.index("-py")
version = version[:index_of_py]

pkg_version = None

try:
pkg_version = parse(version)
except Exception:
if "-" in version:
index_of_dash = version.rfind("-")
version = version[:index_of_dash]

if pkg_version:
version = str(pkg_version)

package_names[normalize(package_name)] = version

return package_names


def extract_artifact_mapping(artifact: ArtifactData, package_name: str) -> MappingEntry:
pypi_names_and_versions = get_pypi_names_and_version(artifact["files"])
pypi_normalized_names = (
[name for name in pypi_names_and_versions] if pypi_names_and_versions else None
)
source: Optional[dict] = artifact["rendered_recipe"].get("source", None)
is_direct_url: Optional[bool] = None

if source and isinstance(source, list):
source = artifact["rendered_recipe"]["source"][0]
is_direct_url = check_if_is_direct_url(
package_name,
source.get("url"),
)

conda_name = artifact["name"]

if not is_direct_url or not source:
direct_url = None
else:
url = source.get("url", None)
direct_url = [str(url)] if isinstance(url, str) else url

return MappingEntry.model_validate(
{
"pypi_normalized_names": pypi_normalized_names,
"versions": pypi_names_and_versions if pypi_names_and_versions else None,
"conda_name": str(conda_name),
"package_name": package_name,
"direct_url": direct_url,
}
)
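
A quick sketch of how check_if_is_direct_url classifies sources, assuming the package is importable; the URLs and package names below are hypothetical:

from parselmouth.internals.artifact import check_if_is_direct_url

# Canonical PyPI-hosted sources are not treated as direct URLs.
print(check_if_is_direct_url("numpy", "https://pypi.io/packages/source/n/numpy/numpy-1.26.0.tar.gz"))
# expected: False

# A source hosted anywhere else counts as a direct URL.
print(check_if_is_direct_url("mypkg", ["https://github.com/example/mypkg/archive/v1.0.tar.gz"]))
# expected: True (a "contains multiple urls" warning is logged for the list form)

# A missing URL short-circuits to False.
print(check_if_is_direct_url("mypkg", None))
# expected: False

And a sketch of get_pypi_names_and_version on a hypothetical file listing, assuming normalize applies PEP 503-style normalization (lowercase, runs of ".", "_", "-" collapsed to a single dash):

from parselmouth.internals.artifact import get_pypi_names_and_version

files = [
    "site-packages/requests-2.31.0.dist-info/METADATA",  # dist-info match
    "site-packages/zope.interface-6.0-py3.11.egg-info/PKG-INFO",  # egg-info match; "-py3.11" is trimmed
    "site-packages/setuptools/_vendor/zipp-3.19.2.dist-info/RECORD",  # vendored, skipped
]

print(get_pypi_names_and_version(files))
# expected: {"requests": "2.31.0", "zope-interface": "6.0"}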
49 changes: 14 additions & 35 deletions src/parselmouth/internals/check_one.py
@@ -1,13 +1,13 @@
import re
from typing import Optional
import logging
from parselmouth.internals.artifact import extract_artifact_mapping
from parselmouth.internals.channels import SupportedChannels
from parselmouth.internals.conda_forge import (
get_all_packages_by_subdir,
get_artifact_info,
)
from parselmouth.internals.s3 import MappingEntry, s3_client
from parselmouth.internals.updater import get_pypi_names_and_version
from rich import print


@@ -71,43 +71,11 @@ def main(
subdir=subdir, artifact=package_name, backend=backend_type, channel=channel
)
if artifact:
pypi_names_and_versions = get_pypi_names_and_version(artifact["files"])
pypi_normalized_names = (
[name for name in pypi_names_and_versions]
if pypi_names_and_versions
else None
)
source: Optional[dict] = artifact["rendered_recipe"].get("source", None)
is_direct_url: Optional[bool] = None

if source and isinstance(source, list):
source = artifact["rendered_recipe"]["source"][0]
is_direct_url = check_if_is_direct_url(
package_name,
source.get("url"),
)

sha = repodatas[package_name]["sha256"]
conda_name = artifact["name"]

if not is_direct_url or not source:
direct_url = None
else:
url = source.get("url", None)
direct_url = [url] if isinstance(url, str) else url

if sha not in names_mapping:
names_mapping[sha] = MappingEntry.model_validate(
{
"pypi_normalized_names": pypi_normalized_names,
"versions": pypi_names_and_versions
if pypi_names_and_versions
else None,
"conda_name": conda_name,
"package_name": package_name,
"direct_url": direct_url,
}
)
mapping_entry = extract_artifact_mapping(artifact, package_name)
names_mapping[sha] = mapping_entry
break

if not names_mapping:
@@ -116,6 +84,17 @@ def main(
print(names_mapping)

if upload:
# getting the index mapping
existing_mapping_data = s3_client.get_channel_index(channel=channel)
if not existing_mapping_data:
raise ValueError(f"Could not get the index mapping for channel {channel}")

# updating with the new mapping
existing_mapping_data.root.update(names_mapping)

logging.warning("Uploading index to S3")
s3_client.upload_index(existing_mapping_data, channel=channel)

logging.warning("Uploading mapping to S3")
for sha_name, mapping_body in names_mapping.items():
s3_client.upload_mapping(mapping_body, sha_name)
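
The upload path now refreshes the channel-wide index before writing the per-hash mapping files, so a lookup by SHA256 and a scan of the index stay consistent. A plain-dict sketch of that merge, with hypothetical hashes and pared-down entries standing in for the pydantic models:

# What s3_client.get_channel_index conceptually returns (IndexMapping.root):
existing_index = {
    "11aa...": {"conda_name": "numpy", "package_name": "numpy-1.26.0-py312_0.conda"},
}

# The freshly computed entries keyed by artifact hash (names_mapping above):
new_entries = {
    "22bb...": {"conda_name": "scipy", "package_name": "scipy-1.13.0-py312_0.conda"},
}

# Same semantics as existing_mapping_data.root.update(names_mapping): new
# hashes are added and a colliding hash takes the new entry.
existing_index.update(new_entries)

# The index is uploaded once, then one small mapping object per hash.
for sha, entry in new_entries.items():
    print(f"upload mapping for {entry['conda_name']} under {sha}")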
3 changes: 2 additions & 1 deletion src/parselmouth/internals/legacy_mapping.py
@@ -1,4 +1,5 @@
import json
from typing import Mapping
from deprecated import deprecated

from pydantic import BaseModel
@@ -15,7 +16,7 @@ class CompressedMapping:


def format_and_save_mapping(
mapping: dict[str, SmallMapping] | dict[str, CompressedMapping],
mapping: Mapping[str, SmallMapping | CompressedMapping],
mapping_name: str = "mapping_as_grayskull",
):
    # now let's iterate over the created small_mapping
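
The signature change in format_and_save_mapping is more than cosmetic: dict is invariant in its value type, so the old union dict[str, SmallMapping] | dict[str, CompressedMapping] rejected dictionaries that mix both models, while the read-only Mapping is covariant in its value type and accepts them. A standalone sketch with hypothetical stand-in classes:

from typing import Mapping


class SmallMapping:  # stand-in for the real pydantic model
    pass


class CompressedMapping:  # stand-in for the real pydantic model
    pass


def format_and_save(mapping: Mapping[str, SmallMapping | CompressedMapping]) -> None:
    for name, body in mapping.items():
        print(name, type(body).__name__)


homogeneous: dict[str, SmallMapping] = {"numpy": SmallMapping()}
mixed: dict[str, SmallMapping | CompressedMapping] = {"scipy": CompressedMapping()}

# Both calls type-check against the Mapping signature; under the old
# union-of-dicts signature, `mixed` would have been rejected.
format_and_save(homogeneous)
format_and_save(mixed)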
120 changes: 6 additions & 114 deletions src/parselmouth/internals/updater.py
@@ -3,98 +3,27 @@
import json
import os
from pathlib import Path
import re
from typing import Optional
import aioboto3.session
import botocore.client
from conda_forge_metadata.types import ArtifactData
import concurrent.futures
import logging
from dotenv import load_dotenv
from packaging.version import parse

from parselmouth.internals.artifact import extract_artifact_mapping
from parselmouth.internals.channels import BackendRequestType, SupportedChannels
from parselmouth.internals.conda_forge import (
get_all_packages_by_subdir,
get_artifact_info,
)
from parselmouth.internals.s3 import IndexMapping, MappingEntry
from parselmouth.internals.utils import normalize
from parselmouth.internals.s3 import IndexMapping

import aioboto3


names_mapping: IndexMapping = IndexMapping.model_construct(root={})

dist_info_pattern = r"([^/]+)-(\d+[^/]*)\.dist-info\/METADATA"
egg_info_pattern = r"([^/]+?)-(\d+[^/]*)\.egg-info\/PKG-INFO"

dist_pattern_compiled = re.compile(dist_info_pattern)
egg_pattern_compiled = re.compile(egg_info_pattern)


def check_if_is_direct_url(package_name: str, url: Optional[str | list[str]]) -> bool:
if not url:
return False
urls = None
if not isinstance(url, str):
logging.warning(f"{package_name} contains multiple urls")
urls = url
else:
        urls = [url]

if all(
url.startswith("https://pypi.io/packages/")
or url.startswith("https://pypi.org/packages/")
or url.startswith("https://pypi.python.org/packages/")
for url in urls
):
return False

return True


def get_pypi_names_and_version(files: list[str]) -> dict[str, str]:
"""
Return a dictionary of normalized names to it's version
"""
package_names: dict[str, str] = {}
for file_name in files:
file_path = Path(file_name)
# sometimes, packages like setuptools have some stuff vendored
# that our regex will catch:
# site-packages/setuptools/_vendor/zipp-3.19.2.dist-info/RECORD
        # but in reality we don't want to include these vendored packages
if "_vendor" in file_path.parts or "_vendored" in file_path.parts:
continue
match = dist_pattern_compiled.search(file_name) or egg_pattern_compiled.search(
file_name
)
if match:
package_name = match.group(1)
if not package_name:
continue

version = match.group(2)
if "-py" in version:
index_of_py = version.index("-py")
version = version[:index_of_py]

pkg_version = None

try:
pkg_version = parse(version)
except Exception:
if "-" in version:
index_of_dash = version.rfind("-")
version = version[:index_of_dash]

if pkg_version:
version = str(pkg_version)

package_names[normalize(package_name)] = version

return package_names


async def async_upload_package(
s3_client, pkg_body: str, package_hash, bucket_name: str
@@ -175,7 +104,7 @@ def main(
package = repodatas[package_name]
sha256 = package["sha256"]

if sha256 not in existing_mapping_data:
if sha256 not in existing_mapping_data.root:
# trying to get packages info using all backends.
# note: streamed is not supported for .tar.gz
if package_name.endswith(".conda"):
@@ -220,47 +149,10 @@ def main(
try:
artifact: Optional[ArtifactData] = done.result()
if artifact:
pypi_names_and_versions = get_pypi_names_and_version(
artifact["files"]
)
pypi_normalized_names = (
[name for name in pypi_names_and_versions]
if pypi_names_and_versions
else None
)
source: Optional[dict] = artifact["rendered_recipe"].get(
"source", None
)
is_direct_url: Optional[bool] = None

if source and isinstance(source, list):
source = artifact["rendered_recipe"]["source"][0]
is_direct_url = check_if_is_direct_url(
package_name,
source.get("url"),
)

sha = repodatas[package_name]["sha256"]
conda_name = artifact["name"]

if not is_direct_url or not source:
direct_url = None
else:
url = source.get("url", None)
direct_url = [str(url)] if isinstance(url, str) else url
mapping_entry = extract_artifact_mapping(artifact, package_name)

if sha not in names_mapping:
names_mapping.root[sha] = MappingEntry.model_validate(
{
"pypi_normalized_names": pypi_normalized_names,
"versions": pypi_names_and_versions
if pypi_names_and_versions
else None,
"conda_name": str(conda_name),
"package_name": package_name,
"direct_url": direct_url,
}
)
names_mapping.root[sha] = mapping_entry
else:
logging.warning(
f"Could not get artifact for {package_name} using backend: {backend_type}"
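
Both check_one and the updater now funnel through extract_artifact_mapping. A sketch of a call with a hypothetical, pared-down payload (the real ArtifactData TypedDict carries more fields, and the file and URL values below are invented):

from parselmouth.internals.artifact import extract_artifact_mapping

artifact = {
    "name": "mypkg",
    "files": ["site-packages/mypkg-1.0.0.dist-info/METADATA"],
    "rendered_recipe": {
        "source": [
            {"url": "https://pypi.io/packages/source/m/mypkg/mypkg-1.0.0.tar.gz"}
        ]
    },
}

entry = extract_artifact_mapping(artifact, "mypkg-1.0.0-py312_0.conda")  # type: ignore[arg-type]
print(entry.pypi_normalized_names)  # expected: ["mypkg"]
print(entry.direct_url)  # expected: None, since the source is canonical PyPI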
1 change: 1 addition & 0 deletions src/parselmouth/internals/updater_producer.py
@@ -39,6 +39,7 @@ def main(
if not existing_mapping_data:
existing_mapping_data = IndexMapping(root={})
else:
        # a new channel may not have any mapping data, so we need to create an empty one
existing_mapping_data = IndexMapping(root={})

letters = set()
