Skip to content

Commit

Permalink
fix/package urls (teaxyz#9)
Browse files Browse the repository at this point in the history
* package urls

* load package urls
  • Loading branch information
sanchitram1 authored Oct 15, 2024
1 parent 37cebf1 commit 1090f86
Show file tree
Hide file tree
Showing 3 changed files with 84 additions and 6 deletions.
27 changes: 25 additions & 2 deletions src/pipeline/crates.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from os import getenv

from dataclasses import dataclass
import sys

from src.pipeline.utils.crates.structures import URLTypes, UserTypes
from src.pipeline.utils.fetcher import TarballFetcher
Expand Down Expand Up @@ -63,17 +64,39 @@ def fetch(config: Config) -> None:


def load(db: DB, transformer: CratesTransformer, config: Config) -> None:
    """Load the transformed crates data into the database, one entity at a time.

    Every stage logs a start message, passes the transformer's output for that
    entity to the matching ``db.insert_*`` method, and logs a completion
    message. Package URLs are loaded after both packages and URLs have been
    inserted. Once all stages have run, a load-history entry is recorded for
    the package manager.

    Args:
        db: database handle exposing the ``insert_*`` methods used below.
        transformer: provides the rows for each entity (``packages()``,
            ``urls()``, ``package_urls()``, ...).
        config: supplies ``package_manager_id``, ``user_types`` and the
            ``test`` flag; when ``test`` is truthy the user-version and
            dependency stages are skipped.
    """
    logger.log("loading crates packages...this should take a minute")
    db.insert_packages(transformer.packages(), config.package_manager_id, "crates")
    logger.log("✅ inserted packages")

    # URLs are inserted exactly once, ahead of the package URLs stage.
    logger.log("loading crates urls...this should take a minute")
    db.insert_urls(transformer.urls())
    logger.log("✅ inserted urls")

    logger.log("loading crates package urls...this should take ~3 minutes")
    db.insert_package_urls(transformer.package_urls())
    logger.log("✅ inserted package urls")

    logger.log("loading crates versions...this should take ~5 minutes")
    db.insert_versions(transformer.versions())
    logger.log("✅ inserted versions")

    logger.log("loading crates users...this should take a minute")
    db.insert_users(transformer.users(), config.user_types.crates)
    logger.log("✅ inserted users")

    logger.log("loading crates user packages...this should take a few seconds")
    db.insert_user_packages(transformer.user_packages())
    logger.log("✅ inserted user packages")

    if not config.test:
        # these are bigger files, so we skip them in tests
        logger.log("loading crates user versions...this should take ~5 minutes")
        db.insert_user_versions(transformer.user_versions(), config.user_types.github)
        logger.log("✅ inserted user versions")

        logger.log("loading crates dependencies...this should take ~1 hour")
        db.insert_dependencies(transformer.dependencies())
        logger.log("✅ inserted dependencies")

    db.insert_load_history(config.package_manager_id)
    logger.log("✅ crates")
Expand Down
58 changes: 55 additions & 3 deletions src/pipeline/utils/pg.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
LoadHistory,
Package,
PackageManager,
# PackageURL,
PackageURL,
Source,
URLType,
User,
Expand Down Expand Up @@ -316,8 +316,60 @@ def process_url(item: Dict[str, str]):
self._insert_batch(URL, self._process_batch(batch, process_url))

def insert_package_urls(self, package_url_generator: Iterable[dict[str, str]]):
    """Insert one ``PackageURL`` row per item yielded by the generator.

    Each item is read for the keys ``import_id``, ``url`` and ``url_type_id``.
    Items are handled in batches of ``DEFAULT_BATCH_SIZE``: the package ids
    and URL ids a batch needs are resolved (and cached) first, then the batch
    is converted to ``PackageURL`` dicts and inserted.

    An item whose package or URL cannot be resolved is logged and mapped to
    ``None`` (presumably dropped by ``_process_batch`` -- confirm).

    Args:
        package_url_generator: iterable of dicts carrying ``import_id``,
            ``url`` and ``url_type_id``.
    """
    # A URL is looked up by its string together with its type, so the cache
    # is keyed by the (url, url_type_id) pair.
    url_cache: Dict[tuple[str, str], UUID] = {}

    def fetch_packages_and_urls(items: List[Dict[str, str]]):
        # Fetch only the packages that are not already in the shared cache.
        package_ids = build_query_params(items, self.package_cache, "import_id")

        if package_ids:
            packages = self._batch_fetch(Package, "import_id", list(package_ids))
            self.package_cache.update(
                self._cache_objects(packages, "import_id", "id")
            )

        # for url ids, we can't use batch_fetch, because we need to provide the
        # url_type_id in addition to the url string itself
        # so, let's do it the old fashioned way
        for item in items:
            key = (item["url"], item["url_type_id"])
            if key in url_cache:
                continue
            url = self.select_url_by_url_and_type(*key)
            # A missing URL must not abort the whole load: leave the key
            # uncached so process_package_url logs and skips the item.
            if url is not None:
                url_cache[key] = url.id

    def process_package_url(item: Dict[str, str]):
        package_id = self.package_cache.get(item["import_id"])
        if not package_id:
            self.logger.warn(f"package_id not found for {item['import_id']}")
            return None

        url_id = url_cache.get((item["url"], item["url_type_id"]))
        if not url_id:
            self.logger.warn(f"url_id not found for {item['url']}")
            return None

        return PackageURL(
            package_id=package_id,
            url_id=url_id,
        ).to_dict()

    def insert_batch(items: List[Dict[str, str]]):
        # Shared by full batches and the final partial batch.
        fetch_packages_and_urls(items)
        self._insert_batch(
            PackageURL, self._process_batch(items, process_package_url)
        )

    batch = []
    for item in package_url_generator:
        batch.append(item)
        if len(batch) == DEFAULT_BATCH_SIZE:
            insert_batch(batch)
            batch = []

    # Flush whatever is left once the generator is exhausted.
    if batch:
        insert_batch(batch)

def insert_source(self, name: str) -> UUID:
with self.session() as session:
Expand Down
5 changes: 4 additions & 1 deletion src/pipeline/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,10 @@ def safe_int(val: str) -> int | None:
return int(val)


# TODO: needs explanation or simplification
# Given some items and a cache, return the list of item[attr] values that are
# not yet in the cache, so that we can use them as parameters in a query.
# `attr` has to be a key of each item, and item[attr] is what the cache is
# keyed by.
def build_query_params(
items: List[Dict[str, str]], cache: dict, attr: str
) -> List[str]:
Expand Down

0 comments on commit 1090f86

Please sign in to comment.