diff --git a/netkan/netkan/download_counter.py b/netkan/netkan/download_counter.py
index a2dba32..d226bcc 100644
--- a/netkan/netkan/download_counter.py
+++ b/netkan/netkan/download_counter.py
@@ -190,11 +190,8 @@ def empty(self) -> bool:
     def full(self) -> bool:
         return len(self.ids) >= self.MODULES_PER_REQUEST
 
-    def _get_ia_ident(self, ckan: Ckan) -> str:
-        return f'{ckan.identifier}-{ckan.version.string.replace(":", "-")}'
-
     def add(self, ckan: Ckan) -> None:
-        self.ids[ckan.identifier] = self._get_ia_ident(ckan)
+        self.ids[ckan.identifier] = ckan.mirror_item()
 
     def get_result(self, counts: Optional[Dict[str, int]] = None) -> Dict[str, int]:
         if counts is None:
diff --git a/netkan/netkan/metadata.py b/netkan/netkan/metadata.py
index bfe2ede..1d0bf05 100644
--- a/netkan/netkan/metadata.py
+++ b/netkan/netkan/metadata.py
@@ -119,6 +119,8 @@ class Ckan:
 
     EPOCH_VERSION_REGEXP = re.compile('^[0-9]+:')
 
+    BUCKET_EXCLUDE_PATTERN = re.compile(r'^[^a-zA-Z0-9]+|[^a-zA-Z0-9._-]')
+
     REDISTRIBUTABLE_LICENSES = {
         "public-domain",
         "Apache", "Apache-1.0", "Apache-2.0",
@@ -443,6 +445,19 @@ def mirror_download(self, with_epoch: bool = True) -> Optional[str]:
             return f'https://archive.org/download/{self.identifier}-{self._format_version(with_epoch)}/{filename}'
         return None
 
+    def mirror_item(self, with_epoch: bool = True) -> str:
+        return self._ia_bucket_sanitize(
+            f'{self.identifier}-{self._format_version(with_epoch)}')
+
+    # InternetArchive says:
+    # Bucket names should be valid archive identifiers;
+    # try someting matching this regular expression:
+    # ^[a-zA-Z0-9][a-zA-Z0-9_.-]{4,100}$
+    # (We enforce everything except the minimum of 4 characters)
+    @classmethod
+    def _ia_bucket_sanitize(cls, s: str) -> str:
+        return cls.BUCKET_EXCLUDE_PATTERN.sub('', s)[:100]
+
     def _format_version(self, with_epoch: bool) -> Optional[str]:
         if self.version:
             if with_epoch:
diff --git a/netkan/netkan/mirrorer.py b/netkan/netkan/mirrorer.py
index 2dc4b03..975c5b0 100644
--- a/netkan/netkan/mirrorer.py
+++ b/netkan/netkan/mirrorer.py
@@ -28,8 +28,6 @@ class CkanMirror(Ckan):
     DESCRIPTION_TEMPLATE = Template(
         legacy_read_text('netkan', 'mirror_description_template.jinja2'))
 
-    BUCKET_EXCLUDE_PATTERN = re.compile(r'^[^a-zA-Z0-9]+|[^a-zA-Z0-9._-]')
-
     LICENSE_URLS = {
         "Apache" : 'http://www.apache.org/licenses/LICENSE-1.0',
         "Apache-1.0" : 'http://www.apache.org/licenses/LICENSE-1.0',
@@ -133,10 +131,6 @@ def license_urls(self) -> List[str]:
         return [self.LICENSE_URLS[lic]
                 for lic in self.licenses() if lic in self.LICENSE_URLS]
 
-    def mirror_item(self, with_epoch: bool = True) -> str:
-        return self._ia_bucket_sanitize(
-            f'{self.identifier}-{self._format_version(with_epoch)}')
-
     def mirror_source_filename(self, with_epoch: bool = True) -> str:
         return self._ia_bucket_sanitize(
             f'{self.identifier}-{self._format_version(with_epoch)}.source.zip')
@@ -144,15 +138,6 @@ def mirror_source_filename(self, with_epoch: bool = True) -> str:
     def mirror_title(self, with_epoch: bool = True) -> str:
         return f'{self.name} - {self._format_version(with_epoch)}'
 
-    # InternetArchive says:
-    # Bucket names should be valid archive identifiers;
-    # try someting matching this regular expression:
-    # ^[a-zA-Z0-9][a-zA-Z0-9_.-]{4,100}$
-    # (We enforce everything except the minimum of 4 characters)
-    @classmethod
-    def _ia_bucket_sanitize(cls, s: str) -> str:
-        return cls.BUCKET_EXCLUDE_PATTERN.sub('', s)[:100]
-
     @property
     def item_metadata(self) -> Dict[str, Any]:
         lic_urls = self.license_urls()