Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: parse and clean archive badges and markdown links to URL #243

Merged
merged 15 commits into from
Jan 14, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@

## [Unreleased]

[v1.4] - 2024-11-22
* Fix: Parse archive and JOSS links to handle markdown links and validate DOI links are valid. Added python-doi as a dependency to ensure archive/DOI URLs fully resolve (@banesullivan)

## [v1.4] - 2024-11-22

Notes: it looks like i may have mistakenly bumped to 1.3.7 in august. rather than try to fix on pypi we will just go with it to ensure our release cycles are smooth given no one else uses this package except pyopensci.

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ classifiers = [
]
dependencies = [
"pydantic>=2.0",
"python-doi",
"python-dotenv",
"requests",
"ruamel-yaml>=0.17.21",
Expand Down
49 changes: 27 additions & 22 deletions src/pyosmeta/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from enum import Enum
from typing import Any, Optional, Set, Union

import requests
from pydantic import (
AliasChoices,
BaseModel,
Expand All @@ -19,7 +18,12 @@
)

from pyosmeta.models.github import Labels
from pyosmeta.utils_clean import clean_date, clean_markdown
from pyosmeta.utils_clean import (
check_url,
clean_archive,
clean_date,
clean_markdown,
)


class Partnerships(str, Enum):
Expand Down Expand Up @@ -59,29 +63,12 @@
elif not url.startswith("http"):
print("Oops, missing http")
url = "https://" + url
if cls._check_url(url=url):
if check_url(url=url):

Check warning on line 66 in src/pyosmeta/models/base.py

View check run for this annotation

Codecov / codecov/patch

src/pyosmeta/models/base.py#L66

Added line #L66 was not covered by tests
return url
else:
else: # pragma: no cover
print(f"Oops, url `{url}` is not valid, removing it")
return None

@staticmethod
def _check_url(url: str) -> bool:
"""Test url. Return true if there's a valid response, False if not

Parameters
----------
url : str
String for a url to a website to test.

"""

try:
response = requests.get(url, timeout=6)
return response.status_code == 200
except Exception:
print("Oops, url", url, "is not valid, removing it")
return False


class PersonModel(BaseModel, UrlValidatorMixin):
model_config = ConfigDict(
Expand Down Expand Up @@ -403,3 +390,21 @@
label.name if isinstance(label, Labels) else label
for label in labels
]

@field_validator(
"archive",
mode="before",
)
@classmethod
def clean_archive(cls, archive: str) -> str:
"""Clean the archive value to ensure it's a valid archive URL."""
return clean_archive(archive)

@field_validator(
"joss",
mode="before",
)
@classmethod
def clean_joss(cls, joss: str) -> str:
"""Clean the joss value to ensure it's a valid URL."""
return clean_archive(joss)
4 changes: 4 additions & 0 deletions src/pyosmeta/parse_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,9 @@ def _postprocess_meta(self, meta: dict, body: List[str]) -> dict:
meta["partners"] = self.get_categories(
body, "## Community Partnerships", 3, keyed=True
)
if "joss_doi" in meta:
# Normalize the JOSS archive field. Some issues use `JOSS DOI` others `JOSS`
meta["joss"] = meta.pop("joss_doi")

return meta

Expand Down Expand Up @@ -303,6 +306,7 @@ def parse_issues(
reviews = {}
errors = {}
for issue in issues:
print(f"Processing review {issue.title}")
try:
review = self.parse_issue(issue)
reviews[review.package_name] = review
Expand Down
84 changes: 84 additions & 0 deletions src/pyosmeta/utils_clean.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,9 @@
from datetime import datetime
from typing import Any

import doi
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since we've added a new dep, we should make sure that it is noted in the changelog and also document why we added it.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Further clarified this in e1246e0

import requests


def get_clean_user(username: str) -> str:
"""Cleans a GitHub username provided in a review issue by removing any
Expand Down Expand Up @@ -125,3 +128,84 @@
review_dict["date_accepted"] = value
break
return review_dict


def check_url(url: str) -> bool:
"""Test url. Return true if there's a valid response, False if not

Parameters
----------
url : str
String for a url to a website to test.

"""

try:
response = requests.get(url, timeout=6)
return response.status_code == 200
except Exception: # pragma: no cover
return False


def is_doi(archive) -> str | None:
"""Check if the DOI is valid and return the DOI link.

Parameters
----------
archive : str
The DOI string to validate, e.g., `10.1234/zenodo.12345678`

Returns
-------
str | None
The DOI link in the form `https://doi.org/10.1234/zenodo.12345678` or `None`
if the DOI is invalid.
"""
try:
return doi.validate_doi(archive)
except ValueError:
pass


def clean_archive(archive):
"""Clean an archive link to ensure it is a valid DOI URL.

This utility will attempt to parse the DOI link from the various formats
that are commonly present in review metadata. This utility will handle:

* Markdown links in the format `[label](URL)`, e.g., `[my archive](https://doi.org/10.1234/zenodo.12345678)`
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

it looks like the issue that failed when i ran it locally using a zenodo badge

pyOpenSci/software-submission#83

honestly, I may have updated and added that (it is possible). But it would be good to parse a markdown badge url too.

let me know if you don't see that error but i saw it running locally.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

All good, it was the JOSS DOI field causing the failure as it was left black for pyOpenSci/software-submission#83. The zenodo badge in this example is handle and a recurring pattern on a lot of reviews

* Raw text in the format `DOI` e.g., `10.1234/zenodo.12345678`
* URLs in the format `http(s)://...` e.g., `https://doi.org/10.1234/zenodo.12345678`
* The special cases `n/a` and `tbd` which will be returned as `None` in anticipation of future data

If the archive link is a URL, it will be returned as is with a check that
it resolves but is not required to be a valid DOI. If the archive link is
a DOI, it will be validated and returned as a URL in the form
`https://doi.org/10.1234/zenodo.12345678` using the `python-doi` package.

"""
archive = archive.strip() # Remove leading/trailing whitespace
if not archive:
# If field is empty, return None
return None
if archive.startswith("[") and archive.endswith(")"):
# Extract the outermost link
link = archive[archive.rfind("](") + 2 : -1]
# recursively clean the archive link
return clean_archive(link)
elif link := is_doi(archive):
# is_doi returns the DOI link if it is valid
return link
elif archive.startswith("http"):
if archive.startswith("http://"):
archive = archive.replace("http://", "https://")

Check warning on line 201 in src/pyosmeta/utils_clean.py

View check run for this annotation

Codecov / codecov/patch

src/pyosmeta/utils_clean.py#L201

Added line #L201 was not covered by tests
# Validate that the URL resolves
if not check_url(archive):
raise ValueError(f"Invalid archive URL (not resolving): {archive}")

Check warning on line 204 in src/pyosmeta/utils_clean.py

View check run for this annotation

Codecov / codecov/patch

src/pyosmeta/utils_clean.py#L204

Added line #L204 was not covered by tests
return archive
elif archive.lower() == "n/a":
return None
elif archive.lower() == "tbd":
return None
else:
raise ValueError(f"Invalid archive URL: {archive}")
28 changes: 28 additions & 0 deletions tests/data/reviews/archives_doi.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Submitting Author: Fakename (@fakeauthor)
All current maintainers: (@fakeauthor1, @fakeauthor2)
Package Name: fake_package
One-Line Description of Package: A fake python package
Repository Link: https://example.com/fakeauthor1/fake_package
Version submitted: v1.0.0
EiC: @fakeeic
Editor: @fakeeditor
Reviewer 1: @fakereviewer1
Reviewer 2: @fakereviewer2
Reviews Expected By: fake date
Archive: 10.5281/zenodo.8415866
JOSS DOI: 10.21105/joss.01450
Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive))
Date accepted (month/day/year): 06/29/2024

---

## Scope

- [x] I agree to abide by [pyOpenSci's Code of Conduct][PyOpenSciCodeOfConduct] during the review process and in maintaining my package after should it be accepted.
- [x] I have read and will commit to package maintenance after the review as per the [pyOpenSci Policies Guidelines][Commitment].
(etc)

## Community Partnerships

- [ ] etc
- [ ] aaaaaa
28 changes: 28 additions & 0 deletions tests/data/reviews/archives_invalid.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Submitting Author: Fakename (@fakeauthor)
All current maintainers: (@fakeauthor1, @fakeauthor2)
Package Name: fake_package
One-Line Description of Package: A fake python package
Repository Link: https://example.com/fakeauthor1/fake_package
Version submitted: v1.0.0
EiC: @fakeeic
Editor: @fakeeditor
Reviewer 1: @fakereviewer1
Reviewer 2: @fakereviewer2
Reviews Expected By: fake date
Archive: 10.1234/zenodo.12345678
JOSS DOI: 10.21105/joss.00000
Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive))
Date accepted (month/day/year): 06/29/2024

---

## Scope

- [x] I agree to abide by [pyOpenSci's Code of Conduct][PyOpenSciCodeOfConduct] during the review process and in maintaining my package after should it be accepted.
- [x] I have read and will commit to package maintenance after the review as per the [pyOpenSci Policies Guidelines][Commitment].
(etc)

## Community Partnerships

- [ ] etc
- [ ] aaaaaa
28 changes: 28 additions & 0 deletions tests/data/reviews/archives_missing.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Submitting Author: Fakename (@fakeauthor)
All current maintainers: (@fakeauthor1, @fakeauthor2)
Package Name: fake_package
One-Line Description of Package: A fake python package
Repository Link: https://example.com/fakeauthor1/fake_package
Version submitted: v1.0.0
EiC: @fakeeic
Editor: @fakeeditor
Reviewer 1: @fakereviewer1
Reviewer 2: @fakereviewer2
Reviews Expected By: fake date
Archive:
JOSS DOI:
Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive))
Date accepted (month/day/year): 06/29/2024

---

## Scope

- [x] I agree to abide by [pyOpenSci's Code of Conduct][PyOpenSciCodeOfConduct] during the review process and in maintaining my package after should it be accepted.
- [x] I have read and will commit to package maintenance after the review as per the [pyOpenSci Policies Guidelines][Commitment].
(etc)

## Community Partnerships

- [ ] etc
- [ ] aaaaaa
28 changes: 28 additions & 0 deletions tests/data/reviews/archives_unknown.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
Submitting Author: Fakename (@fakeauthor)
All current maintainers: (@fakeauthor1, @fakeauthor2)
Package Name: fake_package
One-Line Description of Package: A fake python package
Repository Link: https://example.com/fakeauthor1/fake_package
Version submitted: v1.0.0
EiC: @fakeeic
Editor: @fakeeditor
Reviewer 1: @fakereviewer1
Reviewer 2: @fakereviewer2
Reviews Expected By: fake date
Archive: TBD
JOSS DOI: N/A
Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive))
Date accepted (month/day/year): 06/29/2024

---

## Scope

- [x] I agree to abide by [pyOpenSci's Code of Conduct][PyOpenSciCodeOfConduct] during the review process and in maintaining my package after should it be accepted.
- [x] I have read and will commit to package maintenance after the review as per the [pyOpenSci Policies Guidelines][Commitment].
(etc)

## Community Partnerships

- [ ] etc
- [ ] aaaaaa
4 changes: 2 additions & 2 deletions tests/data/reviews/bolded_keys.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
**Reviewer 1:** @fakereviewer1
**Reviewer 2:** @fakereviewer2
**Reviews Expected By:** fake date
**Archive:** [![DOI](https://example.com/fakearchive)](https://example.com/fakearchive)
**Version accepted:** 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive))
**Archive:** [![DOI](https://example.com/fakearchive)](https://zenodo.org/records/8415866)
**Version accepted:** 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://zenodo.org/records/8415866))
**Date accepted (month/day/year):** 06/29/2024

---
Expand Down
4 changes: 2 additions & 2 deletions tests/data/reviews/partnership_astropy.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ Version submitted: v.0.8.5
Editor: @editoruser
Reviewer 1: @reviewer1
Reviewer 2: @reviewer2
Archive: [![DOI](https://zenodo.org/badge/DOI/fakedoi/doi.svg)](https://doi.org/fakedoi/doi.svg)
JOSS DOI: [![DOI](https://joss.theoj.org/papers/fakedoi.svg)](https://joss.theoj.org/papers/fakedoi)
Archive: [![DOI](https://zenodo.org/badge/DOI/fakedoi/doi.svg)](https://zenodo.org/records/8415866)
JOSS DOI: [![DOI](https://joss.theoj.org/papers/fakedoi.svg)](https://doi.org/10.21105/joss.01450)
Version accepted: v.0.9.2
Date accepted (month/day/year): 04/21/2024

Expand Down
2 changes: 1 addition & 1 deletion tests/data/reviews/reviewer_keyed.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ Editor: @fakeeditor
Reviewer 1: @fakereviewer1
Reviewer 2: @fakereviewer2
Reviews Expected By: fake date
Archive: [![DOI](https://example.com/fakearchive)](https://example.com/fakearchive)
Archive: [![DOI](https://example.com/fakearchive)](https://zenodo.org/records/8415866)
Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive))
Date accepted (month/day/year): 06/29/2024

Expand Down
3 changes: 2 additions & 1 deletion tests/data/reviews/reviewer_list.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ EiC: @fakeeic
Editor: @fakeeditor
Reviewers: @fakereviewer1 , @fakereviewer2, @fakereviewer3
Reviews Expected By: fake date
Archive: [![DOI](https://example.com/fakearchive)](https://example.com/fakearchive)
Archive: [![DOI](https://example.com/fakearchive)](https://zenodo.org/records/8415866)
JOSS DOI: [![DOI](https://example.com/fakearchive)](https://doi.org/10.21105/joss.01450)
Version accepted: 2.0.0 ([repo](https://example.com/fakeauthor1/fake_package/releases/tag/v2.0.0), [pypi](https://pypi.org/project/fake_project/2.0.0), [archive](https://example.com/fakearchive))
Date accepted (month/day/year): 06/29/2024

Expand Down
27 changes: 27 additions & 0 deletions tests/integration/test_parse_issues.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,33 @@ def test_parse_bolded_keys(process_issues, data_file):
assert review.package_name == "fake_package"


def test_parse_doi_archives(process_issues, data_file):
"""
Test handling of DOI archives in various formats.

This is a smoke test to ensure graceful handling of these cases.
"""
review = data_file("reviews/archives_doi.txt", True)
review = process_issues.parse_issue(review)
assert review.archive == "https://zenodo.org/record/8415866"
assert review.joss == "http://joss.theoj.org/papers/10.21105/joss.01450"

review = data_file("reviews/archives_unknown.txt", True)
review = process_issues.parse_issue(review)
assert review.archive is None
assert review.joss is None

review = data_file("reviews/archives_missing.txt", True)
review = process_issues.parse_issue(review)
assert review.archive is None
assert review.joss is None

review = data_file("reviews/archives_invalid.txt", True)

with pytest.raises(ValueError, match="Invalid archive"):
review = process_issues.parse_issue(review)


def test_parse_labels(issue_list, process_issues):
"""
`Label` models should be coerced to a string when parsing an issue
Expand Down
Loading