From 0ce0e4e6ab688f152336d00c236f9d7b2e1a6c8d Mon Sep 17 00:00:00 2001 From: Yannik Tausch Date: Fri, 19 Jul 2024 17:51:07 +0200 Subject: [PATCH 1/4] fix license discovery: split at dashes for word-based matching --- grayskull/license/discovery.py | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/grayskull/license/discovery.py b/grayskull/license/discovery.py index 6e0159a69..24ba7e643 100644 --- a/grayskull/license/discovery.py +++ b/grayskull/license/discovery.py @@ -62,6 +62,16 @@ def get_all_licenses_from_spdx() -> List: ] +def _replace_dashes(s: str) -> str: + """ + Replace dashes with spaces. + + :param s: string to replace dashes with spaces + :return: string with dashes replaced by spaces + """ + return s.replace("-", " ") + + def _match_scrambled_exact(candidate, licenses) -> str | None: """ Return license with rearranged word order only. @@ -130,12 +140,21 @@ def match_license(name: str) -> dict: lic[0] for lic in original_matches if lic[1] >= spdx_license[1] ] if len(best_matches) > 1: + # we replace dashes by spaces here to match instances like + # "3-Clause BSD" with "BSD-3-Clause" which otherwise would + # not work with word-based scores like token_sort_ratio spdx_license = process.extractOne( - name, best_matches, scorer=token_sort_ratio + name, + best_matches, + scorer=token_sort_ratio, + processor=_replace_dashes, ) if original_matches and original_matches[0][1] < 0.55: spdx_license = process.extractOne( - name, [m[0] for m in original_matches], scorer=token_sort_ratio + name, + [m[0] for m in original_matches], + scorer=token_sort_ratio, + processor=_replace_dashes, ) if spdx_license[1] != 100 and spdx_license[0].startswith("MIT"): From f0ca4bb45a35a285bf3e401294558eec3066a1da Mon Sep 17 00:00:00 2001 From: Yannik Tausch Date: Tue, 23 Jul 2024 14:23:26 +0200 Subject: [PATCH 2/4] add license map to self test --- tests/license/test_discovery.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/license/test_discovery.py b/tests/license/test_discovery.py index 6bb8dff76..6347a009f 100644 --- a/tests/license/test_discovery.py +++ b/tests/license/test_discovery.py @@ -77,6 +77,14 @@ def test_short_license_id(licence_name, short_licence): assert get_short_license_id(licence_name) == short_licence +@pytest.mark.parametrize( + "license_id", [lic["licenseId"] for lic in get_all_licenses_from_spdx()] +) +def test_short_license_id_map_to_self(license_id: str): + print(license_id) + assert get_short_license_id(license_id) == license_id + + def test_get_other_names_from_opensource(): assert sorted(get_other_names_from_opensource("MIT")) == sorted(["MIT", "Expat"]) From 0155b2e49f963123bde4fa01b66b0fda39c950d0 Mon Sep 17 00:00:00 2001 From: Yannik Tausch Date: Tue, 23 Jul 2024 14:36:30 +0200 Subject: [PATCH 3/4] ensure licenses are mapped to self --- grayskull/license/discovery.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/grayskull/license/discovery.py b/grayskull/license/discovery.py index 24ba7e643..4cd8c0acf 100644 --- a/grayskull/license/discovery.py +++ b/grayskull/license/discovery.py @@ -98,6 +98,9 @@ def match_license(name: str) -> dict: name = re.sub(r"\s+license\s*", "", name.strip(), flags=re.IGNORECASE) name = name.strip() + if name in _get_all_license_choice(all_licenses): + return _get_license(name, all_licenses) + exact_match = _match_scrambled_exact(name, _get_all_license_choice(all_licenses)) if exact_match: best_matches = [(exact_match, 100, 0)] From a46e072f4e720186b2959f38afdd0b5aa0ea2ae3 Mon Sep 17 00:00:00 2001 From: Yannik Tausch Date: Mon, 29 Jul 2024 10:48:54 +0200 Subject: [PATCH 4/4] remove print Co-authored-by: Marcelo Duarte Trevisani --- tests/license/test_discovery.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/license/test_discovery.py b/tests/license/test_discovery.py index 6347a009f..8752ff5d0 100644 --- a/tests/license/test_discovery.py +++ b/tests/license/test_discovery.py @@ -81,7 +81,6 @@ def test_short_license_id(licence_name, short_licence): "license_id", [lic["licenseId"] for lic in get_all_licenses_from_spdx()] ) def test_short_license_id_map_to_self(license_id: str): - print(license_id) assert get_short_license_id(license_id) == license_id