Skip to content

Commit

Permalink
🚧 First WIP commit for replace-first algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
pmonks committed Nov 11, 2024
1 parent e9f433b commit c94062d
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 25 deletions.
3 changes: 3 additions & 0 deletions src/lice_comb/impl/id_detection.clj
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,9 @@
"UPL" {
:regex #"(?i)\bUniversal\s+Permissive(\s+Licen[cs]e)?([\s,-]+(V(ersion)?)?\s*(?<version>\d+(\.\d+)?)?)?\b"
:fn (constantly ["UPL-1.0" :high])} ; There are no other listed versions of this license
"W3C" {
:regex #"(?i)\bW3C\b"
:fn (constantly ["W3C" :high])}
"WTFPL" {
:regex #"(?i)\b(WTFPL|DO-WTF-U-WANT-2|Do\s+What\s+The\s+Fuck\s+You\s+Want\s+To(\s+Public)?(\s+Licen[cs]e)?)\b"
:fn (constantly ["WTFPL" :high])}
Expand Down
43 changes: 28 additions & 15 deletions src/lice_comb/impl/parsing.clj
Original file line number Diff line number Diff line change
Expand Up @@ -199,10 +199,14 @@
; [s nil]
; ;####TODO: LOSING A LOT OF IMPORTANT CONTEXT HERE!!!!!
; (let [result (lciid/replace-family "GPL" s)
; result (lciid/replace-family "CDDL" (first result))]
; result (lciid/replace-family "CDDL" (first result))
; result (lciid/replace-family "X11" (first result))]
; [(first result) nil]))) ;####TODO: BOGUS EI (second tuple element)

(defn- replace-operators-with-keywords
"Replaces `String`s that represent SPDX expression operators in `coll` with
an equivalent keyword (`:and`, `:or`, `:with`), or nothing if the 'operator'
in question is unidentifiable (e.g. `and/or`, `/`, `\\`)."
[coll]
(filter identity
(map #(let [trimmed (s/trim %)
Expand Down Expand Up @@ -268,6 +272,28 @@
%)
coll)))

(defn- group-expressions
"Groups expressions in `coll` into sequences of valid SPDX expressions (albeit
in sequence form, rather than `String` form.
For example:
[\"Apache-2.0\" \"MIT\"] -> [[\"Apache-2.0\"] [\"MIT\"]]
[\"Apache-2.0\" :or \"MIT\"] -> [[\"Apache-2.0\" :or \"MIT\"]]
[\"Apache-2.0\" :and \"MIT\" \"GPL-2.0-or-later\"] -> [[\"Apache-2.0\" :and \"MIT\"] [\"GPL-2.0-or-later\"]]"
[coll]
(loop [result [[]]
[f & r] coll]
(if-not f
; Base case
result
; Recursive case
(let [l (last result)]
(case [(string? (last l)) (string? f)]
[true true] (recur (conj result [f]) r) ; String/string, so start a new nested sequence in result
([true false] [false true]) (recur (conj (vec (drop-last result)) (conj l f)) r) ; String/keyword or keyword/string, so continue the current last collection in result
; [false false] ; Not possible - we've already removed leading and consecutive keywords in fragments (in remove-invalid-operator-keywords)
)))))

(defn- rebuild-expressions
"Rebuilds one or more SPDX expressions from the given `fragments` and
expression-infos (`eis`). `fragments` is a heterogeneous sequence containing
Expand All @@ -281,20 +307,7 @@
[fragments existing-eis]
(let [eis (concat existing-eis (filter identity (mapcat #(when (map? %) (vals %)) fragments)))
expr-elements (mapcat #(if (keyword? %) [%] (keys %)) fragments)
regrouped-fragments (loop [result [[]]
[f & r] expr-elements]
(if-not f
; Base case
result
; Recursive case
(let [l (last result)]
(case [(string? (last l)) (string? f)]
[true true] (recur (conj result [f]) r) ; String/string, so add a "gap" to the result
[true false] (recur (conj (drop-last result) (conj l f)) r) ; String/keyword, so continue the current last collection in result
[false true] (recur (conj (drop-last result) (conj l f)) r) ; Keyword/string, so continue the current last collection in result
; [false false] ; Not possible - we've already removed consecutive keywords in fragments (in remove-invalid-operator-keywords/collapse-duplicate-operator-keywords)
))))
expressions (map #(sexp/normalise (s/join " " (map name %))) regrouped-fragments)
expressions (map #(sexp/normalise (s/join " " (map name %))) (group-expressions expr-elements))
; Now regroup expression-infos with their associated expression(s)
ei-lookup (group-by :id eis)
expr-ei-pairs (mapcat #(let [ids (sexp/extract-ids (sexp/parse %))]
Expand Down
40 changes: 38 additions & 2 deletions src/lice_comb/impl/spdx.clj
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,18 @@
(str "-((v|ver|version)-)?" ; Note: - character not escaped because that happens later
(s/join "." (map #(str "0*" %) version-components))))) ; Note: . character not escaped because that happens later

; Note: these regexes uses classes (e.g. [\\/-\s]{1,4}) instead of alternation (e.g. (\\|/|-|\s){1,4}) due to an apparent bug in the JVM's regex libraries when
; the latter are used in look-behind groups. See https://stackoverflow.com/questions/24874404/java-regex-look-behind-group-does-not-have-obvious-maximum-length-error/24922107
(defn- special-case-ids
"Special case handling for specific ids."
[s id]
(case id
"MIT" (s/replace s #"(?i)\bMIT\b" "(?<!X11[\\\\\\\\/\\\\-\\\\s]{1,4})MIT(?![\\\\\\\\/\\\\-\\\\s]{1,4}X11)")
"X11" (s/replace s #"(?i)\bX11\b" "(MIT[\\\\\\\\/\\\\-\\\\s]+)?X11([\\\\\\\\/\\\\-\\\\s]+MIT)?")
"Libpng" (s/replace s #"(?i)\blibpng\b" "(?<!zlib[\\\\\\\\/\\\\-\\\\s]{1,4})libpng")
"libpng-2.0" (s/replace s #"(?i)\blibpng\b" "(?<!zlib[\\\\\\\\/\\\\-\\\\s]{1,4})libpng")
s))

; Only public for the unit tests
(defn id->regex
"Turns `id`, an SPDX license or exception id, into a regex that can be used to
Expand All @@ -107,6 +119,8 @@
(s/replace #"-" "[\\\\-\\\\s]*") ; hyphens as optional hyphens or whitespace
(s/replace #"(?i)\blater\b" "\\\\b(lat[eo]r|newer|greater)\\\\b") ; alternative "or later" formulations
(s/replace #"\." "\\\\.") ; escape . character
; Special cases
(special-case-ids id)
; Add end expressions and remove redundant word boundary matches
(str "((?=\\s)|\\z)")
(s/replace #"(\\b)+" "\\\\b")
Expand Down Expand Up @@ -161,6 +175,29 @@
(str license-str " ((v|ver|version)[\\-\\s]*)?" ; Note: whitespace not escaped because that happens later
(s/join "\\." (map #(str "0*" %) version-components)))))

(defn- special-case-names
"Special case handling for specific names."
[s n]
(cond
(s/includes? n "Apache")
(s/replace s #"(?i)\bApache\b" "Apache(\\\\s+Software)?")

(s/includes? n "MIT")
(s/replace s #"(?i)\bMIT\b" "(?!X11(/?|\\\\s{1,4}))MIT(?!(/?|\\\\s{1,4})X11)")

(s/includes? n "Public")
(s/replace s #"(?i)\s+Public\b" "(\\\\s+Public)?")

(and (or (s/includes? n "libpng") (s/includes? n "Libpng"))
(not (s/includes? n "zlib")))
(s/replace s #"(?i)\blibpng\b" "(?<!zlib[\\\\\\\\/\\\\-\\\\s]{1,4})libpng")

(and (s/includes? n "zlib"))
(s/replace s #"(?i)\bzlib(/libpng)?\b" "zlib(([\\\\\\\\/\\\\-\\\\s]+)libpng)?")

:else
s))

; Only public for the unit tests
(defn name->regex
"Turns `n`, a license or exception name, into a regex that can be used to
Expand All @@ -182,8 +219,7 @@
(s/replace #"(?i)\bmerchant[ai]bility\b" "Merchant[ai]bility")
(s/replace #"(?i)\bnon(\\\-)?commercial\b" "Non(\\\\-)?commercial") ; Note: weird syntax in find regex as hyphens have already been escaped
; Special cases
(s/replace #"(?i)\bApache\b" "Apache(\\\\s+Software)?")
(s/replace #"(?i)\s+Public\b" "(\\\\s+Public)?")
(special-case-names n)
; Whitespace variance
(s/replace #"\s+" "\\\\s+")
; End clauses
Expand Down
28 changes: 20 additions & 8 deletions test/lice_comb/matching_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@
(ns lice-comb.matching-test
(:require [clojure.test :refer [deftest testing is use-fixtures]]
[lice-comb.test-boilerplate :refer [fixture valid= valid-info=]]
[lice-comb.impl.spdx :as lcis]
[lice-comb.matching :refer [init! unidentified? proprietary-commercial? text->expressions name->expressions name->expressions-info uri->expressions]]
[spdx.licenses :as slic]
[spdx.exceptions :as sexc]))
[spdx.exceptions :as sexc]
[lice-comb.impl.spdx :as lcis]
[lice-comb.matching :refer [init! unidentified? proprietary-commercial? text->expressions name->expressions name->expressions-info uri->expressions]]))

(use-fixtures :once fixture)

Expand Down Expand Up @@ -84,6 +84,12 @@
(is (nil? (name->expressions " ")))
(is (nil? (name->expressions "\n")))
(is (nil? (name->expressions "\t"))))
(testing "Simple unidentified names"
(is (valid= #{(lcis/name->unidentified-addition-ref "AND")} (name->expressions "AND")))
(is (valid= #{(lcis/name->unidentified-addition-ref "or")} (name->expressions "or")))
(is (valid= #{(lcis/name->unidentified-addition-ref "with")} (name->expressions "with")))
(is (valid= #{(lcis/name->unidentified-addition-ref "foo")} (name->expressions "foo")))
(is (valid= #{(lcis/name->unidentified-addition-ref "@$%^*)")} (name->expressions "@$%^*)"))))
(testing "SPDX license ids"
(is (valid= #{"AGPL-3.0-only"} (name->expressions "AGPL-3.0")))
(is (valid= #{"AGPL-3.0-only"} (name->expressions "AGPL-3.0-only")))
Expand Down Expand Up @@ -162,9 +168,12 @@
(is (valid= #{"Apache-2.0" "GPL-3.0-only WITH Classpath-exception-2.0"} (name->expressions "Apache License version 2.0 / GNU General Public License version 3 with classpath exception")))
(is (valid= #{"EPL-2.0 OR (Apache-2.0 AND BSD-3-Clause) OR (GPL-2.0-or-later WITH Classpath-exception-2.0 AND MIT)"} (name->expressions "Eclipse Public License or General Public License 2.0 or (at your discretion) later w/ classpath exception aNd MIT Licence or three clause bsd and Apache Licence"))))
(testing "Listed names"
; We use `some` here, because some license names resolve to multiple licenses ids
(run! #(is (some #{(:id %)} (name->expressions (:name %)))) @lcis/license-list-d)
(run! #(is (some #{(:id %)} (name->expressions (:name %)))) @lcis/exception-list-d))
; We use the full license lists here, rather than the ones lice-comb uses for detection, since the real world may contain anything
(let [license-list (map slic/id->info (slic/ids))
exception-list (map sexc/id->info (sexc/ids))]
; We use `some` here, because some license names resolve to multiple licenses ids
(run! #(is (some #{(:id %)} (name->expressions (:name %)))) license-list)
(run! #(is (some #{(:id %)} (name->expressions (:name %)))) exception-list)))
(testing "Names seen in handpicked POMs on Maven Central"
(is (valid= #{"AGPL-3.0-only"} (name->expressions "GNU Affero General Public License (AGPL) version 3.0")))
(is (valid= #{"AGPL-3.0-only"} (name->expressions "GNU Affero General Public License v3.0 only")))
Expand Down Expand Up @@ -618,7 +627,7 @@
(is (valid= #{"LGPL-3.0-or-later"} (name->expressions "LGPL-3.0-or-later")))
(is (valid= #{"LGPL-3.0-or-later"} (name->expressions "LGPLv3+")))
(is (valid= #{"LGPL-3.0-or-later"} (name->expressions "Licensed under GNU Lesser General Public License Version 3 or later (the "))) ; Note trailing space
(is (valid= #{"Libpng"} (name->expressions "zlib/libpng License")))
(is (valid= #{"Zlib"} (name->expressions "zlib/libpng License"))) ; This is a peculiar (i.e. erroneously specified) case from https://github.com/IGJoshua/glfw-clj/blob/f41c41f8011a1a8108b3760f5f81262a7a75bead/pom.xml#L9, which itself came from https://www.glfw.org/
(is (valid= #{"MIT" "Apache-2.0" "BSD-3-Clause"} (name->expressions "MIT/Apache-2.0/BSD-3-Clause")))
(is (valid= #{"MIT"} (name->expressions " MIT License")))
(is (valid= #{"MIT"} (name->expressions "Distributed under an MIT-style license (see LICENSE for details).")))
Expand Down Expand Up @@ -657,7 +666,7 @@
(is (valid= #{"MPL-2.0"} (name->expressions "Mozilla Public License 2.0")))
(is (valid= #{"MPL-2.0"} (name->expressions "Mozilla Public License Version 2.0")))
(is (valid= #{"MPL-2.0"} (name->expressions "Mozilla Public License v2.0")))
(is (valid= #{"MPL-2.0"} (name->expressions "Mozilla Public License v2.0+")))
(is (valid= #{"MPL-2.0+"} (name->expressions "Mozilla Public License v2.0+")))
(is (valid= #{"MPL-2.0"} (name->expressions "Mozilla Public License version 2")))
(is (valid= #{"MPL-2.0"} (name->expressions "Mozilla Public License version 2.0")))
(is (valid= #{"MPL-2.0"} (name->expressions "Mozilla Public License"))) ; Missing version - we assume the latest
Expand Down Expand Up @@ -785,6 +794,8 @@
(is (unidentified-only? (name->expressions "wisdragon")))
(is (unidentified-only? (name->expressions "wiseloong")))))

;####TEST!!!!
(comment
(deftest name->expressions-info-tests
(testing "Nil, empty or blank"
(is (nil? (name->expressions-info nil)))
Expand Down Expand Up @@ -855,3 +866,4 @@
(is (= #{"Apache-2.0"} (uri->expressions "HTTPS://GITHUB.COM/pmonks/lice-comb/blob/main/LICENSE"))))
(testing "URIs that aren't in the SPDX license list, but do match via retrieval, HTML->text conversion, and full text matching"
(is (= #{"MPL-2.0"} (uri->expressions "https://www.mozilla.org/en-US/MPL/2.0/")))))
)

0 comments on commit c94062d

Please sign in to comment.