Skip to content

Commit

Permalink
🚧 Improved name parsing logic (WIP)
Browse files Browse the repository at this point in the history
  • Loading branch information
pmonks committed Aug 15, 2024
1 parent f30c98d commit b85c65d
Show file tree
Hide file tree
Showing 15 changed files with 581 additions and 243 deletions.
37 changes: 0 additions & 37 deletions src/lice_comb/impl/3rd_party.clj

This file was deleted.

2 changes: 1 addition & 1 deletion src/lice_comb/impl/data.clj
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
Use underscore ('_') instead.
* Unlike during class loading, Clojure does not automatically switch hyphens
in classpath resource path elements to underscores. This inconsistency can
be a time-wasting trap."
be a time-wasting foot gun."
[path]
(when-not (s/blank? path)
(try
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,10 @@
; SPDX-License-Identifier: Apache-2.0
;

(ns lice-comb.impl.regex-matching
"Helper functionality focused on regex matching. Note: this namespace is not
part of the public API of lice-comb and may change without notice."
(ns lice-comb.impl.id-detection
"Helper functionality focused on detecting SPDX id(s) from a (short) string.
Note: this namespace is not part of the public API of lice-comb and may change
without notice."
(:require [clojure.string :as s]
[clojure.set :as set]
[medley.core :as med]
Expand Down Expand Up @@ -194,11 +195,11 @@
version (get-rencgs m ["version"] (if (= variant "LGPL") "2.0" "1.0"))
version (s/replace version #"\p{Punct}+" ".")
[confidence confidence-explanations]
(if (s/blank? version)
[:low #{:missing-version}]
(if version-present?
(if (s/includes? version ".")
[:high]
[:medium #{:partial-version}]))
[:medium #{:partial-version}])
[:low #{:missing-version}])
version (if (s/includes? version ".")
version
(str version ".0"))
Expand All @@ -223,7 +224,7 @@

; The regex for the GNU family is a nightmare, so we build it up (and test it) in pieces
; AGPL family. The named group `agpl` captures the leading "AGPL" or "Affero".
; The optional groups that follow consume, in order: "GNU", "General" (the
; "Genereal" misspelling is also accepted), "Public" (or "Pulic"),
; "License"/"Licence", and a trailing, optionally parenthesised, "AGPL".
(def agpl-re #"(?<agpl>AGPL|Affero)(\s+GNU)?(\s+Genere?al)?(\s+Pub?lic)?(\s+Licen[cs]e)?(\s+\(?AGPL\)?)?")
(def lgpl-re #"(?<lgpl>(GNU\s+(Genere?al\s+)?(Library\s+or\s+Lesser|Library|Lesser))|((Library\s+or\s+Lesser|Library|Lesser)\s+(GNU|GPL|Genere?al)|(L(esser\s)?\s*GPL)))(\s+Genere?al)?(\s+Pub?lic)?(\s+Licen[cs]e)?(\s+\(?L\s*GPL\)?)?")
(def lgpl-re #"(?<lgpl>(GNU\s+(Genere?al\s+)?(Library\s+or\s+Lesser|Lesser\s+or\s+Library|Library|Lesser))|((Library\s+or\s+Lesser|Lesser\s+or\s+Library|Library|Lesser)\s+(GNU|GPL|Genere?al)|(L(esser\s)?\s*GPL)))(\s+Genere?al)?(\s+Pub?lic)?(\s+Licen[cs]e)?(\s+\(?L\s*GPL\)?)?")
; Plain GPL. The named group `gpl` captures one of: "GNU" (unless followed by
; " Classpath"), "GPL" (unless preceded by "L" or "A"), or "General Public
; License" (with the same misspelling tolerance as the other GNU regexes).
; The lookarounds reject matches preceded by Affero/Lesser/Library or followed
; by Affero/Library/Lesser/LGPL/AGPL - presumably so that those names are left
; to the AGPL and LGPL regexes (NOTE(review): confirm against where these
; pieces are combined).
(def gpl-re #"(?<!(Affero|Lesser|Library)\s+)(?<gpl>GNU(?!\s+Classpath)|(?<!(L|A)\s*)GPL|Genere?al\s+Pub?lic\s+Licen[cs]e)(?!\s+(Affero|Library|Lesser|Genere?al\s+Lesser|Genere?al\s+Library|LGPL|AGPL))((\s+General)?(?!\s+(Affero|Lesser|Library))\s+Pub?lic\s+Licen[cs]e)?(\s+\(?GPL\)?)?")
; Optional version suffix: leading whitespace/commas/hyphens, an optional "V"
; or "Version" (optionally prefixed with "_"), optional whitespace/dots/
; underscores, then the named group `version`, which captures digits with an
; optional second numeric component separated by "." or "_". Every part is
; optional, so this regex can match the empty string.
(def version-re #"[\s,-]*(_?V(ersion)?)?[\s\._]*(?<version>\d+([\._]\d+)?)?")
; Optional "only" / "or later" suffix. The named group `only` captures an
; optionally parenthesised "only". The named group `orLater` captures "later",
; "lator", "newer", "greater" or "+", optionally preceded by "or", "at your
; option"/"at your discretion", and "any". The whole suffix is optional.
(def only-or-later-re #"[\s,-]*((?<only>\(?only\)?)|(\(?or(\s+\(?at\s+your\s+(option|discretion)\)?)?(\s+any)?)?([\s-]*(?<orLater>lat[eo]r|newer|greater|\+)))?")
Expand Down Expand Up @@ -370,7 +371,7 @@
:fn (constantly ["Zlib" :high])}
])))

(defn- match
(defn- parse-id
"If a match occured for the given regex element when tested against string s,
returns a map containing the following keys:
* :id The SPDX license or exception identifier that was determined
Expand All @@ -394,7 +395,7 @@
:start (:start match)}
(when (seq confidence-explanations) {:confidence-explanations confidence-explanations})))))

(defn matches
(defn parse-ids
"Returns a sequence (NOT A SET!) of maps where each key is a SPDX license or
exception identifier (a String) that was found in s, and the value is a
sequence containing a single map describing how the identifier was determined.
Expand All @@ -410,7 +411,7 @@
Results are in the order in which they appear in the string, and the function
returns nil if there were no matches."
[s]
(when-let [matches (seq (filter identity (e/pmap* (partial match s) @license-name-matching-d)))]
(when-let [matches (seq (filter identity (e/pmap* (partial parse-id s) @license-name-matching-d)))]
(some->> matches
(med/distinct-by :id) ;####TODO: THINK ABOUT MERGING INSTEAD OF DROPPING
(sort-by :start)
Expand Down
108 changes: 34 additions & 74 deletions src/lice_comb/impl/matching.clj → src/lice_comb/impl/parsing.clj
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@
; SPDX-License-Identifier: Apache-2.0
;

(ns lice-comb.impl.matching
"Matching helper functionality. Note: this namespace is not part of
the public API of lice-comb and may change without notice."
(ns lice-comb.impl.parsing
"License name, URI, and text parsing functionality. Note: this namespace is
not part of the public API of lice-comb and may change without notice."
(:require [clojure.string :as s]
[clojure.set :as set]
[clojure.java.io :as io]
Expand All @@ -28,9 +28,9 @@
[spdx.expressions :as sexp]
[embroidery.api :as e]
[lice-comb.impl.spdx :as lcis]
[lice-comb.impl.regex-matching :as lcirm]
[lice-comb.impl.id-detection :as lciid]
[lice-comb.impl.splitting :as lcisp]
[lice-comb.impl.expressions-info :as lciei]
[lice-comb.impl.3rd-party :as lc3]
[lice-comb.impl.http :as lcihttp]
[lice-comb.impl.data :as lcid]
[lice-comb.impl.utils :as lciu]))
Expand Down Expand Up @@ -121,13 +121,13 @@
fix-mpl-2
fix-license-id-with-exception-id))

(defmulti text->expressions-info
(defmulti match-text
"Returns an expressions-info map for the given license text, or nil if no
matches are found."
{:arglists '([text])}
class)

(defmethod text->expressions-info java.lang.String
(defmethod match-text java.lang.String
[s]
; clj-spdx's *-within-text APIs are *expensive* but support batching, so we check batches of ids in parallel
(let [num-cpus (.availableProcessors (Runtime/getRuntime))
Expand All @@ -143,36 +143,36 @@
; Note: we don't need to sexp/normalise the keys here, as the only expressions that can be returned are constructed correctly
(manual-fixes (into {} (map #(hash-map % (list {:id % :type :concluded :confidence :high :strategy :spdx-matching-guidelines})) expressions-found))))))

(defmethod text->expressions-info java.io.Reader
(defmethod match-text java.io.Reader
[r]
(let [sw (java.io.StringWriter.)]
(io/copy r sw)
(text->expressions-info (str sw))))
(match-text (str sw))))

(defmethod text->expressions-info java.io.InputStream
(defmethod match-text java.io.InputStream
[is]
(text->expressions-info (io/reader is)))
(match-text (io/reader is)))

(defmethod text->expressions-info :default
(defmethod match-text :default
[src]
(when src
(with-open [r (io/reader src)]
(doall (text->expressions-info r)))))
(doall (match-text r)))))

(defn uri->expressions-info
"Returns an expressions-info map for the given license uri, or nil if no
matches are found."
(defn parse-uri
"Parses the given license `uri`, returning an expressions-info map, or `nil`
if no matching license ids were found."
[uri]
(when-not (s/blank? uri)
(let [result (manual-fixes
(let [suri (lciu/simplify-uri uri)]
(or ; 1. Does the simplified URI match any of the simplified URIs in the SPDX license or exception lists?
(when-let [ids (get @lcis/index-uri-to-id-d suri)]
(into {} (map #(hash-map % (list {:id % :type :concluded :confidence :high :strategy :spdx-listed-uri :source (list uri)})) ids)))

; 2. attempt to retrieve the text/plain contents of the uri and perform license text matching on it
(when-let [license-text (lcihttp/get-text uri)]
(text->expressions-info license-text)))))]
(or
; 1. Is the URI a close match for any of the URIs in the SPDX license or exception lists?
(when-let [ids (lcis/near-match-uri uri)]
(into {} (map #(hash-map % (list {:id % :type :concluded :confidence :high :strategy :spdx-listed-uri :source (list uri)})) ids)))

; 2. attempt to retrieve the text/plain contents of the uri and perform license text matching on it
(when-let [license-text (lcihttp/get-text uri)]
(match-text license-text))))]
; We don't need to sexp/normalise the keys here, as we never detect an expression from a URI
(lciei/prepend-source uri result))))

Expand All @@ -194,64 +194,27 @@
(map #(apply hash-map %) cursed-name))

; 2. Is it an SPDX license or exception id?
(when-let [id (get @lcis/spdx-ids-d (s/lower-case s))]
(when-let [id (lcis/near-match-id s)]
(if (= id s)
(list {id (list {:id id :type :declared :strategy :spdx-listed-identifier-exact-match :source (list s)})})
(list {id (list {:id id :type :concluded :confidence :high :strategy :spdx-listed-identifier-case-insensitive-match :source (list s)})})))

; 3. Is it the name of one or more SPDX licenses or exceptions?
(when-let [ids (get @lcis/index-name-to-id-d (s/lower-case s))]
(when-let [ids (lcis/near-match-name s)]
(map #(hash-map % (list {:id % :type :concluded :confidence :high :strategy :spdx-listed-name :source (list s)})) ids))

; 4. Might it be a URI? (this is to handle some dumb corner cases that exist in pom.xml files hosted on Clojars & Maven Central)
(when-let [ids (uri->expressions-info s)]
(when-let [ids (parse-uri s)]
(map #(hash-map (key %) (val %)) ids))

; 5. Attempt regex name matching
(lcirm/matches s)
; 5. Attempt to parse ids from the name
(lciid/parse-ids s)

; 6. No clue, so return a single info map, but with a made up "UNIDENTIFIED-" value instead of an SPDX license or exception identifier
; 6. No clue, so return a single info map, but with a made up "UNIDENTIFIED-" value (NOT A LICENSEREF!) instead of an SPDX license or exception identifier
(let [id (str "UNIDENTIFIED-" s)]
(list {id (list {:id id :type :concluded :confidence :low :confidence-explanations [:unidentified] :strategy :unidentified :source (list s)})})))]
(map (partial lciei/prepend-source s) ids))))

(defn- filter-blanks
  "Returns a seq of the elements of coll with every blank string removed.
  Non-string elements are always kept. Returns nil when coll is nil or empty,
  or when nothing is left after filtering."
  [coll]
  (letfn [(blank-string? [x]
            (and (string? x) (s/blank? x)))]
    (some->> (seq coll)
             (remove blank-string?)
             seq)))

(defn- map-split-and-interpose
  "Walks coll, replacing each string with the pieces obtained by splitting it
  on the regex re, and placing inter between consecutive pieces (nothing is
  placed between them when inter is nil). Non-string elements pass through
  unchanged. Returns a single flat (lazy) sequence."
  [re inter coll]
  (letfn [(expand [element]
            (cond
              ; Non-strings (e.g. operator keywords from an earlier pass) are kept as-is
              (not (string? element)) (list element)
              ; No interposed value requested: just the raw pieces
              (nil? inter)            (s/split element re)
              :else                   (interpose inter (s/split element re))))]
    (mapcat expand coll)))

(defn split-on-operators
  "Case insensitively splits a string based on license operators (and,
  or, with), but only if they're not also part of a license name (e.g.
  'Common Development and Distribution License', 'GNU General Public
  License version 2.0 or (at your option) any later version', etc.).

  Returns a sequence of trimmed, non-blank strings with the keywords :and,
  :or and :with placed where the corresponding operator was found, or nil if
  s is blank. Combined 'and/or' style separators ('and/or', 'and-or',
  'and\\or') are removed without any keyword being put in their place."
  [s]
  (when-not (s/blank? s)
    ; The hyphen is placed last in the character class so that it is a literal.
    ; Written as [/-\\] it forms a range from '/' to '\', which matches digits
    ; and letters but not '-' itself.
    (->> (s/split (s/trim s) #"(?i)\band[/\\-]+or\b")
         ; 'and' / '&', unless it is part of a license name or boilerplate
         (map-split-and-interpose #"(?i)(\band\b|\&)(?!\s+(distribution|all\s+rights\s+reserved))"
                                  :and)
         ; 'or', unless it introduces an 'or later' style clause or is part of a license name
         (map-split-and-interpose #"(?i)\bor\b(?!\s*(-?(greater|(any\s+)?later|(any\s+)?lator|(any\s+)?newer|lesser|library|\(?at\s+your\s+(option|discretion)\)?|([\"']?(Revised|Modified)[\"']?))))"
                                  :or)
         ; 'with' / 'w/', unless it is the 'with the acknowledgment clause removed' phrase
         (map-split-and-interpose #"(?i)\b(with\b|w/)(?!\s+the\s+acknowledgment\s+clause\s+removed)"
                                  :with)
         (map-split-and-interpose #"(?i)(?<=CDDL)/(?=GPL)"   ; Special case for splitting particularly cursed combos such as CDDL/GPLv2+CE
                                  nil)
         filter-blanks
         (map #(if (string? %) (s/trim %) %)))))

(defn- fix-unidentified
"Fixes a singleton UNIDENTIFIED- expression info map by converting the id to
either a lice-comb unidentified LicenseRef or AdditionRef, depending on prev.
Expand Down Expand Up @@ -341,16 +304,13 @@
(recur (process-expression-element result f) (first r) (rest r))
(manual-fixes (into {} result)))))

(defn name->expressions-info
"Returns an expressions-info map for the given license name."
(defn parse-name
"Parses the given license `n`ame, returning an expressions-info map."
[n]
(when-not (s/blank? n)
(let [n (s/trim n)
partial-result (some->> n
split-on-operators ; Split on operators
(drop-while keyword?) ; Drop (nonsensical) leading operators
(lc3/rdrop-while keyword?) ; Drop (nonsensical) trailing operators
dedupe ; Deduplicate consecutive identical values (mostly applies to duplicate operators, which are redundant)
lcisp/split-on-operators ; Split on operators
(map #(if (keyword? %) % (string->ids-info %))) ; Determine SPDX ids (or UNIDENTIFIED-xxx) with info for all non-operators
flatten ; Flatten back to an unnested sequence (since string->ids-info returns sequences)
fix-unidentifieds ; Convert each unidentified non-operator into either a LicenseRef or AdditionRef, depending on context
Expand All @@ -374,7 +334,7 @@
Note: this method has a substantial performance cost."
[]
(lcis/init!)
(lcirm/init!)
(lciid/init!)
(lcihttp/init!)
@cursed-names-d
nil)
36 changes: 28 additions & 8 deletions src/lice_comb/impl/spdx.clj
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
"SPDX-related functionality. Note: this namespace is not part of the public
API of lice-comb and may change without notice."
(:require [clojure.string :as s]
[embroidery.api :as e]
[spdx.licenses :as sl]
[spdx.exceptions :as se]
[spdx.expressions :as sexp]
Expand Down Expand Up @@ -52,15 +53,28 @@
(def ^:private unidentified-addition-ref-prefix (str lice-comb-addition-ref-prefix "-UNIDENTIFIED"))

; Lower case id map
(def spdx-ids-d (delay (merge (into {} (map #(vec [(s/lower-case %) %]) @license-ids-d))
(into {} (map #(vec [(s/lower-case %) %]) @exception-ids-d)))))
(def ^:private spdx-ids-d (delay (merge (into {} (map #(vec [(s/lower-case %) %]) @license-ids-d))
(into {} (map #(vec [(s/lower-case %) %]) @exception-ids-d)))))

(defn near-match-id
  "Returns the (case-corrected) id for the given license or exception id `id`,
  or `nil` if `id` is `nil` or no matching id was found."
  [id]
  ; s/lower-case throws a NullPointerException on nil, so guard against it and
  ; honour the documented 'nil when not found' contract instead.
  (when id
    (get @spdx-ids-d (s/lower-case id))))

(defn- name-to-id-tuple
  "Builds a `[normalised-name id]` pair from a license or exception list entry,
  where the normalised name is the entry's :name, trimmed and lower-cased."
  [list-entry]
  (let [normalised-name (-> list-entry :name s/trim s/lower-case)]
    [normalised-name (:id list-entry)]))

(def index-name-to-id-d (delay (merge (lciu/mapfonv #(lciu/nset (map second %)) (group-by first (map name-to-id-tuple @license-list-d)))
(lciu/mapfonv #(lciu/nset (map second %)) (group-by first (map name-to-id-tuple @exception-list-d))))))
(def ^:private index-name-to-id-d (delay (merge (lciu/mapfonv #(lciu/nset (map second %)) (group-by first (map name-to-id-tuple @license-list-d)))
(lciu/mapfonv #(lciu/nset (map second %)) (group-by first (map name-to-id-tuple @exception-list-d))))))

;####TODO: REPLACE THIS WITH REGEX BASED NEAR-MATCHING (to account for whitespace variance and #"licen[cs]e", for example)
(defn near-match-name
  "Returns the id(s) for the given license or exception name `n`, or `nil` if
  `n` is `nil` or no ids were found. The lookup ignores case and leading /
  trailing whitespace."
  [n]
  ; Index keys are built by trimming and then lower-casing each listed name, so
  ; the lookup key is normalised the same way. The nil guard avoids the
  ; NullPointerException that s/trim and s/lower-case throw on nil.
  (when n
    (get @index-name-to-id-d (s/lower-case (s/trim n)))))

(defn- urls-to-id-tuples
"Extracts all urls for a given list (license or exception) entry."
Expand All @@ -69,8 +83,14 @@
simplified-uris (map lciu/simplify-uri (filter (complement s/blank?) (concat (:see-also list-entry) (get-in list-entry [:cross-refs :url]))))]
(map #(vec [% id]) simplified-uris)))

(def index-uri-to-id-d (delay (merge (lciu/mapfonv #(lciu/nset (map second %)) (group-by first (mapcat urls-to-id-tuples @license-list-d)))
(lciu/mapfonv #(lciu/nset (map second %)) (group-by first (mapcat urls-to-id-tuples @exception-list-d))))))
(def ^:private index-uri-to-id-d (delay (merge (lciu/mapfonv #(lciu/nset (map second %)) (group-by first (mapcat urls-to-id-tuples @license-list-d)))
(lciu/mapfonv #(lciu/nset (map second %)) (group-by first (mapcat urls-to-id-tuples @exception-list-d))))))

(defn near-match-uri
  "Looks up the id(s) of the license(s) or exception(s) associated with `uri`.
  The uri is first passed through `lciu/simplify-uri`, and the result is used
  as the key into the uri index. Returns `nil` if there is no entry."
  [uri]
  (let [uri-index      @index-uri-to-id-d
        simplified-uri (lciu/simplify-uri uri)]
    (get uri-index simplified-uri)))

(defn lice-comb-license-ref?
"Is the given id one of lice-comb's custom LicenseRefs?"
Expand Down Expand Up @@ -209,8 +229,8 @@
Note: this method has a substantial performance cost."
[]
; Parallelise initialisation of the spdx.licenses and spdx.exceptions namespaces, as they're both sloooooooow (~1.5 mins total)
(let [sl-init (future (sl/init!))
se-init (future (se/init!))]
(let [sl-init (e/future* (sl/init!))
se-init (e/future* (se/init!))]
@sl-init
@se-init)
(sexp/init!)
Expand Down
Loading

0 comments on commit b85c65d

Please sign in to comment.