Skip to content

Commit

Permalink
Merge pull request #23 from pavelmemory/access-additional-license-data
Browse files Browse the repository at this point in the history
Expose additional license data
  • Loading branch information
bzz authored Oct 6, 2022
2 parents c6bf6f2 + a7c9e28 commit a3a1cc6
Show file tree
Hide file tree
Showing 13 changed files with 137 additions and 33 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ jobs:
- name: Run GolangCI-Lint
uses: golangci/golangci-lint-action@v2
with:
version: v1.29
version: v1.47.0

- name: Test
run: go test ./...
2 changes: 1 addition & 1 deletion .github/workflows/race.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,4 @@ jobs:
- name: Checkout code
uses: actions/checkout@v2
- name: Test race
run: go test -v -race ./...
run: go test -v -race -timeout 60m ./...
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ jobs:
strategy:
fail-fast: false
matrix:
goos: [linux, darwin, windows]
goos: [linux, darwin]
runs-on: ubuntu-latest
steps:
- name: Checkout code
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ jobs:
fail-fast: false
matrix:
go-version: [1.16.x, 1.17.x]
platform: [ubuntu-latest, macos-latest, windows-latest]
platform: [ubuntu-latest, macos-latest]
runs-on: ${{ matrix.platform }}
steps:
- name: Install Go
Expand Down
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
GOPATH ?= $(shell go env GOPATH)
SPDX_DATA_VERSION ?= 3.8
SPDX_DATA_VERSION ?= 3.17

licensedb/internal/assets/bindata.go: licenses.tar urls.csv names.csv $(GOPATH)/bin/go-bindata
rm -rf license-list-data-$(SPDX_DATA_VERSION)
Expand Down
8 changes: 4 additions & 4 deletions cmd/license-detector/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,16 +23,16 @@ func TestCmdMain(t *testing.T) {
assert.Equal(t, "", r[0].ErrStr)
assert.Equal(t, "no license file was found", r[1].ErrStr)
assert.Equal(t, "Apache-2.0", r[0].Matches[0].License)
assert.InDelta(t, 0.9877, r[0].Matches[0].Confidence, 0.001)
assert.InDelta(t, 0.9877, r[0].Matches[0].Confidence, 0.002)
assert.Equal(t, "ECL-2.0", r[0].Matches[1].License)
assert.InDelta(t, 0.9047, r[0].Matches[1].Confidence, 0.001)
assert.InDelta(t, 0.9047, r[0].Matches[1].Confidence, 0.002)
buffer.Reset()
detect([]string{"../..", "."}, "text", buffer)
assert.Equal(t, `../..
99% Apache-2.0
90% ECL-2.0
85% SHL-0.51
85% SHL-0.5
81% SHL-0.51
81% SHL-0.5
.
no license file was found
`, buffer.String())
Expand Down
16 changes: 8 additions & 8 deletions licensedb/internal/assets/bindata.go

Large diffs are not rendered by default.

8 changes: 5 additions & 3 deletions licensedb/internal/assets/extract_urls.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,13 +32,15 @@ func main() {
for _, url := range seeAlso.([]interface{}) {
id := data["licenseId"].(string)
strUrl := strings.TrimSpace(url.(string))
strUrl = strUrl[strings.Index(strUrl, "://"):] // ignore http/https
cutIndex := strings.Index(strUrl, "://")
schema := strUrl[:cutIndex]
strUrl = strUrl[cutIndex:] // ignore http/https
if strings.HasSuffix(strUrl, "/legalcode") && strings.HasPrefix(id, "CC") {
strUrl = strUrl[:len(strUrl)-10]
}
writer.Write([]string{id, strUrl})
writer.Write([]string{id, strUrl, schema})
}
}
}
writer.Write([]string{"MIT", ".mit-license.org"})
writer.Write([]string{"MIT", ".mit-license.org", "https"})
}
46 changes: 41 additions & 5 deletions licensedb/internal/db.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import (
"archive/tar"
"bytes"
"encoding/csv"
"errors"
"fmt"
"index/suffixarray"
"io"
Expand All @@ -24,6 +25,10 @@ import (
"github.com/go-enry/go-license-detector/v4/licensedb/internal/wmh"
)

// ErrUnknownLicenseID is returned when a license identifier is not known.
// An unknown identifier may mean that the SPDX data needs to be upgraded.
var ErrUnknownLicenseID = errors.New("license id is not known")

var (
licenseReadmeMentionRe = regexp.MustCompile(
fmt.Sprintf("(?i)[^\\s]+/[^/\\s]*(%s)[^\\s]*",
Expand All @@ -39,8 +44,12 @@ type database struct {
licenseTexts map[string]string
// minimum license text length
minLicenseLength int
// official license URLs
urls map[string]string
// official license URL -> id
idByURL map[string]string
// id -> license URLs
urlsByID map[string][]string
// id -> license name
nameByID map[string]string
// all URLs joined
urlRe *regexp.Regexp
// first line of each license OR-ed - used to split
Expand Down Expand Up @@ -93,10 +102,12 @@ func loadUrls(db *database) {
if err != nil || len(records) == 0 {
log.Fatalf("failed to parse urls.csv from the assets: %v", err)
}
db.urls = map[string]string{}
db.idByURL = map[string]string{}
db.urlsByID = map[string][]string{}
urlReWriter := &bytes.Buffer{}
for i, record := range records {
db.urls[record[1]] = record[0]
db.idByURL[record[1]] = record[0]
db.urlsByID[record[0]] = append(db.urlsByID[record[0]], record[2]+record[1]) // schema+url
urlReWriter.Write([]byte(regexp.QuoteMeta(record[1])))
if i < len(records)-1 {
urlReWriter.WriteRune('|')
Expand All @@ -115,9 +126,11 @@ func loadNames(db *database) {
if err != nil || len(records) == 0 {
log.Fatalf("failed to parse names.csv from the assets: %v", err)
}
db.nameByID = map[string]string{}
db.nameSubstringSizes = map[string]int{}
db.nameSubstrings = map[string][]substring{}
for _, record := range records {
db.nameByID[record[0]] = record[1]
registerNameSubstrings(record[1], record[0], db.nameSubstringSizes, db.nameSubstrings)
}
}
Expand Down Expand Up @@ -270,6 +283,9 @@ func (db *database) queryLicenseAbstract(text string) map[string]float32 {
for i, titlePos := range titlePositions {
begPos := titlePos[0]
match := normalizedModerate[titlePos[0]:titlePos[1]]
if len(match) == 0 {
continue
}
if match[0] == '\n' {
match = match[1:]
}
Expand Down Expand Up @@ -414,7 +430,7 @@ func (db *database) scanForURLs(text string) map[string]bool {
licenses := map[string]bool{}
for _, match := range urlMatches {
url := byteText[match[0]:match[1]]
licenses[db.urls[string(url)]] = true
licenses[db.idByURL[string(url)]] = true
}
return licenses
}
Expand Down Expand Up @@ -452,6 +468,26 @@ func (db *database) QueryReadmeText(text string, fs filer.Filer) map[string]floa
return candidates
}

// URLs returns the list of the URLs for the given license identifier.
//
// The returned slice is a fresh copy, so callers may modify it without
// affecting the database. ErrUnknownLicenseID is returned when the identifier
// has no entry in the id -> URLs index.
func (db *database) URLs(id string) ([]string, error) {
	urls, found := db.urlsByID[id]
	if !found {
		return nil, ErrUnknownLicenseID
	}
	// Hand out the copy, not the stored slice: returning urls directly would
	// let callers mutate the shared urlsByID entry.
	res := make([]string, len(urls))
	copy(res, urls)
	return res, nil
}

// Name looks up the SPDX name registered for the license identifier.
// It returns ErrUnknownLicenseID when the identifier has no entry.
func (db *database) Name(id string) (string, error) {
	if licenseName, ok := db.nameByID[id]; ok {
		return licenseName, nil
	}
	return "", ErrUnknownLicenseID
}

func tfidf(freq int, docfreq int, ndocs int) float32 {
weight := fastlog.Log(1+float32(freq)) * fastlog.Log(float32(ndocs)/float32(docfreq))
if weight < 0 {
Expand Down
10 changes: 10 additions & 0 deletions licensedb/internal/investigation.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,3 +166,13 @@ func IsLicenseDirectory(fileName string) bool {
func Preload() {
// The call is made only for its effect; the returned value is intentionally discarded.
_ = globalLicenseDatabase()
}

// LookupURLs queries the global license database for the URLs that belong
// to the given license identifier.
func LookupURLs(id string) ([]string, error) {
	db := globalLicenseDatabase()
	return db.URLs(id)
}

// LookupName queries the global license database for the SPDX name that
// belongs to the given license identifier.
func LookupName(id string) (string, error) {
	db := globalLicenseDatabase()
	return db.Name(id)
}
16 changes: 8 additions & 8 deletions licensedb/internal/wmh/wmh.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,10 +72,10 @@ func (wmh *WeightedMinHasher) MarshalBinary() (data []byte, err error) {
binary.LittleEndian.PutUint32(data[5:9], uint32(wmh.sampleSize))
offset := 9
writeFloat32Slice := func(arr []float32) {
header := *(*reflect.SliceHeader)(unsafe.Pointer(&arr))
header := (*reflect.SliceHeader)(unsafe.Pointer(&arr))
header.Len *= 4
header.Cap *= 4
buffer := *(*[]byte)(unsafe.Pointer(&header))
buffer := *(*[]byte)(unsafe.Pointer(header))
copy(data[offset:], buffer)
offset += len(buffer)
}
Expand All @@ -86,10 +86,10 @@ func (wmh *WeightedMinHasher) MarshalBinary() (data []byte, err error) {
writeFloat32Slice(arr)
}
for _, arr := range wmh.betas {
header := *(*reflect.SliceHeader)(unsafe.Pointer(&arr))
header := (*reflect.SliceHeader)(unsafe.Pointer(&arr))
header.Len *= 2
header.Cap *= 2
buffer := *(*[]byte)(unsafe.Pointer(&header))
buffer := *(*[]byte)(unsafe.Pointer(header))
copy(data[offset:], buffer)
offset += len(buffer)
}
Expand All @@ -111,10 +111,10 @@ func (wmh *WeightedMinHasher) UnmarshalBinary(data []byte) error {
wmh.lnCs = make([][]float32, wmh.sampleSize)
wmh.betas = make([][]uint16, wmh.sampleSize)
readFloat32Slice := func(dest []float32, src []byte) {
header := *(*reflect.SliceHeader)(unsafe.Pointer(&src))
header := (*reflect.SliceHeader)(unsafe.Pointer(&src))
header.Len /= 4
header.Cap /= 4
buffer := *(*[]float32)(unsafe.Pointer(&header))
buffer := *(*[]float32)(unsafe.Pointer(header))
copy(dest, buffer)
}
offset := 9
Expand All @@ -134,10 +134,10 @@ func (wmh *WeightedMinHasher) UnmarshalBinary(data []byte) error {
wmh.betas[i] = make([]uint16, wmh.dim)
nextOffset := offset + wmh.dim*2
slice := data[offset:nextOffset]
header := *(*reflect.SliceHeader)(unsafe.Pointer(&slice))
header := (*reflect.SliceHeader)(unsafe.Pointer(&slice))
header.Len /= 2
header.Cap /= 2
buffer := *(*[]uint16)(unsafe.Pointer(&header))
buffer := *(*[]uint16)(unsafe.Pointer(header))
copy(wmh.betas[i], buffer)
offset = nextOffset
}
Expand Down
27 changes: 27 additions & 0 deletions licensedb/licensedb.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ import (
var (
// ErrNoLicenseFound is returned when no license files were found.
ErrNoLicenseFound = errors.New("no license file was found")
// ErrUnknownLicenseID is returned when a license identifier is not known.
// An unknown identifier may mean that the SPDX data needs to be upgraded.
ErrUnknownLicenseID = errors.New("license id is not known")
)

// Detect returns the most probable reference licenses matched for the given
Expand Down Expand Up @@ -63,3 +66,27 @@ func Detect(fs filer.Filer) (map[string]api.Match, error) {
func Preload() {
// Delegate to the internal package's Preload.
internal.Preload()
}

// LicenseURLs returns the list of the URLs for the given license identifier.
//
// The internal "unknown id" sentinel is translated into this package's
// ErrUnknownLicenseID; any other error is passed through unchanged.
func LicenseURLs(id string) ([]string, error) {
	urls, err := internal.LookupURLs(id)
	switch {
	case err == nil:
		return urls, nil
	case errors.Is(err, internal.ErrUnknownLicenseID):
		return nil, ErrUnknownLicenseID
	default:
		return nil, err
	}
}

// LicenseName returns the name for the given license identifier.
//
// The internal "unknown id" sentinel is translated into this package's
// ErrUnknownLicenseID; any other error is passed through unchanged.
func LicenseName(id string) (string, error) {
	name, err := internal.LookupName(id)
	switch {
	case err == nil:
		return name, nil
	case errors.Is(err, internal.ErrUnknownLicenseID):
		return "", ErrUnknownLicenseID
	default:
		return "", err
	}
}
29 changes: 29 additions & 0 deletions licensedb/licensedb_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,9 @@ import (
"path/filepath"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/go-enry/go-license-detector/v4/licensedb/filer"
)

Expand Down Expand Up @@ -43,3 +46,29 @@ func pwdFiler() filer.Filer {
}
return f
}

// TestLicenseURLs checks LicenseURLs for a known identifier and for an
// identifier that is not in the database.
func TestLicenseURLs(t *testing.T) {
	t.Run("existing license", func(t *testing.T) {
		want := []string{
			"http://www.opendatacommons.org/licenses/odbl/1.0/",
			"https://opendatacommons.org/licenses/odbl/1-0/",
		}
		got, err := LicenseURLs("ODbL-1.0")
		require.NoError(t, err)
		assert.Equal(t, want, got)
	})

	t.Run("not existing license", func(t *testing.T) {
		_, gotErr := LicenseURLs("bad-license-key")
		require.Equal(t, ErrUnknownLicenseID, gotErr)
	})
}

// TestLicenseName checks LicenseName for a known identifier and for an
// identifier that is not in the database.
func TestLicenseName(t *testing.T) {
	t.Run("existing license", func(t *testing.T) {
		const want = "Open Data Commons Open Database License v1.0"
		got, err := LicenseName("ODbL-1.0")
		require.NoError(t, err)
		assert.Equal(t, want, got)
	})

	t.Run("not existing license", func(t *testing.T) {
		_, gotErr := LicenseName("bad-license-key")
		require.Equal(t, ErrUnknownLicenseID, gotErr)
	})
}

0 comments on commit a3a1cc6

Please sign in to comment.