Skip to content

Commit

Permalink
fix: check artifacts in DB to reduce number of sha1 queries (#50)
Browse files Browse the repository at this point in the history
  • Loading branch information
DmitriyLewen authored Jan 14, 2025
1 parent 6714acb commit 793673c
Show file tree
Hide file tree
Showing 11 changed files with 250 additions and 51 deletions.
17 changes: 12 additions & 5 deletions .github/workflows/cron.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,18 @@ jobs:
go-version-file: go.mod
id: go

- name: Install oras
run: |
curl -LO https://github.com/oras-project/oras/releases/download/v1.2.0/oras_1.2.0_linux_amd64.tar.gz
tar -xvf ./oras_1.2.0_linux_amd64.tar.gz
- name: Pull trivy-java-db
run: |
mkdir -p ./cache/db
lowercase_repo=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]')
./oras pull "ghcr.io/${lowercase_repo}:${DB_VERSION}"
tar -xvf javadb.tar.gz -C ./cache/db
- name: Build the binary
run: make build

Expand Down Expand Up @@ -59,11 +71,6 @@ jobs:
username: ${{ secrets.ECR_ACCESS_KEY_ID }}
password: ${{ secrets.ECR_SECRET_ACCESS_KEY }}

- name: Install oras
run: |
curl -LO https://github.com/oras-project/oras/releases/download/v1.2.0/oras_1.2.0_linux_amd64.tar.gz
tar -xvf ./oras_1.2.0_linux_amd64.tar.gz
- name: Upload assets to registries
run: |
lowercase_repo=$(echo "${{ github.repository }}" | tr '[:upper:]' '[:lower:]')
Expand Down
17 changes: 10 additions & 7 deletions cmd/trivy-java-db/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -66,29 +66,32 @@ func init() {
}

// crawl builds a Crawler from the limit and cacheDir settings defined outside
// this function and runs it with the given context. Errors from creating the
// crawler or from the crawl itself are wrapped and returned; nil is returned
// on success.
func crawl(ctx context.Context) error {
c := crawler.NewCrawler(crawler.Option{
c, err := crawler.NewCrawler(crawler.Option{
Limit: int64(limit),
CacheDir: cacheDir,
})
if err != nil {
return xerrors.Errorf("unable to create new Crawler: %w", err)
}
if err := c.Crawl(ctx); err != nil {
return xerrors.Errorf("crawl error: %w", err)
}
return nil
}

func build() error {
if err := db.Reset(cacheDir); err != nil {
return xerrors.Errorf("db reset error: %w", err)
}
dbDir := filepath.Join(cacheDir, "db")
dbDir := db.Dir(cacheDir)
slog.Info("Database", slog.String("path", dbDir))
dbc, err := db.New(dbDir)
if err != nil {
return xerrors.Errorf("db create error: %w", err)
}
if err = dbc.Init(); err != nil {
return xerrors.Errorf("db init error: %w", err)
if !db.Exists(dbDir) {
if err = dbc.Init(); err != nil {
return xerrors.Errorf("db init error: %w", err)
}
}

meta := db.NewMetadata(dbDir)
b := builder.NewBuilder(dbc, meta)
if err = b.Build(cacheDir); err != nil {
Expand Down
78 changes: 56 additions & 22 deletions pkg/crawler/crawler.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"golang.org/x/sync/semaphore"
"golang.org/x/xerrors"

"github.com/aquasecurity/trivy-java-db/pkg/db"
"github.com/aquasecurity/trivy-java-db/pkg/fileutil"
"github.com/aquasecurity/trivy-java-db/pkg/types"
)
Expand All @@ -30,6 +31,7 @@ const mavenRepoURL = "https://repo.maven.apache.org/maven2/"
type Crawler struct {
dir string
http *retryablehttp.Client
dbc *db.DB

rootUrl string
wg sync.WaitGroup
Expand All @@ -44,7 +46,7 @@ type Option struct {
CacheDir string
}

func NewCrawler(opt Option) Crawler {
func NewCrawler(opt Option) (Crawler, error) {
client := retryablehttp.NewClient()
client.RetryMax = 10
client.Logger = slog.Default()
Expand Down Expand Up @@ -77,14 +79,26 @@ func NewCrawler(opt Option) Crawler {
indexDir := filepath.Join(opt.CacheDir, "indexes")
slog.Info("Index dir", slog.String("path", indexDir))

var dbc db.DB
dbDir := db.Dir(opt.CacheDir)
if db.Exists(dbDir) {
var err error
dbc, err = db.New(dbDir)
if err != nil {
return Crawler{}, xerrors.Errorf("unable to open DB: %w", err)
}
slog.Info("DB is used for crawler", slog.String("path", opt.CacheDir))
}

return Crawler{
dir: indexDir,
http: client,
dbc: &dbc,

rootUrl: opt.RootUrl,
urlCh: make(chan string, opt.Limit*10),
limit: semaphore.NewWeighted(opt.Limit),
}
}, nil
}

func (c *Crawler) Crawl(ctx context.Context) error {
Expand Down Expand Up @@ -222,7 +236,12 @@ func (c *Crawler) Visit(ctx context.Context, url string) error {
}

func (c *Crawler) crawlSHA1(ctx context.Context, baseURL string, meta *Metadata, dirs []string) error {
var foundVersions []Version
var foundVersions []types.Version
// Get versions from the DB (if it exists) to reduce the number of requests to the server
savedVersion, err := c.versionsFromDB(meta.ArtifactID, meta.GroupID)
if err != nil {
return xerrors.Errorf("unable to get list of versions from DB: %w", err)
}
// Check each version dir to find links to `*.jar.sha1` files.
for _, dir := range dirs {
dirURL := baseURL + dir
Expand All @@ -234,39 +253,47 @@ func (c *Crawler) crawlSHA1(ctx context.Context, baseURL string, meta *Metadata,
// Remove the `/` suffix to correctly compare file versions with version from directory name.
dirVersion := strings.TrimSuffix(dir, "/")
var dirVersionSha1 []byte
var versions []Version
var versions []types.Version

for _, sha1Url := range sha1Urls {
sha1, err := c.fetchSHA1(ctx, sha1Url)
if err != nil {
return xerrors.Errorf("unable to fetch sha1: %s", err)
}
if ver := versionFromSha1URL(meta.ArtifactID, sha1Url); ver != "" && len(sha1) != 0 {
// Save sha1 for the file where the version is equal to the version from the directory name in order to remove duplicates later
// Avoid overwriting dirVersion when inserting versions into the database (sha1 is uniq blob)
// e.g. `cudf-0.14-cuda10-1.jar.sha1` should not overwrite `cudf-0.14.jar.sha1`
// https://repo.maven.apache.org/maven2/ai/rapids/cudf/0.14/
if ver == dirVersion {
dirVersionSha1 = sha1
} else {
versions = append(versions, Version{
Version: ver,
SHA1: sha1,
})
ver := versionFromSha1URL(meta.ArtifactID, sha1Url)
sha1, ok := savedVersion[ver]
if !ok {
sha1, err = c.fetchSHA1(ctx, sha1Url)
if err != nil {
return xerrors.Errorf("unable to fetch sha1: %s", err)
}
}
// Save sha1 for the file where the version is equal to the version from the directory name in order to remove duplicates later
// Avoid overwriting dirVersion when inserting versions into the database (sha1 is a unique blob)
// e.g. `cudf-0.14-cuda10-1.jar.sha1` should not overwrite `cudf-0.14.jar.sha1`
// https://repo.maven.apache.org/maven2/ai/rapids/cudf/0.14/
if ver == dirVersion {
dirVersionSha1 = sha1
} else {
versions = append(versions, types.Version{
Version: ver,
SHA1: sha1,
})
}
}
// Remove duplicates of dirVersionSha1
versions = lo.Filter(versions, func(v Version, _ int) bool {
versions = lo.Filter(versions, func(v types.Version, _ int) bool {
return !bytes.Equal(v.SHA1, dirVersionSha1)
})

if dirVersionSha1 != nil {
versions = append(versions, Version{
versions = append(versions, types.Version{
Version: dirVersion,
SHA1: dirVersionSha1,
})
}

versions = lo.Filter(versions, func(v types.Version, _ int) bool {
_, ok := savedVersion[v.Version]
return !ok
})

foundVersions = append(foundVersions, versions...)
}

Expand Down Expand Up @@ -410,6 +437,13 @@ func (c *Crawler) httpGet(ctx context.Context, url string) (*http.Response, erro
return resp, nil
}

// versionsFromDB returns the versions already stored in the DB for the given
// artifactID/groupID pair by delegating to SelectVersionsByArtifactIDAndGroupID.
// crawlSHA1 indexes the result by version string and reads SHA1 bytes from it.
// When the crawler holds no DB handle, it returns a nil map and a nil error.
//
// NOTE(review): NewCrawler, as shown in this file, stores &dbc unconditionally,
// so dbc looks like it is never nil and the fallback below may be unreachable —
// confirm that a zero-value DB is safe to query.
func (c *Crawler) versionsFromDB(artifactID, groupID string) (map[string][]byte, error) {
	if c.dbc != nil {
		return c.dbc.SelectVersionsByArtifactIDAndGroupID(artifactID, groupID)
	}
	// No DB handle: nothing is known yet, so every sha1 must be fetched.
	return nil, nil
}

func randomSleep() {
// Seed rand
r := rand.New(rand.NewSource(int64(time.Now().Nanosecond())))
Expand Down
65 changes: 62 additions & 3 deletions pkg/crawler/crawler_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,30 @@ package crawler_test

import (
"context"
"encoding/hex"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"strings"
"testing"

"github.com/aquasecurity/trivy-java-db/pkg/dbtest"
"github.com/aquasecurity/trivy-java-db/pkg/types"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"

"github.com/aquasecurity/trivy-java-db/pkg/crawler"

_ "modernc.org/sqlite"
)

func TestCrawl(t *testing.T) {
tests := []struct {
name string
limit int64
fileNames map[string]string
withDb bool
goldenPath string
filePath string
wantErr string
Expand All @@ -42,6 +50,27 @@ func TestCrawl(t *testing.T) {
goldenPath: "testdata/happy/abbot.json.golden",
filePath: "indexes/abbot/abbot.json",
},
{
name: "happy path with DB",
withDb: true,
limit: 1,
fileNames: map[string]string{
"/maven2/": "testdata/happy/index.html",
"/maven2/abbot/": "testdata/happy/abbot.html",
"/maven2/abbot/abbot/": "testdata/happy/abbot_abbot.html",
"/maven2/abbot/abbot/maven-metadata.xml": "testdata/happy/maven-metadata.xml",
"/maven2/abbot/abbot/0.12.3/": "testdata/happy/abbot_abbot_0.12.3.html",
"/maven2/abbot/abbot/0.12.3/abbot-0.12.3.jar.sha1": "testdata/happy/abbot-0.12.3.jar.sha1",
"/maven2/abbot/abbot/0.13.0/": "testdata/happy/abbot_abbot_0.13.0.html",
"/maven2/abbot/abbot/0.13.0/abbot-0.13.0.jar.sha1": "testdata/happy/abbot-0.13.0.jar.sha1",
"/maven2/abbot/abbot/0.13.0/abbot-0.13.0-copy.jar.sha1": "testdata/happy/abbot-0.13.0-copy.jar.sha1",
"/maven2/abbot/abbot/1.4.0/": "testdata/happy/abbot_abbot_1.4.0.html",
"/maven2/abbot/abbot/1.4.0/abbot-1.4.0.jar.sha1": "testdata/happy/abbot-1.4.0.jar.sha1",
"/maven2/abbot/abbot/1.4.0/abbot-1.4.0-lite.jar.sha1": "testdata/happy/abbot-1.4.0-lite.jar.sha1",
},
goldenPath: "testdata/happy/abbot-with-db.json.golden",
filePath: "indexes/abbot/abbot.json",
},
{
name: "sad path",
limit: 2,
Expand Down Expand Up @@ -76,13 +105,24 @@ func TestCrawl(t *testing.T) {
defer ts.Close()

tmpDir := t.TempDir()
cl := crawler.NewCrawler(crawler.Option{
if tt.withDb {
dbc, err := dbtest.InitDB(t, []types.Index{
indexAbbot123,
indexAbbot130,
})
require.NoError(t, err)

tmpDir = filepath.Join(strings.TrimSuffix(dbc.Dir(), "db"))
}

cl, err := crawler.NewCrawler(crawler.Option{
RootUrl: ts.URL + "/maven2/",
Limit: tt.limit,
CacheDir: tmpDir,
})
require.NoError(t, err)

err := cl.Crawl(context.Background())
err = cl.Crawl(context.Background())
if tt.wantErr != "" {
assert.ErrorContains(t, err, tt.wantErr)
return
Expand All @@ -97,5 +137,24 @@ func TestCrawl(t *testing.T) {
assert.JSONEq(t, string(want), string(got))
})
}

}

// mustDecodeHex decodes a hex-encoded fixture digest. It panics on malformed
// input so that a typo in a test digest fails loudly instead of silently
// yielding a nil SHA1.
func mustDecodeHex(s string) []byte {
	b, err := hex.DecodeString(s)
	if err != nil {
		panic(err)
	}
	return b
}

// Index fixtures passed to dbtest.InitDB by the "with DB" test case to
// pre-populate the database before crawling.
var (
	abbot123Sha1b = mustDecodeHex("51d28a27d919ce8690a40f4f335b9d591ceb16e9")
	indexAbbot123 = types.Index{
		GroupID:     "abbot",
		ArtifactID:  "abbot",
		Version:     "0.12.3",
		SHA1:        abbot123Sha1b,
		ArchiveType: types.JarType,
	}

	abbot130Sha1b = mustDecodeHex("596d91e67631b0deb05fb685d8d1b6735f3e4f60")
	indexAbbot130 = types.Index{
		GroupID:     "abbot",
		ArtifactID:  "abbot",
		Version:     "0.13.0",
		SHA1:        abbot130Sha1b,
		ArchiveType: types.JarType,
	}
)
15 changes: 15 additions & 0 deletions pkg/crawler/testdata/happy/abbot-with-db.json.golden
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
{
"GroupID": "abbot",
"ArtifactID": "abbot",
"Versions": [
{
"Version": "1.4.0-lite",
"SHA1": "BUerA3Bor6ICaSW9lL+5/Pzsl2E="
},
{
"Version": "1.4.0",
"SHA1": "ojY2RqndBZVWM7RQAQtZohr4pCM="
}
],
"ArchiveType": "jar"
}
6 changes: 3 additions & 3 deletions pkg/crawler/testdata/happy/abbot_abbot_0.13.0.html
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ <h1>abbot/abbot/0.13.0</h1>
<hr>
<main>
<pre id="contents"><a href="https://repo.maven.apache.org/maven2/abbot/abbot/">../</a>
<a href="abbot-0.13.0-copy.jar" title="abbot-0.13.0.jar">abbot-0.13.0.jar</a> 2005-09-20 05:44 779426
<a href="abbot-0.13.0-copy.jar.md5" title="abbot-0.13.0.jar.md5">abbot-0.13.0.jar.md5</a> 2005-09-20 05:44 32
<a href="abbot-0.13.0-copy.jar.sha1" title="abbot-0.13.0.jar.sha1">abbot-0.13.0.jar.sha1</a> 2005-09-20 05:44 40
<a href="abbot-0.13.0-copy.jar" title="abbot-0.13.0-copy.jar">abbot-0.13.0-copy.jar</a> 2005-09-20 05:44 779426
<a href="abbot-0.13.0-copy.jar.md5" title="abbot-0.13.0-copy.jar.md5">abbot-0.13.0-copy.jar.md5</a> 2005-09-20 05:44 32
<a href="abbot-0.13.0-copy.jar.sha1" title="abbot-0.13.0-copy.jar.sha1">abbot-0.13.0-copy.jar.sha1</a> 2005-09-20 05:44 40
<a href="abbot-0.13.0.jar" title="abbot-0.13.0.jar">abbot-0.13.0.jar</a> 2005-09-20 05:44 779426
<a href="abbot-0.13.0.jar.md5" title="abbot-0.13.0.jar.md5">abbot-0.13.0.jar.md5</a> 2005-09-20 05:44 32
<a href="abbot-0.13.0.jar.sha1" title="abbot-0.13.0.jar.sha1">abbot-0.13.0.jar.sha1</a> 2005-09-20 05:44 40
Expand Down
6 changes: 1 addition & 5 deletions pkg/crawler/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,6 @@ type Versioning struct {
// Index describes one artifact, identified by GroupID and ArtifactID, together
// with the versions recorded for it and its archive type.
type Index struct {
GroupID string
ArtifactID string
Versions []Version
Versions []types.Version
ArchiveType types.ArchiveType
}
// Version pairs a version string with its SHA1 bytes.
type Version struct {
Version string
SHA1 []byte
}
Loading

0 comments on commit 793673c

Please sign in to comment.