From c86670658dfa4d604787663159a4c69ef32073ea Mon Sep 17 00:00:00 2001 From: Andrzej Lichnerowicz Date: Sat, 29 Apr 2023 16:26:46 +0200 Subject: [PATCH 1/2] Refactor existing caching mechanism into pluggable system Create an interface `Cache`, similar to `storage.Storage` that will allow for pluggable caching extensions. Caching subsystem is created with full access to Request and Response objects, to potentially make more complex caching decisions based on both request and response headers. Because of the need to access those objects, `Cache` is not created in its own package like `storage`, but in root `colly`. Closes #103 --- .gitignore | 2 + VERSION | 2 +- cache.go | 142 ++++++++++++++++++++++++++++++++++++++++++++++++ colly.go | 48 +++++++++++++--- colly_test.go | 5 +- http_backend.go | 48 ++++------------ 6 files changed, 200 insertions(+), 47 deletions(-) create mode 100644 .gitignore create mode 100644 cache.go diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..66f8fb502 --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +.idea/ +.vscode/ diff --git a/VERSION b/VERSION index 7ec1d6db4..3e3c2f1e5 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -2.1.0 +2.1.1 diff --git a/cache.go b/cache.go new file mode 100644 index 000000000..a6260a473 --- /dev/null +++ b/cache.go @@ -0,0 +1,142 @@ +// Copyright 2023 Adam Tauber, Andrzej Lichnerowicz +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package colly + +import ( + "crypto/sha1" + "encoding/gob" + "encoding/hex" + "errors" + "net/http" + "os" + "path" +) + +// Cache is an interface which handles caching Collector's responses +// The default Cache of the Collector is the NullCache. +// FileSystemCache keeps compatibility with non-pluggable caching in legacy +// Collector. For this reason, one can set cache folder via `CACHE_DIR` +// environment variable, or by passing CacheDir to NewCollector. +// Collector's caching backend can be changed too by calling new method +// Collector.SetCache +type Cache interface { + // Init initializes the caching backend + Init() error + // Get retrieves a previously cached response for the given request + Get(request *http.Request) (*Response, error) + // Put stores a response given request in cache + Put(request *http.Request, response *Response) error + // Close finalizes caching backend + Close() error +} + +const ( + // DefaultCacheFolderPermissions is set to rwx(user), rx(group), nothing for others + DefaultCacheFolderPermissions = 0750 +) + +var ( + ErrCacheFolderNotConfigured = errors.New("Cache's base folder cannot be empty") + ErrCacheNotConfigured = errors.New("Caching backend is not configured") + ErrRequestNoCache = errors.New("Request cannot be cached") + ErrCachedNotFound = errors.New("Cached response not found") +) + +type NullCache struct { +} + +func (c *NullCache) Init() error { + return nil +} + +// Get always retrieves an error to force re-download +func (c *NullCache) Get(request *http.Request) (*Response, error) { + return nil, ErrCachedNotFound +} + +func (c *NullCache) Put(request *http.Request, response *Response) error { + return nil +} + +func (c *NullCache) Close() error { + return nil +} + +// FileSystemCache is the default cache backend of colly. +// FileSystemCache keeps responses persisted on the disk. 
+type FileSystemCache struct { + BaseDir string +} + +// Init ensures that specified base folder exists +func (c *FileSystemCache) Init() error { + if c.BaseDir == "" { + return ErrCacheFolderNotConfigured + } + + return os.MkdirAll(c.BaseDir, DefaultCacheFolderPermissions) +} + +func (c *FileSystemCache) getFilenameFromRequest(request *http.Request) (string, string) { + sum := sha1.Sum([]byte(request.URL.String())) + hash := hex.EncodeToString(sum[:]) + dir := path.Join(c.BaseDir, hash[:2]) + return dir, path.Join(dir, hash) +} + +// Get returns an error for HTTP verbs other than GET and if request headers +// specify `Cache-Control: no-cache`. +func (c *FileSystemCache) Get(request *http.Request) (*Response, error) { + if request.Method != "GET" || request.Header.Get("Cache-Control") == "no-cache" { + return nil, ErrRequestNoCache + } + + _, filename := c.getFilenameFromRequest(request) + + if file, err := os.Open(filename); err == nil { + resp := new(Response) + err = gob.NewDecoder(file).Decode(resp) + file.Close() + return resp, err + } else { + return nil, err + } +} + +// Put persists response on disk. For compatibility with legacy non-pluggable version, +// it keeps only one level of folder hierarchy. +func (c *FileSystemCache) Put(request *http.Request, response *Response) error { + dir, filename := c.getFilenameFromRequest(request) + + if _, err := os.Stat(dir); err != nil { + if err := os.MkdirAll(dir, DefaultCacheFolderPermissions); err != nil { + return err + } + } + file, err := os.Create(filename + "~") + if err != nil { + return err + } + if err := gob.NewEncoder(file).Encode(response); err != nil { + file.Close() + return err + } + file.Close() + return os.Rename(filename+"~", filename) +} + +func (c *FileSystemCache) Close() error { + return nil +} diff --git a/colly.go b/colly.go index fdca94518..950d9a612 100644 --- a/colly.go +++ b/colly.go @@ -85,9 +85,6 @@ type Collector struct { // 0 means unlimited. 
// The default value for MaxBodySize is 10MB (10 * 1024 * 1024 bytes). MaxBodySize int - // CacheDir specifies a location where GET requests are cached as files. - // When it's not defined, caching is disabled. - CacheDir string // IgnoreRobotsTxt allows the Collector to ignore any restrictions set by // the target host's robots.txt file. See http://www.robotstxt.org/ for more // information. @@ -119,6 +116,7 @@ type Collector struct { // Set it to 0 for infinite requests (default). MaxRequests uint32 + cache Cache store storage.Storage debugger debug.Debugger robotsMap map[string]*robotstxt.RobotsData @@ -240,7 +238,14 @@ var envMap = map[string]func(*Collector, string){ c.AllowedDomains = strings.Split(val, ",") }, "CACHE_DIR": func(c *Collector, val string) { - c.CacheDir = val + if c.cache != nil { + c.cache.Close() + } + c.cache = &FileSystemCache{ + BaseDir: val, + } + c.cache.Init() + c.backend.CacheBackend = c.cache }, "DETECT_CHARSET": func(c *Collector, val string) { c.DetectCharset = isYesString(val) @@ -393,7 +398,14 @@ func MaxBodySize(sizeInBytes int) CollectorOption { // CacheDir specifies the location where GET requests are cached as files. func CacheDir(path string) CollectorOption { return func(c *Collector) { - c.CacheDir = path + if c.cache != nil { + c.cache.Close() + } + c.cache = &FileSystemCache{ + BaseDir: path, + } + c.cache.Init() + c.backend.CacheBackend = c.cache } } @@ -462,6 +474,15 @@ func CheckHead() CollectorOption { } } +// CacheBackend sets the caching backend used by the Collector. 
+func CacheBackend(c Cache) CollectorOption { + return func(c *Collector) { + //d.Init() + //c.debugger = d + // FIXME: naprawić + } +} + // Init initializes the Collector's private variables and sets default // configuration for the Collector func (c *Collector) Init() { @@ -471,10 +492,12 @@ func (c *Collector) Init() { c.MaxRequests = 0 c.store = &storage.InMemoryStorage{} c.store.Init() + c.cache = &NullCache{} + c.cache.Init() c.MaxBodySize = 10 * 1024 * 1024 c.backend = &httpBackend{} jar, _ := cookiejar.New(nil) - c.backend.Init(jar) + c.backend.Init(jar, c.cache) c.backend.Client.CheckRedirect = c.checkRedirectFunc() c.wg = &sync.WaitGroup{} c.lock = &sync.RWMutex{} @@ -699,7 +722,7 @@ func (c *Collector) fetch(u, method string, depth int, requestData io.Reader, ct c.handleOnResponseHeaders(&Response{Ctx: ctx, Request: request, StatusCode: statusCode, Headers: &headers}) return !request.abort } - response, err := c.backend.Cache(req, c.MaxBodySize, checkHeadersFunc, c.CacheDir) + response, err := c.backend.Cache(req, c.MaxBodySize, checkHeadersFunc) if proxyURL, ok := req.Context().Value(ProxyURLKey).(string); ok { request.ProxyURL = proxyURL } @@ -1027,6 +1050,16 @@ func (c *Collector) SetStorage(s storage.Storage) error { return nil } +// SetCache overrides the default caching backend (NullCache). +// The Cache handles caching of the Collector's responses. +func (c *Collector) SetCache(cache Cache) error { + if err := cache.Init(); err != nil { + return err + } + c.cache = cache + return nil +} + // SetProxy sets a proxy for the collector. This method overrides the previously // used http.Transport if the type of the transport is not http.RoundTripper. // The proxy type is determined by the URL scheme. 
"http" @@ -1295,7 +1328,6 @@ func (c *Collector) Clone() *Collector { return &Collector{ AllowedDomains: c.AllowedDomains, AllowURLRevisit: c.AllowURLRevisit, - CacheDir: c.CacheDir, DetectCharset: c.DetectCharset, DisallowedDomains: c.DisallowedDomains, ID: atomic.AddUint32(&collectorCounter, 1), diff --git a/colly_test.go b/colly_test.go index 91f1441ed..a632edd55 100644 --- a/colly_test.go +++ b/colly_test.go @@ -368,8 +368,9 @@ var newCollectorTests = map[string]func(*testing.T){ } { c := NewCollector(CacheDir(path)) - if got, want := c.CacheDir, path; got != want { - t.Fatalf("c.CacheDir = %q, want %q", got, want) + fileSystemCache := c.backend.CacheBackend.(*FileSystemCache) + if got, want := fileSystemCache.BaseDir, path; got != want { + t.Fatalf("c.backend.CacheBackend.BaseDir = %q, want %q", got, want) } } }, diff --git a/http_backend.go b/http_backend.go index 0b201d236..26adcd526 100644 --- a/http_backend.go +++ b/http_backend.go @@ -15,15 +15,10 @@ package colly import ( - "crypto/sha1" - "encoding/gob" - "encoding/hex" "io" "io/ioutil" "math/rand" "net/http" - "os" - "path" "regexp" "strings" "sync" @@ -35,9 +30,10 @@ import ( ) type httpBackend struct { - LimitRules []*LimitRule - Client *http.Client - lock *sync.RWMutex + LimitRules []*LimitRule + Client *http.Client + lock *sync.RWMutex + CacheBackend Cache } type checkHeadersFunc func(req *http.Request, statusCode int, header http.Header) bool @@ -94,13 +90,14 @@ func (r *LimitRule) Init() error { return nil } -func (h *httpBackend) Init(jar http.CookieJar) { +func (h *httpBackend) Init(jar http.CookieJar, cache Cache) { rand.Seed(time.Now().UnixNano()) h.Client = &http.Client{ Jar: jar, Timeout: 10 * time.Second, } h.lock = &sync.RWMutex{} + h.CacheBackend = cache } // Match checks that the domain parameter triggers the rule @@ -129,42 +126,21 @@ func (h *httpBackend) GetMatchingRule(domain string) *LimitRule { return nil } -func (h *httpBackend) Cache(request *http.Request, bodySize int, 
checkHeadersFunc checkHeadersFunc, cacheDir string) (*Response, error) { - if cacheDir == "" || request.Method != "GET" || request.Header.Get("Cache-Control") == "no-cache" { - return h.Do(request, bodySize, checkHeadersFunc) - } - sum := sha1.Sum([]byte(request.URL.String())) - hash := hex.EncodeToString(sum[:]) - dir := path.Join(cacheDir, hash[:2]) - filename := path.Join(dir, hash) - if file, err := os.Open(filename); err == nil { - resp := new(Response) - err := gob.NewDecoder(file).Decode(resp) - file.Close() +func (h *httpBackend) Cache(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc) (*Response, error) { + if resp, err := h.CacheBackend.Get(request); err == nil { checkHeadersFunc(request, resp.StatusCode, *resp.Headers) if resp.StatusCode < 500 { return resp, err } } + resp, err := h.Do(request, bodySize, checkHeadersFunc) if err != nil || resp.StatusCode >= 500 { return resp, err } - if _, err := os.Stat(dir); err != nil { - if err := os.MkdirAll(dir, 0750); err != nil { - return resp, err - } - } - file, err := os.Create(filename + "~") - if err != nil { - return resp, err - } - if err := gob.NewEncoder(file).Encode(resp); err != nil { - file.Close() - return resp, err - } - file.Close() - return resp, os.Rename(filename+"~", filename) + + err = h.CacheBackend.Put(request, resp) + return resp, err } func (h *httpBackend) Do(request *http.Request, bodySize int, checkHeadersFunc checkHeadersFunc) (*Response, error) { From 16f4136f55bd29cab9f69ca9ff27cfaae70f246c Mon Sep 17 00:00:00 2001 From: Andrzej Lichnerowicz Date: Sat, 29 Apr 2023 17:03:27 +0200 Subject: [PATCH 2/2] Remove leftover function --- colly.go | 9 --------- 1 file changed, 9 deletions(-) diff --git a/colly.go b/colly.go index 950d9a612..9df359f72 100644 --- a/colly.go +++ b/colly.go @@ -474,15 +474,6 @@ func CheckHead() CollectorOption { } } -// CacheBackend sets the caching backend used by the Collector. 
-func CacheBackend(c Cache) CollectorOption { - return func(c *Collector) { - //d.Init() - //c.debugger = d - // FIXME: naprawić - } -} - // Init initializes the Collector's private variables and sets default // configuration for the Collector func (c *Collector) Init() {