From 5e723c94db1335783eb5411673a07759d171d900 Mon Sep 17 00:00:00 2001 From: Marian Steinbach Date: Mon, 24 Oct 2022 21:35:15 +0200 Subject: [PATCH] Make and use a versioned docker image (#279) * Revert redis module to 4.1.0 * Revert dnspython to 2.1.0 * Revert click to 8.0.3 * Specify alpine 3.16.2, reorganize into multiple steps * Replace 'latest' with 'main' everywhere * Fix deprecation warnings * Add Google root certificates * Re-order APK packages, write list after installing * Create VERSION file during docker image build * Pin chromium version --- .gitignore | 1 + Dockerfile | 31 +++++++++++----- Makefile | 8 +++-- README.md | 4 +-- checks/dns_resolution.py | 4 +-- checks/duplicate_content.py | 2 +- checks/load_in_browser.py | 22 ++++++------ docker-compose.yaml | 2 +- job.py | 2 +- .../green-spider-createjobs-cronjob.yaml | 2 +- .../green-spider-screenshotter-cronjob.yaml | 36 ------------------- kubernetes/green-spider-spider-cronjob.yaml | 2 +- requirements.txt | 20 +++++------ 13 files changed, 60 insertions(+), 76 deletions(-) delete mode 100644 kubernetes/green-spider-screenshotter-cronjob.yaml diff --git a/.gitignore b/.gitignore index f7e6a57..d46cc30 100644 --- a/.gitignore +++ b/.gitignore @@ -8,4 +8,5 @@ kubernetes/green-spider-secret.yaml /volumes /screenshots /k8s-jobs +/VERSION .env diff --git a/Dockerfile b/Dockerfile index f82314a..3713baf 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,16 +1,30 @@ -FROM alpine:3.16 +FROM alpine:3.16.2 + +ENV CHROMIUM_VERSION=106.0.5249.119-r1 + +RUN echo "http://dl-cdn.alpinelinux.org/alpine/edge/main" >> /etc/apk/repositories && \ + echo "http://dl-cdn.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories && \ + apk --update --no-cache add ca-certificates \ + chromium=$CHROMIUM_VERSION \ + chromium-chromedriver=$CHROMIUM_VERSION \ + py3-cryptography python3-dev py3-grpcio py3-wheel py3-pip py3-lxml py3-yaml \ + build-base git icu-libs libssl1.1 libssl3 libxml2 libxml2-dev libxslt libxslt-dev \ + libffi-dev openssl-dev cargo + +RUN apk info -v | sort WORKDIR /workdir +# Execute time consuming compilations in a separate step +RUN python3 -m pip install libcst==0.4.7 sgmllib3k==1.0.0 + +ADD https://pki.google.com/roots.pem /google_roots.pem +ENV GRPC_DEFAULT_SSL_ROOTS_FILE_PATH=/google_roots.pem + ADD requirements.txt /workdir/ +RUN pip install -r requirements.txt -RUN echo "http://dl-4.alpinelinux.org/alpine/edge/main/" >> /etc/apk/repositories && \ - echo "http://dl-4.alpinelinux.org/alpine/edge/community/" >> /etc/apk/repositories && \ - apk --update --no-cache add ca-certificates chromium chromium-chromedriver py3-cryptography \ - python3-dev py3-grpcio py3-wheel py3-pip py3-lxml py3-yaml \ - build-base git icu-libs libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev cargo && \ - pip install -r requirements.txt && \ - apk del build-base +RUN python3 -m pip freeze ADD cli.py /workdir/ ADD manager /workdir/manager @@ -20,3 +34,4 @@ ADD rating /workdir/rating ADD spider /workdir/spider ADD export /workdir/export ADD job.py /workdir/ +ADD VERSION /workdir/VERSION diff --git a/Makefile b/Makefile index b20f2ad..5fc6378 100644 --- a/Makefile +++ b/Makefile @@ -1,11 +1,13 @@ -IMAGE := quay.io/netzbegruenung/green-spider:latest +IMAGE := quay.io/netzbegruenung/green-spider:main DB_ENTITY := spider-results +VERSION = $(shell git describe --exact-match --tags 2> /dev/null || git rev-parse HEAD) + .PHONY: dockerimage spider export # Build docker image -dockerimage: +dockerimage: VERSION docker build --progress plain -t $(IMAGE) . # Fill the queue with spider jobs, one for each site. @@ -50,3 +52,5 @@ test: $(IMAGE) \ -m unittest discover -p '*_test.py' -v +VERSION: + @echo $(VERSION) > VERSION diff --git a/README.md b/README.md index e47b09b..c5e7993 100644 --- a/README.md +++ b/README.md @@ -68,8 +68,8 @@ docker run --rm -ti \ -v $(pwd)/screenshots:/screenshots \ -v $(pwd)/volumes/chrome-userdir:/opt/chrome-userdir \ --shm-size=2g \ - quay.io/netzbegruenung/green-spider:latest python3 cli.py \ + quay.io/netzbegruenung/green-spider:main python3 cli.py \ --credentials-path /secrets/datastore-writer.json \ --loglevel debug \ - spider --job '{"url": "https://gruene-porta-westfalica.de/", "city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}' + spider --job '{"url": "https://gruene-porta-westfalica.de/home/", "city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}' ``` diff --git a/checks/dns_resolution.py b/checks/dns_resolution.py index b24e04b..c0ca470 100644 --- a/checks/dns_resolution.py +++ b/checks/dns_resolution.py @@ -49,7 +49,7 @@ def resolve_hostname(self, hostname): # IPv4 try: - answers = dns.resolver.query(hostname, "A") + answers = dns.resolver.resolve(hostname, "A") result['resolvable_ipv4'] = True for rdata in answers: result['ipv4_addresses'].append(rdata.address) @@ -58,7 +58,7 @@ def resolve_hostname(self, hostname): # IPv6 try: - answers = dns.resolver.query(hostname, "AAAA") + answers = dns.resolver.resolve(hostname, "AAAA") result['resolvable_ipv6'] = True for rdata in answers: result['ipv6_addresses'].append(rdata.address) diff --git a/checks/duplicate_content.py b/checks/duplicate_content.py index 9556902..d0a383d 100644 --- a/checks/duplicate_content.py +++ b/checks/duplicate_content.py @@ -36,7 +36,7 @@ def run(self): page_content = self.previous_results['page_content'][url] if page_content['content'] is None: - logging.warn("Content for URL %s is None" % url) + logging.warning("Content for URL %s is None" % url) content[url] = page_content['content'] diff --git a/checks/load_in_browser.py b/checks/load_in_browser.py index 9b4cd31..a671f5d 100644 --- a/checks/load_in_browser.py +++ b/checks/load_in_browser.py @@ -119,20 +119,20 @@ def run(self): 'screenshots': check_responsiveness_results['screenshots'], } except TimeoutException as e: - logging.warn("TimeoutException when checking responsiveness for %s: %s" % (url, e)) + logging.warning("TimeoutException when checking responsiveness for %s: %s" % (url, e)) pass except tenacity.RetryError as re: - logging.warn("RetryError when checking responsiveness for %s: %s" % (url, re)) + logging.warning("RetryError when checking responsiveness for %s: %s" % (url, re)) pass # Scroll page to bottom, to load all lazy-loading resources. try: self.scroll_to_bottom() except TimeoutException as e: - logging.warn("TimeoutException in scroll_to_bottom for %s: %s" % (url, e)) + logging.warning("TimeoutException in scroll_to_bottom for %s: %s" % (url, e)) pass except tenacity.RetryError as re: - logging.warn("RetryError in scroll_to_bottom for %s: %s" % (url, re)) + logging.warning("RetryError in scroll_to_bottom for %s: %s" % (url, re)) pass # CSS collection @@ -148,23 +148,23 @@ def run(self): continue font_families.add(font_family.lower()) except StaleElementReferenceException as e: - logging.warn("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e)) + logging.warning("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e)) continue results[url]['font_families'] = sorted(list(font_families)) except TimeoutException as e: - logging.warn("TimeoutException when collecting CSS elements for %s: %s" % (url, e)) + logging.warning("TimeoutException when collecting CSS elements for %s: %s" % (url, e)) pass # Process cookies. try: results[url]['cookies'] = self.get_cookies() except TimeoutException as e: - logging.warn("TimeoutException when collecting cookies %s: %s" % (url, e)) + logging.warning("TimeoutException when collecting cookies %s: %s" % (url, e)) pass except tenacity.RetryError as re: - logging.warn("RetryError when collecting cookies for %s: %s" % (url, re)) + logging.warning("RetryError when collecting cookies for %s: %s" % (url, re)) pass for logentry in self.driver.get_log('performance'): @@ -209,7 +209,7 @@ def post_hook(self, result): blob.upload_from_file(my_file, content_type="image/png") blob.make_public() except Exception as e: - logging.warn("Error uploading screenshot for %s: %s" % (screenshot['url'], e)) + logging.warning("Error uploading screenshot for %s: %s" % (screenshot['url'], e)) continue try: @@ -232,7 +232,7 @@ def post_hook(self, result): datastore_client.put(entity) logging.debug("Successfully stored screenshot metadata for %s" % screenshot['screenshot_url']) except Exception as e: - logging.warn("Error in %s: %s" % (screenshot['url'], e)) + logging.warning("Error in %s: %s" % (screenshot['url'], e)) # Remove screenshots part from results @@ -289,7 +289,7 @@ def check_responsiveness(self, url): success = self.driver.save_screenshot(abs_filepath) if not success: - logging.warn("Failed to create screenshot %s" % abs_filepath) + logging.warning("Failed to create screenshot %s" % abs_filepath) continue result['screenshots'].append({ diff --git a/docker-compose.yaml b/docker-compose.yaml index 084cf59..f793e37 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -15,7 +15,7 @@ services: # manager manages the job queue. manager: - image: quay.io/netzbegruenung/green-spider:latest + image: quay.io/netzbegruenung/green-spider:main command: > python3 cli.py --credentials-path /secrets/datastore-writer.json diff --git a/job.py b/job.py index 05edb7e..6b7f229 100644 --- a/job.py +++ b/job.py @@ -16,7 +16,7 @@ # via the environment JOB_TIMEOUT variable. TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50")) -DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:latest' +DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:main' CREDENTIALS_PATH = '/secrets/datastore-writer.json' diff --git a/kubernetes/green-spider-createjobs-cronjob.yaml b/kubernetes/green-spider-createjobs-cronjob.yaml index c9ed633..efe2109 100644 --- a/kubernetes/green-spider-createjobs-cronjob.yaml +++ b/kubernetes/green-spider-createjobs-cronjob.yaml @@ -12,7 +12,7 @@ spec: spec: containers: - name: spider - image: quay.io/netzbegruenung/green-spider:latest + image: quay.io/netzbegruenung/green-spider:main imagePullPolicy: Always args: - "--credentials-path=/secrets/datastore-writer.json" diff --git a/kubernetes/green-spider-screenshotter-cronjob.yaml b/kubernetes/green-spider-screenshotter-cronjob.yaml deleted file mode 100644 index 3f5ac14..0000000 --- a/kubernetes/green-spider-screenshotter-cronjob.yaml +++ /dev/null @@ -1,36 +0,0 @@ -apiVersion: batch/v1beta1 -kind: CronJob -metadata: - name: green-spider-screenshotter -spec: - # Saturday at 1:05 UTC - schedule: "5 1 * * 6" - jobTemplate: - spec: - parallelism: 1 - template: - spec: - containers: - - name: screenshotter - image: quay.io/netzbegruenung/green-spider-screenshotter:latest - imagePullPolicy: Always - volumeMounts: - - name: secrets - mountPath: "/secrets" - readOnly: true - resources: - requests: - cpu: 800m - memory: 4000M - # No restarts, as this would mean to start over. - # TODO: Maintain a queue and change this. - restartPolicy: Never - volumes: - - name: secrets - secret: - secretName: green-spider - items: - - key: datastore-writer.json - path: datastore-writer.json - - key: screenshots-uploader.json - path: screenshots-uploader.json diff --git a/kubernetes/green-spider-spider-cronjob.yaml b/kubernetes/green-spider-spider-cronjob.yaml index 0868d32..1e1e79c 100644 --- a/kubernetes/green-spider-spider-cronjob.yaml +++ b/kubernetes/green-spider-spider-cronjob.yaml @@ -12,7 +12,7 @@ spec: spec: containers: - name: spider - image: quay.io/netzbegruenung/green-spider:latest + image: quay.io/netzbegruenung/green-spider:main imagePullPolicy: Always args: - "--credentials-path=/secrets/datastore-writer.json" diff --git a/requirements.txt b/requirements.txt index f3459cb..a7c6b57 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,19 +3,19 @@ cachetools==4.2.4 certifi==2021.10.8 cffi==1.15.1 chardet==3.0.4 -click==8.1.3 +click==8.0.3 cssselect==1.1.0 -dnspython==2.2.1 +dnspython==2.1.0 docker==4.4.1 feedparser==6.0.8 gitdb==4.0.9 GitPython==3.1.24 -google-api-core==2.2.2 -google-auth==2.3.3 -google-cloud-core==2.2.1 -google-cloud-datastore==2.4.0 -google-cloud-storage==1.43.0 -googleapis-common-protos==1.53.0 +google-api-core==2.10.2 +google-auth==2.13.0 +google-cloud-core==2.3.2 +google-cloud-datastore==2.9.0 +google-cloud-storage==2.5.0 +googleapis-common-protos==1.56.4 html-similarity==0.3.3 httpretty==1.1.4 idna==2.10 @@ -25,9 +25,9 @@ protobuf==4.21.8 pyasn1==0.4.8 pyasn1-modules==0.2.8 pycparser==2.21 -pyOpenSSL==22.1.0 +pyOpenSSL==22.0.0 pytz==2021.3 -redis==4.3.4 +redis==4.1.0 requests==2.26.0 responses==0.22.0 rq==1.8.0