Make and use a versioned docker image (#279)
* Revert redis module to 4.1.0

* Revert dnspython to 2.1.0

* Revert click to 8.0.3

* Specify alpine 3.16.2, reorganize into multiple steps

* Replace 'latest' with 'main' everywhere

* Fix deprecation warnings

* Add Google root certificates

* Re-order APK packages, write list after installing

* Create VERSION file during docker image build

* Pin chromium version
marians authored Oct 24, 2022
1 parent 024ef11 commit 5e723c9
Showing 13 changed files with 60 additions and 76 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -8,4 +8,5 @@ kubernetes/green-spider-secret.yaml
 /volumes
 /screenshots
 /k8s-jobs
+/VERSION
 .env
31 changes: 23 additions & 8 deletions Dockerfile
@@ -1,16 +1,30 @@
-FROM alpine:3.16
+FROM alpine:3.16.2
 
+ENV CHROMIUM_VERSION=106.0.5249.119-r1
+
+RUN echo "http://dl-cdn.alpinelinux.org/alpine/edge/main" >> /etc/apk/repositories && \
+    echo "http://dl-cdn.alpinelinux.org/alpine/edge/community" >> /etc/apk/repositories && \
+    apk --update --no-cache add ca-certificates \
+        chromium=$CHROMIUM_VERSION \
+        chromium-chromedriver=$CHROMIUM_VERSION \
+        py3-cryptography python3-dev py3-grpcio py3-wheel py3-pip py3-lxml py3-yaml \
+        build-base git icu-libs libssl1.1 libssl3 libxml2 libxml2-dev libxslt libxslt-dev \
+        libffi-dev openssl-dev cargo
+
+RUN apk info -v | sort
+
 WORKDIR /workdir
 
+# Execute time consuming compilations in a separate step
+RUN python3 -m pip install libcst==0.4.7 sgmllib3k==1.0.0
+
+ADD https://pki.google.com/roots.pem /google_roots.pem
+ENV GRPC_DEFAULT_SSL_ROOTS_FILE_PATH=/google_roots.pem
+
 ADD requirements.txt /workdir/
+RUN pip install -r requirements.txt
 
-RUN echo "http://dl-4.alpinelinux.org/alpine/edge/main/" >> /etc/apk/repositories && \
-    echo "http://dl-4.alpinelinux.org/alpine/edge/community/" >> /etc/apk/repositories && \
-    apk --update --no-cache add ca-certificates chromium chromium-chromedriver py3-cryptography \
-        python3-dev py3-grpcio py3-wheel py3-pip py3-lxml py3-yaml \
-        build-base git icu-libs libxml2 libxml2-dev libxslt libxslt-dev libffi-dev openssl-dev cargo && \
-    pip install -r requirements.txt && \
-    apk del build-base
+RUN python3 -m pip freeze
 
 ADD cli.py /workdir/
 ADD manager /workdir/manager
@@ -20,3 +34,4 @@ ADD rating /workdir/rating
 ADD spider /workdir/spider
 ADD export /workdir/export
 ADD job.py /workdir/
+ADD VERSION /workdir/VERSION
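The rebuilt Dockerfile pins the base image and Chromium package, points gRPC at Google's root certificates via the GRPC_DEFAULT_SSL_ROOTS_FILE_PATH environment variable, and bakes a VERSION file into the image. A hypothetical sketch of how application code could read that file at runtime — the commit itself only adds the file, so the helper name and its fallback are assumptions:

```python
from pathlib import Path

def get_version() -> str:
    # VERSION is generated by the Makefile (see below) and copied into
    # the image by `ADD VERSION /workdir/VERSION`; fall back gracefully
    # when running outside the container.
    try:
        return Path("/workdir/VERSION").read_text().strip()
    except FileNotFoundError:
        return "unknown"
```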
8 changes: 6 additions & 2 deletions Makefile
@@ -1,11 +1,13 @@
-IMAGE := quay.io/netzbegruenung/green-spider:latest
+IMAGE := quay.io/netzbegruenung/green-spider:main
 
 DB_ENTITY := spider-results
 
+VERSION = $(shell git describe --exact-match --tags 2> /dev/null || git rev-parse HEAD)
+
 .PHONY: dockerimage spider export
 
 # Build docker image
-dockerimage:
+dockerimage: VERSION
 	docker build --progress plain -t $(IMAGE) .
 
 # Fill the queue with spider jobs, one for each site.
@@ -50,3 +52,5 @@ test:
 	$(IMAGE) \
 	-m unittest discover -p '*_test.py' -v
 
+VERSION:
+	@echo $(VERSION) > VERSION
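The new VERSION target writes the exact git tag of HEAD when one exists, otherwise the full commit hash, and dockerimage now depends on it so every build carries its version. A minimal Python equivalent of the shell logic, for illustration only:

```python
import subprocess

def resolve_version() -> str:
    """Prefer the exact tag on HEAD; fall back to the commit hash."""
    try:
        # `git describe --exact-match --tags` exits non-zero when HEAD
        # is not precisely on a tag, raising CalledProcessError here.
        result = subprocess.run(
            ["git", "describe", "--exact-match", "--tags"],
            check=True, capture_output=True, text=True,
        )
    except subprocess.CalledProcessError:
        result = subprocess.run(
            ["git", "rev-parse", "HEAD"],
            check=True, capture_output=True, text=True,
        )
    return result.stdout.strip()

if __name__ == "__main__":
    print(resolve_version())
```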
4 changes: 2 additions & 2 deletions README.md
@@ -68,8 +68,8 @@ docker run --rm -ti \
   -v $(pwd)/screenshots:/screenshots \
   -v $(pwd)/volumes/chrome-userdir:/opt/chrome-userdir \
   --shm-size=2g \
-  quay.io/netzbegruenung/green-spider:latest python3 cli.py \
+  quay.io/netzbegruenung/green-spider:main python3 cli.py \
   --credentials-path /secrets/datastore-writer.json \
   --loglevel debug \
-  spider --job '{"url": "https://gruene-porta-westfalica.de/", "city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}'
+  spider --job '{"url": "https://gruene-porta-westfalica.de/home/", "city": "Porta Westfalica", "country": "DE", "district": "Minden-Lübbecke", "level": "DE:ORTSVERBAND", "state":" Nordrhein-Westfalen", "type": "REGIONAL_CHAPTER"}'
 ```
4 changes: 2 additions & 2 deletions checks/dns_resolution.py
@@ -49,7 +49,7 @@ def resolve_hostname(self, hostname):
 
         # IPv4
         try:
-            answers = dns.resolver.query(hostname, "A")
+            answers = dns.resolver.resolve(hostname, "A")
             result['resolvable_ipv4'] = True
             for rdata in answers:
                 result['ipv4_addresses'].append(rdata.address)
@@ -58,7 +58,7 @@ def resolve_hostname(self, hostname):
 
         # IPv6
         try:
-            answers = dns.resolver.query(hostname, "AAAA")
+            answers = dns.resolver.resolve(hostname, "AAAA")
             result['resolvable_ipv6'] = True
             for rdata in answers:
                 result['ipv6_addresses'].append(rdata.address)
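These changes track the dnspython API: dns.resolver.query() was deprecated in dnspython 2.0 in favour of dns.resolver.resolve(). A minimal sketch of the replacement call:

```python
import dns.resolver

# resolve() supersedes the deprecated query(); unlike query(), it does
# not apply the resolver's search list by default (search=False).
answers = dns.resolver.resolve("example.com", "A")
for rdata in answers:
    print(rdata.address)
```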
2 changes: 1 addition & 1 deletion checks/duplicate_content.py
@@ -36,7 +36,7 @@ def run(self):
             page_content = self.previous_results['page_content'][url]
 
             if page_content['content'] is None:
-                logging.warn("Content for URL %s is None" % url)
+                logging.warning("Content for URL %s is None" % url)
 
             content[url] = page_content['content']
 
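This and the load_in_browser.py changes below replace logging.warn(), which has been a deprecated alias of logging.warning() since Python 3.3 and triggers a DeprecationWarning. For example:

```python
import logging

logging.basicConfig(level=logging.WARNING)

# Deprecated alias, emits a DeprecationWarning:
#   logging.warn("Content for URL %s is None" % url)

# Supported spelling; passing args instead of pre-formatting with %
# defers string interpolation until the record is actually emitted:
logging.warning("Content for URL %s is None", "https://example.com/")
```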
22 changes: 11 additions & 11 deletions checks/load_in_browser.py
@@ -119,20 +119,20 @@ def run(self):
                     'screenshots': check_responsiveness_results['screenshots'],
                 }
             except TimeoutException as e:
-                logging.warn("TimeoutException when checking responsiveness for %s: %s" % (url, e))
+                logging.warning("TimeoutException when checking responsiveness for %s: %s" % (url, e))
                 pass
             except tenacity.RetryError as re:
-                logging.warn("RetryError when checking responsiveness for %s: %s" % (url, re))
+                logging.warning("RetryError when checking responsiveness for %s: %s" % (url, re))
                 pass
 
             # Scroll page to bottom, to load all lazy-loading resources.
             try:
                 self.scroll_to_bottom()
             except TimeoutException as e:
-                logging.warn("TimeoutException in scroll_to_bottom for %s: %s" % (url, e))
+                logging.warning("TimeoutException in scroll_to_bottom for %s: %s" % (url, e))
                 pass
             except tenacity.RetryError as re:
-                logging.warn("RetryError in scroll_to_bottom for %s: %s" % (url, re))
+                logging.warning("RetryError in scroll_to_bottom for %s: %s" % (url, re))
                 pass
 
             # CSS collection
@@ -148,23 +148,23 @@ def run(self):
                             continue
                         font_families.add(font_family.lower())
                     except StaleElementReferenceException as e:
-                        logging.warn("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e))
+                        logging.warning("StaleElementReferenceException when collecting CSS properties for %s: %s" % (url, e))
                         continue
 
                 results[url]['font_families'] = sorted(list(font_families))
 
             except TimeoutException as e:
-                logging.warn("TimeoutException when collecting CSS elements for %s: %s" % (url, e))
+                logging.warning("TimeoutException when collecting CSS elements for %s: %s" % (url, e))
                 pass
 
             # Process cookies.
             try:
                 results[url]['cookies'] = self.get_cookies()
             except TimeoutException as e:
-                logging.warn("TimeoutException when collecting cookies %s: %s" % (url, e))
+                logging.warning("TimeoutException when collecting cookies %s: %s" % (url, e))
                 pass
             except tenacity.RetryError as re:
-                logging.warn("RetryError when collecting cookies for %s: %s" % (url, re))
+                logging.warning("RetryError when collecting cookies for %s: %s" % (url, re))
                 pass
 
             for logentry in self.driver.get_log('performance'):
@@ -209,7 +209,7 @@ def post_hook(self, result):
                 blob.upload_from_file(my_file, content_type="image/png")
                 blob.make_public()
             except Exception as e:
-                logging.warn("Error uploading screenshot for %s: %s" % (screenshot['url'], e))
+                logging.warning("Error uploading screenshot for %s: %s" % (screenshot['url'], e))
                 continue
 
             try:
@@ -232,7 +232,7 @@ def post_hook(self, result):
                 datastore_client.put(entity)
                 logging.debug("Successfully stored screenshot metadata for %s" % screenshot['screenshot_url'])
             except Exception as e:
-                logging.warn("Error in %s: %s" % (screenshot['url'], e))
+                logging.warning("Error in %s: %s" % (screenshot['url'], e))
 
 
         # Remove screenshots part from results
@@ -289,7 +289,7 @@ def check_responsiveness(self, url):
             success = self.driver.save_screenshot(abs_filepath)
 
             if not success:
-                logging.warn("Failed to create screenshot %s" % abs_filepath)
+                logging.warning("Failed to create screenshot %s" % abs_filepath)
                 continue
 
             result['screenshots'].append({
2 changes: 1 addition & 1 deletion docker-compose.yaml
@@ -15,7 +15,7 @@ services:
 
   # manager manages the job queue.
   manager:
-    image: quay.io/netzbegruenung/green-spider:latest
+    image: quay.io/netzbegruenung/green-spider:main
     command: >
       python3 cli.py
       --credentials-path /secrets/datastore-writer.json
2 changes: 1 addition & 1 deletion job.py
@@ -16,7 +16,7 @@
 # via the environment JOB_TIMEOUT variable.
 TIMEOUT = int(os.environ.get("JOB_TIMEOUT", "50"))
 
-DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:latest'
+DOCKER_IMAGE = 'quay.io/netzbegruenung/green-spider:main'
 
 CREDENTIALS_PATH = '/secrets/datastore-writer.json'
 
2 changes: 1 addition & 1 deletion kubernetes/green-spider-createjobs-cronjob.yaml
@@ -12,7 +12,7 @@ spec:
     spec:
       containers:
       - name: spider
-        image: quay.io/netzbegruenung/green-spider:latest
+        image: quay.io/netzbegruenung/green-spider:main
         imagePullPolicy: Always
         args:
         - "--credentials-path=/secrets/datastore-writer.json"
36 changes: 0 additions & 36 deletions kubernetes/green-spider-screenshotter-cronjob.yaml

This file was deleted.

2 changes: 1 addition & 1 deletion kubernetes/green-spider-spider-cronjob.yaml
@@ -12,7 +12,7 @@ spec:
     spec:
      containers:
       - name: spider
-        image: quay.io/netzbegruenung/green-spider:latest
+        image: quay.io/netzbegruenung/green-spider:main
         imagePullPolicy: Always
         args:
         - "--credentials-path=/secrets/datastore-writer.json"
20 changes: 10 additions & 10 deletions requirements.txt
@@ -3,19 +3,19 @@ cachetools==4.2.4
 certifi==2021.10.8
 cffi==1.15.1
 chardet==3.0.4
-click==8.1.3
+click==8.0.3
 cssselect==1.1.0
-dnspython==2.2.1
+dnspython==2.1.0
 docker==4.4.1
 feedparser==6.0.8
 gitdb==4.0.9
 GitPython==3.1.24
-google-api-core==2.2.2
-google-auth==2.3.3
-google-cloud-core==2.2.1
-google-cloud-datastore==2.4.0
-google-cloud-storage==1.43.0
-googleapis-common-protos==1.53.0
+google-api-core==2.10.2
+google-auth==2.13.0
+google-cloud-core==2.3.2
+google-cloud-datastore==2.9.0
+google-cloud-storage==2.5.0
+googleapis-common-protos==1.56.4
 html-similarity==0.3.3
 httpretty==1.1.4
 idna==2.10
@@ -25,9 +25,9 @@ protobuf==4.21.8
 pyasn1==0.4.8
 pyasn1-modules==0.2.8
 pycparser==2.21
-pyOpenSSL==22.1.0
+pyOpenSSL==22.0.0
 pytz==2021.3
-redis==4.3.4
+redis==4.1.0
 requests==2.26.0
 responses==0.22.0
 rq==1.8.0
