Skip to content

Commit

Permalink
Make the container image smaller
Browse files Browse the repository at this point in the history
For each package and binary we need, this pulls in all the files and
deps (shared libs, mostly).  The build is slower but the final image is
85 MB (versus 157 MB before).  e2e passes.  Hopefully less CVE surface.

This is based on scripts used in kubernetes and KinD.
  • Loading branch information
thockin committed Jun 12, 2023
1 parent 1894192 commit f037087
Show file tree
Hide file tree
Showing 4 changed files with 344 additions and 12 deletions.
38 changes: 26 additions & 12 deletions Dockerfile.in
Original file line number Diff line number Diff line change
Expand Up @@ -41,22 +41,36 @@
#############################################################################
# First we prepare the image that we want, regardless of build layers.
#############################################################################
FROM {ARG_FROM} as prep
FROM {ARG_FROM} as base

# When building, we can pass a unique value (e.g. `date +%s`) for this arg,
# which will force a rebuild from here (by invalidating docker's cache).
ARG FORCE_REBUILD=0

RUN apt-get -q -y update
RUN apt-get -q -y upgrade
RUN apt-get -q -y install --no-install-recommends \
ca-certificates \
coreutils \
socat \
openssh-client \
git
RUN apt-get -q -y autoremove
RUN rm -rf /var/lib/apt/lists/*
RUN apt-get -y -qq -o Dpkg::Use-Pty=0 update
RUN apt-get -y -qq -o Dpkg::Use-Pty=0 -y upgrade

RUN apt-get -y -qq -o Dpkg::Use-Pty=0 install bash # for the staging scripts and ldd
RUN mkdir -p {ARG_STAGING}
COPY stage_binaries.sh /
RUN /stage_binaries.sh -o {ARG_STAGING} \
-p coreutils \
-p socat \
-p openssh-client \
-p git \
-b /bin/dash \
-b /bin/grep \
-b /bin/sed
RUN ln -sf /bin/dash {ARG_STAGING}/bin/sh

COPY clean_distroless.sh /clean_distroless.sh
RUN /clean_distroless.sh {ARG_STAGING}

# We need to use distroless/base for tzdata, glibc, and some others.
FROM gcr.io/distroless/base as intermediate

# Docker doesn't do vars in COPY, so we can't use a regular ARG.
COPY --from=base {ARG_STAGING} /

# Add the default UID to /etc/passwd so SSH is satisfied.
RUN echo "git-sync:x:65533:65533::/tmp:/sbin/nologin" >> /etc/passwd
Expand Down Expand Up @@ -90,7 +104,7 @@ COPY bin/{ARG_OS}_{ARG_ARCH}/{ARG_BIN} /{ARG_BIN}
# Now we make a "clean" final image.
#############################################################################
FROM scratch
COPY --from=prep / /
COPY --from=intermediate / /

# Run as non-root by default. There's simply no reason to run as root.
USER 65533:65533
Expand Down
1 change: 1 addition & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ container: .container-$(DOTFILE_IMAGE) container-name
-e 's|{ARG_ARCH}|$(ARCH)|g' \
-e 's|{ARG_OS}|$(OS)|g' \
-e 's|{ARG_FROM}|$(BASEIMAGE)|g' \
-e 's|{ARG_STAGING}|/staging|g' \
Dockerfile.in > .dockerfile-$(OS)_$(ARCH)
HASH_LICENSES=$$(find $(LICENSES) -type f \
| xargs md5sum | md5sum | cut -f1 -d' '); \
Expand Down
39 changes: 39 additions & 0 deletions clean_distroless.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#!/bin/sh

# Copyright 2022 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# USAGE: clean_distroless.sh <staging-dir>
#
# Deletes a fixed list of paths from the root filesystem staged under
# <staging-dir>.

if [ -z "$1" ]; then
  # This is an error path, so the usage text belongs on stderr.
  echo "usage: $0 <staging-dir>" >&2
  exit 1
fi
ROOT="$1"

# This script needs to be "sh" and not "bash", but there are no arrays in sh,
# except for "$@".  We need array semantics on the off chance we ever have a
# pathname with spaces in it.
#
# Every entry carries the (quoted) staging root as a prefix so that the glob
# below is expanded against the staging tree.  Without the prefix the shell
# would expand it against the build image's own root filesystem, and the
# entry would only be right when both trees happen to contain the same
# directory.  A glob that matches nothing is left as a literal string, which
# `rm -f` silently ignores.
set -- \
  "${ROOT}"/usr/share/base-files \
  "${ROOT}"/usr/share/man \
  "${ROOT}"/usr/lib/*-linux-gnu/gconv \
  "${ROOT}"/usr/bin/c_rehash \
  "${ROOT}"/usr/bin/openssl \
  "${ROOT}"/iptables-wrapper-installer.sh \
  "${ROOT}"/clean-distroless.sh

for item; do
  rm -rf -- "${item}"
done
278 changes: 278 additions & 0 deletions stage_binaries.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
#!/bin/bash

# Copyright 2022 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# USAGE: stage_binaries.sh -o <staging-dir> ( -p <package> | -b <binary> )...
#
# Stages all the packages or files and their dependencies (+ libraries and
# copyrights) to the staging dir.
#
# This is intended to be used in a multi-stage docker build with a distroless/base
# or distroless/cc image.

# Strict mode: stop on a failing command (errexit), on expansion of an unset
# variable (nounset), and when any stage of a pipeline fails (pipefail).
set -o errexit -o nounset -o pipefail

# errexit is the handler run when the shell is about to exit automatically
# because of an error.  It prints the calling function and the file:line taken
# from bash's call-stack arrays to stderr.
# Borrowed from kubernetes, which was borrowed from
# https://gist.github.com/ahendrix/7030300
function errexit() {
  # If the shell we are in doesn't have errexit set (common in subshells) then
  # stay quiet and don't dump stacks.  `set +o` lists every option as a
  # re-runnable command, so "-o errexit" only appears when the option is on.
  set +o | grep -qe "-o errexit" || return

  local caller="${FUNCNAME[1]:-}"
  local lineno="${BASH_LINENO[0]}"
  local src="$(basename "${BASH_SOURCE[1]}")"
  echo "FATAL: error at ${caller}() ${src}:${lineno}" >&2
}

# errtrace lets the ERR trap below propagate into functions, expansions and
# subshells instead of applying to top-level commands only.
set -o errtrace

# Run the errexit handler whenever a command exits non-zero.  This is a more
# verbose version of `set -o errexit`.
trap 'errexit' ERR

# file_to_package prints the name of the debian package that provided the
# file $1.
function file_to_package() {
  local file="$1"

  # `dpkg-query --search <pattern>` prints one "<package>: <path>" line per
  # match, where <path> belongs to <package>.
  # https://manpages.debian.org/jessie/dpkg/dpkg-query.1.en.html
  #
  # The path is resolved with realpath before the lookup, and everything from
  # the first ':' onwards is dropped so only the leading field remains.
  dpkg-query --search "$(realpath "${file}")" \
    | sed -e 's/:.*$//'
}

# package_to_copyright prints the path of the copyright file for the package
# named by $1.  It only builds the path; it does not check that the file
# exists.
function package_to_copyright() {
  local pkg="$1"
  printf '%s\n' "/usr/share/doc/${pkg}/copyright"
}

# stage_file stages the filepath $1 to $2, following symlinks and staging
# copyrights and package status.
#
# Args:
#   $1: path of the file to stage
#   $2: path to staging dir; "$2/var/lib/dpkg/status.d" must already exist
# Returns:
#   non-zero, with a message on stderr, when the package that owns $1 cannot
#   be determined
function stage_file() {
  local file="$1"
  local staging="$2"

  # copy the named path, recreating its full directory structure under staging
  cp -a --parents "${file}" "${staging}"

  # recursively follow symlinks; a relative link target is resolved against
  # the directory that holds the link
  if [[ -L "${file}" ]]; then
    stage_file "$(cd "$(dirname "${file}")" || exit; realpath -s "$(readlink "${file}")")" "${staging}"
  fi

  # get the package so we can stage package metadata as well.  The
  # declaration is kept separate from the assignment so a failed lookup is
  # not masked by `local`; an empty name would otherwise fall through to a
  # confusing failure when redirecting into the status.d directory itself.
  local package
  if ! package="$(file_to_package "${file}")" || [[ -z "${package}" ]]; then
    echo "error: cannot determine the package that owns ${file}" >&2
    return 1
  fi

  # stage the copyright for the file, if it exists
  local copyright
  copyright="$(package_to_copyright "${package}")"
  if [[ -f "${copyright}" ]]; then
    cp -a --parents "${copyright}" "${staging}"
  fi

  # stage the package status mimicking bazel
  # https://github.com/bazelbuild/rules_docker/commit/f5432b813e0a11491cf2bf83ff1a923706b36420
  # instead of parsing the control file, we can just get the actual package status with dpkg
  dpkg -s "${package}" > "${staging}/var/lib/dpkg/status.d/${package}"
}

# grep_allow_nomatch runs grep with the given arguments and treats "no lines
# matched" as success, so a pipeline under pipefail is not aborted by an empty
# result.  A real grep error still yields a non-zero status.
function grep_allow_nomatch() {
  # grep exits 0 on match, 1 on no match, 2 on error
  local rc=0
  grep "$@" || rc=$?
  (( rc < 2 ))
}

# _indent copies stdin to stdout, prefixing every line with a space.  Lines
# are read with plain `read -r`, so leading and trailing whitespace is
# trimmed before the prefix is added.
function _indent() {
  while read -r X; do
    printf ' %s\n' "${X}"
  done
}

# indent runs "$@" and indents its output, keeping stdout and stderr as
# separate streams.
function indent() {
  # This lets us process stderr and stdout without merging them, without
  # bash-isms.  Reading the redirections from the outside in:
  #   - fd 3 is pointed at the outer pipe, then the group's stdout is pointed
  #     at our stderr;
  #   - the command's stderr goes into the inner pipe and its stdout goes to
  #     fd 3 (the outer pipe);
  #   - the inner _indent therefore handles stderr and writes to our stderr,
  #     while the outer _indent handles stdout.
  {
    "$@" 2>&1 1>&3 | _indent
  } 3>&1 1>&2 | _indent
}

# stage_file_list stages the files that dpkg lists for package $1 into the
# staging dir $2.  Paths containing "/." and anything under /usr/share/man,
# /usr/share/doc or a /usr/share/*-completion directory are skipped.
function stage_file_list() {
  local pkg="$1"
  local staging="$2"
  local file

  dpkg -L "${pkg}" \
    | grep_allow_nomatch -vE '(/\.|/usr/share/(man|doc|.*-completion))' \
    | while read -r file; do
        # dpkg also lists directories; only files are staged
        [[ -f "${file}" ]] || continue
        stage_file "${file}" "${staging}"

        # stage_file already followed the link, nothing more to do for it
        [[ ! -L "${file}" ]] || continue

        # executables additionally get their shared libraries staged
        if [[ -x "${file}" ]]; then
          stage_binaries "${staging}" "${file}"
        fi
      done
}

# get_dependent_packages prints, one per line, whatever follows "Depends:" on
# each line of `apt-cache depends $1` that mentions "Depends".
function get_dependent_packages() {
  local pkg="$1"

  # The field separator swallows everything up to and including "Depends:"
  # plus one optional whitespace character, so field 2 is the rest of the line.
  apt-cache depends "${pkg}" \
    | grep_allow_nomatch Depends \
    | awk 'BEGIN { FS = ".*Depends:[[:space:]]?" } { print $2 }'
}

# stage_packages installs each named package and stages its files, followed
# by the files of the packages it directly depends on.
#
# Args:
#   $1: path to staging dir
#   $2+: package names
function stage_packages() {
  local staging="$1"
  shift

  # apt-get invocation shared by the update and install steps
  local -a apt=(apt-get -y -qq -o Dpkg::Use-Pty=0)

  mkdir -p "${staging}"/var/lib/dpkg/status.d/
  indent "${apt[@]}" update

  local pkg dep
  for pkg in "$@"; do
    echo "staging package ${pkg}"
    indent "${apt[@]}" --no-install-recommends install "${pkg}"
    stage_file_list "${pkg}" "$staging"
    get_dependent_packages "${pkg}" \
      | while read -r dep; do
          stage_file_list "${dep}" "${staging}"
        done
  done
}

# binary_to_libraries prints the library files needed by the binary $1, one
# path per line, as reported by ldd.
function binary_to_libraries() {
  local bin="$1"

  # see: https://man7.org/linux/man-pages/man1/ldd.1.html
  # Each output line looks like:
  #     linux-vdso.so.1 (0x00007fffb11c3000)
  # or
  #     libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f2f52d26000)
  #
  # This is a little funky because ldd treats static binaries as errors ("not
  # a dynamic executable") but static libraries as non-errors ("statically
  # linked").  We want real ldd errors, but static binaries are OK, so probe
  # once with the exit status ignored and bail out early on that message.
  local probe
  probe="$(ldd "${bin}" 2>&1)" || true
  case "${probe}" in
    *"not a dynamic executable"*)
      return 0
      ;;
  esac

  # Pipeline stages, in order:
  #   1. skip static binaries ("statically linked")
  #   2. skip linux-vdso.so.1, a special virtual shared object from the kernel
  #      (see: http://man7.org/linux/man-pages/man7/vdso.7.html)
  #   3. strip the leading '${name} => ' if any, so only '/lib-foo.so (0xf00)'
  #      remains
  #   4. keep only the path, not the (0x${LOCATION})
  ldd "${bin}" \
    | grep_allow_nomatch -v "statically linked" \
    | grep_allow_nomatch -v 'linux-vdso.so.1' \
    | sed -E 's#.* => /#/#' \
    | awk '{print $1}'
}

# stage_binaries stages each named binary together with the libraries that
# ldd reports for it.
#
# Args:
#   $1: path to staging dir
#   $2+: binaries, either as paths or as names to look up on PATH
# Returns:
#   non-zero, with a message on stderr, when a binary cannot be found
function stage_binaries() {
  local staging="$1"
  shift

  local bin binary_path lib
  for bin; do
    echo "staging binary ${bin}"

    # locate the path to the binary.  `type -P` is a bash builtin that forces
    # a PATH search (so functions and builtins of the same name are ignored)
    # and prints a path given with a slash as-is when it is executable.  This
    # avoids depending on the external `which` being installed.
    if ! binary_path="$(type -P "${bin}")"; then
      echo "error: cannot find binary '${bin}'" >&2
      return 1
    fi

    # ensure package metadata dir
    mkdir -p "${staging}/var/lib/dpkg/status.d/"

    # stage the binary itself
    stage_file "${binary_path}" "${staging}"

    # stage the dependencies of the binary
    binary_to_libraries "${binary_path}" \
      | while read -r lib; do
          stage_file "${lib}" "${staging}"
        done
  done
}

# usage prints a one-line synopsis of the command-line flags to stdout;
# callers redirect it to stderr when reporting an error.
function usage() {
  printf '%s\n' "$0 -o <staging-dir> ( -p <package> | -b binary )..."
}

# main parses the command line and stages the requested packages, then the
# requested binaries, into the staging dir.
#
# Flags:
#   -o <staging-dir>  where to stage (required)
#   -p <package>      package to stage (repeatable)
#   -b <binary>       binary to stage (repeatable)
#   -?                print usage and exit 0
# Exit codes: 2 for a flag without its argument, 3 for an unknown argument,
# 4 when no staging dir was given.
function main() {
  local staging=""
  local -a pkgs=()
  local -a bins=()

  while (( $# > 0 )); do
    case "$1" in
      "-?")
        usage
        exit 0
        ;;
      "-b")
        [[ -n "${2:-}" ]] || {
          echo "error: flag '-b' requires an argument" >&2
          usage >&2
          exit 2
        }
        bins+=("$2")
        ;;
      "-p")
        [[ -n "${2:-}" ]] || {
          echo "error: flag '-p' requires an argument" >&2
          usage >&2
          exit 2
        }
        pkgs+=("$2")
        ;;
      "-o")
        [[ -n "${2:-}" ]] || {
          echo "error: flag '-o' requires an argument" >&2
          usage >&2
          exit 2
        }
        staging="$2"
        ;;
      *)
        echo "error: unknown argument: $1" >&2
        usage >&2
        exit 3
        ;;
    esac
    # every arm that reaches this point consumed a flag and its value
    shift 2
  done

  if [[ -z "${staging}" ]]; then
    usage >&2
    exit 4
  fi

  # packages first, then individually named binaries
  if [[ "${#pkgs[@]}" -gt 0 ]]; then
    stage_packages "${staging}" "${pkgs[@]}"
  fi
  if [[ "${#bins[@]}" -gt 0 ]]; then
    stage_binaries "${staging}" "${bins[@]}"
  fi
}

# Script entry point: pass all command-line arguments through to main.
main "$@"

0 comments on commit f037087

Please sign in to comment.