diff --git a/Dockerfile b/Dockerfile index 7fea4186e..a971b13ae 100644 --- a/Dockerfile +++ b/Dockerfile @@ -52,6 +52,30 @@ RUN ctest --test-dir /build --build-config ${SOURCEMETA_ONE_BUILD_TYPE} \ FROM debian:trixie-slim +# Install `gosu` so the runtime entrypoint can drop from root to the +# unprivileged service account. We deliberately don't set `USER` on +# this image so build-time `RUN` instructions (including those in +# downstream consumer Dockerfiles) keep executing as root, matching +# the well-trodden Postgres/Redis/MySQL pattern. +# +# `libcap2-bin` provides `setcap`, used below to grant the server +# binary `CAP_NET_BIND_SERVICE` so the dropped service account can +# bind privileged ports (below 1024) if the operator chooses to run +# without a separate TLS terminator in front of the image. +RUN apt-get --yes update \ + && apt-get install --yes --no-install-recommends gosu libcap2-bin \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +# Create the unprivileged service account that the entrypoint will +# `gosu` into. The UID sits above the typical 1000-range dev user +# (so host-mounted volumes can be mapped unambiguously) and outside +# the Debian system-user range (1-999) reserved for daemons. 
+ARG SOURCEMETA_ONE_UID=10001 +RUN groupadd --system --gid "${SOURCEMETA_ONE_UID}" sourcemeta \ + && useradd --system --uid "${SOURCEMETA_ONE_UID}" \ + --gid "${SOURCEMETA_ONE_UID}" \ + --no-create-home --shell /usr/sbin/nologin sourcemeta + # See https://github.com/opencontainers/image-spec/blob/main/annotations.md#pre-defined-annotation-keys LABEL org.opencontainers.image.url="https://one.sourcemeta.com" LABEL org.opencontainers.image.documentation="https://one.sourcemeta.com" @@ -77,6 +101,13 @@ COPY --from=builder /usr/share/sourcemeta/one \ RUN ldd /usr/bin/sourcemeta-one-index RUN ldd /usr/bin/sourcemeta-one-server +# File capabilities are filesystem extended attributes that do not +# survive a cross-stage `COPY --from=builder`, so the capability has +# to be applied here in the runtime stage after the binary lands. +# `+ep` puts the capability in both the permitted and effective sets +# of the process at exec time. +RUN setcap cap_net_bind_service+ep /usr/bin/sourcemeta-one-server + # We expect images that extend this one to use this directory ARG SOURCEMETA_ONE_WORKDIR=/source ENV SOURCEMETA_ONE_WORKDIR=${SOURCEMETA_ONE_WORKDIR} @@ -90,6 +121,20 @@ COPY docker/wrapper-index.sh /usr/bin/sourcemeta COPY docker/wrapper-server.sh /usr/bin/sourcemeta-server COPY docker/transaction-overlayfs.sh /usr/bin/sourcemeta-transaction-overlayfs +# Pre-create the output directory and hand it to the service account +# so the runtime server can read through it (and the optional +# transactional re-index path can write through it) without +# elevation. The chown is non-recursive to keep blast-radius bounded +# if a build-arg override ever pointed the path at the rootfs root, +# and we normalize via `realpath` so values like `/..`, `/foo/..`, or +# symlinks that resolve to `/` are rejected too. 
+RUN test -n "${SOURCEMETA_ONE_OUTPUT}" \ + && mkdir -p "${SOURCEMETA_ONE_OUTPUT}" \ + && resolved_output="$(realpath "${SOURCEMETA_ONE_OUTPUT}")" \ + && test "${resolved_output}" != "/" \ + && chown sourcemeta:sourcemeta "${resolved_output}" \ + && chmod 775 "${resolved_output}" + ENV SOURCEMETA_ONE_PORT=8000 HEALTHCHECK --interval=1s --timeout=2s --start-period=1s --retries=10 CMD grep -qE \ "^\s*[0-9]+:\s+[0-9A-F]+:$(printf '%04X' $SOURCEMETA_ONE_PORT)\s+[0-9A-F:]+\s+0A\s" \ diff --git a/docker/wrapper-server.sh b/docker/wrapper-server.sh index 2e954b83d..21d69834f 100755 --- a/docker/wrapper-server.sh +++ b/docker/wrapper-server.sh @@ -3,7 +3,41 @@ set -o errexit set -o nounset -# For better shell expansion in the Dockerfile +# For better shell expansion in the Dockerfile. +# +# When the entrypoint runs as root (the image's default execution +# identity, kept that way so build-time `RUN` instructions in consumer +# Dockerfiles are unsurprising), `gosu` drops privileges to the +# unprivileged service account before the long-running server starts. +# When the entrypoint is already running as a non-root account (e.g. +# `docker run --user 1234`, or an orchestrator like OpenShift that +# assigns an arbitrary UID via its restricted SCC), `gosu` would fail +# because privilege drop requires root, so we just exec the server +# directly and trust the caller's choice. Either path uses `exec` +# (and `gosu` itself uses `execve(2)`), so SIGTERM and friends reach +# the server process directly without an intermediate shell. +# +# The output directory is mode 775 owned by the `sourcemeta` group at +# build time, so the default server (which only reads from it) works +# at any UID. Operators that need write access from an arbitrary UID +# (e.g. 
the optional transactional re-index path) should run as +# `--user <uid>:<gid>` (the GID matches the +# `SOURCEMETA_ONE_UID` build arg, default `10001`) so the process +# inherits the `sourcemeta` group and picks up the directory's +# group-write bit, or bind-mount their own writable volume over the +# output path. +# +# Binding `SOURCEMETA_ONE_PORT` to a privileged port (below 1024) +# works even after the privilege drop because the server binary +# carries `CAP_NET_BIND_SERVICE` as a file capability, granted at +# image build time. + +if [ "$(id -u)" -eq 0 ] +then + exec /usr/sbin/gosu sourcemeta /usr/bin/sourcemeta-one-server \ + "$SOURCEMETA_ONE_OUTPUT" \ + "$SOURCEMETA_ONE_PORT" +fi exec /usr/bin/sourcemeta-one-server \ "$SOURCEMETA_ONE_OUTPUT" \ diff --git a/enterprise/Dockerfile b/enterprise/Dockerfile index 42dfbea93..e5602e57e 100644 --- a/enterprise/Dockerfile +++ b/enterprise/Dockerfile @@ -76,11 +76,33 @@ RUN mkdir -p /usr/share/sourcemeta/one \ FROM debian:trixie-slim +# `gosu` lets the runtime entrypoint drop from root to the +# unprivileged service account. We deliberately don't set `USER` on +# this image so build-time `RUN` instructions (including those in +# downstream consumer Dockerfiles) keep executing as root, matching +# the well-trodden Postgres/Redis/MySQL pattern. +# +# `libcap2-bin` provides `setcap`, used below to grant the server +# binary `CAP_NET_BIND_SERVICE` so the dropped service account can +# bind privileged ports (below 1024) if the operator chooses to run +# without a separate TLS terminator in front of the image. RUN apt-get --yes update && apt-get install --yes --no-install-recommends \ - openssl-provider-fips \ + gosu libcap2-bin openssl-provider-fips \ && apt-get clean && rm -rf /var/lib/apt/lists/* COPY --from=builder /etc/ssl/openssl.cnf /etc/ssl/openssl.cnf +# Create the unprivileged service account before the package purge +# below removes `passwd` (and with it `useradd`). 
The account lives +# in `/etc/passwd` after this point and survives the purge. The UID +# sits above the typical 1000-range dev user (so host-mounted volumes +# can be mapped unambiguously) and outside the Debian system-user +# range (1-999) reserved for daemons. +ARG SOURCEMETA_ONE_UID=10001 +RUN groupadd --system --gid "${SOURCEMETA_ONE_UID}" sourcemeta \ + && useradd --system --uid "${SOURCEMETA_ONE_UID}" \ + --gid "${SOURCEMETA_ONE_UID}" \ + --no-create-home --shell /usr/sbin/nologin sourcemeta + # Commercial editions require a paid license # See https://one.sourcemeta.com/commercial/ @@ -109,6 +131,13 @@ COPY --from=builder /usr/share/sourcemeta/one \ RUN ldd /usr/bin/sourcemeta-one-index RUN ldd /usr/bin/sourcemeta-one-server +# File capabilities are filesystem extended attributes that do not +# survive a cross-stage `COPY --from=builder`, so the capability has +# to be applied here in the runtime stage after the binary lands. +# `+ep` puts the capability in both the permitted and effective sets +# of the process at exec time. +RUN setcap cap_net_bind_service+ep /usr/bin/sourcemeta-one-server + # Verify that the index binary uses system OpenSSL for cryptography RUN ldd /usr/bin/sourcemeta-one-index | grep libcrypto # Verify that the OpenSSL FIPS provider is configured and present @@ -142,6 +171,20 @@ COPY docker/wrapper-index.sh /usr/bin/sourcemeta COPY docker/wrapper-server.sh /usr/bin/sourcemeta-server COPY docker/transaction-overlayfs.sh /usr/bin/sourcemeta-transaction-overlayfs +# Pre-create the output directory and hand it to the service account +# so the runtime server can read through it (and the optional +# transactional re-index path can write through it) without +# elevation. The chown is non-recursive to keep blast-radius bounded +# if a build-arg override ever pointed the path at the rootfs root, +# and we normalize via `realpath` so values like `/..`, `/foo/..`, or +# symlinks that resolve to `/` are rejected too. 
+RUN test -n "${SOURCEMETA_ONE_OUTPUT}" \ + && mkdir -p "${SOURCEMETA_ONE_OUTPUT}" \ + && resolved_output="$(realpath "${SOURCEMETA_ONE_OUTPUT}")" \ + && test "${resolved_output}" != "/" \ + && chown sourcemeta:sourcemeta "${resolved_output}" \ + && chmod 775 "${resolved_output}" + ENV SOURCEMETA_ONE_PORT=8000 HEALTHCHECK --interval=1s --timeout=2s --start-period=1s --retries=10 CMD grep -qE \ "^\s*[0-9]+:\s+[0-9A-F]+:$(printf '%04X' $SOURCEMETA_ONE_PORT)\s+[0-9A-F:]+\s+0A\s" \