24 changes: 23 additions & 1 deletion Makefile
@@ -223,6 +223,20 @@ run-e2e-aio: protogen-go
@echo 'Running e2e AIO tests'
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio

# vLLM multi-node DP smoke (CPU). Builds local-ai:tests and the
# cpu-vllm backend from the current working tree, then drives a
# head + headless follower via testcontainers-go and asserts a chat
# completion. BuildKit caches both images, so re-runs only rebuild
# what changed. The test lives under tests/e2e/distributed and is
# selected by the VLLMMultinode label so it doesn't run alongside
# the other distributed-suite tests by default.
test-e2e-vllm-multinode: docker-build-e2e extract-backend-vllm protogen-go
@echo 'Running e2e vLLM multi-node DP test'
LOCALAI_IMAGE=local-ai \
LOCALAI_IMAGE_TAG=tests \
LOCALAI_VLLM_BACKEND_DIR=$(abspath ./local-backends/vllm) \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter='VLLMMultinode' -v -r ./tests/e2e/distributed

########################################################
## E2E tests
########################################################
@@ -306,7 +320,7 @@ local-backends:

extract-backend-%: docker-build-% local-backends
@echo "Extracting backend $*..."
@CID=$$(docker create local-ai-backend:$*) && \
@CID=$$(docker create --entrypoint=/run.sh local-ai-backend:$*) && \
rm -rf local-backends/$* && mkdir -p local-backends/$* && \
docker cp $$CID:/ - | tar -xf - -C local-backends/$* && \
docker rm $$CID > /dev/null
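The `--entrypoint` flag matters here presumably because the backend images are built `FROM scratch` with no default command, so a bare `docker create` is refused by the daemon; the entrypoint is never executed, it only satisfies the requirement that the container have a command. A minimal illustration (hypothetical shell session, error text paraphrased):

```bash
# Without an entrypoint, creating a container from a FROM-scratch backend
# image fails before any filesystem extraction can happen:
docker create local-ai-backend:vllm
#=> Error response from daemon: no command specified

# With --entrypoint pointing at a path inside the image, creation succeeds and
# the rootfs can be streamed out exactly as the Makefile rule does:
CID=$(docker create --entrypoint=/run.sh local-ai-backend:vllm)
docker cp "$CID":/ - | tar -xf - -C ./local-backends/vllm
docker rm "$CID" > /dev/null
```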
Expand Down Expand Up @@ -579,6 +593,14 @@ test-extra-backend-vllm: docker-build-vllm
BACKEND_TEST_OPTIONS=tool_parser:hermes \
$(MAKE) test-extra-backend

## vllm multi-node data-parallel smoke test. Runs LocalAI head + a
## `local-ai p2p-worker vllm` follower in docker compose against
## Qwen2.5-0.5B with data_parallel_size=2. Requires 2 NVIDIA GPUs and
## nvidia-container-runtime on the host — vLLM v1's DP coordinator is
## not viable on CPU so this cannot run in CI without GPU.
test-extra-backend-vllm-multinode:
./tests/e2e/vllm-multinode/smoke.sh
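A quick host sanity check before invoking this target (illustrative commands; the smoke script itself remains the authoritative gate):

```bash
# Confirm at least two NVIDIA GPUs are visible and the nvidia runtime is
# registered with Docker, then run the smoke test.
test "$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)" -ge 2 \
  && docker info --format '{{json .Runtimes}}' | grep -q nvidia \
  && make test-extra-backend-vllm-multinode
```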

## tinygrad mirrors the vllm target (same model, same caps, same parser) so
## the two backends are directly comparable. The LLM path covers Predict,
## streaming and native tool-call extraction. Companion targets below cover
58 changes: 52 additions & 6 deletions backend/python/vllm/install.sh
@@ -18,12 +18,15 @@ else
source $backend_dir/../common/libbackend.sh
fi

# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
# Intel XPU: torch==2.11.0+xpu lives on the PyTorch XPU index, transitive
# deps on PyPI — unsafe-best-match lets uv mix both. vllm-xpu-kernels only
# ships a python3.12 wheel per upstream docs, so bump the portable Python
# before installRequirements (matches the l4t13 pattern below).
# https://github.com/vllm-project/vllm/blob/main/docs/getting_started/installation/gpu.xpu.inc.md
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
PYTHON_VERSION="3.12"
PYTHON_PATCH="11"
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
fi

# CPU builds need unsafe-best-match to pull torch==2.10.0+cpu from the
@@ -56,13 +59,56 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
fi

# Intel XPU has no upstream-published vllm wheels, so we always build vllm
# from source against torch-xpu and replace the default triton with
# triton-xpu (matching torch 2.11). Mirrors the upstream procedure:
# https://github.com/vllm-project/vllm/blob/main/docs/getting_started/installation/gpu.xpu.inc.md
if [ "x${BUILD_TYPE}" == "xintel" ]; then
# Hide requirements-intel-after.txt so installRequirements doesn't
# try `pip install vllm` (would either fail or grab a non-XPU wheel).
_intel_after="${backend_dir}/requirements-intel-after.txt"
_intel_after_bak=""
if [ -f "${_intel_after}" ]; then
_intel_after_bak="${_intel_after}.xpu.bak"
mv "${_intel_after}" "${_intel_after_bak}"
fi
installRequirements
if [ -n "${_intel_after_bak}" ]; then
mv "${_intel_after_bak}" "${_intel_after}"
fi

# vllm's CMake build needs the Intel oneAPI dpcpp/sycl compiler — the
# base image (intel/oneapi-basekit) has it but the env isn't sourced.
if [ -f /opt/intel/oneapi/setvars.sh ]; then
set +u
source /opt/intel/oneapi/setvars.sh --force
set -u
fi

_vllm_src=$(mktemp -d)
trap 'rm -rf "${_vllm_src}"' EXIT
git clone --depth 1 https://github.com/vllm-project/vllm "${_vllm_src}/vllm"
pushd "${_vllm_src}/vllm"
# Install vllm's own runtime deps (torch-xpu, vllm_xpu_kernels,
# pydantic, fastapi, …) from upstream's requirements/xpu.txt — the
# canonical source of truth. Avoids re-pinning everything ourselves.
uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} -r requirements/xpu.txt
# Stock triton (NVIDIA-only) may have come in transitively; replace
# with triton-xpu==3.7.0 which matches torch 2.11.
uv pip uninstall triton triton-xpu 2>/dev/null || true
uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} \
--extra-index-url https://download.pytorch.org/whl/xpu \
triton-xpu==3.7.0
export CMAKE_PREFIX_PATH="$(python -c 'import site; print(site.getsitepackages()[0])'):${CMAKE_PREFIX_PATH:-}"
VLLM_TARGET_DEVICE=xpu uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps .
popd
# FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in
# requirements-cpu-after.txt and compiles vllm locally against the host's
# actual CPU. Not used by default because it takes ~30-40 minutes, but
# kept here for hosts where the prebuilt wheel SIGILLs (CPU without the
# required SIMD baseline, e.g. AVX-512 VNNI/BF16). Default CI uses a
# bigger-runner with compatible hardware instead.
if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
elif [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
# Temporarily hide the prebuilt wheel so installRequirements doesn't
# pull it — the rest of the requirements files (base deps, torch,
# transformers) are still installed normally.
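Once the XPU source-build branch above finishes, a short import check against the backend venv can confirm that torch-xpu, triton-xpu, and the locally built vllm resolve together (illustrative; the venv path is assumed and an XPU device must be visible in the container):

```bash
# Illustrative post-install probe for the Intel XPU build.
"${backend_dir}/venv/bin/python" - <<'PY'
import torch, vllm
print("torch", torch.__version__, "xpu available:", torch.xpu.is_available())
print("vllm", vllm.__version__)
PY
```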
104 changes: 104 additions & 0 deletions backend/python/vllm/package.sh
@@ -45,5 +45,109 @@ copy_with_symlinks() {
copy_with_symlinks libnuma.so.1
copy_with_symlinks libgomp.so.1

# CPU profile only: bundle a g++ toolchain so torch._inductor's
# ISA probe (always run at vllm engine startup, regardless of
# enforce_eager) finds a C++ compiler. The LocalAI runtime image
# is FROM ubuntu:24.04 with a minimal apt list that does not
# include build-essential, and the backend image itself is FROM
# scratch -- so without this, cpu-vllm crashes with
# torch._inductor.exc.InvalidCxxCompiler at first inference
# unless the operator manually sets TORCH_COMPILE_DISABLE=1.
#
# We snapshot every file owned by the toolchain packages, mirroring
# the /usr/... layout into ${BACKEND}/toolchain/ so g++ can find
# cc1plus, headers, libs etc. via GCC_EXEC_PREFIX / CPATH /
# LIBRARY_PATH at runtime (libbackend.sh wires those up). Adds
# ~400 MB to the cpu-vllm image, which is tolerable -- cpu-vllm is
# already a niche profile.
if [ "${BUILD_TYPE:-}" = "" ] && command -v dpkg-query >/dev/null 2>&1; then
TOOLCHAIN_DIR="${CURDIR}/toolchain"
mkdir -p "${TOOLCHAIN_DIR}"
# The unversioned g++/gcc packages on Debian/Ubuntu only ship
# symlinks; the actual binaries live in g++-${VER}/gcc-${VER}.
# Discover the active version so the symlink targets get bundled
# along with their owners.
GCC_VER=$(gcc -dumpversion 2>/dev/null | cut -d. -f1 || true)
# `g++-${VER}` itself is just another symlink layer on Debian/
# Ubuntu — the real binary `x86_64-linux-gnu-g++-${VER}` lives
# in `g++-${VER}-x86-64-linux-gnu` (a separate package pulled in
# as a dependency). Same story for gcc/cpp. Compute the dpkg
# arch-triplet to find the right package name for both amd64 and
# arm64 hosts.
case "$(dpkg --print-architecture 2>/dev/null)" in
amd64) HOST_TRIPLET="x86-64-linux-gnu" ;;
arm64) HOST_TRIPLET="aarch64-linux-gnu" ;;
*) HOST_TRIPLET="" ;;
esac
PKGS=(g++ gcc cpp libstdc++-${GCC_VER}-dev libgcc-${GCC_VER}-dev libc6 libc6-dev binutils binutils-common libbinutils libc-dev-bin linux-libc-dev libcrypt-dev libgomp1 libstdc++6 libgcc-s1 libisl23 libmpc3 libmpfr6 libjansson4 libctf0 libctf-nobfd0 libsframe1)
if [ -n "${GCC_VER}" ]; then
PKGS+=("g++-${GCC_VER}" "gcc-${GCC_VER}" "cpp-${GCC_VER}" "gcc-${GCC_VER}-base")
if [ -n "${HOST_TRIPLET}" ]; then
PKGS+=(
"g++-${GCC_VER}-${HOST_TRIPLET}"
"gcc-${GCC_VER}-${HOST_TRIPLET}"
"cpp-${GCC_VER}-${HOST_TRIPLET}"
"binutils-${HOST_TRIPLET}"
)
fi
fi
for pkg in "${PKGS[@]}"; do
if ! dpkg-query -W "${pkg}" >/dev/null 2>&1; then
continue
fi
# Copy each owned path, preserving symlinks and mode. We
# tolerate dpkg listing directories alongside files.
dpkg -L "${pkg}" | while IFS= read -r path; do
if [ -L "${path}" ] || [ -f "${path}" ]; then
mkdir -p "${TOOLCHAIN_DIR}$(dirname "${path}")"
cp -aP "${path}" "${TOOLCHAIN_DIR}${path}" 2>/dev/null || true
fi
done
done
# Ubuntu's filesystem layout has /lib -> /usr/lib (UsrMerge) and
# /lib64 -> /usr/lib64. ld scripts (e.g. libm.so) hardcode
# `/lib/x86_64-linux-gnu/libm.so.6`; with --sysroot the linker
# looks for that path under the sysroot, which means we need
# the same symlinks under TOOLCHAIN_DIR.
[ -e "${TOOLCHAIN_DIR}/lib" ] || ln -s usr/lib "${TOOLCHAIN_DIR}/lib"
[ -e "${TOOLCHAIN_DIR}/lib64" ] || ln -s usr/lib64 "${TOOLCHAIN_DIR}/lib64"

# Replace the unversioned g++/gcc/cpp symlinks with wrapper
# scripts that pass --sysroot=<toolchain> and -B <gcc-exec-prefix>.
# Without these flags gcc would fall back to its compiled-in
# /usr search and fail to find headers (the runtime image has no
# libc6-dev) or fail to invoke `as`/`ld` (binutils not on PATH at
# /usr/bin). Wrappers self-resolve their location at runtime so
# they work from any BackendsPath.
BIN_DIR="${TOOLCHAIN_DIR}/usr/bin"
if [ -n "${GCC_VER}" ] && [ -n "${HOST_TRIPLET}" ]; then
# HOST_TRIPLET in package names uses dashes ("x86-64-linux-gnu");
# the binary suffix uses underscores in the arch part
# ("x86_64-linux-gnu-g++-13"). Translate.
BIN_TRIPLET=${HOST_TRIPLET//x86-64/x86_64}
for tool in g++ gcc cpp; do
real="${BIN_DIR}/${BIN_TRIPLET}-${tool}-${GCC_VER}"
if [ -x "${real}" ]; then
rm -f "${BIN_DIR}/${tool}" "${BIN_DIR}/${tool}-${GCC_VER}"
cat > "${BIN_DIR}/${tool}" <<EOF
#!/bin/bash
# Auto-generated by package.sh. Passes --sysroot and -B so the
# bundled toolchain works from any BackendsPath without depending
# on libc6-dev / binutils being installed at /usr in the runtime
# image. See backend/python/vllm/package.sh.
DIR="\$(dirname "\$(readlink -f "\$0")")" # …/toolchain/usr/bin
SYSROOT="\$(dirname "\$(dirname "\${DIR}")")" # …/toolchain
exec "\${DIR}/${BIN_TRIPLET}-${tool}-${GCC_VER}" \\
-B "\${SYSROOT}/usr/lib/gcc/${BIN_TRIPLET}/${GCC_VER}/" \\
--sysroot="\${SYSROOT}" \\
"\$@"
EOF
chmod +x "${BIN_DIR}/${tool}"
fi
done
fi
echo "Bundled g++ toolchain (gcc-${GCC_VER}) into ${TOOLCHAIN_DIR} ($(du -sh "${TOOLCHAIN_DIR}" | cut -f1))"
fi

echo "vllm packaging completed successfully"
ls -liah "${LIB_DIR}/"
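For reference, on an amd64 host with gcc-13 the heredoc above produces a `g++` wrapper along these lines (illustrative expansion; the version and triplet are discovered at packaging time):

```bash
#!/bin/bash
# Example expansion of the auto-generated wrapper for gcc-13 / x86_64.
DIR="$(dirname "$(readlink -f "$0")")"        # .../toolchain/usr/bin
SYSROOT="$(dirname "$(dirname "${DIR}")")"    # .../toolchain
exec "${DIR}/x86_64-linux-gnu-g++-13" \
    -B "${SYSROOT}/usr/lib/gcc/x86_64-linux-gnu/13/" \
    --sysroot="${SYSROOT}" \
    "$@"
```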
4 changes: 3 additions & 1 deletion backend/python/vllm/requirements-intel-after.txt
@@ -1 +1,3 @@
vllm
# Intel XPU has no upstream-published vllm wheels — install.sh builds vllm
# from source with VLLM_TARGET_DEVICE=xpu and hides this file during
# installRequirements. Don't add a `vllm` line here.
7 changes: 4 additions & 3 deletions backend/python/vllm/requirements-intel.txt
@@ -1,7 +1,8 @@
--extra-index-url https://download.pytorch.org/whl/xpu
# vllm's own deps (torch==2.11.0+xpu, vllm_xpu_kernels, pydantic, …) are
# installed from upstream's requirements/xpu.txt during the source build —
# see install.sh. Only list what LocalAI's vllm backend.py needs directly.
accelerate
torch
transformers
optimum[openvino]
bitsandbytes
setuptools
bitsandbytes
5 changes: 4 additions & 1 deletion backend/python/vllm/requirements.txt
@@ -1,4 +1,7 @@
grpcio==1.80.0
protobuf
certifi
setuptools
setuptools
pillow
charset-normalizer>=3.4.0
chardet
40 changes: 39 additions & 1 deletion backend/python/vllm/run.sh
@@ -1,4 +1,5 @@
#!/bin/bash
set -x

backend_dir=$(dirname $0)

@@ -8,4 +9,41 @@ else
source $backend_dir/../common/libbackend.sh
fi

startBackend $@
# CPU profile: torch._inductor's ISA-probe (run at vllm engine
# startup, even with enforce_eager=True) shells out to g++. The
# LocalAI runtime image and the FROM-scratch backend image both
# omit a compiler; package.sh bundles one into ${EDIR}/toolchain
# along with wrapper scripts at toolchain/usr/bin that already pass
# --sysroot and -B. So all run.sh has to do is put the wrapper on
# PATH and expose the toolchain's shared libs (libisl, libmpc, libbfd,
# ...) to ld.so. No-op for other profiles -- the dir doesn't exist.
if [ -d "${EDIR}/toolchain/usr/bin" ]; then
export PATH="${EDIR}/toolchain/usr/bin:${PATH}"
_libpath="${EDIR}/toolchain/usr/lib/x86_64-linux-gnu"
export LD_LIBRARY_PATH="${_libpath}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
fi
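With those exports in place, a one-liner can confirm the bundled toolchain resolves headers and binutils through the wrapper's `--sysroot`/`-B` flags (illustrative check, not part of the script):

```bash
# Expect g++ to resolve to the bundled wrapper and a trivial compile+link to
# succeed even though the runtime image has no libc6-dev or binutils.
command -v g++            # -> ${EDIR}/toolchain/usr/bin/g++
printf 'int main(){return 0;}\n' > /tmp/probe.cc \
  && g++ /tmp/probe.cc -o /tmp/probe && /tmp/probe && echo "toolchain OK"
```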

# Multi-node DP follower mode: when the first arg is `serve`, exec into
# vllm's own CLI instead of LocalAI's backend.py gRPC server. The
# follower speaks ZMQ directly to the head node's vllm ranks — there
# is no LocalAI gRPC on the follower side. Reaches this path via
# `local-ai p2p-worker vllm`.
if [ "${1:-}" = "serve" ]; then
ensureVenv
if [ "x${PORTABLE_PYTHON}" == "xtrue" ] || [ -x "$(_portable_python)" ]; then
_makeVenvPortable --update-pyvenv-cfg
fi
if [ -d "${EDIR}/lib" ]; then
export LD_LIBRARY_PATH="${EDIR}/lib:${LD_LIBRARY_PATH:-}"
fi
# Run the vllm console script through the venv python rather than
# exec-ing it directly. uv bakes an absolute shebang at install time
# (e.g. `#!/vllm/venv/bin/python3` from the build image) which
# doesn't exist once the backend is relocated to BackendsPath, and
# _makeVenvPortable's shebang rewriter only matches paths that
# already point at ${EDIR}. Invoking python with the script as an
# argument bypasses the shebang entirely.
exec "${EDIR}/venv/bin/python" "${EDIR}/venv/bin/vllm" "$@"
fi

startBackend $@
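The stale-shebang problem described above is easy to reproduce on a relocated backend (hypothetical paths; the build-image prefix varies):

```bash
# The console script keeps the absolute interpreter path baked in by uv in the
# build image, which no longer exists after the move to BackendsPath:
head -1 "${EDIR}/venv/bin/vllm"
#=> #!/vllm/venv/bin/python3        (dangling interpreter)
"${EDIR}/venv/bin/vllm" --help      # fails: bad interpreter
# Invoking it through the relocated venv's python bypasses the shebang:
"${EDIR}/venv/bin/python" "${EDIR}/venv/bin/vllm" --help
```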
20 changes: 20 additions & 0 deletions core/cli/worker/labels.go
@@ -0,0 +1,20 @@
package worker

import "strings"

// ParseNodeLabels parses a comma-separated `k=v,k=v` string into a map.
// Whitespace around keys, values, and pairs is trimmed; pairs without
// `=` are skipped silently.
func ParseNodeLabels(input string) map[string]string {
labels := make(map[string]string)
if input == "" {
return labels
}
for _, pair := range strings.Split(input, ",") {
pair = strings.TrimSpace(pair)
if k, v, ok := strings.Cut(pair, "="); ok {
labels[strings.TrimSpace(k)] = strings.TrimSpace(v)
}
}
return labels
}
9 changes: 5 additions & 4 deletions core/cli/worker/worker.go
@@ -8,8 +8,9 @@ type WorkerFlags struct {
}

type Worker struct {
P2P P2P `cmd:"" name:"p2p-llama-cpp-rpc" help:"Starts a LocalAI llama.cpp worker in P2P mode (requires a token)"`
P2PMLX P2PMLX `cmd:"" name:"p2p-mlx" help:"Starts a LocalAI MLX distributed worker in P2P mode (requires a token)"`
LLamaCPP LLamaCPP `cmd:"" name:"llama-cpp-rpc" help:"Starts a llama.cpp worker in standalone mode"`
MLXDistributed MLXDistributed `cmd:"" name:"mlx-distributed" help:"Starts an MLX distributed worker in standalone mode (requires --hostfile and --rank)"`
P2P P2P `cmd:"" name:"p2p-llama-cpp-rpc" help:"Starts a LocalAI llama.cpp worker in P2P mode (requires a token)"`
P2PMLX P2PMLX `cmd:"" name:"p2p-mlx" help:"Starts a LocalAI MLX distributed worker in P2P mode (requires a token)"`
LLamaCPP LLamaCPP `cmd:"" name:"llama-cpp-rpc" help:"Starts a llama.cpp worker in standalone mode"`
MLXDistributed MLXDistributed `cmd:"" name:"mlx-distributed" help:"Starts an MLX distributed worker in standalone mode (requires --hostfile and --rank)"`
VLLMDistributed VLLMDistributed `cmd:"" name:"vllm" help:"Starts a vLLM data-parallel follower process. Multi-node DP for a single model: head runs the existing vllm backend with engine_args.data_parallel_size>1, followers run this command."`
}