24 changes: 23 additions & 1 deletion Makefile
@@ -223,6 +223,20 @@ run-e2e-aio: protogen-go
@echo 'Running e2e AIO tests'
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio

# vLLM multi-node DP smoke (CPU). Builds local-ai:tests and the
# cpu-vllm backend from the current working tree, then drives a
# head + headless follower via testcontainers-go and asserts a chat
# completion. BuildKit caches both images, so re-runs only rebuild
# what changed. The test lives under tests/e2e/distributed and is
# selected by the VLLMMultinode label so it doesn't run alongside
# the other distributed-suite tests by default.
test-e2e-vllm-multinode: docker-build-e2e extract-backend-vllm protogen-go
@echo 'Running e2e vLLM multi-node DP test'
LOCALAI_IMAGE=local-ai \
LOCALAI_IMAGE_TAG=tests \
LOCALAI_VLLM_BACKEND_DIR=$(abspath ./local-backends/vllm) \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter='VLLMMultinode' -v -r ./tests/e2e/distributed

########################################################
## E2E tests
########################################################
@@ -306,7 +320,7 @@ local-backends:

extract-backend-%: docker-build-% local-backends
@echo "Extracting backend $*..."
@CID=$$(docker create local-ai-backend:$*) && \
@CID=$$(docker create --entrypoint=/run.sh local-ai-backend:$*) && \
rm -rf local-backends/$* && mkdir -p local-backends/$* && \
docker cp $$CID:/ - | tar -xf - -C local-backends/$* && \
docker rm $$CID > /dev/null
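The `--entrypoint` flag matters here presumably because the backend images are built `FROM scratch` with no default command, so a bare `docker create` is refused by the daemon; the entrypoint is never executed, it only satisfies the requirement that the container have a command. A minimal illustration (hypothetical shell session, error text paraphrased):

```bash
# Without an entrypoint, creating a container from a FROM-scratch backend
# image fails before any filesystem extraction can happen:
docker create local-ai-backend:vllm
#=> Error response from daemon: no command specified

# With --entrypoint pointing at a path inside the image, creation succeeds and
# the rootfs can be streamed out exactly as the Makefile rule does:
CID=$(docker create --entrypoint=/run.sh local-ai-backend:vllm)
docker cp "$CID":/ - | tar -xf - -C ./local-backends/vllm
docker rm "$CID" > /dev/null
```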
Expand Down Expand Up @@ -579,6 +593,14 @@ test-extra-backend-vllm: docker-build-vllm
BACKEND_TEST_OPTIONS=tool_parser:hermes \
$(MAKE) test-extra-backend

## vllm multi-node data-parallel smoke test. Runs LocalAI head + a
## `local-ai p2p-worker vllm` follower in docker compose against
## Qwen2.5-0.5B with data_parallel_size=2. Requires 2 NVIDIA GPUs and
## nvidia-container-runtime on the host — vLLM v1's DP coordinator is
## not viable on CPU so this cannot run in CI without GPU.
test-extra-backend-vllm-multinode:
./tests/e2e/vllm-multinode/smoke.sh
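A quick host sanity check before invoking this target (illustrative commands; the smoke script itself remains the authoritative gate):

```bash
# Confirm at least two NVIDIA GPUs are visible and the nvidia runtime is
# registered with Docker, then run the smoke test.
test "$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)" -ge 2 \
  && docker info --format '{{json .Runtimes}}' | grep -q nvidia \
  && make test-extra-backend-vllm-multinode
```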

## tinygrad mirrors the vllm target (same model, same caps, same parser) so
## the two backends are directly comparable. The LLM path covers Predict,
## streaming and native tool-call extraction. Companion targets below cover
58 changes: 52 additions & 6 deletions backend/python/vllm/install.sh
@@ -18,12 +18,15 @@ else
source $backend_dir/../common/libbackend.sh
fi

# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
# Intel XPU: torch==2.11.0+xpu lives on the PyTorch XPU index, transitive
# deps on PyPI — unsafe-best-match lets uv mix both. vllm-xpu-kernels only
# ships a python3.12 wheel per upstream docs, so bump the portable Python
# before installRequirements (matches the l4t13 pattern below).
# https://github.com/vllm-project/vllm/blob/main/docs/getting_started/installation/gpu.xpu.inc.md
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
PYTHON_VERSION="3.12"
PYTHON_PATCH="11"
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
fi

# CPU builds need unsafe-best-match to pull torch==2.10.0+cpu from the
@@ -56,13 +59,56 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
fi

# Intel XPU has no upstream-published vllm wheels, so we always build vllm
# from source against torch-xpu and replace the default triton with
# triton-xpu (matching torch 2.11). Mirrors the upstream procedure:
# https://github.com/vllm-project/vllm/blob/main/docs/getting_started/installation/gpu.xpu.inc.md
if [ "x${BUILD_TYPE}" == "xintel" ]; then
# Hide requirements-intel-after.txt so installRequirements doesn't
# try `pip install vllm` (would either fail or grab a non-XPU wheel).
_intel_after="${backend_dir}/requirements-intel-after.txt"
_intel_after_bak=""
if [ -f "${_intel_after}" ]; then
_intel_after_bak="${_intel_after}.xpu.bak"
mv "${_intel_after}" "${_intel_after_bak}"
fi
installRequirements
if [ -n "${_intel_after_bak}" ]; then
mv "${_intel_after_bak}" "${_intel_after}"
fi

# vllm's CMake build needs the Intel oneAPI dpcpp/sycl compiler — the
# base image (intel/oneapi-basekit) has it but the env isn't sourced.
if [ -f /opt/intel/oneapi/setvars.sh ]; then
set +u
source /opt/intel/oneapi/setvars.sh --force
set -u
fi

_vllm_src=$(mktemp -d)
trap 'rm -rf "${_vllm_src}"' EXIT
git clone --depth 1 https://github.com/vllm-project/vllm "${_vllm_src}/vllm"
pushd "${_vllm_src}/vllm"
# Install vllm's own runtime deps (torch-xpu, vllm_xpu_kernels,
# pydantic, fastapi, …) from upstream's requirements/xpu.txt — the
# canonical source of truth. Avoids re-pinning everything ourselves.
uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} -r requirements/xpu.txt
# Stock triton (NVIDIA-only) may have come in transitively; replace
# with triton-xpu==3.7.0 which matches torch 2.11.
uv pip uninstall triton triton-xpu 2>/dev/null || true
uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} \
--extra-index-url https://download.pytorch.org/whl/xpu \
triton-xpu==3.7.0
export CMAKE_PREFIX_PATH="$(python -c 'import site; print(site.getsitepackages()[0])'):${CMAKE_PREFIX_PATH:-}"
VLLM_TARGET_DEVICE=xpu uv pip install ${EXTRA_PIP_INSTALL_FLAGS:-} --no-deps .
popd
# FROM_SOURCE=true on a CPU build skips the prebuilt vllm wheel in
# requirements-cpu-after.txt and compiles vllm locally against the host's
# actual CPU. Not used by default because it takes ~30-40 minutes, but
# kept here for hosts where the prebuilt wheel SIGILLs (CPU without the
# required SIMD baseline, e.g. AVX-512 VNNI/BF16). Default CI uses a
# bigger-runner with compatible hardware instead.
if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
elif [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE:-}" == "xtrue" ]; then
# Temporarily hide the prebuilt wheel so installRequirements doesn't
# pull it — the rest of the requirements files (base deps, torch,
# transformers) are still installed normally.
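Once the XPU source-build branch above finishes, a short import check against the backend venv can confirm that torch-xpu, triton-xpu, and the locally built vllm resolve together (illustrative; the venv path is assumed and an XPU device must be visible in the container):

```bash
# Illustrative post-install probe for the Intel XPU build.
"${backend_dir}/venv/bin/python" - <<'PY'
import torch, vllm
print("torch", torch.__version__, "xpu available:", torch.xpu.is_available())
print("vllm", vllm.__version__)
PY
```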
104 changes: 104 additions & 0 deletions backend/python/vllm/package.sh
@@ -45,5 +45,109 @@ copy_with_symlinks() {
copy_with_symlinks libnuma.so.1
copy_with_symlinks libgomp.so.1

# CPU profile only: bundle a g++ toolchain so torch._inductor's
# ISA probe (always run at vllm engine startup, regardless of
# enforce_eager) finds a C++ compiler. The LocalAI runtime image
# is FROM ubuntu:24.04 with a minimal apt list that does not
# include build-essential, and the backend image itself is FROM
# scratch -- so without this, cpu-vllm crashes with
# torch._inductor.exc.InvalidCxxCompiler at first inference
# unless the operator manually sets TORCH_COMPILE_DISABLE=1.
#
# We snapshot every file owned by the toolchain packages, mirroring
# the /usr/... layout into ${BACKEND}/toolchain/ so g++ can find
# cc1plus, headers, libs etc. via GCC_EXEC_PREFIX / CPATH /
# LIBRARY_PATH at runtime (libbackend.sh wires those up). Adds
# ~400 MB to the cpu-vllm image, which is tolerable -- cpu-vllm is
# already a niche profile.
if [ "${BUILD_TYPE:-}" = "" ] && command -v dpkg-query >/dev/null 2>&1; then
TOOLCHAIN_DIR="${CURDIR}/toolchain"
mkdir -p "${TOOLCHAIN_DIR}"
# The unversioned g++/gcc packages on Debian/Ubuntu only ship
# symlinks; the actual binaries live in g++-${VER}/gcc-${VER}.
# Discover the active version so the symlink targets get bundled
# along with their owners.
GCC_VER=$(gcc -dumpversion 2>/dev/null | cut -d. -f1 || true)
# `g++-${VER}` itself is just another symlink layer on Debian/
# Ubuntu — the real binary `x86_64-linux-gnu-g++-${VER}` lives
# in `g++-${VER}-x86-64-linux-gnu` (a separate package pulled in
# as a dependency). Same story for gcc/cpp. Compute the dpkg
# arch-triplet to find the right package name for both amd64 and
# arm64 hosts.
case "$(dpkg --print-architecture 2>/dev/null)" in
amd64) HOST_TRIPLET="x86-64-linux-gnu" ;;
arm64) HOST_TRIPLET="aarch64-linux-gnu" ;;
*) HOST_TRIPLET="" ;;
esac
PKGS=(g++ gcc cpp libstdc++-${GCC_VER}-dev libgcc-${GCC_VER}-dev libc6 libc6-dev binutils binutils-common libbinutils libc-dev-bin linux-libc-dev libcrypt-dev libgomp1 libstdc++6 libgcc-s1 libisl23 libmpc3 libmpfr6 libjansson4 libctf0 libctf-nobfd0 libsframe1)
if [ -n "${GCC_VER}" ]; then
PKGS+=("g++-${GCC_VER}" "gcc-${GCC_VER}" "cpp-${GCC_VER}" "gcc-${GCC_VER}-base")
if [ -n "${HOST_TRIPLET}" ]; then
PKGS+=(
"g++-${GCC_VER}-${HOST_TRIPLET}"
"gcc-${GCC_VER}-${HOST_TRIPLET}"
"cpp-${GCC_VER}-${HOST_TRIPLET}"
"binutils-${HOST_TRIPLET}"
)
fi
fi
for pkg in "${PKGS[@]}"; do
if ! dpkg-query -W "${pkg}" >/dev/null 2>&1; then
continue
fi
# Copy each owned path, preserving symlinks and mode. We
# tolerate dpkg listing directories alongside files.
dpkg -L "${pkg}" | while IFS= read -r path; do
if [ -L "${path}" ] || [ -f "${path}" ]; then
mkdir -p "${TOOLCHAIN_DIR}$(dirname "${path}")"
cp -aP "${path}" "${TOOLCHAIN_DIR}${path}" 2>/dev/null || true
fi
done
done
# Ubuntu's filesystem layout has /lib -> /usr/lib (UsrMerge) and
# /lib64 -> /usr/lib64. ld scripts (e.g. libm.so) hardcode
# `/lib/x86_64-linux-gnu/libm.so.6`; with --sysroot the linker
# looks for that path under the sysroot, which means we need
# the same symlinks under TOOLCHAIN_DIR.
[ -e "${TOOLCHAIN_DIR}/lib" ] || ln -s usr/lib "${TOOLCHAIN_DIR}/lib"
[ -e "${TOOLCHAIN_DIR}/lib64" ] || ln -s usr/lib64 "${TOOLCHAIN_DIR}/lib64"

# Replace the unversioned g++/gcc/cpp symlinks with wrapper
# scripts that pass --sysroot=<toolchain> and -B <gcc-exec-prefix>.
# Without these flags gcc would fall back to its compiled-in
# /usr search and fail to find headers (the runtime image has no
# libc6-dev) or fail to invoke `as`/`ld` (binutils not on PATH at
# /usr/bin). Wrappers self-resolve their location at runtime so
# they work from any BackendsPath.
BIN_DIR="${TOOLCHAIN_DIR}/usr/bin"
if [ -n "${GCC_VER}" ] && [ -n "${HOST_TRIPLET}" ]; then
# HOST_TRIPLET in package names uses dashes ("x86-64-linux-gnu");
# the binary suffix uses underscores in the arch part
# ("x86_64-linux-gnu-g++-13"). Translate.
BIN_TRIPLET=${HOST_TRIPLET//x86-64/x86_64}
for tool in g++ gcc cpp; do
real="${BIN_DIR}/${BIN_TRIPLET}-${tool}-${GCC_VER}"
if [ -x "${real}" ]; then
rm -f "${BIN_DIR}/${tool}" "${BIN_DIR}/${tool}-${GCC_VER}"
cat > "${BIN_DIR}/${tool}" <<EOF
#!/bin/bash
# Auto-generated by package.sh. Passes --sysroot and -B so the
# bundled toolchain works from any BackendsPath without depending
# on libc6-dev / binutils being installed at /usr in the runtime
# image. See backend/python/vllm/package.sh.
DIR="\$(dirname "\$(readlink -f "\$0")")" # …/toolchain/usr/bin
SYSROOT="\$(dirname "\$(dirname "\${DIR}")")" # …/toolchain
exec "\${DIR}/${BIN_TRIPLET}-${tool}-${GCC_VER}" \\
-B "\${SYSROOT}/usr/lib/gcc/${BIN_TRIPLET}/${GCC_VER}/" \\
--sysroot="\${SYSROOT}" \\
"\$@"
EOF
chmod +x "${BIN_DIR}/${tool}"
fi
done
fi
echo "Bundled g++ toolchain (gcc-${GCC_VER}) into ${TOOLCHAIN_DIR} ($(du -sh "${TOOLCHAIN_DIR}" | cut -f1))"
fi

echo "vllm packaging completed successfully"
ls -liah "${LIB_DIR}/"
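For reference, on an amd64 host with gcc-13 the heredoc above produces a `g++` wrapper along these lines (illustrative expansion; the version and triplet are discovered at packaging time):

```bash
#!/bin/bash
# Example expansion of the auto-generated wrapper for gcc-13 / x86_64.
DIR="$(dirname "$(readlink -f "$0")")"        # .../toolchain/usr/bin
SYSROOT="$(dirname "$(dirname "${DIR}")")"    # .../toolchain
exec "${DIR}/x86_64-linux-gnu-g++-13" \
    -B "${SYSROOT}/usr/lib/gcc/x86_64-linux-gnu/13/" \
    --sysroot="${SYSROOT}" \
    "$@"
```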
4 changes: 3 additions & 1 deletion backend/python/vllm/requirements-intel-after.txt
@@ -1 +1,3 @@
vllm
# Intel XPU has no upstream-published vllm wheels — install.sh builds vllm
# from source with VLLM_TARGET_DEVICE=xpu and hides this file during
# installRequirements. Don't add a `vllm` line here.
7 changes: 4 additions & 3 deletions backend/python/vllm/requirements-intel.txt
@@ -1,7 +1,8 @@
--extra-index-url https://download.pytorch.org/whl/xpu
# vllm's own deps (torch==2.11.0+xpu, vllm_xpu_kernels, pydantic, …) are
# installed from upstream's requirements/xpu.txt during the source build —
# see install.sh. Only list what LocalAI's vllm backend.py needs directly.
accelerate
torch
transformers
optimum[openvino]
bitsandbytes
setuptools
bitsandbytes
5 changes: 4 additions & 1 deletion backend/python/vllm/requirements.txt
@@ -1,4 +1,7 @@
grpcio==1.80.0
protobuf
certifi
setuptools
setuptools
pillow
charset-normalizer>=3.4.0
chardet
40 changes: 39 additions & 1 deletion backend/python/vllm/run.sh
@@ -1,4 +1,5 @@
#!/bin/bash
set -x

backend_dir=$(dirname $0)

@@ -8,4 +9,41 @@ else
source $backend_dir/../common/libbackend.sh
fi

startBackend $@
# CPU profile: torch._inductor's ISA-probe (run at vllm engine
# startup, even with enforce_eager=True) shells out to g++. The
# LocalAI runtime image and the FROM-scratch backend image both
# omit a compiler; package.sh bundles one into ${EDIR}/toolchain
# along with wrapper scripts at toolchain/usr/bin that already pass
# --sysroot and -B. So all run.sh has to do is put the wrapper on
# PATH and expose the toolchain's shared libs (libisl, libmpc, libbfd,
# ...) to ld.so. No-op for other profiles -- the dir doesn't exist.
if [ -d "${EDIR}/toolchain/usr/bin" ]; then
export PATH="${EDIR}/toolchain/usr/bin:${PATH}"
_libpath="${EDIR}/toolchain/usr/lib/x86_64-linux-gnu"
export LD_LIBRARY_PATH="${_libpath}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
fi
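With those exports in place, a one-liner can confirm the bundled toolchain resolves headers and binutils through the wrapper's `--sysroot`/`-B` flags (illustrative check, not part of the script):

```bash
# Expect g++ to resolve to the bundled wrapper and a trivial compile+link to
# succeed even though the runtime image has no libc6-dev or binutils.
command -v g++            # -> ${EDIR}/toolchain/usr/bin/g++
printf 'int main(){return 0;}\n' > /tmp/probe.cc \
  && g++ /tmp/probe.cc -o /tmp/probe && /tmp/probe && echo "toolchain OK"
```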

# Multi-node DP follower mode: when the first arg is `serve`, exec into
# vllm's own CLI instead of LocalAI's backend.py gRPC server. The
# follower speaks ZMQ directly to the head node's vllm ranks — there
# is no LocalAI gRPC on the follower side. Reaches this path via
# `local-ai p2p-worker vllm`.
if [ "${1:-}" = "serve" ]; then
ensureVenv
if [ "x${PORTABLE_PYTHON}" == "xtrue" ] || [ -x "$(_portable_python)" ]; then
_makeVenvPortable --update-pyvenv-cfg
fi
if [ -d "${EDIR}/lib" ]; then
export LD_LIBRARY_PATH="${EDIR}/lib:${LD_LIBRARY_PATH:-}"
fi
# Run the vllm console script through the venv python rather than
# exec-ing it directly. uv bakes an absolute shebang at install time
# (e.g. `#!/vllm/venv/bin/python3` from the build image) which
# doesn't exist once the backend is relocated to BackendsPath, and
# _makeVenvPortable's shebang rewriter only matches paths that
# already point at ${EDIR}. Invoking python with the script as an
# argument bypasses the shebang entirely.
exec "${EDIR}/venv/bin/python" "${EDIR}/venv/bin/vllm" "$@"
fi

startBackend $@
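The stale-shebang problem described above is easy to reproduce on a relocated backend (hypothetical paths; the build-image prefix varies):

```bash
# The console script keeps the absolute interpreter path baked in by uv in the
# build image, which no longer exists after the move to BackendsPath:
head -1 "${EDIR}/venv/bin/vllm"
#=> #!/vllm/venv/bin/python3        (dangling interpreter)
"${EDIR}/venv/bin/vllm" --help      # fails: bad interpreter
# Invoking it through the relocated venv's python bypasses the shebang:
"${EDIR}/venv/bin/python" "${EDIR}/venv/bin/vllm" --help
```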
20 changes: 20 additions & 0 deletions core/cli/worker/labels.go
@@ -0,0 +1,20 @@
package worker

import "strings"

// ParseNodeLabels parses a comma-separated `k=v,k=v` string into a map.
// Whitespace around keys, values, and pairs is trimmed; pairs without
// `=` are skipped silently.
func ParseNodeLabels(input string) map[string]string {
labels := make(map[string]string)
if input == "" {
return labels
}
for _, pair := range strings.Split(input, ",") {
pair = strings.TrimSpace(pair)
if k, v, ok := strings.Cut(pair, "="); ok {
labels[strings.TrimSpace(k)] = strings.TrimSpace(v)
}
}
return labels
}
9 changes: 5 additions & 4 deletions core/cli/worker/worker.go
@@ -8,8 +8,9 @@ type WorkerFlags struct {
}

type Worker struct {
P2P P2P `cmd:"" name:"p2p-llama-cpp-rpc" help:"Starts a LocalAI llama.cpp worker in P2P mode (requires a token)"`
P2PMLX P2PMLX `cmd:"" name:"p2p-mlx" help:"Starts a LocalAI MLX distributed worker in P2P mode (requires a token)"`
LLamaCPP LLamaCPP `cmd:"" name:"llama-cpp-rpc" help:"Starts a llama.cpp worker in standalone mode"`
MLXDistributed MLXDistributed `cmd:"" name:"mlx-distributed" help:"Starts an MLX distributed worker in standalone mode (requires --hostfile and --rank)"`
P2P P2P `cmd:"" name:"p2p-llama-cpp-rpc" help:"Starts a LocalAI llama.cpp worker in P2P mode (requires a token)"`
P2PMLX P2PMLX `cmd:"" name:"p2p-mlx" help:"Starts a LocalAI MLX distributed worker in P2P mode (requires a token)"`
LLamaCPP LLamaCPP `cmd:"" name:"llama-cpp-rpc" help:"Starts a llama.cpp worker in standalone mode"`
MLXDistributed MLXDistributed `cmd:"" name:"mlx-distributed" help:"Starts an MLX distributed worker in standalone mode (requires --hostfile and --rank)"`
VLLMDistributed VLLMDistributed `cmd:"" name:"vllm" help:"Starts a vLLM data-parallel follower process. Multi-node DP for a single model: head runs the existing vllm backend with engine_args.data_parallel_size>1, followers run this command."`
}