From ddb6eeac10991f0e072f6e08dd9bb28fefc5839b Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Tue, 26 May 2026 16:18:34 -0400
Subject: [PATCH 01/43] Add TVM wheel publish workflow

---
 .../actions/tvm-wheel-for-publish/action.yml  |  82 ++++
 .github/workflows/publish_wheel.yml           | 148 +++++++
 ci/scripts/package/README.md                  | 101 +++++
 ci/scripts/package/build_tvm_wheel.sh         | 376 ++++++++++++++++++
 ci/scripts/package/inject_cuda_runtime.py     | 198 +++++++++
 cmake/modules/CUDA.cmake                      |  13 +
 6 files changed, 918 insertions(+)
 create mode 100644 .github/actions/tvm-wheel-for-publish/action.yml
 create mode 100644 .github/workflows/publish_wheel.yml
 create mode 100644 ci/scripts/package/README.md
 create mode 100755 ci/scripts/package/build_tvm_wheel.sh
 create mode 100755 ci/scripts/package/inject_cuda_runtime.py

diff --git a/.github/actions/tvm-wheel-for-publish/action.yml b/.github/actions/tvm-wheel-for-publish/action.yml
new file mode 100644
index 000000000000..86c85a128d42
--- /dev/null
+++ b/.github/actions/tvm-wheel-for-publish/action.yml
@@ -0,0 +1,82 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Build TVM Wheel For Publish
+description: >
+  Build and test a Linux TVM wheel using the CUDA sidecar packaging flow.
+
+inputs:
+  checkout_ref:
+    description: "Branch, tag, or SHA to check out before building"
+    required: true
+  distribution_name:
+    description: "Optional wheel distribution name override, useful for TestPyPI"
+    required: false
+    default: ""
+  cuda_architectures:
+    description: "CMake CUDA architectures for libtvm_runtime_cuda.so"
+    required: false
+    default: "75"
+
+runs:
+  using: "composite"
+  steps:
+    - name: Check out source
+      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+      with:
+        ref: ${{ inputs.checkout_ref }}
+        submodules: recursive
+        fetch-depth: 0
+        fetch-tags: true
+
+    - name: Set up TVM build environment
+      uses: ./.github/actions/setup
+
+    - name: Install CUDA toolkit
+      id: cuda-toolkit
+      uses: Jimver/cuda-toolkit@6008063726ffe3309d1b22e413d9e88fed91a2f2
+
+    - name: Install wheel build tools
+      shell: bash -l {0}
+      run: |
+        set -eux
+        sudo apt-get update
+        sudo apt-get install -y patchelf
+        python -m pip install -U pip build auditwheel twine scikit-build-core
+
+    - name: Print build inputs
+      shell: bash -l {0}
+      run: |
+        set -eux
+        git log -1 --oneline
+        python --version
+        cmake --version
+        llvm-config --version
+        "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/nvcc" --version
+
+    - name: Build, repair, and test wheel
+      shell: bash -l {0}
+      env:
+        TVM_PYTHON: python
+        TVM_USE_LLVM: llvm-config
+        TVM_USE_CUDA: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
+        TVM_CUDA_ARCHITECTURES: ${{ inputs.cuda_architectures }}
+        TVM_WHEEL_DIST_NAME: ${{ inputs.distribution_name }}
+      run: |
+        set -eux
+        ci/scripts/package/build_tvm_wheel.sh all
+
diff --git a/.github/workflows/publish_wheel.yml b/.github/workflows/publish_wheel.yml
new file mode 100644
index 000000000000..f5cecf589f98
--- /dev/null
+++ b/.github/workflows/publish_wheel.yml
@@ -0,0 +1,148 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Publish TVM wheel
+
+on:
+  workflow_dispatch:
+    inputs:
+      tag:
+        description: "Tag, branch, or SHA to publish"
+        required: true
+        type: string
+      publish_repository:
+        description: "Where to publish after the wheel build succeeds"
+        required: true
+        default: "none"
+        type: choice
+        options:
+          - none
+          - testpypi
+          - pypi
+      distribution_name:
+        description: "Optional package name override, e.g. tvm-yourname-test for TestPyPI"
+        required: false
+        default: ""
+        type: string
+      cuda_architectures:
+        description: "CMake CUDA architectures for libtvm_runtime_cuda.so"
+        required: false
+        default: "75"
+        type: string
+      verify_from_repository:
+        description: "Install the uploaded package from the selected repository and import-test it"
+        required: true
+        default: true
+        type: boolean
+
+jobs:
+  build_wheel:
+    name: Linux x86_64 CUDA sidecar wheel
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository for local action
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          fetch-depth: 1
+
+      - name: Build TVM wheel
+        uses: ./.github/actions/tvm-wheel-for-publish
+        with:
+          checkout_ref: ${{ inputs.tag }}
+          distribution_name: ${{ inputs.distribution_name }}
+          cuda_architectures: ${{ inputs.cuda_architectures }}
+
+      - name: Upload wheel artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: tvm-wheel-linux-x86_64
+          path: wheelhouse/*.whl
+
+  upload_pypi:
+    name: Upload package distributions
+    needs: [build_wheel]
+    if: ${{ inputs.publish_repository != 'none' }}
+    runs-on: ubuntu-latest
+    environment: ${{ inputs.publish_repository }}
+    permissions:
+      id-token: write
+      attestations: write
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          name: tvm-wheel-linux-x86_64
+          path: dist
+
+      - name: Generate artifact attestation for wheels
+        uses: actions/attest-build-provenance@v1
+        with:
+          subject-path: dist/*
+
+      - name: Publish package distributions to TestPyPI
+        if: ${{ inputs.publish_repository == 'testpypi' }}
+        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0
+        with:
+          attestations: true
+          verbose: true
+          repository-url: https://test.pypi.org/legacy/
+
+      - name: Publish package distributions to PyPI
+        if: ${{ inputs.publish_repository == 'pypi' }}
+        uses: pypa/gh-action-pypi-publish@ed0c53931b1dc9bd32cbe73a98c7f6766f8a527e # v1.13.0
+        with:
+          attestations: true
+          verbose: true
+
+  verify_pypi:
+    name: Verify uploaded package
+    needs: [upload_pypi]
+    if: ${{ inputs.publish_repository != 'none' && inputs.verify_from_repository }}
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out source
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          ref: ${{ inputs.tag }}
+          submodules: recursive
+          fetch-depth: 0
+          fetch-tags: true
+
+      - uses: actions/download-artifact@v4
+        with:
+          name: tvm-wheel-linux-x86_64
+          path: wheelhouse
+
+      - name: Set up Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+
+      - name: Verify package from TestPyPI
+        if: ${{ inputs.publish_repository == 'testpypi' }}
+        env:
+          TVM_PYTHON: python
+          TVM_TEST_INDEX_URL: https://test.pypi.org/simple/
+          TVM_EXTRA_INDEX_URL: https://pypi.org/simple
+        run: ci/scripts/package/build_tvm_wheel.sh verify-pypi
+
+      - name: Verify package from PyPI
+        if: ${{ inputs.publish_repository == 'pypi' }}
+        env:
+          TVM_PYTHON: python
+          TVM_TEST_INDEX_URL: https://pypi.org/simple/
+          TVM_EXTRA_INDEX_URL: https://pypi.org/simple
+        run: ci/scripts/package/build_tvm_wheel.sh verify-pypi
diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
new file mode 100644
index 000000000000..409a2ea6084f
--- /dev/null
+++ b/ci/scripts/package/README.md
@@ -0,0 +1,101 @@
+# TVM wheel packaging helper
+
+This helper follows the CUDA-sidecar packaging flow used for local release
+validation:
+
+1. Build `libtvm_runtime_cuda.so` in a CUDA-enabled CMake build.
+2. Build the main Python wheel with LLVM enabled and CUDA disabled.
+3. Inject the CUDA runtime DSO into `tvm/lib/` inside the wheel.
+4. Repair the wheel, excluding CUDA driver/runtime DSOs from vendoring.
+5. Verify the wheel in a fresh virtualenv.
+6. Upload with `twine`.
+
+It mirrors the TVM-FFI packaging patterns in:
+
+- `tvm-ffi/.github/actions/build-wheel-for-publish/action.yml`
+- `tvm-ffi/.github/workflows/publish_wheel.yml`
+- `tvm-ffi/addons/tvm_ffi_orcjit/pyproject.toml`
+- `tvm-ffi/addons/torch_c_dlpack_ext/build_aot_wheels.sh`
+
+GitHub Actions flow:
+
+1. Create a tag that contains these packaging files.
+2. Open the `Publish TVM wheel` workflow in GitHub Actions.
+3. Fill `tag` with that tag.
+4. For a TestPyPI run, set `publish_repository=testpypi` and set
+   `distribution_name` to a temporary package name such as
+   `tvm-yourname-test`.
+5. After the workflow build, upload, and `verify_pypi` jobs pass, run it again
+   with the final tag/name and `publish_repository=pypi`.
+
+To test this from the fork `tlopex/tvm` without publishing:
+
+```bash
+git push mine HEAD:pypi
+git tag -a tvm-wheel-test0 -m "Test TVM wheel workflow"
+git push mine tvm-wheel-test0
+
+gh workflow run publish_wheel.yml \
+  --repo tlopex/tvm \
+  --ref pypi \
+  -f tag=tvm-wheel-test0 \
+  -f publish_repository=none \
+  -f distribution_name=tvm-tlopexh-test \
+  -f cuda_architectures=75 \
+  -f verify_from_repository=false
+```
+
+If the workflow is not visible in the GitHub UI yet, push or merge these files
+to the fork's default branch first. GitHub only lists manually dispatched
+workflows once the workflow file exists on the repository's default branch.
+
+Typical TestPyPI dry run:
+
+```bash
+python version.py --git-describe
+git tag -a v0.25.dev-test0 -m "Test TVM wheel v0.25.dev-test0"
+
+python -m venv /tmp/tvm-wheel-tools
+/tmp/tvm-wheel-tools/bin/python -m pip install -U pip build auditwheel twine
+
+TVM_PYTHON=/tmp/tvm-wheel-tools/bin/python \
+TVM_USE_LLVM=/path/to/llvm-config \
+TVM_USE_CUDA=/usr/local/cuda-12.8 \
+TVM_WHEEL_DIST_NAME=tvm-tlopexh-test \
+ci/scripts/package/build_tvm_wheel.sh all
+
+TVM_UPLOAD_REPOSITORY_URL=https://test.pypi.org/legacy/ \
+TVM_PYTHON=/tmp/tvm-wheel-tools/bin/python \
+ci/scripts/package/build_tvm_wheel.sh upload
+
+TVM_PYTHON=/tmp/tvm-wheel-tools/bin/python \
+ci/scripts/package/build_tvm_wheel.sh verify-pypi
+```
+
+For a real PyPI upload, leave `TVM_WHEEL_DIST_NAME` unset and set the normal
+Twine credentials:
+
+```bash
+TWINE_USERNAME=__token__ \
+TWINE_PASSWORD="$PYPI_TOKEN" \
+TVM_PYTHON=/tmp/tvm-wheel-tools/bin/python \
+ci/scripts/package/build_tvm_wheel.sh all
+
+TWINE_USERNAME=__token__ \
+TWINE_PASSWORD="$PYPI_TOKEN" \
+TVM_PYTHON=/tmp/tvm-wheel-tools/bin/python \
+ci/scripts/package/build_tvm_wheel.sh upload
+```
+
+Useful knobs:
+
+- `TVM_USE_LLVM`: LLVM config for the base wheel, default `ON`.
+- `TVM_USE_CUDA`: CUDA root or `ON` for the sidecar build, default `ON`.
+- `TVM_CUDA_ARCHITECTURES`: CMake CUDA architectures, default `75`.
+- `TVM_WHEEL_DIST_NAME`: optional distribution rename for TestPyPI.
+- `TVM_WHEEL_DIST_VERSION`: optional distribution version rewrite.
+- `TVM_SKIP_REPAIR=1`: leave the injected wheel unrepaired.
+- `TVM_SKIP_CUDA=1`: build a base wheel without a CUDA sidecar.
+- `TVM_KEEP_BUILD_DIRS=1`: reuse the CMake build directories.
+- `TVM_TEST_INDEX_URL`: package index for `verify-pypi`, default TestPyPI.
+- `TVM_EXTRA_INDEX_URL`: extra package index for dependencies, default PyPI.
diff --git a/ci/scripts/package/build_tvm_wheel.sh b/ci/scripts/package/build_tvm_wheel.sh
new file mode 100755
index 000000000000..cb30ce1d2b87
--- /dev/null
+++ b/ci/scripts/package/build_tvm_wheel.sh
@@ -0,0 +1,376 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+# Resolve this script's directory and the repository root three levels above it.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
+# Knobs: a value already present in the environment wins over the default after ":-".
+TVM_PYTHON="${TVM_PYTHON:-python}"
+TVM_RAW_DIST="${TVM_RAW_DIST:-${REPO_ROOT}/dist/tvm-raw}"
+TVM_INJECTED_DIST="${TVM_INJECTED_DIST:-${REPO_ROOT}/dist/tvm-injected}"
+TVM_WHEELHOUSE="${TVM_WHEELHOUSE:-${REPO_ROOT}/wheelhouse}"
+TVM_CUDA_BUILD_DIR="${TVM_CUDA_BUILD_DIR:-${REPO_ROOT}/build-wheel-cuda}"
+TVM_BASE_BUILD_DIR="${TVM_BASE_BUILD_DIR:-${REPO_ROOT}/build-wheel-base}"
+TVM_USE_LLVM="${TVM_USE_LLVM:-ON}"
+TVM_USE_CUDA="${TVM_USE_CUDA:-ON}"
+TVM_CUDA_ARCHITECTURES="${TVM_CUDA_ARCHITECTURES:-75}"
+TVM_BUILD_PARALLEL_LEVEL="${TVM_BUILD_PARALLEL_LEVEL:-${CMAKE_BUILD_PARALLEL_LEVEL:-$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)}}"
+TVM_WHEEL_DIST_NAME="${TVM_WHEEL_DIST_NAME:-}"
+TVM_WHEEL_DIST_VERSION="${TVM_WHEEL_DIST_VERSION:-}"
+TVM_SKIP_CUDA="${TVM_SKIP_CUDA:-0}"
+TVM_SKIP_REPAIR="${TVM_SKIP_REPAIR:-0}"
+TVM_BUILD_NO_ISOLATION="${TVM_BUILD_NO_ISOLATION:-0}"
+TVM_KEEP_BUILD_DIRS="${TVM_KEEP_BUILD_DIRS:-0}"
+
+usage() {  # Print the step synopsis and the supported environment knobs to stdout.
+  cat <<'EOF'
+Usage: ci/scripts/package/build_tvm_wheel.sh [all|cuda|wheel|inject|repair|verify|upload|verify-pypi]
+
+Environment knobs:
+  TVM_USE_LLVM                 LLVM config for the base wheel, default ON
+  TVM_USE_CUDA                 CUDA root or ON for the sidecar build, default ON
+  TVM_CUDA_ARCHITECTURES       CMake CUDA arch list, default 75
+  TVM_WHEEL_DIST_NAME          Optional distribution rename for TestPyPI
+  TVM_WHEEL_DIST_VERSION       Optional distribution version rewrite
+  TVM_UPLOAD_REPOSITORY_URL    Twine repository URL, e.g. TestPyPI legacy URL
+  TVM_SKIP_CUDA=1              Do not build/inject libtvm_runtime_cuda.so
+  TVM_SKIP_REPAIR=1            Keep injected wheel as final wheel
+  TVM_BUILD_NO_ISOLATION=1     Pass --no-isolation to python -m build
+  TVM_KEEP_BUILD_DIRS=1        Reuse CMake build dirs instead of cleaning them
+  TVM_TEST_INDEX_URL           Package index for verify-pypi, default TestPyPI
+  TVM_EXTRA_INDEX_URL          Extra package index for dependencies, default PyPI
+EOF
+}
+
+require_cmd() {  # Return 0 if command "$1" resolves; otherwise report it on stderr and return 1.
+  command -v "$1" >/dev/null 2>&1 && return 0
+  # Reaching this point means the lookup failed.
+  echo "error: required command not found: $1" >&2
+  return 1
+}
+
+require_pypa_build() {  # Succeed only if "$TVM_PYTHON -m build --version" runs from a scratch dir; else print a hint.
+  local probe_dir status=0
+  probe_dir="$(mktemp -d)"
+  (cd "$probe_dir" && "$TVM_PYTHON" -m build --version >/dev/null 2>&1) || status=$?
+  rm -rf "$probe_dir"  # the scratch directory is removed on both outcomes
+  if [[ "$status" -ne 0 ]]; then
+    echo "error: PyPA build is missing; install it with: ${TVM_PYTHON} -m pip install build" >&2
+    return 1
+  fi
+}
+
+single_wheel() {  # Print the only *.whl directly under "$1"; fail (listing what was found) unless exactly one exists.
+  local dir="$1" wheels=()  # keep the result array local so it cannot clobber a caller's variable
+  mapfile -t wheels < <(find "$dir" -maxdepth 1 -type f -name '*.whl' | sort)
+  if [[ "${#wheels[@]}" -ne 1 ]]; then
+    echo "error: expected exactly one wheel under ${dir}, found ${#wheels[@]}" >&2
+    printf '%s\n' "${wheels[@]}" >&2
+    return 1
+  fi
+  echo "${wheels[0]}"
+}
+
+wheel_metadata_field() {  # Print METADATA header "$2" (e.g. Name, Version) read from the wheel file "$1".
+  local wheel="$1"
+  local field="$2"
+  "$TVM_PYTHON" - "$wheel" "$field" <<'PY'
+from email.parser import Parser
+from pathlib import Path
+import sys
+import zipfile
+
+wheel = Path(sys.argv[1])
+field = sys.argv[2]
+with zipfile.ZipFile(wheel) as zf:
+    metadata_name = next(name for name in zf.namelist() if name.endswith(".dist-info/METADATA"))
+    metadata = Parser().parsestr(zf.read(metadata_name).decode("utf-8"))
+print(metadata[field])
+PY
+}
+
+cuda_runtime_path() {  # Print the last (sorted) libtvm_runtime_cuda.so found under $TVM_CUDA_BUILD_DIR, if any.
+  # A missing build directory is not an error: print nothing and succeed.
+  [[ -d "$TVM_CUDA_BUILD_DIR" ]] || return 0
+  find "$TVM_CUDA_BUILD_DIR" -type f -name 'libtvm_runtime_cuda.so' \
+    | sort | tail -n 1
+}
+
+build_cuda_runtime() {  # Configure and build libtvm_runtime_cuda.so in $TVM_CUDA_BUILD_DIR (no-op if TVM_SKIP_CUDA=1).
+  if [[ "$TVM_SKIP_CUDA" == "1" ]]; then
+    echo "Skipping CUDA sidecar build because TVM_SKIP_CUDA=1"
+    return 0
+  fi
+  # Start from a clean build directory unless TVM_KEEP_BUILD_DIRS=1.
+  require_cmd cmake
+  echo "Building libtvm_runtime_cuda.so in ${TVM_CUDA_BUILD_DIR}"
+  if [[ "$TVM_KEEP_BUILD_DIRS" != "1" ]]; then
+    rm -rf "$TVM_CUDA_BUILD_DIR"
+  fi
+  local cuda_compiler_args=()  # pins CMAKE_CUDA_COMPILER only when $TVM_USE_CUDA/bin/nvcc is executable
+  if [[ -x "${TVM_USE_CUDA}/bin/nvcc" ]]; then
+    cuda_compiler_args+=(-DCMAKE_CUDA_COMPILER="${TVM_USE_CUDA}/bin/nvcc")
+  fi
+  cmake -S "$REPO_ROOT" -B "$TVM_CUDA_BUILD_DIR" \
+    -DCMAKE_BUILD_TYPE=Release \
+    -DBUILD_TESTING=OFF \
+    -DTVM_BUILD_PYTHON_MODULE=ON \
+    -DUSE_CUDA="$TVM_USE_CUDA" \
+    -DUSE_LLVM=OFF \
+    -DUSE_CUBLAS=OFF \
+    -DUSE_CUDNN=OFF \
+    -DUSE_CUTLASS=OFF \
+    -DUSE_NCCL=OFF \
+    -DUSE_NVTX=OFF \
+    -DCMAKE_CUDA_ARCHITECTURES="$TVM_CUDA_ARCHITECTURES" \
+    "${cuda_compiler_args[@]}"
+  # Build just the two runtime targets rather than the whole project.
+  cmake --build "$TVM_CUDA_BUILD_DIR" --target tvm_runtime tvm_runtime_cuda --parallel "$TVM_BUILD_PARALLEL_LEVEL"
+  # The DSO must now exist; on Linux its RPATH is then set to $ORIGIN with patchelf.
+  local cuda_lib
+  cuda_lib="$(cuda_runtime_path)"
+  if [[ -z "$cuda_lib" ]]; then
+    echo "error: libtvm_runtime_cuda.so was not produced" >&2
+    return 1
+  fi
+  if [[ "$(uname -s)" == "Linux" ]]; then
+    require_cmd patchelf
+    patchelf --set-rpath '$ORIGIN' "$cuda_lib"
+  fi
+  echo "CUDA sidecar: ${cuda_lib}"
+}
+
+build_base_wheel() {  # Build the base wheel (LLVM per $TVM_USE_LLVM, CUDA off) into a freshly emptied $TVM_RAW_DIST.
+  require_pypa_build
+  rm -rf "$TVM_RAW_DIST"
+  mkdir -p "$TVM_RAW_DIST"
+  if [[ "$TVM_KEEP_BUILD_DIRS" != "1" ]]; then
+    rm -rf "$TVM_BASE_BUILD_DIR"
+  fi
+  # --no-isolation is opt-in via TVM_BUILD_NO_ISOLATION=1.
+  local build_flags=()
+  if [[ "$TVM_BUILD_NO_ISOLATION" == "1" ]]; then
+    build_flags+=(--no-isolation)
+  fi
+  # The build runs from inside $TVM_RAW_DIST (the subshell keeps the cd local); CMake flags go via CMAKE_ARGS.
+  echo "Building base TVM wheel with LLVM=${TVM_USE_LLVM}, CUDA=OFF"
+  (
+    cd "$TVM_RAW_DIST"
+    CMAKE_ARGS="-DUSE_LLVM=${TVM_USE_LLVM} -DUSE_CUDA=OFF -DBUILD_TESTING=OFF -DTVM_BUILD_PYTHON_MODULE=ON ${TVM_EXTRA_CMAKE_ARGS:-}" \
+      "$TVM_PYTHON" -m build --wheel --outdir "$TVM_RAW_DIST" \
+        "${build_flags[@]}" \
+        -Cbuild-dir="$TVM_BASE_BUILD_DIR" \
+        "$REPO_ROOT"
+  )
+  # Fail unless exactly one wheel was produced.
+  single_wheel "$TVM_RAW_DIST" >/dev/null
+}
+
+inject_cuda_runtime() {  # Rewrite the raw wheel into a freshly emptied $TVM_INJECTED_DIST via inject_cuda_runtime.py.
+  rm -rf "$TVM_INJECTED_DIST"
+  mkdir -p "$TVM_INJECTED_DIST"
+  # The raw dist directory must contain exactly one wheel.
+  local raw_wheel
+  raw_wheel="$(single_wheel "$TVM_RAW_DIST")"
+  # Optional arguments: the CUDA sidecar (unless TVM_SKIP_CUDA=1) and the name/version overrides.
+  local inject_args=(--output-dir "$TVM_INJECTED_DIST")
+  if [[ "$TVM_SKIP_CUDA" != "1" ]]; then
+    local cuda_lib
+    cuda_lib="$(cuda_runtime_path)"
+    if [[ -z "$cuda_lib" ]]; then
+      echo "error: CUDA sidecar missing; run the 'cuda' step first" >&2
+      return 1
+    fi
+    inject_args+=(--cuda-runtime "$cuda_lib")
+  fi
+  if [[ -n "$TVM_WHEEL_DIST_NAME" ]]; then
+    inject_args+=(--distribution-name "$TVM_WHEEL_DIST_NAME")
+  fi
+  if [[ -n "$TVM_WHEEL_DIST_VERSION" ]]; then
+    inject_args+=(--distribution-version "$TVM_WHEEL_DIST_VERSION")
+  fi
+  # The helper runs even without a sidecar, so the injected dist dir always receives a wheel.
+  echo "Injecting sidecar/metadata into ${raw_wheel}"
+  "$TVM_PYTHON" "$SCRIPT_DIR/inject_cuda_runtime.py" "$raw_wheel" "${inject_args[@]}"
+}
+
+auditwheel_excludes() {  # Print alternating "--exclude" / <soname> lines for "auditwheel repair".
+  local cuda_lib="$1" soname
+  local always=(
+    libtvm_runtime_cuda.so libcuda.so.1 libcuda.so
+    libcudart.so.11.0 libcudart.so.12 libcudart.so.12.0
+  )
+  local seen=" ${always[*]} "
+  for soname in "${always[@]}"; do
+    printf '%s\n' "--exclude" "$soname"
+  done
+  # Also exclude CUDA/NVIDIA sonames the sidecar itself links against, skipping ones already printed.
+  if [[ -n "$cuda_lib" ]] && command -v readelf >/dev/null 2>&1; then
+    while IFS= read -r needed; do
+      case "$needed" in
+        libcuda.so*|libcudart.so*|libnv*.so*)
+          if [[ "$seen" != *" ${needed} "* ]]; then
+            seen+="${needed} "
+            printf '%s\n' "--exclude" "$needed"
+          fi
+          ;;
+      esac
+    done < <(readelf -d "$cuda_lib" | sed -n 's/.*Shared library: \[\(.*\)\].*/\1/p')
+  fi
+}
+
+repair_wheel() {  # Turn the single wheel in $TVM_INJECTED_DIST into the final wheel in a freshly emptied $TVM_WHEELHOUSE.
+  rm -rf "$TVM_WHEELHOUSE"
+  mkdir -p "$TVM_WHEELHOUSE"
+
+  local injected_wheel
+  injected_wheel="$(single_wheel "$TVM_INJECTED_DIST")"
+
+  if [[ "$TVM_SKIP_REPAIR" == "1" ]]; then
+    cp "$injected_wheel" "$TVM_WHEELHOUSE/"
+    echo "Repair skipped; final wheel copied to ${TVM_WHEELHOUSE}"
+    return 0
+  fi
+  # auditwheel may be installed only into $TVM_PYTHON's environment (not on PATH), so fall back to "-m".
+  case "$(uname -s)" in
+    Linux)
+      local cuda_lib exclude_args=() auditwheel_cmd=(auditwheel)
+      command -v auditwheel >/dev/null 2>&1 || auditwheel_cmd=("$TVM_PYTHON" -m auditwheel)
+      cuda_lib="$(cuda_runtime_path || true)"
+      mapfile -t exclude_args < <(auditwheel_excludes "$cuda_lib")
+      echo "Repairing Linux wheel with auditwheel"
+      "${auditwheel_cmd[@]}" repair "${exclude_args[@]}" -w "$TVM_WHEELHOUSE" "$injected_wheel"
+      ;;
+    Darwin)
+      require_cmd delocate-wheel
+      echo "Repairing macOS wheel with delocate"
+      delocate-wheel --ignore-missing-dependencies -w "$TVM_WHEELHOUSE" -v "$injected_wheel"
+      ;;
+    *)
+      cp "$injected_wheel" "$TVM_WHEELHOUSE/"
+      echo "No repair step for this platform; final wheel copied to ${TVM_WHEELHOUSE}"
+      ;;
+  esac
+
+  single_wheel "$TVM_WHEELHOUSE" >/dev/null
+}
+
+verify_wheel() {  # Install the final wheel from $TVM_WHEELHOUSE into a fresh venv and import-check it.
+  local final_wheel
+  final_wheel="$(single_wheel "$TVM_WHEELHOUSE")"
+  # Always start from a clean virtualenv; TVM_VERIFY_VENV overrides its location.
+  local venv="${TVM_VERIFY_VENV:-${REPO_ROOT}/build-wheel-verify-venv}"
+  rm -rf "$venv"
+  "$TVM_PYTHON" -m venv "$venv"
+  # On MINGW/CYGWIN the venv interpreter is Scripts/python.exe instead of bin/python.
+  local venv_python="${venv}/bin/python"
+  if [[ "$(uname -s)" == MINGW* || "$(uname -s)" == CYGWIN* ]]; then
+    venv_python="${venv}/Scripts/python.exe"
+  fi
+  # Install the wheel file itself, then run the import smoke test below.
+  "$venv_python" -m pip install --upgrade pip
+  "$venv_python" -m pip install --extra-index-url "${TVM_EXTRA_INDEX_URL:-https://pypi.org/simple}" "$final_wheel"
+  "$venv_python" - <<'PY'
+from pathlib import Path
+import tvm
+
+root = Path(tvm.__file__).resolve().parent
+print("tvm version:", tvm.__version__)
+print("tvm package:", root)
+print("llvm enabled:", tvm.runtime.enabled("llvm"))
+print("cuda runtime enabled:", tvm.runtime.enabled("cuda"))
+assert (root / "lib" / "libtvm_runtime.so").exists()
+cuda_sidecar = root / "lib" / "libtvm_runtime_cuda.so"
+print("cuda sidecar present:", cuda_sidecar.exists())
+PY
+}
+
+upload_wheel() {  # Upload the contents of $TVM_WHEELHOUSE with twine (to TVM_UPLOAD_REPOSITORY_URL if set).
+  local repo_args=() twine_cmd=(twine)  # PATH twine first; else the module installed for $TVM_PYTHON
+  command -v twine >/dev/null 2>&1 || twine_cmd=("$TVM_PYTHON" -m twine)
+  if [[ -n "${TVM_UPLOAD_REPOSITORY_URL:-}" ]]; then
+    repo_args+=(--repository-url "$TVM_UPLOAD_REPOSITORY_URL")
+  fi
+  "${twine_cmd[@]}" upload "${repo_args[@]}" "$TVM_WHEELHOUSE"/*
+}
+
+verify_pypi_wheel() {  # Install <Name>==<Version> of the local final wheel from a package index and import-check it.
+  local final_wheel
+  final_wheel="$(single_wheel "$TVM_WHEELHOUSE")"
+  # The local wheel only supplies the Name and Version to request from the index.
+  local package_name package_version
+  package_name="$(wheel_metadata_field "$final_wheel" Name)"
+  package_version="$(wheel_metadata_field "$final_wheel" Version)"
+  # Defaults: TestPyPI as the package index, PyPI as the extra index.
+  local index_url="${TVM_TEST_INDEX_URL:-https://test.pypi.org/simple/}"
+  local extra_index_url="${TVM_EXTRA_INDEX_URL:-https://pypi.org/simple}"
+  local venv="${TVM_VERIFY_PYPI_VENV:-${REPO_ROOT}/build-wheel-verify-pypi-venv}"
+  rm -rf "$venv"
+  "$TVM_PYTHON" -m venv "$venv"
+  # On MINGW/CYGWIN the venv interpreter is Scripts/python.exe instead of bin/python.
+  local venv_python="${venv}/bin/python"
+  if [[ "$(uname -s)" == MINGW* || "$(uname -s)" == CYGWIN* ]]; then
+    venv_python="${venv}/Scripts/python.exe"
+  fi
+  # Install by exact version pin, then run the import smoke test below.
+  "$venv_python" -m pip install --upgrade pip
+  "$venv_python" -m pip install \
+    --index-url "$index_url" \
+    --extra-index-url "$extra_index_url" \
+    "${package_name}==${package_version}"
+  "$venv_python" - <<'PY'
+from pathlib import Path
+import tvm
+
+root = Path(tvm.__file__).resolve().parent
+print("tvm version:", tvm.__version__)
+print("tvm package:", root)
+print("llvm enabled:", tvm.runtime.enabled("llvm"))
+print("cuda runtime enabled:", tvm.runtime.enabled("cuda"))
+assert (root / "lib" / "libtvm_runtime.so").exists()
+print("cuda sidecar present:", (root / "lib" / "libtvm_runtime_cuda.so").exists())
+PY
+}
+
+main() {  # Run the step named by "$1" (default: all); an unknown step prints usage to stderr and fails.
+  local step="${1:-all}"
+  case "$step" in
+    all)  # local pipeline only: upload and verify-pypi must be requested explicitly
+      build_cuda_runtime
+      build_base_wheel
+      inject_cuda_runtime
+      repair_wheel
+      verify_wheel
+      ;;
+    cuda) build_cuda_runtime ;;
+    wheel) build_base_wheel ;;
+    inject) inject_cuda_runtime ;;
+    repair) repair_wheel ;;
+    verify) verify_wheel ;;
+    upload) upload_wheel ;;
+    verify-pypi) verify_pypi_wheel ;;
+    -h|--help|help) usage ;;
+    *)
+      usage >&2
+      return 1
+      ;;
+  esac
+}
+
+main "$@"
diff --git a/ci/scripts/package/inject_cuda_runtime.py b/ci/scripts/package/inject_cuda_runtime.py
new file mode 100755
index 000000000000..8146e76f7894
--- /dev/null
+++ b/ci/scripts/package/inject_cuda_runtime.py
@@ -0,0 +1,198 @@
+#!/usr/bin/env python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""Inject TVM's CUDA runtime DSO into a wheel and refresh RECORD."""
+
+from __future__ import annotations
+
+import argparse
+import base64
+import csv
+import hashlib
+import io
+import re
+import sys
+import zipfile
+from email.parser import Parser
+from pathlib import Path
+
+
+def _wheel_escape(value: str) -> str:
+    """Lowercase ``value`` and collapse each run of non-word, non-dot characters to ``_``."""
+    pattern = re.compile(r"[^\w\d.]+")
+    return pattern.sub("_", value).lower()
+
+
+def _hash_record(data: bytes) -> tuple[str, str]:
+    # RECORD fields for one file: "sha256=<unpadded urlsafe base64 digest>" and the byte length.
+    token = base64.urlsafe_b64encode(hashlib.sha256(data).digest()).decode("ascii").rstrip("=")
+    return "sha256=" + token, str(len(data))
+
+
+def _copy_info(info: zipfile.ZipInfo, filename: str) -> zipfile.ZipInfo:
+    """Return a new ``ZipInfo`` named ``filename`` carrying ``info``'s member metadata."""
+
+    clone = zipfile.ZipInfo(filename=filename, date_time=info.date_time)
+    # Only these attributes are carried over; sizes and CRC are not copied.
+    for field in ("compress_type", "comment", "extra", "internal_attr", "external_attr"):
+        setattr(clone, field, getattr(info, field))
+    return clone
+
+
+def _replace_header(metadata: bytes, key: str, value: str) -> bytes:
+    """Replace the first line starting with ``key:`` (any case) by ``key: value``; raise if absent."""
+    lines = metadata.decode("utf-8").splitlines(keepends=True)
+    wanted = key.lower() + ":"
+    hit = next((i for i, line in enumerate(lines) if line.lower().startswith(wanted)), None)
+    if hit is None:
+        raise ValueError(f"METADATA does not contain {key!r}")
+    eol = "\r\n" if lines[hit].endswith("\r\n") else "\n"  # keep CRLF files CRLF
+    lines[hit] = f"{key}: {value}{eol}"
+    return "".join(lines).encode("utf-8")
+
+
+def _find_dist_info(names: list[str]) -> str:  # the wheel's single top-level *.dist-info directory
+    tops = {name.split("/", 1)[0] for name in names if ".dist-info/" in name}
+    found = sorted(top for top in tops if top.endswith(".dist-info"))
+    if len(found) != 1:
+        raise ValueError(f"Expected one .dist-info directory, found {found}")
+    return found[0]
+
+
+def _metadata_headers(metadata: bytes) -> tuple[str, str]:
+    """Parse METADATA and return its ``(Name, Version)`` header values; raise if either is empty."""
+    parsed = Parser().parsestr(metadata.decode("utf-8"))
+    name, version = parsed.get("Name"), parsed.get("Version")
+    if name and version:
+        return name, version
+    raise ValueError("METADATA must contain Name and Version")
+
+
+def _retag_wheel_filename(
+    wheel: Path,
+    dist_name: str,
+    version: str,
+) -> str:
+    """Return ``wheel``'s filename with name and version replaced and the later components kept."""
+    stem_parts = wheel.name.removesuffix(".whl").split("-")
+    if not 5 <= len(stem_parts) <= 6:
+        raise ValueError(f"Unsupported wheel filename: {wheel.name}")
+    return "-".join([_wheel_escape(dist_name), version, *stem_parts[2:]]) + ".whl"
+
+
+def rewrite_wheel(
+    wheel: Path,
+    output_dir: Path,
+    cuda_runtime: Path | None,
+    target_path: str,
+    distribution_name: str | None,
+    distribution_version: str | None,
+) -> Path:  # Returns the path of the rewritten wheel created under ``output_dir``.
+    output_dir.mkdir(parents=True, exist_ok=True)
+    with zipfile.ZipFile(wheel, "r") as zin:
+        original_names = zin.namelist()
+        original_dist_info = _find_dist_info(original_names)
+        metadata_path = f"{original_dist_info}/METADATA"
+        original_name, original_version = _metadata_headers(zin.read(metadata_path))
+        # Overrides win; otherwise keep the Name/Version read from METADATA.
+        final_name = distribution_name or original_name
+        final_version = distribution_version or original_version
+        final_dist_info = f"{_wheel_escape(final_name)}-{final_version}.dist-info"
+        record_path = f"{final_dist_info}/RECORD"
+        output_path = output_dir / _retag_wheel_filename(wheel, final_name, final_version)
+        # Collect every kept member in memory as (ZipInfo, bytes), moving dist-info files to the new dir name.
+        entries: list[tuple[zipfile.ZipInfo, bytes]] = []
+        for info in zin.infolist():
+            if info.is_dir():
+                continue
+            mapped_name = info.filename
+            if mapped_name == f"{original_dist_info}/RECORD":  # RECORD is regenerated below
+                continue
+            if mapped_name.startswith(f"{original_dist_info}/"):
+                mapped_name = f"{final_dist_info}/{mapped_name.split('/', 1)[1]}"
+            if cuda_runtime is not None and mapped_name == target_path:  # replaced by the injected DSO
+                continue
+            # Patch Name/Version in METADATA only when the matching override was given.
+            data = zin.read(info.filename)
+            if mapped_name == f"{final_dist_info}/METADATA":
+                if distribution_name is not None:
+                    data = _replace_header(data, "Name", final_name)
+                if distribution_version is not None:
+                    data = _replace_header(data, "Version", final_version)
+            entries.append((_copy_info(info, mapped_name), data))
+        # Append the CUDA runtime DSO at target_path as a deflated member with mode bits 0o644.
+        if cuda_runtime is not None:
+            data = cuda_runtime.read_bytes()
+            info = zipfile.ZipInfo(target_path)
+            info.compress_type = zipfile.ZIP_DEFLATED
+            info.external_attr = 0o644 << 16
+            entries.append((info, data))
+    # Build a fresh RECORD: one hashed row per member, then an unhashed row for RECORD itself.
+    record_buffer = io.StringIO()
+    writer = csv.writer(record_buffer, lineterminator="\n")
+    for info, data in entries:
+        digest, size = _hash_record(data)
+        writer.writerow([info.filename, digest, size])
+    writer.writerow([record_path, "", ""])
+
+    record_info = zipfile.ZipInfo(record_path)
+    record_info.compress_type = zipfile.ZIP_DEFLATED
+    record_info.external_attr = 0o644 << 16
+    entries.append((record_info, record_buffer.getvalue().encode("utf-8")))
+    # Write all members to the new wheel, RECORD last.
+    with zipfile.ZipFile(output_path, "w", compression=zipfile.ZIP_DEFLATED) as zout:
+        for info, data in entries:
+            zout.writestr(info, data)
+    return output_path
+
+
+def main() -> int:
+    """Command-line entry point: rewrite one wheel and print the resulting wheel path."""
+    cli = argparse.ArgumentParser(description=__doc__)
+    cli.add_argument("wheel", type=Path)
+    cli.add_argument("--cuda-runtime", type=Path)
+    cli.add_argument("--target-path", default=None)
+    cli.add_argument("--output-dir", type=Path, required=True)
+    cli.add_argument("--distribution-name")
+    cli.add_argument("--distribution-version")
+    opts = cli.parse_args()
+
+    sidecar = opts.cuda_runtime
+    if not (sidecar is None or sidecar.is_file()):
+        cli.error(f"CUDA runtime DSO does not exist: {sidecar}")
+
+    # Without --target-path the member lives under tvm/lib/, named after the DSO file when one is given.
+    member = opts.target_path
+    if member is None:
+        default_name = "libtvm_runtime_cuda.so" if sidecar is None else sidecar.name
+        member = f"tvm/lib/{default_name}"
+
+    result = rewrite_wheel(
+        wheel=opts.wheel,
+        output_dir=opts.output_dir,
+        cuda_runtime=sidecar,
+        target_path=member,
+        distribution_name=opts.distribution_name,
+        distribution_version=opts.distribution_version,
+    )
+    print(result)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
index ec6160e7afaf..9314c58b952f 100644
--- a/cmake/modules/CUDA.cmake
+++ b/cmake/modules/CUDA.cmake
@@ -79,6 +79,19 @@ if(USE_CUDA)
     RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
     ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
   )
+  if(TVM_BUILD_PYTHON_MODULE)
+    if(APPLE)
+      set_target_properties(tvm_runtime_cuda PROPERTIES
+        BUILD_RPATH "@loader_path"
+        INSTALL_RPATH "@loader_path"
+      )
+    elseif(LINUX)
+      set_target_properties(tvm_runtime_cuda PROPERTIES
+        BUILD_RPATH "\$ORIGIN"
+        INSTALL_RPATH "\$ORIGIN"
+      )
+    endif()
+  endif()
   install(TARGETS tvm_runtime_cuda DESTINATION lib${LIB_SUFFIX})
   if(TVM_BUILD_PYTHON_MODULE)
     install(TARGETS tvm_runtime_cuda DESTINATION "lib")

From 54982393dc639224602993f5e4b2cd8b93ac10ea Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Tue, 26 May 2026 16:36:50 -0400
Subject: [PATCH 02/43] Fix TVM wheel packaging lint

---
 .../actions/tvm-wheel-for-publish/action.yml    |  1 -
 ci/scripts/package/README.md                    | 17 +++++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/.github/actions/tvm-wheel-for-publish/action.yml b/.github/actions/tvm-wheel-for-publish/action.yml
index 86c85a128d42..2f874322dff6 100644
--- a/.github/actions/tvm-wheel-for-publish/action.yml
+++ b/.github/actions/tvm-wheel-for-publish/action.yml
@@ -79,4 +79,3 @@ runs:
       run: |
         set -eux
         ci/scripts/package/build_tvm_wheel.sh all
-
diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
index 409a2ea6084f..fc9bab173753 100644
--- a/ci/scripts/package/README.md
+++ b/ci/scripts/package/README.md
@@ -1,3 +1,20 @@
+<!--- Licensed to the Apache Software Foundation (ASF) under one -->
+<!--- or more contributor license agreements.  See the NOTICE file -->
+<!--- distributed with this work for additional information -->
+<!--- regarding copyright ownership.  The ASF licenses this file -->
+<!--- to you under the Apache License, Version 2.0 (the -->
+<!--- "License"); you may not use this file except in compliance -->
+<!--- with the License.  You may obtain a copy of the License at -->
+
+<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
+
+<!--- Unless required by applicable law or agreed to in writing, -->
+<!--- software distributed under the License is distributed on an -->
+<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
+<!--- KIND, either express or implied.  See the License for the -->
+<!--- specific language governing permissions and limitations -->
+<!--- under the License. -->
+
 # TVM wheel packaging helper
 
 This helper follows the CUDA-sidecar packaging flow used for local release

From 39907d997df1f53af582461910568adb06c8be84 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Tue, 26 May 2026 17:19:37 -0400
Subject: [PATCH 03/43] Expose LLVM runtime libs to auditwheel

---
 ci/scripts/package/build_tvm_wheel.sh | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/ci/scripts/package/build_tvm_wheel.sh b/ci/scripts/package/build_tvm_wheel.sh
index cb30ce1d2b87..2008c0a9caa3 100755
--- a/ci/scripts/package/build_tvm_wheel.sh
+++ b/ci/scripts/package/build_tvm_wheel.sh
@@ -235,6 +235,17 @@ auditwheel_excludes() {
   fi
 }
 
+# Print llvm-config's --libdir when TVM_USE_LLVM resolves to a command or an
+# executable path; print nothing for OFF/0 or a non-command value such as ON.
+llvm_libdir() {
+  case "$TVM_USE_LLVM" in
+    OFF | 0) return 0 ;;
+  esac
+  if command -v "$TVM_USE_LLVM" >/dev/null 2>&1; then
+    "$TVM_USE_LLVM" --libdir
+  fi
+}
+
 repair_wheel() {
   rm -rf "$TVM_WHEELHOUSE"
   mkdir -p "$TVM_WHEELHOUSE"
@@ -255,7 +266,14 @@ repair_wheel() {
       cuda_lib="$(cuda_runtime_path || true)"
       mapfile -t exclude_args < <(auditwheel_excludes "$cuda_lib")
       echo "Repairing Linux wheel with auditwheel"
-      auditwheel repair "${exclude_args[@]}" -w "$TVM_WHEELHOUSE" "$injected_wheel"
+      (
+        llvm_dir="$(llvm_libdir || true)"
+        if [[ -n "${llvm_dir:-}" && -d "$llvm_dir" ]]; then
+          echo "Adding LLVM libdir to LD_LIBRARY_PATH for auditwheel: ${llvm_dir}"
+          export LD_LIBRARY_PATH="${llvm_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+        fi
+        auditwheel repair "${exclude_args[@]}" -w "$TVM_WHEELHOUSE" "$injected_wheel"
+      )
       ;;
     Darwin)
       require_cmd delocate-wheel

From 55ffc29e08a1b28932991c7a9788d42d5f292023 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Tue, 26 May 2026 17:25:58 -0400
Subject: [PATCH 04/43] Reduce runner disk pressure for wheel publish

---
 .../actions/tvm-wheel-for-publish/action.yml  | 22 +++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/.github/actions/tvm-wheel-for-publish/action.yml b/.github/actions/tvm-wheel-for-publish/action.yml
index 2f874322dff6..23a806b6f0f7 100644
--- a/.github/actions/tvm-wheel-for-publish/action.yml
+++ b/.github/actions/tvm-wheel-for-publish/action.yml
@@ -40,9 +40,23 @@ runs:
       with:
         ref: ${{ inputs.checkout_ref }}
         submodules: recursive
-        fetch-depth: 0
+        fetch-depth: 1
         fetch-tags: true
 
+    - name: Free runner disk space
+      shell: bash -l {0}
+      run: |
+        set -eux
+        df -h
+        sudo rm -rf /usr/share/dotnet
+        sudo rm -rf /usr/local/lib/android
+        sudo rm -rf /opt/ghc
+        sudo rm -rf /opt/hostedtoolcache/CodeQL
+        sudo rm -rf /usr/local/share/boost
+        docker image prune -af || true
+        sudo apt-get clean
+        df -h
+
     - name: Set up TVM build environment
       uses: ./.github/actions/setup
 
@@ -78,4 +92,8 @@ runs:
         TVM_WHEEL_DIST_NAME: ${{ inputs.distribution_name }}
       run: |
         set -eux
-        ci/scripts/package/build_tvm_wheel.sh all
+        ci/scripts/package/build_tvm_wheel.sh cuda
+        ci/scripts/package/build_tvm_wheel.sh wheel
+        ci/scripts/package/build_tvm_wheel.sh inject
+        ci/scripts/package/build_tvm_wheel.sh repair
+        ci/scripts/package/build_tvm_wheel.sh verify

From a36d3b82ab1fe502b9e63cedc782d42ded4ace48 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Tue, 26 May 2026 18:07:30 -0400
Subject: [PATCH 05/43] Exclude TVM FFI from wheel repair

---
 ci/scripts/package/README.md          | 3 ++-
 ci/scripts/package/build_tvm_wheel.sh | 9 +++++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
index fc9bab173753..b893458289d8 100644
--- a/ci/scripts/package/README.md
+++ b/ci/scripts/package/README.md
@@ -23,7 +23,8 @@ validation:
 1. Build `libtvm_runtime_cuda.so` in a CUDA-enabled CMake build.
 2. Build the main Python wheel with LLVM enabled and CUDA disabled.
 3. Inject the CUDA runtime DSO into `tvm/lib/` inside the wheel.
-4. Repair the wheel, excluding CUDA driver/runtime DSOs from vendoring.
+4. Repair the wheel, excluding CUDA driver/runtime DSOs and `libtvm_ffi`
+   from vendoring.
 5. Verify the wheel in a fresh virtualenv.
 6. Upload with `twine`.
 
diff --git a/ci/scripts/package/build_tvm_wheel.sh b/ci/scripts/package/build_tvm_wheel.sh
index 2008c0a9caa3..17656059e735 100755
--- a/ci/scripts/package/build_tvm_wheel.sh
+++ b/ci/scripts/package/build_tvm_wheel.sh
@@ -212,9 +212,10 @@ inject_cuda_runtime() {
 
 auditwheel_excludes() {
   local cuda_lib="$1"
-  local seen=" libtvm_runtime_cuda.so libcuda.so.1 libcuda.so libcudart.so.11.0 libcudart.so.12 libcudart.so.12.0 "
+  local seen=" libtvm_runtime_cuda.so libtvm_ffi.so libcuda.so.1 libcuda.so libcudart.so.11.0 libcudart.so.12 libcudart.so.12.0 "
 
   printf '%s\n' "--exclude" "libtvm_runtime_cuda.so"
+  printf '%s\n' "--exclude" "libtvm_ffi.so"
   printf '%s\n' "--exclude" "libcuda.so.1"
   printf '%s\n' "--exclude" "libcuda.so"
   printf '%s\n' "--exclude" "libcudart.so.11.0"
@@ -278,7 +279,11 @@ repair_wheel() {
     Darwin)
       require_cmd delocate-wheel
       echo "Repairing macOS wheel with delocate"
-      delocate-wheel --ignore-missing-dependencies -w "$TVM_WHEELHOUSE" -v "$injected_wheel"
+      delocate-wheel \
+        --ignore-missing-dependencies \
+        --exclude libtvm_ffi.dylib \
+        -w "$TVM_WHEELHOUSE" \
+        -v "$injected_wheel"
       ;;
     *)
       cp "$injected_wheel" "$TVM_WHEELHOUSE/"

From 5738a1fdab527b8439ea21a03d7a54a3885f54bd Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Tue, 26 May 2026 18:58:02 -0400
Subject: [PATCH 06/43] Repair wheel internal TVM library paths

---
 ci/scripts/package/README.md              |  4 ++--
 ci/scripts/package/build_tvm_wheel.sh     |  8 +++++++-
 ci/scripts/package/inject_cuda_runtime.py | 25 +++++++++++++++++++++++
 3 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
index b893458289d8..3a77a46ca267 100644
--- a/ci/scripts/package/README.md
+++ b/ci/scripts/package/README.md
@@ -23,8 +23,8 @@ validation:
 1. Build `libtvm_runtime_cuda.so` in a CUDA-enabled CMake build.
 2. Build the main Python wheel with LLVM enabled and CUDA disabled.
 3. Inject the CUDA runtime DSO into `tvm/lib/` inside the wheel.
-4. Repair the wheel, excluding CUDA driver/runtime DSOs and `libtvm_ffi`
-   from vendoring.
+4. Repair the wheel, excluding CUDA driver/runtime DSOs, `libtvm_ffi`, and
+   intra-wheel TVM DSOs from vendoring.
 5. Verify the wheel in a fresh virtualenv.
 6. Upload with `twine`.
 
diff --git a/ci/scripts/package/build_tvm_wheel.sh b/ci/scripts/package/build_tvm_wheel.sh
index 17656059e735..a9bf840b85b9 100755
--- a/ci/scripts/package/build_tvm_wheel.sh
+++ b/ci/scripts/package/build_tvm_wheel.sh
@@ -205,6 +205,9 @@ inject_cuda_runtime() {
   if [[ -n "$TVM_WHEEL_DIST_VERSION" ]]; then
     inject_args+=(--distribution-version "$TVM_WHEEL_DIST_VERSION")
   fi
+  if [[ "$(uname -s)" == "Linux" ]]; then
+    inject_args+=(--set-rpath '$ORIGIN')
+  fi
 
   echo "Injecting sidecar/metadata into ${raw_wheel}"
   "$TVM_PYTHON" "$SCRIPT_DIR/inject_cuda_runtime.py" "$raw_wheel" "${inject_args[@]}"
@@ -212,8 +215,11 @@ inject_cuda_runtime() {
 
 auditwheel_excludes() {
   local cuda_lib="$1"
-  local seen=" libtvm_runtime_cuda.so libtvm_ffi.so libcuda.so.1 libcuda.so libcudart.so.11.0 libcudart.so.12 libcudart.so.12.0 "
+  local seen
+  seen=" libtvm_runtime.so libtvm_runtime_cuda.so libtvm_ffi.so "
+  seen+="libcuda.so.1 libcuda.so libcudart.so.11.0 libcudart.so.12 libcudart.so.12.0 "
 
+  printf '%s\n' "--exclude" "libtvm_runtime.so"
   printf '%s\n' "--exclude" "libtvm_runtime_cuda.so"
   printf '%s\n' "--exclude" "libtvm_ffi.so"
   printf '%s\n' "--exclude" "libcuda.so.1"
diff --git a/ci/scripts/package/inject_cuda_runtime.py b/ci/scripts/package/inject_cuda_runtime.py
index 8146e76f7894..0e85942c4367 100755
--- a/ci/scripts/package/inject_cuda_runtime.py
+++ b/ci/scripts/package/inject_cuda_runtime.py
@@ -26,7 +26,10 @@
 import hashlib
 import io
 import re
+import shutil
+import subprocess
 import sys
+import tempfile
 import zipfile
 from email.parser import Parser
 from pathlib import Path
@@ -83,6 +86,19 @@ def _metadata_headers(metadata: bytes) -> tuple[str, str]:
     return name, version
 
 
+def _is_elf_shared_lib(name: str, data: bytes) -> bool:  # ELF-magic *.so under tvm/lib/
+    return name.startswith("tvm/lib/") and name.endswith(".so") and data.startswith(b"\x7fELF")
+
+
+def _set_rpath(data: bytes, rpath: str, name: str) -> bytes:  # patchelf-rewritten copy of data
+    with tempfile.TemporaryDirectory() as tmpdir:
+        path = Path(tmpdir) / Path(name).name  # stage on disk under the member's basename
+        path.write_bytes(data)
+        path.chmod(0o755)
+        subprocess.run(["patchelf", "--set-rpath", rpath, str(path)], check=True)
+        return path.read_bytes()  # read back the staged file after patchelf has run on it
+
+
 def _retag_wheel_filename(
     wheel: Path,
     dist_name: str,
@@ -102,6 +118,7 @@ def rewrite_wheel(
     target_path: str,
     distribution_name: str | None,
     distribution_version: str | None,
+    set_rpath: str | None,
 ) -> Path:
     output_dir.mkdir(parents=True, exist_ok=True)
     with zipfile.ZipFile(wheel, "r") as zin:
@@ -134,10 +151,14 @@ def rewrite_wheel(
                     data = _replace_header(data, "Name", final_name)
                 if distribution_version is not None:
                     data = _replace_header(data, "Version", final_version)
+            if set_rpath is not None and _is_elf_shared_lib(mapped_name, data):
+                data = _set_rpath(data, set_rpath, mapped_name)
             entries.append((_copy_info(info, mapped_name), data))
 
         if cuda_runtime is not None:
             data = cuda_runtime.read_bytes()
+            if set_rpath is not None and _is_elf_shared_lib(target_path, data):
+                data = _set_rpath(data, set_rpath, target_path)
             info = zipfile.ZipInfo(target_path)
             info.compress_type = zipfile.ZIP_DEFLATED
             info.external_attr = 0o644 << 16
@@ -169,11 +190,14 @@ def main() -> int:
     parser.add_argument("--output-dir", type=Path, required=True)
     parser.add_argument("--distribution-name")
     parser.add_argument("--distribution-version")
+    parser.add_argument("--set-rpath")
     args = parser.parse_args()
 
     cuda_runtime = args.cuda_runtime
     if cuda_runtime is not None and not cuda_runtime.is_file():
         parser.error(f"CUDA runtime DSO does not exist: {cuda_runtime}")
+    if args.set_rpath and shutil.which("patchelf") is None:
+        parser.error("--set-rpath requires patchelf on PATH")
 
     target_path = args.target_path
     if target_path is None:
@@ -189,6 +213,7 @@ def main() -> int:
         target_path=target_path,
         distribution_name=args.distribution_name,
         distribution_version=args.distribution_version,
+        set_rpath=args.set_rpath,
     )
     print(output_path)
     return 0

From 93b9c52b609171aa42317ceb95045fb7ae0d3007 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Tue, 26 May 2026 19:40:42 -0400
Subject: [PATCH 07/43] Avoid pytest dependency during TVM import

---
 python/tvm/rpc/server.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/python/tvm/rpc/server.py b/python/tvm/rpc/server.py
index 099cb8f1f1e7..5ffcb878f185 100644
--- a/python/tvm/rpc/server.py
+++ b/python/tvm/rpc/server.py
@@ -48,9 +48,16 @@
 from tvm.support.popen_pool import PopenWorker
 
 # pylint: disable=unused-import
-from . import _ffi_api, base, testing
+from . import _ffi_api, base
 from .base import TrackerCode
 
+try:
+    # Register RPC unit-test helpers when the testing dependencies are present.
+    from . import testing
+except ModuleNotFoundError as err:
+    if err.name != "pytest":  # tolerate only a missing pytest; anything else is a real error
+        raise
+
 logger = logging.getLogger("RPCServer")
 console_handler = logging.StreamHandler()
 console_handler.setFormatter(

From 0ef42c4ffecbe77da647d9f2badf349f655f40ee Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Tue, 26 May 2026 20:19:31 -0400
Subject: [PATCH 08/43] Validate TVM wheel internal ELF links

---
 CMakeLists.txt                           |  38 ++++--
 ci/scripts/package/README.md             |   8 +-
 ci/scripts/package/build_tvm_wheel.sh    |  16 ++-
 ci/scripts/package/validate_wheel_elf.py | 150 +++++++++++++++++++++++
 cmake/modules/CUDA.cmake                 |  16 +--
 cmake/modules/Hexagon.cmake              |   1 +
 cmake/modules/Metal.cmake                |   1 +
 cmake/modules/OpenCL.cmake               |   1 +
 cmake/modules/ROCM.cmake                 |   1 +
 cmake/modules/Vulkan.cmake               |   1 +
 10 files changed, 200 insertions(+), 33 deletions(-)
 create mode 100644 ci/scripts/package/validate_wheel_elf.py

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a11a8729700f..b2738a198786 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -104,6 +104,24 @@ tvm_option(USE_NVSHMEM "Build with NVSHMEM support" OFF)
 # Python package options
 tvm_option(TVM_BUILD_PYTHON_MODULE "Build Python module with scikit-build-core" OFF)
 
+function(tvm_set_python_module_relative_rpath target_name)
+  if(NOT TVM_BUILD_PYTHON_MODULE OR NOT TARGET ${target_name})
+    return()  # no-op unless the Python module is being built and the target exists
+  endif()
+  # Set the same loader-relative rpath for both the build tree and the install tree.
+  if(APPLE)  # macOS uses @loader_path
+    set_target_properties(${target_name} PROPERTIES
+      BUILD_RPATH "@loader_path"
+      INSTALL_RPATH "@loader_path"
+    )
+  elseif(UNIX)  # other Unix platforms (e.g. Linux) use $ORIGIN
+    set_target_properties(${target_name} PROPERTIES
+      BUILD_RPATH "\$ORIGIN"
+      INSTALL_RPATH "\$ORIGIN"
+    )
+  endif()
+endfunction()
+
 # include directories
 include_directories(${CMAKE_INCLUDE_PATH})
 include_directories("include")
@@ -531,6 +549,7 @@ set_target_properties(tvm_runtime_extra PROPERTIES
   RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
   ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
 )
+tvm_set_python_module_relative_rpath(tvm_runtime_extra)
 
 install(TARGETS tvm_runtime_extra DESTINATION lib${LIB_SUFFIX})
 if(TVM_BUILD_PYTHON_MODULE)
@@ -845,17 +864,12 @@ endif()
 if(TVM_BUILD_PYTHON_MODULE)
   message(STATUS "Configuring Python package installation")
 
-  # Set RPATH for tvm_compiler and tvm_runtime to find each other relatively
-  # (libtvm_compiler.so links against libtvm_runtime.so).
-  if(APPLE)
-    # macOS uses @loader_path
-    set_target_properties(tvm_compiler PROPERTIES INSTALL_RPATH "@loader_path")
-    set_target_properties(tvm_runtime PROPERTIES INSTALL_RPATH "@loader_path")
-  elseif(LINUX)
-    # Linux uses $ORIGIN
-    set_target_properties(tvm_compiler PROPERTIES INSTALL_RPATH "\$ORIGIN")
-    set_target_properties(tvm_runtime PROPERTIES INSTALL_RPATH "\$ORIGIN")
-  endif()
+  # Keep wheel-internal shared libraries relocatable in both build and install
+  # trees.  Several runtime DSOs link against libtvm_runtime.so and are installed
+  # into the same tvm/lib directory inside the wheel.
+  tvm_set_python_module_relative_rpath(tvm_compiler)
+  tvm_set_python_module_relative_rpath(tvm_runtime)
+  tvm_set_python_module_relative_rpath(tvm_runtime_extra)
 
   # Install compiled shared libraries into <project>/lib so that
   # tvm_ffi.libinfo.load_lib_ctypes(package="tvm", target_name=...) can find
@@ -865,9 +879,11 @@ if(TVM_BUILD_PYTHON_MODULE)
 
   # Install third-party compiled dependencies into the same lib/ dir.
   if(TARGET fpA_intB_gemm)
+    tvm_set_python_module_relative_rpath(fpA_intB_gemm)
     install(TARGETS fpA_intB_gemm DESTINATION "lib")
   endif()
   if(TARGET flash_attn)
+    tvm_set_python_module_relative_rpath(flash_attn)
     install(TARGETS flash_attn DESTINATION "lib")
   endif()
 
diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
index 3a77a46ca267..6adcfe613579 100644
--- a/ci/scripts/package/README.md
+++ b/ci/scripts/package/README.md
@@ -23,10 +23,10 @@ validation:
 1. Build `libtvm_runtime_cuda.so` in a CUDA-enabled CMake build.
 2. Build the main Python wheel with LLVM enabled and CUDA disabled.
 3. Inject the CUDA runtime DSO into `tvm/lib/` inside the wheel.
-4. Repair the wheel, excluding CUDA driver/runtime DSOs, `libtvm_ffi`, and
-   intra-wheel TVM DSOs from vendoring.
-5. Verify the wheel in a fresh virtualenv.
-6. Upload with `twine`.
+4. Repair the wheel, excluding CUDA driver/runtime DSOs and `libtvm_ffi`.
+5. Validate ELF links so intra-wheel TVM DSOs resolve through relative rpaths.
+6. Verify the wheel in a fresh virtualenv.
+7. Upload with `twine`.
 
 It mirrors the TVM-FFI packaging patterns in:
 
diff --git a/ci/scripts/package/build_tvm_wheel.sh b/ci/scripts/package/build_tvm_wheel.sh
index a9bf840b85b9..a7f84bc83326 100755
--- a/ci/scripts/package/build_tvm_wheel.sh
+++ b/ci/scripts/package/build_tvm_wheel.sh
@@ -40,7 +40,7 @@ TVM_KEEP_BUILD_DIRS="${TVM_KEEP_BUILD_DIRS:-0}"
 
 usage() {
   cat <<'EOF'
-Usage: ci/scripts/package/build_tvm_wheel.sh [all|cuda|wheel|inject|repair|verify|upload|verify-pypi]
+Usage: ci/scripts/package/build_tvm_wheel.sh [all|cuda|wheel|inject|repair|validate|verify|upload|verify-pypi]
 
 Environment knobs:
   TVM_USE_LLVM                 LLVM config for the base wheel, default ON
@@ -216,11 +216,9 @@ inject_cuda_runtime() {
 auditwheel_excludes() {
   local cuda_lib="$1"
   local seen
-  seen=" libtvm_runtime.so libtvm_runtime_cuda.so libtvm_ffi.so "
+  seen=" libtvm_ffi.so "
   seen+="libcuda.so.1 libcuda.so libcudart.so.11.0 libcudart.so.12 libcudart.so.12.0 "
 
-  printf '%s\n' "--exclude" "libtvm_runtime.so"
-  printf '%s\n' "--exclude" "libtvm_runtime_cuda.so"
   printf '%s\n' "--exclude" "libtvm_ffi.so"
   printf '%s\n' "--exclude" "libcuda.so.1"
   printf '%s\n' "--exclude" "libcuda.so"
@@ -300,9 +298,18 @@ repair_wheel() {
   single_wheel "$TVM_WHEELHOUSE" >/dev/null
 }
 
+validate_wheel_elf() {
+  local final_wheel
+  final_wheel="$(single_wheel "$TVM_WHEELHOUSE")"
+  if [[ "$(uname -s)" == "Linux" ]]; then  # the ELF validator is only invoked on Linux hosts
+    "$TVM_PYTHON" "$SCRIPT_DIR/validate_wheel_elf.py" "$final_wheel"
+  fi
+}
+
 verify_wheel() {
   local final_wheel
   final_wheel="$(single_wheel "$TVM_WHEELHOUSE")"
+  validate_wheel_elf
 
   local venv="${TVM_VERIFY_VENV:-${REPO_ROOT}/build-wheel-verify-venv}"
   rm -rf "$venv"
@@ -391,6 +398,7 @@ main() {
     wheel) build_base_wheel ;;
     inject) inject_cuda_runtime ;;
     repair) repair_wheel ;;
+    validate) validate_wheel_elf ;;
     verify) verify_wheel ;;
     upload) upload_wheel ;;
     verify-pypi) verify_pypi_wheel ;;
diff --git a/ci/scripts/package/validate_wheel_elf.py b/ci/scripts/package/validate_wheel_elf.py
new file mode 100644
index 000000000000..837def6a075e
--- /dev/null
+++ b/ci/scripts/package/validate_wheel_elf.py
@@ -0,0 +1,179 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Validate ELF linkage inside a repaired TVM wheel."""
+
+from __future__ import annotations
+
+import argparse
+import os
+import re
+import shutil
+import subprocess
+import sys
+import tempfile
+import zipfile
+from pathlib import Path
+
+TVM_EXTERNAL_LIBS = {
+    "libtvm_ffi.so",
+}
+
+
+def _run(command: list[str], env: dict[str, str] | None = None) -> str:
+    """Run *command* and return its combined stdout/stderr text.
+
+    Raises RuntimeError, carrying the captured output, when the command exits non-zero.
+    """
+    try:
+        proc = subprocess.run(
+            command,
+            check=True,
+            env=env,
+            stdout=subprocess.PIPE,
+            stderr=subprocess.STDOUT,
+            text=True,
+        )
+    except subprocess.CalledProcessError as err:
+        output = err.stdout or ""
+        raise RuntimeError(f"{' '.join(command)} failed:\n{output}") from err
+    return proc.stdout
+
+
+def _dynamic_entries(path: Path) -> tuple[list[str], list[str]]:
+    """Return ``(needed, rpaths)`` parsed from the ``readelf -d`` output for *path*.
+
+    ``needed`` holds the "Shared library" names; ``rpaths`` holds the non-empty
+    colon-separated entries of every "Library rpath" / "Library runpath" row.
+    """
+    output = _run(["readelf", "-d", str(path)])
+    needed = re.findall(r"Shared library: \[(.*?)\]", output)
+    rpaths = re.findall(r"Library (?:rpath|runpath): \[(.*?)\]", output)
+    return needed, [entry for rpath in rpaths for entry in rpath.split(":") if entry]
+
+
+def _ldd(path: Path) -> dict[str, str]:
+    """Map each ``name => target`` row printed by ``ldd`` to its target.
+
+    ``ldd`` is run with ``LD_LIBRARY_PATH`` set to the empty string.  Rows without
+    ``=>`` are skipped and unresolved rows ("not found") map to ``"not"``.
+    """
+    output = _run(["ldd", str(path)], env={**os.environ, "LD_LIBRARY_PATH": ""})
+    resolved: dict[str, str] = {}
+    for line in output.splitlines():
+        name, arrow, target = line.strip().partition("=>")
+        if not arrow:
+            continue
+        # Drop only the trailing " (0x...)" load address so a resolved path that
+        # contains spaces is kept whole instead of being cut at the first space.
+        target = re.sub(r"\s+\(0x[0-9a-fA-F]+\)$", "", target.strip())
+        resolved[name.strip()] = "not" if target == "not found" else target
+    return resolved
+
+
+def validate(wheel: Path) -> None:
+    """Check that intra-wheel ELF dependencies resolve inside ``tvm/lib``.
+
+    The wheel is extracted to a temporary directory and every ``tvm/lib/*.so*``
+    file is inspected with ``readelf`` and ``ldd``.  Validation is skipped (with
+    a message) when ``sys.platform`` is not ``linux``.
+
+    Raises RuntimeError when ``readelf``/``ldd`` are missing, when ``tvm/lib`` or
+    ``libtvm_runtime.so`` is absent, when ``libtvm_ffi.so`` is bundled, or when
+    any per-library link check fails.
+    """
+    if sys.platform != "linux":
+        print("ELF wheel validation skipped on non-Linux platform")
+        return
+    for command in ("readelf", "ldd"):
+        if shutil.which(command) is None:
+            raise RuntimeError(f"{command} is required for ELF wheel validation")
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        root = Path(tmpdir)
+        with zipfile.ZipFile(wheel) as zf:
+            zf.extractall(root)
+
+        libdir = root / "tvm" / "lib"
+        if not libdir.is_dir():
+            raise RuntimeError(f"wheel does not contain {libdir.relative_to(root)}")
+
+        libs = {path.name: path for path in sorted(libdir.glob("*.so*")) if path.is_file()}
+        if "libtvm_runtime.so" not in libs:
+            raise RuntimeError("wheel does not contain tvm/lib/libtvm_runtime.so")
+        if "libtvm_ffi.so" in libs:
+            raise RuntimeError("TVM wheel must depend on tvm_ffi instead of bundling libtvm_ffi.so")
+
+        errors: list[str] = []
+        for lib in libs.values():
+            needed, rpaths = _dynamic_entries(lib)
+            internal_needed = sorted(name for name in needed if name in libs)
+            if internal_needed and "$ORIGIN" not in rpaths:
+                errors.append(
+                    f"{lib.relative_to(root)} needs {internal_needed} but RUNPATH/RPATH is {rpaths}"
+                )
+
+            resolved = _ldd(lib)
+            for name in internal_needed:
+                target = resolved.get(name)
+                if target is None:
+                    errors.append(f"{lib.relative_to(root)}: ldd did not report {name}")
+                    continue
+                if target == "not":
+                    errors.append(f"{lib.relative_to(root)}: {name} is not found")
+                    continue
+                if Path(target).resolve() != libs[name].resolve():
+                    errors.append(
+                        f"{lib.relative_to(root)}: {name} resolved to {target}, "
+                        f"expected {libs[name].relative_to(root)}"
+                    )
+
+            unexpected_tvm_deps = sorted(
+                name
+                for name in needed
+                if name.startswith("libtvm_")
+                and name not in libs
+                and name not in TVM_EXTERNAL_LIBS
+            )
+            if unexpected_tvm_deps:
+                errors.append(
+                    f"{lib.relative_to(root)} has unresolved TVM deps: {unexpected_tvm_deps}"
+                )
+
+        if errors:
+            raise RuntimeError("ELF wheel validation failed:\n" + "\n".join(errors))
+
+        print(f"ELF wheel validation passed for {wheel.name}")
+
+
+def main() -> int:
+    """Validate the wheel named on the command line.
+
+    Returns 0 on success, or 1 after printing the RuntimeError to stderr.
+    """
+    parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument("wheel", type=Path)
+    args = parser.parse_args()
+    try:
+        validate(args.wheel)
+    except RuntimeError as err:
+        print(err, file=sys.stderr)
+        return 1
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake
index 9314c58b952f..42bd72d3b773 100644
--- a/cmake/modules/CUDA.cmake
+++ b/cmake/modules/CUDA.cmake
@@ -79,19 +79,7 @@ if(USE_CUDA)
     RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
     ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
   )
-  if(TVM_BUILD_PYTHON_MODULE)
-    if(APPLE)
-      set_target_properties(tvm_runtime_cuda PROPERTIES
-        BUILD_RPATH "@loader_path"
-        INSTALL_RPATH "@loader_path"
-      )
-    elseif(LINUX)
-      set_target_properties(tvm_runtime_cuda PROPERTIES
-        BUILD_RPATH "\$ORIGIN"
-        INSTALL_RPATH "\$ORIGIN"
-      )
-    endif()
-  endif()
+  tvm_set_python_module_relative_rpath(tvm_runtime_cuda)
   install(TARGETS tvm_runtime_cuda DESTINATION lib${LIB_SUFFIX})
   if(TVM_BUILD_PYTHON_MODULE)
     install(TARGETS tvm_runtime_cuda DESTINATION "lib")
@@ -115,7 +103,7 @@ if(USE_CUDA AND USE_CUDNN)
   add_library(tvm_cudnn_objs OBJECT ${CONTRIB_CUDNN_SRCS})
   target_link_libraries(tvm_cudnn_objs PRIVATE tvm_runtime_extra_defs)
   target_link_libraries(tvm_runtime_extra PRIVATE tvm_cudnn_objs ${CUDA_CUDNN_LIBRARY})
-endif(USE_CUDNN)
+endif(USE_CUDA AND USE_CUDNN)
 
 if(USE_CUDA AND USE_CUDNN_FRONTEND)
   message(STATUS "Build with cuDNN Frontend support")
diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake
index 431b15b13ac6..254cda10971f 100644
--- a/cmake/modules/Hexagon.cmake
+++ b/cmake/modules/Hexagon.cmake
@@ -351,6 +351,7 @@ elseif(USE_HEXAGON)
     RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
     ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
   )
+  tvm_set_python_module_relative_rpath(tvm_runtime_hexagon)
   install(TARGETS tvm_runtime_hexagon DESTINATION lib${LIB_SUFFIX})
   if(TVM_BUILD_PYTHON_MODULE)
     install(TARGETS tvm_runtime_hexagon DESTINATION "lib")
diff --git a/cmake/modules/Metal.cmake b/cmake/modules/Metal.cmake
index 72e7585534bb..7271ccfcae2e 100644
--- a/cmake/modules/Metal.cmake
+++ b/cmake/modules/Metal.cmake
@@ -35,6 +35,7 @@ if(USE_METAL)
     RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
     ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
   )
+  tvm_set_python_module_relative_rpath(tvm_runtime_metal)
   install(TARGETS tvm_runtime_metal DESTINATION lib${LIB_SUFFIX})
   if(TVM_BUILD_PYTHON_MODULE)
     install(TARGETS tvm_runtime_metal DESTINATION "lib")
diff --git a/cmake/modules/OpenCL.cmake b/cmake/modules/OpenCL.cmake
index 9a1c20a5a5ab..76eb6381e632 100644
--- a/cmake/modules/OpenCL.cmake
+++ b/cmake/modules/OpenCL.cmake
@@ -51,6 +51,7 @@ if(USE_OPENCL)
     RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
     ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
   )
+  tvm_set_python_module_relative_rpath(tvm_runtime_opencl)
   install(TARGETS tvm_runtime_opencl DESTINATION lib${LIB_SUFFIX})
   if(TVM_BUILD_PYTHON_MODULE)
     install(TARGETS tvm_runtime_opencl DESTINATION "lib")
diff --git a/cmake/modules/ROCM.cmake b/cmake/modules/ROCM.cmake
index b974aa412959..1e04026c00d4 100644
--- a/cmake/modules/ROCM.cmake
+++ b/cmake/modules/ROCM.cmake
@@ -53,6 +53,7 @@ if(USE_ROCM)
     RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
     ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
   )
+  tvm_set_python_module_relative_rpath(tvm_runtime_rocm)
   install(TARGETS tvm_runtime_rocm DESTINATION lib${LIB_SUFFIX})
   if(TVM_BUILD_PYTHON_MODULE)
     install(TARGETS tvm_runtime_rocm DESTINATION "lib")
diff --git a/cmake/modules/Vulkan.cmake b/cmake/modules/Vulkan.cmake
index 6821b4419b1a..d08b3353b57c 100644
--- a/cmake/modules/Vulkan.cmake
+++ b/cmake/modules/Vulkan.cmake
@@ -60,6 +60,7 @@ if(USE_VULKAN)
     RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
     ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib"
   )
+  tvm_set_python_module_relative_rpath(tvm_runtime_vulkan)
   install(TARGETS tvm_runtime_vulkan DESTINATION lib${LIB_SUFFIX})
   if(TVM_BUILD_PYTHON_MODULE)
     install(TARGETS tvm_runtime_vulkan DESTINATION "lib")

From 3b12e765c7290b2547e0e17669e1e207f40025a6 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Tue, 26 May 2026 20:20:57 -0400
Subject: [PATCH 09/43] Format ELF wheel validator

---
 ci/scripts/package/validate_wheel_elf.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/ci/scripts/package/validate_wheel_elf.py b/ci/scripts/package/validate_wheel_elf.py
index 837def6a075e..84ffa4986876 100644
--- a/ci/scripts/package/validate_wheel_elf.py
+++ b/ci/scripts/package/validate_wheel_elf.py
@@ -119,9 +119,7 @@ def validate(wheel: Path) -> None:
             unexpected_tvm_deps = sorted(
                 name
                 for name in needed
-                if name.startswith("libtvm_")
-                and name not in libs
-                and name not in TVM_EXTERNAL_LIBS
+                if name.startswith("libtvm_") and name not in libs and name not in TVM_EXTERNAL_LIBS
             )
             if unexpected_tvm_deps:
                 errors.append(

From 10bd65eaa24a59bffd6c934183ae65084ff09950 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 11:21:34 -0400
Subject: [PATCH 10/43] Build TVM wheel with static LLVM

---
 .../actions/tvm-wheel-for-publish/action.yml  |  2 +-
 ci/scripts/package/README.md                  |  5 ++--
 ci/scripts/package/build_tvm_wheel.sh         | 25 +++++++++++++------
 ci/scripts/package/validate_wheel_elf.py      | 14 +++++++++++
 4 files changed, 36 insertions(+), 10 deletions(-)

diff --git a/.github/actions/tvm-wheel-for-publish/action.yml b/.github/actions/tvm-wheel-for-publish/action.yml
index 23a806b6f0f7..5646caf96397 100644
--- a/.github/actions/tvm-wheel-for-publish/action.yml
+++ b/.github/actions/tvm-wheel-for-publish/action.yml
@@ -86,7 +86,7 @@ runs:
       shell: bash -l {0}
       env:
         TVM_PYTHON: python
-        TVM_USE_LLVM: llvm-config
+        TVM_USE_LLVM: llvm-config --link-static
         TVM_USE_CUDA: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
         TVM_CUDA_ARCHITECTURES: ${{ inputs.cuda_architectures }}
         TVM_WHEEL_DIST_NAME: ${{ inputs.distribution_name }}
diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
index 6adcfe613579..4a76d6993ec2 100644
--- a/ci/scripts/package/README.md
+++ b/ci/scripts/package/README.md
@@ -77,7 +77,7 @@ python -m venv /tmp/tvm-wheel-tools
 /tmp/tvm-wheel-tools/bin/python -m pip install -U pip build auditwheel twine
 
 TVM_PYTHON=/tmp/tvm-wheel-tools/bin/python \
-TVM_USE_LLVM=/path/to/llvm-config \
+TVM_USE_LLVM="/path/to/llvm-config --link-static" \
 TVM_USE_CUDA=/usr/local/cuda-12.8 \
 TVM_WHEEL_DIST_NAME=tvm-tlopexh-test \
 ci/scripts/package/build_tvm_wheel.sh all
@@ -107,7 +107,8 @@ ci/scripts/package/build_tvm_wheel.sh upload
 
 Useful knobs:
 
-- `TVM_USE_LLVM`: LLVM config for the base wheel, default `ON`.
+- `TVM_USE_LLVM`: LLVM config for the base wheel, default
+  `llvm-config --link-static`.
 - `TVM_USE_CUDA`: CUDA root or `ON` for the sidecar build, default `ON`.
 - `TVM_CUDA_ARCHITECTURES`: CMake CUDA architectures, default `75`.
 - `TVM_WHEEL_DIST_NAME`: optional distribution rename for TestPyPI.
diff --git a/ci/scripts/package/build_tvm_wheel.sh b/ci/scripts/package/build_tvm_wheel.sh
index a7f84bc83326..7617f60184cd 100755
--- a/ci/scripts/package/build_tvm_wheel.sh
+++ b/ci/scripts/package/build_tvm_wheel.sh
@@ -27,7 +27,7 @@ TVM_INJECTED_DIST="${TVM_INJECTED_DIST:-${REPO_ROOT}/dist/tvm-injected}"
 TVM_WHEELHOUSE="${TVM_WHEELHOUSE:-${REPO_ROOT}/wheelhouse}"
 TVM_CUDA_BUILD_DIR="${TVM_CUDA_BUILD_DIR:-${REPO_ROOT}/build-wheel-cuda}"
 TVM_BASE_BUILD_DIR="${TVM_BASE_BUILD_DIR:-${REPO_ROOT}/build-wheel-base}"
-TVM_USE_LLVM="${TVM_USE_LLVM:-ON}"
+TVM_USE_LLVM="${TVM_USE_LLVM:-llvm-config --link-static}"
 TVM_USE_CUDA="${TVM_USE_CUDA:-ON}"
 TVM_CUDA_ARCHITECTURES="${TVM_CUDA_ARCHITECTURES:-75}"
 TVM_BUILD_PARALLEL_LEVEL="${CMAKE_BUILD_PARALLEL_LEVEL:-$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)}"
@@ -43,7 +43,7 @@ usage() {
 Usage: ci/scripts/package/build_tvm_wheel.sh [all|cuda|wheel|inject|repair|validate|verify|upload|verify-pypi]
 
 Environment knobs:
-  TVM_USE_LLVM                 LLVM config for the base wheel, default ON
+  TVM_USE_LLVM                 LLVM config for the base wheel, default "llvm-config --link-static"
   TVM_USE_CUDA                 CUDA root or ON for the sidecar build, default ON
   TVM_CUDA_ARCHITECTURES       CMake CUDA arch list, default 75
   TVM_WHEEL_DIST_NAME          Optional distribution rename for TestPyPI
@@ -170,9 +170,15 @@ build_base_wheel() {
   fi
 
   echo "Building base TVM wheel with LLVM=${TVM_USE_LLVM}, CUDA=OFF"
+  local cmake_args
+  printf -v cmake_args '%q ' \
+    "-DUSE_LLVM=${TVM_USE_LLVM}" \
+    "-DUSE_CUDA=OFF" \
+    "-DBUILD_TESTING=OFF" \
+    "-DTVM_BUILD_PYTHON_MODULE=ON"
   (
     cd "$TVM_RAW_DIST"
-    CMAKE_ARGS="-DUSE_LLVM=${TVM_USE_LLVM} -DUSE_CUDA=OFF -DBUILD_TESTING=OFF -DTVM_BUILD_PYTHON_MODULE=ON ${TVM_EXTRA_CMAKE_ARGS:-}" \
+    CMAKE_ARGS="${cmake_args}${TVM_EXTRA_CMAKE_ARGS:-}" \
       "$TVM_PYTHON" -m build --wheel --outdir "$TVM_RAW_DIST" \
         "${build_flags[@]}" \
         -Cbuild-dir="$TVM_BASE_BUILD_DIR" \
@@ -244,10 +250,15 @@ llvm_libdir() {
   if [[ "$TVM_USE_LLVM" == "OFF" || "$TVM_USE_LLVM" == "0" ]]; then
     return 0
   fi
-  if command -v "$TVM_USE_LLVM" >/dev/null 2>&1; then
-    "$TVM_USE_LLVM" --libdir
-  elif [[ -x "$TVM_USE_LLVM" ]]; then
-    "$TVM_USE_LLVM" --libdir
+  local -a llvm_config
+  read -r -a llvm_config <<<"$TVM_USE_LLVM"
+  if [[ "${#llvm_config[@]}" -eq 0 ]]; then
+    return 0
+  fi
+  if command -v "${llvm_config[0]}" >/dev/null 2>&1; then
+    "${llvm_config[@]}" --libdir
+  elif [[ -x "${llvm_config[0]}" ]]; then
+    "${llvm_config[@]}" --libdir
   fi
 }
 
diff --git a/ci/scripts/package/validate_wheel_elf.py b/ci/scripts/package/validate_wheel_elf.py
index 84ffa4986876..cde90bd43a45 100644
--- a/ci/scripts/package/validate_wheel_elf.py
+++ b/ci/scripts/package/validate_wheel_elf.py
@@ -91,10 +91,24 @@ def validate(wheel: Path) -> None:
             raise RuntimeError("wheel does not contain tvm/lib/libtvm_runtime.so")
         if "libtvm_ffi.so" in libs:
             raise RuntimeError("TVM wheel must depend on tvm_ffi instead of bundling libtvm_ffi.so")
+        bundled_llvm = sorted(
+            str(path.relative_to(root)) for path in root.rglob("libLLVM*.so*") if path.is_file()
+        )
+        if bundled_llvm:
+            raise RuntimeError(
+                "TVM wheel must link LLVM statically instead of bundling libLLVM: "
+                + ", ".join(bundled_llvm)
+            )
 
         errors: list[str] = []
         for lib in libs.values():
             needed, rpaths = _dynamic_entries(lib)
+            llvm_needed = sorted(name for name in needed if name.startswith("libLLVM"))
+            if llvm_needed:
+                errors.append(
+                    f"{lib.relative_to(root)} links dynamic LLVM libraries: {llvm_needed}"
+                )
+
             internal_needed = sorted(name for name in needed if name in libs)
             if internal_needed and "$ORIGIN" not in rpaths:
                 errors.append(

From a90d2e7bbc1c97934707e9269f3d3899668338e9 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 13:27:04 -0400
Subject: [PATCH 11/43] Build publish wheels across platforms

---
 .../actions/tvm-wheel-for-publish/action.yml  | 149 ++++++++++++++++--
 .github/workflows/publish_wheel.yml           |  47 +++++-
 ci/scripts/package/README.md                  |  19 ++-
 ci/scripts/package/build_tvm_wheel.sh         |  18 ++-
 4 files changed, 210 insertions(+), 23 deletions(-)

diff --git a/.github/actions/tvm-wheel-for-publish/action.yml b/.github/actions/tvm-wheel-for-publish/action.yml
index 5646caf96397..0414da34d0d5 100644
--- a/.github/actions/tvm-wheel-for-publish/action.yml
+++ b/.github/actions/tvm-wheel-for-publish/action.yml
@@ -17,9 +17,19 @@
 
 name: Build TVM Wheel For Publish
 description: >
-  Build and test a Linux TVM wheel using the CUDA sidecar packaging flow.
+  Build and test a TVM wheel for a given platform using the publish packaging flow.
 
 inputs:
+  os:
+    description: "Runner operating system (e.g., ubuntu-latest, macos-14, windows-latest)"
+    required: true
+  arch:
+    description: "Target architecture (e.g., x86_64, arm64, AMD64)"
+    required: true
+  linux_image:
+    description: "Manylinux image tag to use on Linux runners (empty string for non-Linux)"
+    required: false
+    default: ""
   checkout_ref:
     description: "Branch, tag, or SHA to check out before building"
     required: true
@@ -31,6 +41,10 @@ inputs:
     description: "CMake CUDA architectures for libtvm_runtime_cuda.so"
     required: false
     default: "75"
+  skip_cuda:
+    description: "Set to true to build a CPU-only wheel without the CUDA sidecar"
+    required: false
+    default: "false"
 
 runs:
   using: "composite"
@@ -44,6 +58,7 @@ runs:
         fetch-tags: true
 
     - name: Free runner disk space
+      if: runner.os == 'Linux'
       shell: bash -l {0}
       run: |
         set -eux
@@ -57,42 +72,154 @@ runs:
         sudo apt-get clean
         df -h
 
+    - name: Cache LLVM for manylinux build
+      if: runner.os == 'Linux'
+      uses: actions/cache@v4
+      id: llvm-cache
+      with:
+        path: /opt/llvm
+        key: tvm-wheel-llvm-22.1.0-${{ runner.os }}-${{ inputs.arch }}-v1
+
+    - name: Set up conda for LLVM cache
+      if: runner.os == 'Linux' && steps.llvm-cache.outputs.cache-hit != 'true'
+      uses: conda-incubator/setup-miniconda@fc2d68f6413eb2d87b895e92f8584b5b94a10167 # v3.3.0
+      continue-on-error: true
+      id: conda1
+      with:
+        miniforge-version: latest
+
+    - name: Set up conda for LLVM cache (retry with tar.bz2)
+      if: runner.os == 'Linux' && steps.llvm-cache.outputs.cache-hit != 'true' && steps.conda1.outcome == 'failure'
+      uses: conda-incubator/setup-miniconda@fc2d68f6413eb2d87b895e92f8584b5b94a10167 # v3.3.0
+      with:
+        miniforge-version: latest
+        use-only-tar-bz2: true
+
+    - name: Install LLVM for manylinux build
+      if: runner.os == 'Linux' && steps.llvm-cache.outputs.cache-hit != 'true'
+      shell: bash -l {0}
+      run: |
+        set -eux
+        sudo mkdir -p /opt/llvm
+        sudo chown -R "$(whoami)" /opt/llvm
+        conda create -q -p /opt/llvm -c conda-forge \
+          llvmdev=22.1.0 clangdev=22.1.0 compiler-rt=22.1.0 zlib zstd-static \
+          -y
+
     - name: Set up TVM build environment
+      if: runner.os != 'Linux'
       uses: ./.github/actions/setup
 
     - name: Install CUDA toolkit
+      if: runner.os != 'Linux' && inputs.skip_cuda != 'true'
       id: cuda-toolkit
       uses: Jimver/cuda-toolkit@6008063726ffe3309d1b22e413d9e88fed91a2f2
 
     - name: Install wheel build tools
+      if: runner.os != 'Linux'
       shell: bash -l {0}
       run: |
         set -eux
-        sudo apt-get update
-        sudo apt-get install -y patchelf
-        python -m pip install -U pip build auditwheel twine scikit-build-core
+        python -m pip install -U pip build twine scikit-build-core wheel
+        if [[ "$(uname -s)" == "Darwin" ]]; then
+          python -m pip install delocate
+        fi
 
-    - name: Print build inputs
+    - name: Print host build inputs
+      if: runner.os != 'Linux'
       shell: bash -l {0}
       run: |
         set -eux
         git log -1 --oneline
         python --version
         cmake --version
-        llvm-config --version
-        "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/nvcc" --version
+        if command -v llvm-config >/dev/null 2>&1; then
+          llvm-config --version
+        fi
+        if [[ "${{ inputs.skip_cuda }}" != "true" ]]; then
+          "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/nvcc" --version
+        fi
+
+    - name: Build, repair, and test manylinux wheel
+      if: runner.os == 'Linux'
+      shell: bash -l {0}
+      env:
+        TVM_MANYLINUX_IMAGE: ${{ inputs.linux_image }}
+        TVM_ARCH: ${{ inputs.arch }}
+        TVM_WHEEL_DIST_NAME: ${{ inputs.distribution_name }}
+        TVM_CUDA_ARCHITECTURES: ${{ inputs.cuda_architectures }}
+        TVM_SKIP_CUDA: ${{ inputs.skip_cuda == 'true' && '1' || '0' }}
+      run: |
+        set -eux
+        if [[ -z "${TVM_MANYLINUX_IMAGE}" ]]; then
+          echo "linux_image is required on Linux runners" >&2
+          exit 1
+        fi
+
+        image="quay.io/pypa/${TVM_MANYLINUX_IMAGE}_${TVM_ARCH}:latest"
+        container="tvm_wheel_build_${GITHUB_RUN_ID}_${GITHUB_RUN_ATTEMPT}"
+        docker pull "${image}"
+        docker run --name "${container}" -d \
+          --workdir /workspace \
+          --volume "${GITHUB_WORKSPACE}:/workspace" \
+          --volume /opt/llvm:/opt/llvm \
+          "${image}" tail -f /dev/null
+        trap 'docker rm -f "${container}" || true' EXIT
+
+        if [[ "${TVM_SKIP_CUDA}" != "1" ]]; then
+          cuda_rpm="cuda-repo-rhel8-13-0-local-13.0.2_580.95.05-1.${TVM_ARCH}.rpm"
+          curl -fsSLo "${cuda_rpm}" "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/${cuda_rpm}"
+          docker cp "${cuda_rpm}" "${container}:/${cuda_rpm}"
+          rm "${cuda_rpm}"
+          docker exec "${container}" bash -lc "
+            rpm -i /${cuda_rpm} && \
+            dnf clean all && \
+            dnf -y install cuda-toolkit-13-0 && \
+            rm /${cuda_rpm} && \
+            dnf clean all"
+        fi
+
+        docker exec \
+          -e TVM_PYTHON=/opt/python/cp310-cp310/bin/python \
+          -e TVM_USE_LLVM="/opt/llvm/bin/llvm-config --link-static" \
+          -e TVM_USE_CUDA=/usr/local/cuda \
+          -e TVM_CUDA_ARCHITECTURES="${TVM_CUDA_ARCHITECTURES}" \
+          -e TVM_WHEEL_DIST_NAME="${TVM_WHEEL_DIST_NAME}" \
+          -e TVM_SKIP_CUDA="${TVM_SKIP_CUDA}" \
+          -e TVM_AUDITWHEEL_PLAT="${TVM_MANYLINUX_IMAGE}_${TVM_ARCH}" \
+          -e TVM_EXPECT_WHEEL_PLATFORM_TAG="${TVM_MANYLINUX_IMAGE}_${TVM_ARCH}" \
+          -e CMAKE_BUILD_PARALLEL_LEVEL="$(nproc)" \
+          -e TVM_BUILD_PARALLEL_LEVEL="$(nproc)" \
+          "${container}" bash -lc '
+            set -eux
+            export PATH=/opt/python/cp310-cp310/bin:/opt/llvm/bin:/usr/local/cuda/bin:$PATH
+            python -m pip install -U pip build auditwheel twine scikit-build-core wheel cmake ninja
+            python -m pip install -v ./3rdparty/tvm-ffi
+            python --version
+            cmake --version
+            llvm-config --version
+            if [[ "${TVM_SKIP_CUDA}" != "1" ]]; then
+              nvcc --version
+            fi
+            ci/scripts/package/build_tvm_wheel.sh all'
+        docker exec "${container}" bash -lc \
+          "chown -R $(id -u):$(id -g) /workspace/wheelhouse /workspace/dist /workspace/build-wheel-* || true"
 
-    - name: Build, repair, and test wheel
+    - name: Build, repair, and test host wheel
+      if: runner.os != 'Linux'
       shell: bash -l {0}
       env:
         TVM_PYTHON: python
-        TVM_USE_LLVM: llvm-config --link-static
-        TVM_USE_CUDA: ${{ steps.cuda-toolkit.outputs.CUDA_PATH }}
+        TVM_USE_LLVM: ON
+        TVM_USE_CUDA: ${{ steps.cuda-toolkit.outputs.CUDA_PATH || 'OFF' }}
         TVM_CUDA_ARCHITECTURES: ${{ inputs.cuda_architectures }}
         TVM_WHEEL_DIST_NAME: ${{ inputs.distribution_name }}
+        TVM_SKIP_CUDA: ${{ inputs.skip_cuda == 'true' && '1' || '0' }}
       run: |
         set -eux
-        ci/scripts/package/build_tvm_wheel.sh cuda
+        if [[ "${TVM_SKIP_CUDA}" != "1" ]]; then
+          ci/scripts/package/build_tvm_wheel.sh cuda
+        fi
         ci/scripts/package/build_tvm_wheel.sh wheel
         ci/scripts/package/build_tvm_wheel.sh inject
         ci/scripts/package/build_tvm_wheel.sh repair
diff --git a/.github/workflows/publish_wheel.yml b/.github/workflows/publish_wheel.yml
index f5cecf589f98..6446db699b2c 100644
--- a/.github/workflows/publish_wheel.yml
+++ b/.github/workflows/publish_wheel.yml
@@ -50,9 +50,37 @@ on:
         type: boolean
 
 jobs:
-  build_wheel:
-    name: Linux x86_64 CUDA sidecar wheel
-    runs-on: ubuntu-latest
+  build_wheels:
+    name: ${{ matrix.name }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - name: Linux x86_64 CUDA sidecar wheel (manylinux_2_28)
+            os: ubuntu-latest
+            arch: x86_64
+            linux_image: manylinux_2_28
+            skip_cuda: "false"
+            artifact_suffix: linux-x86_64-manylinux_2_28
+          - name: Linux aarch64 CUDA sidecar wheel (manylinux_2_28)
+            os: ubuntu-24.04-arm
+            arch: aarch64
+            linux_image: manylinux_2_28
+            skip_cuda: "false"
+            artifact_suffix: linux-aarch64-manylinux_2_28
+          - name: macOS arm64 CPU wheel
+            os: macos-14
+            arch: arm64
+            linux_image: ""
+            skip_cuda: "true"
+            artifact_suffix: macos-arm64
+          - name: Windows AMD64 CPU wheel
+            os: windows-latest
+            arch: AMD64
+            linux_image: ""
+            skip_cuda: "true"
+            artifact_suffix: windows-amd64
     steps:
       - name: Checkout repository for local action
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -62,19 +90,23 @@ jobs:
       - name: Build TVM wheel
         uses: ./.github/actions/tvm-wheel-for-publish
         with:
+          os: ${{ matrix.os }}
+          arch: ${{ matrix.arch }}
+          linux_image: ${{ matrix.linux_image }}
           checkout_ref: ${{ inputs.tag }}
           distribution_name: ${{ inputs.distribution_name }}
           cuda_architectures: ${{ inputs.cuda_architectures }}
+          skip_cuda: ${{ matrix.skip_cuda }}
 
       - name: Upload wheel artifact
         uses: actions/upload-artifact@v4
         with:
-          name: tvm-wheel-linux-x86_64
+          name: tvm-wheel-${{ matrix.artifact_suffix }}
           path: wheelhouse/*.whl
 
   upload_pypi:
     name: Upload package distributions
-    needs: [build_wheel]
+    needs: [build_wheels]
     if: ${{ inputs.publish_repository != 'none' }}
     runs-on: ubuntu-latest
     environment: ${{ inputs.publish_repository }}
@@ -84,8 +116,9 @@ jobs:
     steps:
       - uses: actions/download-artifact@v4
         with:
-          name: tvm-wheel-linux-x86_64
+          pattern: tvm-wheel-*
           path: dist
+          merge-multiple: true
 
       - name: Generate artifact attestation for wheels
         uses: actions/attest-build-provenance@v1
@@ -123,7 +156,7 @@ jobs:
 
       - uses: actions/download-artifact@v4
         with:
-          name: tvm-wheel-linux-x86_64
+          name: tvm-wheel-linux-x86_64-manylinux_2_28
           path: wheelhouse
 
       - name: Set up Python
diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
index 4a76d6993ec2..489ff1317194 100644
--- a/ci/scripts/package/README.md
+++ b/ci/scripts/package/README.md
@@ -17,7 +17,7 @@
 
 # TVM wheel packaging helper
 
-This helper follows the CUDA-sidecar packaging flow used for local release
+This helper follows the CUDA-sidecar packaging flow used for release
 validation:
 
 1. Build `libtvm_runtime_cuda.so` in a CUDA-enabled CMake build.
@@ -40,12 +40,22 @@ GitHub Actions flow:
 1. Create a tag that contains these packaging files.
 2. Open the `Publish TVM wheel` workflow in GitHub Actions.
 3. Fill `tag` with that tag.
-4. For a TestPyPI run, set `publish_repository=testpypi` and set
+4. The workflow builds a platform wheel matrix:
+   - Linux x86_64 in a `manylinux_2_28` container, with the CUDA sidecar.
+   - Linux aarch64 in a `manylinux_2_28` container, with the CUDA sidecar.
+   - macOS arm64 CPU-only.
+   - Windows AMD64 CPU-only.
+5. For a TestPyPI run, set `publish_repository=testpypi` and set
    `distribution_name` to a temporary package name such as
    `tvm-yourname-test`.
-5. After the workflow build, upload, and `verify_pypi` jobs pass, run it again
+6. After the workflow build, upload, and `verify_pypi` jobs pass, run it again
    with the final tag/name and `publish_repository=pypi`.
 
+Linux wheels are built inside a manylinux image, following the TVM-FFI
+packaging pattern. This avoids accidentally publishing a wheel tagged for the
+GitHub runner's host glibc, such as `manylinux_2_39`, which would not install
+on older supported Linux systems.
+
 To test this from the fork `tlopex/tvm` without publishing:
 
 ```bash
@@ -116,5 +126,8 @@ Useful knobs:
 - `TVM_SKIP_REPAIR=1`: leave the injected wheel unrepaired.
 - `TVM_SKIP_CUDA=1`: build a base wheel without a CUDA sidecar.
 - `TVM_KEEP_BUILD_DIRS=1`: reuse the CMake build directories.
+- `TVM_AUDITWHEEL_PLAT`: optional `auditwheel repair --plat` override.
+- `TVM_EXPECT_WHEEL_PLATFORM_TAG`: require the final wheel filename to include
+  a specific platform tag, such as `manylinux_2_28_x86_64`.
 - `TVM_TEST_INDEX_URL`: package index for `verify-pypi`, default TestPyPI.
 - `TVM_EXTRA_INDEX_URL`: extra package index for dependencies, default PyPI.
diff --git a/ci/scripts/package/build_tvm_wheel.sh b/ci/scripts/package/build_tvm_wheel.sh
index 7617f60184cd..8cd2db788ac8 100755
--- a/ci/scripts/package/build_tvm_wheel.sh
+++ b/ci/scripts/package/build_tvm_wheel.sh
@@ -30,7 +30,7 @@ TVM_BASE_BUILD_DIR="${TVM_BASE_BUILD_DIR:-${REPO_ROOT}/build-wheel-base}"
 TVM_USE_LLVM="${TVM_USE_LLVM:-llvm-config --link-static}"
 TVM_USE_CUDA="${TVM_USE_CUDA:-ON}"
 TVM_CUDA_ARCHITECTURES="${TVM_CUDA_ARCHITECTURES:-75}"
-TVM_BUILD_PARALLEL_LEVEL="${CMAKE_BUILD_PARALLEL_LEVEL:-$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)}"
+TVM_BUILD_PARALLEL_LEVEL="${TVM_BUILD_PARALLEL_LEVEL:-${CMAKE_BUILD_PARALLEL_LEVEL:-$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)}}"
 TVM_WHEEL_DIST_NAME="${TVM_WHEEL_DIST_NAME:-}"
 TVM_WHEEL_DIST_VERSION="${TVM_WHEEL_DIST_VERSION:-}"
 TVM_SKIP_CUDA="${TVM_SKIP_CUDA:-0}"
@@ -53,6 +53,9 @@ Environment knobs:
   TVM_SKIP_REPAIR=1            Keep injected wheel as final wheel
   TVM_BUILD_NO_ISOLATION=1     Pass --no-isolation to python -m build
   TVM_KEEP_BUILD_DIRS=1        Reuse CMake build dirs instead of cleaning them
+  TVM_AUDITWHEEL_PLAT          Optional auditwheel --plat value
+  TVM_EXPECT_WHEEL_PLATFORM_TAG
+                                Require the final wheel filename to include this tag
   TVM_TEST_INDEX_URL           Package index for verify-pypi, default TestPyPI
   TVM_EXTRA_INDEX_URL          Extra package index for dependencies, default PyPI
 EOF
@@ -283,12 +286,17 @@ repair_wheel() {
       mapfile -t exclude_args < <(auditwheel_excludes "$cuda_lib")
       echo "Repairing Linux wheel with auditwheel"
       (
+        auditwheel_plat_args=()
+        if [[ -n "${TVM_AUDITWHEEL_PLAT:-}" ]]; then
+          auditwheel_plat_args+=(--plat "$TVM_AUDITWHEEL_PLAT")
+        fi
         llvm_dir="$(llvm_libdir || true)"
         if [[ -n "${llvm_dir:-}" && -d "$llvm_dir" ]]; then
           echo "Adding LLVM libdir to LD_LIBRARY_PATH for auditwheel: ${llvm_dir}"
           export LD_LIBRARY_PATH="${llvm_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
         fi
-        auditwheel repair "${exclude_args[@]}" -w "$TVM_WHEELHOUSE" "$injected_wheel"
+        auditwheel repair "${auditwheel_plat_args[@]}" "${exclude_args[@]}" \
+          -w "$TVM_WHEELHOUSE" "$injected_wheel"
       )
       ;;
     Darwin)
@@ -320,6 +328,12 @@ validate_wheel_elf() {
 verify_wheel() {
   local final_wheel
   final_wheel="$(single_wheel "$TVM_WHEELHOUSE")"
+  if [[ -n "${TVM_EXPECT_WHEEL_PLATFORM_TAG:-}" ]]; then
+    if [[ "$(basename "$final_wheel")" != *"${TVM_EXPECT_WHEEL_PLATFORM_TAG}"* ]]; then
+      echo "error: expected final wheel tag ${TVM_EXPECT_WHEEL_PLATFORM_TAG}, got ${final_wheel}" >&2
+      return 1
+    fi
+  fi
   validate_wheel_elf
 
   local venv="${TVM_VERIFY_VENV:-${REPO_ROOT}/build-wheel-verify-venv}"

From 74ae3e23741e274cc73afed08bf06b984abc86e7 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 13:32:08 -0400
Subject: [PATCH 12/43] Avoid empty array expansion in wheel build

---
 ci/scripts/package/build_tvm_wheel.sh | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/ci/scripts/package/build_tvm_wheel.sh b/ci/scripts/package/build_tvm_wheel.sh
index 8cd2db788ac8..7d662b555959 100755
--- a/ci/scripts/package/build_tvm_wheel.sh
+++ b/ci/scripts/package/build_tvm_wheel.sh
@@ -167,11 +167,6 @@ build_base_wheel() {
     rm -rf "$TVM_BASE_BUILD_DIR"
   fi
 
-  local build_flags=()
-  if [[ "$TVM_BUILD_NO_ISOLATION" == "1" ]]; then
-    build_flags+=(--no-isolation)
-  fi
-
   echo "Building base TVM wheel with LLVM=${TVM_USE_LLVM}, CUDA=OFF"
   local cmake_args
   printf -v cmake_args '%q ' \
@@ -181,11 +176,18 @@ build_base_wheel() {
     "-DTVM_BUILD_PYTHON_MODULE=ON"
   (
     cd "$TVM_RAW_DIST"
-    CMAKE_ARGS="${cmake_args}${TVM_EXTRA_CMAKE_ARGS:-}" \
-      "$TVM_PYTHON" -m build --wheel --outdir "$TVM_RAW_DIST" \
-        "${build_flags[@]}" \
-        -Cbuild-dir="$TVM_BASE_BUILD_DIR" \
-        "$REPO_ROOT"
+    if [[ "$TVM_BUILD_NO_ISOLATION" == "1" ]]; then
+      CMAKE_ARGS="${cmake_args}${TVM_EXTRA_CMAKE_ARGS:-}" \
+        "$TVM_PYTHON" -m build --wheel --outdir "$TVM_RAW_DIST" \
+          --no-isolation \
+          -Cbuild-dir="$TVM_BASE_BUILD_DIR" \
+          "$REPO_ROOT"
+    else
+      CMAKE_ARGS="${cmake_args}${TVM_EXTRA_CMAKE_ARGS:-}" \
+        "$TVM_PYTHON" -m build --wheel --outdir "$TVM_RAW_DIST" \
+          -Cbuild-dir="$TVM_BASE_BUILD_DIR" \
+          "$REPO_ROOT"
+    fi
   )
 
   single_wheel "$TVM_RAW_DIST" >/dev/null

From 1b5724dcf27bf2f0cef5f07ff83427b8dd3e2db5 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 13:41:47 -0400
Subject: [PATCH 13/43] Add zlib to wheel build conda environment

---
 tests/conda/build-environment.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/conda/build-environment.yaml b/tests/conda/build-environment.yaml
index ebd45ff4c422..e772a12e5409 100644
--- a/tests/conda/build-environment.yaml
+++ b/tests/conda/build-environment.yaml
@@ -33,6 +33,7 @@ dependencies:
   - pip
   - git
   - bzip2
+  - zlib
   - pytest
   - numpy
   - scipy

From 0d31684dcc6bc3a74c0c9146d70ea8b4935e649f Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 13:54:56 -0400
Subject: [PATCH 14/43] Add LLVM prefix path for manylinux wheel builds

---
 .github/actions/tvm-wheel-for-publish/action.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/actions/tvm-wheel-for-publish/action.yml b/.github/actions/tvm-wheel-for-publish/action.yml
index 0414da34d0d5..16776b620e32 100644
--- a/.github/actions/tvm-wheel-for-publish/action.yml
+++ b/.github/actions/tvm-wheel-for-publish/action.yml
@@ -188,11 +188,13 @@ runs:
           -e TVM_SKIP_CUDA="${TVM_SKIP_CUDA}" \
           -e TVM_AUDITWHEEL_PLAT="${TVM_MANYLINUX_IMAGE}_${TVM_ARCH}" \
           -e TVM_EXPECT_WHEEL_PLATFORM_TAG="${TVM_MANYLINUX_IMAGE}_${TVM_ARCH}" \
+          -e CMAKE_PREFIX_PATH=/opt/llvm \
           -e CMAKE_BUILD_PARALLEL_LEVEL="$(nproc)" \
           -e TVM_BUILD_PARALLEL_LEVEL="$(nproc)" \
           "${container}" bash -lc '
             set -eux
             export PATH=/opt/python/cp310-cp310/bin:/opt/llvm/bin:/usr/local/cuda/bin:$PATH
+            export CMAKE_PREFIX_PATH=/opt/llvm${CMAKE_PREFIX_PATH:+:$CMAKE_PREFIX_PATH}
             python -m pip install -U pip build auditwheel twine scikit-build-core wheel cmake ninja
             python -m pip install -v ./3rdparty/tvm-ffi
             python --version

From 32f534925e5ee3bf5cac93199bc3b4fc92d7ac1f Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 14:23:16 -0400
Subject: [PATCH 15/43] Keep TVM wheel LLVM linkage static

---
 .../actions/tvm-wheel-for-publish/action.yml  | 17 +++++++
 ci/scripts/package/README.md                  | 13 +++--
 ci/scripts/package/build_tvm_wheel.sh         | 49 ++++++++++++++++---
 cmake/utils/FindLLVM.cmake                    | 44 +++++++++++++----
 4 files changed, 101 insertions(+), 22 deletions(-)

diff --git a/.github/actions/tvm-wheel-for-publish/action.yml b/.github/actions/tvm-wheel-for-publish/action.yml
index 16776b620e32..445dd1d0eb3a 100644
--- a/.github/actions/tvm-wheel-for-publish/action.yml
+++ b/.github/actions/tvm-wheel-for-publish/action.yml
@@ -125,6 +125,21 @@ runs:
           python -m pip install delocate
         fi
 
+    - name: Detect host CPU count
+      if: runner.os != 'Linux'
+      id: host-env
+      shell: bash -l {0}
+      run: |
+        set -eux
+        if command -v nproc >/dev/null 2>&1; then
+          cpu_count="$(nproc)"
+        elif [[ "$(uname -s)" == "Darwin" ]]; then
+          cpu_count="$(sysctl -n hw.ncpu)"
+        else
+          cpu_count="${NUMBER_OF_PROCESSORS:-4}"
+        fi
+        echo "cpu_count=${cpu_count}" >> "${GITHUB_OUTPUT}"
+
     - name: Print host build inputs
       if: runner.os != 'Linux'
       shell: bash -l {0}
@@ -217,6 +232,8 @@ runs:
         TVM_CUDA_ARCHITECTURES: ${{ inputs.cuda_architectures }}
         TVM_WHEEL_DIST_NAME: ${{ inputs.distribution_name }}
         TVM_SKIP_CUDA: ${{ inputs.skip_cuda == 'true' && '1' || '0' }}
+        CMAKE_BUILD_PARALLEL_LEVEL: ${{ steps.host-env.outputs.cpu_count }}
+        TVM_BUILD_PARALLEL_LEVEL: ${{ steps.host-env.outputs.cpu_count }}
       run: |
         set -eux
         if [[ "${TVM_SKIP_CUDA}" != "1" ]]; then
diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
index 489ff1317194..90e70363f729 100644
--- a/ci/scripts/package/README.md
+++ b/ci/scripts/package/README.md
@@ -25,15 +25,18 @@ validation:
 3. Inject the CUDA runtime DSO into `tvm/lib/` inside the wheel.
 4. Repair the wheel, excluding CUDA driver/runtime DSOs and `libtvm_ffi`.
 5. Validate ELF links so intra-wheel TVM DSOs resolve through relative rpaths.
+   LLVM is expected to be linked statically; the final wheel must not bundle
+   or dynamically depend on `libLLVM`.
 6. Verify the wheel in a fresh virtualenv.
 7. Upload with `twine`.
 
-It mirrors the TVM-FFI packaging patterns in:
+It mirrors the TVM-FFI packaging patterns from the
+`apache/tvm-ffi` repository, especially:
 
-- `tvm-ffi/.github/actions/build-wheel-for-publish/action.yml`
-- `tvm-ffi/.github/workflows/publish_wheel.yml`
-- `tvm-ffi/addons/tvm_ffi_orcjit/pyproject.toml`
-- `tvm-ffi/addons/torch_c_dlpack_ext/build_aot_wheels.sh`
+- `apache/tvm-ffi/.github/workflows/publish_wheel.yml`
+- `apache/tvm-ffi/.github/actions/build-wheel-for-publish/action.yml`
+- `apache/tvm-ffi/.github/actions/build-orcjit-wheel/action.yml`
+- `apache/tvm-ffi/addons/tvm_ffi_orcjit/pyproject.toml`
 
 GitHub Actions flow:
 
diff --git a/ci/scripts/package/build_tvm_wheel.sh b/ci/scripts/package/build_tvm_wheel.sh
index 7d662b555959..c47a5bbf0a12 100755
--- a/ci/scripts/package/build_tvm_wheel.sh
+++ b/ci/scripts/package/build_tvm_wheel.sh
@@ -81,7 +81,11 @@ require_pypa_build() {
 
 single_wheel() {
   local dir="$1"
-  mapfile -t wheels < <(find "$dir" -maxdepth 1 -type f -name '*.whl' | sort)
+  local wheels=()
+  local wheel
+  while IFS= read -r wheel; do
+    wheels+=("$wheel")
+  done < <(find "$dir" -maxdepth 1 -type f -name '*.whl' | sort)
   if [[ "${#wheels[@]}" -ne 1 ]]; then
     echo "error: expected exactly one wheel under ${dir}, found ${#wheels[@]}" >&2
     printf '%s\n' "${wheels[@]}" >&2
@@ -169,11 +173,7 @@ build_base_wheel() {
 
   echo "Building base TVM wheel with LLVM=${TVM_USE_LLVM}, CUDA=OFF"
   local cmake_args
-  printf -v cmake_args '%q ' \
-    "-DUSE_LLVM=${TVM_USE_LLVM}" \
-    "-DUSE_CUDA=OFF" \
-    "-DBUILD_TESTING=OFF" \
-    "-DTVM_BUILD_PYTHON_MODULE=ON"
+  cmake_args="$(base_cmake_args)"
   (
     cd "$TVM_RAW_DIST"
     if [[ "$TVM_BUILD_NO_ISOLATION" == "1" ]]; then
@@ -267,6 +267,37 @@ llvm_libdir() {
   fi
 }
 
+llvm_prefix() {
+  if [[ "$TVM_USE_LLVM" == "OFF" || "$TVM_USE_LLVM" == "0" ]]; then
+    return 0
+  fi
+  local -a llvm_config
+  read -r -a llvm_config <<<"$TVM_USE_LLVM"
+  if [[ "${#llvm_config[@]}" -eq 0 ]]; then
+    return 0
+  fi
+  if command -v "${llvm_config[0]}" >/dev/null 2>&1 || [[ -x "${llvm_config[0]}" ]]; then
+    "${llvm_config[@]}" --prefix
+  fi
+}
+
+base_cmake_args() {
+  local llvm_prefix_dir
+  llvm_prefix_dir="$(llvm_prefix || true)"
+  local args=(
+    "-DUSE_LLVM=${TVM_USE_LLVM}"
+    "-DUSE_CUDA=OFF"
+    "-DBUILD_TESTING=OFF"
+    "-DTVM_BUILD_PYTHON_MODULE=ON"
+  )
+  if [[ -n "$llvm_prefix_dir" && -d "$llvm_prefix_dir" ]]; then
+    # scikit-build-core writes its own CMAKE_PREFIX_PATH init cache, so pass
+    # the LLVM prefix as an explicit CMake argument.
+    args+=("-DCMAKE_PREFIX_PATH=${llvm_prefix_dir}")
+  fi
+  printf '%q ' "${args[@]}"
+}
+
 repair_wheel() {
   rm -rf "$TVM_WHEELHOUSE"
   mkdir -p "$TVM_WHEELHOUSE"
@@ -285,7 +316,11 @@ repair_wheel() {
       require_cmd auditwheel
       local cuda_lib
       cuda_lib="$(cuda_runtime_path || true)"
-      mapfile -t exclude_args < <(auditwheel_excludes "$cuda_lib")
+      local exclude_args=()
+      local exclude_arg
+      while IFS= read -r exclude_arg; do
+        exclude_args+=("$exclude_arg")
+      done < <(auditwheel_excludes "$cuda_lib")
       echo "Repairing Linux wheel with auditwheel"
       (
         auditwheel_plat_args=()
diff --git a/cmake/utils/FindLLVM.cmake b/cmake/utils/FindLLVM.cmake
index 8aa9c8b1b959..98540dbb5994 100644
--- a/cmake/utils/FindLLVM.cmake
+++ b/cmake/utils/FindLLVM.cmake
@@ -122,6 +122,12 @@ macro(find_llvm use_llvm)
       message(FATAL_ERROR "Fatal error executing: ${LLVM_CONFIG} --libdir")
     endif()
     message(STATUS "LLVM libdir: ${__llvm_libdir}")
+    set(__llvm_lib_hints
+      "${__llvm_libdir}"
+      "${__llvm_prefix}/lib"
+      "${__llvm_prefix}/lib64"
+      "${__llvm_prefix}/Library/lib"
+    )
     execute_process(COMMAND ${LLVM_CONFIG} --cmakedir
       RESULT_VARIABLE __llvm_exit_code
       OUTPUT_VARIABLE __llvm_cmakedir
@@ -193,18 +199,36 @@ macro(find_llvm use_llvm)
         message(STATUS "LLVM links against math")
         list(APPEND LLVM_LIBS "m")
       elseif(("${__flag}" STREQUAL "-lz") OR ("${__flag}" STREQUAL "z.lib"))
-        message(STATUS "LLVM links against zlib")
-        find_package(ZLIB REQUIRED)
-        list(APPEND LLVM_LIBS "ZLIB::ZLIB")
+        find_library(ZLIB_STATIC
+          NAMES libz.a zlibstatic z
+          HINTS ${__llvm_lib_hints}
+          NO_DEFAULT_PATH)
+        if (ZLIB_STATIC)
+          message(STATUS "LLVM links against static zlib: ${ZLIB_STATIC}")
+          list(APPEND LLVM_LIBS "${ZLIB_STATIC}")
+        else()
+          message(STATUS "LLVM links against zlib")
+          find_package(ZLIB REQUIRED)
+          list(APPEND LLVM_LIBS "ZLIB::ZLIB")
+        endif()
       elseif("${__flag}" STREQUAL "-lzstd")
-        list(APPEND CMAKE_MODULE_PATH "${__llvm_cmakedir}")
-        find_package(zstd REQUIRED)
-        if (TARGET "zstd::libzstd_static")
-          message(STATUS "LLVM links against static zstd")
-          list(APPEND LLVM_LIBS "zstd::libzstd_static")
+        find_library(ZSTD_STATIC
+          NAMES libzstd.a zstd_static zstd
+          HINTS ${__llvm_lib_hints}
+          NO_DEFAULT_PATH)
+        if (ZSTD_STATIC)
+          message(STATUS "LLVM links against static zstd: ${ZSTD_STATIC}")
+          list(APPEND LLVM_LIBS "${ZSTD_STATIC}")
         else()
-          message(STATUS "LLVM links against shared zstd")
-          list(APPEND LLVM_LIBS "zstd::libzstd_shared")
+          list(APPEND CMAKE_MODULE_PATH "${__llvm_cmakedir}")
+          find_package(zstd REQUIRED)
+          if (TARGET "zstd::libzstd_static")
+            message(STATUS "LLVM links against static zstd")
+            list(APPEND LLVM_LIBS "zstd::libzstd_static")
+          else()
+            message(STATUS "LLVM links against shared zstd")
+            list(APPEND LLVM_LIBS "zstd::libzstd_shared")
+          endif()
         endif()
       elseif("${__flag}" STREQUAL "-lxml2")
         message(STATUS "LLVM links against xml2")

From bc2a7e4db5d5346e4e7b956871dd365a24163460 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 14:30:00 -0400
Subject: [PATCH 16/43] Use static LLVM for host wheel builds

---
 .github/actions/tvm-wheel-for-publish/action.yml | 3 ++-
 tests/conda/build-environment.yaml               | 1 +
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/.github/actions/tvm-wheel-for-publish/action.yml b/.github/actions/tvm-wheel-for-publish/action.yml
index 445dd1d0eb3a..8d0c4c27f07a 100644
--- a/.github/actions/tvm-wheel-for-publish/action.yml
+++ b/.github/actions/tvm-wheel-for-publish/action.yml
@@ -150,6 +150,7 @@ runs:
         cmake --version
         if command -v llvm-config >/dev/null 2>&1; then
           llvm-config --version
+          llvm-config --link-static --system-libs
         fi
         if [[ "${{ inputs.skip_cuda }}" != "true" ]]; then
           "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/nvcc" --version
@@ -227,7 +228,7 @@ runs:
       shell: bash -l {0}
       env:
         TVM_PYTHON: python
-        TVM_USE_LLVM: ON
+        TVM_USE_LLVM: llvm-config --link-static
         TVM_USE_CUDA: ${{ steps.cuda-toolkit.outputs.CUDA_PATH || 'OFF' }}
         TVM_CUDA_ARCHITECTURES: ${{ inputs.cuda_architectures }}
         TVM_WHEEL_DIST_NAME: ${{ inputs.distribution_name }}
diff --git a/tests/conda/build-environment.yaml b/tests/conda/build-environment.yaml
index e772a12e5409..3b2c4dd16751 100644
--- a/tests/conda/build-environment.yaml
+++ b/tests/conda/build-environment.yaml
@@ -34,6 +34,7 @@ dependencies:
   - git
   - bzip2
   - zlib
+  - zstd-static
   - pytest
   - numpy
   - scipy

From 59c47bdce91fe6e2e440188e7b1218a359fa8b08 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 14:53:26 -0400
Subject: [PATCH 17/43] Verify platform-specific TVM runtime library

---
 ci/scripts/package/build_tvm_wheel.sh | 33 +++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/ci/scripts/package/build_tvm_wheel.sh b/ci/scripts/package/build_tvm_wheel.sh
index c47a5bbf0a12..761b21d87356 100755
--- a/ci/scripts/package/build_tvm_wheel.sh
+++ b/ci/scripts/package/build_tvm_wheel.sh
@@ -386,15 +386,27 @@ verify_wheel() {
   "$venv_python" -m pip install --extra-index-url "${TVM_EXTRA_INDEX_URL:-https://pypi.org/simple}" "$final_wheel"
   "$venv_python" - <<'PY'
 from pathlib import Path
+import sys
 import tvm
 
 root = Path(tvm.__file__).resolve().parent
+libdir = root / "lib"
+if sys.platform == "darwin":
+    runtime_lib = libdir / "libtvm_runtime.dylib"
+    cuda_sidecar = libdir / "libtvm_runtime_cuda.dylib"
+elif sys.platform == "win32":
+    runtime_lib = libdir / "tvm_runtime.dll"
+    cuda_sidecar = libdir / "tvm_runtime_cuda.dll"
+else:
+    runtime_lib = libdir / "libtvm_runtime.so"
+    cuda_sidecar = libdir / "libtvm_runtime_cuda.so"
+
 print("tvm version:", tvm.__version__)
 print("tvm package:", root)
 print("llvm enabled:", tvm.runtime.enabled("llvm"))
 print("cuda runtime enabled:", tvm.runtime.enabled("cuda"))
-assert (root / "lib" / "libtvm_runtime.so").exists()
-cuda_sidecar = root / "lib" / "libtvm_runtime_cuda.so"
+print("runtime library:", runtime_lib)
+assert runtime_lib.exists()
 print("cuda sidecar present:", cuda_sidecar.exists())
 PY
 }
@@ -434,15 +446,28 @@ verify_pypi_wheel() {
     "${package_name}==${package_version}"
   "$venv_python" - <<'PY'
 from pathlib import Path
+import sys
 import tvm
 
 root = Path(tvm.__file__).resolve().parent
+libdir = root / "lib"
+if sys.platform == "darwin":
+    runtime_lib = libdir / "libtvm_runtime.dylib"
+    cuda_sidecar = libdir / "libtvm_runtime_cuda.dylib"
+elif sys.platform == "win32":
+    runtime_lib = libdir / "tvm_runtime.dll"
+    cuda_sidecar = libdir / "tvm_runtime_cuda.dll"
+else:
+    runtime_lib = libdir / "libtvm_runtime.so"
+    cuda_sidecar = libdir / "libtvm_runtime_cuda.so"
+
 print("tvm version:", tvm.__version__)
 print("tvm package:", root)
 print("llvm enabled:", tvm.runtime.enabled("llvm"))
 print("cuda runtime enabled:", tvm.runtime.enabled("cuda"))
-assert (root / "lib" / "libtvm_runtime.so").exists()
-print("cuda sidecar present:", (root / "lib" / "libtvm_runtime_cuda.so").exists())
+print("runtime library:", runtime_lib)
+assert runtime_lib.exists()
+print("cuda sidecar present:", cuda_sidecar.exists())
 PY
 }
 

From 0e954605bcf204ba26ee10dd0c491b933c4e6ca6 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 15:53:36 -0400
Subject: [PATCH 18/43] Use cibuildwheel for publish wheel builds

---
 .github/actions/build-cuda-sidecar/action.yml |  70 +++++
 .../build-wheel-for-publish/action.yml        | 207 +++++++++++++++
 .github/actions/detect-env-vars/action.yml    |  38 +++
 .../actions/tvm-wheel-for-publish/action.yml  | 246 ------------------
 .github/workflows/publish_wheel.yml           |  22 +-
 .gitignore                                    |   2 +
 ci/scripts/package/README.md                  |  23 +-
 ci/scripts/package/build_tvm_wheel.sh         | 188 ++++++++-----
 ci/scripts/package/inject_cuda_runtime.py     |   4 +-
 ci/scripts/package/verify_tvm_install.py      |  52 ++++
 cmake/utils/FindLLVM.cmake                    |  14 +-
 11 files changed, 536 insertions(+), 330 deletions(-)
 create mode 100644 .github/actions/build-cuda-sidecar/action.yml
 create mode 100644 .github/actions/build-wheel-for-publish/action.yml
 create mode 100644 .github/actions/detect-env-vars/action.yml
 delete mode 100644 .github/actions/tvm-wheel-for-publish/action.yml
 create mode 100644 ci/scripts/package/verify_tvm_install.py

diff --git a/.github/actions/build-cuda-sidecar/action.yml b/.github/actions/build-cuda-sidecar/action.yml
new file mode 100644
index 000000000000..8aeb81419f88
--- /dev/null
+++ b/.github/actions/build-cuda-sidecar/action.yml
@@ -0,0 +1,70 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Build CUDA Sidecar
+description: Build libtvm_runtime_cuda for the TVM wheel packaging flow.
+
+inputs:
+  arch:
+    description: "Target architecture (e.g., x86_64, aarch64, arm64, AMD64)"
+    required: true
+  linux_image:
+    description: "Manylinux image tag to use on Linux runners"
+    required: false
+    default: ""
+  cuda_architectures:
+    description: "CMake CUDA architectures for libtvm_runtime_cuda.so"
+    required: false
+    default: "75"
+  skip_cuda:
+    description: "Set to true to skip the CUDA sidecar"
+    required: false
+    default: "false"
+
+runs:
+  using: "composite"
+  steps:
+    - uses: ./.github/actions/detect-env-vars  # supplies outputs.cpu_count for the parallel-level variables below
+      id: env_vars
+
+    - name: Build CUDA sidecar in manylinux
+      if: runner.os == 'Linux' && inputs.skip_cuda != 'true'
+      shell: bash -l {0}
+      env:
+        TVM_MANYLINUX_IMAGE: ${{ inputs.linux_image }}
+        TVM_ARCH: ${{ inputs.arch }}
+        TVM_CUDA_ARCHITECTURES: ${{ inputs.cuda_architectures }}
+        TVM_SKIP_CUDA: "0"  # constant: this step only runs when skip_cuda is not 'true'
+        TVM_BUILD_PARALLEL_LEVEL: ${{ steps.env_vars.outputs.cpu_count }}
+        CMAKE_BUILD_PARALLEL_LEVEL: ${{ steps.env_vars.outputs.cpu_count }}
+      run: ci/scripts/package/build_tvm_wheel.sh manylinux-cuda
+
+    - name: Install CUDA toolkit
+      if: runner.os != 'Linux' && inputs.skip_cuda != 'true'  # non-Linux runners install the toolkit on the host
+      id: cuda-toolkit
+      uses: Jimver/cuda-toolkit@6008063726ffe3309d1b22e413d9e88fed91a2f2  # NOTE(review): pinned SHA has no version annotation
+
+    - name: Build CUDA sidecar on host
+      if: runner.os != 'Linux' && inputs.skip_cuda != 'true'
+      shell: bash -l {0}
+      env:
+        TVM_USE_CUDA: ${{ steps.cuda-toolkit.outputs.CUDA_PATH || 'ON' }}  # falls back to ON when the step reports no CUDA_PATH
+        TVM_CUDA_ARCHITECTURES: ${{ inputs.cuda_architectures }}
+        TVM_SKIP_CUDA: "0"  # constant: this step only runs when skip_cuda is not 'true'
+        TVM_BUILD_PARALLEL_LEVEL: ${{ steps.env_vars.outputs.cpu_count }}
+        CMAKE_BUILD_PARALLEL_LEVEL: ${{ steps.env_vars.outputs.cpu_count }}
+      run: ci/scripts/package/build_tvm_wheel.sh cuda
diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
new file mode 100644
index 000000000000..297ce2ab4fe5
--- /dev/null
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -0,0 +1,207 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Build Wheel For Publish
+description: >
+  Build and test the LLVM-enabled TVM wheel for a given OS/architecture
+  combination using cibuildwheel.
+
+inputs:
+  arch:
+    description: "Target architecture for cibuildwheel (e.g., x86_64, aarch64, arm64, AMD64)"
+    required: true
+  build:
+    description: "cibuildwheel build selector (e.g., cp310-manylinux_x86_64)"
+    required: true
+  linux_image:
+    description: "Manylinux image tag to use on Linux runners"
+    required: false
+    default: ""
+  distribution_name:
+    description: "Optional wheel distribution name override, useful for TestPyPI"
+    required: false
+    default: ""
+  cuda_architectures:
+    description: "CMake CUDA architectures for libtvm_runtime_cuda.so"
+    required: false
+    default: "75"
+  skip_cuda:
+    description: "Set to true to build a CPU-only wheel without the CUDA sidecar"
+    required: false
+    default: "false"
+
+runs:
+  using: "composite"
+  steps:
+    - uses: ./.github/actions/detect-env-vars
+      id: env_vars
+
+    - name: Detect wheel inputs
+      id: wheel_inputs
+      shell: bash -l {0}
+      run: |
+        set -eux
+        wheel_platform_tag=""  # stays empty on macOS and Windows runners
+        if [[ "${RUNNER_OS}" == "Linux" ]]; then
+          if [[ -z "${{ inputs.linux_image }}" ]]; then
+            echo "linux_image is required on Linux runners" >&2
+            exit 1
+          fi
+          wheel_platform_tag="${{ inputs.linux_image }}_${{ inputs.arch }}"  # e.g. manylinux_2_28_x86_64
+        fi
+        echo "wheel_platform_tag=${wheel_platform_tag}" >> "${GITHUB_OUTPUT}"  # read later as steps.wheel_inputs.outputs.wheel_platform_tag
+
+    # ---- Cache LLVM prefix ----
+    - name: Cache LLVM
+      uses: actions/cache@v4  # NOTE(review): tag reference; the other third-party actions in this file are pinned by commit SHA
+      id: llvm-cache
+      with:
+        path: ${{ runner.os == 'Windows' && 'C:/opt/llvm' || '/opt/llvm' }}
+        key: tvm-wheel-llvm-22.1.0-${{ runner.os }}-${{ inputs.arch }}-v3  # 22.1.0 matches the llvmdev pin in the install steps; change both together
+
+    # ---- Install LLVM via conda (cache miss only) ----
+    - name: Setup conda
+      if: steps.llvm-cache.outputs.cache-hit != 'true'
+      uses: conda-incubator/setup-miniconda@fc2d68f6413eb2d87b895e92f8584b5b94a10167 # v3.3.0
+      continue-on-error: true  # a failure here is handled by the retry step below
+      id: conda1
+      with:
+        miniforge-version: latest
+
+    - name: Setup conda (retry with tar.bz2)
+      if: steps.llvm-cache.outputs.cache-hit != 'true' && steps.conda1.outcome == 'failure'  # outcome is the result before continue-on-error is applied
+      uses: conda-incubator/setup-miniconda@fc2d68f6413eb2d87b895e92f8584b5b94a10167 # v3.3.0
+      with:
+        miniforge-version: latest
+        use-only-tar-bz2: true
+
+    - name: Create /opt/llvm (macOS)
+      if: steps.llvm-cache.outputs.cache-hit != 'true' && runner.os == 'macOS'
+      shell: bash  # prefix is chowned to the runner user; the conda step below writes to it without sudo
+      run: sudo mkdir -p /opt/llvm && sudo chown -R "$(whoami)" /opt/llvm
+
+    - name: Install LLVM (Unix)
+      if: steps.llvm-cache.outputs.cache-hit != 'true' && runner.os != 'Windows'
+      shell: bash -l {0}  # login shell; presumably needed so the conda set up above is on PATH (confirm)
+      run: |
+        set -eux
+        if [[ "${RUNNER_OS}" == "Linux" ]]; then  # macOS already prepared /opt/llvm in the previous step
+          sudo mkdir -p /opt/llvm
+          sudo chown -R "$(whoami)" /opt/llvm
+        fi
+        conda create -q -p /opt/llvm -c conda-forge \
+          llvmdev=22.1.0 clangdev=22.1.0 compiler-rt=22.1.0 zlib zstd-static libxml2-devel \
+          -y
+
+    - name: Install LLVM (Windows)
+      if: steps.llvm-cache.outputs.cache-hit != 'true' && runner.os == 'Windows'
+      shell: cmd /C call {0}  # Windows env installs llvmdev only; clangdev and compiler-rt are Unix-only here
+      run: |
+        conda create -q -p C:\opt\llvm -c conda-forge llvmdev=22.1.0 zlib zstd-static libxml2-devel -y
+
+    - name: Create static llvm-config wrapper (Unix)
+      if: runner.os != 'Windows'  # no cache-hit condition: the wrapper is rewritten on every run
+      shell: bash  # wrapper execs the sibling llvm-config with --link-static ahead of the caller's arguments
+      run: |
+        set -eux
+        printf '%s\n' \
+          '#!/usr/bin/env bash' \
+          'exec "$(dirname "$0")/llvm-config" --link-static "$@"' \
+          | sudo tee /opt/llvm/bin/llvm-config-static >/dev/null
+        sudo chmod +x /opt/llvm/bin/llvm-config-static
+
+    - name: Create static llvm-config wrapper (Windows)
+      if: runner.os == 'Windows'  # no cache-hit condition: the wrapper is rewritten on every run
+      shell: pwsh  # writes a .bat counterpart of the Unix wrapper next to llvm-config.exe
+      run: |
+        @'
+        @echo off
+        "C:\opt\llvm\Library\bin\llvm-config.exe" --link-static %*
+        '@ | Set-Content -Path 'C:\opt\llvm\Library\bin\llvm-config-static.bat'
+
+    - name: Print build inputs
+      shell: bash -l {0}
+      run: |
+        set -eux
+        git log -1 --oneline  # record the commit being built
+        if [[ "${RUNNER_OS}" == "Windows" ]]; then
+          "C:/opt/llvm/Library/bin/llvm-config-static.bat" --version
+          "C:/opt/llvm/Library/bin/llvm-config-static.bat" --system-libs  # wrapper adds --link-static, so these are the static-link system libs
+        else
+          /opt/llvm/bin/llvm-config-static --version
+          /opt/llvm/bin/llvm-config-static --system-libs  # wrapper adds --link-static, so these are the static-link system libs
+        fi
+
+    # ---- Build and test wheels ----
+    - name: Build and test wheels
+      uses: pypa/cibuildwheel@298ed2fb2c105540f5ed055e8a6ad78d82dd3a7e # v3.3.1
+      with:
+        package-dir: .
+        output-dir: wheelhouse
+      env:
+        CIBW_BUILD: ${{ inputs.build }}
+        CIBW_ARCHS_LINUX: ${{ inputs.arch }}
+        CIBW_ARCHS_MACOS: ${{ inputs.arch }}
+        CIBW_ARCHS_WINDOWS: ${{ inputs.arch }}
+        CIBW_MANYLINUX_X86_64_IMAGE: ${{ inputs.linux_image }}  # same input feeds both image options
+        CIBW_MANYLINUX_AARCH64_IMAGE: ${{ inputs.linux_image }}
+        CIBW_BUILD_VERBOSITY: 1
+        CMAKE_BUILD_PARALLEL_LEVEL: ${{ steps.env_vars.outputs.cpu_count }}
+        CIBW_CONTAINER_ENGINE: "docker; create_args: --volume /opt/llvm:/opt/llvm"  # mount the host LLVM prefix into the build container
+        CIBW_BEFORE_BUILD_LINUX: >-
+          python -m pip install -U pip cmake ninja scikit-build-core wheel auditwheel &&
+          python -m pip install -v "{project}/3rdparty/tvm-ffi"
+        CIBW_BEFORE_BUILD_MACOS: >-
+          python -m pip install -U pip cmake ninja scikit-build-core wheel delocate &&
+          python -m pip install -v "{project}/3rdparty/tvm-ffi"
+        CIBW_BEFORE_BUILD_WINDOWS: >-
+          python -m pip install -U pip cmake ninja scikit-build-core wheel &&
+          python -m pip install -v "{project}/3rdparty/tvm-ffi"
+        CIBW_ENVIRONMENT: >-  # Linux/macOS values; the _WINDOWS variant below takes over on Windows
+          TVM_USE_LLVM="/opt/llvm/bin/llvm-config-static"
+          CMAKE_PREFIX_PATH="/opt/llvm"
+          CMAKE_ARGS="-DUSE_LLVM=/opt/llvm/bin/llvm-config-static -DUSE_CUDA=OFF -DBUILD_TESTING=OFF -DTVM_BUILD_PYTHON_MODULE=ON -DCMAKE_PREFIX_PATH=/opt/llvm"
+          TVM_CUDA_ARCHITECTURES="${{ inputs.cuda_architectures }}"
+          TVM_WHEEL_DIST_NAME="${{ inputs.distribution_name }}"
+          TVM_SKIP_CUDA="${{ inputs.skip_cuda == 'true' && '1' || '0' }}"
+          TVM_AUDITWHEEL_PLAT="${{ steps.wheel_inputs.outputs.wheel_platform_tag }}"
+          TVM_BUILD_PARALLEL_LEVEL="${{ steps.env_vars.outputs.cpu_count }}"
+        CIBW_ENVIRONMENT_WINDOWS: >-  # same settings with the C:/opt/llvm/Library layout; no TVM_AUDITWHEEL_PLAT
+          TVM_USE_LLVM="C:/opt/llvm/Library/bin/llvm-config-static.bat"
+          CMAKE_PREFIX_PATH="C:/opt/llvm/Library"
+          CMAKE_ARGS="-DUSE_LLVM=C:/opt/llvm/Library/bin/llvm-config-static.bat -DUSE_CUDA=OFF -DBUILD_TESTING=OFF -DTVM_BUILD_PYTHON_MODULE=ON -DCMAKE_PREFIX_PATH=C:/opt/llvm/Library"
+          TVM_CUDA_ARCHITECTURES="${{ inputs.cuda_architectures }}"
+          TVM_WHEEL_DIST_NAME="${{ inputs.distribution_name }}"
+          TVM_SKIP_CUDA="${{ inputs.skip_cuda == 'true' && '1' || '0' }}"
+          TVM_BUILD_PARALLEL_LEVEL="${{ steps.env_vars.outputs.cpu_count }}"
+        CIBW_REPAIR_WHEEL_COMMAND_LINUX: >-
+          bash "{project}/ci/scripts/package/build_tvm_wheel.sh" cibw-repair "{wheel}" "{dest_dir}"
+        CIBW_REPAIR_WHEEL_COMMAND_MACOS: >-
+          bash "{project}/ci/scripts/package/build_tvm_wheel.sh" cibw-repair "{wheel}" "{dest_dir}"
+        CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: >-  # Windows calls inject_cuda_runtime.py directly instead of the bash cibw-repair entry point
+          python "{project}/ci/scripts/package/inject_cuda_runtime.py" "{wheel}"
+          --output-dir "{dest_dir}"
+          --distribution-name "${{ inputs.distribution_name }}"
+        CIBW_TEST_COMMAND: >-
+          python "{project}/ci/scripts/package/verify_tvm_install.py"
+
+    - name: Verify final wheel
+      shell: bash -l {0}
+      env:
+        TVM_PYTHON: python
+        TVM_EXPECT_WHEEL_PLATFORM_TAG: ${{ steps.wheel_inputs.outputs.wheel_platform_tag }}  # empty on macOS and Windows runners
+      run: ci/scripts/package/build_tvm_wheel.sh verify
diff --git a/.github/actions/detect-env-vars/action.yml b/.github/actions/detect-env-vars/action.yml
new file mode 100644
index 000000000000..e20b15746dac
--- /dev/null
+++ b/.github/actions/detect-env-vars/action.yml
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+name: Detect Environment Variables
+description: Detects environment variables such as CPU count and sets them as outputs.
+runs:
+  using: "composite"
+  steps:
+    - name: Run Python to detect environment variables
+      shell: python  # the run block below is a Python script, not shell
+      id: detect
+      run: |
+        import multiprocessing
+        import os
+
+        with open(os.environ["GITHUB_OUTPUT"], "a", encoding="utf-8") as output_file:  # append mode keeps any outputs already written
+            value = multiprocessing.cpu_count()  # CPUs in the system; may exceed what this process is allowed to use
+            output_file.write(f"cpu_count={value}\n")
+            print(f"Detected environment variable: cpu_count={value}")
+
+outputs:
+  cpu_count:
+    description: "The number of CPU cores"
+    value: "${{ steps.detect.outputs.cpu_count }}"  # re-exports the step output at the action level
diff --git a/.github/actions/tvm-wheel-for-publish/action.yml b/.github/actions/tvm-wheel-for-publish/action.yml
deleted file mode 100644
index 8d0c4c27f07a..000000000000
--- a/.github/actions/tvm-wheel-for-publish/action.yml
+++ /dev/null
@@ -1,246 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-name: Build TVM Wheel For Publish
-description: >
-  Build and test a TVM wheel for a given platform using the publish packaging flow.
-
-inputs:
-  os:
-    description: "Runner operating system (e.g., ubuntu-latest, macos-14, windows-latest)"
-    required: true
-  arch:
-    description: "Target architecture (e.g., x86_64, arm64, AMD64)"
-    required: true
-  linux_image:
-    description: "Manylinux image tag to use on Linux runners (empty string for non-Linux)"
-    required: false
-    default: ""
-  checkout_ref:
-    description: "Branch, tag, or SHA to check out before building"
-    required: true
-  distribution_name:
-    description: "Optional wheel distribution name override, useful for TestPyPI"
-    required: false
-    default: ""
-  cuda_architectures:
-    description: "CMake CUDA architectures for libtvm_runtime_cuda.so"
-    required: false
-    default: "75"
-  skip_cuda:
-    description: "Set to true to build a CPU-only wheel without the CUDA sidecar"
-    required: false
-    default: "false"
-
-runs:
-  using: "composite"
-  steps:
-    - name: Check out source
-      uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
-      with:
-        ref: ${{ inputs.checkout_ref }}
-        submodules: recursive
-        fetch-depth: 1
-        fetch-tags: true
-
-    - name: Free runner disk space
-      if: runner.os == 'Linux'
-      shell: bash -l {0}
-      run: |
-        set -eux
-        df -h
-        sudo rm -rf /usr/share/dotnet
-        sudo rm -rf /usr/local/lib/android
-        sudo rm -rf /opt/ghc
-        sudo rm -rf /opt/hostedtoolcache/CodeQL
-        sudo rm -rf /usr/local/share/boost
-        docker image prune -af || true
-        sudo apt-get clean
-        df -h
-
-    - name: Cache LLVM for manylinux build
-      if: runner.os == 'Linux'
-      uses: actions/cache@v4
-      id: llvm-cache
-      with:
-        path: /opt/llvm
-        key: tvm-wheel-llvm-22.1.0-${{ runner.os }}-${{ inputs.arch }}-v1
-
-    - name: Set up conda for LLVM cache
-      if: runner.os == 'Linux' && steps.llvm-cache.outputs.cache-hit != 'true'
-      uses: conda-incubator/setup-miniconda@fc2d68f6413eb2d87b895e92f8584b5b94a10167 # v3.3.0
-      continue-on-error: true
-      id: conda1
-      with:
-        miniforge-version: latest
-
-    - name: Set up conda for LLVM cache (retry with tar.bz2)
-      if: runner.os == 'Linux' && steps.llvm-cache.outputs.cache-hit != 'true' && steps.conda1.outcome == 'failure'
-      uses: conda-incubator/setup-miniconda@fc2d68f6413eb2d87b895e92f8584b5b94a10167 # v3.3.0
-      with:
-        miniforge-version: latest
-        use-only-tar-bz2: true
-
-    - name: Install LLVM for manylinux build
-      if: runner.os == 'Linux' && steps.llvm-cache.outputs.cache-hit != 'true'
-      shell: bash -l {0}
-      run: |
-        set -eux
-        sudo mkdir -p /opt/llvm
-        sudo chown -R "$(whoami)" /opt/llvm
-        conda create -q -p /opt/llvm -c conda-forge \
-          llvmdev=22.1.0 clangdev=22.1.0 compiler-rt=22.1.0 zlib zstd-static \
-          -y
-
-    - name: Set up TVM build environment
-      if: runner.os != 'Linux'
-      uses: ./.github/actions/setup
-
-    - name: Install CUDA toolkit
-      if: runner.os != 'Linux' && inputs.skip_cuda != 'true'
-      id: cuda-toolkit
-      uses: Jimver/cuda-toolkit@6008063726ffe3309d1b22e413d9e88fed91a2f2
-
-    - name: Install wheel build tools
-      if: runner.os != 'Linux'
-      shell: bash -l {0}
-      run: |
-        set -eux
-        python -m pip install -U pip build twine scikit-build-core wheel
-        if [[ "$(uname -s)" == "Darwin" ]]; then
-          python -m pip install delocate
-        fi
-
-    - name: Detect host CPU count
-      if: runner.os != 'Linux'
-      id: host-env
-      shell: bash -l {0}
-      run: |
-        set -eux
-        if command -v nproc >/dev/null 2>&1; then
-          cpu_count="$(nproc)"
-        elif [[ "$(uname -s)" == "Darwin" ]]; then
-          cpu_count="$(sysctl -n hw.ncpu)"
-        else
-          cpu_count="${NUMBER_OF_PROCESSORS:-4}"
-        fi
-        echo "cpu_count=${cpu_count}" >> "${GITHUB_OUTPUT}"
-
-    - name: Print host build inputs
-      if: runner.os != 'Linux'
-      shell: bash -l {0}
-      run: |
-        set -eux
-        git log -1 --oneline
-        python --version
-        cmake --version
-        if command -v llvm-config >/dev/null 2>&1; then
-          llvm-config --version
-          llvm-config --link-static --system-libs
-        fi
-        if [[ "${{ inputs.skip_cuda }}" != "true" ]]; then
-          "${{ steps.cuda-toolkit.outputs.CUDA_PATH }}/bin/nvcc" --version
-        fi
-
-    - name: Build, repair, and test manylinux wheel
-      if: runner.os == 'Linux'
-      shell: bash -l {0}
-      env:
-        TVM_MANYLINUX_IMAGE: ${{ inputs.linux_image }}
-        TVM_ARCH: ${{ inputs.arch }}
-        TVM_WHEEL_DIST_NAME: ${{ inputs.distribution_name }}
-        TVM_CUDA_ARCHITECTURES: ${{ inputs.cuda_architectures }}
-        TVM_SKIP_CUDA: ${{ inputs.skip_cuda == 'true' && '1' || '0' }}
-      run: |
-        set -eux
-        if [[ -z "${TVM_MANYLINUX_IMAGE}" ]]; then
-          echo "linux_image is required on Linux runners" >&2
-          exit 1
-        fi
-
-        image="quay.io/pypa/${TVM_MANYLINUX_IMAGE}_${TVM_ARCH}:latest"
-        container="tvm_wheel_build_${GITHUB_RUN_ID}_${GITHUB_RUN_ATTEMPT}"
-        docker pull "${image}"
-        docker run --name "${container}" -d \
-          --workdir /workspace \
-          --volume "${GITHUB_WORKSPACE}:/workspace" \
-          --volume /opt/llvm:/opt/llvm \
-          "${image}" tail -f /dev/null
-        trap 'docker rm -f "${container}" || true' EXIT
-
-        if [[ "${TVM_SKIP_CUDA}" != "1" ]]; then
-          cuda_rpm="cuda-repo-rhel8-13-0-local-13.0.2_580.95.05-1.${TVM_ARCH}.rpm"
-          curl -fsSLo "${cuda_rpm}" "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/${cuda_rpm}"
-          docker cp "${cuda_rpm}" "${container}:/${cuda_rpm}"
-          rm "${cuda_rpm}"
-          docker exec "${container}" bash -lc "
-            rpm -i /${cuda_rpm} && \
-            dnf clean all && \
-            dnf -y install cuda-toolkit-13-0 && \
-            rm /${cuda_rpm} && \
-            dnf clean all"
-        fi
-
-        docker exec \
-          -e TVM_PYTHON=/opt/python/cp310-cp310/bin/python \
-          -e TVM_USE_LLVM="/opt/llvm/bin/llvm-config --link-static" \
-          -e TVM_USE_CUDA=/usr/local/cuda \
-          -e TVM_CUDA_ARCHITECTURES="${TVM_CUDA_ARCHITECTURES}" \
-          -e TVM_WHEEL_DIST_NAME="${TVM_WHEEL_DIST_NAME}" \
-          -e TVM_SKIP_CUDA="${TVM_SKIP_CUDA}" \
-          -e TVM_AUDITWHEEL_PLAT="${TVM_MANYLINUX_IMAGE}_${TVM_ARCH}" \
-          -e TVM_EXPECT_WHEEL_PLATFORM_TAG="${TVM_MANYLINUX_IMAGE}_${TVM_ARCH}" \
-          -e CMAKE_PREFIX_PATH=/opt/llvm \
-          -e CMAKE_BUILD_PARALLEL_LEVEL="$(nproc)" \
-          -e TVM_BUILD_PARALLEL_LEVEL="$(nproc)" \
-          "${container}" bash -lc '
-            set -eux
-            export PATH=/opt/python/cp310-cp310/bin:/opt/llvm/bin:/usr/local/cuda/bin:$PATH
-            export CMAKE_PREFIX_PATH=/opt/llvm${CMAKE_PREFIX_PATH:+:$CMAKE_PREFIX_PATH}
-            python -m pip install -U pip build auditwheel twine scikit-build-core wheel cmake ninja
-            python -m pip install -v ./3rdparty/tvm-ffi
-            python --version
-            cmake --version
-            llvm-config --version
-            if [[ "${TVM_SKIP_CUDA}" != "1" ]]; then
-              nvcc --version
-            fi
-            ci/scripts/package/build_tvm_wheel.sh all'
-        docker exec "${container}" bash -lc \
-          "chown -R $(id -u):$(id -g) /workspace/wheelhouse /workspace/dist /workspace/build-wheel-* || true"
-
-    - name: Build, repair, and test host wheel
-      if: runner.os != 'Linux'
-      shell: bash -l {0}
-      env:
-        TVM_PYTHON: python
-        TVM_USE_LLVM: llvm-config --link-static
-        TVM_USE_CUDA: ${{ steps.cuda-toolkit.outputs.CUDA_PATH || 'OFF' }}
-        TVM_CUDA_ARCHITECTURES: ${{ inputs.cuda_architectures }}
-        TVM_WHEEL_DIST_NAME: ${{ inputs.distribution_name }}
-        TVM_SKIP_CUDA: ${{ inputs.skip_cuda == 'true' && '1' || '0' }}
-        CMAKE_BUILD_PARALLEL_LEVEL: ${{ steps.host-env.outputs.cpu_count }}
-        TVM_BUILD_PARALLEL_LEVEL: ${{ steps.host-env.outputs.cpu_count }}
-      run: |
-        set -eux
-        if [[ "${TVM_SKIP_CUDA}" != "1" ]]; then
-          ci/scripts/package/build_tvm_wheel.sh cuda
-        fi
-        ci/scripts/package/build_tvm_wheel.sh wheel
-        ci/scripts/package/build_tvm_wheel.sh inject
-        ci/scripts/package/build_tvm_wheel.sh repair
-        ci/scripts/package/build_tvm_wheel.sh verify
diff --git a/.github/workflows/publish_wheel.yml b/.github/workflows/publish_wheel.yml
index 6446db699b2c..f2a3a592db43 100644
--- a/.github/workflows/publish_wheel.yml
+++ b/.github/workflows/publish_wheel.yml
@@ -60,40 +60,54 @@ jobs:
           - name: Linux x86_64 CUDA sidecar wheel (manylinux_2_28)
             os: ubuntu-latest
             arch: x86_64
+            build: cp310-manylinux_x86_64
             linux_image: manylinux_2_28
             skip_cuda: "false"
             artifact_suffix: linux-x86_64-manylinux_2_28
           - name: Linux aarch64 CUDA sidecar wheel (manylinux_2_28)
             os: ubuntu-24.04-arm
             arch: aarch64
+            build: cp310-manylinux_aarch64
             linux_image: manylinux_2_28
             skip_cuda: "false"
             artifact_suffix: linux-aarch64-manylinux_2_28
           - name: macOS arm64 CPU wheel
             os: macos-14
             arch: arm64
+            build: cp310-macosx_arm64
             linux_image: ""
             skip_cuda: "true"
             artifact_suffix: macos-arm64
           - name: Windows AMD64 CPU wheel
             os: windows-latest
             arch: AMD64
+            build: cp310-win_amd64
             linux_image: ""
             skip_cuda: "true"
             artifact_suffix: windows-amd64
     steps:
-      - name: Checkout repository for local action
+      - name: Checkout source
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:
+          ref: ${{ inputs.tag }}
+          submodules: recursive
           fetch-depth: 1
+          fetch-tags: true
+
+      - name: Build CUDA sidecar
+        uses: ./.github/actions/build-cuda-sidecar
+        with:
+          arch: ${{ matrix.arch }}
+          linux_image: ${{ matrix.linux_image }}
+          cuda_architectures: ${{ inputs.cuda_architectures }}
+          skip_cuda: ${{ matrix.skip_cuda }}
 
       - name: Build TVM wheel
-        uses: ./.github/actions/tvm-wheel-for-publish
+        uses: ./.github/actions/build-wheel-for-publish
         with:
-          os: ${{ matrix.os }}
           arch: ${{ matrix.arch }}
+          build: ${{ matrix.build }}
           linux_image: ${{ matrix.linux_image }}
-          checkout_ref: ${{ inputs.tag }}
           distribution_name: ${{ inputs.distribution_name }}
           cuda_architectures: ${{ inputs.cuda_architectures }}
           skip_cuda: ${{ matrix.skip_cuda }}
diff --git a/.gitignore b/.gitignore
index 9e734b0be06d..0ee1eb241807 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,6 +14,8 @@ __pycache__/
 env/
 build/
 build-*/
+!.github/actions/build-*/
+!.github/actions/build-*/action.yml
 develop-eggs/
 dist/
 downloads/
diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
index 90e70363f729..753fa2ce5974 100644
--- a/ci/scripts/package/README.md
+++ b/ci/scripts/package/README.md
@@ -17,12 +17,13 @@
 
 # TVM wheel packaging helper
 
-This helper follows the CUDA-sidecar packaging flow used for release
-validation:
+The GitHub Actions release-validation flow is:
 
 1. Build `libtvm_runtime_cuda.so` in a CUDA-enabled CMake build.
-2. Build the main Python wheel with LLVM enabled and CUDA disabled.
-3. Inject the CUDA runtime DSO into `tvm/lib/` inside the wheel.
+2. Build the main Python wheel with `cibuildwheel`, LLVM enabled, and CUDA
+   disabled.
+3. Inject the CUDA runtime DSO into `tvm/lib/` during the `cibuildwheel`
+   repair hook.
 4. Repair the wheel, excluding CUDA driver/runtime DSOs and `libtvm_ffi`.
 5. Validate ELF links so intra-wheel TVM DSOs resolve through relative rpaths.
    LLVM is expected to be linked statically; the final wheel must not bundle
@@ -59,6 +60,16 @@ packaging pattern. This avoids accidentally publishing a wheel tagged for the
 GitHub runner's host glibc, such as `manylinux_2_39`, which would not install
 on older supported Linux systems.
 
+The workflow mirrors the TVM-FFI `.github` layout: a small matrix workflow
+directly calls focused composite actions under `.github/actions`.
+
+- `.github/actions/detect-env-vars`: shared environment detection.
+- `.github/actions/build-cuda-sidecar`: builds only the optional CUDA sidecar.
+  On Linux this action owns the manylinux Docker/CUDA setup.
+- `.github/actions/build-wheel-for-publish`: installs the cached LLVM prefix
+  and runs `pypa/cibuildwheel` for the LLVM-enabled runtime wheel. Its custom
+  repair hook injects the sidecar before `auditwheel`/`delocate`/copy repair.
+
 To test this from the fork `tlopex/tvm` without publishing:
 
 ```bash
@@ -82,6 +93,10 @@ workflows once the workflow file exists in the repository.
 
 Typical TestPyPI dry run:
 
+The local helper keeps the same CUDA injection, repair, and verification
+steps for debugging outside GitHub Actions. The GitHub workflow uses
+`cibuildwheel` for the main wheel build.
+
 ```bash
 python version.py --git-describe
 git tag -a v0.25.dev-test0 -m "Test TVM wheel v0.25.dev-test0"
diff --git a/ci/scripts/package/build_tvm_wheel.sh b/ci/scripts/package/build_tvm_wheel.sh
index 761b21d87356..248434c1d9e9 100755
--- a/ci/scripts/package/build_tvm_wheel.sh
+++ b/ci/scripts/package/build_tvm_wheel.sh
@@ -40,7 +40,7 @@ TVM_KEEP_BUILD_DIRS="${TVM_KEEP_BUILD_DIRS:-0}"
 
 usage() {
   cat <<'EOF'
-Usage: ci/scripts/package/build_tvm_wheel.sh [all|cuda|wheel|inject|repair|validate|verify|upload|verify-pypi]
+Usage: ci/scripts/package/build_tvm_wheel.sh [all|cuda|manylinux-cuda|wheel|inject|repair|cibw-repair|validate|verify|verify-installed|upload|verify-pypi]
 
 Environment knobs:
   TVM_USE_LLVM                 LLVM config for the base wheel, default "llvm-config --link-static"
@@ -53,6 +53,8 @@ Environment knobs:
   TVM_SKIP_REPAIR=1            Keep injected wheel as final wheel
   TVM_BUILD_NO_ISOLATION=1     Pass --no-isolation to python -m build
   TVM_KEEP_BUILD_DIRS=1        Reuse CMake build dirs instead of cleaning them
+  TVM_MANYLINUX_IMAGE          manylinux image tag for manylinux-cuda
+  TVM_ARCH                     Target architecture for manylinux-cuda
   TVM_AUDITWHEEL_PLAT          Optional auditwheel --plat value
   TVM_EXPECT_WHEEL_PLATFORM_TAG
                                 Require the final wheel filename to include this tag
@@ -119,6 +121,64 @@ cuda_runtime_path() {
   find "$TVM_CUDA_BUILD_DIR" -type f -name 'libtvm_runtime_cuda.so' | sort | tail -n 1
 }
 
+run_manylinux_cuda_container() {
+  if [[ "$TVM_SKIP_CUDA" == "1" ]]; then
+    echo "Skipping manylinux CUDA sidecar build because TVM_SKIP_CUDA=1"
+    return 0
+  fi
+
+  require_cmd docker
+  require_cmd curl
+  if [[ -z "${TVM_MANYLINUX_IMAGE:-}" ]]; then
+    echo "error: TVM_MANYLINUX_IMAGE is required for manylinux-cuda" >&2
+    return 1
+  fi
+  if [[ -z "${TVM_ARCH:-}" ]]; then
+    echo "error: TVM_ARCH is required for manylinux-cuda" >&2
+    return 1
+  fi
+
+  local image="quay.io/pypa/${TVM_MANYLINUX_IMAGE}_${TVM_ARCH}:latest"
+  local container="tvm_wheel_cuda_${GITHUB_RUN_ID:-local}_${GITHUB_RUN_ATTEMPT:-1}_${TVM_ARCH}"
+  docker pull "$image"
+  docker rm -f "$container" >/dev/null 2>&1 || true
+  docker run --name "$container" -d \
+    --workdir /workspace \
+    --volume "${REPO_ROOT}:/workspace" \
+    "$image" tail -f /dev/null
+  trap "docker rm -f '${container}' >/dev/null 2>&1 || true" EXIT
+
+  local cuda_rpm="cuda-repo-rhel8-13-0-local-13.0.2_580.95.05-1.${TVM_ARCH}.rpm"
+  curl -fsSLo "$cuda_rpm" "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/${cuda_rpm}"
+  docker cp "$cuda_rpm" "${container}:/${cuda_rpm}"
+  rm "$cuda_rpm"
+  docker exec "$container" bash -lc "
+    rpm -i /${cuda_rpm} && \
+    dnf clean all && \
+    dnf -y install cuda-toolkit-13-0 && \
+    rm /${cuda_rpm} && \
+    dnf clean all"
+
+  docker exec \
+    -e TVM_PYTHON=/opt/python/cp310-cp310/bin/python \
+    -e TVM_USE_CUDA=/usr/local/cuda \
+    -e TVM_CUDA_ARCHITECTURES="$TVM_CUDA_ARCHITECTURES" \
+    -e TVM_SKIP_CUDA="$TVM_SKIP_CUDA" \
+    -e CMAKE_BUILD_PARALLEL_LEVEL="$TVM_BUILD_PARALLEL_LEVEL" \
+    -e TVM_BUILD_PARALLEL_LEVEL="$TVM_BUILD_PARALLEL_LEVEL" \
+    "$container" bash -lc '
+      set -eux
+      export PATH=/opt/python/cp310-cp310/bin:/usr/local/cuda/bin:$PATH
+      python -m pip install -U pip cmake ninja
+      python --version
+      cmake --version
+      nvcc --version
+      ci/scripts/package/build_tvm_wheel.sh cuda'
+
+  docker exec "$container" bash -lc \
+    "chown -R $(id -u):$(id -g) /workspace/build-wheel-cuda || true"
+}
+
 build_cuda_runtime() {
   if [[ "$TVM_SKIP_CUDA" == "1" ]]; then
     echo "Skipping CUDA sidecar build because TVM_SKIP_CUDA=1"
@@ -193,14 +253,13 @@ build_base_wheel() {
   single_wheel "$TVM_RAW_DIST" >/dev/null
 }
 
-inject_cuda_runtime() {
-  rm -rf "$TVM_INJECTED_DIST"
-  mkdir -p "$TVM_INJECTED_DIST"
+inject_wheel_file() {
+  local raw_wheel="$1"
+  local output_dir="$2"
+  rm -rf "$output_dir"
+  mkdir -p "$output_dir"
 
-  local raw_wheel
-  raw_wheel="$(single_wheel "$TVM_RAW_DIST")"
-
-  local inject_args=(--output-dir "$TVM_INJECTED_DIST")
+  local inject_args=(--output-dir "$output_dir")
   if [[ "$TVM_SKIP_CUDA" != "1" ]]; then
     local cuda_lib
     cuda_lib="$(cuda_runtime_path)"
@@ -224,6 +283,12 @@ inject_cuda_runtime() {
   "$TVM_PYTHON" "$SCRIPT_DIR/inject_cuda_runtime.py" "$raw_wheel" "${inject_args[@]}"
 }
 
+inject_cuda_runtime() {
+  local raw_wheel
+  raw_wheel="$(single_wheel "$TVM_RAW_DIST")"
+  inject_wheel_file "$raw_wheel" "$TVM_INJECTED_DIST"
+}
+
 auditwheel_excludes() {
   local cuda_lib="$1"
   local seen
@@ -298,16 +363,13 @@ base_cmake_args() {
   printf '%q ' "${args[@]}"
 }
 
-repair_wheel() {
-  rm -rf "$TVM_WHEELHOUSE"
-  mkdir -p "$TVM_WHEELHOUSE"
-
-  local injected_wheel
-  injected_wheel="$(single_wheel "$TVM_INJECTED_DIST")"
-
+repair_wheel_to_dir() {
+  local injected_wheel="$1"
+  local output_dir="$2"
+  mkdir -p "$output_dir"
   if [[ "$TVM_SKIP_REPAIR" == "1" ]]; then
-    cp "$injected_wheel" "$TVM_WHEELHOUSE/"
-    echo "Repair skipped; final wheel copied to ${TVM_WHEELHOUSE}"
+    cp "$injected_wheel" "$output_dir/"
+    echo "Repair skipped; final wheel copied to ${output_dir}"
     return 0
   fi
 
@@ -333,7 +395,7 @@ repair_wheel() {
           export LD_LIBRARY_PATH="${llvm_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
         fi
         auditwheel repair "${auditwheel_plat_args[@]}" "${exclude_args[@]}" \
-          -w "$TVM_WHEELHOUSE" "$injected_wheel"
+          -w "$output_dir" "$injected_wheel"
       )
       ;;
     Darwin)
@@ -342,16 +404,37 @@ repair_wheel() {
       delocate-wheel \
         --ignore-missing-dependencies \
         --exclude libtvm_ffi.dylib \
-        -w "$TVM_WHEELHOUSE" \
+        -w "$output_dir" \
         -v "$injected_wheel"
       ;;
     *)
-      cp "$injected_wheel" "$TVM_WHEELHOUSE/"
-      echo "No repair step for this platform; final wheel copied to ${TVM_WHEELHOUSE}"
+      cp "$injected_wheel" "$output_dir/"
+      echo "No repair step for this platform; final wheel copied to ${output_dir}"
       ;;
   esac
 
-  single_wheel "$TVM_WHEELHOUSE" >/dev/null
+  single_wheel "$output_dir" >/dev/null
+}
+
+repair_wheel() {
+  rm -rf "$TVM_WHEELHOUSE"
+  mkdir -p "$TVM_WHEELHOUSE"
+
+  local injected_wheel
+  injected_wheel="$(single_wheel "$TVM_INJECTED_DIST")"
+  repair_wheel_to_dir "$injected_wheel" "$TVM_WHEELHOUSE"
+}
+
+cibw_repair_wheel() {
+  local raw_wheel="$1"
+  local dest_dir="$2"
+  local injected_dir
+  injected_dir="$(mktemp -d)"
+  inject_wheel_file "$raw_wheel" "$injected_dir"
+  local injected_wheel
+  injected_wheel="$(single_wheel "$injected_dir")"
+  repair_wheel_to_dir "$injected_wheel" "$dest_dir"
+  rm -rf "$injected_dir"
 }
 
 validate_wheel_elf() {
@@ -384,31 +467,7 @@ verify_wheel() {
 
   "$venv_python" -m pip install --upgrade pip
   "$venv_python" -m pip install --extra-index-url "${TVM_EXTRA_INDEX_URL:-https://pypi.org/simple}" "$final_wheel"
-  "$venv_python" - <<'PY'
-from pathlib import Path
-import sys
-import tvm
-
-root = Path(tvm.__file__).resolve().parent
-libdir = root / "lib"
-if sys.platform == "darwin":
-    runtime_lib = libdir / "libtvm_runtime.dylib"
-    cuda_sidecar = libdir / "libtvm_runtime_cuda.dylib"
-elif sys.platform == "win32":
-    runtime_lib = libdir / "tvm_runtime.dll"
-    cuda_sidecar = libdir / "tvm_runtime_cuda.dll"
-else:
-    runtime_lib = libdir / "libtvm_runtime.so"
-    cuda_sidecar = libdir / "libtvm_runtime_cuda.so"
-
-print("tvm version:", tvm.__version__)
-print("tvm package:", root)
-print("llvm enabled:", tvm.runtime.enabled("llvm"))
-print("cuda runtime enabled:", tvm.runtime.enabled("cuda"))
-print("runtime library:", runtime_lib)
-assert runtime_lib.exists()
-print("cuda sidecar present:", cuda_sidecar.exists())
-PY
+  "$venv_python" "$SCRIPT_DIR/verify_tvm_install.py"
 }
 
 upload_wheel() {
@@ -444,31 +503,7 @@ verify_pypi_wheel() {
     --index-url "$index_url" \
     --extra-index-url "$extra_index_url" \
     "${package_name}==${package_version}"
-  "$venv_python" - <<'PY'
-from pathlib import Path
-import sys
-import tvm
-
-root = Path(tvm.__file__).resolve().parent
-libdir = root / "lib"
-if sys.platform == "darwin":
-    runtime_lib = libdir / "libtvm_runtime.dylib"
-    cuda_sidecar = libdir / "libtvm_runtime_cuda.dylib"
-elif sys.platform == "win32":
-    runtime_lib = libdir / "tvm_runtime.dll"
-    cuda_sidecar = libdir / "tvm_runtime_cuda.dll"
-else:
-    runtime_lib = libdir / "libtvm_runtime.so"
-    cuda_sidecar = libdir / "libtvm_runtime_cuda.so"
-
-print("tvm version:", tvm.__version__)
-print("tvm package:", root)
-print("llvm enabled:", tvm.runtime.enabled("llvm"))
-print("cuda runtime enabled:", tvm.runtime.enabled("cuda"))
-print("runtime library:", runtime_lib)
-assert runtime_lib.exists()
-print("cuda sidecar present:", cuda_sidecar.exists())
-PY
+  "$venv_python" "$SCRIPT_DIR/verify_tvm_install.py"
 }
 
 main() {
@@ -482,11 +517,20 @@ main() {
       verify_wheel
       ;;
     cuda) build_cuda_runtime ;;
+    manylinux-cuda) run_manylinux_cuda_container ;;
     wheel) build_base_wheel ;;
     inject) inject_cuda_runtime ;;
     repair) repair_wheel ;;
+    cibw-repair)
+      if [[ "$#" -ne 3 ]]; then
+        echo "error: cibw-repair requires <wheel> <dest-dir>" >&2
+        return 1
+      fi
+      cibw_repair_wheel "$2" "$3"
+      ;;
     validate) validate_wheel_elf ;;
     verify) verify_wheel ;;
+    verify-installed) "$TVM_PYTHON" "$SCRIPT_DIR/verify_tvm_install.py" ;;
     upload) upload_wheel ;;
     verify-pypi) verify_pypi_wheel ;;
     -h|--help|help) usage ;;
diff --git a/ci/scripts/package/inject_cuda_runtime.py b/ci/scripts/package/inject_cuda_runtime.py
index 0e85942c4367..121cc1172ce1 100755
--- a/ci/scripts/package/inject_cuda_runtime.py
+++ b/ci/scripts/package/inject_cuda_runtime.py
@@ -211,8 +211,8 @@ def main() -> int:
         output_dir=args.output_dir,
         cuda_runtime=cuda_runtime,
         target_path=target_path,
-        distribution_name=args.distribution_name,
-        distribution_version=args.distribution_version,
+        distribution_name=args.distribution_name or None,
+        distribution_version=args.distribution_version or None,
         set_rpath=args.set_rpath,
     )
     print(output_path)
diff --git a/ci/scripts/package/verify_tvm_install.py b/ci/scripts/package/verify_tvm_install.py
new file mode 100644
index 000000000000..87234b587a2d
--- /dev/null
+++ b/ci/scripts/package/verify_tvm_install.py
@@ -0,0 +1,52 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""Verify an installed TVM wheel imports and ships the expected runtime DSO."""
+
+from __future__ import annotations
+
+from pathlib import Path
+import sys
+
+import tvm
+
+
+def main() -> int:
+    root = Path(tvm.__file__).resolve().parent
+    libdir = root / "lib"
+    if sys.platform == "darwin":
+        runtime_lib = libdir / "libtvm_runtime.dylib"
+        cuda_sidecar = libdir / "libtvm_runtime_cuda.dylib"
+    elif sys.platform == "win32":
+        runtime_lib = libdir / "tvm_runtime.dll"
+        cuda_sidecar = libdir / "tvm_runtime_cuda.dll"
+    else:
+        runtime_lib = libdir / "libtvm_runtime.so"
+        cuda_sidecar = libdir / "libtvm_runtime_cuda.so"
+
+    print("tvm version:", tvm.__version__)
+    print("tvm package:", root)
+    print("llvm enabled:", tvm.runtime.enabled("llvm"))
+    print("cuda runtime enabled:", tvm.runtime.enabled("cuda"))
+    print("runtime library:", runtime_lib)
+    if not runtime_lib.exists():
+        raise RuntimeError(f"runtime library is missing: {runtime_lib}")
+    print("cuda sidecar present:", cuda_sidecar.exists())
+    return 0
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
diff --git a/cmake/utils/FindLLVM.cmake b/cmake/utils/FindLLVM.cmake
index 98540dbb5994..3fef7baa1b7d 100644
--- a/cmake/utils/FindLLVM.cmake
+++ b/cmake/utils/FindLLVM.cmake
@@ -231,8 +231,18 @@ macro(find_llvm use_llvm)
           endif()
         endif()
       elseif("${__flag}" STREQUAL "-lxml2")
-        message(STATUS "LLVM links against xml2")
-        list(APPEND LLVM_LIBS "-lxml2")
+        find_library(LIBXML2_LIBRARY
+          NAMES xml2 libxml2
+          HINTS ${__llvm_lib_hints}
+          NO_DEFAULT_PATH)
+        if (LIBXML2_LIBRARY)
+          message(STATUS "LLVM links against xml2: ${LIBXML2_LIBRARY}")
+          list(APPEND LLVM_LIBS "${LIBXML2_LIBRARY}")
+        else()
+          message(STATUS "LLVM links against xml2")
+          find_package(LibXml2 REQUIRED)
+          list(APPEND LLVM_LIBS "LibXml2::LibXml2")
+        endif()
       elseif("${__flag}" STREQUAL "zstd.dll.lib")
         message(STATUS "LLVM linker flag under LLVM libdir: ${__llvm_libdir}/zstd.lib")
         list(APPEND LLVM_LIBS "${__llvm_libdir}/zstd.lib")

From 1865e468d927c2aac10c4eb6e28d76004b9c91d5 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 16:04:32 -0400
Subject: [PATCH 19/43] Rename CUDA wheel helper action

---
 .../action.yml                                |  8 +--
 .github/workflows/publish_wheel.yml           |  8 +--
 ci/scripts/package/README.md                  | 62 ++++++++++---------
 ci/scripts/package/build_tvm_wheel.sh         | 12 ++--
 ci/scripts/package/verify_tvm_install.py      |  8 +--
 5 files changed, 50 insertions(+), 48 deletions(-)
 rename .github/actions/{build-cuda-sidecar => build-cuda}/action.yml (93%)

diff --git a/.github/actions/build-cuda-sidecar/action.yml b/.github/actions/build-cuda/action.yml
similarity index 93%
rename from .github/actions/build-cuda-sidecar/action.yml
rename to .github/actions/build-cuda/action.yml
index 8aeb81419f88..1254bcd0d17c 100644
--- a/.github/actions/build-cuda-sidecar/action.yml
+++ b/.github/actions/build-cuda/action.yml
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-name: Build CUDA Sidecar
+name: Build CUDA Runtime
 description: Build libtvm_runtime_cuda for the TVM wheel packaging flow.
 
 inputs:
@@ -31,7 +31,7 @@ inputs:
     required: false
     default: "75"
   skip_cuda:
-    description: "Set to true to skip the CUDA sidecar"
+    description: "Set to true to skip the CUDA runtime build"
     required: false
     default: "false"
 
@@ -41,7 +41,7 @@ runs:
     - uses: ./.github/actions/detect-env-vars
       id: env_vars
 
-    - name: Build CUDA sidecar in manylinux
+    - name: Build CUDA runtime in manylinux
       if: runner.os == 'Linux' && inputs.skip_cuda != 'true'
       shell: bash -l {0}
       env:
@@ -58,7 +58,7 @@ runs:
       id: cuda-toolkit
       uses: Jimver/cuda-toolkit@6008063726ffe3309d1b22e413d9e88fed91a2f2
 
-    - name: Build CUDA sidecar on host
+    - name: Build CUDA runtime on host
       if: runner.os != 'Linux' && inputs.skip_cuda != 'true'
       shell: bash -l {0}
       env:
diff --git a/.github/workflows/publish_wheel.yml b/.github/workflows/publish_wheel.yml
index f2a3a592db43..c7afa78fb539 100644
--- a/.github/workflows/publish_wheel.yml
+++ b/.github/workflows/publish_wheel.yml
@@ -57,14 +57,14 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - name: Linux x86_64 CUDA sidecar wheel (manylinux_2_28)
+          - name: Linux x86_64 CUDA wheel (manylinux_2_28)
             os: ubuntu-latest
             arch: x86_64
             build: cp310-manylinux_x86_64
             linux_image: manylinux_2_28
             skip_cuda: "false"
             artifact_suffix: linux-x86_64-manylinux_2_28
-          - name: Linux aarch64 CUDA sidecar wheel (manylinux_2_28)
+          - name: Linux aarch64 CUDA wheel (manylinux_2_28)
             os: ubuntu-24.04-arm
             arch: aarch64
             build: cp310-manylinux_aarch64
@@ -94,8 +94,8 @@ jobs:
           fetch-depth: 1
           fetch-tags: true
 
-      - name: Build CUDA sidecar
-        uses: ./.github/actions/build-cuda-sidecar
+      - name: Build CUDA runtime
+        uses: ./.github/actions/build-cuda
         with:
           arch: ${{ matrix.arch }}
           linux_image: ${{ matrix.linux_image }}
diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
index 753fa2ce5974..60a8661359cc 100644
--- a/ci/scripts/package/README.md
+++ b/ci/scripts/package/README.md
@@ -17,7 +17,12 @@
 
 # TVM wheel packaging helper
 
-The GitHub Actions release-validation flow is:
+This directory contains the helper scripts used to build, repair, verify, and
+publish TVM Python wheels. The GitHub Actions workflow keeps orchestration in
+YAML and puts platform-specific packaging behavior in focused composite actions
+and shell/Python helpers.
+
+The wheel build flow is:
 
 1. Build `libtvm_runtime_cuda.so` in a CUDA-enabled CMake build.
 2. Build the main Python wheel with `cibuildwheel`, LLVM enabled, and CUDA
@@ -31,58 +36,55 @@ The GitHub Actions release-validation flow is:
 6. Verify the wheel in a fresh virtualenv.
 7. Upload with `twine`.
 
-It mirrors the TVM-FFI packaging patterns from
-`apache/tvm-ffi/.github`, especially:
-
-- `apache/tvm-ffi/.github/workflows/publish_wheel.yml`
-- `apache/tvm-ffi/.github/actions/build-wheel-for-publish/action.yml`
-- `apache/tvm-ffi/.github/actions/build-orcjit-wheel/action.yml`
-- `apache/tvm-ffi/addons/tvm_ffi_orcjit/pyproject.toml`
-
 GitHub Actions flow:
 
 1. Create a tag that contains these packaging files.
 2. Open the `Publish TVM wheel` workflow in GitHub Actions.
 3. Fill `tag` with that tag.
 4. The workflow builds a platform wheel matrix:
-   - Linux x86_64 in a `manylinux_2_28` container, with the CUDA sidecar.
-   - Linux aarch64 in a `manylinux_2_28` container, with the CUDA sidecar.
+   - Linux x86_64 in a `manylinux_2_28` container, with CUDA enabled.
+   - Linux aarch64 in a `manylinux_2_28` container, with CUDA enabled.
    - macOS arm64 CPU-only.
    - Windows AMD64 CPU-only.
 5. For a TestPyPI run, set `publish_repository=testpypi` and set
-   `distribution_name` to a temporary package name such as
-   `tvm-yourname-test`.
+   `distribution_name` to a temporary package name.
 6. After the workflow build, upload, and `verify_pypi` jobs pass, run it again
    with the final tag/name and `publish_repository=pypi`.
 
-Linux wheels are built inside a manylinux image, following the TVM-FFI
-packaging pattern. This avoids accidentally publishing a wheel tagged for the
-GitHub runner's host glibc, such as `manylinux_2_39`, which would not install
-on older supported Linux systems.
+Linux wheels are built inside manylinux images. This avoids accidentally
+publishing a wheel tagged for the GitHub runner's host glibc, such as
+`manylinux_2_39`, which would not install on older supported Linux systems.
 
-The workflow mirrors the TVM-FFI `.github` layout: a small matrix workflow
-directly calls focused composite actions under `.github/actions`.
+Workflow structure:
 
+- `.github/workflows/publish_wheel.yml`: defines the platform matrix,
+  artifact upload, optional publishing, and post-upload verification.
 - `.github/actions/detect-env-vars`: shared environment detection.
-- `.github/actions/build-cuda-sidecar`: builds only the optional CUDA sidecar.
+- `.github/actions/build-cuda`: builds only the optional CUDA runtime library.
   On Linux this action owns the manylinux Docker/CUDA setup.
 - `.github/actions/build-wheel-for-publish`: installs the cached LLVM prefix
   and runs `pypa/cibuildwheel` for the LLVM-enabled runtime wheel. Its custom
-  repair hook injects the sidecar before `auditwheel`/`delocate`/copy repair.
+  repair hook injects the CUDA runtime before `auditwheel`/`delocate`/copy repair.
+- `ci/scripts/package/build_tvm_wheel.sh`: implements reusable local and CI
+  entrypoints such as `cuda`, `wheel`, `repair`, `verify`, and `upload`.
+- `ci/scripts/package/inject_cuda_runtime.py`: rewrites wheel metadata and
+  injects the CUDA runtime library when CUDA is enabled.
+- `ci/scripts/package/verify_tvm_install.py`: imports the installed wheel and
+  checks that the platform runtime library is present.
 
-To test this from the fork `tlopex/tvm` without publishing:
+To test the workflow from a fork without publishing:
 
 ```bash
-git push mine HEAD:pypi
+git push origin HEAD:<branch>
 git tag -a tvm-wheel-test0 -m "Test TVM wheel workflow"
-git push mine tvm-wheel-test0
+git push origin tvm-wheel-test0
 
 gh workflow run publish_wheel.yml \
-  --repo tlopex/tvm \
-  --ref pypi \
+  --repo <owner>/<repo> \
+  --ref <branch> \
   -f tag=tvm-wheel-test0 \
   -f publish_repository=none \
-  -f distribution_name=tvm-tlopexh-test \
+  -f distribution_name=<temporary-package-name> \
   -f cuda_architectures=75 \
   -f verify_from_repository=false
 ```
@@ -107,7 +109,7 @@ python -m venv /tmp/tvm-wheel-tools
 TVM_PYTHON=/tmp/tvm-wheel-tools/bin/python \
 TVM_USE_LLVM="/path/to/llvm-config --link-static" \
 TVM_USE_CUDA=/usr/local/cuda-12.8 \
-TVM_WHEEL_DIST_NAME=tvm-tlopexh-test \
+TVM_WHEEL_DIST_NAME=tvm-temporary-test \
 ci/scripts/package/build_tvm_wheel.sh all
 
 TVM_UPLOAD_REPOSITORY_URL=https://test.pypi.org/legacy/ \
@@ -137,12 +139,12 @@ Useful knobs:
 
 - `TVM_USE_LLVM`: LLVM config for the base wheel, default
   `llvm-config --link-static`.
-- `TVM_USE_CUDA`: CUDA root or `ON` for the sidecar build, default `ON`.
+- `TVM_USE_CUDA`: CUDA root or `ON` for the CUDA build, default `ON`.
 - `TVM_CUDA_ARCHITECTURES`: CMake CUDA architectures, default `75`.
 - `TVM_WHEEL_DIST_NAME`: optional distribution rename for TestPyPI.
 - `TVM_WHEEL_DIST_VERSION`: optional distribution version rewrite.
 - `TVM_SKIP_REPAIR=1`: leave the injected wheel unrepaired.
-- `TVM_SKIP_CUDA=1`: build a base wheel without a CUDA sidecar.
+- `TVM_SKIP_CUDA=1`: build a base wheel without the CUDA runtime.
 - `TVM_KEEP_BUILD_DIRS=1`: reuse the CMake build directories.
 - `TVM_AUDITWHEEL_PLAT`: optional `auditwheel repair --plat` override.
 - `TVM_EXPECT_WHEEL_PLATFORM_TAG`: require the final wheel filename to include
diff --git a/ci/scripts/package/build_tvm_wheel.sh b/ci/scripts/package/build_tvm_wheel.sh
index 248434c1d9e9..0babbb068a13 100755
--- a/ci/scripts/package/build_tvm_wheel.sh
+++ b/ci/scripts/package/build_tvm_wheel.sh
@@ -44,7 +44,7 @@ Usage: ci/scripts/package/build_tvm_wheel.sh [all|cuda|manylinux-cuda|wheel|inje
 
 Environment knobs:
   TVM_USE_LLVM                 LLVM config for the base wheel, default "llvm-config --link-static"
-  TVM_USE_CUDA                 CUDA root or ON for the sidecar build, default ON
+  TVM_USE_CUDA                 CUDA root or ON for the CUDA build, default ON
   TVM_CUDA_ARCHITECTURES       CMake CUDA arch list, default 75
   TVM_WHEEL_DIST_NAME          Optional distribution rename for TestPyPI
   TVM_WHEEL_DIST_VERSION       Optional distribution version rewrite
@@ -123,7 +123,7 @@ cuda_runtime_path() {
 
 run_manylinux_cuda_container() {
   if [[ "$TVM_SKIP_CUDA" == "1" ]]; then
-    echo "Skipping manylinux CUDA sidecar build because TVM_SKIP_CUDA=1"
+    echo "Skipping manylinux CUDA build because TVM_SKIP_CUDA=1"
     return 0
   fi
 
@@ -181,7 +181,7 @@ run_manylinux_cuda_container() {
 
 build_cuda_runtime() {
   if [[ "$TVM_SKIP_CUDA" == "1" ]]; then
-    echo "Skipping CUDA sidecar build because TVM_SKIP_CUDA=1"
+    echo "Skipping CUDA build because TVM_SKIP_CUDA=1"
     return 0
   fi
 
@@ -220,7 +220,7 @@ build_cuda_runtime() {
     require_cmd patchelf
     patchelf --set-rpath '$ORIGIN' "$cuda_lib"
   fi
-  echo "CUDA sidecar: ${cuda_lib}"
+  echo "CUDA runtime: ${cuda_lib}"
 }
 
 build_base_wheel() {
@@ -264,7 +264,7 @@ inject_wheel_file() {
     local cuda_lib
     cuda_lib="$(cuda_runtime_path)"
     if [[ -z "$cuda_lib" ]]; then
-      echo "error: CUDA sidecar missing; run the 'cuda' step first" >&2
+      echo "error: CUDA runtime missing; run the 'cuda' step first" >&2
       return 1
     fi
     inject_args+=(--cuda-runtime "$cuda_lib")
@@ -279,7 +279,7 @@ inject_wheel_file() {
     inject_args+=(--set-rpath '$ORIGIN')
   fi
 
-  echo "Injecting sidecar/metadata into ${raw_wheel}"
+  echo "Injecting CUDA runtime/metadata into ${raw_wheel}"
   "$TVM_PYTHON" "$SCRIPT_DIR/inject_cuda_runtime.py" "$raw_wheel" "${inject_args[@]}"
 }
 
diff --git a/ci/scripts/package/verify_tvm_install.py b/ci/scripts/package/verify_tvm_install.py
index 87234b587a2d..ace784490840 100644
--- a/ci/scripts/package/verify_tvm_install.py
+++ b/ci/scripts/package/verify_tvm_install.py
@@ -29,13 +29,13 @@ def main() -> int:
     libdir = root / "lib"
     if sys.platform == "darwin":
         runtime_lib = libdir / "libtvm_runtime.dylib"
-        cuda_sidecar = libdir / "libtvm_runtime_cuda.dylib"
+        cuda_runtime = libdir / "libtvm_runtime_cuda.dylib"
     elif sys.platform == "win32":
         runtime_lib = libdir / "tvm_runtime.dll"
-        cuda_sidecar = libdir / "tvm_runtime_cuda.dll"
+        cuda_runtime = libdir / "tvm_runtime_cuda.dll"
     else:
         runtime_lib = libdir / "libtvm_runtime.so"
-        cuda_sidecar = libdir / "libtvm_runtime_cuda.so"
+        cuda_runtime = libdir / "libtvm_runtime_cuda.so"
 
     print("tvm version:", tvm.__version__)
     print("tvm package:", root)
@@ -44,7 +44,7 @@ def main() -> int:
     print("runtime library:", runtime_lib)
     if not runtime_lib.exists():
         raise RuntimeError(f"runtime library is missing: {runtime_lib}")
-    print("cuda sidecar present:", cuda_sidecar.exists())
+    print("cuda runtime present:", cuda_runtime.exists())
     return 0
 
 

From 2b20497cddd34b61f2cfddfa6b90041313784ecd Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 16:09:38 -0400
Subject: [PATCH 20/43] Remove CUDA sidecar wording

---
 .github/actions/build-wheel-for-publish/action.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
index 297ce2ab4fe5..c3d1c99972c9 100644
--- a/.github/actions/build-wheel-for-publish/action.yml
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -40,7 +40,7 @@ inputs:
     required: false
     default: "75"
   skip_cuda:
-    description: "Set to true to build a CPU-only wheel without the CUDA sidecar"
+    description: "Set to true to build a CPU-only wheel without the CUDA runtime"
     required: false
     default: "false"
 

From c87fdf2ebda561ccc779ed25fa64102633755edd Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 16:14:44 -0400
Subject: [PATCH 21/43] Keep CIBW as sole wheel builder

---
 ci/scripts/package/README.md          | 40 ++++-------
 ci/scripts/package/build_tvm_wheel.sh | 96 ++-------------------------
 2 files changed, 18 insertions(+), 118 deletions(-)

diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
index 60a8661359cc..845df075e7ef 100644
--- a/ci/scripts/package/README.md
+++ b/ci/scripts/package/README.md
@@ -66,7 +66,8 @@ Workflow structure:
   and runs `pypa/cibuildwheel` for the LLVM-enabled runtime wheel. Its custom
   repair hook injects the CUDA runtime before `auditwheel`/`delocate`/copy repair.
 - `ci/scripts/package/build_tvm_wheel.sh`: implements reusable local and CI
-  entrypoints such as `cuda`, `wheel`, `repair`, `verify`, and `upload`.
+  entrypoints around the `cibuildwheel` build, such as `cuda`,
+  `manylinux-cuda`, `cibw-repair`, `verify`, `upload`, and `verify-pypi`.
 - `ci/scripts/package/inject_cuda_runtime.py`: rewrites wheel metadata and
   injects the CUDA runtime library when CUDA is enabled.
 - `ci/scripts/package/verify_tvm_install.py`: imports the installed wheel and
@@ -93,28 +94,20 @@ If the workflow is not visible in the GitHub UI yet, push or merge these files
 to the fork's default branch first. GitHub only lists manually dispatched
 workflows once the workflow file exists in the repository.
 
-Typical TestPyPI dry run:
+Local debugging:
 
-The local helper keeps the same CUDA injection, repair, and verification
-steps for debugging outside GitHub Actions. The GitHub workflow uses
-`cibuildwheel` for the main wheel build.
+The main wheel build is owned by `cibuildwheel`. The shell helper is used for
+the build pieces around `cibuildwheel`: CUDA runtime construction, the
+`CIBW_REPAIR_WHEEL_COMMAND` hook, final wheel verification, and optional
+publish verification.
 
-```bash
-python version.py --git-describe
-git tag -a v0.25.dev-test0 -m "Test TVM wheel v0.25.dev-test0"
-
-python -m venv /tmp/tvm-wheel-tools
-/tmp/tvm-wheel-tools/bin/python -m pip install -U pip build auditwheel twine
+For the exact `cibuildwheel` environment, use
+`.github/actions/build-wheel-for-publish/action.yml` as the source of truth.
+For local checks after a wheel exists under `wheelhouse/`, run:
 
+```bash
 TVM_PYTHON=/tmp/tvm-wheel-tools/bin/python \
-TVM_USE_LLVM="/path/to/llvm-config --link-static" \
-TVM_USE_CUDA=/usr/local/cuda-12.8 \
-TVM_WHEEL_DIST_NAME=tvm-temporary-test \
-ci/scripts/package/build_tvm_wheel.sh all
-
-TVM_UPLOAD_REPOSITORY_URL=https://test.pypi.org/legacy/ \
-TVM_PYTHON=/tmp/tvm-wheel-tools/bin/python \
-ci/scripts/package/build_tvm_wheel.sh upload
+ci/scripts/package/build_tvm_wheel.sh verify
 
 TVM_PYTHON=/tmp/tvm-wheel-tools/bin/python \
 ci/scripts/package/build_tvm_wheel.sh verify-pypi
@@ -124,11 +117,6 @@ For a real PyPI upload, leave `TVM_WHEEL_DIST_NAME` unset and set the normal
 Twine credentials:
 
 ```bash
-TWINE_USERNAME=__token__ \
-TWINE_PASSWORD="$PYPI_TOKEN" \
-TVM_PYTHON=/tmp/tvm-wheel-tools/bin/python \
-ci/scripts/package/build_tvm_wheel.sh all
-
 TWINE_USERNAME=__token__ \
 TWINE_PASSWORD="$PYPI_TOKEN" \
 TVM_PYTHON=/tmp/tvm-wheel-tools/bin/python \
@@ -137,14 +125,14 @@ ci/scripts/package/build_tvm_wheel.sh upload
 
 Useful knobs:
 
-- `TVM_USE_LLVM`: LLVM config for the base wheel, default
+- `TVM_USE_LLVM`: LLVM config for the CIBW build and repair helpers, default
   `llvm-config --link-static`.
 - `TVM_USE_CUDA`: CUDA root or `ON` for the CUDA build, default `ON`.
 - `TVM_CUDA_ARCHITECTURES`: CMake CUDA architectures, default `75`.
 - `TVM_WHEEL_DIST_NAME`: optional distribution rename for TestPyPI.
 - `TVM_WHEEL_DIST_VERSION`: optional distribution version rewrite.
 - `TVM_SKIP_REPAIR=1`: leave the injected wheel unrepaired.
-- `TVM_SKIP_CUDA=1`: build a base wheel without the CUDA runtime.
+- `TVM_SKIP_CUDA=1`: build or repair a wheel without the CUDA runtime.
 - `TVM_KEEP_BUILD_DIRS=1`: reuse the CMake build directories.
 - `TVM_AUDITWHEEL_PLAT`: optional `auditwheel repair --plat` override.
 - `TVM_EXPECT_WHEEL_PLATFORM_TAG`: require the final wheel filename to include
diff --git a/ci/scripts/package/build_tvm_wheel.sh b/ci/scripts/package/build_tvm_wheel.sh
index 0babbb068a13..6bcc6cb9dfec 100755
--- a/ci/scripts/package/build_tvm_wheel.sh
+++ b/ci/scripts/package/build_tvm_wheel.sh
@@ -22,11 +22,8 @@ SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
 REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
 
 TVM_PYTHON="${TVM_PYTHON:-python}"
-TVM_RAW_DIST="${TVM_RAW_DIST:-${REPO_ROOT}/dist/tvm-raw}"
-TVM_INJECTED_DIST="${TVM_INJECTED_DIST:-${REPO_ROOT}/dist/tvm-injected}"
 TVM_WHEELHOUSE="${TVM_WHEELHOUSE:-${REPO_ROOT}/wheelhouse}"
 TVM_CUDA_BUILD_DIR="${TVM_CUDA_BUILD_DIR:-${REPO_ROOT}/build-wheel-cuda}"
-TVM_BASE_BUILD_DIR="${TVM_BASE_BUILD_DIR:-${REPO_ROOT}/build-wheel-base}"
 TVM_USE_LLVM="${TVM_USE_LLVM:-llvm-config --link-static}"
 TVM_USE_CUDA="${TVM_USE_CUDA:-ON}"
 TVM_CUDA_ARCHITECTURES="${TVM_CUDA_ARCHITECTURES:-75}"
@@ -35,15 +32,14 @@ TVM_WHEEL_DIST_NAME="${TVM_WHEEL_DIST_NAME:-}"
 TVM_WHEEL_DIST_VERSION="${TVM_WHEEL_DIST_VERSION:-}"
 TVM_SKIP_CUDA="${TVM_SKIP_CUDA:-0}"
 TVM_SKIP_REPAIR="${TVM_SKIP_REPAIR:-0}"
-TVM_BUILD_NO_ISOLATION="${TVM_BUILD_NO_ISOLATION:-0}"
 TVM_KEEP_BUILD_DIRS="${TVM_KEEP_BUILD_DIRS:-0}"
 
 usage() {
   cat <<'EOF'
-Usage: ci/scripts/package/build_tvm_wheel.sh [all|cuda|manylinux-cuda|wheel|inject|repair|cibw-repair|validate|verify|verify-installed|upload|verify-pypi]
+Usage: ci/scripts/package/build_tvm_wheel.sh [cuda|manylinux-cuda|cibw-repair|validate|verify|verify-installed|upload|verify-pypi]
 
 Environment knobs:
-  TVM_USE_LLVM                 LLVM config for the base wheel, default "llvm-config --link-static"
+  TVM_USE_LLVM                 LLVM config used by repair helpers, default "llvm-config --link-static"
   TVM_USE_CUDA                 CUDA root or ON for the CUDA build, default ON
   TVM_CUDA_ARCHITECTURES       CMake CUDA arch list, default 75
   TVM_WHEEL_DIST_NAME          Optional distribution rename for TestPyPI
@@ -51,7 +47,6 @@ Environment knobs:
   TVM_UPLOAD_REPOSITORY_URL    Twine repository URL, e.g. TestPyPI legacy URL
   TVM_SKIP_CUDA=1              Do not build/inject libtvm_runtime_cuda.so
   TVM_SKIP_REPAIR=1            Keep injected wheel as final wheel
-  TVM_BUILD_NO_ISOLATION=1     Pass --no-isolation to python -m build
   TVM_KEEP_BUILD_DIRS=1        Reuse CMake build dirs instead of cleaning them
   TVM_MANYLINUX_IMAGE          manylinux image tag for manylinux-cuda
   TVM_ARCH                     Target architecture for manylinux-cuda
@@ -70,17 +65,6 @@ require_cmd() {
   fi
 }
 
-require_pypa_build() {
-  local check_dir
-  check_dir="$(mktemp -d)"
-  if ! (cd "$check_dir" && "$TVM_PYTHON" -m build --version >/dev/null 2>&1); then
-    rm -rf "$check_dir"
-    echo "error: PyPA build is missing; install it with: ${TVM_PYTHON} -m pip install build" >&2
-    return 1
-  fi
-  rm -rf "$check_dir"
-}
-
 single_wheel() {
   local dir="$1"
   local wheels=()
@@ -155,7 +139,7 @@ run_manylinux_cuda_container() {
   docker exec "$container" bash -lc "
     rpm -i /${cuda_rpm} && \
     dnf clean all && \
-    dnf -y install cuda-toolkit-13-0 && \
+    dnf -y --disablerepo=epel install cuda-toolkit-13-0 && \
     rm /${cuda_rpm} && \
     dnf clean all"
 
@@ -223,36 +207,6 @@ build_cuda_runtime() {
   echo "CUDA runtime: ${cuda_lib}"
 }
 
-build_base_wheel() {
-  require_pypa_build
-  rm -rf "$TVM_RAW_DIST"
-  mkdir -p "$TVM_RAW_DIST"
-  if [[ "$TVM_KEEP_BUILD_DIRS" != "1" ]]; then
-    rm -rf "$TVM_BASE_BUILD_DIR"
-  fi
-
-  echo "Building base TVM wheel with LLVM=${TVM_USE_LLVM}, CUDA=OFF"
-  local cmake_args
-  cmake_args="$(base_cmake_args)"
-  (
-    cd "$TVM_RAW_DIST"
-    if [[ "$TVM_BUILD_NO_ISOLATION" == "1" ]]; then
-      CMAKE_ARGS="${cmake_args}${TVM_EXTRA_CMAKE_ARGS:-}" \
-        "$TVM_PYTHON" -m build --wheel --outdir "$TVM_RAW_DIST" \
-          --no-isolation \
-          -Cbuild-dir="$TVM_BASE_BUILD_DIR" \
-          "$REPO_ROOT"
-    else
-      CMAKE_ARGS="${cmake_args}${TVM_EXTRA_CMAKE_ARGS:-}" \
-        "$TVM_PYTHON" -m build --wheel --outdir "$TVM_RAW_DIST" \
-          -Cbuild-dir="$TVM_BASE_BUILD_DIR" \
-          "$REPO_ROOT"
-    fi
-  )
-
-  single_wheel "$TVM_RAW_DIST" >/dev/null
-}
-
 inject_wheel_file() {
   local raw_wheel="$1"
   local output_dir="$2"
@@ -283,12 +237,6 @@ inject_wheel_file() {
   "$TVM_PYTHON" "$SCRIPT_DIR/inject_cuda_runtime.py" "$raw_wheel" "${inject_args[@]}"
 }
 
-inject_cuda_runtime() {
-  local raw_wheel
-  raw_wheel="$(single_wheel "$TVM_RAW_DIST")"
-  inject_wheel_file "$raw_wheel" "$TVM_INJECTED_DIST"
-}
-
 auditwheel_excludes() {
   local cuda_lib="$1"
   local seen
@@ -346,23 +294,6 @@ llvm_prefix() {
   fi
 }
 
-base_cmake_args() {
-  local llvm_prefix_dir
-  llvm_prefix_dir="$(llvm_prefix || true)"
-  local args=(
-    "-DUSE_LLVM=${TVM_USE_LLVM}"
-    "-DUSE_CUDA=OFF"
-    "-DBUILD_TESTING=OFF"
-    "-DTVM_BUILD_PYTHON_MODULE=ON"
-  )
-  if [[ -n "$llvm_prefix_dir" && -d "$llvm_prefix_dir" ]]; then
-    # scikit-build-core writes its own CMAKE_PREFIX_PATH init cache, so pass
-    # the LLVM prefix as an explicit CMake argument.
-    args+=("-DCMAKE_PREFIX_PATH=${llvm_prefix_dir}")
-  fi
-  printf '%q ' "${args[@]}"
-}
-
 repair_wheel_to_dir() {
   local injected_wheel="$1"
   local output_dir="$2"
@@ -416,15 +347,6 @@ repair_wheel_to_dir() {
   single_wheel "$output_dir" >/dev/null
 }
 
-repair_wheel() {
-  rm -rf "$TVM_WHEELHOUSE"
-  mkdir -p "$TVM_WHEELHOUSE"
-
-  local injected_wheel
-  injected_wheel="$(single_wheel "$TVM_INJECTED_DIST")"
-  repair_wheel_to_dir "$injected_wheel" "$TVM_WHEELHOUSE"
-}
-
 cibw_repair_wheel() {
   local raw_wheel="$1"
   local dest_dir="$2"
@@ -507,20 +429,10 @@ verify_pypi_wheel() {
 }
 
 main() {
-  local step="${1:-all}"
+  local step="${1:-help}"
   case "$step" in
-    all)
-      build_cuda_runtime
-      build_base_wheel
-      inject_cuda_runtime
-      repair_wheel
-      verify_wheel
-      ;;
     cuda) build_cuda_runtime ;;
     manylinux-cuda) run_manylinux_cuda_container ;;
-    wheel) build_base_wheel ;;
-    inject) inject_cuda_runtime ;;
-    repair) repair_wheel ;;
     cibw-repair)
       if [[ "$#" -ne 3 ]]; then
         echo "error: cibw-repair requires <wheel> <dest-dir>" >&2

From 656b89cb1db2f987a48a962c7b53d19df3d77c24 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 16:22:07 -0400
Subject: [PATCH 22/43] Rename TVM wheel packaging helper

---
 .github/actions/build-cuda/action.yml                     | 4 ++--
 .github/actions/build-wheel-for-publish/action.yml        | 6 +++---
 .github/workflows/publish_wheel.yml                       | 4 ++--
 ci/scripts/package/README.md                              | 8 ++++----
 .../package/{build_tvm_wheel.sh => tvm_wheel_helper.sh}   | 4 ++--
 5 files changed, 13 insertions(+), 13 deletions(-)
 rename ci/scripts/package/{build_tvm_wheel.sh => tvm_wheel_helper.sh} (98%)

diff --git a/.github/actions/build-cuda/action.yml b/.github/actions/build-cuda/action.yml
index 1254bcd0d17c..0236bdba04a1 100644
--- a/.github/actions/build-cuda/action.yml
+++ b/.github/actions/build-cuda/action.yml
@@ -51,7 +51,7 @@ runs:
         TVM_SKIP_CUDA: "0"
         TVM_BUILD_PARALLEL_LEVEL: ${{ steps.env_vars.outputs.cpu_count }}
         CMAKE_BUILD_PARALLEL_LEVEL: ${{ steps.env_vars.outputs.cpu_count }}
-      run: ci/scripts/package/build_tvm_wheel.sh manylinux-cuda
+      run: ci/scripts/package/tvm_wheel_helper.sh manylinux-cuda
 
     - name: Install CUDA toolkit
       if: runner.os != 'Linux' && inputs.skip_cuda != 'true'
@@ -67,4 +67,4 @@ runs:
         TVM_SKIP_CUDA: "0"
         TVM_BUILD_PARALLEL_LEVEL: ${{ steps.env_vars.outputs.cpu_count }}
         CMAKE_BUILD_PARALLEL_LEVEL: ${{ steps.env_vars.outputs.cpu_count }}
-      run: ci/scripts/package/build_tvm_wheel.sh cuda
+      run: ci/scripts/package/tvm_wheel_helper.sh cuda
diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
index c3d1c99972c9..dde1a3491428 100644
--- a/.github/actions/build-wheel-for-publish/action.yml
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -189,9 +189,9 @@ runs:
           TVM_SKIP_CUDA="${{ inputs.skip_cuda == 'true' && '1' || '0' }}"
           TVM_BUILD_PARALLEL_LEVEL="${{ steps.env_vars.outputs.cpu_count }}"
         CIBW_REPAIR_WHEEL_COMMAND_LINUX: >-
-          bash "{project}/ci/scripts/package/build_tvm_wheel.sh" cibw-repair "{wheel}" "{dest_dir}"
+          bash "{project}/ci/scripts/package/tvm_wheel_helper.sh" cibw-repair "{wheel}" "{dest_dir}"
         CIBW_REPAIR_WHEEL_COMMAND_MACOS: >-
-          bash "{project}/ci/scripts/package/build_tvm_wheel.sh" cibw-repair "{wheel}" "{dest_dir}"
+          bash "{project}/ci/scripts/package/tvm_wheel_helper.sh" cibw-repair "{wheel}" "{dest_dir}"
         CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: >-
           python "{project}/ci/scripts/package/inject_cuda_runtime.py" "{wheel}"
           --output-dir "{dest_dir}"
@@ -204,4 +204,4 @@ runs:
       env:
         TVM_PYTHON: python
         TVM_EXPECT_WHEEL_PLATFORM_TAG: ${{ steps.wheel_inputs.outputs.wheel_platform_tag }}
-      run: ci/scripts/package/build_tvm_wheel.sh verify
+      run: ci/scripts/package/tvm_wheel_helper.sh verify
diff --git a/.github/workflows/publish_wheel.yml b/.github/workflows/publish_wheel.yml
index c7afa78fb539..323661ca3710 100644
--- a/.github/workflows/publish_wheel.yml
+++ b/.github/workflows/publish_wheel.yml
@@ -184,7 +184,7 @@ jobs:
           TVM_PYTHON: python
           TVM_TEST_INDEX_URL: https://test.pypi.org/simple/
           TVM_EXTRA_INDEX_URL: https://pypi.org/simple
-        run: ci/scripts/package/build_tvm_wheel.sh verify-pypi
+        run: ci/scripts/package/tvm_wheel_helper.sh verify-pypi
 
       - name: Verify package from PyPI
         if: ${{ inputs.publish_repository == 'pypi' }}
@@ -192,4 +192,4 @@ jobs:
           TVM_PYTHON: python
           TVM_TEST_INDEX_URL: https://pypi.org/simple/
           TVM_EXTRA_INDEX_URL: https://pypi.org/simple
-        run: ci/scripts/package/build_tvm_wheel.sh verify-pypi
+        run: ci/scripts/package/tvm_wheel_helper.sh verify-pypi
diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
index 845df075e7ef..f1d5e8c4f3b2 100644
--- a/ci/scripts/package/README.md
+++ b/ci/scripts/package/README.md
@@ -65,7 +65,7 @@ Workflow structure:
 - `.github/actions/build-wheel-for-publish`: installs the cached LLVM prefix
   and runs `pypa/cibuildwheel` for the LLVM-enabled runtime wheel. Its custom
   repair hook injects the CUDA runtime before `auditwheel`/`delocate`/copy repair.
-- `ci/scripts/package/build_tvm_wheel.sh`: implements reusable local and CI
+- `ci/scripts/package/tvm_wheel_helper.sh`: implements reusable local and CI
   entrypoints around the `cibuildwheel` build, such as `cuda`,
   `manylinux-cuda`, `cibw-repair`, `verify`, `upload`, and `verify-pypi`.
 - `ci/scripts/package/inject_cuda_runtime.py`: rewrites wheel metadata and
@@ -107,10 +107,10 @@ For local checks after a wheel exists under `wheelhouse/`, run:
 
 ```bash
 TVM_PYTHON=/tmp/tvm-wheel-tools/bin/python \
-ci/scripts/package/build_tvm_wheel.sh verify
+ci/scripts/package/tvm_wheel_helper.sh verify
 
 TVM_PYTHON=/tmp/tvm-wheel-tools/bin/python \
-ci/scripts/package/build_tvm_wheel.sh verify-pypi
+ci/scripts/package/tvm_wheel_helper.sh verify-pypi
 ```
 
 For a real PyPI upload, leave `TVM_WHEEL_DIST_NAME` unset and set the normal
@@ -120,7 +120,7 @@ Twine credentials:
 TWINE_USERNAME=__token__ \
 TWINE_PASSWORD="$PYPI_TOKEN" \
 TVM_PYTHON=/tmp/tvm-wheel-tools/bin/python \
-ci/scripts/package/build_tvm_wheel.sh upload
+ci/scripts/package/tvm_wheel_helper.sh upload
 ```
 
 Useful knobs:
diff --git a/ci/scripts/package/build_tvm_wheel.sh b/ci/scripts/package/tvm_wheel_helper.sh
similarity index 98%
rename from ci/scripts/package/build_tvm_wheel.sh
rename to ci/scripts/package/tvm_wheel_helper.sh
index 6bcc6cb9dfec..45c10f76077f 100755
--- a/ci/scripts/package/build_tvm_wheel.sh
+++ b/ci/scripts/package/tvm_wheel_helper.sh
@@ -36,7 +36,7 @@ TVM_KEEP_BUILD_DIRS="${TVM_KEEP_BUILD_DIRS:-0}"
 
 usage() {
   cat <<'EOF'
-Usage: ci/scripts/package/build_tvm_wheel.sh [cuda|manylinux-cuda|cibw-repair|validate|verify|verify-installed|upload|verify-pypi]
+Usage: ci/scripts/package/tvm_wheel_helper.sh [cuda|manylinux-cuda|cibw-repair|validate|verify|verify-installed|upload|verify-pypi]
 
 Environment knobs:
   TVM_USE_LLVM                 LLVM config used by repair helpers, default "llvm-config --link-static"
@@ -157,7 +157,7 @@ run_manylinux_cuda_container() {
       python --version
       cmake --version
       nvcc --version
-      ci/scripts/package/build_tvm_wheel.sh cuda'
+      ci/scripts/package/tvm_wheel_helper.sh cuda'
 
   docker exec "$container" bash -lc \
     "chown -R $(id -u):$(id -g) /workspace/build-wheel-cuda || true"

From 99eba8c62053331c25c8f5b78daa79b69acdd907 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 16:45:22 -0400
Subject: [PATCH 23/43] Expose LLVM libs to macOS wheel repair

---
 ci/scripts/package/tvm_wheel_helper.sh | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/ci/scripts/package/tvm_wheel_helper.sh b/ci/scripts/package/tvm_wheel_helper.sh
index 45c10f76077f..4caa7ed9a2de 100755
--- a/ci/scripts/package/tvm_wheel_helper.sh
+++ b/ci/scripts/package/tvm_wheel_helper.sh
@@ -332,11 +332,18 @@ repair_wheel_to_dir() {
     Darwin)
       require_cmd delocate-wheel
       echo "Repairing macOS wheel with delocate"
-      delocate-wheel \
-        --ignore-missing-dependencies \
-        --exclude libtvm_ffi.dylib \
-        -w "$output_dir" \
-        -v "$injected_wheel"
+      (
+        llvm_dir="$(llvm_libdir || true)"
+        if [[ -n "${llvm_dir:-}" && -d "$llvm_dir" ]]; then
+          echo "Adding LLVM libdir to DYLD_LIBRARY_PATH for delocate: ${llvm_dir}"
+          export DYLD_LIBRARY_PATH="${llvm_dir}${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}}"
+        fi
+        delocate-wheel \
+          --ignore-missing-dependencies \
+          --exclude libtvm_ffi.dylib \
+          -w "$output_dir" \
+          -v "$injected_wheel"
+      )
       ;;
     *)
       cp "$injected_wheel" "$output_dir/"

From 265a1c83aa70b5556965fbf622e11b603ecc094e Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 17:38:16 -0400
Subject: [PATCH 24/43] Fix wheel repair verification paths

---
 .../build-wheel-for-publish/action.yml        |  2 +-
 ci/scripts/package/tvm_wheel_helper.sh        | 19 +++++++++++++++++--
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
index dde1a3491428..4735121139f9 100644
--- a/.github/actions/build-wheel-for-publish/action.yml
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -202,6 +202,6 @@ runs:
     - name: Verify final wheel
       shell: bash -l {0}
       env:
-        TVM_PYTHON: python
+        TVM_PYTHON: ${{ runner.os == 'Windows' && 'python' || 'python3' }}
         TVM_EXPECT_WHEEL_PLATFORM_TAG: ${{ steps.wheel_inputs.outputs.wheel_platform_tag }}
       run: ci/scripts/package/tvm_wheel_helper.sh verify
diff --git a/ci/scripts/package/tvm_wheel_helper.sh b/ci/scripts/package/tvm_wheel_helper.sh
index 4caa7ed9a2de..4cee68f23ad5 100755
--- a/ci/scripts/package/tvm_wheel_helper.sh
+++ b/ci/scripts/package/tvm_wheel_helper.sh
@@ -316,14 +316,29 @@ repair_wheel_to_dir() {
       done < <(auditwheel_excludes "$cuda_lib")
       echo "Repairing Linux wheel with auditwheel"
       (
+        auditwheel_libdir=""
+        trap '[[ -z "${auditwheel_libdir:-}" ]] || rm -rf "$auditwheel_libdir"' EXIT
         auditwheel_plat_args=()
         if [[ -n "${TVM_AUDITWHEEL_PLAT:-}" ]]; then
           auditwheel_plat_args+=(--plat "$TVM_AUDITWHEEL_PLAT")
         fi
         llvm_dir="$(llvm_libdir || true)"
         if [[ -n "${llvm_dir:-}" && -d "$llvm_dir" ]]; then
-          echo "Adding LLVM libdir to LD_LIBRARY_PATH for auditwheel: ${llvm_dir}"
-          export LD_LIBRARY_PATH="${llvm_dir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+          auditwheel_libdir="$(mktemp -d)"
+          shopt -s nullglob
+          lib=""
+          for lib in "$llvm_dir"/*.so "$llvm_dir"/*.so.*; do
+            case "$(basename "$lib")" in
+              libstdc++*|libgcc*|libgomp*|libatomic*|libasan*|libtsan*|libubsan*)
+                ;;
+              *)
+                ln -sf "$lib" "$auditwheel_libdir/$(basename "$lib")"
+                ;;
+            esac
+          done
+          shopt -u nullglob
+          echo "Adding filtered LLVM libdir to LD_LIBRARY_PATH for auditwheel: ${auditwheel_libdir}"
+          export LD_LIBRARY_PATH="${auditwheel_libdir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
         fi
         auditwheel repair "${auditwheel_plat_args[@]}" "${exclude_args[@]}" \
           -w "$output_dir" "$injected_wheel"

From 7ac2e4c65f11f4987137a803d18dda2292752eeb Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 18:41:28 -0400
Subject: [PATCH 25/43] Clarify wheel CUDA runtime inputs

---
 .github/actions/build-cuda/action.yml         |  50 +++++--
 .../build-wheel-for-publish/action.yml        |  48 ++++++-
 .github/workflows/publish_wheel.yml           |  24 ++--
 ci/scripts/package/README.md                  |  32 +++--
 ci/scripts/package/inject_cuda_runtime.py     |   4 +-
 ci/scripts/package/tvm_wheel_helper.sh        | 129 ++++++++++++++----
 ci/scripts/package/validate_wheel_elf.py      |  11 +-
 cmake/utils/FindLLVM.cmake                    |  31 +++--
 8 files changed, 244 insertions(+), 85 deletions(-)

diff --git a/.github/actions/build-cuda/action.yml b/.github/actions/build-cuda/action.yml
index 0236bdba04a1..07497df371da 100644
--- a/.github/actions/build-cuda/action.yml
+++ b/.github/actions/build-cuda/action.yml
@@ -26,15 +26,24 @@ inputs:
     description: "Manylinux image tag to use on Linux runners"
     required: false
     default: ""
+  linux_image_tag:
+    description: "Pinned manylinux container tag shared with cibuildwheel"
+    required: false
+    default: ""
   cuda_architectures:
     description: "CMake CUDA architectures for libtvm_runtime_cuda.so"
     required: false
     default: "75"
-  skip_cuda:
-    description: "Set to true to skip the CUDA runtime build"
+  include_cuda_runtime:
+    description: "Set to true to build the CUDA runtime library"
     required: false
     default: "false"
 
+outputs:
+  cuda_runtime_path:
+    description: "Absolute path to the built libtvm_runtime_cuda.so, or empty for CPU-only wheels"
+    value: ${{ steps.cuda_runtime.outputs.path }}
+
 runs:
   using: "composite"
   steps:
@@ -42,29 +51,40 @@ runs:
       id: env_vars
 
     - name: Build CUDA runtime in manylinux
-      if: runner.os == 'Linux' && inputs.skip_cuda != 'true'
+      if: runner.os == 'Linux' && inputs.include_cuda_runtime == 'true'
       shell: bash -l {0}
       env:
         TVM_MANYLINUX_IMAGE: ${{ inputs.linux_image }}
+        TVM_MANYLINUX_IMAGE_TAG: ${{ inputs.linux_image_tag }}
         TVM_ARCH: ${{ inputs.arch }}
         TVM_CUDA_ARCHITECTURES: ${{ inputs.cuda_architectures }}
+        TVM_CUDA_BUILD_DIR: ${{ runner.temp }}/tvm-wheel-cuda
         TVM_SKIP_CUDA: "0"
         TVM_BUILD_PARALLEL_LEVEL: ${{ steps.env_vars.outputs.cpu_count }}
         CMAKE_BUILD_PARALLEL_LEVEL: ${{ steps.env_vars.outputs.cpu_count }}
       run: ci/scripts/package/tvm_wheel_helper.sh manylinux-cuda
 
-    - name: Install CUDA toolkit
-      if: runner.os != 'Linux' && inputs.skip_cuda != 'true'
-      id: cuda-toolkit
-      uses: Jimver/cuda-toolkit@6008063726ffe3309d1b22e413d9e88fed91a2f2
+    - name: Reject non-Linux CUDA runtime builds
+      if: runner.os != 'Linux' && inputs.include_cuda_runtime == 'true'
+      shell: bash -l {0}
+      run: |
+        echo "CUDA runtime wheels are only enabled on Linux in this workflow" >&2
+        exit 1
 
-    - name: Build CUDA runtime on host
-      if: runner.os != 'Linux' && inputs.skip_cuda != 'true'
+    - name: Report CUDA runtime output
+      id: cuda_runtime
       shell: bash -l {0}
       env:
-        TVM_USE_CUDA: ${{ steps.cuda-toolkit.outputs.CUDA_PATH || 'ON' }}
-        TVM_CUDA_ARCHITECTURES: ${{ inputs.cuda_architectures }}
-        TVM_SKIP_CUDA: "0"
-        TVM_BUILD_PARALLEL_LEVEL: ${{ steps.env_vars.outputs.cpu_count }}
-        CMAKE_BUILD_PARALLEL_LEVEL: ${{ steps.env_vars.outputs.cpu_count }}
-      run: ci/scripts/package/tvm_wheel_helper.sh cuda
+        TVM_CUDA_BUILD_DIR: ${{ runner.temp }}/tvm-wheel-cuda
+      run: |
+        set -eux
+        if [[ "${{ inputs.include_cuda_runtime }}" != "true" ]]; then
+          echo "path=" >> "${GITHUB_OUTPUT}"
+          exit 0
+        fi
+        cuda_runtime="$(ci/scripts/package/tvm_wheel_helper.sh cuda-path)"
+        if [[ -z "${cuda_runtime}" ]]; then
+          echo "CUDA runtime build did not produce libtvm_runtime_cuda.so" >&2
+          exit 1
+        fi
+        echo "path=${cuda_runtime}" >> "${GITHUB_OUTPUT}"
diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
index 4735121139f9..c8f5a08581e5 100644
--- a/.github/actions/build-wheel-for-publish/action.yml
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -31,6 +31,10 @@ inputs:
     description: "Manylinux image tag to use on Linux runners"
     required: false
     default: ""
+  linux_image_tag:
+    description: "Pinned manylinux container tag shared with the CUDA runtime build"
+    required: false
+    default: ""
   distribution_name:
     description: "Optional wheel distribution name override, useful for TestPyPI"
     required: false
@@ -39,10 +43,14 @@ inputs:
     description: "CMake CUDA architectures for libtvm_runtime_cuda.so"
     required: false
     default: "75"
-  skip_cuda:
-    description: "Set to true to build a CPU-only wheel without the CUDA runtime"
+  include_cuda_runtime:
+    description: "Set to true to inject the CUDA runtime library"
     required: false
     default: "false"
+  cuda_runtime_path:
+    description: "Absolute path to libtvm_runtime_cuda.so produced by build-cuda"
+    required: false
+    default: ""
 
 runs:
   using: "composite"
@@ -56,14 +64,37 @@ runs:
       run: |
         set -eux
         wheel_platform_tag=""
+        manylinux_container_image=""
+        cibw_manylinux_x86_64_image="${{ inputs.linux_image }}"
+        cibw_manylinux_aarch64_image="${{ inputs.linux_image }}"
+        cibw_container_engine=""
         if [[ "${RUNNER_OS}" == "Linux" ]]; then
           if [[ -z "${{ inputs.linux_image }}" ]]; then
             echo "linux_image is required on Linux runners" >&2
             exit 1
           fi
           wheel_platform_tag="${{ inputs.linux_image }}_${{ inputs.arch }}"
+          if [[ -n "${{ inputs.linux_image_tag }}" ]]; then
+            manylinux_container_image="quay.io/pypa/${{ inputs.linux_image }}_${{ inputs.arch }}:${{ inputs.linux_image_tag }}"
+          else
+            manylinux_container_image="${{ inputs.linux_image }}"
+          fi
+          if [[ "${{ inputs.arch }}" == "x86_64" ]]; then
+            cibw_manylinux_x86_64_image="${manylinux_container_image}"
+          elif [[ "${{ inputs.arch }}" == "aarch64" ]]; then
+            cibw_manylinux_aarch64_image="${manylinux_container_image}"
+          fi
+          cibw_container_engine="docker; create_args: --volume /opt/llvm:/opt/llvm"
+          if [[ -n "${{ inputs.cuda_runtime_path }}" ]]; then
+            cuda_runtime_dir="$(dirname "${{ inputs.cuda_runtime_path }}")"
+            cibw_container_engine+=" --volume ${cuda_runtime_dir}:${cuda_runtime_dir}:ro"
+          fi
         fi
         echo "wheel_platform_tag=${wheel_platform_tag}" >> "${GITHUB_OUTPUT}"
+        echo "manylinux_container_image=${manylinux_container_image}" >> "${GITHUB_OUTPUT}"
+        echo "cibw_manylinux_x86_64_image=${cibw_manylinux_x86_64_image}" >> "${GITHUB_OUTPUT}"
+        echo "cibw_manylinux_aarch64_image=${cibw_manylinux_aarch64_image}" >> "${GITHUB_OUTPUT}"
+        echo "cibw_container_engine=${cibw_container_engine}" >> "${GITHUB_OUTPUT}"
 
     # ---- Cache LLVM prefix ----
     - name: Cache LLVM
@@ -157,12 +188,13 @@ runs:
         CIBW_ARCHS_LINUX: ${{ inputs.arch }}
         CIBW_ARCHS_MACOS: ${{ inputs.arch }}
         CIBW_ARCHS_WINDOWS: ${{ inputs.arch }}
-        CIBW_MANYLINUX_X86_64_IMAGE: ${{ inputs.linux_image }}
-        CIBW_MANYLINUX_AARCH64_IMAGE: ${{ inputs.linux_image }}
+        CIBW_MANYLINUX_X86_64_IMAGE: ${{ steps.wheel_inputs.outputs.cibw_manylinux_x86_64_image }}
+        CIBW_MANYLINUX_AARCH64_IMAGE: ${{ steps.wheel_inputs.outputs.cibw_manylinux_aarch64_image }}
         CIBW_BUILD_VERBOSITY: 1
         CMAKE_BUILD_PARALLEL_LEVEL: ${{ steps.env_vars.outputs.cpu_count }}
-        CIBW_CONTAINER_ENGINE: "docker; create_args: --volume /opt/llvm:/opt/llvm"
+        CIBW_CONTAINER_ENGINE: ${{ steps.wheel_inputs.outputs.cibw_container_engine }}
         CIBW_BEFORE_BUILD_LINUX: >-
+          if command -v dnf >/dev/null 2>&1; then dnf -y install libxml2; fi &&
           python -m pip install -U pip cmake ninja scikit-build-core wheel auditwheel &&
           python -m pip install -v "{project}/3rdparty/tvm-ffi"
         CIBW_BEFORE_BUILD_MACOS: >-
@@ -177,7 +209,8 @@ runs:
           CMAKE_ARGS="-DUSE_LLVM=/opt/llvm/bin/llvm-config-static -DUSE_CUDA=OFF -DBUILD_TESTING=OFF -DTVM_BUILD_PYTHON_MODULE=ON -DCMAKE_PREFIX_PATH=/opt/llvm"
           TVM_CUDA_ARCHITECTURES="${{ inputs.cuda_architectures }}"
           TVM_WHEEL_DIST_NAME="${{ inputs.distribution_name }}"
-          TVM_SKIP_CUDA="${{ inputs.skip_cuda == 'true' && '1' || '0' }}"
+          TVM_SKIP_CUDA="${{ inputs.include_cuda_runtime == 'true' && '0' || '1' }}"
+          TVM_CUDA_RUNTIME_PATH="${{ inputs.cuda_runtime_path }}"
           TVM_AUDITWHEEL_PLAT="${{ steps.wheel_inputs.outputs.wheel_platform_tag }}"
           TVM_BUILD_PARALLEL_LEVEL="${{ steps.env_vars.outputs.cpu_count }}"
         CIBW_ENVIRONMENT_WINDOWS: >-
@@ -186,7 +219,8 @@ runs:
           CMAKE_ARGS="-DUSE_LLVM=C:/opt/llvm/Library/bin/llvm-config-static.bat -DUSE_CUDA=OFF -DBUILD_TESTING=OFF -DTVM_BUILD_PYTHON_MODULE=ON -DCMAKE_PREFIX_PATH=C:/opt/llvm/Library"
           TVM_CUDA_ARCHITECTURES="${{ inputs.cuda_architectures }}"
           TVM_WHEEL_DIST_NAME="${{ inputs.distribution_name }}"
-          TVM_SKIP_CUDA="${{ inputs.skip_cuda == 'true' && '1' || '0' }}"
+          TVM_SKIP_CUDA="${{ inputs.include_cuda_runtime == 'true' && '0' || '1' }}"
+          TVM_CUDA_RUNTIME_PATH="${{ inputs.cuda_runtime_path }}"
           TVM_BUILD_PARALLEL_LEVEL="${{ steps.env_vars.outputs.cpu_count }}"
         CIBW_REPAIR_WHEEL_COMMAND_LINUX: >-
           bash "{project}/ci/scripts/package/tvm_wheel_helper.sh" cibw-repair "{wheel}" "{dest_dir}"
diff --git a/.github/workflows/publish_wheel.yml b/.github/workflows/publish_wheel.yml
index 323661ca3710..9d22a500b5af 100644
--- a/.github/workflows/publish_wheel.yml
+++ b/.github/workflows/publish_wheel.yml
@@ -57,33 +57,37 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - name: Linux x86_64 CUDA wheel (manylinux_2_28)
+          - name: Linux x86_64 wheel with CUDA runtime (manylinux_2_28)
             os: ubuntu-latest
             arch: x86_64
             build: cp310-manylinux_x86_64
             linux_image: manylinux_2_28
-            skip_cuda: "false"
+            linux_image_tag: 2026.01.04-1
+            include_cuda_runtime: "true"
             artifact_suffix: linux-x86_64-manylinux_2_28
-          - name: Linux aarch64 CUDA wheel (manylinux_2_28)
+          - name: Linux aarch64 wheel with CUDA runtime (manylinux_2_28)
             os: ubuntu-24.04-arm
             arch: aarch64
             build: cp310-manylinux_aarch64
             linux_image: manylinux_2_28
-            skip_cuda: "false"
+            linux_image_tag: 2026.01.04-1
+            include_cuda_runtime: "true"
             artifact_suffix: linux-aarch64-manylinux_2_28
           - name: macOS arm64 CPU wheel
             os: macos-14
             arch: arm64
             build: cp310-macosx_arm64
             linux_image: ""
-            skip_cuda: "true"
+            linux_image_tag: ""
+            include_cuda_runtime: "false"
             artifact_suffix: macos-arm64
           - name: Windows AMD64 CPU wheel
             os: windows-latest
             arch: AMD64
             build: cp310-win_amd64
             linux_image: ""
-            skip_cuda: "true"
+            linux_image_tag: ""
+            include_cuda_runtime: "false"
             artifact_suffix: windows-amd64
     steps:
       - name: Checkout source
@@ -95,12 +99,14 @@ jobs:
           fetch-tags: true
 
       - name: Build CUDA runtime
+        id: build_cuda
         uses: ./.github/actions/build-cuda
         with:
           arch: ${{ matrix.arch }}
           linux_image: ${{ matrix.linux_image }}
+          linux_image_tag: ${{ matrix.linux_image_tag }}
           cuda_architectures: ${{ inputs.cuda_architectures }}
-          skip_cuda: ${{ matrix.skip_cuda }}
+          include_cuda_runtime: ${{ matrix.include_cuda_runtime }}
 
       - name: Build TVM wheel
         uses: ./.github/actions/build-wheel-for-publish
@@ -108,9 +114,11 @@ jobs:
           arch: ${{ matrix.arch }}
           build: ${{ matrix.build }}
           linux_image: ${{ matrix.linux_image }}
+          linux_image_tag: ${{ matrix.linux_image_tag }}
           distribution_name: ${{ inputs.distribution_name }}
           cuda_architectures: ${{ inputs.cuda_architectures }}
-          skip_cuda: ${{ matrix.skip_cuda }}
+          include_cuda_runtime: ${{ matrix.include_cuda_runtime }}
+          cuda_runtime_path: ${{ steps.build_cuda.outputs.cuda_runtime_path }}
 
       - name: Upload wheel artifact
         uses: actions/upload-artifact@v4
diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
index f1d5e8c4f3b2..defad74136f1 100644
--- a/ci/scripts/package/README.md
+++ b/ci/scripts/package/README.md
@@ -24,32 +24,30 @@ and shell/Python helpers.
 
 The wheel build flow is:
 
-1. Build `libtvm_runtime_cuda.so` in a CUDA-enabled CMake build.
+1. Optionally build `libtvm_runtime_cuda.so` in a CUDA-enabled Linux CMake build.
 2. Build the main Python wheel with `cibuildwheel`, LLVM enabled, and CUDA
    disabled.
-3. Inject the CUDA runtime DSO into `tvm/lib/` during the `cibuildwheel`
-   repair hook.
+3. When requested, inject the CUDA runtime DSO into `tvm/lib/` during the
+   `cibuildwheel` repair hook.
 4. Repair the wheel, excluding CUDA driver/runtime DSOs and `libtvm_ffi`.
 5. Validate ELF links so intra-wheel TVM DSOs resolve through relative rpaths.
    LLVM is expected to be linked statically; the final wheel must not bundle
    or dynamically depend on `libLLVM`.
 6. Verify the wheel in a fresh virtualenv.
-7. Upload with `twine`.
+7. Optionally upload and verify the uploaded package.
 
 GitHub Actions flow:
 
-1. Create a tag that contains these packaging files.
-2. Open the `Publish TVM wheel` workflow in GitHub Actions.
-3. Fill `tag` with that tag.
-4. The workflow builds a platform wheel matrix:
-   - Linux x86_64 in a `manylinux_2_28` container, with CUDA enabled.
-   - Linux aarch64 in a `manylinux_2_28` container, with CUDA enabled.
+1. The `Publish TVM wheel` workflow builds a platform wheel matrix:
+   - Linux x86_64 in a pinned `manylinux_2_28` container, with the CUDA runtime.
+   - Linux aarch64 in a pinned `manylinux_2_28` container, with the CUDA runtime.
    - macOS arm64 CPU-only.
    - Windows AMD64 CPU-only.
-5. For a TestPyPI run, set `publish_repository=testpypi` and set
-   `distribution_name` to a temporary package name.
-6. After the workflow build, upload, and `verify_pypi` jobs pass, run it again
-   with the final tag/name and `publish_repository=pypi`.
+2. The Linux CUDA runtime action exposes the built DSO path as an action output.
+   The wheel action receives that path explicitly and mounts it into the
+   `cibuildwheel` container for the repair hook.
+3. The optional publishing jobs upload the artifacts and can verify the package
+   from the selected package index.
 
 Linux wheels are built inside manylinux images. This avoids accidentally
 publishing a wheel tagged for the GitHub runner's host glibc, such as
@@ -61,7 +59,8 @@ Workflow structure:
   artifact upload, optional publishing, and post-upload verification.
 - `.github/actions/detect-env-vars`: shared environment detection.
 - `.github/actions/build-cuda`: builds only the optional CUDA runtime library.
-  On Linux this action owns the manylinux Docker/CUDA setup.
+  On Linux this action owns the pinned manylinux Docker/CUDA setup and exposes
+  the runtime DSO path as an action output.
 - `.github/actions/build-wheel-for-publish`: installs the cached LLVM prefix
   and runs `pypa/cibuildwheel` for the LLVM-enabled runtime wheel. Its custom
   repair hook injects the CUDA runtime before `auditwheel`/`delocate`/copy repair.
@@ -128,6 +127,7 @@ Useful knobs:
 - `TVM_USE_LLVM`: LLVM config for the CIBW build and repair helpers, default
   `llvm-config --link-static`.
 - `TVM_USE_CUDA`: CUDA root or `ON` for the CUDA build, default `ON`.
+- `TVM_CUDA_RUNTIME_PATH`: explicit path to `libtvm_runtime_cuda.so` for repair.
 - `TVM_CUDA_ARCHITECTURES`: CMake CUDA architectures, default `75`.
 - `TVM_WHEEL_DIST_NAME`: optional distribution rename for TestPyPI.
 - `TVM_WHEEL_DIST_VERSION`: optional distribution version rewrite.
@@ -135,6 +135,8 @@ Useful knobs:
 - `TVM_SKIP_CUDA=1`: build or repair a wheel without the CUDA runtime.
 - `TVM_KEEP_BUILD_DIRS=1`: reuse the CMake build directories.
 - `TVM_AUDITWHEEL_PLAT`: optional `auditwheel repair --plat` override.
+- `TVM_AUDITWHEEL_LIBRARY_PATH`: optional, explicit library search path for
+  `auditwheel repair`.
 - `TVM_EXPECT_WHEEL_PLATFORM_TAG`: require the final wheel filename to include
   a specific platform tag, such as `manylinux_2_28_x86_64`.
 - `TVM_TEST_INDEX_URL`: package index for `verify-pypi`, default TestPyPI.
diff --git a/ci/scripts/package/inject_cuda_runtime.py b/ci/scripts/package/inject_cuda_runtime.py
index 121cc1172ce1..0f48028d65fc 100755
--- a/ci/scripts/package/inject_cuda_runtime.py
+++ b/ci/scripts/package/inject_cuda_runtime.py
@@ -108,7 +108,7 @@ def _retag_wheel_filename(
     if len(parts) not in (5, 6):
         raise ValueError(f"Unsupported wheel filename: {wheel.name}")
     tags = parts[2:]
-    return f"{_wheel_escape(dist_name)}-{version}-{'-'.join(tags)}.whl"
+    return f"{_wheel_escape(dist_name)}-{_wheel_escape(version)}-{'-'.join(tags)}.whl"
 
 
 def rewrite_wheel(
@@ -129,7 +129,7 @@ def rewrite_wheel(
 
         final_name = distribution_name or original_name
         final_version = distribution_version or original_version
-        final_dist_info = f"{_wheel_escape(final_name)}-{final_version}.dist-info"
+        final_dist_info = f"{_wheel_escape(final_name)}-{_wheel_escape(final_version)}.dist-info"
         record_path = f"{final_dist_info}/RECORD"
         output_path = output_dir / _retag_wheel_filename(wheel, final_name, final_version)
 
diff --git a/ci/scripts/package/tvm_wheel_helper.sh b/ci/scripts/package/tvm_wheel_helper.sh
index 4cee68f23ad5..fdd1a9641e05 100755
--- a/ci/scripts/package/tvm_wheel_helper.sh
+++ b/ci/scripts/package/tvm_wheel_helper.sh
@@ -24,6 +24,7 @@ REPO_ROOT="$(cd "${SCRIPT_DIR}/../../.." && pwd)"
 TVM_PYTHON="${TVM_PYTHON:-python}"
 TVM_WHEELHOUSE="${TVM_WHEELHOUSE:-${REPO_ROOT}/wheelhouse}"
 TVM_CUDA_BUILD_DIR="${TVM_CUDA_BUILD_DIR:-${REPO_ROOT}/build-wheel-cuda}"
+TVM_CUDA_RUNTIME_PATH="${TVM_CUDA_RUNTIME_PATH:-}"
 TVM_USE_LLVM="${TVM_USE_LLVM:-llvm-config --link-static}"
 TVM_USE_CUDA="${TVM_USE_CUDA:-ON}"
 TVM_CUDA_ARCHITECTURES="${TVM_CUDA_ARCHITECTURES:-75}"
@@ -36,11 +37,12 @@ TVM_KEEP_BUILD_DIRS="${TVM_KEEP_BUILD_DIRS:-0}"
 
 usage() {
   cat <<'EOF'
-Usage: ci/scripts/package/tvm_wheel_helper.sh [cuda|manylinux-cuda|cibw-repair|validate|verify|verify-installed|upload|verify-pypi]
+Usage: ci/scripts/package/tvm_wheel_helper.sh [cuda|cuda-path|manylinux-cuda|cibw-repair|validate|verify|verify-installed|upload|verify-pypi]
 
 Environment knobs:
   TVM_USE_LLVM                 LLVM config used by repair helpers, default "llvm-config --link-static"
   TVM_USE_CUDA                 CUDA root or ON for the CUDA build, default ON
+  TVM_CUDA_RUNTIME_PATH        Explicit libtvm_runtime_cuda.so path for repair
   TVM_CUDA_ARCHITECTURES       CMake CUDA arch list, default 75
   TVM_WHEEL_DIST_NAME          Optional distribution rename for TestPyPI
   TVM_WHEEL_DIST_VERSION       Optional distribution version rewrite
@@ -49,8 +51,10 @@ Environment knobs:
   TVM_SKIP_REPAIR=1            Keep injected wheel as final wheel
   TVM_KEEP_BUILD_DIRS=1        Reuse CMake build dirs instead of cleaning them
   TVM_MANYLINUX_IMAGE          manylinux image tag for manylinux-cuda
+  TVM_MANYLINUX_IMAGE_TAG      optional pinned image tag for manylinux-cuda
   TVM_ARCH                     Target architecture for manylinux-cuda
   TVM_AUDITWHEEL_PLAT          Optional auditwheel --plat value
+  TVM_AUDITWHEEL_LIBRARY_PATH  Optional library search path for auditwheel repair
   TVM_EXPECT_WHEEL_PLATFORM_TAG
                                 Require the final wheel filename to include this tag
   TVM_TEST_INDEX_URL           Package index for verify-pypi, default TestPyPI
@@ -99,12 +103,31 @@ PY
 }
 
 cuda_runtime_path() {
+  if [[ -n "$TVM_CUDA_RUNTIME_PATH" ]]; then
+    if [[ -f "$TVM_CUDA_RUNTIME_PATH" ]]; then
+      echo "$TVM_CUDA_RUNTIME_PATH"
+    fi
+    return 0
+  fi
   if [[ ! -d "$TVM_CUDA_BUILD_DIR" ]]; then
     return 0
   fi
   find "$TVM_CUDA_BUILD_DIR" -type f -name 'libtvm_runtime_cuda.so' | sort | tail -n 1
 }
 
+manylinux_image_name() {
+  local base="$1"
+  local arch="$2"
+  local tag="${3:-}"
+  if [[ "$base" == *"/"* || "$base" == *":"* ]]; then
+    echo "$base"
+  elif [[ -n "$tag" ]]; then
+    echo "quay.io/pypa/${base}_${arch}:${tag}"
+  else
+    echo "quay.io/pypa/${base}_${arch}:latest"
+  fi
+}
+
 run_manylinux_cuda_container() {
   if [[ "$TVM_SKIP_CUDA" == "1" ]]; then
     echo "Skipping manylinux CUDA build because TVM_SKIP_CUDA=1"
@@ -122,13 +145,18 @@ run_manylinux_cuda_container() {
     return 1
   fi
 
-  local image="quay.io/pypa/${TVM_MANYLINUX_IMAGE}_${TVM_ARCH}:latest"
+  local image
+  image="$(manylinux_image_name "$TVM_MANYLINUX_IMAGE" "$TVM_ARCH" "${TVM_MANYLINUX_IMAGE_TAG:-}")"
   local container="tvm_wheel_cuda_${GITHUB_RUN_ID:-local}_${GITHUB_RUN_ATTEMPT:-1}_${TVM_ARCH}"
+  local host_cuda_build_dir="$TVM_CUDA_BUILD_DIR"
+  local container_cuda_build_dir="/workspace-cuda-build"
+  mkdir -p "$host_cuda_build_dir"
   docker pull "$image"
   docker rm -f "$container" >/dev/null 2>&1 || true
   docker run --name "$container" -d \
     --workdir /workspace \
     --volume "${REPO_ROOT}:/workspace" \
+    --volume "${host_cuda_build_dir}:${container_cuda_build_dir}" \
     "$image" tail -f /dev/null
   trap "docker rm -f '${container}' >/dev/null 2>&1 || true" EXIT
 
@@ -147,6 +175,7 @@ run_manylinux_cuda_container() {
     -e TVM_PYTHON=/opt/python/cp310-cp310/bin/python \
     -e TVM_USE_CUDA=/usr/local/cuda \
     -e TVM_CUDA_ARCHITECTURES="$TVM_CUDA_ARCHITECTURES" \
+    -e TVM_CUDA_BUILD_DIR="$container_cuda_build_dir" \
     -e TVM_SKIP_CUDA="$TVM_SKIP_CUDA" \
     -e CMAKE_BUILD_PARALLEL_LEVEL="$TVM_BUILD_PARALLEL_LEVEL" \
     -e TVM_BUILD_PARALLEL_LEVEL="$TVM_BUILD_PARALLEL_LEVEL" \
@@ -160,7 +189,7 @@ run_manylinux_cuda_container() {
       ci/scripts/package/tvm_wheel_helper.sh cuda'
 
   docker exec "$container" bash -lc \
-    "chown -R $(id -u):$(id -g) /workspace/build-wheel-cuda || true"
+    "chown -R $(id -u):$(id -g) ${container_cuda_build_dir} || true"
 }
 
 build_cuda_runtime() {
@@ -294,6 +323,58 @@ llvm_prefix() {
   fi
 }
 
+prepare_repair_libdir() {
+  local source_dir="$1"
+  shift
+  local repair_libdir
+  repair_libdir="$(mktemp -d)"
+  shopt -s nullglob
+  local pattern lib
+  for pattern in "$@"; do
+    for lib in "$source_dir"/$pattern; do
+      ln -sf "$lib" "$repair_libdir/$(basename "$lib")"
+    done
+  done
+  shopt -u nullglob
+  if find "$repair_libdir" -type l -print -quit | grep -q .; then
+    echo "$repair_libdir"
+  else
+    rm -rf "$repair_libdir"
+  fi
+}
+
+diagnose_wheel_elf() (
+  local wheel="$1"
+  if ! command -v readelf >/dev/null 2>&1; then
+    return 0
+  fi
+  local tmpdir
+  tmpdir="$(mktemp -d)"
+  trap 'rm -rf "$tmpdir"' EXIT
+  "$TVM_PYTHON" - "$wheel" "$tmpdir" <<'PY'
+from pathlib import Path
+import sys
+import zipfile
+
+wheel = Path(sys.argv[1])
+target = Path(sys.argv[2])
+with zipfile.ZipFile(wheel) as zf:
+    for name in zf.namelist():
+        if name.endswith(".so") or ".so." in name:
+            zf.extract(name, target)
+PY
+  local lib rel
+  while IFS= read -r lib; do
+    rel="${lib#"$tmpdir"/}"
+    echo "::group::ELF diagnostics: ${rel}"
+    readelf -d "$lib" | sed -n 's/.*Shared library: \[\(.*\)\].*/NEEDED \1/p; s/.*Library .*path: \[\(.*\)\].*/RPATH \1/p' || true
+    readelf --version-info "$lib" \
+      | sed -n 's/.*Name: \(GLIBC[^ ]*\|GLIBCXX[^ ]*\|CXXABI[^ ]*\).*/VERSION \1/p' \
+      | sort -Vu || true
+    echo "::endgroup::"
+  done < <(find "$tmpdir" -type f \( -name '*.so' -o -name '*.so.*' \) | sort)
+)
+
 repair_wheel_to_dir() {
   local injected_wheel="$1"
   local output_dir="$2"
@@ -316,42 +397,37 @@ repair_wheel_to_dir() {
       done < <(auditwheel_excludes "$cuda_lib")
       echo "Repairing Linux wheel with auditwheel"
       (
-        auditwheel_libdir=""
-        trap '[[ -z "${auditwheel_libdir:-}" ]] || rm -rf "$auditwheel_libdir"' EXIT
         auditwheel_plat_args=()
         if [[ -n "${TVM_AUDITWHEEL_PLAT:-}" ]]; then
           auditwheel_plat_args+=(--plat "$TVM_AUDITWHEEL_PLAT")
         fi
-        llvm_dir="$(llvm_libdir || true)"
-        if [[ -n "${llvm_dir:-}" && -d "$llvm_dir" ]]; then
-          auditwheel_libdir="$(mktemp -d)"
-          shopt -s nullglob
-          lib=""
-          for lib in "$llvm_dir"/*.so "$llvm_dir"/*.so.*; do
-            case "$(basename "$lib")" in
-              libstdc++*|libgcc*|libgomp*|libatomic*|libasan*|libtsan*|libubsan*)
-                ;;
-              *)
-                ln -sf "$lib" "$auditwheel_libdir/$(basename "$lib")"
-                ;;
-            esac
-          done
-          shopt -u nullglob
-          echo "Adding filtered LLVM libdir to LD_LIBRARY_PATH for auditwheel: ${auditwheel_libdir}"
-          export LD_LIBRARY_PATH="${auditwheel_libdir}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+        if [[ -n "${TVM_AUDITWHEEL_LIBRARY_PATH:-}" ]]; then
+          echo "Adding explicit library path to LD_LIBRARY_PATH for auditwheel: ${TVM_AUDITWHEEL_LIBRARY_PATH}"
+          export LD_LIBRARY_PATH="${TVM_AUDITWHEEL_LIBRARY_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}"
+        fi
+        if ! auditwheel -v repair "${auditwheel_plat_args[@]}" "${exclude_args[@]}" \
+          -w "$output_dir" "$injected_wheel"; then
+          echo "auditwheel repair failed; printing diagnostics for ${injected_wheel}" >&2
+          auditwheel -v show "$injected_wheel" >&2 || true
+          diagnose_wheel_elf "$injected_wheel" >&2 || true
+          return 1
         fi
-        auditwheel repair "${auditwheel_plat_args[@]}" "${exclude_args[@]}" \
-          -w "$output_dir" "$injected_wheel"
       )
       ;;
     Darwin)
       require_cmd delocate-wheel
       echo "Repairing macOS wheel with delocate"
       (
+        repair_libdir=""
+        trap '[[ -z "${repair_libdir:-}" ]] || rm -rf "$repair_libdir"' EXIT
         llvm_dir="$(llvm_libdir || true)"
         if [[ -n "${llvm_dir:-}" && -d "$llvm_dir" ]]; then
-          echo "Adding LLVM libdir to DYLD_LIBRARY_PATH for delocate: ${llvm_dir}"
-          export DYLD_LIBRARY_PATH="${llvm_dir}${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}}"
+          repair_libdir="$(prepare_repair_libdir "$llvm_dir" \
+            'libxml2*.dylib' 'libz*.dylib' 'libzstd*.dylib' 'liblzma*.dylib' 'libiconv*.dylib' || true)"
+          if [[ -n "${repair_libdir:-}" ]]; then
+            echo "Adding filtered LLVM libdir to DYLD_LIBRARY_PATH for delocate: ${repair_libdir}"
+            export DYLD_LIBRARY_PATH="${repair_libdir}${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}}"
+          fi
         fi
         delocate-wheel \
           --ignore-missing-dependencies \
@@ -454,6 +530,7 @@ main() {
   local step="${1:-help}"
   case "$step" in
     cuda) build_cuda_runtime ;;
+    cuda-path) cuda_runtime_path ;;
     manylinux-cuda) run_manylinux_cuda_container ;;
     cibw-repair)
       if [[ "$#" -ne 3 ]]; then
diff --git a/ci/scripts/package/validate_wheel_elf.py b/ci/scripts/package/validate_wheel_elf.py
index cde90bd43a45..7ff81234d73a 100644
--- a/ci/scripts/package/validate_wheel_elf.py
+++ b/ci/scripts/package/validate_wheel_elf.py
@@ -86,11 +86,18 @@ def validate(wheel: Path) -> None:
         if not libdir.is_dir():
             raise RuntimeError(f"wheel does not contain {libdir.relative_to(root)}")
 
+        bundled_tvm_ffi = sorted(
+            str(path.relative_to(root)) for path in root.rglob("libtvm_ffi*.so*") if path.is_file()
+        )
+        if bundled_tvm_ffi:
+            raise RuntimeError(
+                "TVM wheel must depend on tvm_ffi instead of bundling libtvm_ffi: "
+                + ", ".join(bundled_tvm_ffi)
+            )
+
         libs = {path.name: path for path in sorted(libdir.glob("*.so*")) if path.is_file()}
         if "libtvm_runtime.so" not in libs:
             raise RuntimeError("wheel does not contain tvm/lib/libtvm_runtime.so")
-        if "libtvm_ffi.so" in libs:
-            raise RuntimeError("TVM wheel must depend on tvm_ffi instead of bundling libtvm_ffi.so")
         bundled_llvm = sorted(
             str(path.relative_to(root)) for path in root.rglob("libLLVM*.so*") if path.is_file()
         )
diff --git a/cmake/utils/FindLLVM.cmake b/cmake/utils/FindLLVM.cmake
index 3fef7baa1b7d..439c7d412ef1 100644
--- a/cmake/utils/FindLLVM.cmake
+++ b/cmake/utils/FindLLVM.cmake
@@ -231,17 +231,28 @@ macro(find_llvm use_llvm)
           endif()
         endif()
       elseif("${__flag}" STREQUAL "-lxml2")
-        find_library(LIBXML2_LIBRARY
-          NAMES xml2 libxml2
-          HINTS ${__llvm_lib_hints}
-          NO_DEFAULT_PATH)
-        if (LIBXML2_LIBRARY)
-          message(STATUS "LLVM links against xml2: ${LIBXML2_LIBRARY}")
-          list(APPEND LLVM_LIBS "${LIBXML2_LIBRARY}")
+        if (UNIX AND NOT APPLE)
+          find_library(LIBXML2_SYSTEM_LIBRARY
+            NAMES libxml2.so.2 xml2 libxml2
+            PATHS /usr/lib64 /usr/lib /lib64 /lib
+            NO_DEFAULT_PATH)
+        endif()
+        if (LIBXML2_SYSTEM_LIBRARY)
+          message(STATUS "LLVM links against system xml2: ${LIBXML2_SYSTEM_LIBRARY}")
+          list(APPEND LLVM_LIBS "${LIBXML2_SYSTEM_LIBRARY}")
         else()
-          message(STATUS "LLVM links against xml2")
-          find_package(LibXml2 REQUIRED)
-          list(APPEND LLVM_LIBS "LibXml2::LibXml2")
+          find_library(LIBXML2_LIBRARY
+            NAMES libxml2.a xml2 libxml2
+            HINTS ${__llvm_lib_hints}
+            NO_DEFAULT_PATH)
+          if (LIBXML2_LIBRARY)
+            message(STATUS "LLVM links against xml2: ${LIBXML2_LIBRARY}")
+            list(APPEND LLVM_LIBS "${LIBXML2_LIBRARY}")
+          else()
+            message(STATUS "LLVM links against xml2")
+            find_package(LibXml2 REQUIRED)
+            list(APPEND LLVM_LIBS "LibXml2::LibXml2")
+          endif()
         endif()
       elseif("${__flag}" STREQUAL "zstd.dll.lib")
         message(STATUS "LLVM linker flag under LLVM libdir: ${__llvm_libdir}/zstd.lib")

From a5594e07f1b84bfaac9e46675d75f579a8ee355b Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 18:48:01 -0400
Subject: [PATCH 26/43] Fix wheel action runtime paths

---
 .github/actions/build-wheel-for-publish/action.yml | 6 +++---
 ci/scripts/package/tvm_wheel_helper.sh             | 7 ++++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
index c8f5a08581e5..a05d9ed9584d 100644
--- a/.github/actions/build-wheel-for-publish/action.yml
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -65,9 +65,9 @@ runs:
         set -eux
         wheel_platform_tag=""
         manylinux_container_image=""
-        cibw_manylinux_x86_64_image="${{ inputs.linux_image }}"
-        cibw_manylinux_aarch64_image="${{ inputs.linux_image }}"
-        cibw_container_engine=""
+        cibw_manylinux_x86_64_image="${{ inputs.linux_image || 'manylinux_2_28' }}"
+        cibw_manylinux_aarch64_image="${{ inputs.linux_image || 'manylinux_2_28' }}"
+        cibw_container_engine="docker"
         if [[ "${RUNNER_OS}" == "Linux" ]]; then
           if [[ -z "${{ inputs.linux_image }}" ]]; then
             echo "linux_image is required on Linux runners" >&2
diff --git a/ci/scripts/package/tvm_wheel_helper.sh b/ci/scripts/package/tvm_wheel_helper.sh
index fdd1a9641e05..efaf63062165 100755
--- a/ci/scripts/package/tvm_wheel_helper.sh
+++ b/ci/scripts/package/tvm_wheel_helper.sh
@@ -149,14 +149,15 @@ run_manylinux_cuda_container() {
   image="$(manylinux_image_name "$TVM_MANYLINUX_IMAGE" "$TVM_ARCH" "${TVM_MANYLINUX_IMAGE_TAG:-}")"
   local container="tvm_wheel_cuda_${GITHUB_RUN_ID:-local}_${GITHUB_RUN_ATTEMPT:-1}_${TVM_ARCH}"
   local host_cuda_build_dir="$TVM_CUDA_BUILD_DIR"
-  local container_cuda_build_dir="/workspace-cuda-build"
+  local container_cuda_root="/workspace-cuda-build"
+  local container_cuda_build_dir="${container_cuda_root}/build"
   mkdir -p "$host_cuda_build_dir"
   docker pull "$image"
   docker rm -f "$container" >/dev/null 2>&1 || true
   docker run --name "$container" -d \
     --workdir /workspace \
     --volume "${REPO_ROOT}:/workspace" \
-    --volume "${host_cuda_build_dir}:${container_cuda_build_dir}" \
+    --volume "${host_cuda_build_dir}:${container_cuda_root}" \
     "$image" tail -f /dev/null
   trap "docker rm -f '${container}' >/dev/null 2>&1 || true" EXIT
 
@@ -189,7 +190,7 @@ run_manylinux_cuda_container() {
       ci/scripts/package/tvm_wheel_helper.sh cuda'
 
   docker exec "$container" bash -lc \
-    "chown -R $(id -u):$(id -g) ${container_cuda_build_dir} || true"
+    "chown -R $(id -u):$(id -g) ${container_cuda_root} || true"
 }
 
 build_cuda_runtime() {

From 1c663a194a8709d2bf41dd1a49d915e8eb6611b2 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 19:51:47 -0400
Subject: [PATCH 27/43] Bundle Windows wheel support DLLs

---
 .../build-wheel-for-publish/action.yml        |  7 ++
 ci/scripts/package/README.md                  |  8 +-
 ci/scripts/package/inject_cuda_runtime.py     | 78 ++++++++++++++++++-
 3 files changed, 90 insertions(+), 3 deletions(-)

diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
index a05d9ed9584d..e4f959d31d9a 100644
--- a/.github/actions/build-wheel-for-publish/action.yml
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -230,6 +230,13 @@ runs:
           python "{project}/ci/scripts/package/inject_cuda_runtime.py" "{wheel}"
           --output-dir "{dest_dir}"
           --distribution-name "${{ inputs.distribution_name }}"
+          --extra-library-dir "C:/opt/llvm/Library/bin"
+          --extra-library-pattern "libxml2*.dll"
+          --extra-library-pattern "zstd*.dll"
+          --extra-library-pattern "zlib*.dll"
+          --extra-library-pattern "*lzma*.dll"
+          --extra-library-pattern "*iconv*.dll"
+          --extra-library-pattern "*charset*.dll"
         CIBW_TEST_COMMAND: >-
           python "{project}/ci/scripts/package/verify_tvm_install.py"
 
diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
index defad74136f1..ac2ecb450c79 100644
--- a/ci/scripts/package/README.md
+++ b/ci/scripts/package/README.md
@@ -30,6 +30,8 @@ The wheel build flow is:
 3. When requested, inject the CUDA runtime DSO into `tvm/lib/` during the
    `cibuildwheel` repair hook.
 4. Repair the wheel, excluding CUDA driver/runtime DSOs and `libtvm_ffi`.
+   On Windows, copy the small runtime DLLs required by LLVM support libraries
+   into `tvm/lib/` because there is no auditwheel-style repair tool.
 5. Validate ELF links so intra-wheel TVM DSOs resolve through relative rpaths.
    LLVM is expected to be linked statically; the final wheel must not bundle
    or dynamically depend on `libLLVM`.
@@ -63,12 +65,14 @@ Workflow structure:
   the runtime DSO path as an action output.
 - `.github/actions/build-wheel-for-publish`: installs the cached LLVM prefix
   and runs `pypa/cibuildwheel` for the LLVM-enabled runtime wheel. Its custom
-  repair hook injects the CUDA runtime before `auditwheel`/`delocate`/copy repair.
+  repair hook injects the CUDA runtime before `auditwheel`/`delocate`/Windows
+  dependency-copy repair.
 - `ci/scripts/package/tvm_wheel_helper.sh`: implements reusable local and CI
   entrypoints around the `cibuildwheel` build, such as `cuda`,
   `manylinux-cuda`, `cibw-repair`, `verify`, `upload`, and `verify-pypi`.
 - `ci/scripts/package/inject_cuda_runtime.py`: rewrites wheel metadata and
-  injects the CUDA runtime library when CUDA is enabled.
+  injects extra runtime files, including the CUDA runtime library when CUDA is
+  enabled.
 - `ci/scripts/package/verify_tvm_install.py`: imports the installed wheel and
   checks that the platform runtime library is present.
 
diff --git a/ci/scripts/package/inject_cuda_runtime.py b/ci/scripts/package/inject_cuda_runtime.py
index 0f48028d65fc..d2aaa344404e 100755
--- a/ci/scripts/package/inject_cuda_runtime.py
+++ b/ci/scripts/package/inject_cuda_runtime.py
@@ -111,6 +111,33 @@ def _retag_wheel_filename(
     return f"{_wheel_escape(dist_name)}-{_wheel_escape(version)}-{'-'.join(tags)}.whl"
 
 
+def _parse_extra_file(value: str) -> tuple[Path, str]:
+    if "=" not in value:
+        raise argparse.ArgumentTypeError("extra files must use SOURCE=TARGET format")
+    source, target = value.split("=", 1)
+    if not source or not target:
+        raise argparse.ArgumentTypeError("extra files must use SOURCE=TARGET format")
+    target = target.replace("\\", "/").lstrip("/")
+    return Path(source), target
+
+
+def _extra_library_files(
+    library_dirs: list[Path],
+    patterns: list[str],
+    target_dir: str,
+) -> list[tuple[Path, str]]:
+    target_dir = target_dir.replace("\\", "/").strip("/")
+    extra_files: dict[str, Path] = {}
+    for library_dir in library_dirs:
+        if not library_dir.is_dir():
+            continue
+        for pattern in patterns:
+            for source in sorted(library_dir.glob(pattern)):
+                if source.is_file():
+                    extra_files[f"{target_dir}/{source.name}"] = source
+    return [(source, target) for target, source in sorted(extra_files.items())]
+
+
 def rewrite_wheel(
     wheel: Path,
     output_dir: Path,
@@ -119,8 +146,10 @@ def rewrite_wheel(
     distribution_name: str | None,
     distribution_version: str | None,
     set_rpath: str | None,
+    extra_files: list[tuple[Path, str]],
 ) -> Path:
     output_dir.mkdir(parents=True, exist_ok=True)
+    extra_targets = {target for _, target in extra_files}
     with zipfile.ZipFile(wheel, "r") as zin:
         original_names = zin.namelist()
         original_dist_info = _find_dist_info(original_names)
@@ -142,7 +171,9 @@ def rewrite_wheel(
                 continue
             if mapped_name.startswith(f"{original_dist_info}/"):
                 mapped_name = f"{final_dist_info}/{mapped_name.split('/', 1)[1]}"
-            if cuda_runtime is not None and mapped_name == target_path:
+            if (
+                cuda_runtime is not None and mapped_name == target_path
+            ) or mapped_name in extra_targets:
                 continue
 
             data = zin.read(info.filename)
@@ -164,6 +195,13 @@ def rewrite_wheel(
             info.external_attr = 0o644 << 16
             entries.append((info, data))
 
+        for source, target in extra_files:
+            data = source.read_bytes()
+            info = zipfile.ZipInfo(target)
+            info.compress_type = zipfile.ZIP_DEFLATED
+            info.external_attr = 0o644 << 16
+            entries.append((info, data))
+
     record_buffer = io.StringIO()
     writer = csv.writer(record_buffer, lineterminator="\n")
     for info, data in entries:
@@ -191,6 +229,31 @@ def main() -> int:
     parser.add_argument("--distribution-name")
     parser.add_argument("--distribution-version")
     parser.add_argument("--set-rpath")
+    parser.add_argument(
+        "--extra-file",
+        action="append",
+        default=[],
+        type=_parse_extra_file,
+        help="Additional file to place in the wheel, using SOURCE=TARGET format.",
+    )
+    parser.add_argument(
+        "--extra-library-dir",
+        action="append",
+        default=[],
+        type=Path,
+        help="Directory to scan for extra runtime libraries.",
+    )
+    parser.add_argument(
+        "--extra-library-pattern",
+        action="append",
+        default=[],
+        help="Glob pattern for files under --extra-library-dir.",
+    )
+    parser.add_argument(
+        "--extra-library-target-dir",
+        default="tvm/lib",
+        help="Wheel directory for files matched by --extra-library-pattern.",
+    )
     args = parser.parse_args()
 
     cuda_runtime = args.cuda_runtime
@@ -206,6 +269,18 @@ def main() -> int:
         else:
             target_path = f"tvm/lib/{cuda_runtime.name}"
 
+    extra_files = list(args.extra_file)
+    extra_files.extend(
+        _extra_library_files(
+            library_dirs=args.extra_library_dir,
+            patterns=args.extra_library_pattern,
+            target_dir=args.extra_library_target_dir,
+        )
+    )
+    missing_extra_files = [str(source) for source, _ in extra_files if not source.is_file()]
+    if missing_extra_files:
+        parser.error(f"extra files do not exist: {', '.join(missing_extra_files)}")
+
     output_path = rewrite_wheel(
         wheel=args.wheel,
         output_dir=args.output_dir,
@@ -214,6 +289,7 @@ def main() -> int:
         distribution_name=args.distribution_name or None,
         distribution_version=args.distribution_version or None,
         set_rpath=args.set_rpath,
+        extra_files=extra_files,
     )
     print(output_path)
     return 0

From 94a4e0fdcdccde93eb8e70024ad41a48c99d1c7f Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Wed, 27 May 2026 23:09:20 -0400
Subject: [PATCH 28/43] Harden TVM wheel publish workflow

---
 .github/actions/build-cuda/action.yml         |  37 ++++-
 .../build-wheel-for-publish/action.yml        |  84 +++++++---
 .github/workflows/publish_wheel.yml           |  72 ++++++++-
 ci/scripts/package/README.md                  |  33 ++--
 ...nject_cuda_runtime.py => rewrite_wheel.py} | 153 ++++++++++++++----
 ci/scripts/package/tvm_wheel_helper.sh        | 105 ++++++++++--
 ci/scripts/package/verify_tvm_install.py      | 116 +++++++++++--
 7 files changed, 501 insertions(+), 99 deletions(-)
 rename ci/scripts/package/{inject_cuda_runtime.py => rewrite_wheel.py} (65%)

diff --git a/.github/actions/build-cuda/action.yml b/.github/actions/build-cuda/action.yml
index 07497df371da..156632d81da2 100644
--- a/.github/actions/build-cuda/action.yml
+++ b/.github/actions/build-cuda/action.yml
@@ -20,7 +20,7 @@ description: Build libtvm_runtime_cuda for the TVM wheel packaging flow.
 
 inputs:
   arch:
-    description: "Target architecture (e.g., x86_64, aarch64, arm64, AMD64)"
+    description: "Target Linux architecture for CUDA builds (x86_64 or aarch64)"
     required: true
   linux_image:
     description: "Manylinux image tag to use on Linux runners"
@@ -50,8 +50,26 @@ runs:
     - uses: ./.github/actions/detect-env-vars
       id: env_vars
 
+    - name: Detect CUDA inputs
+      id: cuda_inputs
+      shell: bash -l {0}
+      env:
+        INPUT_INCLUDE_CUDA_RUNTIME: ${{ inputs.include_cuda_runtime }}
+      run: |
+        set -eux
+        include_cuda_runtime="$(printf '%s' "${INPUT_INCLUDE_CUDA_RUNTIME}" | tr '[:upper:]' '[:lower:]')"
+        case "${include_cuda_runtime}" in
+          1|true|yes|on) include_cuda_runtime=1 ;;
+          0|false|no|off) include_cuda_runtime=0 ;;
+          *)
+            echo "include_cuda_runtime must be a boolean value" >&2
+            exit 1
+            ;;
+        esac
+        echo "include_cuda_runtime=${include_cuda_runtime}" >> "${GITHUB_OUTPUT}"
+
     - name: Build CUDA runtime in manylinux
-      if: runner.os == 'Linux' && inputs.include_cuda_runtime == 'true'
+      if: runner.os == 'Linux' && steps.cuda_inputs.outputs.include_cuda_runtime == '1'
       shell: bash -l {0}
       env:
         TVM_MANYLINUX_IMAGE: ${{ inputs.linux_image }}
@@ -59,13 +77,13 @@ runs:
         TVM_ARCH: ${{ inputs.arch }}
         TVM_CUDA_ARCHITECTURES: ${{ inputs.cuda_architectures }}
         TVM_CUDA_BUILD_DIR: ${{ runner.temp }}/tvm-wheel-cuda
-        TVM_SKIP_CUDA: "0"
+        TVM_INCLUDE_CUDA_RUNTIME: "1"
         TVM_BUILD_PARALLEL_LEVEL: ${{ steps.env_vars.outputs.cpu_count }}
         CMAKE_BUILD_PARALLEL_LEVEL: ${{ steps.env_vars.outputs.cpu_count }}
       run: ci/scripts/package/tvm_wheel_helper.sh manylinux-cuda
 
     - name: Reject non-Linux CUDA runtime builds
-      if: runner.os != 'Linux' && inputs.include_cuda_runtime == 'true'
+      if: runner.os != 'Linux' && steps.cuda_inputs.outputs.include_cuda_runtime == '1'
       shell: bash -l {0}
       run: |
         echo "CUDA runtime wheels are only enabled on Linux in this workflow" >&2
@@ -75,10 +93,12 @@ runs:
       id: cuda_runtime
       shell: bash -l {0}
       env:
+        INCLUDE_CUDA_RUNTIME: ${{ steps.cuda_inputs.outputs.include_cuda_runtime }}
         TVM_CUDA_BUILD_DIR: ${{ runner.temp }}/tvm-wheel-cuda
+        TVM_CUDA_RUNTIME_PATH: ""
       run: |
         set -eux
-        if [[ "${{ inputs.include_cuda_runtime }}" != "true" ]]; then
+        if [[ "${INCLUDE_CUDA_RUNTIME}" != "1" ]]; then
           echo "path=" >> "${GITHUB_OUTPUT}"
           exit 0
         fi
@@ -87,4 +107,11 @@ runs:
           echo "CUDA runtime build did not produce libtvm_runtime_cuda.so" >&2
           exit 1
         fi
+        case "${cuda_runtime}" in
+          "${TVM_CUDA_BUILD_DIR}"/*) ;;
+          *)
+            echo "CUDA runtime path is outside the expected build directory: ${cuda_runtime}" >&2
+            exit 1
+            ;;
+        esac
         echo "path=${cuda_runtime}" >> "${GITHUB_OUTPUT}"
diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
index e4f959d31d9a..a13e86886e67 100644
--- a/.github/actions/build-wheel-for-publish/action.yml
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -15,7 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-name: Build Wheel For Publish
+name: Build TVM Wheel
 description: >
   Build and test the LLVM-enabled TVM wheel for a given OS/architecture
   combination using cibuildwheel.
@@ -28,11 +28,11 @@ inputs:
     description: "cibuildwheel build selector (e.g., cp310-manylinux_x86_64)"
     required: true
   linux_image:
-    description: "Manylinux image tag to use on Linux runners"
+    description: "Manylinux image family to use on Linux runners; required on Linux"
     required: false
     default: ""
   linux_image_tag:
-    description: "Pinned manylinux container tag shared with the CUDA runtime build"
+    description: "Pinned manylinux container tag shared with the CUDA runtime build; required on Linux"
     required: false
     default: ""
   distribution_name:
@@ -61,40 +61,69 @@ runs:
     - name: Detect wheel inputs
       id: wheel_inputs
       shell: bash -l {0}
+      env:
+        INPUT_ARCH: ${{ inputs.arch }}
+        INPUT_INCLUDE_CUDA_RUNTIME: ${{ inputs.include_cuda_runtime }}
+        INPUT_LINUX_IMAGE: ${{ inputs.linux_image }}
+        INPUT_LINUX_IMAGE_TAG: ${{ inputs.linux_image_tag }}
+        INPUT_CUDA_RUNTIME_PATH: ${{ inputs.cuda_runtime_path }}
       run: |
         set -eux
         wheel_platform_tag=""
         manylinux_container_image=""
-        cibw_manylinux_x86_64_image="${{ inputs.linux_image || 'manylinux_2_28' }}"
-        cibw_manylinux_aarch64_image="${{ inputs.linux_image || 'manylinux_2_28' }}"
+        cibw_manylinux_x86_64_image="${INPUT_LINUX_IMAGE:-manylinux_2_28}"
+        cibw_manylinux_aarch64_image="${INPUT_LINUX_IMAGE:-manylinux_2_28}"
         cibw_container_engine="docker"
+        include_cuda_runtime="$(printf '%s' "${INPUT_INCLUDE_CUDA_RUNTIME}" | tr '[:upper:]' '[:lower:]')"
+        case "${include_cuda_runtime}" in
+          1|true|yes|on) include_cuda_runtime=1 ;;
+          0|false|no|off) include_cuda_runtime=0 ;;
+          *)
+            echo "include_cuda_runtime must be a boolean value" >&2
+            exit 1
+            ;;
+        esac
         if [[ "${RUNNER_OS}" == "Linux" ]]; then
-          if [[ -z "${{ inputs.linux_image }}" ]]; then
+          if [[ -z "${INPUT_LINUX_IMAGE}" ]]; then
             echo "linux_image is required on Linux runners" >&2
             exit 1
           fi
-          wheel_platform_tag="${{ inputs.linux_image }}_${{ inputs.arch }}"
-          if [[ -n "${{ inputs.linux_image_tag }}" ]]; then
-            manylinux_container_image="quay.io/pypa/${{ inputs.linux_image }}_${{ inputs.arch }}:${{ inputs.linux_image_tag }}"
-          else
-            manylinux_container_image="${{ inputs.linux_image }}"
+          if [[ -z "${INPUT_LINUX_IMAGE_TAG}" ]]; then
+            echo "linux_image_tag is required on Linux runners" >&2
+            exit 1
+          fi
+          if [[ "${include_cuda_runtime}" == "1" ]]; then
+            if [[ -z "${INPUT_CUDA_RUNTIME_PATH}" ]]; then
+              echo "cuda_runtime_path is required when include_cuda_runtime=true" >&2
+              exit 1
+            fi
+            if [[ ! -f "${INPUT_CUDA_RUNTIME_PATH}" ]]; then
+              echo "cuda_runtime_path does not exist: ${INPUT_CUDA_RUNTIME_PATH}" >&2
+              exit 1
+            fi
           fi
-          if [[ "${{ inputs.arch }}" == "x86_64" ]]; then
+          wheel_platform_tag="${INPUT_LINUX_IMAGE}_${INPUT_ARCH}"
+          manylinux_container_image="quay.io/pypa/${INPUT_LINUX_IMAGE}_${INPUT_ARCH}:${INPUT_LINUX_IMAGE_TAG}"
+          if [[ "${INPUT_ARCH}" == "x86_64" ]]; then
             cibw_manylinux_x86_64_image="${manylinux_container_image}"
-          elif [[ "${{ inputs.arch }}" == "aarch64" ]]; then
+          elif [[ "${INPUT_ARCH}" == "aarch64" ]]; then
             cibw_manylinux_aarch64_image="${manylinux_container_image}"
           fi
-          cibw_container_engine="docker; create_args: --volume /opt/llvm:/opt/llvm"
-          if [[ -n "${{ inputs.cuda_runtime_path }}" ]]; then
-            cuda_runtime_dir="$(dirname "${{ inputs.cuda_runtime_path }}")"
+          cibw_container_engine="docker; create_args: --volume /opt/llvm:/opt/llvm:ro"
+          if [[ "${include_cuda_runtime}" == "1" && -n "${INPUT_CUDA_RUNTIME_PATH}" ]]; then
+            cuda_runtime_dir="$(dirname "${INPUT_CUDA_RUNTIME_PATH}")"
             cibw_container_engine+=" --volume ${cuda_runtime_dir}:${cuda_runtime_dir}:ro"
           fi
+        elif [[ "${include_cuda_runtime}" == "1" ]]; then
+          echo "CUDA runtime injection is only enabled on Linux in this workflow" >&2
+          exit 1
         fi
         echo "wheel_platform_tag=${wheel_platform_tag}" >> "${GITHUB_OUTPUT}"
         echo "manylinux_container_image=${manylinux_container_image}" >> "${GITHUB_OUTPUT}"
         echo "cibw_manylinux_x86_64_image=${cibw_manylinux_x86_64_image}" >> "${GITHUB_OUTPUT}"
         echo "cibw_manylinux_aarch64_image=${cibw_manylinux_aarch64_image}" >> "${GITHUB_OUTPUT}"
         echo "cibw_container_engine=${cibw_container_engine}" >> "${GITHUB_OUTPUT}"
+        echo "include_cuda_runtime=${include_cuda_runtime}" >> "${GITHUB_OUTPUT}"
 
     # ---- Cache LLVM prefix ----
     - name: Cache LLVM
@@ -209,25 +238,34 @@ runs:
           CMAKE_ARGS="-DUSE_LLVM=/opt/llvm/bin/llvm-config-static -DUSE_CUDA=OFF -DBUILD_TESTING=OFF -DTVM_BUILD_PYTHON_MODULE=ON -DCMAKE_PREFIX_PATH=/opt/llvm"
           TVM_CUDA_ARCHITECTURES="${{ inputs.cuda_architectures }}"
           TVM_WHEEL_DIST_NAME="${{ inputs.distribution_name }}"
-          TVM_SKIP_CUDA="${{ inputs.include_cuda_runtime == 'true' && '0' || '1' }}"
-          TVM_CUDA_RUNTIME_PATH="${{ inputs.cuda_runtime_path }}"
+          TVM_INCLUDE_CUDA_RUNTIME="${{ steps.wheel_inputs.outputs.include_cuda_runtime }}"
+          TVM_CUDA_RUNTIME_PATH="${{ steps.wheel_inputs.outputs.include_cuda_runtime == '1' && inputs.cuda_runtime_path || '' }}"
           TVM_AUDITWHEEL_PLAT="${{ steps.wheel_inputs.outputs.wheel_platform_tag }}"
           TVM_BUILD_PARALLEL_LEVEL="${{ steps.env_vars.outputs.cpu_count }}"
+          TVM_EXPECT_LLVM_ENABLED=1
+          TVM_EXPECT_STATIC_LLVM=1
+          TVM_EXPECT_CUDA_RUNTIME="${{ steps.wheel_inputs.outputs.include_cuda_runtime }}"
+          TVM_EXPECT_CUDA_ENABLED="${{ steps.wheel_inputs.outputs.include_cuda_runtime != '1' && '0' || '' }}"
         CIBW_ENVIRONMENT_WINDOWS: >-
           TVM_USE_LLVM="C:/opt/llvm/Library/bin/llvm-config-static.bat"
           CMAKE_PREFIX_PATH="C:/opt/llvm/Library"
           CMAKE_ARGS="-DUSE_LLVM=C:/opt/llvm/Library/bin/llvm-config-static.bat -DUSE_CUDA=OFF -DBUILD_TESTING=OFF -DTVM_BUILD_PYTHON_MODULE=ON -DCMAKE_PREFIX_PATH=C:/opt/llvm/Library"
           TVM_CUDA_ARCHITECTURES="${{ inputs.cuda_architectures }}"
           TVM_WHEEL_DIST_NAME="${{ inputs.distribution_name }}"
-          TVM_SKIP_CUDA="${{ inputs.include_cuda_runtime == 'true' && '0' || '1' }}"
-          TVM_CUDA_RUNTIME_PATH="${{ inputs.cuda_runtime_path }}"
+          TVM_INCLUDE_CUDA_RUNTIME="${{ steps.wheel_inputs.outputs.include_cuda_runtime }}"
+          TVM_CUDA_RUNTIME_PATH="${{ steps.wheel_inputs.outputs.include_cuda_runtime == '1' && inputs.cuda_runtime_path || '' }}"
           TVM_BUILD_PARALLEL_LEVEL="${{ steps.env_vars.outputs.cpu_count }}"
+          TVM_EXPECT_LLVM_ENABLED=1
+          TVM_EXPECT_STATIC_LLVM=1
+          TVM_EXPECT_CUDA_RUNTIME="${{ steps.wheel_inputs.outputs.include_cuda_runtime }}"
+          TVM_EXPECT_CUDA_ENABLED="${{ steps.wheel_inputs.outputs.include_cuda_runtime != '1' && '0' || '' }}"
         CIBW_REPAIR_WHEEL_COMMAND_LINUX: >-
           bash "{project}/ci/scripts/package/tvm_wheel_helper.sh" cibw-repair "{wheel}" "{dest_dir}"
         CIBW_REPAIR_WHEEL_COMMAND_MACOS: >-
+          TVM_DELOCATE_ARCHS="{delocate_archs}"
           bash "{project}/ci/scripts/package/tvm_wheel_helper.sh" cibw-repair "{wheel}" "{dest_dir}"
         CIBW_REPAIR_WHEEL_COMMAND_WINDOWS: >-
-          python "{project}/ci/scripts/package/inject_cuda_runtime.py" "{wheel}"
+          python "{project}/ci/scripts/package/rewrite_wheel.py" "{wheel}"
           --output-dir "{dest_dir}"
           --distribution-name "${{ inputs.distribution_name }}"
           --extra-library-dir "C:/opt/llvm/Library/bin"
@@ -245,4 +283,8 @@ runs:
       env:
         TVM_PYTHON: ${{ runner.os == 'Windows' && 'python' || 'python3' }}
         TVM_EXPECT_WHEEL_PLATFORM_TAG: ${{ steps.wheel_inputs.outputs.wheel_platform_tag }}
+        TVM_EXPECT_LLVM_ENABLED: "1"
+        TVM_EXPECT_STATIC_LLVM: "1"
+        TVM_EXPECT_CUDA_RUNTIME: ${{ steps.wheel_inputs.outputs.include_cuda_runtime }}
+        TVM_EXPECT_CUDA_ENABLED: ${{ steps.wheel_inputs.outputs.include_cuda_runtime != '1' && '0' || '' }}
       run: ci/scripts/package/tvm_wheel_helper.sh verify
diff --git a/.github/workflows/publish_wheel.yml b/.github/workflows/publish_wheel.yml
index 9d22a500b5af..770b7778741f 100644
--- a/.github/workflows/publish_wheel.yml
+++ b/.github/workflows/publish_wheel.yml
@@ -15,13 +15,13 @@
 # specific language governing permissions and limitations
 # under the License.
 
-name: Publish TVM wheel
+name: Publish TVM wheels
 
 on:
   workflow_dispatch:
     inputs:
       tag:
-        description: "Tag, branch, or SHA to publish"
+        description: "Tag, branch, or SHA to build; PyPI publishes require refs/tags/<tag>"
         required: true
         type: string
       publish_repository:
@@ -49,6 +49,9 @@ on:
         default: true
         type: boolean
 
+permissions:
+  contents: read
+
 jobs:
   build_wheels:
     name: ${{ matrix.name }}
@@ -90,6 +93,32 @@ jobs:
             include_cuda_runtime: "false"
             artifact_suffix: windows-amd64
     steps:
+      - name: Validate publish inputs
+        shell: bash
+        env:
+          TVM_PUBLISH_REPOSITORY: ${{ inputs.publish_repository }}
+          TVM_PUBLISH_REF: ${{ inputs.tag }}
+          TVM_VERIFY_FROM_REPOSITORY: ${{ inputs.verify_from_repository }}
+          TVM_WHEEL_DIST_NAME: ${{ inputs.distribution_name }}
+        run: |
+          set -eux
+          if [[ -n "${TVM_WHEEL_DIST_NAME}" && ! "${TVM_WHEEL_DIST_NAME}" =~ ^[A-Za-z0-9]([A-Za-z0-9._-]*[A-Za-z0-9])?$ ]]; then
+            echo "distribution_name must be a valid Python package name override" >&2
+            exit 1
+          fi
+          if [[ "${TVM_PUBLISH_REPOSITORY}" == "pypi" && -n "${TVM_WHEEL_DIST_NAME}" ]]; then
+            echo "distribution_name must be empty when publishing to PyPI" >&2
+            exit 1
+          fi
+          if [[ "${TVM_PUBLISH_REPOSITORY}" == "pypi" && "${TVM_PUBLISH_REF}" != refs/tags/* ]]; then
+            echo "PyPI publishes must use an immutable refs/tags/<tag> ref" >&2
+            exit 1
+          fi
+          if [[ "${TVM_PUBLISH_REPOSITORY}" == "pypi" && "${TVM_VERIFY_FROM_REPOSITORY}" != "true" ]]; then
+            echo "verify_from_repository must be enabled when publishing to PyPI" >&2
+            exit 1
+          fi
+
       - name: Checkout source
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
         with:
@@ -125,6 +154,7 @@ jobs:
         with:
           name: tvm-wheel-${{ matrix.artifact_suffix }}
           path: wheelhouse/*.whl
+          if-no-files-found: error
 
   upload_pypi:
     name: Upload package distributions
@@ -133,6 +163,8 @@ jobs:
     runs-on: ubuntu-latest
     environment: ${{ inputs.publish_repository }}
     permissions:
+      actions: read
+      contents: read
       id-token: write
       attestations: write
     steps:
@@ -142,6 +174,33 @@ jobs:
           path: dist
           merge-multiple: true
 
+      - name: Check wheel sizes
+        shell: bash
+        run: |
+          set -euo pipefail
+          limit_bytes=100000000
+          shopt -s nullglob
+          wheels=(dist/*.whl)
+          if [[ "${#wheels[@]}" -eq 0 ]]; then
+            echo "No wheel artifacts found under dist/" >&2
+            exit 1
+          fi
+          if [[ "${#wheels[@]}" -ne 4 ]]; then
+            echo "Expected 4 wheel artifacts, found ${#wheels[@]}" >&2
+            printf '%s\n' "${wheels[@]}" >&2
+            exit 1
+          fi
+          failed=0
+          for wheel in "${wheels[@]}"; do
+            size="$(stat -c '%s' "$wheel")"
+            printf '%s %s bytes\n' "$wheel" "$size"
+            if (( size > limit_bytes )); then
+              echo "Wheel exceeds 100 MB PyPI/TestPyPI upload limit: ${wheel}" >&2
+              failed=1
+            fi
+          done
+          exit "$failed"
+
       - name: Generate artifact attestation for wheels
         uses: actions/attest-build-provenance@v1
         with:
@@ -167,6 +226,9 @@ jobs:
     needs: [upload_pypi]
     if: ${{ inputs.publish_repository != 'none' && inputs.verify_from_repository }}
     runs-on: ubuntu-latest
+    permissions:
+      actions: read
+      contents: read
     steps:
       - name: Check out source
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -192,6 +254,9 @@ jobs:
           TVM_PYTHON: python
           TVM_TEST_INDEX_URL: https://test.pypi.org/simple/
           TVM_EXTRA_INDEX_URL: https://pypi.org/simple
+          TVM_EXPECT_LLVM_ENABLED: "1"
+          TVM_EXPECT_STATIC_LLVM: "1"
+          TVM_EXPECT_CUDA_RUNTIME: "1"
         run: ci/scripts/package/tvm_wheel_helper.sh verify-pypi
 
       - name: Verify package from PyPI
@@ -200,4 +265,7 @@ jobs:
           TVM_PYTHON: python
           TVM_TEST_INDEX_URL: https://pypi.org/simple/
           TVM_EXTRA_INDEX_URL: https://pypi.org/simple
+          TVM_EXPECT_LLVM_ENABLED: "1"
+          TVM_EXPECT_STATIC_LLVM: "1"
+          TVM_EXPECT_CUDA_RUNTIME: "1"
         run: ci/scripts/package/tvm_wheel_helper.sh verify-pypi
diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
index ac2ecb450c79..5822e46790b3 100644
--- a/ci/scripts/package/README.md
+++ b/ci/scripts/package/README.md
@@ -29,7 +29,9 @@ The wheel build flow is:
    disabled.
 3. When requested, inject the CUDA runtime DSO into `tvm/lib/` during the
    `cibuildwheel` repair hook.
-4. Repair the wheel, excluding CUDA driver/runtime DSOs and `libtvm_ffi`.
+4. Repair the wheel, excluding CUDA toolkit/driver DSOs and `libtvm_ffi`.
+   `libtvm_runtime_cuda.so`, when requested, is the TVM CUDA runtime that is
+   intentionally injected into the wheel.
    On Windows, copy the small runtime DLLs required by LLVM support libraries
    into `tvm/lib/` because there is no auditwheel-style repair tool.
 5. Validate ELF links so intra-wheel TVM DSOs resolve through relative rpaths.
@@ -40,7 +42,7 @@ The wheel build flow is:
 
 GitHub Actions flow:
 
-1. The `Publish TVM wheel` workflow builds a platform wheel matrix:
+1. The `Publish TVM wheels` workflow builds a platform wheel matrix:
    - Linux x86_64 in a pinned `manylinux_2_28` container, with the CUDA runtime.
    - Linux aarch64 in a pinned `manylinux_2_28` container, with the CUDA runtime.
    - macOS arm64 CPU-only.
@@ -49,7 +51,8 @@ GitHub Actions flow:
    The wheel action receives that path explicitly and mounts it into the
    `cibuildwheel` container for the repair hook.
 3. The optional publishing jobs upload the artifacts and can verify the package
-   from the selected package index.
+   from the selected package index. PyPI publishing requires a `refs/tags/<tag>`
+   input and keeps post-upload verification enabled.
 
 Linux wheels are built inside manylinux images. This avoids accidentally
 publishing a wheel tagged for the GitHub runner's host glibc, such as
@@ -70,11 +73,12 @@ Workflow structure:
 - `ci/scripts/package/tvm_wheel_helper.sh`: implements reusable local and CI
   entrypoints around the `cibuildwheel` build, such as `cuda`,
   `manylinux-cuda`, `cibw-repair`, `verify`, `upload`, and `verify-pypi`.
-- `ci/scripts/package/inject_cuda_runtime.py`: rewrites wheel metadata and
-  injects extra runtime files, including the CUDA runtime library when CUDA is
-  enabled.
+- `ci/scripts/package/rewrite_wheel.py`: rewrites wheel metadata and injects
+  extra runtime files, including the CUDA runtime library when CUDA is enabled.
 - `ci/scripts/package/verify_tvm_install.py`: imports the installed wheel and
-  checks that the platform runtime library is present.
+  checks that the runtime library was loaded from the wheel, expected runtime
+  DSOs are present, and dynamic LLVM libraries are not bundled when static LLVM
+  is required.
 
 To test the workflow from a fork without publishing:
 
@@ -116,8 +120,8 @@ TVM_PYTHON=/tmp/tvm-wheel-tools/bin/python \
 ci/scripts/package/tvm_wheel_helper.sh verify-pypi
 ```
 
-For a real PyPI upload, leave `TVM_WHEEL_DIST_NAME` unset and set the normal
-Twine credentials:
+For a manual or local upload with the helper, leave `TVM_WHEEL_DIST_NAME`
+unset and set the normal Twine credentials:
 
 ```bash
 TWINE_USERNAME=__token__ \
@@ -135,13 +139,24 @@ Useful knobs:
 - `TVM_CUDA_ARCHITECTURES`: CMake CUDA architectures, default `75`.
 - `TVM_WHEEL_DIST_NAME`: optional distribution rename for TestPyPI.
 - `TVM_WHEEL_DIST_VERSION`: optional distribution version rewrite.
+- `TVM_INCLUDE_CUDA_RUNTIME=1`: build or repair a wheel with the CUDA runtime.
+  Do not set this to a value that conflicts with `TVM_SKIP_CUDA`.
 - `TVM_SKIP_REPAIR=1`: leave the injected wheel unrepaired.
 - `TVM_SKIP_CUDA=1`: build or repair a wheel without the CUDA runtime.
 - `TVM_KEEP_BUILD_DIRS=1`: reuse the CMake build directories.
+- `TVM_MANYLINUX_IMAGE`: manylinux image family for `manylinux-cuda`, such as
+  `manylinux_2_28`.
+- `TVM_MANYLINUX_IMAGE_TAG`: pinned manylinux image tag for `manylinux-cuda`.
+- `TVM_ARCH`: target architecture for `manylinux-cuda`, such as `x86_64` or
+  `aarch64`.
 - `TVM_AUDITWHEEL_PLAT`: optional `auditwheel repair --plat` override.
 - `TVM_AUDITWHEEL_LIBRARY_PATH`: optional, explicit library search path for
   `auditwheel repair`.
 - `TVM_EXPECT_WHEEL_PLATFORM_TAG`: require the final wheel filename to include
   a specific platform tag, such as `manylinux_2_28_x86_64`.
+- `TVM_EXPECT_CUDA_RUNTIME`: verify whether the installed wheel ships a CUDA
+  runtime library.
+- `TVM_EXPECT_STATIC_LLVM`: verify that the installed wheel does not ship a
+  dynamic LLVM library.
 - `TVM_TEST_INDEX_URL`: package index for `verify-pypi`, default TestPyPI.
 - `TVM_EXTRA_INDEX_URL`: extra package index for dependencies, default PyPI.
diff --git a/ci/scripts/package/inject_cuda_runtime.py b/ci/scripts/package/rewrite_wheel.py
similarity index 65%
rename from ci/scripts/package/inject_cuda_runtime.py
rename to ci/scripts/package/rewrite_wheel.py
index d2aaa344404e..ce991c69e4bf 100755
--- a/ci/scripts/package/inject_cuda_runtime.py
+++ b/ci/scripts/package/rewrite_wheel.py
@@ -16,7 +16,7 @@
 # specific language governing permissions and limitations
 # under the License.
 
-"""Inject TVM's CUDA runtime DSO into a wheel and refresh RECORD."""
+"""Rewrite TVM wheel metadata and inject extra runtime files."""
 
 from __future__ import annotations
 
@@ -87,7 +87,11 @@ def _metadata_headers(metadata: bytes) -> tuple[str, str]:
 
 
 def _is_elf_shared_lib(name: str, data: bytes) -> bool:
-    return name.startswith("tvm/lib/") and name.endswith(".so") and data.startswith(b"\x7fELF")
+    # Only entries under tvm/lib/ whose bytes start with the ELF magic qualify.
+    if not (name.startswith("tvm/lib/") and data.startswith(b"\x7fELF")):
+        return False
+    # Match plain ``.so`` files as well as versioned sonames such as ``libfoo.so.1``.
+    return re.search(r"\.so(?:\.|$)", Path(name).name) is not None
 
 
 def _set_rpath(data: bytes, rpath: str, name: str) -> bytes:
@@ -95,7 +99,10 @@ def _set_rpath(data: bytes, rpath: str, name: str) -> bytes:
         path = Path(tmpdir) / Path(name).name
         path.write_bytes(data)
         path.chmod(0o755)
-        subprocess.run(["patchelf", "--set-rpath", rpath, str(path)], check=True)
+        try:
+            subprocess.run(["patchelf", "--set-rpath", rpath, str(path)], check=True)
+        except subprocess.CalledProcessError as err:
+            raise ValueError(f"patchelf failed while setting rpath on {name}") from err
         return path.read_bytes()
 
 
@@ -111,14 +118,43 @@ def _retag_wheel_filename(
     return f"{_wheel_escape(dist_name)}-{_wheel_escape(version)}-{'-'.join(tags)}.whl"
 
 
+def _normalize_wheel_path(value: str, label: str) -> str:
+    """Convert ``value`` to a forward-slash relative wheel path.
+
+    Raises ``argparse.ArgumentTypeError`` (message prefixed with ``label``) for
+    absolute paths, drive-letter prefixes, and empty, ``.`` or ``..`` segments.
+    """
+    message = (
+        f"{label} must be a relative wheel path without drive, empty, '.' or '..' segments"
+    )
+    candidate = value.replace("\\", "/")
+    rooted = candidate.startswith("/") or re.match(r"^[A-Za-z]:", candidate) is not None
+    # An empty string splits to [""], so the segment test also rejects it.
+    bad_segment = any(segment in {"", ".", ".."} for segment in candidate.split("/"))
+    if rooted or bad_segment:
+        raise argparse.ArgumentTypeError(message)
+    return candidate
+
+
+def _validate_wheel_member_path(value: str) -> str:
+    if "\\" in value:  # archive member names must already use "/" separators
+        raise ValueError(f"Wheel member path must use forward slashes: {value}")
+    try:
+        checked = _normalize_wheel_path(value, "wheel member path")
+    except argparse.ArgumentTypeError as err:  # report the argparse error as ValueError
+        raise ValueError(str(err)) from err
+    if checked == value:
+        return checked
+    raise ValueError(f"Wheel member path is not normalized: {value}")
+
+
 def _parse_extra_file(value: str) -> tuple[Path, str]:
     if "=" not in value:
         raise argparse.ArgumentTypeError("extra files must use SOURCE=TARGET format")
     source, target = value.split("=", 1)
     if not source or not target:
         raise argparse.ArgumentTypeError("extra files must use SOURCE=TARGET format")
-    target = target.replace("\\", "/").lstrip("/")
-    return Path(source), target
+    return Path(source), _normalize_wheel_path(target, "extra file target")
 
 
 def _extra_library_files(
@@ -126,16 +162,33 @@ def _extra_library_files(
     patterns: list[str],
     target_dir: str,
 ) -> list[tuple[Path, str]]:
-    target_dir = target_dir.replace("\\", "/").strip("/")
-    extra_files: dict[str, Path] = {}
+    target_dir = _normalize_wheel_path(target_dir, "extra library target dir")
+    extra_files: list[tuple[Path, str]] = []
+    missing_dirs = [str(library_dir) for library_dir in library_dirs if not library_dir.is_dir()]
+    if missing_dirs:
+        raise ValueError(f"extra library dirs do not exist: {', '.join(missing_dirs)}")
     for library_dir in library_dirs:
-        if not library_dir.is_dir():
-            continue
         for pattern in patterns:
             for source in sorted(library_dir.glob(pattern)):
                 if source.is_file():
-                    extra_files[f"{target_dir}/{source.name}"] = source
-    return [(source, target) for target, source in sorted(extra_files.items())]
+                    extra_files.append((source, f"{target_dir}/{source.name}"))
+    if library_dirs and patterns and not extra_files:
+        raise ValueError(
+            "extra library patterns did not match any files: " + ", ".join(patterns)
+        )
+    return sorted(extra_files, key=lambda item: (item[1], str(item[0])))
+
+
+def _check_duplicate_targets(targets: list[str]) -> None:
+    """Raise ``ValueError`` naming every wheel path that occurs more than once."""
+    unique: set[str] = set()
+    repeated: set[str] = set()
+    for path in targets:
+        # First sighting goes to ``unique``; any later sighting is a repeat.
+        (repeated if path in unique else unique).add(path)
+    if repeated:
+        names = ", ".join(sorted(repeated))
+        raise ValueError(f"Duplicate wheel target paths are not allowed: {names}")
 
 
 def rewrite_wheel(
@@ -151,32 +204,41 @@ def rewrite_wheel(
     output_dir.mkdir(parents=True, exist_ok=True)
     extra_targets = {target for _, target in extra_files}
     with zipfile.ZipFile(wheel, "r") as zin:
-        original_names = zin.namelist()
+        original_infos = [info for info in zin.infolist() if not info.is_dir()]
+        original_names = [_validate_wheel_member_path(info.filename) for info in original_infos]
+        _check_duplicate_targets(original_names)
         original_dist_info = _find_dist_info(original_names)
         metadata_path = f"{original_dist_info}/METADATA"
+        if metadata_path not in original_names:
+            raise ValueError(f"Wheel metadata is missing: {metadata_path}")
         original_name, original_version = _metadata_headers(zin.read(metadata_path))
 
         final_name = distribution_name or original_name
         final_version = distribution_version or original_version
         final_dist_info = f"{_wheel_escape(final_name)}-{_wheel_escape(final_version)}.dist-info"
         record_path = f"{final_dist_info}/RECORD"
+        target_paths = [target for _, target in extra_files]
+        if cuda_runtime is not None:
+            target_paths.append(target_path)
+        target_paths.append(record_path)
+        _check_duplicate_targets(target_paths)
         output_path = output_dir / _retag_wheel_filename(wheel, final_name, final_version)
 
         entries: list[tuple[zipfile.ZipInfo, bytes]] = []
-        for info in zin.infolist():
-            if info.is_dir():
-                continue
+        entry_names: list[str] = []
+        for info in original_infos:
             mapped_name = info.filename
             if mapped_name == f"{original_dist_info}/RECORD":
                 continue
             if mapped_name.startswith(f"{original_dist_info}/"):
                 mapped_name = f"{final_dist_info}/{mapped_name.split('/', 1)[1]}"
+            mapped_name = _validate_wheel_member_path(mapped_name)
             if (
                 cuda_runtime is not None and mapped_name == target_path
             ) or mapped_name in extra_targets:
                 continue
 
-            data = zin.read(info.filename)
+            data = zin.read(info)
             if mapped_name == f"{final_dist_info}/METADATA":
                 if distribution_name is not None:
                     data = _replace_header(data, "Name", final_name)
@@ -185,6 +247,7 @@ def rewrite_wheel(
             if set_rpath is not None and _is_elf_shared_lib(mapped_name, data):
                 data = _set_rpath(data, set_rpath, mapped_name)
             entries.append((_copy_info(info, mapped_name), data))
+            entry_names.append(mapped_name)
 
         if cuda_runtime is not None:
             data = cuda_runtime.read_bytes()
@@ -194,13 +257,19 @@ def rewrite_wheel(
             info.compress_type = zipfile.ZIP_DEFLATED
             info.external_attr = 0o644 << 16
             entries.append((info, data))
+            entry_names.append(target_path)
 
         for source, target in extra_files:
             data = source.read_bytes()
+            if set_rpath is not None and _is_elf_shared_lib(target, data):
+                data = _set_rpath(data, set_rpath, target)
             info = zipfile.ZipInfo(target)
             info.compress_type = zipfile.ZIP_DEFLATED
             info.external_attr = 0o644 << 16
             entries.append((info, data))
+            entry_names.append(target)
+
+        _check_duplicate_targets([*entry_names, record_path])
 
     record_buffer = io.StringIO()
     writer = csv.writer(record_buffer, lineterminator="\n")
@@ -268,29 +337,47 @@ def main() -> int:
             target_path = "tvm/lib/libtvm_runtime_cuda.so"
         else:
             target_path = f"tvm/lib/{cuda_runtime.name}"
+    else:
+        try:
+            target_path = _normalize_wheel_path(target_path, "target path")
+        except argparse.ArgumentTypeError as err:
+            parser.error(str(err))
 
     extra_files = list(args.extra_file)
-    extra_files.extend(
-        _extra_library_files(
-            library_dirs=args.extra_library_dir,
-            patterns=args.extra_library_pattern,
-            target_dir=args.extra_library_target_dir,
+    try:
+        extra_files.extend(
+            _extra_library_files(
+                library_dirs=args.extra_library_dir,
+                patterns=args.extra_library_pattern,
+                target_dir=args.extra_library_target_dir,
+            )
         )
-    )
+    except (argparse.ArgumentTypeError, ValueError) as err:
+        parser.error(str(err))
     missing_extra_files = [str(source) for source, _ in extra_files if not source.is_file()]
     if missing_extra_files:
         parser.error(f"extra files do not exist: {', '.join(missing_extra_files)}")
-
-    output_path = rewrite_wheel(
-        wheel=args.wheel,
-        output_dir=args.output_dir,
-        cuda_runtime=cuda_runtime,
-        target_path=target_path,
-        distribution_name=args.distribution_name or None,
-        distribution_version=args.distribution_version or None,
-        set_rpath=args.set_rpath,
-        extra_files=extra_files,
-    )
+    target_paths = [target for _, target in extra_files]
+    if cuda_runtime is not None:
+        target_paths.append(target_path)
+    try:
+        _check_duplicate_targets(target_paths)
+    except ValueError as err:
+        parser.error(str(err))
+
+    try:
+        output_path = rewrite_wheel(
+            wheel=args.wheel,
+            output_dir=args.output_dir,
+            cuda_runtime=cuda_runtime,
+            target_path=target_path,
+            distribution_name=args.distribution_name or None,
+            distribution_version=args.distribution_version or None,
+            set_rpath=args.set_rpath,
+            extra_files=extra_files,
+        )
+    except (ValueError, zipfile.BadZipFile, KeyError) as err:
+        parser.error(str(err))
     print(output_path)
     return 0
 
diff --git a/ci/scripts/package/tvm_wheel_helper.sh b/ci/scripts/package/tvm_wheel_helper.sh
index efaf63062165..7752d39b08c5 100755
--- a/ci/scripts/package/tvm_wheel_helper.sh
+++ b/ci/scripts/package/tvm_wheel_helper.sh
@@ -31,9 +31,45 @@ TVM_CUDA_ARCHITECTURES="${TVM_CUDA_ARCHITECTURES:-75}"
 TVM_BUILD_PARALLEL_LEVEL="${TVM_BUILD_PARALLEL_LEVEL:-${CMAKE_BUILD_PARALLEL_LEVEL:-$(getconf _NPROCESSORS_ONLN 2>/dev/null || echo 4)}}"
 TVM_WHEEL_DIST_NAME="${TVM_WHEEL_DIST_NAME:-}"
 TVM_WHEEL_DIST_VERSION="${TVM_WHEEL_DIST_VERSION:-}"
-TVM_SKIP_CUDA="${TVM_SKIP_CUDA:-0}"
+TVM_INCLUDE_CUDA_RUNTIME="${TVM_INCLUDE_CUDA_RUNTIME:-}"
+
+normalize_bool() {
+  local name="$1"
+  local value="$2"
+  local normalized
+  normalized="$(printf '%s' "$value" | tr '[:upper:]' '[:lower:]')"
+  case "$normalized" in
+    1|true|yes|on) echo 1 ;;
+    0|false|no|off) echo 0 ;;
+    *)
+      echo "error: ${name} must be a boolean value" >&2
+      return 1
+      ;;
+  esac
+}
+
+if [[ -n "$TVM_INCLUDE_CUDA_RUNTIME" ]]; then
+  tvm_include_cuda_runtime_normalized="$(normalize_bool TVM_INCLUDE_CUDA_RUNTIME "$TVM_INCLUDE_CUDA_RUNTIME")"
+  if [[ -n "${TVM_SKIP_CUDA:-}" ]]; then  # empty counts as unset, matching the ":-0" default in the else branch
+    tvm_skip_cuda_normalized="$(normalize_bool TVM_SKIP_CUDA "$TVM_SKIP_CUDA")"
+    if [[ "$tvm_include_cuda_runtime_normalized" == "$tvm_skip_cuda_normalized" ]]; then  # include == skip is contradictory
+      echo "error: TVM_INCLUDE_CUDA_RUNTIME conflicts with TVM_SKIP_CUDA" >&2
+      exit 1
+    fi
+  fi
+  if [[ "$tvm_include_cuda_runtime_normalized" == "1" ]]; then
+    TVM_SKIP_CUDA=0
+  else
+    TVM_SKIP_CUDA=1
+  fi
+else
+  TVM_SKIP_CUDA="${TVM_SKIP_CUDA:-0}"
+fi
+TVM_SKIP_CUDA="$(normalize_bool TVM_SKIP_CUDA "$TVM_SKIP_CUDA")"
 TVM_SKIP_REPAIR="${TVM_SKIP_REPAIR:-0}"
 TVM_KEEP_BUILD_DIRS="${TVM_KEEP_BUILD_DIRS:-0}"
+TVM_SKIP_REPAIR="$(normalize_bool TVM_SKIP_REPAIR "$TVM_SKIP_REPAIR")"
+TVM_KEEP_BUILD_DIRS="$(normalize_bool TVM_KEEP_BUILD_DIRS "$TVM_KEEP_BUILD_DIRS")"
 
 usage() {
   cat <<'EOF'
@@ -47,16 +83,20 @@ Environment knobs:
   TVM_WHEEL_DIST_NAME          Optional distribution rename for TestPyPI
   TVM_WHEEL_DIST_VERSION       Optional distribution version rewrite
   TVM_UPLOAD_REPOSITORY_URL    Twine repository URL, e.g. TestPyPI legacy URL
+  TVM_INCLUDE_CUDA_RUNTIME=1   Build/inject libtvm_runtime_cuda.so
   TVM_SKIP_CUDA=1              Do not build/inject libtvm_runtime_cuda.so
   TVM_SKIP_REPAIR=1            Keep injected wheel as final wheel
   TVM_KEEP_BUILD_DIRS=1        Reuse CMake build dirs instead of cleaning them
   TVM_MANYLINUX_IMAGE          manylinux image tag for manylinux-cuda
-  TVM_MANYLINUX_IMAGE_TAG      optional pinned image tag for manylinux-cuda
+  TVM_MANYLINUX_IMAGE_TAG      pinned image tag for manylinux-cuda
   TVM_ARCH                     Target architecture for manylinux-cuda
   TVM_AUDITWHEEL_PLAT          Optional auditwheel --plat value
   TVM_AUDITWHEEL_LIBRARY_PATH  Optional library search path for auditwheel repair
   TVM_EXPECT_WHEEL_PLATFORM_TAG
                                 Require the final wheel filename to include this tag
+  TVM_EXPECT_CUDA_RUNTIME      Verify whether the installed wheel ships a CUDA runtime DSO
+  TVM_EXPECT_STATIC_LLVM       Verify that the installed wheel does not ship libLLVM
+  TVM_DELOCATE_ARCHS           Optional delocate --require-archs value for macOS repair
   TVM_TEST_INDEX_URL           Package index for verify-pypi, default TestPyPI
   TVM_EXTRA_INDEX_URL          Extra package index for dependencies, default PyPI
 EOF
@@ -106,6 +146,9 @@ cuda_runtime_path() {
   if [[ -n "$TVM_CUDA_RUNTIME_PATH" ]]; then
     if [[ -f "$TVM_CUDA_RUNTIME_PATH" ]]; then
       echo "$TVM_CUDA_RUNTIME_PATH"
+    else
+      echo "error: TVM_CUDA_RUNTIME_PATH does not exist: ${TVM_CUDA_RUNTIME_PATH}" >&2
+      return 1
     fi
     return 0
   fi
@@ -120,11 +163,25 @@ manylinux_image_name() {
   local arch="$2"
   local tag="${3:-}"
   if [[ "$base" == *"/"* || "$base" == *":"* ]]; then
+    if [[ "$base" != *@sha256:* && "${base##*/}" != *":"* ]]; then
+      echo "error: fully qualified TVM_MANYLINUX_IMAGE must include a tag or digest" >&2
+      return 1
+    fi
     echo "$base"
   elif [[ -n "$tag" ]]; then
     echo "quay.io/pypa/${base}_${arch}:${tag}"
   else
-    echo "quay.io/pypa/${base}_${arch}:latest"
+    echo "error: TVM_MANYLINUX_IMAGE_TAG is required when TVM_MANYLINUX_IMAGE is not fully qualified" >&2
+    return 1
+  fi
+}
+
+validate_manylinux_cuda_image() {
+  local image="$1"
+  local image_name="${image##*/}"
+  if [[ "$image_name" != manylinux_2_28_*:* && "$image_name" != manylinux_2_28_*@sha256:* ]]; then
+    echo "error: manylinux-cuda currently supports only pinned manylinux_2_28 images" >&2
+    return 1
   fi
 }
 
@@ -147,11 +204,14 @@ run_manylinux_cuda_container() {
 
   local image
   image="$(manylinux_image_name "$TVM_MANYLINUX_IMAGE" "$TVM_ARCH" "${TVM_MANYLINUX_IMAGE_TAG:-}")"
+  validate_manylinux_cuda_image "$image"
   local container="tvm_wheel_cuda_${GITHUB_RUN_ID:-local}_${GITHUB_RUN_ATTEMPT:-1}_${TVM_ARCH}"
   local host_cuda_build_dir="$TVM_CUDA_BUILD_DIR"
   local container_cuda_root="/workspace-cuda-build"
   local container_cuda_build_dir="${container_cuda_root}/build"
   mkdir -p "$host_cuda_build_dir"
+  local cuda_rpm="/tmp/cuda-repo-rhel8-13-0-local-13.0.2_580.95.05-1.${TVM_ARCH}.rpm"
+  trap "rm -f '${cuda_rpm}'; docker exec '${container}' bash -lc 'chown -R $(id -u):$(id -g) ${container_cuda_root} || true' >/dev/null 2>&1 || true; docker rm -f '${container}' >/dev/null 2>&1 || true" EXIT
   docker pull "$image"
   docker rm -f "$container" >/dev/null 2>&1 || true
   docker run --name "$container" -d \
@@ -159,17 +219,17 @@ run_manylinux_cuda_container() {
     --volume "${REPO_ROOT}:/workspace" \
     --volume "${host_cuda_build_dir}:${container_cuda_root}" \
     "$image" tail -f /dev/null
-  trap "docker rm -f '${container}' >/dev/null 2>&1 || true" EXIT
 
-  local cuda_rpm="cuda-repo-rhel8-13-0-local-13.0.2_580.95.05-1.${TVM_ARCH}.rpm"
-  curl -fsSLo "$cuda_rpm" "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/${cuda_rpm}"
-  docker cp "$cuda_rpm" "${container}:/${cuda_rpm}"
+  local cuda_rpm_name
+  cuda_rpm_name="$(basename "$cuda_rpm")"
+  curl -fsSLo "$cuda_rpm" "https://developer.download.nvidia.com/compute/cuda/13.0.2/local_installers/${cuda_rpm_name}"
+  docker cp "$cuda_rpm" "${container}:/${cuda_rpm_name}"
   rm "$cuda_rpm"
   docker exec "$container" bash -lc "
-    rpm -i /${cuda_rpm} && \
+    rpm -i /${cuda_rpm_name} && \
     dnf clean all && \
     dnf -y --disablerepo=epel install cuda-toolkit-13-0 && \
-    rm /${cuda_rpm} && \
+    rm /${cuda_rpm_name} && \
     dnf clean all"
 
   docker exec \
@@ -177,7 +237,7 @@ run_manylinux_cuda_container() {
     -e TVM_USE_CUDA=/usr/local/cuda \
     -e TVM_CUDA_ARCHITECTURES="$TVM_CUDA_ARCHITECTURES" \
     -e TVM_CUDA_BUILD_DIR="$container_cuda_build_dir" \
-    -e TVM_SKIP_CUDA="$TVM_SKIP_CUDA" \
+    -e TVM_INCLUDE_CUDA_RUNTIME=1 \
     -e CMAKE_BUILD_PARALLEL_LEVEL="$TVM_BUILD_PARALLEL_LEVEL" \
     -e TVM_BUILD_PARALLEL_LEVEL="$TVM_BUILD_PARALLEL_LEVEL" \
     "$container" bash -lc '
@@ -263,8 +323,8 @@ inject_wheel_file() {
     inject_args+=(--set-rpath '$ORIGIN')
   fi
 
-  echo "Injecting CUDA runtime/metadata into ${raw_wheel}"
-  "$TVM_PYTHON" "$SCRIPT_DIR/inject_cuda_runtime.py" "$raw_wheel" "${inject_args[@]}"
+  echo "Rewriting wheel metadata/runtime files for ${raw_wheel}"
+  "$TVM_PYTHON" "$SCRIPT_DIR/rewrite_wheel.py" "$raw_wheel" "${inject_args[@]}"
 }
 
 auditwheel_excludes() {
@@ -380,6 +440,10 @@ repair_wheel_to_dir() {
   local injected_wheel="$1"
   local output_dir="$2"
   mkdir -p "$output_dir"
+  local existing_wheel
+  while IFS= read -r existing_wheel; do
+    rm -f "$existing_wheel"
+  done < <(find "$output_dir" -maxdepth 1 -type f -name '*.whl')
   if [[ "$TVM_SKIP_REPAIR" == "1" ]]; then
     cp "$injected_wheel" "$output_dir/"
     echo "Repair skipped; final wheel copied to ${output_dir}"
@@ -389,8 +453,10 @@ repair_wheel_to_dir() {
   case "$(uname -s)" in
     Linux)
       require_cmd auditwheel
-      local cuda_lib
-      cuda_lib="$(cuda_runtime_path || true)"
+      local cuda_lib=""
+      if [[ "$TVM_SKIP_CUDA" != "1" ]]; then
+        cuda_lib="$(cuda_runtime_path)"
+      fi
       local exclude_args=()
       local exclude_arg
       while IFS= read -r exclude_arg; do
@@ -430,7 +496,12 @@ repair_wheel_to_dir() {
             export DYLD_LIBRARY_PATH="${repair_libdir}${DYLD_LIBRARY_PATH:+:${DYLD_LIBRARY_PATH}}"
           fi
         fi
+        delocate_arch_args=()  # expanded with ${arr[@]+...}: bash < 4.4 (macOS 3.2) + set -u rejects empty "${arr[@]}"
+        if [[ -n "${TVM_DELOCATE_ARCHS:-}" ]]; then
+          delocate_arch_args+=(--require-archs "$TVM_DELOCATE_ARCHS")
+        fi
         delocate-wheel \
+          ${delocate_arch_args[@]+"${delocate_arch_args[@]}"} \
           --ignore-missing-dependencies \
           --exclude libtvm_ffi.dylib \
           -w "$output_dir" \
@@ -446,17 +517,17 @@ repair_wheel_to_dir() {
   single_wheel "$output_dir" >/dev/null
 }
 
-cibw_repair_wheel() {
+cibw_repair_wheel() (
   local raw_wheel="$1"
   local dest_dir="$2"
   local injected_dir
   injected_dir="$(mktemp -d)"
+  trap 'rm -rf "$injected_dir"' EXIT
   inject_wheel_file "$raw_wheel" "$injected_dir"
   local injected_wheel
   injected_wheel="$(single_wheel "$injected_dir")"
   repair_wheel_to_dir "$injected_wheel" "$dest_dir"
-  rm -rf "$injected_dir"
-}
+)
 
 validate_wheel_elf() {
   local final_wheel
diff --git a/ci/scripts/package/verify_tvm_install.py b/ci/scripts/package/verify_tvm_install.py
index ace784490840..3f9ed8a0c91c 100644
--- a/ci/scripts/package/verify_tvm_install.py
+++ b/ci/scripts/package/verify_tvm_install.py
@@ -18,33 +18,125 @@
 
 from __future__ import annotations
 
+import os
 from pathlib import Path
 import sys
 
-import tvm
+
+def expect_bool(name: str) -> bool | None:
+    value = os.environ.get(name)
+    if value is None or value == "":
+        return None
+    normalized = value.strip().lower()
+    if normalized in {"1", "true", "yes", "on"}:
+        return True
+    if normalized in {"0", "false", "no", "off"}:
+        return False
+    raise RuntimeError(f"{name} must be a boolean value, got {value!r}")
+
+
+def _clear_external_library_overrides() -> None:
+    for name in ("TVM_LIBRARY_PATH", "LD_LIBRARY_PATH", "DYLD_LIBRARY_PATH"):
+        if name in os.environ:
+            print(f"clearing {name} before importing tvm")
+            os.environ.pop(name, None)
+
+
+def _first_existing(candidates: list[Path]) -> Path:
+    for candidate in candidates:
+        if candidate.exists():
+            return candidate
+    return candidates[0]
+
+
+def _assert_loaded_runtime_from_wheel(libdir: Path, runtime_candidates: list[Path]) -> None:
+    import tvm.base as tvm_base  # pylint: disable=import-outside-toplevel
+
+    loaded_runtime = Path(tvm_base._LIB_RUNTIME._name).resolve()  # pylint: disable=protected-access
+    expected_runtime_paths = {candidate.resolve() for candidate in runtime_candidates}
+    print("loaded runtime library:", loaded_runtime)
+    if loaded_runtime not in expected_runtime_paths:
+        expected = ", ".join(str(path) for path in sorted(expected_runtime_paths))
+        raise RuntimeError(
+            f"loaded runtime library is not from the installed wheel: "
+            f"got {loaded_runtime}, expected one of {expected}"
+        )
+
+
+def _dynamic_llvm_libs(libdir: Path) -> list[Path]:
+    if sys.platform == "darwin":
+        patterns = ["libLLVM*.dylib"]
+    elif sys.platform == "win32":
+        patterns = ["LLVM*.dll", "libLLVM*.dll"]
+    else:
+        patterns = ["libLLVM*.so", "libLLVM*.so.*"]
+    found: set[Path] = set()
+    for pattern in patterns:
+        found.update(libdir.glob(pattern))
+    return sorted(found)
 
 
 def main() -> int:
+    _clear_external_library_overrides()
+
+    import tvm  # pylint: disable=import-outside-toplevel
+
     root = Path(tvm.__file__).resolve().parent
     libdir = root / "lib"
     if sys.platform == "darwin":
-        runtime_lib = libdir / "libtvm_runtime.dylib"
-        cuda_runtime = libdir / "libtvm_runtime_cuda.dylib"
+        runtime_candidates = [libdir / "libtvm_runtime.dylib"]
+        cuda_runtime_candidates = [libdir / "libtvm_runtime_cuda.dylib"]
     elif sys.platform == "win32":
-        runtime_lib = libdir / "tvm_runtime.dll"
-        cuda_runtime = libdir / "tvm_runtime_cuda.dll"
+        runtime_candidates = [libdir / "tvm_runtime.dll", libdir / "libtvm_runtime.dll"]
+        cuda_runtime_candidates = [
+            libdir / "tvm_runtime_cuda.dll",
+            libdir / "libtvm_runtime_cuda.dll",
+        ]
     else:
-        runtime_lib = libdir / "libtvm_runtime.so"
-        cuda_runtime = libdir / "libtvm_runtime_cuda.so"
+        runtime_candidates = [libdir / "libtvm_runtime.so"]
+        cuda_runtime_candidates = [libdir / "libtvm_runtime_cuda.so"]
 
     print("tvm version:", tvm.__version__)
     print("tvm package:", root)
-    print("llvm enabled:", tvm.runtime.enabled("llvm"))
-    print("cuda runtime enabled:", tvm.runtime.enabled("cuda"))
+    llvm_enabled = bool(tvm.runtime.enabled("llvm"))
+    cuda_enabled = bool(tvm.runtime.enabled("cuda"))
+    runtime_lib = _first_existing(runtime_candidates)
+    cuda_runtime = _first_existing(cuda_runtime_candidates)
+    runtime_present = any(candidate.exists() for candidate in runtime_candidates)
+    cuda_runtime_present = any(candidate.exists() for candidate in cuda_runtime_candidates)
+    dynamic_llvm_libs = _dynamic_llvm_libs(libdir)
+
+    print("llvm enabled:", llvm_enabled)
+    print("cuda runtime enabled:", cuda_enabled)
     print("runtime library:", runtime_lib)
-    if not runtime_lib.exists():
-        raise RuntimeError(f"runtime library is missing: {runtime_lib}")
-    print("cuda runtime present:", cuda_runtime.exists())
+    if not runtime_present:
+        raise RuntimeError(
+            "runtime library is missing; checked "
+            + ", ".join(str(candidate) for candidate in runtime_candidates)
+        )
+    _assert_loaded_runtime_from_wheel(libdir, runtime_candidates)
+    print("cuda runtime present:", cuda_runtime_present)
+    if cuda_runtime_present:
+        print("cuda runtime library:", cuda_runtime)
+    print("dynamic LLVM libraries:", [str(path) for path in dynamic_llvm_libs])
+
+    expected_llvm = expect_bool("TVM_EXPECT_LLVM_ENABLED")
+    if expected_llvm is not None and llvm_enabled != expected_llvm:
+        raise RuntimeError(f"llvm enabled: expected {expected_llvm}, got {llvm_enabled}")
+    expected_static_llvm = expect_bool("TVM_EXPECT_STATIC_LLVM")
+    if expected_static_llvm and dynamic_llvm_libs:
+        raise RuntimeError(
+            "expected LLVM to be linked statically, but dynamic LLVM libraries are present: "
+            + ", ".join(str(path) for path in dynamic_llvm_libs)
+        )
+    expected_cuda_runtime = expect_bool("TVM_EXPECT_CUDA_RUNTIME")
+    if expected_cuda_runtime is not None and cuda_runtime_present != expected_cuda_runtime:
+        raise RuntimeError(
+            f"cuda runtime present: expected {expected_cuda_runtime}, got {cuda_runtime_present}"
+        )
+    expected_cuda = expect_bool("TVM_EXPECT_CUDA_ENABLED")
+    if expected_cuda is not None and cuda_enabled != expected_cuda:
+        raise RuntimeError(f"cuda runtime enabled: expected {expected_cuda}, got {cuda_enabled}")
     return 0
 
 

From 198224a20b7c9de5e609516895166f68f0e5bd83 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Thu, 28 May 2026 15:44:03 -0400
Subject: [PATCH 29/43] Fix LLVM RTTI handling in wheel builds

---
 ci/scripts/package/README.md             |  4 +-
 ci/scripts/package/verify_tvm_install.py | 50 ++++++++++++++++++++++++
 cmake/modules/LLVM.cmake                 |  2 +-
 cmake/utils/FindLLVM.cmake               | 19 +++++++++
 4 files changed, 72 insertions(+), 3 deletions(-)

diff --git a/ci/scripts/package/README.md b/ci/scripts/package/README.md
index 5822e46790b3..3602f15f9737 100644
--- a/ci/scripts/package/README.md
+++ b/ci/scripts/package/README.md
@@ -77,8 +77,8 @@ Workflow structure:
   extra runtime files, including the CUDA runtime library when CUDA is enabled.
 - `ci/scripts/package/verify_tvm_install.py`: imports the installed wheel and
   checks that the runtime library was loaded from the wheel, expected runtime
-  DSOs are present, and dynamic LLVM libraries are not bundled when static LLVM
-  is required.
+  DSOs are present, dynamic LLVM libraries are not bundled when static LLVM is required,
+  and (when LLVM is enabled) minimal TIRX/Relax programs compile and run through LLVM.
 
 To test the workflow from a fork without publishing:
 
diff --git a/ci/scripts/package/verify_tvm_install.py b/ci/scripts/package/verify_tvm_install.py
index 3f9ed8a0c91c..3c2a59747add 100644
--- a/ci/scripts/package/verify_tvm_install.py
+++ b/ci/scripts/package/verify_tvm_install.py
@@ -22,6 +22,8 @@
 from pathlib import Path
 import sys
 
+import numpy as np
+
 
 def expect_bool(name: str) -> bool | None:
     value = os.environ.get(name)
@@ -76,6 +78,51 @@ def _dynamic_llvm_libs(libdir: Path) -> list[Path]:
     return sorted(found)
 
 
+def _verify_llvm_tirx_compile() -> None:
+    import tvm  # pylint: disable=import-outside-toplevel
+    from tvm import te  # pylint: disable=import-outside-toplevel
+
+    extent = 8
+    lhs_np = np.arange(extent, dtype="float32")
+    rhs_np = np.arange(extent, dtype="float32") * np.float32(2)
+    out_np = np.zeros(extent, dtype="float32")
+
+    lhs = te.placeholder((extent,), name="lhs", dtype="float32")
+    rhs = te.placeholder((extent,), name="rhs", dtype="float32")
+    out = te.compute((extent,), lambda i: lhs[i] + rhs[i], name="out")
+    executable = tvm.compile(te.create_prim_func([lhs, rhs, out]), target="llvm")
+
+    dev = tvm.cpu()
+    lhs_t = tvm.runtime.tensor(lhs_np, dev)
+    rhs_t = tvm.runtime.tensor(rhs_np, dev)
+    out_t = tvm.runtime.tensor(out_np, dev)
+    executable(lhs_t, rhs_t, out_t)
+    np.testing.assert_allclose(out_t.numpy(), lhs_np + rhs_np, rtol=1e-6)
+    print("llvm tirx compile smoke: passed")
+
+
+def _verify_relax_compile() -> None:
+    import tvm  # pylint: disable=import-outside-toplevel
+    from tvm import relax  # pylint: disable=import-outside-toplevel
+
+    lhs_np = np.arange(8, dtype="float32")
+    rhs_np = np.arange(8, dtype="float32") * np.float32(3)
+    dev = tvm.cpu()
+
+    lhs = relax.Var("lhs", relax.TensorStructInfo((8,), "float32"))
+    rhs = relax.Var("rhs", relax.TensorStructInfo((8,), "float32"))
+    builder = relax.BlockBuilder()
+    with builder.function("main", [lhs, rhs]):
+        out = builder.emit(relax.op.add(lhs, rhs))
+        builder.emit_func_output(out)
+
+    executable = tvm.compile(builder.get(), target="llvm")
+    vm = relax.VirtualMachine(executable, dev)
+    out = vm["main"](tvm.runtime.tensor(lhs_np, dev), tvm.runtime.tensor(rhs_np, dev))
+    np.testing.assert_allclose(out.numpy(), lhs_np + rhs_np, rtol=1e-6)
+    print("llvm relax compile smoke: passed")
+
+
 def main() -> int:
     _clear_external_library_overrides()
 
@@ -123,6 +170,9 @@ def main() -> int:
     expected_llvm = expect_bool("TVM_EXPECT_LLVM_ENABLED")
     if expected_llvm is not None and llvm_enabled != expected_llvm:
         raise RuntimeError(f"llvm enabled: expected {expected_llvm}, got {llvm_enabled}")
+    if llvm_enabled:
+        _verify_llvm_tirx_compile()
+        _verify_relax_compile()
     expected_static_llvm = expect_bool("TVM_EXPECT_STATIC_LLVM")
     if expected_static_llvm and dynamic_llvm_libs:
         raise RuntimeError(
diff --git a/cmake/modules/LLVM.cmake b/cmake/modules/LLVM.cmake
index f944b4130415..fcb31481afc8 100644
--- a/cmake/modules/LLVM.cmake
+++ b/cmake/modules/LLVM.cmake
@@ -50,7 +50,7 @@ if(NOT ${USE_LLVM} MATCHES ${IS_FALSE_PATTERN})
   )
   list(APPEND TVM_LINKER_LIBS ${LLVM_LIBS})
   list(APPEND COMPILER_SRCS ${COMPILER_LLVM_SRCS})
-  if(NOT MSVC)
+  if(NOT MSVC AND NOT TVM_LLVM_HAS_RTTI)
     set_source_files_properties(${COMPILER_LLVM_SRCS}
       PROPERTIES COMPILE_FLAGS "-fno-rtti")
   endif()
diff --git a/cmake/utils/FindLLVM.cmake b/cmake/utils/FindLLVM.cmake
index 439c7d412ef1..a947f05da0bd 100644
--- a/cmake/utils/FindLLVM.cmake
+++ b/cmake/utils/FindLLVM.cmake
@@ -59,6 +59,10 @@ macro(find_llvm use_llvm)
       message(STATUS "Fall back to using llvm-config")
       set(LLVM_CONFIG "${LLVM_TOOLS_BINARY_DIR}/llvm-config")
     endif()
+    set(TVM_LLVM_HAS_RTTI 0)
+    if(LLVM_ENABLE_RTTI)
+      set(TVM_LLVM_HAS_RTTI 1)
+    endif()
   endif()
 
   if(LLVM_LIBS) # check if defined, not if it is true
@@ -142,6 +146,13 @@ macro(find_llvm use_llvm)
     if(NOT "${__llvm_exit_code}" STREQUAL "0")
       message(FATAL_ERROR "Fatal error executing: ${LLVM_CONFIG} --targets-built")
     endif()
+    execute_process(COMMAND ${LLVM_CONFIG} --has-rtti
+      RESULT_VARIABLE __llvm_exit_code
+      OUTPUT_VARIABLE __llvm_has_rtti
+      OUTPUT_STRIP_TRAILING_WHITESPACE)
+    if(NOT "${__llvm_exit_code}" STREQUAL "0")
+      message(FATAL_ERROR "Fatal error executing: ${LLVM_CONFIG} --has-rtti")
+    endif()
     cmake_path(SET "__llvm_cmakedir" "${__llvm_cmakedir}")
     message(STATUS "LLVM cmakedir: ${__llvm_cmakedir}")
     # map prefix => $
@@ -181,6 +192,13 @@ macro(find_llvm use_llvm)
     if("AArch64" IN_LIST BUILT_TARGET_LIST)
       set(TVM_LLVM_HAS_AARCH64_TARGET 1)
     endif()
+    string(TOUPPER "${__llvm_has_rtti}" __llvm_has_rtti_upper)
+    set(TVM_LLVM_HAS_RTTI 0)
+    if("${__llvm_has_rtti_upper}" STREQUAL "YES"
+       OR "${__llvm_has_rtti_upper}" STREQUAL "ON"
+       OR "${__llvm_has_rtti_upper}" STREQUAL "1")
+      set(TVM_LLVM_HAS_RTTI 1)
+    endif()
     if (${USE_MLIR})
       if (EXISTS "${__llvm_libdir}/libMLIRPresburger.a")
         if (EXISTS "${__llvm_libdir}/libMLIRSupport.a")
@@ -279,6 +297,7 @@ macro(find_llvm use_llvm)
     message(FATAL_ERROR "TVM requires LLVM 15.0 or higher.")
   endif()
   message(STATUS "Found TVM_LLVM_HAS_AARCH64_TARGET=" ${TVM_LLVM_HAS_AARCH64_TARGET})
+  message(STATUS "Found TVM_LLVM_HAS_RTTI=" ${TVM_LLVM_HAS_RTTI})
 
   # Detect whether DIBuilder insertion APIs (insertDeclare,
   # insertDbgValueIntrinsic) accept BasicBlock::iterator as the insertion point

From bf10c6ab4bc675d8f0aa4d8fa79d5d83a0eef940 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Thu, 28 May 2026 16:42:43 -0400
Subject: [PATCH 30/43] Improve wheel verify crash diagnostics

---
 .../build-wheel-for-publish/action.yml        |  2 +-
 ci/scripts/package/verify_tvm_install.py      | 38 +++++++++++++------
 2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
index a13e86886e67..7e6b4a8efd58 100644
--- a/.github/actions/build-wheel-for-publish/action.yml
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -276,7 +276,7 @@ runs:
           --extra-library-pattern "*iconv*.dll"
           --extra-library-pattern "*charset*.dll"
         CIBW_TEST_COMMAND: >-
-          python "{project}/ci/scripts/package/verify_tvm_install.py"
+          python -u -X faulthandler "{project}/ci/scripts/package/verify_tvm_install.py"
 
     - name: Verify final wheel
       shell: bash -l {0}
diff --git a/ci/scripts/package/verify_tvm_install.py b/ci/scripts/package/verify_tvm_install.py
index 3c2a59747add..e4bb2b9cacef 100644
--- a/ci/scripts/package/verify_tvm_install.py
+++ b/ci/scripts/package/verify_tvm_install.py
@@ -18,13 +18,22 @@
 
 from __future__ import annotations
 
+import faulthandler
 import os
 from pathlib import Path
 import sys
 
+faulthandler.enable(all_threads=True)
+if hasattr(sys.stdout, "reconfigure"):
+    sys.stdout.reconfigure(line_buffering=True)
+
 import numpy as np
 
 
+def log(*args: object) -> None:
+    print(*args, flush=True)
+
+
 def expect_bool(name: str) -> bool | None:
     value = os.environ.get(name)
     if value is None or value == "":
@@ -40,7 +49,7 @@ def expect_bool(name: str) -> bool | None:
 def _clear_external_library_overrides() -> None:
     for name in ("TVM_LIBRARY_PATH", "LD_LIBRARY_PATH", "DYLD_LIBRARY_PATH"):
         if name in os.environ:
-            print(f"clearing {name} before importing tvm")
+            log(f"clearing {name} before importing tvm")
             os.environ.pop(name, None)
 
 
@@ -56,7 +65,7 @@ def _assert_loaded_runtime_from_wheel(libdir: Path, runtime_candidates: list[Pat
 
     loaded_runtime = Path(tvm_base._LIB_RUNTIME._name).resolve()  # pylint: disable=protected-access
     expected_runtime_paths = {candidate.resolve() for candidate in runtime_candidates}
-    print("loaded runtime library:", loaded_runtime)
+    log("loaded runtime library:", loaded_runtime)
     if loaded_runtime not in expected_runtime_paths:
         expected = ", ".join(str(path) for path in sorted(expected_runtime_paths))
         raise RuntimeError(
@@ -82,6 +91,7 @@ def _verify_llvm_tirx_compile() -> None:
     import tvm  # pylint: disable=import-outside-toplevel
     from tvm import te  # pylint: disable=import-outside-toplevel
 
+    log("llvm tirx compile smoke: starting")
     extent = 8
     lhs_np = np.arange(extent, dtype="float32")
     rhs_np = np.arange(extent, dtype="float32") * np.float32(2)
@@ -98,13 +108,14 @@ def _verify_llvm_tirx_compile() -> None:
     out_t = tvm.runtime.tensor(out_np, dev)
     executable(lhs_t, rhs_t, out_t)
     np.testing.assert_allclose(out_t.numpy(), lhs_np + rhs_np, rtol=1e-6)
-    print("llvm tirx compile smoke: passed")
+    log("llvm tirx compile smoke: passed")
 
 
 def _verify_relax_compile() -> None:
     import tvm  # pylint: disable=import-outside-toplevel
     from tvm import relax  # pylint: disable=import-outside-toplevel
 
+    log("llvm relax compile smoke: starting")
     lhs_np = np.arange(8, dtype="float32")
     rhs_np = np.arange(8, dtype="float32") * np.float32(3)
     dev = tvm.cpu()
@@ -120,13 +131,15 @@ def _verify_relax_compile() -> None:
     vm = relax.VirtualMachine(executable, dev)
     out = vm["main"](tvm.runtime.tensor(lhs_np, dev), tvm.runtime.tensor(rhs_np, dev))
     np.testing.assert_allclose(out.numpy(), lhs_np + rhs_np, rtol=1e-6)
-    print("llvm relax compile smoke: passed")
+    log("llvm relax compile smoke: passed")
 
 
 def main() -> int:
     _clear_external_library_overrides()
 
+    log("import tvm: starting")
     import tvm  # pylint: disable=import-outside-toplevel
+    log("import tvm: passed")
 
     root = Path(tvm.__file__).resolve().parent
     libdir = root / "lib"
@@ -143,8 +156,8 @@ def main() -> int:
         runtime_candidates = [libdir / "libtvm_runtime.so"]
         cuda_runtime_candidates = [libdir / "libtvm_runtime_cuda.so"]
 
-    print("tvm version:", tvm.__version__)
-    print("tvm package:", root)
+    log("tvm version:", tvm.__version__)
+    log("tvm package:", root)
     llvm_enabled = bool(tvm.runtime.enabled("llvm"))
     cuda_enabled = bool(tvm.runtime.enabled("cuda"))
     runtime_lib = _first_existing(runtime_candidates)
@@ -153,19 +166,19 @@ def main() -> int:
     cuda_runtime_present = any(candidate.exists() for candidate in cuda_runtime_candidates)
     dynamic_llvm_libs = _dynamic_llvm_libs(libdir)
 
-    print("llvm enabled:", llvm_enabled)
-    print("cuda runtime enabled:", cuda_enabled)
-    print("runtime library:", runtime_lib)
+    log("llvm enabled:", llvm_enabled)
+    log("cuda runtime enabled:", cuda_enabled)
+    log("runtime library:", runtime_lib)
     if not runtime_present:
         raise RuntimeError(
             "runtime library is missing; checked "
             + ", ".join(str(candidate) for candidate in runtime_candidates)
         )
     _assert_loaded_runtime_from_wheel(libdir, runtime_candidates)
-    print("cuda runtime present:", cuda_runtime_present)
+    log("cuda runtime present:", cuda_runtime_present)
     if cuda_runtime_present:
-        print("cuda runtime library:", cuda_runtime)
-    print("dynamic LLVM libraries:", [str(path) for path in dynamic_llvm_libs])
+        log("cuda runtime library:", cuda_runtime)
+    log("dynamic LLVM libraries:", [str(path) for path in dynamic_llvm_libs])
 
     expected_llvm = expect_bool("TVM_EXPECT_LLVM_ENABLED")
     if expected_llvm is not None and llvm_enabled != expected_llvm:
@@ -187,6 +200,7 @@ def main() -> int:
     expected_cuda = expect_bool("TVM_EXPECT_CUDA_ENABLED")
     if expected_cuda is not None and cuda_enabled != expected_cuda:
         raise RuntimeError(f"cuda runtime enabled: expected {expected_cuda}, got {cuda_enabled}")
+    log("verify tvm install: passed")
     return 0
 
 

From bb87383d7736713ab98e6a607d898c724b5e51c9 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Thu, 28 May 2026 16:48:38 -0400
Subject: [PATCH 31/43] Update wheel workflow actions

---
 .../build-wheel-for-publish/action.yml        | 23 +++++++++++--------
 .github/actions/setup/action.yml              |  8 ++++---
 .github/workflows/publish_wheel.yml           |  6 ++---
 pyproject.toml                                |  2 +-
 4 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
index 7e6b4a8efd58..a78328063315 100644
--- a/.github/actions/build-wheel-for-publish/action.yml
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -125,34 +125,39 @@ runs:
         echo "cibw_container_engine=${cibw_container_engine}" >> "${GITHUB_OUTPUT}"
         echo "include_cuda_runtime=${include_cuda_runtime}" >> "${GITHUB_OUTPUT}"
 
+    - name: Prepare LLVM cache path (Unix)
+      if: runner.os != 'Windows'
+      shell: bash
+      run: |
+        set -eux
+        sudo mkdir -p /opt/llvm
+        sudo chown -R "$(whoami)" /opt/llvm
+
     # ---- Cache LLVM prefix ----
     - name: Cache LLVM
-      uses: actions/cache@v4
+      uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
       id: llvm-cache
       with:
         path: ${{ runner.os == 'Windows' && 'C:/opt/llvm' || '/opt/llvm' }}
-        key: tvm-wheel-llvm-22.1.0-${{ runner.os }}-${{ inputs.arch }}-v3
+        key: tvm-wheel-llvm-22.1.0-${{ runner.os }}-${{ inputs.arch }}-v4
 
     # ---- Install LLVM via conda (cache miss only) ----
     - name: Setup conda
       if: steps.llvm-cache.outputs.cache-hit != 'true'
-      uses: conda-incubator/setup-miniconda@fc2d68f6413eb2d87b895e92f8584b5b94a10167 # v3.3.0
+      uses: conda-incubator/setup-miniconda@8ee1f361103df19b6f8c8655fd3967a8ecb162d5 # v4.0.1
       continue-on-error: true
       id: conda1
       with:
         miniforge-version: latest
+        conda-remove-defaults: true
 
     - name: Setup conda (retry with tar.bz2)
       if: steps.llvm-cache.outputs.cache-hit != 'true' && steps.conda1.outcome == 'failure'
-      uses: conda-incubator/setup-miniconda@fc2d68f6413eb2d87b895e92f8584b5b94a10167 # v3.3.0
+      uses: conda-incubator/setup-miniconda@8ee1f361103df19b6f8c8655fd3967a8ecb162d5 # v4.0.1
       with:
         miniforge-version: latest
         use-only-tar-bz2: true
-
-    - name: Create /opt/llvm (macOS)
-      if: steps.llvm-cache.outputs.cache-hit != 'true' && runner.os == 'macOS'
-      shell: bash
-      run: sudo mkdir -p /opt/llvm && sudo chown -R "$(whoami)" /opt/llvm
+        conda-remove-defaults: true
 
     - name: Install LLVM (Unix)
       if: steps.llvm-cache.outputs.cache-hit != 'true' && runner.os != 'Windows'
diff --git a/.github/actions/setup/action.yml b/.github/actions/setup/action.yml
index e78ce2f66d7a..c0fb205c111e 100644
--- a/.github/actions/setup/action.yml
+++ b/.github/actions/setup/action.yml
@@ -1,13 +1,13 @@
 runs:
  using: "composite"
  steps:
-  - uses: actions/cache@cdf6c1fa76f9f475f3d7449005a359c84ca0f306 # v5.0.3
+  - uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5
     env:
       CACHE_NUMBER: 2
     with:
       path: ~/conda_pkgs_dir
       key: ${{ runner.os }}-conda-${{ env.CACHE_NUMBER }}-${{ hashFiles('tests/conda/build-environment.yaml') }}
-  - uses: conda-incubator/setup-miniconda@fc2d68f6413eb2d87b895e92f8584b5b94a10167 # v3.3.0
+  - uses: conda-incubator/setup-miniconda@8ee1f361103df19b6f8c8655fd3967a8ecb162d5 # v4.0.1
     continue-on-error: true
     id: conda1
     with:
@@ -18,7 +18,8 @@ runs:
       miniforge-version: latest
       python-version: "3.10"
       condarc-file: tests/conda/condarc
-  - uses: conda-incubator/setup-miniconda@fc2d68f6413eb2d87b895e92f8584b5b94a10167 # v3.3.0
+      conda-remove-defaults: true
+  - uses: conda-incubator/setup-miniconda@8ee1f361103df19b6f8c8655fd3967a8ecb162d5 # v4.0.1
     if: steps.conda1.outcome == 'failure'
     with:
       activate-environment: tvm-build
@@ -29,6 +30,7 @@ runs:
       use-only-tar-bz2: true
       python-version: "3.10"
       condarc-file: tests/conda/condarc
+      conda-remove-defaults: true
   - name: Conda info
     shell: pwsh
     run: |
diff --git a/.github/workflows/publish_wheel.yml b/.github/workflows/publish_wheel.yml
index 770b7778741f..3c7aa8511a47 100644
--- a/.github/workflows/publish_wheel.yml
+++ b/.github/workflows/publish_wheel.yml
@@ -150,7 +150,7 @@ jobs:
           cuda_runtime_path: ${{ steps.build_cuda.outputs.cuda_runtime_path }}
 
       - name: Upload wheel artifact
-        uses: actions/upload-artifact@v4
+        uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
         with:
           name: tvm-wheel-${{ matrix.artifact_suffix }}
           path: wheelhouse/*.whl
@@ -168,7 +168,7 @@ jobs:
       id-token: write
       attestations: write
     steps:
-      - uses: actions/download-artifact@v4
+      - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
         with:
           pattern: tvm-wheel-*
           path: dist
@@ -238,7 +238,7 @@ jobs:
           fetch-depth: 0
           fetch-tags: true
 
-      - uses: actions/download-artifact@v4
+      - uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c # v8.0.1
         with:
           name: tvm-wheel-linux-x86_64-manylinux_2_28
           path: wheelhouse
diff --git a/pyproject.toml b/pyproject.toml
index 2e400f0609e6..e1edbf5c6db0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,7 +45,7 @@ classifiers = [
 ]
 # Core dependencies - these are the minimum required for basic TVM functionality
 dependencies = [
-  "apache-tvm-ffi",
+  "apache-tvm-ffi>=0.1.11",
   "cloudpickle",
   "ml_dtypes",
   "numpy",

From 11f7e6b15e77c16c51af6115934f7deafa0bcc76 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Thu, 28 May 2026 17:39:00 -0400
Subject: [PATCH 32/43] Add detailed wheel verify diagnostics

---
 ci/scripts/package/verify_tvm_install.py | 69 +++++++++++++++++++++++-
 1 file changed, 68 insertions(+), 1 deletion(-)

diff --git a/ci/scripts/package/verify_tvm_install.py b/ci/scripts/package/verify_tvm_install.py
index e4bb2b9cacef..bdb27678be73 100644
--- a/ci/scripts/package/verify_tvm_install.py
+++ b/ci/scripts/package/verify_tvm_install.py
@@ -21,6 +21,7 @@
 import faulthandler
 import os
 from pathlib import Path
+import signal
 import sys
 
 faulthandler.enable(all_threads=True)
@@ -34,6 +35,15 @@ def log(*args: object) -> None:
     print(*args, flush=True)
 
 
+def _enable_python_fault_handler() -> None:
+    """Install Python's signal handler after native libraries may install theirs."""
+    faulthandler.enable(all_threads=True)
+    try:
+        faulthandler.register(signal.SIGUSR1, all_threads=True)
+    except (AttributeError, RuntimeError, ValueError):
+        pass
+
+
 def expect_bool(name: str) -> bool | None:
     value = os.environ.get(name)
     if value is None or value == "":
@@ -87,26 +97,73 @@ def _dynamic_llvm_libs(libdir: Path) -> list[Path]:
     return sorted(found)
 
 
+def _log_tvm_ffi_details() -> None:
+    import tvm_ffi  # pylint: disable=import-outside-toplevel
+
+    ffi_lib = getattr(tvm_ffi, "LIB", None)
+    ffi_lib_path = getattr(ffi_lib, "_name", None)
+    log("tvm_ffi version:", getattr(tvm_ffi, "__version__", "<unknown>"))
+    log("tvm_ffi package:", Path(tvm_ffi.__file__).resolve().parent)
+    if ffi_lib_path:
+        log("tvm_ffi library:", Path(ffi_lib_path).resolve())
+
+
+def _log_loaded_native_libraries() -> None:
+    if sys.platform != "linux":
+        return
+    maps_path = Path("/proc/self/maps")
+    if not maps_path.exists():
+        return
+    interesting_names = (
+        "libtvm",
+        "libLLVM",
+        "libstdc++",
+        "libgcc_s",
+        "libxml2",
+        "libzstd",
+        "liblzma",
+    )
+    loaded: set[str] = set()
+    for line in maps_path.read_text(encoding="utf-8", errors="ignore").splitlines():
+        path = line.rsplit(maxsplit=1)[-1]
+        if "/" not in path:
+            continue
+        if any(name in path for name in interesting_names):
+            loaded.add(path)
+    log("loaded native libraries:")
+    for path in sorted(loaded):
+        log("  ", path)
+
+
 def _verify_llvm_tirx_compile() -> None:
     import tvm  # pylint: disable=import-outside-toplevel
     from tvm import te  # pylint: disable=import-outside-toplevel
 
     log("llvm tirx compile smoke: starting")
     extent = 8
+    log("llvm tirx compile smoke: create numpy inputs")
     lhs_np = np.arange(extent, dtype="float32")
     rhs_np = np.arange(extent, dtype="float32") * np.float32(2)
     out_np = np.zeros(extent, dtype="float32")
 
+    log("llvm tirx compile smoke: create placeholders")
     lhs = te.placeholder((extent,), name="lhs", dtype="float32")
     rhs = te.placeholder((extent,), name="rhs", dtype="float32")
+    log("llvm tirx compile smoke: create compute")
     out = te.compute((extent,), lambda i: lhs[i] + rhs[i], name="out")
-    executable = tvm.compile(te.create_prim_func([lhs, rhs, out]), target="llvm")
+    log("llvm tirx compile smoke: create prim func")
+    prim_func = te.create_prim_func([lhs, rhs, out])
+    log("llvm tirx compile smoke: compile")
+    executable = tvm.compile(prim_func, target="llvm")
 
+    log("llvm tirx compile smoke: create tensors")
     dev = tvm.cpu()
     lhs_t = tvm.runtime.tensor(lhs_np, dev)
     rhs_t = tvm.runtime.tensor(rhs_np, dev)
     out_t = tvm.runtime.tensor(out_np, dev)
+    log("llvm tirx compile smoke: execute")
     executable(lhs_t, rhs_t, out_t)
+    log("llvm tirx compile smoke: check output")
     np.testing.assert_allclose(out_t.numpy(), lhs_np + rhs_np, rtol=1e-6)
     log("llvm tirx compile smoke: passed")
 
@@ -116,20 +173,27 @@ def _verify_relax_compile() -> None:
     from tvm import relax  # pylint: disable=import-outside-toplevel
 
     log("llvm relax compile smoke: starting")
+    log("llvm relax compile smoke: create numpy inputs")
     lhs_np = np.arange(8, dtype="float32")
     rhs_np = np.arange(8, dtype="float32") * np.float32(3)
     dev = tvm.cpu()
 
+    log("llvm relax compile smoke: create vars")
     lhs = relax.Var("lhs", relax.TensorStructInfo((8,), "float32"))
     rhs = relax.Var("rhs", relax.TensorStructInfo((8,), "float32"))
+    log("llvm relax compile smoke: create module")
     builder = relax.BlockBuilder()
     with builder.function("main", [lhs, rhs]):
         out = builder.emit(relax.op.add(lhs, rhs))
         builder.emit_func_output(out)
 
+    log("llvm relax compile smoke: compile")
     executable = tvm.compile(builder.get(), target="llvm")
+    log("llvm relax compile smoke: create vm")
     vm = relax.VirtualMachine(executable, dev)
+    log("llvm relax compile smoke: execute")
     out = vm["main"](tvm.runtime.tensor(lhs_np, dev), tvm.runtime.tensor(rhs_np, dev))
+    log("llvm relax compile smoke: check output")
     np.testing.assert_allclose(out.numpy(), lhs_np + rhs_np, rtol=1e-6)
     log("llvm relax compile smoke: passed")
 
@@ -140,6 +204,8 @@ def main() -> int:
     log("import tvm: starting")
     import tvm  # pylint: disable=import-outside-toplevel
     log("import tvm: passed")
+    _enable_python_fault_handler()
+    _log_tvm_ffi_details()
 
     root = Path(tvm.__file__).resolve().parent
     libdir = root / "lib"
@@ -179,6 +245,7 @@ def main() -> int:
     if cuda_runtime_present:
         log("cuda runtime library:", cuda_runtime)
     log("dynamic LLVM libraries:", [str(path) for path in dynamic_llvm_libs])
+    _log_loaded_native_libraries()
 
     expected_llvm = expect_bool("TVM_EXPECT_LLVM_ENABLED")
     if expected_llvm is not None and llvm_enabled != expected_llvm:

From af2a217d8ff99befd7009e602b0ccf8dc481259b Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Thu, 28 May 2026 18:37:04 -0400
Subject: [PATCH 33/43] Allow TestPyPI wheel version override

---
 .../actions/build-wheel-for-publish/action.yml    |  6 ++++++
 .github/workflows/publish_wheel.yml               | 15 +++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
index a78328063315..10697e808396 100644
--- a/.github/actions/build-wheel-for-publish/action.yml
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -39,6 +39,10 @@ inputs:
     description: "Optional wheel distribution name override, useful for TestPyPI"
     required: false
     default: ""
+  distribution_version:
+    description: "Optional wheel distribution version override, useful for TestPyPI"
+    required: false
+    default: ""
   cuda_architectures:
     description: "CMake CUDA architectures for libtvm_runtime_cuda.so"
     required: false
@@ -243,6 +247,7 @@ runs:
           CMAKE_ARGS="-DUSE_LLVM=/opt/llvm/bin/llvm-config-static -DUSE_CUDA=OFF -DBUILD_TESTING=OFF -DTVM_BUILD_PYTHON_MODULE=ON -DCMAKE_PREFIX_PATH=/opt/llvm"
           TVM_CUDA_ARCHITECTURES="${{ inputs.cuda_architectures }}"
           TVM_WHEEL_DIST_NAME="${{ inputs.distribution_name }}"
+          TVM_WHEEL_DIST_VERSION="${{ inputs.distribution_version }}"
           TVM_INCLUDE_CUDA_RUNTIME="${{ steps.wheel_inputs.outputs.include_cuda_runtime }}"
           TVM_CUDA_RUNTIME_PATH="${{ steps.wheel_inputs.outputs.include_cuda_runtime == '1' && inputs.cuda_runtime_path || '' }}"
           TVM_AUDITWHEEL_PLAT="${{ steps.wheel_inputs.outputs.wheel_platform_tag }}"
@@ -257,6 +262,7 @@ runs:
           CMAKE_ARGS="-DUSE_LLVM=C:/opt/llvm/Library/bin/llvm-config-static.bat -DUSE_CUDA=OFF -DBUILD_TESTING=OFF -DTVM_BUILD_PYTHON_MODULE=ON -DCMAKE_PREFIX_PATH=C:/opt/llvm/Library"
           TVM_CUDA_ARCHITECTURES="${{ inputs.cuda_architectures }}"
           TVM_WHEEL_DIST_NAME="${{ inputs.distribution_name }}"
+          TVM_WHEEL_DIST_VERSION="${{ inputs.distribution_version }}"
           TVM_INCLUDE_CUDA_RUNTIME="${{ steps.wheel_inputs.outputs.include_cuda_runtime }}"
           TVM_CUDA_RUNTIME_PATH="${{ steps.wheel_inputs.outputs.include_cuda_runtime == '1' && inputs.cuda_runtime_path || '' }}"
           TVM_BUILD_PARALLEL_LEVEL="${{ steps.env_vars.outputs.cpu_count }}"
diff --git a/.github/workflows/publish_wheel.yml b/.github/workflows/publish_wheel.yml
index 3c7aa8511a47..9101c3d6cdca 100644
--- a/.github/workflows/publish_wheel.yml
+++ b/.github/workflows/publish_wheel.yml
@@ -38,6 +38,11 @@ on:
         required: false
         default: ""
         type: string
+      distribution_version:
+        description: "Optional package version override for TestPyPI validation builds"
+        required: false
+        default: ""
+        type: string
       cuda_architectures:
         description: "CMake CUDA architectures for libtvm_runtime_cuda.so"
         required: false
@@ -100,16 +105,25 @@ jobs:
           TVM_PUBLISH_REF: ${{ inputs.tag }}
           TVM_VERIFY_FROM_REPOSITORY: ${{ inputs.verify_from_repository }}
           TVM_WHEEL_DIST_NAME: ${{ inputs.distribution_name }}
+          TVM_WHEEL_DIST_VERSION: ${{ inputs.distribution_version }}
         run: |
           set -eux
           if [[ -n "${TVM_WHEEL_DIST_NAME}" && ! "${TVM_WHEEL_DIST_NAME}" =~ ^[A-Za-z0-9]([A-Za-z0-9._-]*[A-Za-z0-9])?$ ]]; then
             echo "distribution_name must be a valid Python package name override" >&2
             exit 1
           fi
+          if [[ -n "${TVM_WHEEL_DIST_VERSION}" && ! "${TVM_WHEEL_DIST_VERSION}" =~ ^[A-Za-z0-9][A-Za-z0-9._!+-]*$ ]]; then
+            echo "distribution_version must be a valid Python package version override" >&2
+            exit 1
+          fi
           if [[ "${TVM_PUBLISH_REPOSITORY}" == "pypi" && -n "${TVM_WHEEL_DIST_NAME}" ]]; then
             echo "distribution_name must be empty when publishing to PyPI" >&2
             exit 1
           fi
+          if [[ "${TVM_PUBLISH_REPOSITORY}" == "pypi" && -n "${TVM_WHEEL_DIST_VERSION}" ]]; then
+            echo "distribution_version must be empty when publishing to PyPI" >&2
+            exit 1
+          fi
           if [[ "${TVM_PUBLISH_REPOSITORY}" == "pypi" && "${TVM_PUBLISH_REF}" != refs/tags/* ]]; then
             echo "PyPI publishes must use an immutable refs/tags/<tag> ref" >&2
             exit 1
@@ -145,6 +159,7 @@ jobs:
           linux_image: ${{ matrix.linux_image }}
           linux_image_tag: ${{ matrix.linux_image_tag }}
           distribution_name: ${{ inputs.distribution_name }}
+          distribution_version: ${{ inputs.distribution_version }}
           cuda_architectures: ${{ inputs.cuda_architectures }}
           include_cuda_runtime: ${{ matrix.include_cuda_runtime }}
           cuda_runtime_path: ${{ steps.build_cuda.outputs.cuda_runtime_path }}

From b9189189951aa16ddf7a62a1891355084001f4d0 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Thu, 28 May 2026 18:56:37 -0400
Subject: [PATCH 34/43] Fix wheel version escaping for TestPyPI

---
 ci/scripts/package/rewrite_wheel.py | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/ci/scripts/package/rewrite_wheel.py b/ci/scripts/package/rewrite_wheel.py
index ce991c69e4bf..7584edfcdf8c 100755
--- a/ci/scripts/package/rewrite_wheel.py
+++ b/ci/scripts/package/rewrite_wheel.py
@@ -35,12 +35,18 @@
 from pathlib import Path
 
 
-def _wheel_escape(value: str) -> str:
+def _wheel_escape_distribution(value: str) -> str:
     """Escape a distribution component for wheel filenames and dist-info dirs."""
 
     return re.sub(r"[^\w\d.]+", "_", value).lower()
 
 
+def _wheel_escape_version(value: str) -> str:
+    """Escape a version component while preserving PEP 440 local version markers."""
+
+    return re.sub(r"[^\w\d.!+]+", "_", value).lower()
+
+
 def _hash_record(data: bytes) -> tuple[str, str]:
     digest = hashlib.sha256(data).digest()
     encoded = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
@@ -115,7 +121,11 @@ def _retag_wheel_filename(
     if len(parts) not in (5, 6):
         raise ValueError(f"Unsupported wheel filename: {wheel.name}")
     tags = parts[2:]
-    return f"{_wheel_escape(dist_name)}-{_wheel_escape(version)}-{'-'.join(tags)}.whl"
+    return (
+        f"{_wheel_escape_distribution(dist_name)}-"
+        f"{_wheel_escape_version(version)}-"
+        f"{'-'.join(tags)}.whl"
+    )
 
 
 def _normalize_wheel_path(value: str, label: str) -> str:
@@ -215,7 +225,10 @@ def rewrite_wheel(
 
         final_name = distribution_name or original_name
         final_version = distribution_version or original_version
-        final_dist_info = f"{_wheel_escape(final_name)}-{_wheel_escape(final_version)}.dist-info"
+        final_dist_info = (
+            f"{_wheel_escape_distribution(final_name)}-"
+            f"{_wheel_escape_version(final_version)}.dist-info"
+        )
         record_path = f"{final_dist_info}/RECORD"
         target_paths = [target for _, target in extra_files]
         if cuda_runtime is not None:

From 0c40e0d74f2d62849128d5a8ef4be615827138a7 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Thu, 28 May 2026 19:47:04 -0400
Subject: [PATCH 35/43] Collect Linux wheel verify backtraces

---
 .github/actions/build-wheel-for-publish/action.yml | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
index 10697e808396..23c49e67c44e 100644
--- a/.github/actions/build-wheel-for-publish/action.yml
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -288,6 +288,14 @@ runs:
           --extra-library-pattern "*charset*.dll"
         CIBW_TEST_COMMAND: >-
           python -u -X faulthandler "{project}/ci/scripts/package/verify_tvm_install.py"
+        CIBW_TEST_COMMAND_LINUX: >-
+          bash -lc 'if command -v gdb >/dev/null 2>&1; then
+          exec gdb -q -batch -return-child-result
+          -ex "set pagination off" -ex run -ex "thread apply all bt"
+          --args python -u -X faulthandler "{project}/ci/scripts/package/verify_tvm_install.py";
+          else
+          exec python -u -X faulthandler "{project}/ci/scripts/package/verify_tvm_install.py";
+          fi'
 
     - name: Verify final wheel
       shell: bash -l {0}

From 7ce86b0454038de570ffeeff56337bcab47b1179 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Thu, 28 May 2026 20:40:10 -0400
Subject: [PATCH 36/43] Improve Linux wheel crash diagnostics

---
 .../build-wheel-for-publish/action.yml        |  17 ++-
 .github/workflows/publish_wheel.yml           | 132 ++++++++++++-----
 ci/scripts/package/verify_tvm_linux.sh        | 135 ++++++++++++++++++
 3 files changed, 243 insertions(+), 41 deletions(-)
 create mode 100644 ci/scripts/package/verify_tvm_linux.sh

diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
index 23c49e67c44e..cbd47d12cf95 100644
--- a/.github/actions/build-wheel-for-publish/action.yml
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -55,6 +55,10 @@ inputs:
     description: "Absolute path to libtvm_runtime_cuda.so produced by build-cuda"
     required: false
     default: ""
+  debug_symbols:
+    description: "Build Linux wheels with debug symbols and keep them unstripped for crash diagnosis"
+    required: false
+    default: "false"
 
 runs:
   using: "composite"
@@ -256,6 +260,9 @@ runs:
           TVM_EXPECT_STATIC_LLVM=1
           TVM_EXPECT_CUDA_RUNTIME="${{ steps.wheel_inputs.outputs.include_cuda_runtime }}"
           TVM_EXPECT_CUDA_ENABLED="${{ steps.wheel_inputs.outputs.include_cuda_runtime != '1' && '0' || '' }}"
+          TVM_WHEEL_DEBUG_SYMBOLS="${{ inputs.debug_symbols }}"
+          SKBUILD_CMAKE_BUILD_TYPE="${{ inputs.debug_symbols == 'true' && 'RelWithDebInfo' || '' }}"
+          SKBUILD_INSTALL_STRIP="${{ inputs.debug_symbols == 'true' && 'false' || '' }}"
         CIBW_ENVIRONMENT_WINDOWS: >-
           TVM_USE_LLVM="C:/opt/llvm/Library/bin/llvm-config-static.bat"
           CMAKE_PREFIX_PATH="C:/opt/llvm/Library"
@@ -270,6 +277,7 @@ runs:
           TVM_EXPECT_STATIC_LLVM=1
           TVM_EXPECT_CUDA_RUNTIME="${{ steps.wheel_inputs.outputs.include_cuda_runtime }}"
           TVM_EXPECT_CUDA_ENABLED="${{ steps.wheel_inputs.outputs.include_cuda_runtime != '1' && '0' || '' }}"
+          TVM_WHEEL_DEBUG_SYMBOLS="${{ inputs.debug_symbols }}"
         CIBW_REPAIR_WHEEL_COMMAND_LINUX: >-
           bash "{project}/ci/scripts/package/tvm_wheel_helper.sh" cibw-repair "{wheel}" "{dest_dir}"
         CIBW_REPAIR_WHEEL_COMMAND_MACOS: >-
@@ -289,13 +297,8 @@ runs:
         CIBW_TEST_COMMAND: >-
           python -u -X faulthandler "{project}/ci/scripts/package/verify_tvm_install.py"
         CIBW_TEST_COMMAND_LINUX: >-
-          bash -lc 'if command -v gdb >/dev/null 2>&1; then
-          exec gdb -q -batch -return-child-result
-          -ex "set pagination off" -ex run -ex "thread apply all bt"
-          --args python -u -X faulthandler "{project}/ci/scripts/package/verify_tvm_install.py";
-          else
-          exec python -u -X faulthandler "{project}/ci/scripts/package/verify_tvm_install.py";
-          fi'
+          bash "{project}/ci/scripts/package/verify_tvm_linux.sh"
+          "{project}/ci/scripts/package/verify_tvm_install.py"
 
     - name: Verify final wheel
       shell: bash -l {0}
diff --git a/.github/workflows/publish_wheel.yml b/.github/workflows/publish_wheel.yml
index 9101c3d6cdca..f9fa2da95b81 100644
--- a/.github/workflows/publish_wheel.yml
+++ b/.github/workflows/publish_wheel.yml
@@ -53,50 +53,103 @@ on:
         required: true
         default: true
         type: boolean
+      build_target:
+        description: "Wheel job subset to build"
+        required: true
+        default: "all"
+        type: choice
+        options:
+          - all
+          - linux-x86_64
+          - linux-aarch64
+          - macos-arm64
+          - windows-amd64
+      debug_symbols:
+        description: "Keep Linux wheel debug symbols for CI crash diagnosis; do not use for publishing"
+        required: true
+        default: false
+        type: boolean
 
 permissions:
   contents: read
 
 jobs:
+  select_build_matrix:
+    name: Select wheel build matrix
+    runs-on: ubuntu-latest
+    outputs:
+      matrix: ${{ steps.matrix.outputs.matrix }}
+    steps:
+      - name: Select matrix entries
+        id: matrix
+        shell: bash
+        env:
+          TVM_BUILD_TARGET: ${{ inputs.build_target }}
+        run: |
+          set -eux
+          python - <<'PY' >> "${GITHUB_OUTPUT}"
+          import json
+          import os
+
+          entries = [
+              {
+                  "name": "Linux x86_64 wheel with CUDA runtime (manylinux_2_28)",
+                  "target": "linux-x86_64",
+                  "os": "ubuntu-latest",
+                  "arch": "x86_64",
+                  "build": "cp310-manylinux_x86_64",
+                  "linux_image": "manylinux_2_28",
+                  "linux_image_tag": "2026.01.04-1",
+                  "include_cuda_runtime": "true",
+                  "artifact_suffix": "linux-x86_64-manylinux_2_28",
+              },
+              {
+                  "name": "Linux aarch64 wheel with CUDA runtime (manylinux_2_28)",
+                  "target": "linux-aarch64",
+                  "os": "ubuntu-24.04-arm",
+                  "arch": "aarch64",
+                  "build": "cp310-manylinux_aarch64",
+                  "linux_image": "manylinux_2_28",
+                  "linux_image_tag": "2026.01.04-1",
+                  "include_cuda_runtime": "true",
+                  "artifact_suffix": "linux-aarch64-manylinux_2_28",
+              },
+              {
+                  "name": "macOS arm64 CPU wheel",
+                  "target": "macos-arm64",
+                  "os": "macos-14",
+                  "arch": "arm64",
+                  "build": "cp310-macosx_arm64",
+                  "linux_image": "",
+                  "linux_image_tag": "",
+                  "include_cuda_runtime": "false",
+                  "artifact_suffix": "macos-arm64",
+              },
+              {
+                  "name": "Windows AMD64 CPU wheel",
+                  "target": "windows-amd64",
+                  "os": "windows-latest",
+                  "arch": "AMD64",
+                  "build": "cp310-win_amd64",
+                  "linux_image": "",
+                  "linux_image_tag": "",
+                  "include_cuda_runtime": "false",
+                  "artifact_suffix": "windows-amd64",
+              },
+          ]
+          selected = os.environ["TVM_BUILD_TARGET"]
+          if selected != "all":
+              entries = [entry for entry in entries if entry["target"] == selected]
+          print("matrix=" + json.dumps({"include": entries}, separators=(",", ":")))
+          PY
+
   build_wheels:
     name: ${{ matrix.name }}
+    needs: [select_build_matrix]
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
-      matrix:
-        include:
-          - name: Linux x86_64 wheel with CUDA runtime (manylinux_2_28)
-            os: ubuntu-latest
-            arch: x86_64
-            build: cp310-manylinux_x86_64
-            linux_image: manylinux_2_28
-            linux_image_tag: 2026.01.04-1
-            include_cuda_runtime: "true"
-            artifact_suffix: linux-x86_64-manylinux_2_28
-          - name: Linux aarch64 wheel with CUDA runtime (manylinux_2_28)
-            os: ubuntu-24.04-arm
-            arch: aarch64
-            build: cp310-manylinux_aarch64
-            linux_image: manylinux_2_28
-            linux_image_tag: 2026.01.04-1
-            include_cuda_runtime: "true"
-            artifact_suffix: linux-aarch64-manylinux_2_28
-          - name: macOS arm64 CPU wheel
-            os: macos-14
-            arch: arm64
-            build: cp310-macosx_arm64
-            linux_image: ""
-            linux_image_tag: ""
-            include_cuda_runtime: "false"
-            artifact_suffix: macos-arm64
-          - name: Windows AMD64 CPU wheel
-            os: windows-latest
-            arch: AMD64
-            build: cp310-win_amd64
-            linux_image: ""
-            linux_image_tag: ""
-            include_cuda_runtime: "false"
-            artifact_suffix: windows-amd64
+      matrix: ${{ fromJSON(needs.select_build_matrix.outputs.matrix) }}
     steps:
       - name: Validate publish inputs
         shell: bash
@@ -104,8 +157,10 @@ jobs:
           TVM_PUBLISH_REPOSITORY: ${{ inputs.publish_repository }}
           TVM_PUBLISH_REF: ${{ inputs.tag }}
           TVM_VERIFY_FROM_REPOSITORY: ${{ inputs.verify_from_repository }}
+          TVM_BUILD_TARGET: ${{ inputs.build_target }}
           TVM_WHEEL_DIST_NAME: ${{ inputs.distribution_name }}
           TVM_WHEEL_DIST_VERSION: ${{ inputs.distribution_version }}
+          TVM_DEBUG_SYMBOLS: ${{ inputs.debug_symbols }}
         run: |
           set -eux
           if [[ -n "${TVM_WHEEL_DIST_NAME}" && ! "${TVM_WHEEL_DIST_NAME}" =~ ^[A-Za-z0-9]([A-Za-z0-9._-]*[A-Za-z0-9])?$ ]]; then
@@ -132,6 +187,14 @@ jobs:
             echo "verify_from_repository must be enabled when publishing to PyPI" >&2
             exit 1
           fi
+          if [[ "${TVM_PUBLISH_REPOSITORY}" != "none" && "${TVM_BUILD_TARGET}" != "all" ]]; then
+            echo "build_target must be all when publishing wheels" >&2
+            exit 1
+          fi
+          if [[ "${TVM_PUBLISH_REPOSITORY}" != "none" && "${TVM_DEBUG_SYMBOLS}" == "true" ]]; then
+            echo "debug_symbols keeps unstripped debug wheels and cannot be used while publishing" >&2
+            exit 1
+          fi
 
       - name: Checkout source
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -163,6 +226,7 @@ jobs:
           cuda_architectures: ${{ inputs.cuda_architectures }}
           include_cuda_runtime: ${{ matrix.include_cuda_runtime }}
           cuda_runtime_path: ${{ steps.build_cuda.outputs.cuda_runtime_path }}
+          debug_symbols: ${{ inputs.debug_symbols }}
 
       - name: Upload wheel artifact
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
diff --git a/ci/scripts/package/verify_tvm_linux.sh b/ci/scripts/package/verify_tvm_linux.sh
new file mode 100644
index 000000000000..f36a18fcf8b7
--- /dev/null
+++ b/ci/scripts/package/verify_tvm_linux.sh
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+set -euo pipefail
+
+VERIFY_SCRIPT="${1:?usage: verify_tvm_linux.sh /path/to/verify_tvm_install.py}"
+
+echo "Linux wheel verify diagnostics"
+echo "TVM_WHEEL_DEBUG_SYMBOLS=${TVM_WHEEL_DEBUG_SYMBOLS:-}"
+echo "SKBUILD_CMAKE_BUILD_TYPE=${SKBUILD_CMAKE_BUILD_TYPE:-}"
+echo "SKBUILD_INSTALL_STRIP=${SKBUILD_INSTALL_STRIP:-}"
+
+for name in TVM_LIBRARY_PATH LD_LIBRARY_PATH DYLD_LIBRARY_PATH; do
+  if [[ -n "${!name:-}" ]]; then
+    echo "clearing ${name} before importing tvm"
+    unset "${name}"
+  fi
+done
+
+python - <<'PY'
+from __future__ import annotations
+
+from pathlib import Path
+import shlex
+import subprocess
+
+import tvm
+
+
+def run(cmd: list[str], *, capture: bool = False) -> subprocess.CompletedProcess[str]:
+    print("$", " ".join(shlex.quote(part) for part in cmd), flush=True)
+    try:
+        return subprocess.run(
+            cmd,
+            check=False,
+            capture_output=capture,
+            text=True,
+        )
+    except FileNotFoundError:
+        print(f"{cmd[0]} is not available", flush=True)
+        return subprocess.CompletedProcess(cmd, 127, "", "")
+
+
+def print_section_markers(path: Path) -> None:
+    result = run(["readelf", "-S", str(path)], capture=True)
+    text = result.stdout or ""
+    markers = [".symtab", ".debug_info", ".debug_line", ".gnu_debuglink"]
+    for marker in markers:
+        print(f"{path.name} contains {marker}: {marker in text}", flush=True)
+
+
+def print_symbol_sample(path: Path) -> None:
+    result = run(["nm", "-C", str(path)], capture=True)
+    if result.returncode != 0:
+        print(f"nm failed for {path.name} with code {result.returncode}", flush=True)
+        if result.stderr:
+            print(result.stderr, flush=True)
+        return
+    patterns = (
+        "CodeGenLLVM",
+        "LLVMModuleNode",
+        "LLVMTargetInfo",
+        "target.build.llvm",
+        "BuildLLVM",
+    )
+    matches = [line for line in result.stdout.splitlines() if any(p in line for p in patterns)]
+    print(f"{path.name} symbol sample count: {len(matches)}", flush=True)
+    for line in matches[:40]:
+        print("  ", line, flush=True)
+
+
+root = Path(tvm.__file__).resolve().parent
+libdir = root / "lib"
+print("diagnostic tvm package:", root, flush=True)
+print("diagnostic libdir:", libdir, flush=True)
+for libname in (
+    "libtvm_compiler.so",
+    "libtvm_runtime.so",
+    "libtvm_runtime_extra.so",
+    "libtvm_runtime_cuda.so",
+):
+    path = libdir / libname
+    print(f"{libname}: exists={path.exists()}", flush=True)
+    if not path.exists():
+        continue
+    print(f"{libname}: size={path.stat().st_size} bytes", flush=True)
+    run(["file", str(path)])
+    run(["readelf", "-n", str(path)])
+    print_section_markers(path)
+    if libname == "libtvm_compiler.so":
+        print_symbol_sample(path)
+PY
+
+if command -v gdb >/dev/null 2>&1; then
+  gdb_cmd=(
+    gdb -q -batch -return-child-result
+    -ex "set pagination off"
+    -ex "set print frame-arguments all"
+    -ex "set print elements 0"
+    -ex "set backtrace limit 200"
+    -ex "run"
+    -ex 'info symbol $pc'
+    -ex "info registers"
+    -ex 'x/16i $pc-32'
+    -ex "thread apply all bt full"
+    -ex "info sharedlibrary"
+    --args python -u -X faulthandler "${VERIFY_SCRIPT}"
+  )
+  printf 'Running under gdb:'
+  printf ' %q' "${gdb_cmd[@]}"
+  printf '\n'
+  set +e
+  "${gdb_cmd[@]}"
+  status=$?
+  set -e
+  echo "gdb verify exited with status ${status}"
+  exit "${status}"
+fi
+
+exec python -u -X faulthandler "${VERIFY_SCRIPT}"

From 4855ebe01625943a07ee8754fe17a1ba53a550ea Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Thu, 28 May 2026 21:20:24 -0400
Subject: [PATCH 37/43] Keep diagnostic wheel builds in Release mode

---
 .github/actions/build-wheel-for-publish/action.yml | 3 ++-
 ci/scripts/package/verify_tvm_linux.sh             | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
index cbd47d12cf95..c1526b923e17 100644
--- a/.github/actions/build-wheel-for-publish/action.yml
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -261,8 +261,9 @@ runs:
           TVM_EXPECT_CUDA_RUNTIME="${{ steps.wheel_inputs.outputs.include_cuda_runtime }}"
           TVM_EXPECT_CUDA_ENABLED="${{ steps.wheel_inputs.outputs.include_cuda_runtime != '1' && '0' || '' }}"
           TVM_WHEEL_DEBUG_SYMBOLS="${{ inputs.debug_symbols }}"
-          SKBUILD_CMAKE_BUILD_TYPE="${{ inputs.debug_symbols == 'true' && 'RelWithDebInfo' || '' }}"
           SKBUILD_INSTALL_STRIP="${{ inputs.debug_symbols == 'true' && 'false' || '' }}"
+          CFLAGS="${{ inputs.debug_symbols == 'true' && '-g' || '' }}"
+          CXXFLAGS="${{ inputs.debug_symbols == 'true' && '-g' || '' }}"
         CIBW_ENVIRONMENT_WINDOWS: >-
           TVM_USE_LLVM="C:/opt/llvm/Library/bin/llvm-config-static.bat"
           CMAKE_PREFIX_PATH="C:/opt/llvm/Library"
diff --git a/ci/scripts/package/verify_tvm_linux.sh b/ci/scripts/package/verify_tvm_linux.sh
index f36a18fcf8b7..ae82b029f996 100644
--- a/ci/scripts/package/verify_tvm_linux.sh
+++ b/ci/scripts/package/verify_tvm_linux.sh
@@ -24,6 +24,8 @@ echo "Linux wheel verify diagnostics"
 echo "TVM_WHEEL_DEBUG_SYMBOLS=${TVM_WHEEL_DEBUG_SYMBOLS:-}"
 echo "SKBUILD_CMAKE_BUILD_TYPE=${SKBUILD_CMAKE_BUILD_TYPE:-}"
 echo "SKBUILD_INSTALL_STRIP=${SKBUILD_INSTALL_STRIP:-}"
+echo "CFLAGS=${CFLAGS:-}"
+echo "CXXFLAGS=${CXXFLAGS:-}"
 
 for name in TVM_LIBRARY_PATH LD_LIBRARY_PATH DYLD_LIBRARY_PATH; do
   if [[ -n "${!name:-}" ]]; then

From 82212383b96eb61c598828a682068d50bf2a0c2e Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Fri, 29 May 2026 00:22:23 -0400
Subject: [PATCH 38/43] Disable linker relaxation for the static-LLVM compiler
 library

Linking the full LLVM static archives into libtvm_compiler.so on Linux
produces a library large enough to trigger a GNU ld (binutils)
R_X86_64_GOTPCRELX relaxation bug. conda-forge LLVM is built with
-fno-plt, so calls are emitted as indirect GOT calls; ld relaxes one of
them to a direct call with an incorrect displacement that lands in
read-only data instead of the intended function. The result is a runtime
SIGSEGV inside llvm::X86Subtarget while building the code-generation pass
pipeline (e.g. tvm.compile(target="llvm")).

This reproduces with a freshly linked, unrepaired manylinux build
(gcc-toolset-14 + GNU ld 2.41), so the cause is the linker relaxation --
not auditwheel, LLVM 22, or any TVM source change. The same objects linked
by a different ld (or with relaxation disabled) work correctly.

Pass -Wl,--no-relax when linking tvm_compiler with LLVM on Linux to keep
the GOT-indirect sequences and avoid the miscompilation. It is harmless
when LLVM is linked dynamically. See binutils bug ld/25754.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CMakeLists.txt | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b2738a198786..422d7112a6ea 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -611,6 +611,20 @@ target_include_directories(tvm_compiler PUBLIC "$<INSTALL_INTERFACE:${CMAKE_INST
 set_property(TARGET tvm_compiler APPEND PROPERTY LINK_OPTIONS "${TVM_NO_UNDEFINED_SYMBOLS}")
 set_property(TARGET tvm_compiler APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAG}")
 
+# Work around a GNU ld (binutils) relaxation bug that miscompiles
+# R_X86_64_GOTPCRELX relocations inside very large statically-linked archives.
+# When the full LLVM static libraries are linked into libtvm_compiler.so, the
+# library is large enough that ld can relax an indirect GOT call (LLVM built
+# with -fno-plt emits these) into a direct call with an incorrect displacement.
+# The call then targets read-only data instead of the intended function and
+# crashes at runtime with a SIGSEGV inside llvm::X86Subtarget during code
+# generation. Disabling linker relaxation keeps the GOT-indirect sequences and
+# avoids the miscompilation; it is harmless when LLVM is linked dynamically.
+# See binutils bug ld/25754.
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND NOT ${USE_LLVM} MATCHES ${IS_FALSE_PATTERN})
+  set_property(TARGET tvm_compiler APPEND PROPERTY LINK_OPTIONS "-Wl,--no-relax")
+endif()
+
 # Place runtime/compiler/allvisible artifacts under build/lib/ to mirror the
 # tvm-ffi layout and make tvm_ffi.libinfo.load_lib_ctypes(package="tvm") able
 # to discover them in dev / editable builds.

From 01934ab813057fd2f60331363a2771bd6d636710 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Fri, 29 May 2026 01:40:12 -0400
Subject: [PATCH 39/43] Remove wheel crash-diagnostic scaffolding

Drop the debugging instrumentation added while diagnosing the Linux wheel
SIGSEGV now that the root cause is fixed (linker --no-relax):

- verify_tvm_linux.sh: remove the gdb backtrace wrapper and the readelf/nm
  symbol dumps; it now just clears dev library-path overrides and runs the
  cross-platform verifier directly.
- publish workflow + wheel action: remove the debug_symbols option (which
  produced unstripped -g wheels and is unsuitable for publishing).

The legitimate verification (import, runtime-from-wheel and static-LLVM
checks, and the TIRX/Relax compile smoke tests) is unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../build-wheel-for-publish/action.yml        |   9 --
 .github/workflows/publish_wheel.yml           |  11 --
 ci/scripts/package/verify_tvm_linux.sh        | 111 +-----------------
 3 files changed, 4 insertions(+), 127 deletions(-)

diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
index c1526b923e17..47e159952df5 100644
--- a/.github/actions/build-wheel-for-publish/action.yml
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -55,10 +55,6 @@ inputs:
     description: "Absolute path to libtvm_runtime_cuda.so produced by build-cuda"
     required: false
     default: ""
-  debug_symbols:
-    description: "Build Linux wheels with debug symbols and keep them unstripped for crash diagnosis"
-    required: false
-    default: "false"
 
 runs:
   using: "composite"
@@ -260,10 +256,6 @@ runs:
           TVM_EXPECT_STATIC_LLVM=1
           TVM_EXPECT_CUDA_RUNTIME="${{ steps.wheel_inputs.outputs.include_cuda_runtime }}"
           TVM_EXPECT_CUDA_ENABLED="${{ steps.wheel_inputs.outputs.include_cuda_runtime != '1' && '0' || '' }}"
-          TVM_WHEEL_DEBUG_SYMBOLS="${{ inputs.debug_symbols }}"
-          SKBUILD_INSTALL_STRIP="${{ inputs.debug_symbols == 'true' && 'false' || '' }}"
-          CFLAGS="${{ inputs.debug_symbols == 'true' && '-g' || '' }}"
-          CXXFLAGS="${{ inputs.debug_symbols == 'true' && '-g' || '' }}"
         CIBW_ENVIRONMENT_WINDOWS: >-
           TVM_USE_LLVM="C:/opt/llvm/Library/bin/llvm-config-static.bat"
           CMAKE_PREFIX_PATH="C:/opt/llvm/Library"
@@ -278,7 +270,6 @@ runs:
           TVM_EXPECT_STATIC_LLVM=1
           TVM_EXPECT_CUDA_RUNTIME="${{ steps.wheel_inputs.outputs.include_cuda_runtime }}"
           TVM_EXPECT_CUDA_ENABLED="${{ steps.wheel_inputs.outputs.include_cuda_runtime != '1' && '0' || '' }}"
-          TVM_WHEEL_DEBUG_SYMBOLS="${{ inputs.debug_symbols }}"
         CIBW_REPAIR_WHEEL_COMMAND_LINUX: >-
           bash "{project}/ci/scripts/package/tvm_wheel_helper.sh" cibw-repair "{wheel}" "{dest_dir}"
         CIBW_REPAIR_WHEEL_COMMAND_MACOS: >-
diff --git a/.github/workflows/publish_wheel.yml b/.github/workflows/publish_wheel.yml
index f9fa2da95b81..f7f7b028f428 100644
--- a/.github/workflows/publish_wheel.yml
+++ b/.github/workflows/publish_wheel.yml
@@ -64,11 +64,6 @@ on:
           - linux-aarch64
           - macos-arm64
           - windows-amd64
-      debug_symbols:
-        description: "Keep Linux wheel debug symbols for CI crash diagnosis; do not use for publishing"
-        required: true
-        default: false
-        type: boolean
 
 permissions:
   contents: read
@@ -160,7 +155,6 @@ jobs:
           TVM_BUILD_TARGET: ${{ inputs.build_target }}
           TVM_WHEEL_DIST_NAME: ${{ inputs.distribution_name }}
           TVM_WHEEL_DIST_VERSION: ${{ inputs.distribution_version }}
-          TVM_DEBUG_SYMBOLS: ${{ inputs.debug_symbols }}
         run: |
           set -eux
           if [[ -n "${TVM_WHEEL_DIST_NAME}" && ! "${TVM_WHEEL_DIST_NAME}" =~ ^[A-Za-z0-9]([A-Za-z0-9._-]*[A-Za-z0-9])?$ ]]; then
@@ -191,10 +185,6 @@ jobs:
             echo "build_target must be all when publishing wheels" >&2
             exit 1
           fi
-          if [[ "${TVM_PUBLISH_REPOSITORY}" != "none" && "${TVM_DEBUG_SYMBOLS}" == "true" ]]; then
-            echo "debug_symbols keeps unstripped debug wheels and cannot be used while publishing" >&2
-            exit 1
-          fi
 
       - name: Checkout source
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
@@ -226,7 +216,6 @@ jobs:
           cuda_architectures: ${{ inputs.cuda_architectures }}
           include_cuda_runtime: ${{ matrix.include_cuda_runtime }}
           cuda_runtime_path: ${{ steps.build_cuda.outputs.cuda_runtime_path }}
-          debug_symbols: ${{ inputs.debug_symbols }}
 
       - name: Upload wheel artifact
         uses: actions/upload-artifact@043fb46d1a93c77aae656e7c1c64a875d1fc6a0a # v7.0.1
diff --git a/ci/scripts/package/verify_tvm_linux.sh b/ci/scripts/package/verify_tvm_linux.sh
index ae82b029f996..54b5515170a2 100644
--- a/ci/scripts/package/verify_tvm_linux.sh
+++ b/ci/scripts/package/verify_tvm_linux.sh
@@ -16,17 +16,14 @@
 # specific language governing permissions and limitations
 # under the License.
 
+# Linux wheel verification entrypoint. Clears any development library-path
+# overrides so the verifier loads the libraries bundled in the installed wheel,
+# then runs the cross-platform verifier.
+
 set -euo pipefail
 
 VERIFY_SCRIPT="${1:?usage: verify_tvm_linux.sh /path/to/verify_tvm_install.py}"
 
-echo "Linux wheel verify diagnostics"
-echo "TVM_WHEEL_DEBUG_SYMBOLS=${TVM_WHEEL_DEBUG_SYMBOLS:-}"
-echo "SKBUILD_CMAKE_BUILD_TYPE=${SKBUILD_CMAKE_BUILD_TYPE:-}"
-echo "SKBUILD_INSTALL_STRIP=${SKBUILD_INSTALL_STRIP:-}"
-echo "CFLAGS=${CFLAGS:-}"
-echo "CXXFLAGS=${CXXFLAGS:-}"
-
 for name in TVM_LIBRARY_PATH LD_LIBRARY_PATH DYLD_LIBRARY_PATH; do
   if [[ -n "${!name:-}" ]]; then
     echo "clearing ${name} before importing tvm"
@@ -34,104 +31,4 @@ for name in TVM_LIBRARY_PATH LD_LIBRARY_PATH DYLD_LIBRARY_PATH; do
   fi
 done
 
-python - <<'PY'
-from __future__ import annotations
-
-from pathlib import Path
-import shlex
-import subprocess
-
-import tvm
-
-
-def run(cmd: list[str], *, capture: bool = False) -> subprocess.CompletedProcess[str]:
-    print("$", " ".join(shlex.quote(part) for part in cmd), flush=True)
-    try:
-        return subprocess.run(
-            cmd,
-            check=False,
-            capture_output=capture,
-            text=True,
-        )
-    except FileNotFoundError:
-        print(f"{cmd[0]} is not available", flush=True)
-        return subprocess.CompletedProcess(cmd, 127, "", "")
-
-
-def print_section_markers(path: Path) -> None:
-    result = run(["readelf", "-S", str(path)], capture=True)
-    text = result.stdout or ""
-    markers = [".symtab", ".debug_info", ".debug_line", ".gnu_debuglink"]
-    for marker in markers:
-        print(f"{path.name} contains {marker}: {marker in text}", flush=True)
-
-
-def print_symbol_sample(path: Path) -> None:
-    result = run(["nm", "-C", str(path)], capture=True)
-    if result.returncode != 0:
-        print(f"nm failed for {path.name} with code {result.returncode}", flush=True)
-        if result.stderr:
-            print(result.stderr, flush=True)
-        return
-    patterns = (
-        "CodeGenLLVM",
-        "LLVMModuleNode",
-        "LLVMTargetInfo",
-        "target.build.llvm",
-        "BuildLLVM",
-    )
-    matches = [line for line in result.stdout.splitlines() if any(p in line for p in patterns)]
-    print(f"{path.name} symbol sample count: {len(matches)}", flush=True)
-    for line in matches[:40]:
-        print("  ", line, flush=True)
-
-
-root = Path(tvm.__file__).resolve().parent
-libdir = root / "lib"
-print("diagnostic tvm package:", root, flush=True)
-print("diagnostic libdir:", libdir, flush=True)
-for libname in (
-    "libtvm_compiler.so",
-    "libtvm_runtime.so",
-    "libtvm_runtime_extra.so",
-    "libtvm_runtime_cuda.so",
-):
-    path = libdir / libname
-    print(f"{libname}: exists={path.exists()}", flush=True)
-    if not path.exists():
-        continue
-    print(f"{libname}: size={path.stat().st_size} bytes", flush=True)
-    run(["file", str(path)])
-    run(["readelf", "-n", str(path)])
-    print_section_markers(path)
-    if libname == "libtvm_compiler.so":
-        print_symbol_sample(path)
-PY
-
-if command -v gdb >/dev/null 2>&1; then
-  gdb_cmd=(
-    gdb -q -batch -return-child-result
-    -ex "set pagination off"
-    -ex "set print frame-arguments all"
-    -ex "set print elements 0"
-    -ex "set backtrace limit 200"
-    -ex "run"
-    -ex 'info symbol $pc'
-    -ex "info registers"
-    -ex 'x/16i $pc-32'
-    -ex "thread apply all bt full"
-    -ex "info sharedlibrary"
-    --args python -u -X faulthandler "${VERIFY_SCRIPT}"
-  )
-  printf 'Running under gdb:'
-  printf ' %q' "${gdb_cmd[@]}"
-  printf '\n'
-  set +e
-  "${gdb_cmd[@]}"
-  status=$?
-  set -e
-  echo "gdb verify exited with status ${status}"
-  exit "${status}"
-fi
-
 exec python -u -X faulthandler "${VERIFY_SCRIPT}"

From bc200886f22e3e4f74d3453dbe2829145155b49b Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Fri, 29 May 2026 03:07:51 -0400
Subject: [PATCH 40/43] Simplify wheel publish workflow structure

- Replace the select_build_matrix job (which generated the matrix from
  embedded Python and the build_target input) with a static
  strategy.matrix.include in build_wheels. The workflow always builds all
  four platform wheels, which is what publishing requires anyway, so the
  dynamic-matrix indirection and the build_target input/validation are
  removed.
- Single-source the LLVM toolchain version: it is now set once in
  LLVM_VERSION and referenced by the cache key and the conda install steps,
  instead of being hardcoded in three places.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .../build-wheel-for-publish/action.yml        |  12 +-
 .github/workflows/publish_wheel.yml           | 121 +++++-------------
 2 files changed, 43 insertions(+), 90 deletions(-)

diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
index 47e159952df5..deab82a31adb 100644
--- a/.github/actions/build-wheel-for-publish/action.yml
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -129,6 +129,12 @@ runs:
         echo "cibw_container_engine=${cibw_container_engine}" >> "${GITHUB_OUTPUT}"
         echo "include_cuda_runtime=${include_cuda_runtime}" >> "${GITHUB_OUTPUT}"
 
+    # Single source of truth for the LLVM toolchain version, shared by the cache
+    # key and the conda install steps below.
+    - name: Set LLVM version
+      shell: bash
+      run: echo "LLVM_VERSION=22.1.0" >> "$GITHUB_ENV"
+
     - name: Prepare LLVM cache path (Unix)
       if: runner.os != 'Windows'
       shell: bash
@@ -143,7 +149,7 @@ runs:
       id: llvm-cache
       with:
         path: ${{ runner.os == 'Windows' && 'C:/opt/llvm' || '/opt/llvm' }}
-        key: tvm-wheel-llvm-22.1.0-${{ runner.os }}-${{ inputs.arch }}-v4
+        key: tvm-wheel-llvm-${{ env.LLVM_VERSION }}-${{ runner.os }}-${{ inputs.arch }}-v4
 
     # ---- Install LLVM via conda (cache miss only) ----
     - name: Setup conda
@@ -173,14 +179,14 @@ runs:
           sudo chown -R "$(whoami)" /opt/llvm
         fi
         conda create -q -p /opt/llvm -c conda-forge \
-          llvmdev=22.1.0 clangdev=22.1.0 compiler-rt=22.1.0 zlib zstd-static libxml2-devel \
+          "llvmdev=${LLVM_VERSION}" "clangdev=${LLVM_VERSION}" "compiler-rt=${LLVM_VERSION}" zlib zstd-static libxml2-devel \
           -y
 
     - name: Install LLVM (Windows)
       if: steps.llvm-cache.outputs.cache-hit != 'true' && runner.os == 'Windows'
       shell: cmd /C call {0}
       run: |
-        conda create -q -p C:\opt\llvm -c conda-forge llvmdev=22.1.0 zlib zstd-static libxml2-devel -y
+        conda create -q -p C:\opt\llvm -c conda-forge llvmdev=%LLVM_VERSION% zlib zstd-static libxml2-devel -y
 
     - name: Create static llvm-config wrapper (Unix)
       if: runner.os != 'Windows'
diff --git a/.github/workflows/publish_wheel.yml b/.github/workflows/publish_wheel.yml
index f7f7b028f428..4ac52c25d91e 100644
--- a/.github/workflows/publish_wheel.yml
+++ b/.github/workflows/publish_wheel.yml
@@ -53,98 +53,50 @@ on:
         required: true
         default: true
         type: boolean
-      build_target:
-        description: "Wheel job subset to build"
-        required: true
-        default: "all"
-        type: choice
-        options:
-          - all
-          - linux-x86_64
-          - linux-aarch64
-          - macos-arm64
-          - windows-amd64
 
 permissions:
   contents: read
 
 jobs:
-  select_build_matrix:
-    name: Select wheel build matrix
-    runs-on: ubuntu-latest
-    outputs:
-      matrix: ${{ steps.matrix.outputs.matrix }}
-    steps:
-      - name: Select matrix entries
-        id: matrix
-        shell: bash
-        env:
-          TVM_BUILD_TARGET: ${{ inputs.build_target }}
-        run: |
-          set -eux
-          python - <<'PY' >> "${GITHUB_OUTPUT}"
-          import json
-          import os
-
-          entries = [
-              {
-                  "name": "Linux x86_64 wheel with CUDA runtime (manylinux_2_28)",
-                  "target": "linux-x86_64",
-                  "os": "ubuntu-latest",
-                  "arch": "x86_64",
-                  "build": "cp310-manylinux_x86_64",
-                  "linux_image": "manylinux_2_28",
-                  "linux_image_tag": "2026.01.04-1",
-                  "include_cuda_runtime": "true",
-                  "artifact_suffix": "linux-x86_64-manylinux_2_28",
-              },
-              {
-                  "name": "Linux aarch64 wheel with CUDA runtime (manylinux_2_28)",
-                  "target": "linux-aarch64",
-                  "os": "ubuntu-24.04-arm",
-                  "arch": "aarch64",
-                  "build": "cp310-manylinux_aarch64",
-                  "linux_image": "manylinux_2_28",
-                  "linux_image_tag": "2026.01.04-1",
-                  "include_cuda_runtime": "true",
-                  "artifact_suffix": "linux-aarch64-manylinux_2_28",
-              },
-              {
-                  "name": "macOS arm64 CPU wheel",
-                  "target": "macos-arm64",
-                  "os": "macos-14",
-                  "arch": "arm64",
-                  "build": "cp310-macosx_arm64",
-                  "linux_image": "",
-                  "linux_image_tag": "",
-                  "include_cuda_runtime": "false",
-                  "artifact_suffix": "macos-arm64",
-              },
-              {
-                  "name": "Windows AMD64 CPU wheel",
-                  "target": "windows-amd64",
-                  "os": "windows-latest",
-                  "arch": "AMD64",
-                  "build": "cp310-win_amd64",
-                  "linux_image": "",
-                  "linux_image_tag": "",
-                  "include_cuda_runtime": "false",
-                  "artifact_suffix": "windows-amd64",
-              },
-          ]
-          selected = os.environ["TVM_BUILD_TARGET"]
-          if selected != "all":
-              entries = [entry for entry in entries if entry["target"] == selected]
-          print("matrix=" + json.dumps({"include": entries}, separators=(",", ":")))
-          PY
-
   build_wheels:
     name: ${{ matrix.name }}
-    needs: [select_build_matrix]
     runs-on: ${{ matrix.os }}
     strategy:
       fail-fast: false
-      matrix: ${{ fromJSON(needs.select_build_matrix.outputs.matrix) }}
+      matrix:
+        include:
+          - name: "Linux x86_64 wheel with CUDA runtime (manylinux_2_28)"
+            os: ubuntu-latest
+            arch: x86_64
+            build: cp310-manylinux_x86_64
+            linux_image: manylinux_2_28
+            linux_image_tag: "2026.01.04-1"
+            include_cuda_runtime: "true"
+            artifact_suffix: linux-x86_64-manylinux_2_28
+          - name: "Linux aarch64 wheel with CUDA runtime (manylinux_2_28)"
+            os: ubuntu-24.04-arm
+            arch: aarch64
+            build: cp310-manylinux_aarch64
+            linux_image: manylinux_2_28
+            linux_image_tag: "2026.01.04-1"
+            include_cuda_runtime: "true"
+            artifact_suffix: linux-aarch64-manylinux_2_28
+          - name: "macOS arm64 CPU wheel"
+            os: macos-14
+            arch: arm64
+            build: cp310-macosx_arm64
+            linux_image: ""
+            linux_image_tag: ""
+            include_cuda_runtime: "false"
+            artifact_suffix: macos-arm64
+          - name: "Windows AMD64 CPU wheel"
+            os: windows-latest
+            arch: AMD64
+            build: cp310-win_amd64
+            linux_image: ""
+            linux_image_tag: ""
+            include_cuda_runtime: "false"
+            artifact_suffix: windows-amd64
     steps:
       - name: Validate publish inputs
         shell: bash
@@ -152,7 +104,6 @@ jobs:
           TVM_PUBLISH_REPOSITORY: ${{ inputs.publish_repository }}
           TVM_PUBLISH_REF: ${{ inputs.tag }}
           TVM_VERIFY_FROM_REPOSITORY: ${{ inputs.verify_from_repository }}
-          TVM_BUILD_TARGET: ${{ inputs.build_target }}
           TVM_WHEEL_DIST_NAME: ${{ inputs.distribution_name }}
           TVM_WHEEL_DIST_VERSION: ${{ inputs.distribution_version }}
         run: |
@@ -181,10 +132,6 @@ jobs:
             echo "verify_from_repository must be enabled when publishing to PyPI" >&2
             exit 1
           fi
-          if [[ "${TVM_PUBLISH_REPOSITORY}" != "none" && "${TVM_BUILD_TARGET}" != "all" ]]; then
-            echo "build_target must be all when publishing wheels" >&2
-            exit 1
-          fi
 
       - name: Checkout source
         uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2

From 6f7743e7c3d6b142335f5a235c89640ba6b49208 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Fri, 29 May 2026 04:50:34 -0400
Subject: [PATCH 41/43] Apply distribution version override to Windows wheel
 repair

The Windows repair command (rewrite_wheel.py) passed --distribution-name
but not --distribution-version, so Windows wheels kept the pyproject
version while Linux/macOS wheels got the override. When publishing to
TestPyPI with a version override this produced a mismatched, stale-version
Windows wheel that collided with a previously used filename and failed the
upload. Pass --distribution-version too (empty is a no-op).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .github/actions/build-wheel-for-publish/action.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/actions/build-wheel-for-publish/action.yml b/.github/actions/build-wheel-for-publish/action.yml
index deab82a31adb..d6daf503816e 100644
--- a/.github/actions/build-wheel-for-publish/action.yml
+++ b/.github/actions/build-wheel-for-publish/action.yml
@@ -285,6 +285,7 @@ runs:
           python "{project}/ci/scripts/package/rewrite_wheel.py" "{wheel}"
           --output-dir "{dest_dir}"
           --distribution-name "${{ inputs.distribution_name }}"
+          --distribution-version "${{ inputs.distribution_version }}"
           --extra-library-dir "C:/opt/llvm/Library/bin"
           --extra-library-pattern "libxml2*.dll"
           --extra-library-pattern "zstd*.dll"

From f03ec665828a1edd0ac0f8028c22e33e1abe1e4e Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Fri, 29 May 2026 13:18:03 -0400
Subject: [PATCH 42/43] Fix C-compilation header discovery for installed wheels

find_include_path() only searched source-tree-relative locations
(<pkg>/../include, ...), so it could not find the headers bundled in an
installed wheel (at <pkg>/include), and it looked for the tvm-ffi/dlpack
headers under 3rdparty/ which wheels do not ship. As a result target="c"
and Module.export_library() (compiling generated C/C++ sources) failed
from an installed wheel with "Cannot find the source directory" or a
missing <tvm/ffi/c_api.h>.

Search "." as well (the wheel layout), and additionally append the include
directories reported by the installed apache-tvm-ffi package
(tvm_ffi.libinfo) for the tvm-ffi and dlpack headers when they are not
already in the result.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 python/tvm/libinfo.py | 28 +++++++++++++++++++++++++++-
 1 file changed, 27 insertions(+), 1 deletion(-)

diff --git a/python/tvm/libinfo.py b/python/tvm/libinfo.py
index 136d85a49f30..a8dd1ea34144 100644
--- a/python/tvm/libinfo.py
+++ b/python/tvm/libinfo.py
@@ -265,7 +265,11 @@ def find_include_path(name=None, search_path=None, optional=False):
         source_dir = os.environ["TVM_HOME"]
     else:
         ffi_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-        for source_dir in ["..", "../..", "../../.."]:
+        # "." covers the installed-wheel layout, where the bundled headers live
+        # in ``<package>/include``; the remaining entries cover a source tree,
+        # where the package is at ``<root>/python/tvm`` and headers at
+        # ``<root>/include``.
+        for source_dir in [".", "..", "../..", "../../.."]:
             source_dir = os.path.join(ffi_dir, source_dir)
             if os.path.isdir(os.path.join(source_dir, "include")):
                 break
@@ -310,6 +314,28 @@ def find_include_path(name=None, search_path=None, optional=False):
         include_found += [p for p in tvm_ffi_include_path if os.path.exists(p) and os.path.isdir(p)]
         include_found += [p for p in dlpack_include_path if os.path.exists(p) and os.path.isdir(p)]
 
+        # In a wheel install the tvm-ffi (and bundled dlpack) headers ship with
+        # the separate apache-tvm-ffi package rather than under 3rdparty/, so
+        # ask it where its headers live. This lets C/C++ source compilation
+        # (e.g. target="c" / Module.export_library) work from installed wheels.
+        try:
+            import tvm_ffi.libinfo as _ffi_libinfo  # pylint: disable=import-outside-toplevel
+
+            for _finder in (
+                getattr(_ffi_libinfo, "find_include_path", None),
+                getattr(_ffi_libinfo, "find_dlpack_include_path", None),
+            ):
+                if _finder is None:
+                    continue
+                try:
+                    _p = _finder()
+                except Exception:  # pylint: disable=broad-except
+                    continue
+                if _p and os.path.isdir(_p) and _p not in include_found:
+                    include_found.append(_p)
+        except Exception:  # pylint: disable=broad-except
+            pass
+
     if not include_found:
         message = (
             "Cannot find the files.\n"

From 887734cb9f9353f425f8ab5edb22a12dbfaf50b3 Mon Sep 17 00:00:00 2001
From: tlopex <820958424@qq.com>
Date: Fri, 29 May 2026 14:37:33 -0400
Subject: [PATCH 43/43] Fix stale Simplify import in TIRX lowering test

test_transform_lower_tirx.py imported and called Simplify, but the pass
was renamed to StmtSimplify (the only simplify pass exported from
tvm.tirx.transform). The stale name caused a collection-time ImportError.
Use StmtSimplify; the file now collects and all 32 tests pass.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 tests/python/tirx/transform/test_transform_lower_tirx.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/python/tirx/transform/test_transform_lower_tirx.py b/tests/python/tirx/transform/test_transform_lower_tirx.py
index 3e20d61f8059..80e68243d0b3 100644
--- a/tests/python/tirx/transform/test_transform_lower_tirx.py
+++ b/tests/python/tirx/transform/test_transform_lower_tirx.py
@@ -24,7 +24,7 @@
 from tvm.tirx.layout import laneid, warpid, wg_local_layout
 from tvm.tirx.stmt import ExecScopeStmt
 from tvm.tirx.stmt_functor import post_order_visit
-from tvm.tirx.transform import LowerTIRx, Simplify
+from tvm.tirx.transform import LowerTIRx, StmtSimplify
 
 
 def _contains_exec_scope(mod):
@@ -1000,7 +1000,7 @@ def before(A_ptr: Tx.handle):
 
     with tvm.target.Target("cuda"):
         lowered = LowerTIRx()(tvm.IRModule({"main": before}))
-        simplified = Simplify()(lowered)
+        simplified = StmtSimplify()(lowered)
 
     script = simplified.script(extra_config={"tirx.prefix": "Tx"})
     assert "if warp_id_in_cta // 4 == 0:" in script