From 057b34acf48faf3b202a13fc73430c5cba652b7a Mon Sep 17 00:00:00 2001 From: tongke <124763920+tongke6@users.noreply.github.com> Date: Fri, 12 Jun 2026 11:27:33 +0800 Subject: [PATCH 1/9] Split CUDA extensions by SM architecture for fat-binary wheel builds (#83) Replace the monolithic `cula.cudac` extension with per-arch extensions (`cula._cudac_sm90`, `cula._cudac_sm100`) so that SM90 and SM100/SM103 kernels are compiled independently with their own `-gencode` flags. This enables building fat-binary wheels containing all architectures without needing the target GPU present at build time. Key changes: - Split pybind.cu into per-file PYBIND11_MODULE definitions - Add `cula/cudac.py` proxy module for backwards-compatible imports - Add `CULA_BUILD_ALL_ARCHS=1` env var to enable all SM targets - Add `--fat` flag to build_wheel.sh for CI fat-binary builds - Pin dependency versions and use `no-local-version` scheme for reproducible wheel filenames - Use setuptools_scm for dynamic `__version__` - Document pre-built wheel installation in README --- .github/workflows/build-release.yml | 125 +++++++++++++++++++++++ README.md | 12 +++ csrc/api/kda_sm100.cu | 6 ++ csrc/api/kda_sm90.cu | 5 + csrc/api/pybind.cu | 80 --------------- cula/__init__.py | 5 +- cula/cudac.py | 65 ++++++++++++ pyproject.toml | 9 +- scripts/build_wheel.sh | 18 +++- setup.py | 147 ++++++++++++++-------------- 10 files changed, 309 insertions(+), 163 deletions(-) create mode 100644 .github/workflows/build-release.yml delete mode 100644 csrc/api/pybind.cu create mode 100644 cula/cudac.py diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml new file mode 100644 index 00000000..cd5b54d0 --- /dev/null +++ b/.github/workflows/build-release.yml @@ -0,0 +1,125 @@ +name: Build & Release Wheels + +on: + push: + tags: + - "v*" + workflow_dispatch: + +concurrency: + group: build-release-${{ github.ref }} + cancel-in-progress: true + +jobs: + build-wheel: + name: "wheel / ${{ 
matrix.cuda }} / cp312 / ${{ matrix.arch }}" + runs-on: ${{ matrix.runner }} + strategy: + fail-fast: false + matrix: + cuda: + - cu129 + - cu130 + arch: + - x86_64 + - aarch64 + include: + - cuda: cu129 + cuda_version: "12.9.0" + torch_index: "https://download.pytorch.org/whl/cu129" + - cuda: cu130 + cuda_version: "13.0.0" + torch_index: "https://download.pytorch.org/whl/cu130" + - arch: x86_64 + runner: ubuntu-latest + - arch: aarch64 + runner: ubuntu-24.04-arm + container: + image: "nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu24.04" + + steps: + - name: Free disk space + run: | + rm -rf /opt/hostedtoolcache /usr/local/lib/android /usr/share/dotnet \ + /usr/local/share/boost /opt/ghc 2>/dev/null || true + apt-get clean 2>/dev/null || true + df -h / || true + + - name: Install git + run: | + apt-get update && apt-get install -y --no-install-recommends git \ + && rm -rf /var/lib/apt/lists/* + + - name: Checkout + uses: actions/checkout@v5 + with: + fetch-depth: 0 + submodules: recursive + + - name: Configure git safe directory + run: git config --global --add safe.directory "$GITHUB_WORKSPACE" + + - name: Set up Python + uses: actions/setup-python@v6 + with: + python-version: "3.12" + + - name: Install Python dependencies + run: | + python -m pip install --no-cache-dir --upgrade pip + python -m pip install --no-cache-dir torch --index-url ${{ matrix.torch_index }} + python -m pip install --no-cache-dir setuptools wheel "setuptools_scm>=6.0" build ninja + + - name: Compute version + id: version + run: | + if [[ "$GITHUB_REF" == refs/tags/v* ]]; then + BASE="${GITHUB_REF#refs/tags/v}" + else + # Strip any local segment (+gXXX) so we get a clean base + BASE=$(python -c "from setuptools_scm import get_version; print(get_version().split('+')[0])") + fi + echo "version=${BASE}+${{ matrix.cuda }}" >> "$GITHUB_OUTPUT" + + - name: Build fat-binary wheel + env: + CULA_BUILD_ALL_ARCHS: "1" + SETUPTOOLS_SCM_PRETEND_VERSION: "${{ steps.version.outputs.version }}" + 
NVCC_THREADS: "4" + MAX_JOBS: "4" + run: python -m build --wheel --no-isolation + + - name: Verify wheel + run: | + echo "Built wheel:" + ls -lh dist/*.whl + ls dist/*.whl | grep -q "+${{ matrix.cuda }}" \ + || { echo "ERROR: wheel name missing +${{ matrix.cuda }} suffix"; exit 1; } + + - name: Upload wheel artifact + uses: actions/upload-artifact@v6 + with: + name: wheel-${{ matrix.cuda }}-${{ matrix.arch }} + path: dist/*.whl + + release: + name: Create GitHub Release + needs: [build-wheel] + runs-on: ubuntu-latest + if: startsWith(github.ref, 'refs/tags/v') + permissions: + contents: write + steps: + - name: Download all artifacts + uses: actions/download-artifact@v6 + with: + path: artifacts/ + + - name: Create release + uses: softprops/action-gh-release@v3 + with: + files: | + artifacts/wheel-*/*.whl + generate_release_notes: true + draft: true + prerelease: ${{ contains(github.ref, 'rc') || contains(github.ref, 'beta') || contains(github.ref, 'alpha') }} diff --git a/README.md b/README.md index 7bed61e2..16e8e7b6 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,18 @@ cuLA supports both **Hopper (SM90)** and **Blackwell (SM10X)** GPUs. > **Note:** The PyTorch CUDA version must match your system CUDA Toolkit version. Check with `nvcc --version` and `python -c "import torch; print(torch.version.cuda)"`. +### Pre-built Wheels + +Pre-built fat-binary wheels (SM90 + SM100 + SM103) are available on [GitHub Releases](https://github.com/inclusionAI/cuLA/releases): + +```bash +pip install cuda-linear-attention -f https://github.com/inclusionAI/cuLA/releases/expanded_assets/<tag> +``` + +Replace `<tag>` with the release version (e.g., `v0.2.0`). `pip` will select the wheel matching your local CUDA version. Or download the `.whl` file directly from the [Releases page](https://github.com/inclusionAI/cuLA/releases) and install it with `pip install <wheel>.whl`. 
+ +### Build from Source + **Clone cuLA & dependencies:** ```bash diff --git a/csrc/api/kda_sm100.cu b/csrc/api/kda_sm100.cu index 7edca370..020d90ca 100644 --- a/csrc/api/kda_sm100.cu +++ b/csrc/api/kda_sm100.cu @@ -188,4 +188,10 @@ ChunkKDAFwdRecompWU( StaticPersistentTileScheduler::Params{tile_num, params.h_v, params.heads_per_group, params.num_sm, nullptr}; kda::sm100::run_kda_fwd_recomp_w_u_sm100(params, at::cuda::getCurrentCUDAStream()); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.doc() = "cuLA SM100/SM103 kernels"; + m.def("chunk_kda_fwd_intra_cuda", &ChunkKDAFwdIntra); + m.def("recompute_w_u_cuda", &ChunkKDAFwdRecompWU); } \ No newline at end of file diff --git a/csrc/api/kda_sm90.cu b/csrc/api/kda_sm90.cu index 9e016eb1..d80df7cc 100644 --- a/csrc/api/kda_sm90.cu +++ b/csrc/api/kda_sm90.cu @@ -191,3 +191,8 @@ kda_fwd_prefill( return {output, output_state}; } + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.doc() = "cuLA SM90 kernels"; + m.def("kda_fwd_prefill", &kda_fwd_prefill); +} diff --git a/csrc/api/pybind.cu b/csrc/api/pybind.cu deleted file mode 100644 index d14a41c5..00000000 --- a/csrc/api/pybind.cu +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2025-2026 Ant Group Co., Ltd. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include -#include -#include -#include - -#if defined(CULA_SM100_ENABLED) || defined(CULA_SM103_ENABLED) -void -ChunkKDAFwdIntra( - at::Tensor q, - at::Tensor k, - at::Tensor g, - at::Tensor beta, - at::Tensor cu_seqlens, - at::Tensor chunk_indices, - at::Tensor Aqk_out, - at::Tensor Akk_out, - at::Tensor tile_counter, - float scale, - int chunk_size, - bool use_tf32_inverse, - bool unified_gref); -void -ChunkKDAFwdRecompWU( - at::Tensor k, - at::Tensor v, - at::Tensor beta, - at::Tensor A, - at::Tensor g, - at::Tensor cu_seqlens, - at::Tensor chunk_indices, - at::Tensor w_out, - at::Tensor u_out, - at::Tensor kg_out, - int chunk_size, - std::optional q, - std::optional qg_out); -#endif - -#if defined(CULA_SM90A_ENABLED) -std::tuple> -kda_fwd_prefill( - std::optional output_, - std::optional output_state_, - torch::Tensor const& q, - torch::Tensor const& k, - torch::Tensor const& v, - std::optional input_state_, - std::optional alpha_, - std::optional beta_, - torch::Tensor const& cu_seqlens, - torch::Tensor workspace_buffer, - float scale, - bool output_final_state, - bool safe_gate); -#endif - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.doc() = "cuLA"; -#if defined(CULA_SM100_ENABLED) || defined(CULA_SM103_ENABLED) - m.def("chunk_kda_fwd_intra_cuda", &ChunkKDAFwdIntra); - m.def("recompute_w_u_cuda", &ChunkKDAFwdRecompWU); -#endif -#if defined(CULA_SM90A_ENABLED) - m.def("kda_fwd_prefill", &kda_fwd_prefill); -#endif -} diff --git a/cula/__init__.py b/cula/__init__.py index 7272e289..6e13aa13 100644 --- a/cula/__init__.py +++ b/cula/__init__.py @@ -12,7 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-__version__ = "0.1.0" +try: + from cula._version import version as __version__ +except ImportError: + __version__ = "0.1.0" from cula.ops.lightning_attn_sm100 import LinearAttentionChunkwiseDecay diff --git a/cula/cudac.py b/cula/cudac.py new file mode 100644 index 00000000..287d9747 --- /dev/null +++ b/cula/cudac.py @@ -0,0 +1,65 @@ +# Copyright 2025-2026 Ant Group Co., Ltd. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unified interface to per-architecture CUDA extensions. + +Downstream code can continue to use ``import cula.cudac as cula_cuda`` +and call ``cula_cuda.kda_fwd_prefill(...)`` or +``cula_cuda.chunk_kda_fwd_intra_cuda(...)`` without knowing which +extension provides the function. 
+""" + +import importlib +import sys +from types import ModuleType + + +class _CudacProxy(ModuleType): + """Lazy proxy that exposes functions from all built arch extensions.""" + + def __init__(self): + super().__init__(__name__) + self.__path__ = [] + self._modules_loaded = False + self._funcs: dict[str, object] = {} + + def _load(self): + if self._modules_loaded: + return + self._modules_loaded = True + for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"): + try: + mod = importlib.import_module(ext_name) + for attr in dir(mod): + if not attr.startswith("_"): + self._funcs[attr] = getattr(mod, attr) + except ImportError: + pass + self.__dict__.update(self._funcs) + + def __getattr__(self, name: str): + if name.startswith("_"): + raise AttributeError(name) + self._load() + try: + return self._funcs[name] + except KeyError: + raise AttributeError(f"module 'cula.cudac' has no attribute '{name}'") from None + + def __dir__(self): + self._load() + return list(self._funcs.keys()) + + +sys.modules[__name__] = _CudacProxy() diff --git a/pyproject.toml b/pyproject.toml index ef1a531b..a6c34562 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,8 +11,8 @@ readme = "README.md" authors = [ { name = "cula contributors" } ] requires-python = ">=3.10" dependencies = [ - "nvidia-cutlass-dsl>=4.4.2", - "apache-tvm-ffi>=0.1.9", + "nvidia-cutlass-dsl==4.4.2", + "apache-tvm-ffi==0.1.9", ] license = { text = "Apache-2.0" } @@ -84,9 +84,6 @@ force-sort-within-sections = false "cula/kda/blackwell_fused_fwd.py" = ["F821"] [tool.setuptools_scm] -# write generated version into package for runtime access write_to = "cula/_version.py" -# add a date-based local suffix when needed -local_scheme = "node-and-date" -# fallback for non-git sources +local_scheme = "no-local-version" fallback_version = "0.1.0" diff --git a/scripts/build_wheel.sh b/scripts/build_wheel.sh index 42b35665..79ac3305 100755 --- a/scripts/build_wheel.sh +++ b/scripts/build_wheel.sh @@ -18,10 +18,19 @@ cd 
"$REPO_ROOT" # Parse args ISOLATION_FLAG="--no-isolation" -if [[ "${1:-}" == "--isolated" ]]; then - ISOLATION_FLAG="" - echo "[build_wheel] Using isolated build environment" -else +for arg in "$@"; do + case "$arg" in + --isolated) + ISOLATION_FLAG="" + echo "[build_wheel] Using isolated build environment" + ;; + --fat) + export CULA_BUILD_ALL_ARCHS=1 + echo "[build_wheel] Fat binary: building for all SM architectures" + ;; + esac +done +if [[ "$ISOLATION_FLAG" == "--no-isolation" ]]; then echo "[build_wheel] Using current environment (--no-isolation)" fi @@ -33,6 +42,7 @@ rm -rf dist build *.egg-info echo "[build_wheel] Python: $(python -V 2>&1)" echo "[build_wheel] torch: $(python -c 'import torch; print(torch.__version__)' 2>/dev/null || echo 'not installed')" echo "[build_wheel] CUDA: $(nvcc --version 2>/dev/null | grep 'release' | sed 's/.*release //' | sed 's/,.*//' || echo 'not found')" +echo "[build_wheel] Fat binary: ${CULA_BUILD_ALL_ARCHS:-0}" # Build wheel echo "[build_wheel] Building wheel..." diff --git a/setup.py b/setup.py index f7b11b95..0e127e81 100644 --- a/setup.py +++ b/setup.py @@ -46,13 +46,15 @@ def detect_gpu_archs() -> tuple[bool, bool, bool]: def resolve_disable_flag(env_name: str, detected: bool) -> bool: """ Resolve whether to disable a given SM target. + - If CULA_BUILD_ALL_ARCHS is set, all targets are enabled unconditionally. - If the environment variable is explicitly set, honour it. - Otherwise, disable the target when no matching GPU is detected. """ + if os.getenv("CULA_BUILD_ALL_ARCHS", "0") == "1": + return False env_val = os.getenv(env_name) if env_val is not None: return env_val.lower() in ["true", "1", "y", "yes"] - # Auto-detect: disable if no matching device found disable = not detected if disable: print(f" No matching GPU detected; auto-setting {env_name}=1 (disable). 
Set {env_name}=0 to override.") @@ -66,7 +68,11 @@ def get_features_args(): USE_FAST_MATH = os.getenv("CULA_USE_FAST_MATH", "1") == "1" -print("Detecting GPU architectures...") +if os.getenv("CULA_BUILD_ALL_ARCHS", "0") == "1": + print("CULA_BUILD_ALL_ARCHS=1: enabling all SM targets (sm90a, sm100a, sm103a)") +else: + print("Detecting GPU architectures...") + _has_sm100, _has_sm103, _has_sm90 = detect_gpu_archs() DISABLE_SM100 = resolve_disable_flag("CULA_DISABLE_SM100", _has_sm100) DISABLE_SM103 = resolve_disable_flag("CULA_DISABLE_SM103", _has_sm103) @@ -111,26 +117,6 @@ def assert_blackwell_build_env() -> None: ) -def get_arch_flags(): - major, minor = get_nvcc_version() - print(f"Compiling using NVCC {major}.{minor}") - - # Validate Blackwell build environment - assert_blackwell_build_env() - - arch_flags = [] - if not DISABLE_SM100: - arch_flags.extend(["-gencode", "arch=compute_100a,code=sm_100a"]) - arch_flags.extend(["-DCULA_SM100_ENABLED"]) - if not DISABLE_SM103: - arch_flags.extend(["-gencode", "arch=compute_103a,code=sm_103a"]) - arch_flags.extend(["-DCULA_SM103_ENABLED"]) - if not DISABLE_SM90: - arch_flags.extend(["-gencode", "arch=compute_90a,code=sm_90a"]) - arch_flags.extend(["-DCULA_SM90A_ENABLED"]) - return arch_flags - - def get_nvcc_thread_args(): nvcc_threads = os.getenv("NVCC_THREADS") or "32" return ["--threads", nvcc_threads] @@ -145,61 +131,78 @@ def get_nvcc_thread_args(): else: cxx_args = ["-O3", "-std=c++20", "-DNDEBUG", "-Wno-deprecated-declarations"] -cuda_sources = [ - "csrc/api/pybind.cu", +nvcc_common_args = [ + "-O3", + "-std=c++20", + "-DNDEBUG", + # "-D_USE_MATH_DEFINES", + "-Wno-deprecated-declarations", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_HALF2_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + "-lineinfo", + "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage", + "-diag-suppress=3189", 
] + +include_dirs = [ + Path(this_dir) / "csrc", + Path(this_dir) / "csrc" / "kerutils" / "include", + Path(this_dir) / "csrc" / "cutlass" / "include", + Path(this_dir) / "csrc" / "cutlass" / "tools" / "util" / "include", +] + +major, minor = get_nvcc_version() +print(f"Compiling using NVCC {major}.{minor}") +assert_blackwell_build_env() + +ext_modules = [] + if not DISABLE_SM100 or not DISABLE_SM103: - cuda_sources.extend( - [ - "csrc/api/kda_sm100.cu", - "csrc/kda/sm100/kda_fwd_sm100.cu", - ] - ) -if not DISABLE_SM90: - cuda_sources.extend( - [ - "csrc/api/kda_sm90.cu", - "csrc/kda/sm90/kda_fwd_sm90.cu", - "csrc/kda/sm90/kda_fwd_sm90_safe_gate.cu", - ] + sm100_arch_flags = [] + if not DISABLE_SM100: + sm100_arch_flags.extend(["-gencode", "arch=compute_100a,code=sm_100a", "-DCULA_SM100_ENABLED"]) + if not DISABLE_SM103: + sm100_arch_flags.extend(["-gencode", "arch=compute_103a,code=sm_103a", "-DCULA_SM103_ENABLED"]) + + ext_modules.append( + CUDAExtension( + name="cula._cudac_sm100", + sources=[ + "csrc/api/kda_sm100.cu", + "csrc/kda/sm100/kda_fwd_sm100.cu", + ], + extra_compile_args={ + "cxx": cxx_args + get_features_args(), + "nvcc": nvcc_common_args + get_features_args() + sm100_arch_flags + + get_nvcc_thread_args() + (["--use_fast_math"] if USE_FAST_MATH else []), + }, + include_dirs=include_dirs, + ) ) -ext_modules = [] -ext_modules.append( - CUDAExtension( - name="cula.cudac", - sources=cuda_sources, - extra_compile_args={ - "cxx": cxx_args + get_features_args(), - "nvcc": [ - "-O3", - "-std=c++20", - "-DNDEBUG", - # "-D_USE_MATH_DEFINES", - "-Wno-deprecated-declarations", - "-U__CUDA_NO_HALF_OPERATORS__", - "-U__CUDA_NO_HALF_CONVERSIONS__", - "-U__CUDA_NO_HALF2_OPERATORS__", - "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", - "--expt-relaxed-constexpr", - "--expt-extended-lambda", - "-lineinfo", - "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage", - "-diag-suppress=3189", # suppress the warning of torch in C++ 20 - ] - + 
get_features_args() - + get_arch_flags() - + get_nvcc_thread_args() - + (["--use_fast_math"] if USE_FAST_MATH else []), - }, - include_dirs=[ - Path(this_dir) / "csrc", - Path(this_dir) / "csrc" / "kerutils" / "include", - Path(this_dir) / "csrc" / "cutlass" / "include", - Path(this_dir) / "csrc" / "cutlass" / "tools" / "util" / "include", - ], +if not DISABLE_SM90: + sm90_arch_flags = ["-gencode", "arch=compute_90a,code=sm_90a", "-DCULA_SM90A_ENABLED"] + + ext_modules.append( + CUDAExtension( + name="cula._cudac_sm90", + sources=[ + "csrc/api/kda_sm90.cu", + "csrc/kda/sm90/kda_fwd_sm90.cu", + "csrc/kda/sm90/kda_fwd_sm90_safe_gate.cu", + ], + extra_compile_args={ + "cxx": cxx_args + get_features_args(), + "nvcc": nvcc_common_args + get_features_args() + sm90_arch_flags + + get_nvcc_thread_args() + (["--use_fast_math"] if USE_FAST_MATH else []), + }, + include_dirs=include_dirs, + ) ) -) setup( name="cuda-linear-attention", From 761673708d326bffe3d102aff3e09610e897e088 Mon Sep 17 00:00:00 2001 From: tongke Date: Fri, 12 Jun 2026 11:38:28 +0800 Subject: [PATCH 2/9] fix ruff lint errors --- setup.py | 14 ++++++++++---- tests/conftest.py | 7 ++----- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 0e127e81..3df211b5 100644 --- a/setup.py +++ b/setup.py @@ -177,8 +177,11 @@ def get_nvcc_thread_args(): ], extra_compile_args={ "cxx": cxx_args + get_features_args(), - "nvcc": nvcc_common_args + get_features_args() + sm100_arch_flags - + get_nvcc_thread_args() + (["--use_fast_math"] if USE_FAST_MATH else []), + "nvcc": nvcc_common_args + + get_features_args() + + sm100_arch_flags + + get_nvcc_thread_args() + + (["--use_fast_math"] if USE_FAST_MATH else []), }, include_dirs=include_dirs, ) @@ -197,8 +200,11 @@ def get_nvcc_thread_args(): ], extra_compile_args={ "cxx": cxx_args + get_features_args(), - "nvcc": nvcc_common_args + get_features_args() + sm90_arch_flags - + get_nvcc_thread_args() + (["--use_fast_math"] if USE_FAST_MATH else 
[]), + "nvcc": nvcc_common_args + + get_features_args() + + sm90_arch_flags + + get_nvcc_thread_args() + + (["--use_fast_math"] if USE_FAST_MATH else []), }, include_dirs=include_dirs, ) diff --git a/tests/conftest.py b/tests/conftest.py index f144c10b..a9338aca 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,5 @@ import re + import pytest import torch @@ -56,9 +57,5 @@ def pytest_collection_modifyitems(config, items): item.add_marker(skip_slow) continue callspec = getattr(item, "callspec", None) - if ( - callspec is not None - and callspec.params.get("disable_recompute") - and "kda_fast_norecomp" not in item.keywords - ): + if callspec is not None and callspec.params.get("disable_recompute") and "kda_fast_norecomp" not in item.keywords: item.add_marker(skip_fast_norecomp) From b148c526d513062da686aa5c00c30c67590bca16 Mon Sep 17 00:00:00 2001 From: tongke Date: Fri, 12 Jun 2026 11:52:37 +0800 Subject: [PATCH 3/9] revert version requirements changes --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a6c34562..fe93e562 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,8 +11,8 @@ readme = "README.md" authors = [ { name = "cula contributors" } ] requires-python = ">=3.10" dependencies = [ - "nvidia-cutlass-dsl==4.4.2", - "apache-tvm-ffi==0.1.9", + "nvidia-cutlass-dsl>=4.4.2", + "apache-tvm-ffi>=0.1.9", ] license = { text = "Apache-2.0" } From 2c56304dc128cb8935238a7566e91528cd9c296e Mon Sep 17 00:00:00 2001 From: tongke Date: Fri, 12 Jun 2026 12:02:35 +0800 Subject: [PATCH 4/9] Make cudac proxy thread-safe and raise on missing extensions Add double-checked locking to _CudacProxy._load() to prevent race conditions in multi-threaded environments. Raise a descriptive ImportError when no CUDA extensions can be loaded instead of silently producing AttributeError later. 
--- cula/cudac.py | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/cula/cudac.py b/cula/cudac.py index 287d9747..c256f6ef 100644 --- a/cula/cudac.py +++ b/cula/cudac.py @@ -22,6 +22,7 @@ import importlib import sys +import threading from types import ModuleType @@ -33,20 +34,31 @@ def __init__(self): self.__path__ = [] self._modules_loaded = False self._funcs: dict[str, object] = {} + self._lock = threading.Lock() def _load(self): if self._modules_loaded: return - self._modules_loaded = True - for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"): - try: - mod = importlib.import_module(ext_name) - for attr in dir(mod): - if not attr.startswith("_"): - self._funcs[attr] = getattr(mod, attr) - except ImportError: - pass - self.__dict__.update(self._funcs) + with self._lock: + if self._modules_loaded: + return + loaded_any = False + for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"): + try: + mod = importlib.import_module(ext_name) + for attr in dir(mod): + if not attr.startswith("_"): + self._funcs[attr] = getattr(mod, attr) + loaded_any = True + except ImportError: + pass + if not loaded_any: + raise ImportError( + "None of the cuLA CUDA extensions ('cula._cudac_sm100', 'cula._cudac_sm90') " + "could be imported. Please make sure cuLA is compiled correctly." + ) + self.__dict__.update(self._funcs) + self._modules_loaded = True def __getattr__(self, name: str): if name.startswith("_"): From c955d4762867c7986d58a138edac187fd803e0aa Mon Sep 17 00:00:00 2001 From: yz262713 Date: Wed, 17 Jun 2026 19:36:51 +0800 Subject: [PATCH 5/9] Surface per-extension import errors in cudac proxy The blanket `except ImportError: pass` swallowed the actual failure reason, making it impossible to diagnose missing shared libraries or build issues. Collect each extension's ImportError and include them in the raised message. 
--- cula/cudac.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cula/cudac.py b/cula/cudac.py index c256f6ef..8dfcc49b 100644 --- a/cula/cudac.py +++ b/cula/cudac.py @@ -43,6 +43,7 @@ def _load(self): if self._modules_loaded: return loaded_any = False + errors: dict[str, Exception] = {} for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"): try: mod = importlib.import_module(ext_name) @@ -50,12 +51,14 @@ def _load(self): if not attr.startswith("_"): self._funcs[attr] = getattr(mod, attr) loaded_any = True - except ImportError: - pass + except ImportError as exc: + errors[ext_name] = exc if not loaded_any: + details = "; ".join(f"{name}: {exc}" for name, exc in errors.items()) raise ImportError( - "None of the cuLA CUDA extensions ('cula._cudac_sm100', 'cula._cudac_sm90') " - "could be imported. Please make sure cuLA is compiled correctly." + "None of the cuLA CUDA extensions could be imported. " + f"Per-extension errors: [{details}]. " + "Please make sure cuLA is compiled correctly." 
) self.__dict__.update(self._funcs) self._modules_loaded = True From 6ccb8fac78c7ade3b16ed9f30f1734f55b0b0a8f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 17 Jun 2026 14:47:35 +0000 Subject: [PATCH 6/9] Fix build-release matrix with DRY expression mapping --- .github/workflows/build-release.yml | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml index cd5b54d0..ae15a184 100644 --- a/.github/workflows/build-release.yml +++ b/.github/workflows/build-release.yml @@ -13,7 +13,7 @@ concurrency: jobs: build-wheel: name: "wheel / ${{ matrix.cuda }} / cp312 / ${{ matrix.arch }}" - runs-on: ${{ matrix.runner }} + runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-latest' }} strategy: fail-fast: false matrix: @@ -23,19 +23,8 @@ jobs: arch: - x86_64 - aarch64 - include: - - cuda: cu129 - cuda_version: "12.9.0" - torch_index: "https://download.pytorch.org/whl/cu129" - - cuda: cu130 - cuda_version: "13.0.0" - torch_index: "https://download.pytorch.org/whl/cu130" - - arch: x86_64 - runner: ubuntu-latest - - arch: aarch64 - runner: ubuntu-24.04-arm container: - image: "nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu24.04" + image: "nvidia/cuda:${{ matrix.cuda == 'cu129' && '12.9.0' || '13.0.0' }}-devel-ubuntu24.04" steps: - name: Free disk space @@ -67,7 +56,7 @@ jobs: - name: Install Python dependencies run: | python -m pip install --no-cache-dir --upgrade pip - python -m pip install --no-cache-dir torch --index-url ${{ matrix.torch_index }} + python -m pip install --no-cache-dir torch --index-url ${{ matrix.cuda == 'cu129' && 'https://download.pytorch.org/whl/cu129' || 'https://download.pytorch.org/whl/cu130' }} python -m pip install --no-cache-dir setuptools wheel "setuptools_scm>=6.0" build ninja - name: Compute version From f700810b816c79af0c56351711940d3fc7d8c292 Mon Sep 17 
00:00:00 2001 From: tongke <124763920+tongke6@users.noreply.github.com> Date: Wed, 17 Jun 2026 22:51:28 +0800 Subject: [PATCH 7/9] Apply suggestions from code review Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- cula/cudac.py | 4 +++- setup.py | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/cula/cudac.py b/cula/cudac.py index 8dfcc49b..1bbaf108 100644 --- a/cula/cudac.py +++ b/cula/cudac.py @@ -77,4 +77,6 @@ def __dir__(self): return list(self._funcs.keys()) -sys.modules[__name__] = _CudacProxy() +_proxy = _CudacProxy() +_proxy.__dict__.update({k: globals().get(k) for k in ("__spec__", "__file__", "__package__", "__loader__")}) +sys.modules[__name__] = _proxy diff --git a/setup.py b/setup.py index 3df211b5..78c61e5c 100644 --- a/setup.py +++ b/setup.py @@ -164,9 +164,9 @@ def get_nvcc_thread_args(): if not DISABLE_SM100 or not DISABLE_SM103: sm100_arch_flags = [] if not DISABLE_SM100: - sm100_arch_flags.extend(["-gencode", "arch=compute_100a,code=sm_100a", "-DCULA_SM100_ENABLED"]) + sm100_arch_flags.extend(["-gencode", "arch=compute_100a,code=sm_100a"]) if not DISABLE_SM103: - sm100_arch_flags.extend(["-gencode", "arch=compute_103a,code=sm_103a", "-DCULA_SM103_ENABLED"]) + sm100_arch_flags.extend(["-gencode", "arch=compute_103a,code=sm_103a"]) ext_modules.append( CUDAExtension( From e0a1e214179fd5e53dcfe5bb2e114909e104e91b Mon Sep 17 00:00:00 2001 From: tongke <124763920+tongke6@users.noreply.github.com> Date: Wed, 17 Jun 2026 23:06:57 +0800 Subject: [PATCH 8/9] Potential fix for pull request finding Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com> --- README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 16e8e7b6..eabf435b 100644 --- a/README.md +++ b/README.md @@ -28,11 +28,9 @@ cuLA supports both **Hopper (SM90)** and **Blackwell (SM10X)** GPUs. 
Pre-built fat-binary wheels (SM90 + SM100 + SM103) are available on [GitHub Releases](https://github.com/inclusionAI/cuLA/releases): -```bash -pip install cuda-linear-attention -f https://github.com/inclusionAI/cuLA/releases/expanded_assets/<tag> -``` + pip install "cuda-linear-attention==<version>+<cuda>" -f https://github.com/inclusionAI/cuLA/releases/expanded_assets/<tag> -Replace `<tag>` with the release version (e.g., `v0.2.0`). `pip` will select the wheel matching your local CUDA version. Or download the `.whl` file directly from the [Releases page](https://github.com/inclusionAI/cuLA/releases) and install it with `pip install <wheel>.whl`. +Replace `<tag>` with the release tag (e.g., `v0.2.0`), `<version>` with the base version (e.g., `0.2.0`), and `<cuda>` with your PyTorch CUDA build tag (e.g., `cu129` or `cu130`). Or download the `.whl` file directly from the [Releases page](https://github.com/inclusionAI/cuLA/releases) and install it with `pip install <wheel>.whl`. ### Build from Source From ead4f7eb5204194854f8f3a7dbddca03a24dedbb Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 17 Jun 2026 15:14:59 +0000 Subject: [PATCH 9/9] Add README example for building fat wheels --- README.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.md b/README.md index eabf435b..7502e302 100644 --- a/README.md +++ b/README.md @@ -57,6 +57,12 @@ pip install -e third_party/flash-linear-attention pip install -e . --no-build-isolation ``` +**Build fat wheel (SM90 + SM100 + SM103):** + +```bash +CULA_BUILD_ALL_ARCHS=1 python -m build --wheel --no-isolation +``` + ## Quick Start ### KDA (Kimi Delta Attention) — Blackwell (SM10X)