From 057b34acf48faf3b202a13fc73430c5cba652b7a Mon Sep 17 00:00:00 2001
From: tongke <124763920+tongke6@users.noreply.github.com>
Date: Fri, 12 Jun 2026 11:27:33 +0800
Subject: [PATCH 1/9] Split CUDA extensions by SM architecture for fat-binary
 wheel builds (#83)

Replace the monolithic `cula.cudac` extension with per-arch extensions
(`cula._cudac_sm90`, `cula._cudac_sm100`) so that SM90 and SM100/SM103
kernels are compiled independently with their own `-gencode` flags. This
enables building fat-binary wheels containing all architectures without
needing the target GPU present at build time.

Key changes:
- Split pybind.cu into per-file PYBIND11_MODULE definitions
- Add `cula/cudac.py` proxy module for backwards-compatible imports
- Add `CULA_BUILD_ALL_ARCHS=1` env var to enable all SM targets
- Add `--fat` flag to build_wheel.sh for CI fat-binary builds
- Pin dependency versions and use `no-local-version` scheme for
  reproducible wheel filenames
- Use setuptools_scm for dynamic `__version__`
- Document pre-built wheel installation in README
---
 .github/workflows/build-release.yml | 125 +++++++++++++++++++++++
 README.md                           |  12 +++
 csrc/api/kda_sm100.cu               |   6 ++
 csrc/api/kda_sm90.cu                |   5 +
 csrc/api/pybind.cu                  |  80 ---------------
 cula/__init__.py                    |   5 +-
 cula/cudac.py                       |  65 ++++++++++++
 pyproject.toml                      |   9 +-
 scripts/build_wheel.sh              |  18 +++-
 setup.py                            | 147 ++++++++++++++--------------
 10 files changed, 309 insertions(+), 163 deletions(-)
 create mode 100644 .github/workflows/build-release.yml
 delete mode 100644 csrc/api/pybind.cu
 create mode 100644 cula/cudac.py

diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml
new file mode 100644
index 00000000..cd5b54d0
--- /dev/null
+++ b/.github/workflows/build-release.yml
@@ -0,0 +1,125 @@
+name: Build & Release Wheels
+
+on:
+  push:
+    tags:
+      - "v*"
+  workflow_dispatch:
+
+concurrency:
+  group: build-release-${{ github.ref }}
+  cancel-in-progress: true
+
+jobs:
+  build-wheel:
+    name: "wheel / ${{ matrix.cuda }} / cp312 / ${{ matrix.arch }}"
+    runs-on: ${{ matrix.runner }}
+    strategy:
+      fail-fast: false
+      matrix:
+        cuda:
+          - cu129
+          - cu130
+        arch:
+          - x86_64
+          - aarch64
+        include:
+          - cuda: cu129
+            cuda_version: "12.9.0"
+            torch_index: "https://download.pytorch.org/whl/cu129"
+          - cuda: cu130
+            cuda_version: "13.0.0"
+            torch_index: "https://download.pytorch.org/whl/cu130"
+          - arch: x86_64
+            runner: ubuntu-latest
+          - arch: aarch64
+            runner: ubuntu-24.04-arm
+    container:
+      image: "nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu24.04"
+
+    steps:
+      - name: Free disk space
+        run: |
+          rm -rf /opt/hostedtoolcache /usr/local/lib/android /usr/share/dotnet \
+                 /usr/local/share/boost /opt/ghc 2>/dev/null || true
+          apt-get clean 2>/dev/null || true
+          df -h / || true
+
+      - name: Install git
+        run: |
+          apt-get update && apt-get install -y --no-install-recommends git \
+            && rm -rf /var/lib/apt/lists/*
+
+      - name: Checkout
+        uses: actions/checkout@v5
+        with:
+          fetch-depth: 0
+          submodules: recursive
+
+      - name: Configure git safe directory
+        run: git config --global --add safe.directory "$GITHUB_WORKSPACE"
+
+      - name: Set up Python
+        uses: actions/setup-python@v6
+        with:
+          python-version: "3.12"
+
+      - name: Install Python dependencies
+        run: |
+          python -m pip install --no-cache-dir --upgrade pip
+          python -m pip install --no-cache-dir torch --index-url ${{ matrix.torch_index }}
+          python -m pip install --no-cache-dir setuptools wheel "setuptools_scm>=6.0" build ninja
+
+      - name: Compute version
+        id: version
+        run: |
+          if [ "${GITHUB_REF#refs/tags/v}" != "$GITHUB_REF" ]; then  # POSIX test: container steps default to sh, not bash
+            BASE="${GITHUB_REF#refs/tags/v}"
+          else
+            # Strip any local segment (+gXXX) so we get a clean base
+            BASE=$(python -c "from setuptools_scm import get_version; print(get_version().split('+')[0])")
+          fi
+          echo "version=${BASE}+${{ matrix.cuda }}" >> "$GITHUB_OUTPUT"
+
+      - name: Build fat-binary wheel
+        env:
+          CULA_BUILD_ALL_ARCHS: "1"
+          SETUPTOOLS_SCM_PRETEND_VERSION: "${{ steps.version.outputs.version }}"
+          NVCC_THREADS: "4"
+          MAX_JOBS: "4"
+        run: python -m build --wheel --no-isolation
+
+      - name: Verify wheel
+        run: |
+          echo "Built wheel:"
+          ls -lh dist/*.whl
+          ls dist/*.whl | grep -q "+${{ matrix.cuda }}" \
+            || { echo "ERROR: wheel name missing +${{ matrix.cuda }} suffix"; exit 1; }
+
+      - name: Upload wheel artifact
+        uses: actions/upload-artifact@v6
+        with:
+          name: wheel-${{ matrix.cuda }}-${{ matrix.arch }}
+          path: dist/*.whl
+
+  release:
+    name: Create GitHub Release
+    needs: [build-wheel]
+    runs-on: ubuntu-latest
+    if: startsWith(github.ref, 'refs/tags/v')
+    permissions:
+      contents: write
+    steps:
+      - name: Download all artifacts
+        uses: actions/download-artifact@v6
+        with:
+          path: artifacts/
+
+      - name: Create release
+        uses: softprops/action-gh-release@v3
+        with:
+          files: |
+            artifacts/wheel-*/*.whl
+          generate_release_notes: true
+          draft: true
+          prerelease: ${{ contains(github.ref, 'rc') || contains(github.ref, 'beta') || contains(github.ref, 'alpha') }}
diff --git a/README.md b/README.md
index 7bed61e2..16e8e7b6 100644
--- a/README.md
+++ b/README.md
@@ -24,6 +24,18 @@ cuLA supports both **Hopper (SM90)** and **Blackwell (SM10X)** GPUs.
 
 > **Note:** The PyTorch CUDA version must match your system CUDA Toolkit version. Check with `nvcc --version` and `python -c "import torch; print(torch.version.cuda)"`.
 
+### Pre-built Wheels
+
+Pre-built fat-binary wheels (SM90 + SM100 + SM103) are available on [GitHub Releases](https://github.com/inclusionAI/cuLA/releases):
+
+```bash
+pip install cuda-linear-attention -f https://github.com/inclusionAI/cuLA/releases/expanded_assets/<TAG>
+```
+
+Replace `<TAG>` with the release version (e.g., `v0.2.0`). `pip` will select the wheel matching your local CUDA version. Or download the `.whl` file directly from the [Releases page](https://github.com/inclusionAI/cuLA/releases) and install it with `pip install <filename>.whl`.
+
+### Build from Source
+
 **Clone cuLA & dependencies:**
 
 ```bash
diff --git a/csrc/api/kda_sm100.cu b/csrc/api/kda_sm100.cu
index 7edca370..020d90ca 100644
--- a/csrc/api/kda_sm100.cu
+++ b/csrc/api/kda_sm100.cu
@@ -188,4 +188,10 @@ ChunkKDAFwdRecompWU(
         StaticPersistentTileScheduler::Params{tile_num, params.h_v, params.heads_per_group, params.num_sm, nullptr};
 
     kda::sm100::run_kda_fwd_recomp_w_u_sm100(params, at::cuda::getCurrentCUDAStream());
+}
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // Python bindings for this source; setup.py builds it as cula._cudac_sm100
+    m.doc() = "cuLA SM100/SM103 kernels";
+    m.def("chunk_kda_fwd_intra_cuda", &ChunkKDAFwdIntra);  // Python name -> ChunkKDAFwdIntra
+    m.def("recompute_w_u_cuda", &ChunkKDAFwdRecompWU);  // Python name -> ChunkKDAFwdRecompWU
 }
\ No newline at end of file
diff --git a/csrc/api/kda_sm90.cu b/csrc/api/kda_sm90.cu
index 9e016eb1..d80df7cc 100644
--- a/csrc/api/kda_sm90.cu
+++ b/csrc/api/kda_sm90.cu
@@ -191,3 +191,8 @@ kda_fwd_prefill(
 
     return {output, output_state};
 }
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {  // Python bindings for this source; setup.py builds it as cula._cudac_sm90
+    m.doc() = "cuLA SM90 kernels";
+    m.def("kda_fwd_prefill", &kda_fwd_prefill);  // exposed to Python under the same name
+}
diff --git a/csrc/api/pybind.cu b/csrc/api/pybind.cu
deleted file mode 100644
index d14a41c5..00000000
--- a/csrc/api/pybind.cu
+++ /dev/null
@@ -1,80 +0,0 @@
-// Copyright 2025-2026 Ant Group Co., Ltd.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
-#include <torch/nn/functional.h>
-#include <torch/python.h>
-
-#if defined(CULA_SM100_ENABLED) || defined(CULA_SM103_ENABLED)
-void
-ChunkKDAFwdIntra(
-    at::Tensor q,
-    at::Tensor k,
-    at::Tensor g,
-    at::Tensor beta,
-    at::Tensor cu_seqlens,
-    at::Tensor chunk_indices,
-    at::Tensor Aqk_out,
-    at::Tensor Akk_out,
-    at::Tensor tile_counter,
-    float scale,
-    int chunk_size,
-    bool use_tf32_inverse,
-    bool unified_gref);
-void
-ChunkKDAFwdRecompWU(
-    at::Tensor k,
-    at::Tensor v,
-    at::Tensor beta,
-    at::Tensor A,
-    at::Tensor g,
-    at::Tensor cu_seqlens,
-    at::Tensor chunk_indices,
-    at::Tensor w_out,
-    at::Tensor u_out,
-    at::Tensor kg_out,
-    int chunk_size,
-    std::optional<at::Tensor> q,
-    std::optional<at::Tensor> qg_out);
-#endif
-
-#if defined(CULA_SM90A_ENABLED)
-std::tuple<torch::Tensor, std::optional<torch::Tensor>>
-kda_fwd_prefill(
-    std::optional<torch::Tensor> output_,
-    std::optional<torch::Tensor> output_state_,
-    torch::Tensor const& q,
-    torch::Tensor const& k,
-    torch::Tensor const& v,
-    std::optional<torch::Tensor> input_state_,
-    std::optional<torch::Tensor> alpha_,
-    std::optional<torch::Tensor> beta_,
-    torch::Tensor const& cu_seqlens,
-    torch::Tensor workspace_buffer,
-    float scale,
-    bool output_final_state,
-    bool safe_gate);
-#endif
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-    m.doc() = "cuLA";
-#if defined(CULA_SM100_ENABLED) || defined(CULA_SM103_ENABLED)
-    m.def("chunk_kda_fwd_intra_cuda", &ChunkKDAFwdIntra);
-    m.def("recompute_w_u_cuda", &ChunkKDAFwdRecompWU);
-#endif
-#if defined(CULA_SM90A_ENABLED)
-    m.def("kda_fwd_prefill", &kda_fwd_prefill);
-#endif
-}
diff --git a/cula/__init__.py b/cula/__init__.py
index 7272e289..6e13aa13 100644
--- a/cula/__init__.py
+++ b/cula/__init__.py
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-__version__ = "0.1.0"
+try:
+    from cula._version import version as __version__  # cula/_version.py is the setuptools_scm write_to target (pyproject.toml)
+except ImportError:
+    __version__ = "0.1.0"  # used when cula/_version.py is absent; same value as fallback_version in pyproject.toml
 
 from cula.ops.lightning_attn_sm100 import LinearAttentionChunkwiseDecay
 
diff --git a/cula/cudac.py b/cula/cudac.py
new file mode 100644
index 00000000..287d9747
--- /dev/null
+++ b/cula/cudac.py
@@ -0,0 +1,65 @@
+# Copyright 2025-2026 Ant Group Co., Ltd.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Unified interface to per-architecture CUDA extensions.
+
+Downstream code can continue to use ``import cula.cudac as cula_cuda``
+and call ``cula_cuda.kda_fwd_prefill(...)`` or
+``cula_cuda.chunk_kda_fwd_intra_cuda(...)`` without knowing which
+extension provides the function.
+"""
+
+import importlib
+import sys
+from types import ModuleType
+
+
+class _CudacProxy(ModuleType):
+    """Lazy proxy that exposes functions from all built arch extensions."""
+
+    def __init__(self):
+        super().__init__(__name__)
+        self.__path__ = []
+        self._modules_loaded = False
+        self._funcs: dict[str, object] = {}
+
+    def _load(self):
+        if self._modules_loaded:
+            return
+        self._modules_loaded = True
+        for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"):
+            try:
+                mod = importlib.import_module(ext_name)
+                for attr in dir(mod):
+                    if not attr.startswith("_"):
+                        self._funcs[attr] = getattr(mod, attr)
+            except ImportError:
+                pass
+        self.__dict__.update(self._funcs)
+
+    def __getattr__(self, name: str):
+        if name.startswith("_"):
+            raise AttributeError(name)
+        self._load()
+        try:
+            return self._funcs[name]
+        except KeyError:
+            raise AttributeError(f"module 'cula.cudac' has no attribute '{name}'") from None
+
+    def __dir__(self):
+        self._load()
+        return list(self._funcs.keys())
+
+
+sys.modules[__name__] = _CudacProxy()
diff --git a/pyproject.toml b/pyproject.toml
index ef1a531b..a6c34562 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,8 +11,8 @@ readme = "README.md"
 authors = [ { name = "cula contributors" } ]
 requires-python = ">=3.10"
 dependencies = [
-    "nvidia-cutlass-dsl>=4.4.2",
-    "apache-tvm-ffi>=0.1.9",
+    "nvidia-cutlass-dsl==4.4.2",
+    "apache-tvm-ffi==0.1.9",
 ]
 license = { text = "Apache-2.0" }
 
@@ -84,9 +84,6 @@ force-sort-within-sections = false
 "cula/kda/blackwell_fused_fwd.py" = ["F821"]
 
 [tool.setuptools_scm]
-# write generated version into package for runtime access
 write_to = "cula/_version.py"
-# add a date-based local suffix when needed
-local_scheme = "node-and-date"
-# fallback for non-git sources
+local_scheme = "no-local-version"
 fallback_version = "0.1.0"
diff --git a/scripts/build_wheel.sh b/scripts/build_wheel.sh
index 42b35665..79ac3305 100755
--- a/scripts/build_wheel.sh
+++ b/scripts/build_wheel.sh
@@ -18,10 +18,19 @@ cd "$REPO_ROOT"
 
 # Parse args
 ISOLATION_FLAG="--no-isolation"
-if [[ "${1:-}" == "--isolated" ]]; then
-    ISOLATION_FLAG=""
-    echo "[build_wheel] Using isolated build environment"
-else
+for arg in "$@"; do  # flags may be given in any order; arguments matching no case below are ignored
+    case "$arg" in
+        --isolated)
+            ISOLATION_FLAG=""  # empty flag: the default --no-isolation is dropped
+            echo "[build_wheel] Using isolated build environment"
+            ;;
+        --fat)
+            export CULA_BUILD_ALL_ARCHS=1  # read by setup.py to enable every SM target
+            echo "[build_wheel] Fat binary: building for all SM architectures"
+            ;;
+    esac
+done
+if [[ "$ISOLATION_FLAG" == "--no-isolation" ]]; then
     echo "[build_wheel] Using current environment (--no-isolation)"
 fi
 
@@ -33,6 +42,7 @@ rm -rf dist build *.egg-info
 echo "[build_wheel] Python: $(python -V 2>&1)"
 echo "[build_wheel] torch:  $(python -c 'import torch; print(torch.__version__)' 2>/dev/null || echo 'not installed')"
 echo "[build_wheel] CUDA:   $(nvcc --version 2>/dev/null | grep 'release' | sed 's/.*release //' | sed 's/,.*//' || echo 'not found')"
+echo "[build_wheel] Fat binary: ${CULA_BUILD_ALL_ARCHS:-0}"
 
 # Build wheel
 echo "[build_wheel] Building wheel..."
diff --git a/setup.py b/setup.py
index f7b11b95..0e127e81 100644
--- a/setup.py
+++ b/setup.py
@@ -46,13 +46,15 @@ def detect_gpu_archs() -> tuple[bool, bool, bool]:
 def resolve_disable_flag(env_name: str, detected: bool) -> bool:
     """
     Resolve whether to disable a given SM target.
+    - If CULA_BUILD_ALL_ARCHS is set, all targets are enabled unconditionally.
     - If the environment variable is explicitly set, honour it.
     - Otherwise, disable the target when no matching GPU is detected.
     """
+    if os.getenv("CULA_BUILD_ALL_ARCHS", "0") == "1":
+        return False
     env_val = os.getenv(env_name)
     if env_val is not None:
         return env_val.lower() in ["true", "1", "y", "yes"]
-    # Auto-detect: disable if no matching device found
     disable = not detected
     if disable:
         print(f"  No matching GPU detected; auto-setting {env_name}=1 (disable). Set {env_name}=0 to override.")
@@ -66,7 +68,11 @@ def get_features_args():
 
 USE_FAST_MATH = os.getenv("CULA_USE_FAST_MATH", "1") == "1"
 
-print("Detecting GPU architectures...")
+if os.getenv("CULA_BUILD_ALL_ARCHS", "0") == "1":
+    print("CULA_BUILD_ALL_ARCHS=1: enabling all SM targets (sm90a, sm100a, sm103a)")
+else:
+    print("Detecting GPU architectures...")
+
 _has_sm100, _has_sm103, _has_sm90 = detect_gpu_archs()
 DISABLE_SM100 = resolve_disable_flag("CULA_DISABLE_SM100", _has_sm100)
 DISABLE_SM103 = resolve_disable_flag("CULA_DISABLE_SM103", _has_sm103)
@@ -111,26 +117,6 @@ def assert_blackwell_build_env() -> None:
         )
 
 
-def get_arch_flags():
-    major, minor = get_nvcc_version()
-    print(f"Compiling using NVCC {major}.{minor}")
-
-    # Validate Blackwell build environment
-    assert_blackwell_build_env()
-
-    arch_flags = []
-    if not DISABLE_SM100:
-        arch_flags.extend(["-gencode", "arch=compute_100a,code=sm_100a"])
-        arch_flags.extend(["-DCULA_SM100_ENABLED"])
-    if not DISABLE_SM103:
-        arch_flags.extend(["-gencode", "arch=compute_103a,code=sm_103a"])
-        arch_flags.extend(["-DCULA_SM103_ENABLED"])
-    if not DISABLE_SM90:
-        arch_flags.extend(["-gencode", "arch=compute_90a,code=sm_90a"])
-        arch_flags.extend(["-DCULA_SM90A_ENABLED"])
-    return arch_flags
-
-
 def get_nvcc_thread_args():
     nvcc_threads = os.getenv("NVCC_THREADS") or "32"
     return ["--threads", nvcc_threads]
@@ -145,61 +131,78 @@ def get_nvcc_thread_args():
 else:
     cxx_args = ["-O3", "-std=c++20", "-DNDEBUG", "-Wno-deprecated-declarations"]
 
-cuda_sources = [
-    "csrc/api/pybind.cu",
+nvcc_common_args = [
+    "-O3",
+    "-std=c++20",
+    "-DNDEBUG",
+    # "-D_USE_MATH_DEFINES",
+    "-Wno-deprecated-declarations",
+    "-U__CUDA_NO_HALF_OPERATORS__",
+    "-U__CUDA_NO_HALF_CONVERSIONS__",
+    "-U__CUDA_NO_HALF2_OPERATORS__",
+    "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
+    "--expt-relaxed-constexpr",
+    "--expt-extended-lambda",
+    "-lineinfo",
+    "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage",
+    "-diag-suppress=3189",  # suppress the warning of torch in C++ 20
 ]
+
+include_dirs = [
+    Path(this_dir) / "csrc",
+    Path(this_dir) / "csrc" / "kerutils" / "include",
+    Path(this_dir) / "csrc" / "cutlass" / "include",
+    Path(this_dir) / "csrc" / "cutlass" / "tools" / "util" / "include",
+]
+
+major, minor = get_nvcc_version()
+print(f"Compiling using NVCC {major}.{minor}")
+assert_blackwell_build_env()
+
+ext_modules = []
+
 if not DISABLE_SM100 or not DISABLE_SM103:
-    cuda_sources.extend(
-        [
-            "csrc/api/kda_sm100.cu",
-            "csrc/kda/sm100/kda_fwd_sm100.cu",
-        ]
-    )
-if not DISABLE_SM90:
-    cuda_sources.extend(
-        [
-            "csrc/api/kda_sm90.cu",
-            "csrc/kda/sm90/kda_fwd_sm90.cu",
-            "csrc/kda/sm90/kda_fwd_sm90_safe_gate.cu",
-        ]
+    sm100_arch_flags = []
+    if not DISABLE_SM100:
+        sm100_arch_flags.extend(["-gencode", "arch=compute_100a,code=sm_100a", "-DCULA_SM100_ENABLED"])
+    if not DISABLE_SM103:
+        sm100_arch_flags.extend(["-gencode", "arch=compute_103a,code=sm_103a", "-DCULA_SM103_ENABLED"])
+
+    ext_modules.append(
+        CUDAExtension(
+            name="cula._cudac_sm100",
+            sources=[
+                "csrc/api/kda_sm100.cu",
+                "csrc/kda/sm100/kda_fwd_sm100.cu",
+            ],
+            extra_compile_args={
+                "cxx": cxx_args + get_features_args(),
+                "nvcc": nvcc_common_args + get_features_args() + sm100_arch_flags
+                + get_nvcc_thread_args() + (["--use_fast_math"] if USE_FAST_MATH else []),
+            },
+            include_dirs=include_dirs,
+        )
     )
 
-ext_modules = []
-ext_modules.append(
-    CUDAExtension(
-        name="cula.cudac",
-        sources=cuda_sources,
-        extra_compile_args={
-            "cxx": cxx_args + get_features_args(),
-            "nvcc": [
-                "-O3",
-                "-std=c++20",
-                "-DNDEBUG",
-                # "-D_USE_MATH_DEFINES",
-                "-Wno-deprecated-declarations",
-                "-U__CUDA_NO_HALF_OPERATORS__",
-                "-U__CUDA_NO_HALF_CONVERSIONS__",
-                "-U__CUDA_NO_HALF2_OPERATORS__",
-                "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
-                "--expt-relaxed-constexpr",
-                "--expt-extended-lambda",
-                "-lineinfo",
-                "--ptxas-options=--verbose,--register-usage-level=10,--warn-on-local-memory-usage",
-                "-diag-suppress=3189",  # suppress the warning of torch in C++ 20
-            ]
-            + get_features_args()
-            + get_arch_flags()
-            + get_nvcc_thread_args()
-            + (["--use_fast_math"] if USE_FAST_MATH else []),
-        },
-        include_dirs=[
-            Path(this_dir) / "csrc",
-            Path(this_dir) / "csrc" / "kerutils" / "include",
-            Path(this_dir) / "csrc" / "cutlass" / "include",
-            Path(this_dir) / "csrc" / "cutlass" / "tools" / "util" / "include",
-        ],
+if not DISABLE_SM90:
+    sm90_arch_flags = ["-gencode", "arch=compute_90a,code=sm_90a", "-DCULA_SM90A_ENABLED"]
+
+    ext_modules.append(
+        CUDAExtension(
+            name="cula._cudac_sm90",
+            sources=[
+                "csrc/api/kda_sm90.cu",
+                "csrc/kda/sm90/kda_fwd_sm90.cu",
+                "csrc/kda/sm90/kda_fwd_sm90_safe_gate.cu",
+            ],
+            extra_compile_args={
+                "cxx": cxx_args + get_features_args(),
+                "nvcc": nvcc_common_args + get_features_args() + sm90_arch_flags
+                + get_nvcc_thread_args() + (["--use_fast_math"] if USE_FAST_MATH else []),
+            },
+            include_dirs=include_dirs,
+        )
     )
-)
 
 setup(
     name="cuda-linear-attention",

From 761673708d326bffe3d102aff3e09610e897e088 Mon Sep 17 00:00:00 2001
From: tongke <shunde.csd@antgroup.com>
Date: Fri, 12 Jun 2026 11:38:28 +0800
Subject: [PATCH 2/9] fix ruff lint errors

---
 setup.py          | 14 ++++++++++----
 tests/conftest.py |  7 ++-----
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/setup.py b/setup.py
index 0e127e81..3df211b5 100644
--- a/setup.py
+++ b/setup.py
@@ -177,8 +177,11 @@ def get_nvcc_thread_args():
             ],
             extra_compile_args={
                 "cxx": cxx_args + get_features_args(),
-                "nvcc": nvcc_common_args + get_features_args() + sm100_arch_flags
-                + get_nvcc_thread_args() + (["--use_fast_math"] if USE_FAST_MATH else []),
+                "nvcc": nvcc_common_args
+                + get_features_args()
+                + sm100_arch_flags
+                + get_nvcc_thread_args()
+                + (["--use_fast_math"] if USE_FAST_MATH else []),
             },
             include_dirs=include_dirs,
         )
@@ -197,8 +200,11 @@ def get_nvcc_thread_args():
             ],
             extra_compile_args={
                 "cxx": cxx_args + get_features_args(),
-                "nvcc": nvcc_common_args + get_features_args() + sm90_arch_flags
-                + get_nvcc_thread_args() + (["--use_fast_math"] if USE_FAST_MATH else []),
+                "nvcc": nvcc_common_args
+                + get_features_args()
+                + sm90_arch_flags
+                + get_nvcc_thread_args()
+                + (["--use_fast_math"] if USE_FAST_MATH else []),
             },
             include_dirs=include_dirs,
         )
diff --git a/tests/conftest.py b/tests/conftest.py
index f144c10b..a9338aca 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,4 +1,5 @@
 import re
+
 import pytest
 import torch
 
@@ -56,9 +57,5 @@ def pytest_collection_modifyitems(config, items):
             item.add_marker(skip_slow)
             continue
         callspec = getattr(item, "callspec", None)
-        if (
-            callspec is not None
-            and callspec.params.get("disable_recompute")
-            and "kda_fast_norecomp" not in item.keywords
-        ):
+        if callspec is not None and callspec.params.get("disable_recompute") and "kda_fast_norecomp" not in item.keywords:
             item.add_marker(skip_fast_norecomp)

From b148c526d513062da686aa5c00c30c67590bca16 Mon Sep 17 00:00:00 2001
From: tongke <shunde.csd@antgroup.com>
Date: Fri, 12 Jun 2026 11:52:37 +0800
Subject: [PATCH 3/9] revert version requirements changes

---
 pyproject.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index a6c34562..fe93e562 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,8 +11,8 @@ readme = "README.md"
 authors = [ { name = "cula contributors" } ]
 requires-python = ">=3.10"
 dependencies = [
-    "nvidia-cutlass-dsl==4.4.2",
-    "apache-tvm-ffi==0.1.9",
+    "nvidia-cutlass-dsl>=4.4.2",
+    "apache-tvm-ffi>=0.1.9",
 ]
 license = { text = "Apache-2.0" }
 

From 2c56304dc128cb8935238a7566e91528cd9c296e Mon Sep 17 00:00:00 2001
From: tongke <shunde.csd@antgroup.com>
Date: Fri, 12 Jun 2026 12:02:35 +0800
Subject: [PATCH 4/9] Make cudac proxy thread-safe and raise on missing
 extensions

Add double-checked locking to _CudacProxy._load() to prevent race
conditions in multi-threaded environments. Raise a descriptive
ImportError when no CUDA extensions can be loaded instead of silently
producing AttributeError later.
---
 cula/cudac.py | 32 ++++++++++++++++++++++----------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/cula/cudac.py b/cula/cudac.py
index 287d9747..c256f6ef 100644
--- a/cula/cudac.py
+++ b/cula/cudac.py
@@ -22,6 +22,7 @@
 
 import importlib
 import sys
+import threading
 from types import ModuleType
 
 
@@ -33,20 +34,31 @@ def __init__(self):
         self.__path__ = []
         self._modules_loaded = False
         self._funcs: dict[str, object] = {}
+        self._lock = threading.Lock()
 
     def _load(self):
         if self._modules_loaded:
             return
-        self._modules_loaded = True
-        for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"):
-            try:
-                mod = importlib.import_module(ext_name)
-                for attr in dir(mod):
-                    if not attr.startswith("_"):
-                        self._funcs[attr] = getattr(mod, attr)
-            except ImportError:
-                pass
-        self.__dict__.update(self._funcs)
+        with self._lock:
+            if self._modules_loaded:
+                return
+            loaded_any = False
+            for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"):
+                try:
+                    mod = importlib.import_module(ext_name)
+                    for attr in dir(mod):
+                        if not attr.startswith("_"):
+                            self._funcs[attr] = getattr(mod, attr)
+                    loaded_any = True
+                except ImportError:
+                    pass
+            if not loaded_any:
+                raise ImportError(
+                    "None of the cuLA CUDA extensions ('cula._cudac_sm100', 'cula._cudac_sm90') "
+                    "could be imported. Please make sure cuLA is compiled correctly."
+                )
+            self.__dict__.update(self._funcs)
+            self._modules_loaded = True
 
     def __getattr__(self, name: str):
         if name.startswith("_"):

From c955d4762867c7986d58a138edac187fd803e0aa Mon Sep 17 00:00:00 2001
From: yz262713 <yz262713@antgroup.com>
Date: Wed, 17 Jun 2026 19:36:51 +0800
Subject: [PATCH 5/9] Surface per-extension import errors in cudac proxy

The blanket `except ImportError: pass` swallowed the actual failure
reason, making it impossible to diagnose missing shared libraries or
build issues. Collect each extension's ImportError and include them
in the raised message.
---
 cula/cudac.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/cula/cudac.py b/cula/cudac.py
index c256f6ef..8dfcc49b 100644
--- a/cula/cudac.py
+++ b/cula/cudac.py
@@ -43,6 +43,7 @@ def _load(self):
             if self._modules_loaded:
                 return
             loaded_any = False
+            errors: dict[str, Exception] = {}
             for ext_name in ("cula._cudac_sm100", "cula._cudac_sm90"):
                 try:
                     mod = importlib.import_module(ext_name)
@@ -50,12 +51,14 @@ def _load(self):
                         if not attr.startswith("_"):
                             self._funcs[attr] = getattr(mod, attr)
                     loaded_any = True
-                except ImportError:
-                    pass
+                except ImportError as exc:
+                    errors[ext_name] = exc
             if not loaded_any:
+                details = "; ".join(f"{name}: {exc}" for name, exc in errors.items())
                 raise ImportError(
-                    "None of the cuLA CUDA extensions ('cula._cudac_sm100', 'cula._cudac_sm90') "
-                    "could be imported. Please make sure cuLA is compiled correctly."
+                    "None of the cuLA CUDA extensions could be imported. "
+                    f"Per-extension errors: [{details}]. "
+                    "Please make sure cuLA is compiled correctly."
                 )
             self.__dict__.update(self._funcs)
             self._modules_loaded = True

From 6ccb8fac78c7ade3b16ed9f30f1734f55b0b0a8f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 17 Jun 2026 14:47:35 +0000
Subject: [PATCH 6/9] Fix build-release matrix with DRY expression mapping

---
 .github/workflows/build-release.yml | 17 +++--------------
 1 file changed, 3 insertions(+), 14 deletions(-)

diff --git a/.github/workflows/build-release.yml b/.github/workflows/build-release.yml
index cd5b54d0..ae15a184 100644
--- a/.github/workflows/build-release.yml
+++ b/.github/workflows/build-release.yml
@@ -13,7 +13,7 @@ concurrency:
 jobs:
   build-wheel:
     name: "wheel / ${{ matrix.cuda }} / cp312 / ${{ matrix.arch }}"
-    runs-on: ${{ matrix.runner }}
+    runs-on: ${{ matrix.arch == 'aarch64' && 'ubuntu-24.04-arm' || 'ubuntu-latest' }}
     strategy:
       fail-fast: false
       matrix:
@@ -23,19 +23,8 @@ jobs:
         arch:
           - x86_64
           - aarch64
-        include:
-          - cuda: cu129
-            cuda_version: "12.9.0"
-            torch_index: "https://download.pytorch.org/whl/cu129"
-          - cuda: cu130
-            cuda_version: "13.0.0"
-            torch_index: "https://download.pytorch.org/whl/cu130"
-          - arch: x86_64
-            runner: ubuntu-latest
-          - arch: aarch64
-            runner: ubuntu-24.04-arm
     container:
-      image: "nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu24.04"
+      image: "nvidia/cuda:${{ matrix.cuda == 'cu129' && '12.9.0' || '13.0.0' }}-devel-ubuntu24.04"
 
     steps:
       - name: Free disk space
@@ -67,7 +56,7 @@ jobs:
       - name: Install Python dependencies
         run: |
           python -m pip install --no-cache-dir --upgrade pip
-          python -m pip install --no-cache-dir torch --index-url ${{ matrix.torch_index }}
+          python -m pip install --no-cache-dir torch --index-url "https://download.pytorch.org/whl/${{ matrix.cuda }}"
           python -m pip install --no-cache-dir setuptools wheel "setuptools_scm>=6.0" build ninja
 
       - name: Compute version

From f700810b816c79af0c56351711940d3fc7d8c292 Mon Sep 17 00:00:00 2001
From: tongke <124763920+tongke6@users.noreply.github.com>
Date: Wed, 17 Jun 2026 22:51:28 +0800
Subject: [PATCH 7/9] Apply suggestions from code review

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 cula/cudac.py | 4 +++-
 setup.py      | 4 ++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/cula/cudac.py b/cula/cudac.py
index 8dfcc49b..1bbaf108 100644
--- a/cula/cudac.py
+++ b/cula/cudac.py
@@ -77,4 +77,6 @@ def __dir__(self):
         return list(self._funcs.keys())
 
 
-sys.modules[__name__] = _CudacProxy()
+_proxy = _CudacProxy()
+_proxy.__dict__.update({k: globals().get(k) for k in ("__spec__", "__file__", "__package__", "__loader__")})
+sys.modules[__name__] = _proxy
diff --git a/setup.py b/setup.py
index 3df211b5..78c61e5c 100644
--- a/setup.py
+++ b/setup.py
@@ -164,9 +164,9 @@ def get_nvcc_thread_args():
 if not DISABLE_SM100 or not DISABLE_SM103:
     sm100_arch_flags = []
     if not DISABLE_SM100:
-        sm100_arch_flags.extend(["-gencode", "arch=compute_100a,code=sm_100a", "-DCULA_SM100_ENABLED"])
+        sm100_arch_flags.extend(["-gencode", "arch=compute_100a,code=sm_100a"])
     if not DISABLE_SM103:
-        sm100_arch_flags.extend(["-gencode", "arch=compute_103a,code=sm_103a", "-DCULA_SM103_ENABLED"])
+        sm100_arch_flags.extend(["-gencode", "arch=compute_103a,code=sm_103a"])
 
     ext_modules.append(
         CUDAExtension(

From e0a1e214179fd5e53dcfe5bb2e114909e104e91b Mon Sep 17 00:00:00 2001
From: tongke <124763920+tongke6@users.noreply.github.com>
Date: Wed, 17 Jun 2026 23:06:57 +0800
Subject: [PATCH 8/9] Potential fix for pull request finding

Co-authored-by: Copilot Autofix powered by AI <175728472+Copilot@users.noreply.github.com>
---
 README.md | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 16e8e7b6..eabf435b 100644
--- a/README.md
+++ b/README.md
@@ -28,11 +28,9 @@ cuLA supports both **Hopper (SM90)** and **Blackwell (SM10X)** GPUs.
 
 Pre-built fat-binary wheels (SM90 + SM100 + SM103) are available on [GitHub Releases](https://github.com/inclusionAI/cuLA/releases):
 
-```bash
-pip install cuda-linear-attention -f https://github.com/inclusionAI/cuLA/releases/expanded_assets/<TAG>
-```
+    pip install "cuda-linear-attention==<VERSION>+<CUDA_TAG>" -f https://github.com/inclusionAI/cuLA/releases/expanded_assets/<TAG>
 
-Replace `<TAG>` with the release version (e.g., `v0.2.0`). `pip` will select the wheel matching your local CUDA version. Or download the `.whl` file directly from the [Releases page](https://github.com/inclusionAI/cuLA/releases) and install it with `pip install <filename>.whl`.
+Replace `<TAG>` with the release tag (e.g., `v0.2.0`), `<VERSION>` with the base version (e.g., `0.2.0`), and `<CUDA_TAG>` with your PyTorch CUDA build tag (e.g., `cu129` or `cu130`). Or download the `.whl` file directly from the [Releases page](https://github.com/inclusionAI/cuLA/releases) and install it with `pip install <filename>.whl`.
 
 ### Build from Source
 

From ead4f7eb5204194854f8f3a7dbddca03a24dedbb Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 17 Jun 2026 15:14:59 +0000
Subject: [PATCH 9/9] Add README example for building fat wheels

---
 README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/README.md b/README.md
index eabf435b..7502e302 100644
--- a/README.md
+++ b/README.md
@@ -57,6 +57,12 @@ pip install -e third_party/flash-linear-attention
 pip install -e . --no-build-isolation
 ```
 
+**Build fat wheel (SM90 + SM100 + SM103):**
+
+```bash
+CULA_BUILD_ALL_ARCHS=1 python -m build --wheel --no-isolation
+```
+
 ## Quick Start
 
 ### KDA (Kimi Delta Attention) — Blackwell (SM10X)