From e9c0ff942b29cef1fb16e0c67b92e0e79bac8025 Mon Sep 17 00:00:00 2001
From: akleine <alb.kleine@gmx.de>
Date: Wed, 18 Mar 2026 20:32:20 +0100
Subject: [PATCH 1/3] docs: updated model file info

1. Download links have been provided instead of the conversion process.
2. The ckpt references have been replaced with those on SafeTensors.
These changes were made because this file is intended for less experienced users.
They are not required to use *.ckpt and can download the models instead of creating them themselves:
---
 docs/distilled_sd.md | 40 +++++++---------------------------------
 1 file changed, 7 insertions(+), 33 deletions(-)

diff --git a/docs/distilled_sd.md b/docs/distilled_sd.md
index 3174b18f8..afe7e79d4 100644
--- a/docs/distilled_sd.md
+++ b/docs/distilled_sd.md
@@ -87,51 +87,25 @@ pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
 ```bash
 python convert_diffusers_to_original_stable_diffusion.py \
       --model_path  ./segmindtiny-sd \
-      --checkpoint_path ./segmind_tiny-sd.ckpt --half
+      --checkpoint_path ./segmind_tiny-sd.safetensors  --half --use_safetensors
 ```
 
-The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
-
-
-##### Another available .ckpt file:
-
- * https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt
-
-To use this file, you must first adjust its non-contiguous tensors:
-
-```python
-import torch
-ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
-for key, value in ckpt['state_dict'].items():
-    if isinstance(value, torch.Tensor):
-        ckpt['state_dict'][key] = value.contiguous()
-torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
-```
+The file segmind_tiny-sd.safetensors will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
 
 
 ### SDXS-512
 
 Another very tiny and **incredibly fast**  model is SDXS by IDKiro et al.  The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of U-Net part and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of default _AutoEncoderKL_ for the VAE part.
+##### Some ready-to-run SDXS-512 model files are available online, such as:
 
-##### 1. Download the diffusers model from  Hugging Face using Python:
-
-```python
-from diffusers import StableDiffusionPipeline
-pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
-pipe.save_pretrained(save_directory="sdxs")
-```
-##### 2. Create a safetensors file
-
-```bash
-python convert_diffusers_to_original_stable_diffusion.py \
-    --model_path  sdxs  --checkpoint_path sdxs.safetensors --half --use_safetensors
-```
+* https://huggingface.co/akleine/sdxs-09
+* https://huggingface.co/concedo/sdxs-512-tinySDdistilled-GGUF
 
-##### 3. Run the model as follows:
 
+##### Run the model as follows:
 ```bash
 ~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
   --cfg-scale 1 --steps 1
 ```
+Both options: ``` --cfg-scale 1 ``` and  ``` --steps 1 ``` are mandatory here.
 
-Both options: ``` --cfg-scale 1 ``` and  ``` --steps 1 ``` are mandatory here.                                                 

From 9f08bc0c61cf27bb7aff63ae766c9f9e9ddfd98f Mon Sep 17 00:00:00 2001
From: akleine <alb.kleine@gmx.de>
Date: Thu, 19 Mar 2026 10:16:29 +0100
Subject: [PATCH 2/3] feat: add support for SDXS-09

To better distinguish between both SDXS versions,
the "old" VERSION_SDXS is now called VERSION_SDXS_512_DS,
where DS stands for the "DreamShaper" edition by IDKiro.
---
 src/common_block.hpp     | 17 +++++++++++---
 src/model.cpp            |  9 ++++++--
 src/model.h              |  7 +++---
 src/name_conversion.cpp  |  2 +-
 src/stable-diffusion.cpp | 49 ++++++++++++++++++++--------------------
 src/unet.hpp             |  8 +++++--
 6 files changed, 57 insertions(+), 35 deletions(-)

diff --git a/src/common_block.hpp b/src/common_block.hpp
index 2cef389af..112a4d7a1 100644
--- a/src/common_block.hpp
+++ b/src/common_block.hpp
@@ -277,6 +277,7 @@ class CrossAttention : public GGMLBlock {
     int64_t context_dim;
     int64_t n_head;
     int64_t d_head;
+    bool xtra_dim = false;
 
 public:
     CrossAttention(int64_t query_dim,
@@ -288,7 +289,11 @@ class CrossAttention : public GGMLBlock {
           query_dim(query_dim),
           context_dim(context_dim) {
         int64_t inner_dim = d_head * n_head;
-
+        if (context_dim == 320 && d_head == 320) {
+            // LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
+            xtra_dim    = true;
+            context_dim = 1024;
+        }
         blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
         blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
         blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
@@ -313,10 +318,16 @@ class CrossAttention : public GGMLBlock {
         int64_t n_context = context->ne[1];
         int64_t inner_dim = d_head * n_head;
 
-        auto q = to_q->forward(ctx, x);        // [N, n_token, inner_dim]
+        auto q = to_q->forward(ctx, x);  // [N, n_token, inner_dim]
+        if (xtra_dim) {
+            // LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
+            context->ne[0] = 1024;  // patch dim
+        }
         auto k = to_k->forward(ctx, context);  // [N, n_context, inner_dim]
         auto v = to_v->forward(ctx, context);  // [N, n_context, inner_dim]
-
+        if (xtra_dim) {
+            context->ne[0] = 320;  // reset dim to orig
+        }
         x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled);  // [N, n_token, inner_dim]
 
         x = to_out_0->forward(ctx, x);  // [N, n_token, query_dim]
diff --git a/src/model.cpp b/src/model.cpp
index d23b97fac..f443f96bf 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -1042,6 +1042,7 @@ SDVersion ModelLoader::get_sd_version() {
     bool has_middle_block_1          = false;
     bool has_output_block_311        = false;
     bool has_output_block_71         = false;
+    bool has_attn_1024               = false;
 
     for (auto& [name, tensor_storage] : tensor_storage_map) {
         if (!(is_xl)) {
@@ -1111,6 +1112,10 @@ SDVersion ModelLoader::get_sd_version() {
         if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos ||
             tensor_storage.name.find("unet.up_blocks.2.attentions.1") != std::string::npos) {
             has_output_block_71 = true;
+            if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight") != std::string::npos) {
+                if (tensor_storage.ne[0] == 1024)
+                    has_attn_1024 = true;
+            }
         }
         if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
             tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
@@ -1184,7 +1189,7 @@ SDVersion ModelLoader::get_sd_version() {
         }
         if (!has_middle_block_1) {
             if (!has_output_block_71) {
-                return VERSION_SDXS;
+                return VERSION_SDXS_512_DS;
             }
             return VERSION_SD1_TINY_UNET;
         }
@@ -1194,7 +1199,7 @@ SDVersion ModelLoader::get_sd_version() {
             return VERSION_SD2_INPAINT;
         }
         if (!has_middle_block_1) {
-            return VERSION_SD2_TINY_UNET;
+            return has_attn_1024 ? VERSION_SDXS_09 : VERSION_SD2_TINY_UNET;
         }
         return VERSION_SD2;
     }
diff --git a/src/model.h b/src/model.h
index 3af35eb7e..c3f43f0a7 100644
--- a/src/model.h
+++ b/src/model.h
@@ -28,7 +28,8 @@ enum SDVersion {
     VERSION_SD2,
     VERSION_SD2_INPAINT,
     VERSION_SD2_TINY_UNET,
-    VERSION_SDXS,
+    VERSION_SDXS_512_DS,
+    VERSION_SDXS_09,
     VERSION_SDXL,
     VERSION_SDXL_INPAINT,
     VERSION_SDXL_PIX2PIX,
@@ -54,14 +55,14 @@ enum SDVersion {
 };
 
 static inline bool sd_version_is_sd1(SDVersion version) {
-    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS) {
+    if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS_512_DS) {
         return true;
     }
     return false;
 }
 
 static inline bool sd_version_is_sd2(SDVersion version) {
-    if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) {
+    if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_09) {
         return true;
     }
     return false;
diff --git a/src/name_conversion.cpp b/src/name_conversion.cpp
index d5d5e052c..618c7f6e9 100644
--- a/src/name_conversion.cpp
+++ b/src/name_conversion.cpp
@@ -1120,7 +1120,7 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
         for (const auto& prefix : first_stage_model_prefix_vec) {
             if (starts_with(name, prefix)) {
                 name = convert_first_stage_model_name(name.substr(prefix.size()), prefix);
-                if (version == VERSION_SDXS) {
+                if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
                     name = "tae." + name;
                 } else {
                     name = prefix + name;
diff --git a/src/stable-diffusion.cpp b/src/stable-diffusion.cpp
index bbf2f979d..5a022ddef 100644
--- a/src/stable-diffusion.cpp
+++ b/src/stable-diffusion.cpp
@@ -33,7 +33,8 @@ const char* model_version_to_str[] = {
     "SD 2.x",
     "SD 2.x Inpaint",
     "SD 2.x Tiny UNet",
-    "SDXS",
+    "SDXS (512-DS)",
+    "SDXS (09)",
     "SDXL",
     "SDXL Inpaint",
     "SDXL Instruct-Pix2Pix",
@@ -789,7 +790,7 @@ class StableDiffusionGGML {
         }
 
         bool tae_preview_only = sd_ctx_params->tae_preview_only;
-        if (version == VERSION_SDXS) {
+        if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
             tae_preview_only = false;
             use_tae          = true;
         }
@@ -811,8 +812,8 @@ class StableDiffusionGGML {
                                                                      offload_params_to_cpu,
                                                                      tensor_storage_map);
                 diffusion_model  = std::make_shared<MMDiTModel>(backend,
-                                                               offload_params_to_cpu,
-                                                               tensor_storage_map);
+                                                                offload_params_to_cpu,
+                                                                tensor_storage_map);
             } else if (sd_version_is_flux(version)) {
                 bool is_chroma = false;
                 for (auto pair : tensor_storage_map) {
@@ -860,10 +861,10 @@ class StableDiffusionGGML {
                                                                  tensor_storage_map,
                                                                  version);
                 diffusion_model  = std::make_shared<FluxModel>(backend,
-                                                              offload_params_to_cpu,
-                                                              tensor_storage_map,
-                                                              version,
-                                                              sd_ctx_params->chroma_use_dit_mask);
+                                                               offload_params_to_cpu,
+                                                               tensor_storage_map,
+                                                               version,
+                                                               sd_ctx_params->chroma_use_dit_mask);
             } else if (sd_version_is_wan(version)) {
                 cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
                                                                     offload_params_to_cpu,
@@ -872,10 +873,10 @@ class StableDiffusionGGML {
                                                                     1,
                                                                     true);
                 diffusion_model  = std::make_shared<WanModel>(backend,
-                                                             offload_params_to_cpu,
-                                                             tensor_storage_map,
-                                                             "model.diffusion_model",
-                                                             version);
+                                                              offload_params_to_cpu,
+                                                              tensor_storage_map,
+                                                              "model.diffusion_model",
+                                                              version);
                 if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
                     high_noise_diffusion_model = std::make_shared<WanModel>(backend,
                                                                             offload_params_to_cpu,
@@ -904,29 +905,29 @@ class StableDiffusionGGML {
                                                                  "",
                                                                  enable_vision);
                 diffusion_model  = std::make_shared<QwenImageModel>(backend,
-                                                                   offload_params_to_cpu,
-                                                                   tensor_storage_map,
-                                                                   "model.diffusion_model",
-                                                                   version,
-                                                                   sd_ctx_params->qwen_image_zero_cond_t);
+                                                                    offload_params_to_cpu,
+                                                                    tensor_storage_map,
+                                                                    "model.diffusion_model",
+                                                                    version,
+                                                                    sd_ctx_params->qwen_image_zero_cond_t);
             } else if (sd_version_is_anima(version)) {
                 cond_stage_model = std::make_shared<AnimaConditioner>(clip_backend,
                                                                       offload_params_to_cpu,
                                                                       tensor_storage_map);
                 diffusion_model  = std::make_shared<AnimaModel>(backend,
-                                                               offload_params_to_cpu,
-                                                               tensor_storage_map,
-                                                               "model.diffusion_model");
+                                                                offload_params_to_cpu,
+                                                                tensor_storage_map,
+                                                                "model.diffusion_model");
             } else if (sd_version_is_z_image(version)) {
                 cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
                                                                  offload_params_to_cpu,
                                                                  tensor_storage_map,
                                                                  version);
                 diffusion_model  = std::make_shared<ZImageModel>(backend,
-                                                                offload_params_to_cpu,
-                                                                tensor_storage_map,
-                                                                "model.diffusion_model",
-                                                                version);
+                                                                 offload_params_to_cpu,
+                                                                 tensor_storage_map,
+                                                                 "model.diffusion_model",
+                                                                 version);
             } else {  // SD1.x SD2.x SDXL
                 std::map<std::string, std::string> embbeding_map;
                 for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) {
diff --git a/src/unet.hpp b/src/unet.hpp
index f7aa3f05d..6a333e66f 100644
--- a/src/unet.hpp
+++ b/src/unet.hpp
@@ -217,11 +217,11 @@ class UnetModelBlock : public GGMLBlock {
         } else if (sd_version_is_unet_edit(version)) {
             in_channels = 8;
         }
-        if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) {
+        if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
             num_res_blocks = 1;
             channel_mult   = {1, 2, 4};
             tiny_unet      = true;
-            if (version == VERSION_SDXS) {
+            if (version == VERSION_SDXS_512_DS) {
                 attention_resolutions = {4, 2};  // here just like SDXL
             }
         }
@@ -264,6 +264,10 @@ class UnetModelBlock : public GGMLBlock {
             if (version == VERSION_SVD) {
                 return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
             } else {
+                if (version == VERSION_SDXS_09 && n_head == 5) {
+                    n_head = 1;    // to carry a special case of sdxs_09 into CrossAttentionLayer,
+                    d_head = 320;  // works as long the product remains equal (5*64 == 1*320)
+                }
                 return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
             }
         };

From aee59cf844469860c4cefe31b1e100a2d53d7f96 Mon Sep 17 00:00:00 2001
From: akleine <alb.kleine@gmx.de>
Date: Thu, 19 Mar 2026 10:54:31 +0100
Subject: [PATCH 3/3] docs: update for SDXS-09

---
 docs/distilled_sd.md | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/docs/distilled_sd.md b/docs/distilled_sd.md
index afe7e79d4..7aa8fbede 100644
--- a/docs/distilled_sd.md
+++ b/docs/distilled_sd.md
@@ -93,15 +93,14 @@ python convert_diffusers_to_original_stable_diffusion.py \
 The file segmind_tiny-sd.safetensors will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
 
 
-### SDXS-512
+### SDXS-512-DreamShaper
 
 Another very tiny and **incredibly fast**  model is SDXS by IDKiro et al.  The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of U-Net part and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of default _AutoEncoderKL_ for the VAE part.
 ##### Some ready-to-run SDXS-512 model files are available online, such as:
 
-* https://huggingface.co/akleine/sdxs-09
+* https://huggingface.co/akleine/sdxs-512
 * https://huggingface.co/concedo/sdxs-512-tinySDdistilled-GGUF
 
-
 ##### Run the model as follows:
 ```bash
 ~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
@@ -109,3 +108,11 @@ Another very tiny and **incredibly fast**  model is SDXS by IDKiro et al.  The a
 ```
 Both options: ``` --cfg-scale 1 ``` and  ``` --steps 1 ``` are mandatory here.
 
+### SDXS-512-0.9
+
+Even though the name "SDXS-512-0.9" is similar to "SDXS-512-DreamShaper", it is *completely different* but also **incredibly fast**. Sometimes it is preferred, so try it yourself.
+##### Download a ready-to-run file from here:
+
+* https://huggingface.co/akleine/sdxs-09
+
+For the use of this model, both options ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are again absolutely necessary.