Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 16 additions & 35 deletions docs/distilled_sd.md
Original file line number Diff line number Diff line change
Expand Up @@ -87,51 +87,32 @@ pipe.save_pretrained("segmindtiny-sd", safe_serialization=True)
```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path ./segmindtiny-sd \
--checkpoint_path ./segmind_tiny-sd.ckpt --half
--checkpoint_path ./segmind_tiny-sd.safetensors --half --use_safetensors
```

The file segmind_tiny-sd.ckpt will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.
The file segmind_tiny-sd.safetensors will be generated and is now ready for use with sd.cpp. You can follow a similar process for the other models mentioned above.


##### Another available .ckpt file:

* https://huggingface.co/ClashSAN/small-sd/resolve/main/tinySDdistilled.ckpt

To use this file, you must first adjust its non-contiguous tensors:

```python
import torch
ckpt = torch.load("tinySDdistilled.ckpt", map_location=torch.device('cpu'))
for key, value in ckpt['state_dict'].items():
if isinstance(value, torch.Tensor):
ckpt['state_dict'][key] = value.contiguous()
torch.save(ckpt, "tinySDdistilled_fixed.ckpt")
```


### SDXS-512
### SDXS-512-DreamShaper

Another very tiny and **incredibly fast** model is SDXS by IDKiro et al. The authors refer to it as *"Real-Time One-Step Latent Diffusion Models with Image Conditions"*. For details read the paper: https://arxiv.org/pdf/2403.16627 . Once again the authors removed some more blocks of the U-Net part, and unlike other SD1 models they use an adjusted _AutoEncoderTiny_ instead of the default _AutoEncoderKL_ for the VAE part.
##### Some ready-to-run SDXS-512 model files are available online, such as:

##### 1. Download the diffusers model from Hugging Face using Python:

```python
from diffusers import StableDiffusionPipeline
pipe = StableDiffusionPipeline.from_pretrained("IDKiro/sdxs-512-dreamshaper")
pipe.save_pretrained(save_directory="sdxs")
```
##### 2. Create a safetensors file

```bash
python convert_diffusers_to_original_stable_diffusion.py \
--model_path sdxs --checkpoint_path sdxs.safetensors --half --use_safetensors
```

##### 3. Run the model as follows:
* https://huggingface.co/akleine/sdxs-512
* https://huggingface.co/concedo/sdxs-512-tinySDdistilled-GGUF

##### Run the model as follows:
```bash
~/stable-diffusion.cpp/build/bin/sd-cli -m sdxs.safetensors -p "portrait of a lovely cat" \
--cfg-scale 1 --steps 1
```
Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here.

### SDXS-512-0.9

Even though the name "SDXS-512-0.9" is similar to "SDXS-512-DreamShaper", it is *completely different* but also **incredibly fast**. Sometimes it is preferred, so try it yourself.
##### Download a ready-to-run file from here:

* https://huggingface.co/akleine/sdxs-09

Both options: ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are mandatory here.
To use this model, both options ``` --cfg-scale 1 ``` and ``` --steps 1 ``` are again mandatory.
17 changes: 14 additions & 3 deletions src/common_block.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,7 @@ class CrossAttention : public GGMLBlock {
int64_t context_dim;
int64_t n_head;
int64_t d_head;
bool xtra_dim = false;

public:
CrossAttention(int64_t query_dim,
Expand All @@ -288,7 +289,11 @@ class CrossAttention : public GGMLBlock {
query_dim(query_dim),
context_dim(context_dim) {
int64_t inner_dim = d_head * n_head;

if (context_dim == 320 && d_head == 320) {
// LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
xtra_dim = true;
context_dim = 1024;
}
blocks["to_q"] = std::shared_ptr<GGMLBlock>(new Linear(query_dim, inner_dim, false));
blocks["to_k"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
blocks["to_v"] = std::shared_ptr<GGMLBlock>(new Linear(context_dim, inner_dim, false));
Expand All @@ -313,10 +318,16 @@ class CrossAttention : public GGMLBlock {
int64_t n_context = context->ne[1];
int64_t inner_dim = d_head * n_head;

auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
auto q = to_q->forward(ctx, x); // [N, n_token, inner_dim]
if (xtra_dim) {
// LOG_DEBUG("CrossAttention: temp set dim to 1024 for sdxs_09");
context->ne[0] = 1024; // patch dim
}
auto k = to_k->forward(ctx, context); // [N, n_context, inner_dim]
auto v = to_v->forward(ctx, context); // [N, n_context, inner_dim]

if (xtra_dim) {
context->ne[0] = 320; // reset dim to orig
}
x = ggml_ext_attention_ext(ctx->ggml_ctx, ctx->backend, q, k, v, n_head, nullptr, false, ctx->flash_attn_enabled); // [N, n_token, inner_dim]

x = to_out_0->forward(ctx, x); // [N, n_token, query_dim]
Expand Down
9 changes: 7 additions & 2 deletions src/model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1042,6 +1042,7 @@ SDVersion ModelLoader::get_sd_version() {
bool has_middle_block_1 = false;
bool has_output_block_311 = false;
bool has_output_block_71 = false;
bool has_attn_1024 = false;

for (auto& [name, tensor_storage] : tensor_storage_map) {
if (!(is_xl)) {
Expand Down Expand Up @@ -1111,6 +1112,10 @@ SDVersion ModelLoader::get_sd_version() {
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1") != std::string::npos ||
tensor_storage.name.find("unet.up_blocks.2.attentions.1") != std::string::npos) {
has_output_block_71 = true;
if (tensor_storage.name.find("model.diffusion_model.output_blocks.7.1.transformer_blocks.0.attn1.to_k.weight") != std::string::npos) {
if (tensor_storage.ne[0] == 1024)
has_attn_1024 = true;
}
}
if (tensor_storage.name == "cond_stage_model.transformer.text_model.embeddings.token_embedding.weight" ||
tensor_storage.name == "cond_stage_model.model.token_embedding.weight" ||
Expand Down Expand Up @@ -1184,7 +1189,7 @@ SDVersion ModelLoader::get_sd_version() {
}
if (!has_middle_block_1) {
if (!has_output_block_71) {
return VERSION_SDXS;
return VERSION_SDXS_512_DS;
}
return VERSION_SD1_TINY_UNET;
}
Expand All @@ -1194,7 +1199,7 @@ SDVersion ModelLoader::get_sd_version() {
return VERSION_SD2_INPAINT;
}
if (!has_middle_block_1) {
return VERSION_SD2_TINY_UNET;
return has_attn_1024 ? VERSION_SDXS_09 : VERSION_SD2_TINY_UNET;
}
return VERSION_SD2;
}
Expand Down
7 changes: 4 additions & 3 deletions src/model.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,8 @@ enum SDVersion {
VERSION_SD2,
VERSION_SD2_INPAINT,
VERSION_SD2_TINY_UNET,
VERSION_SDXS,
VERSION_SDXS_512_DS,
VERSION_SDXS_09,
VERSION_SDXL,
VERSION_SDXL_INPAINT,
VERSION_SDXL_PIX2PIX,
Expand All @@ -54,14 +55,14 @@ enum SDVersion {
};

static inline bool sd_version_is_sd1(SDVersion version) {
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS) {
if (version == VERSION_SD1 || version == VERSION_SD1_INPAINT || version == VERSION_SD1_PIX2PIX || version == VERSION_SD1_TINY_UNET || version == VERSION_SDXS_512_DS) {
return true;
}
return false;
}

static inline bool sd_version_is_sd2(SDVersion version) {
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET) {
if (version == VERSION_SD2 || version == VERSION_SD2_INPAINT || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_09) {
return true;
}
return false;
Expand Down
2 changes: 1 addition & 1 deletion src/name_conversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1120,7 +1120,7 @@ std::string convert_tensor_name(std::string name, SDVersion version) {
for (const auto& prefix : first_stage_model_prefix_vec) {
if (starts_with(name, prefix)) {
name = convert_first_stage_model_name(name.substr(prefix.size()), prefix);
if (version == VERSION_SDXS) {
if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
name = "tae." + name;
} else {
name = prefix + name;
Expand Down
49 changes: 25 additions & 24 deletions src/stable-diffusion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ const char* model_version_to_str[] = {
"SD 2.x",
"SD 2.x Inpaint",
"SD 2.x Tiny UNet",
"SDXS",
"SDXS (512-DS)",
"SDXS (09)",
"SDXL",
"SDXL Inpaint",
"SDXL Instruct-Pix2Pix",
Expand Down Expand Up @@ -789,7 +790,7 @@ class StableDiffusionGGML {
}

bool tae_preview_only = sd_ctx_params->tae_preview_only;
if (version == VERSION_SDXS) {
if (version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
tae_preview_only = false;
use_tae = true;
}
Expand All @@ -811,8 +812,8 @@ class StableDiffusionGGML {
offload_params_to_cpu,
tensor_storage_map);
diffusion_model = std::make_shared<MMDiTModel>(backend,
offload_params_to_cpu,
tensor_storage_map);
offload_params_to_cpu,
tensor_storage_map);
} else if (sd_version_is_flux(version)) {
bool is_chroma = false;
for (auto pair : tensor_storage_map) {
Expand Down Expand Up @@ -860,10 +861,10 @@ class StableDiffusionGGML {
tensor_storage_map,
version);
diffusion_model = std::make_shared<FluxModel>(backend,
offload_params_to_cpu,
tensor_storage_map,
version,
sd_ctx_params->chroma_use_dit_mask);
offload_params_to_cpu,
tensor_storage_map,
version,
sd_ctx_params->chroma_use_dit_mask);
} else if (sd_version_is_wan(version)) {
cond_stage_model = std::make_shared<T5CLIPEmbedder>(clip_backend,
offload_params_to_cpu,
Expand All @@ -872,10 +873,10 @@ class StableDiffusionGGML {
1,
true);
diffusion_model = std::make_shared<WanModel>(backend,
offload_params_to_cpu,
tensor_storage_map,
"model.diffusion_model",
version);
offload_params_to_cpu,
tensor_storage_map,
"model.diffusion_model",
version);
if (strlen(SAFE_STR(sd_ctx_params->high_noise_diffusion_model_path)) > 0) {
high_noise_diffusion_model = std::make_shared<WanModel>(backend,
offload_params_to_cpu,
Expand Down Expand Up @@ -904,29 +905,29 @@ class StableDiffusionGGML {
"",
enable_vision);
diffusion_model = std::make_shared<QwenImageModel>(backend,
offload_params_to_cpu,
tensor_storage_map,
"model.diffusion_model",
version,
sd_ctx_params->qwen_image_zero_cond_t);
offload_params_to_cpu,
tensor_storage_map,
"model.diffusion_model",
version,
sd_ctx_params->qwen_image_zero_cond_t);
} else if (sd_version_is_anima(version)) {
cond_stage_model = std::make_shared<AnimaConditioner>(clip_backend,
offload_params_to_cpu,
tensor_storage_map);
diffusion_model = std::make_shared<AnimaModel>(backend,
offload_params_to_cpu,
tensor_storage_map,
"model.diffusion_model");
offload_params_to_cpu,
tensor_storage_map,
"model.diffusion_model");
} else if (sd_version_is_z_image(version)) {
cond_stage_model = std::make_shared<LLMEmbedder>(clip_backend,
offload_params_to_cpu,
tensor_storage_map,
version);
diffusion_model = std::make_shared<ZImageModel>(backend,
offload_params_to_cpu,
tensor_storage_map,
"model.diffusion_model",
version);
offload_params_to_cpu,
tensor_storage_map,
"model.diffusion_model",
version);
} else { // SD1.x SD2.x SDXL
std::map<std::string, std::string> embbeding_map;
for (uint32_t i = 0; i < sd_ctx_params->embedding_count; i++) {
Expand Down
8 changes: 6 additions & 2 deletions src/unet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -217,11 +217,11 @@ class UnetModelBlock : public GGMLBlock {
} else if (sd_version_is_unet_edit(version)) {
in_channels = 8;
}
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS) {
if (version == VERSION_SD1_TINY_UNET || version == VERSION_SD2_TINY_UNET || version == VERSION_SDXS_512_DS || version == VERSION_SDXS_09) {
num_res_blocks = 1;
channel_mult = {1, 2, 4};
tiny_unet = true;
if (version == VERSION_SDXS) {
if (version == VERSION_SDXS_512_DS) {
attention_resolutions = {4, 2}; // here just like SDXL
}
}
Expand Down Expand Up @@ -264,6 +264,10 @@ class UnetModelBlock : public GGMLBlock {
if (version == VERSION_SVD) {
return new SpatialVideoTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
} else {
if (version == VERSION_SDXS_09 && n_head == 5) {
n_head = 1; // to carry a special case of sdxs_09 into CrossAttentionLayer,
d_head = 320; // works as long as the product remains equal (5*64 == 1*320)
}
return new SpatialTransformer(in_channels, n_head, d_head, depth, context_dim, use_linear_projection);
}
};
Expand Down
Loading