# Qwen-Image-i2L.py
from diffsynth.pipelines.qwen_image import (
QwenImagePipeline, ModelConfig,
QwenImageUnit_Image2LoRAEncode, QwenImageUnit_Image2LoRADecode
)
from diffsynth.utils.lora import merge_lora
from diffsynth import load_state_dict
from modelscope import snapshot_download
from safetensors.torch import save_file
import torch
from PIL import Image
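
# Offload configuration: model weights are offloaded to disk, onloaded to CPU in
# bfloat16, and prepared/computed on CUDA in bfloat16.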
vram_config = {
    "offload_dtype": "disk",
    "offload_device": "disk",
    "onload_dtype": torch.bfloat16,
    "onload_device": "cpu",
    "preparing_dtype": torch.bfloat16,
    "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16,
    "computation_device": "cuda",
}
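
# Variant that keeps weights on disk for both offload and onload; they are only
# brought onto CUDA in bfloat16 for preparation and computation.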
vram_config_disk_offload = {
    "offload_dtype": "disk",
    "offload_device": "disk",
    "onload_dtype": "disk",
    "onload_device": "disk",
    "preparing_dtype": torch.bfloat16,
    "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16,
    "computation_device": "cuda",
}
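

# Encode a set of style reference images into a LoRA with the i2L style model
# and save the resulting LoRA weights.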
def demo_style():
    # Load models
    pipe = QwenImagePipeline.from_pretrained(
        torch_dtype=torch.bfloat16,
        device="cuda",
        model_configs=[
            ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="SigLIP2-G384/model.safetensors", **vram_config_disk_offload),
            ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="DINOv3-7B/model.safetensors", **vram_config_disk_offload),
            ModelConfig(model_id="DiffSynth-Studio/Qwen-Image-i2L", origin_file_pattern="Qwen-Image-i2L-Style.safetensors", **vram_config_disk_offload),
        ],
        processor_config=ModelConfig(model_id="Qwen/Qwen-Image-Edit", origin_file_pattern="processor/"),
        vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
    )
    # Load images
    snapshot_download(
        model_id="DiffSynth-Studio/Qwen-Image-i2L",
        allow_file_pattern="assets/style/1/*",
        local_dir="data/examples"
    )
    images = [
        Image.open("data/examples/assets/style/1/0.jpg"),
        Image.open("data/examples/assets/style/1/1.jpg"),
        Image.open("data/examples/assets/style/1/2.jpg"),
        Image.open("data/examples/assets/style/1/3.jpg"),
        Image.open("data/examples/assets/style/1/4.jpg"),
    ]
    # Model inference
    with torch.no_grad():
        embs = QwenImageUnit_Image2LoRAEncode().process(pipe, image2lora_images=images)
        lora = QwenImageUnit_Image2LoRADecode().process(pipe, **embs)["lora"]
        save_file(lora, "model_style.safetensors")
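

# Encode reference images with the coarse and fine i2L models, decode them into
# a LoRA, merge in the pretrained bias LoRA, and save the result.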
def demo_coarse_fine_bias():
    # Load models
    pipe = QwenImagePipeline.from_pretrained(
        torch_dtype=torch.bfloat16,
        device="cuda",
        model_configs=[
            ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors", **vram_config_disk_offload),
            ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="SigLIP2-G384/model.safetensors", **vram_config_disk_offload),
            ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="DINOv3-7B/model.safetensors", **vram_config_disk_offload),
            ModelConfig(model_id="DiffSynth-Studio/Qwen-Image-i2L", origin_file_pattern="Qwen-Image-i2L-Coarse.safetensors", **vram_config_disk_offload),
            ModelConfig(model_id="DiffSynth-Studio/Qwen-Image-i2L", origin_file_pattern="Qwen-Image-i2L-Fine.safetensors", **vram_config_disk_offload),
        ],
        processor_config=ModelConfig(model_id="Qwen/Qwen-Image-Edit", origin_file_pattern="processor/"),
        vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
    )
    # Load images
    snapshot_download(
        model_id="DiffSynth-Studio/Qwen-Image-i2L",
        allow_file_pattern="assets/lora/3/*",
        local_dir="data/examples"
    )
    images = [
        Image.open("data/examples/assets/lora/3/0.jpg"),
        Image.open("data/examples/assets/lora/3/1.jpg"),
        Image.open("data/examples/assets/lora/3/2.jpg"),
        Image.open("data/examples/assets/lora/3/3.jpg"),
        Image.open("data/examples/assets/lora/3/4.jpg"),
        Image.open("data/examples/assets/lora/3/5.jpg"),
    ]
    # Model inference
    with torch.no_grad():
        embs = QwenImageUnit_Image2LoRAEncode().process(pipe, image2lora_images=images)
        lora = QwenImageUnit_Image2LoRADecode().process(pipe, **embs)["lora"]
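        # Download the pretrained bias LoRA and merge it with the predicted LoRA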
        lora_bias = ModelConfig(model_id="DiffSynth-Studio/Qwen-Image-i2L", origin_file_pattern="Qwen-Image-i2L-Bias.safetensors")
        lora_bias.download_if_necessary()
        lora_bias = load_state_dict(lora_bias.path, torch_dtype=torch.bfloat16, device="cuda")
        lora = merge_lora([lora, lora_bias])
        save_file(lora, "model_coarse_fine_bias.safetensors")
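

# Load the full Qwen-Image pipeline, apply a generated LoRA, and render a
# 1024x1024 image in 50 inference steps.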
def generate_image(lora_path, prompt, seed):
    pipe = QwenImagePipeline.from_pretrained(
        torch_dtype=torch.bfloat16,
        device="cuda",
        model_configs=[
            ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="transformer/diffusion_pytorch_model*.safetensors", **vram_config),
            ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="text_encoder/model*.safetensors", **vram_config),
            ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="vae/diffusion_pytorch_model.safetensors", **vram_config),
        ],
        tokenizer_config=ModelConfig(model_id="Qwen/Qwen-Image", origin_file_pattern="tokenizer/"),
        vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
    )
    pipe.load_lora(pipe.dit, lora_path)
    image = pipe(prompt, seed=seed, height=1024, width=1024, num_inference_steps=50)
    return image
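

# Run both demos and render one sample image with each generated LoRA.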
demo_style()
image = generate_image("model_style.safetensors", "a cat", 0)
image.save("image_1.jpg")
demo_coarse_fine_bias()
image = generate_image("model_coarse_fine_bias.safetensors", "bowl", 1)
image.save("image_2.jpg")