diff --git a/README.md b/README.md index 6fb8b98..e9df136 100644 --- a/README.md +++ b/README.md @@ -33,6 +33,7 @@ This repo contains PyTorch model definitions, pretrained weights and inference/s ## 🔥🔥🔥 Latest Updates +- October 31, 2025: 🚀 Optimized FP8 inference now available via [AngelSlim](https://github.com/Tencent/AngelSlim), delivering up to 40% faster generation! - September 18, 2025: ✨ Try the [PromptEnhancer-32B model](https://huggingface.co/PromptEnhancer/PromptEnhancer-32B) for higher-quality prompt enhancement!​. - September 18, 2025: ✨ [ComfyUI workflow of HunyuanImage-2.1](https://github.com/KimbingNg/ComfyUI-HunyuanImage2.1) is available now! - September 16, 2025: 👑 We achieved the Top1 on Arena's leaderboard for text-to-image open-source models. [Leaderboard](https://artificialanalysis.ai/text-to-image/arena/leaderboard-text) @@ -122,6 +123,14 @@ Additionally, we **highly recommend** using the full generation pipeline for bet | Distilled text-to-image Model | hunyuanimage2.1-distilled | Distilled model for faster inference | 8 | 3.25 | 4 | | Refiner | hunyuanimage-refiner | The refiner model | N/A | N/A | N/A | +### FP8 Inference + +Enable FP8 quantized inference with [AngelSlim](https://github.com/Tencent/AngelSlim) to significantly reduce GPU memory usage and speed up generation. +- **Recommended:** `fp8_mode="weight_only"` for optimal image quality and memory savings +- **Alternative:** `fp8_mode="w8a8"` for maximum speed and lower memory usage (may slightly lower quality) +- `use_fp8=True` is deprecated and now maps to `"weight_only"`. 
+ +### Example: ```python import os @@ -131,7 +140,9 @@ from hyimage.diffusion.pipelines.hunyuanimage_pipeline import HunyuanImagePipeli # Supported model_name: hunyuanimage-v2.1, hunyuanimage-v2.1-distilled model_name = "hunyuanimage-v2.1" -pipe = HunyuanImagePipeline.from_pretrained(model_name=model_name, use_fp8=True) +# Supported fp8_mode: weight_only, w8a8 +# pipe = HunyuanImagePipeline.from_pretrained(model_name=model_name, fp8_mode="weight_only") +pipe = HunyuanImagePipeline.from_pretrained(model_name=model_name, fp8_mode="w8a8") pipe = pipe.to("cuda") # The input prompt diff --git a/README_CN.md b/README_CN.md index c544e34..0098666 100644 --- a/README_CN.md +++ b/README_CN.md @@ -31,6 +31,7 @@ ## 🔥🔥🔥 最新动态 +- 2025 年 10 月 31 日:🚀 通过集成 [AngelSlim](https://github.com/Tencent/AngelSlim) 优化 FP8 推理,最高可带来约 40% 的生成加速! - 2025 年 9 月 18 日:✨ 欢迎体验 [PromptEnhancer-32B 模型](https://huggingface.co/PromptEnhancer/PromptEnhancer-32B) 以获得更高质量的提示词增强! - 2025 年 9 月 18 日:✨ [HunyuanImage-2.1 的 ComfyUI 工作流](https://github.com/KimbingNg/ComfyUI-HunyuanImage2.1) 现已开放体验! 
- 2025 年 9 月 16 日:👑 我们在 Arena 文生图开源模型排行榜上获得第一名![排行榜](https://artificialanalysis.ai/text-to-image/arena/leaderboard-text) @@ -110,6 +111,15 @@ HunyuanImage-2.1 **仅支持 2K** 图像生成(如 1:1 时为 2048x2048,16:9 | 蒸馏文生图模型 | hunyuanimage2.1-distilled | 蒸馏模型,推理更快 | 8 | 3.25 | 4 | | 精修模型 | hunyuanimage-refiner | 精修模型 | N/A | N/A | N/A | +### FP8 推理 + +借助 [AngelSlim](https://github.com/Tencent/AngelSlim) 可以显著降低显存占用并提升生成速度: +- **推荐:** 将 `fp8_mode` 设为 `"weight_only"`,在画质与显存之间取得最佳平衡; +- **可选:** `"w8a8"` 模式提供更高速度与更低显存占用,但画质可能略有下降; +- `use_fp8=True` 将被弃用,现已等价映射为 `"weight_only"`。 + + +### 示例 ```python import os @@ -119,7 +129,9 @@ from hyimage.diffusion.pipelines.hunyuanimage_pipeline import HunyuanImagePipeli # 支持的 model_name:hunyuanimage-v2.1, hunyuanimage-v2.1-distilled model_name = "hunyuanimage-v2.1" -pipe = HunyuanImagePipeline.from_pretrained(model_name=model_name, use_fp8=True) +# 支持的 fp8_mode:"weight_only"(推荐),"w8a8"(更快) +# pipe = HunyuanImagePipeline.from_pretrained(model_name=model_name, fp8_mode="weight_only") +pipe = HunyuanImagePipeline.from_pretrained(model_name=model_name, fp8_mode="w8a8") pipe = pipe.to("cuda") # 输入提示词 diff --git a/hyimage/diffusion/pipelines/hunyuanimage_pipeline.py b/hyimage/diffusion/pipelines/hunyuanimage_pipeline.py index 4fbeeff..e61e707 100644 --- a/hyimage/diffusion/pipelines/hunyuanimage_pipeline.py +++ b/hyimage/diffusion/pipelines/hunyuanimage_pipeline.py @@ -47,7 +47,12 @@ class HunyuanImagePipelineConfig: enable_vae_offloading: bool = True # offload vae after finishing enable_byt5_offloading: bool = True # offload byt5 after finishing - use_fp8: bool = False + # FP8 settings + use_fp8: bool = False # Deprecated: legacy single FP8 switch. Kept for backward compatibility. 
+ use_fp8_weight_only: bool = False # Use FP8 weight only mode + use_fp8_w8a8: bool = False # Use FP8 w8a8 mode + # Preferred unified FP8 mode: "none" | "weight_only" | "w8a8" + fp8_mode: str = "none" cfg_mode: str = "MIX_mode_0" guidance_rescale: float = 0.0 @@ -159,7 +164,8 @@ def __init__( def _load_dit(self): dit_device = None - if self.config.enable_full_dit_offloading or self.config.use_fp8: + fp8_mode = self._resolve_fp8_mode() + if self.config.enable_full_dit_offloading or fp8_mode != "none": dit_device = 'cpu' else: dit_device = self.device @@ -167,17 +173,19 @@ def _load_dit(self): try: dit_config = self.config.dit_config self.dit = instantiate(dit_config.model, dtype=self.torch_dtype, device=dit_device) - if self.config.use_fp8: - from hyimage.models.utils.fp8_quantization import convert_fp8_linear + if fp8_mode != "none": + from angelslim.compressor.diffusion import DynamicDiTQuantizer + # Map unified mode to backend quant type + quant_type = "fp8-per-tensor-weight-only" if fp8_mode == "weight_only" else "fp8-per-tensor" + quantizer = DynamicDiTQuantizer(quant_type=quant_type) if not Path(dit_config.fp8_scale).exists(): raise FileNotFoundError(f"FP8 scale file not found: {dit_config.fp8_scale}. Please download from https://huggingface.co/tencent/HunyuanImage-2.1/") if dit_config.fp8_load_from is not None and Path(dit_config.fp8_load_from).exists(): - convert_fp8_linear(self.dit, dit_config.fp8_scale) + quantizer.convert_linear(self.dit, scale=dit_config.fp8_scale) load_hunyuan_dit_state_dict(self.dit, dit_config.fp8_load_from, strict=True) else: raise FileNotFoundError(f"FP8 ckpt not found: {dit_config.fp8_load_from}. 
Please download from https://huggingface.co/tencent/HunyuanImage-2.1/") - load_hunyuan_dit_state_dict(self.dit, dit_config.load_from, strict=True) - convert_fp8_linear(self.dit, dit_config.fp8_scale) + self.dit = self.dit.to(dit_device) else: load_hunyuan_dit_state_dict(self.dit, dit_config.load_from, strict=True) @@ -256,9 +264,33 @@ def refiner_pipeline(self): from hyimage.diffusion.pipelines.hunyuanimage_refiner_pipeline import HunYuanImageRefinerPipeline if self.config.enable_stage1_offloading: self.offload() - self._refiner_pipeline = HunYuanImageRefinerPipeline.from_pretrained(self.config.refiner_model_name, use_fp8=self.config.use_fp8) + self._refiner_pipeline = HunYuanImageRefinerPipeline.from_pretrained( + self.config.refiner_model_name, + fp8_mode=self._resolve_fp8_mode(), + ) return self._refiner_pipeline + def _resolve_fp8_mode(self) -> str: + """Resolve FP8 mode with backward compatibility. + + Preferred: config.fp8_mode in {"none", "weight_only", "w8a8"}. + Fallback to old boolean switches if fp8_mode == "none". + """ + mode = getattr(self.config, "fp8_mode", "none") or "none" + if mode not in {"none", "weight_only", "w8a8"}: + raise ValueError(f"Invalid fp8_mode: {mode}. Expected one of 'none', 'weight_only', 'w8a8'.") + if mode != "none": + return mode + use_fp8_weight_only = getattr(self.config, "use_fp8_weight_only", False) + use_fp8_w8a8 = getattr(self.config, "use_fp8_w8a8", False) + if use_fp8_weight_only and use_fp8_w8a8: + raise ValueError("Both use_fp8_weight_only and use_fp8_w8a8 are True. 
Please enable only one.") + if use_fp8_w8a8: + return "w8a8" + if self.config.use_fp8 or use_fp8_weight_only: + return "weight_only" + return "none" + @property def reprompt_model(self): """ @@ -860,6 +892,8 @@ def to(self, device: str | torch.device): self.text_encoder = self.text_encoder.to(device, non_blocking=True) if self.vae is not None: self.vae = self.vae.to(device, non_blocking=True) + if torch.cuda.is_available(): + torch.cuda.empty_cache() return self def offload(self): @@ -869,6 +903,8 @@ def offload(self): self.text_encoder = self.text_encoder.to('cpu', non_blocking=True) if self.vae is not None: self.vae = self.vae.to('cpu', non_blocking=True) + if torch.cuda.is_available(): + torch.cuda.empty_cache() return self def update_config(self, **kwargs): diff --git a/requirements.txt b/requirements.txt index d7746e8..070e2b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,4 +15,5 @@ transformers[accelerate,tiktoken]==4.56.0 wheel setuptools modelscope -huggingface_hub[cli] \ No newline at end of file +huggingface_hub[cli] +angelslim[diffusion]==0.2.0 \ No newline at end of file