Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ This repo contains PyTorch model definitions, pretrained weights and inference/s
</div>

## 🔥🔥🔥 Latest Updates
- October 31, 2025: 🚀 Optimized FP8 inference now available via [AngelSlim](https://github.com/Tencent/AngelSlim), delivering up to 40% faster generation!
- September 18, 2025: ✨ Try the [PromptEnhancer-32B model](https://huggingface.co/PromptEnhancer/PromptEnhancer-32B) for higher-quality prompt enhancement!
- September 18, 2025: ✨ [ComfyUI workflow of HunyuanImage-2.1](https://github.com/KimbingNg/ComfyUI-HunyuanImage2.1) is available now!
- September 16, 2025: 👑 We achieved the Top1 on Arena's leaderboard for text-to-image open-source models. [Leaderboard](https://artificialanalysis.ai/text-to-image/arena/leaderboard-text)
Expand Down Expand Up @@ -122,6 +123,14 @@ Additionally, we **highly recommend** using the full generation pipeline for bet
| Distilled text-to-image Model | hunyuanimage2.1-distilled | Distilled model for faster inference | 8 | 3.25 | 4 |
| Refiner | hunyuanimage-refiner | The refiner model | N/A | N/A | N/A |

### FP8 Inference

Enable FP8 quantized inference with [AngelSlim](https://github.com/Tencent/AngelSlim) to significantly reduce GPU memory usage and speed up generation.
- **Recommended:** `fp8_mode="weight_only"` for optimal image quality and memory savings
- **Alternative:** `fp8_mode="w8a8"` for maximum speed and lower memory usage (may slightly lower quality)
- `use_fp8=True` is deprecated and now maps to `"weight_only"`.

### Example:

```python
import os
Expand All @@ -131,7 +140,9 @@ from hyimage.diffusion.pipelines.hunyuanimage_pipeline import HunyuanImagePipeli

# Supported model_name: hunyuanimage-v2.1, hunyuanimage-v2.1-distilled
model_name = "hunyuanimage-v2.1"
pipe = HunyuanImagePipeline.from_pretrained(model_name=model_name, use_fp8=True)
# Supported fp8_mode: weight_only, w8a8
# pipe = HunyuanImagePipeline.from_pretrained(model_name=model_name, fp8_mode="weight_only")
pipe = HunyuanImagePipeline.from_pretrained(model_name=model_name, fp8_mode="w8a8")
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

这里增加一行weight_only的调用代码,可以注释掉

pipe = pipe.to("cuda")

# The input prompt
Expand Down
14 changes: 13 additions & 1 deletion README_CN.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@


## 🔥🔥🔥 最新动态
- 2025 年 10 月 31 日:🚀 通过集成 [AngelSlim](https://github.com/Tencent/AngelSlim) 优化 FP8 推理,最高可带来约 40% 的生成加速!
- 2025 年 9 月 18 日:✨ 欢迎体验 [PromptEnhancer-32B 模型](https://huggingface.co/PromptEnhancer/PromptEnhancer-32B) 以获得更高质量的提示词增强!
- 2025 年 9 月 18 日:✨ [HunyuanImage-2.1 的 ComfyUI 工作流](https://github.com/KimbingNg/ComfyUI-HunyuanImage2.1) 现已开放体验!
- 2025 年 9 月 16 日:👑 我们在 Arena 文生图开源模型排行榜上获得第一名![排行榜](https://artificialanalysis.ai/text-to-image/arena/leaderboard-text)
Expand Down Expand Up @@ -110,6 +111,15 @@ HunyuanImage-2.1 **仅支持 2K** 图像生成(如 1:1 时为 2048x2048,16:9
| 蒸馏文生图模型 | hunyuanimage2.1-distilled | 蒸馏模型,推理更快 | 8 | 3.25 | 4 |
| 精修模型 | hunyuanimage-refiner | 精修模型 | N/A | N/A | N/A |

### FP8 推理

借助 [AngelSlim](https://github.com/Tencent/AngelSlim) 可以显著降低显存占用并提升生成速度:
- **推荐:** 将 `fp8_mode` 设为 `"weight_only"`,在画质与显存之间取得最佳平衡;
- **可选:** `"w8a8"` 模式提供更高速度与更低显存占用,但画质可能略有下降;
- `use_fp8=True` 将被弃用,现已等价映射为 `"weight_only"`。


### 示例

```python
import os
Expand All @@ -119,7 +129,9 @@ from hyimage.diffusion.pipelines.hunyuanimage_pipeline import HunyuanImagePipeli

# 支持的 model_name:hunyuanimage-v2.1, hunyuanimage-v2.1-distilled
model_name = "hunyuanimage-v2.1"
pipe = HunyuanImagePipeline.from_pretrained(model_name=model_name, use_fp8=True)
# 支持的 fp8_mode:"weight_only"(推荐),"w8a8"(更快)
# pipe = HunyuanImagePipeline.from_pretrained(model_name=model_name, fp8_mode="weight_only")
pipe = HunyuanImagePipeline.from_pretrained(model_name=model_name, fp8_mode="w8a8")
pipe = pipe.to("cuda")

# 输入提示词
Expand Down
52 changes: 44 additions & 8 deletions hyimage/diffusion/pipelines/hunyuanimage_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,12 @@ class HunyuanImagePipelineConfig:
enable_vae_offloading: bool = True # offload vae after finishing
enable_byt5_offloading: bool = True # offload byt5 after finishing

use_fp8: bool = False
# FP8 settings
use_fp8: bool = False # Deprecated: legacy single FP8 switch. Kept for backward compatibility.
use_fp8_weight_only: bool = False # Use FP8 weight only mode
use_fp8_w8a8: bool = False # Use FP8 w8a8 mode
# Preferred unified FP8 mode: "none" | "weight_only" | "w8a8"
fp8_mode: str = "none"

cfg_mode: str = "MIX_mode_0"
guidance_rescale: float = 0.0
Expand Down Expand Up @@ -159,25 +164,28 @@ def __init__(

def _load_dit(self):
dit_device = None
if self.config.enable_full_dit_offloading or self.config.use_fp8:
fp8_mode = self._resolve_fp8_mode()
if self.config.enable_full_dit_offloading or fp8_mode != "none":
dit_device = 'cpu'
else:
dit_device = self.device

try:
dit_config = self.config.dit_config
self.dit = instantiate(dit_config.model, dtype=self.torch_dtype, device=dit_device)
if self.config.use_fp8:
from hyimage.models.utils.fp8_quantization import convert_fp8_linear
if fp8_mode != "none":
from angelslim.compressor.diffusion import DynamicDiTQuantizer
# Map unified mode to backend quant type
quant_type = "fp8-per-tensor-weight-only" if fp8_mode == "weight_only" else "fp8-per-tensor"
quantizer = DynamicDiTQuantizer(quant_type=quant_type)
if not Path(dit_config.fp8_scale).exists():
raise FileNotFoundError(f"FP8 scale file not found: {dit_config.fp8_scale}. Please download from https://huggingface.co/tencent/HunyuanImage-2.1/")
if dit_config.fp8_load_from is not None and Path(dit_config.fp8_load_from).exists():
convert_fp8_linear(self.dit, dit_config.fp8_scale)
quantizer.convert_linear(self.dit, scale=dit_config.fp8_scale)
load_hunyuan_dit_state_dict(self.dit, dit_config.fp8_load_from, strict=True)
else:
raise FileNotFoundError(f"FP8 ckpt not found: {dit_config.fp8_load_from}. Please download from https://huggingface.co/tencent/HunyuanImage-2.1/")
load_hunyuan_dit_state_dict(self.dit, dit_config.load_from, strict=True)
convert_fp8_linear(self.dit, dit_config.fp8_scale)

self.dit = self.dit.to(dit_device)
else:
load_hunyuan_dit_state_dict(self.dit, dit_config.load_from, strict=True)
Expand Down Expand Up @@ -256,9 +264,33 @@ def refiner_pipeline(self):
from hyimage.diffusion.pipelines.hunyuanimage_refiner_pipeline import HunYuanImageRefinerPipeline
if self.config.enable_stage1_offloading:
self.offload()
self._refiner_pipeline = HunYuanImageRefinerPipeline.from_pretrained(self.config.refiner_model_name, use_fp8=self.config.use_fp8)
self._refiner_pipeline = HunYuanImageRefinerPipeline.from_pretrained(
self.config.refiner_model_name,
fp8_mode=self._resolve_fp8_mode(),
)
return self._refiner_pipeline

def _resolve_fp8_mode(self) -> str:
"""Resolve FP8 mode with backward compatibility.

Preferred: config.fp8_mode in {"none", "weight_only", "w8a8"}.
Fallback to old boolean switches if fp8_mode == "none".
"""
mode = getattr(self.config, "fp8_mode", "none") or "none"
if mode not in {"none", "weight_only", "w8a8"}:
raise ValueError(f"Invalid fp8_mode: {mode}. Expected one of 'none', 'weight_only', 'w8a8'.")
if mode != "none":
return mode
use_fp8_weight_only = getattr(self.config, "use_fp8_weight_only", False)
use_fp8_w8a8 = getattr(self.config, "use_fp8_w8a8", False)
if use_fp8_weight_only and use_fp8_w8a8:
raise ValueError("Both use_fp8_weight_only and use_fp8_w8a8 are True. Please enable only one.")
if use_fp8_w8a8:
return "w8a8"
if self.config.use_fp8 or use_fp8_weight_only:
return "weight_only"
return "none"

@property
def reprompt_model(self):
"""
Expand Down Expand Up @@ -860,6 +892,8 @@ def to(self, device: str | torch.device):
self.text_encoder = self.text_encoder.to(device, non_blocking=True)
if self.vae is not None:
self.vae = self.vae.to(device, non_blocking=True)
if torch.cuda.is_available():
torch.cuda.empty_cache()
return self

def offload(self):
Expand All @@ -869,6 +903,8 @@ def offload(self):
self.text_encoder = self.text_encoder.to('cpu', non_blocking=True)
if self.vae is not None:
self.vae = self.vae.to('cpu', non_blocking=True)
if torch.cuda.is_available():
torch.cuda.empty_cache()
return self

def update_config(self, **kwargs):
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ transformers[accelerate,tiktoken]==4.56.0
wheel
setuptools
modelscope
huggingface_hub[cli]
huggingface_hub[cli]
angelslim[diffusion]==0.2.0