# InfiniteTalk/tools/convert_img_to_video.py at main · MeiGen-AI/InfiniteTalk · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import yaml
import cv2
import numpy as np
from pathlib import Path

class ImageProcessor:
    """Build a slideshow-style video from still images listed in a YAML file.

    The YAML document must provide a top-level ``images`` list. Each entry
    needs a ``path`` and may set ``duration`` (seconds, default 1.0),
    ``translation`` (``[x, y]`` as a percentage of the image width/height,
    default ``[0, 0]``) and ``scale`` (zoom factor, default 1.0).

    The size of the first image becomes the frame size of the video.
    """

    def __init__(self, yaml_path):
        """Parse the configuration at *yaml_path* and load every image.

        Raises:
            ValueError: If an image listed in the configuration cannot be
                read.
        """
        # Explicit encoding: the platform default is not UTF-8 everywhere,
        # which would break configs containing non-ASCII paths.
        with open(yaml_path, 'r', encoding='utf-8') as f:
            self.config = yaml.safe_load(f)

        self.images_info = []
        self.reference_size = None
        self._load_images()

    def _load_images(self):
        """Read every configured image and record its transform settings.

        Populates ``self.images_info`` and sets ``self.reference_size`` to
        the ``(width, height)`` of the first image.

        Raises:
            ValueError: If ``cv2.imread`` cannot load one of the paths.
        """
        for img_config in self.config['images']:
            img = cv2.imread(img_config['path'])
            if img is None:
                raise ValueError(f"Cannot load image: {img_config['path']}")

            info = {
                'image': img,
                'duration': float(img_config.get('duration', 1.0)),
                'translation': img_config.get('translation', [0, 0]),
                'scale': float(img_config.get('scale', 1.0))
            }
            self.images_info.append(info)

            if self.reference_size is None:
                self.reference_size = (img.shape[1], img.shape[0])

    def _translate_image(self, img, translation):
        """Shift *img* by *translation* percent, keeping its dimensions.

        ``translation[0]`` is a percentage of the width and
        ``translation[1]`` a percentage of the height. Areas uncovered by
        the shift are filled with black by ``cv2.warpAffine``.
        """
        height, width = img.shape[:2]

        # Calculate translation amount (pixels)
        tx = int(width * translation[0] / 100)
        ty = int(height * translation[1] / 100)

        # Create translation matrix
        M = np.float32([[1, 0, tx], [0, 1, ty]])

        # Apply translation while maintaining original dimensions
        translated = cv2.warpAffine(img, M, (width, height))

        return translated

    def _crop_black_borders(self, img):
        """Crop *img* to the bounding box of its non-black pixels.

        A pixel counts as non-black when its grayscale value is above 1.
        A completely black image is returned unchanged.

        NOTE(review): this also trims genuinely dark content that touches
        the image edge, not only the border introduced by translation.
        """
        # Convert to grayscale
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

        # Threshold to identify non-black areas
        _, thresh = cv2.threshold(gray, 1, 255, cv2.THRESH_BINARY)

        # Find bounding box of non-black pixels
        coords = cv2.findNonZero(thresh)
        if coords is None:
            return img

        x, y, w, h = cv2.boundingRect(coords)

        # Crop the image to the bounding box
        return img[y:y+h, x:x+w]

    @staticmethod
    def _center_crop(img, target_size):
        """Return the centered region of *img*, at most *target_size* large.

        *target_size* is ``(width, height)``. Offsets are clamped at zero so
        an image smaller than the target along an axis is returned in full
        along that axis instead of being sliced with a negative index.
        """
        target_width, target_height = target_size
        height, width = img.shape[:2]
        start_x = max(0, (width - target_width) // 2)
        start_y = max(0, (height - target_height) // 2)
        return img[start_y:start_y + target_height,
                   start_x:start_x + target_width]

    @staticmethod
    def _frame_count(duration, fps):
        """Return how many frames cover *duration* seconds at *fps*.

        The product is truncated, but a tiny epsilon is added first so
        binary floating-point error cannot drop a whole frame
        (``0.29 * 100`` evaluates to ``28.999999999999996``).
        """
        return int(duration * fps + 1e-9)

    def _scale_image(self, img, scale, target_size):
        """Zoom *img* by *scale* and return it at exactly *target_size*.

        For ``scale <= 1`` the image is simply resized to *target_size*
        (``(width, height)``). Otherwise it is enlarged by *scale* and
        center-cropped to *target_size*.
        """
        if scale <= 1:
            return cv2.resize(img, target_size)

        # First scale up
        height, width = img.shape[:2]
        scaled_width = int(width * scale)
        scaled_height = int(height * scale)
        scaled = cv2.resize(img, (scaled_width, scaled_height))

        # Center-crop to target dimensions
        cropped = self._center_crop(scaled, target_size)

        # After border cropping the enlarged image can still be smaller than
        # the target; stretch it so the result always has the target size.
        if (cropped.shape[1], cropped.shape[0]) != tuple(target_size):
            cropped = cv2.resize(cropped, target_size)

        return cropped

    def _transform_image(self, img, translation, scale):
        """Apply transformations in sequence: translation → cropping → scaling"""
        original_size = (img.shape[1], img.shape[0])

        # 1. Translation
        translated = self._translate_image(img, translation)

        # 2. Black border cropping
        cropped = self._crop_black_borders(translated)

        # 3. Scale back to original dimensions
        transformed = self._scale_image(cropped, scale, original_size)

        return transformed

    def create_video(self, output_path, fps=25):
        """Write all images to *output_path* as a video, then re-encode it.

        Each image is transformed, resized to the reference size if needed,
        and repeated for ``duration * fps`` frames. The file is first
        written with the ``mp4v`` codec and afterwards re-encoded in place
        by ``_improve_video_quality``.

        Raises:
            ValueError: If no images were loaded.
            OSError: If the video writer cannot be opened for *output_path*.
        """
        if not self.images_info:
            # Without images there is no reference size to open a writer.
            raise ValueError("No images to write: 'images' list is empty")

        fourcc = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, fourcc, fps, self.reference_size)
        if not out.isOpened():
            # An unopened writer drops every frame without reporting it.
            out.release()
            raise OSError(f"Cannot open video writer for: {output_path}")

        try:
            for info in self.images_info:
                # Transform image
                transformed = self._transform_image(
                    info['image'],
                    info['translation'],
                    info['scale']
                )

                # Resize to reference dimensions if needed
                if transformed.shape[:2] != (self.reference_size[1], self.reference_size[0]):
                    transformed = cv2.resize(transformed, self.reference_size)

                # Write video frames
                for _ in range(self._frame_count(info['duration'], fps)):
                    out.write(transformed)

        finally:
            out.release()

        # Enhance video quality
        self._improve_video_quality(output_path)

    def _improve_video_quality(self, video_path):
        """Re-encode *video_path* in place with ffmpeg (libx264, CRF 18).

        ffmpeg writes to a temporary sibling file which replaces the
        original only after ffmpeg exits successfully, so a failed run
        leaves the original video untouched.

        Raises:
            FileNotFoundError: If the ``ffmpeg`` executable is not found.
            subprocess.CalledProcessError: If ffmpeg exits with a non-zero
                status.
        """
        import os
        import subprocess

        video_path = os.fspath(video_path)
        temp_path = video_path + '.temp.mp4'

        cmd = [
            'ffmpeg', '-i', video_path,
            '-c:v', 'libx264',
            '-preset', 'slow',
            '-crf', '18',
            '-y',
            temp_path
        ]

        try:
            subprocess.run(cmd, check=True)
            os.replace(temp_path, video_path)
        finally:
            # Only present when ffmpeg failed part-way or the replace failed.
            if os.path.exists(temp_path):
                os.remove(temp_path)

def main():
    """Render the slideshow described by ``tools/i2v_config.yaml``.

    The video is written to ``convertd_video.mp4`` at 25 frames per second.
    Both paths are relative, so they resolve against the current working
    directory.
    """
    config_file = 'tools/i2v_config.yaml'
    output_file = 'convertd_video.mp4'
    ImageProcessor(config_file).create_video(output_file, fps=25)


# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()