-
Notifications
You must be signed in to change notification settings - Fork 126
Expand file tree
/
Copy pathrun_hy_ocr.py
More file actions
119 lines (99 loc) · 3.21 KB
/
run_hy_ocr.py
File metadata and controls
119 lines (99 loc) · 3.21 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import json
import base64
from openai import OpenAI
from tqdm import tqdm
from typing import Dict, List
def encode_image(image_path: str) -> str:
    """Read an image file and return its contents as a base64 string.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        Base64 representation of the raw file bytes, as UTF-8 text.
    """
    with open(image_path, "rb") as fh:
        raw_bytes = fh.read()
    return base64.b64encode(raw_bytes).decode('utf-8')
def create_chat_messages(image_path: str, prompt: str) -> List[Dict]:
    """Build an OpenAI-style chat payload pairing an image with a text prompt.

    Args:
        image_path: Path to the image file to embed as a base64 data URL.
        prompt: Text prompt sent alongside the image.

    Returns:
        List of message dictionaries: an empty system message followed by a
        user message carrying the encoded image and then the prompt text.
    """
    data_url = f"data:image/jpeg;base64,{encode_image(image_path)}"
    user_content = [
        {"type": "image_url", "image_url": {"url": data_url}},
        {"type": "text", "text": prompt},
    ]
    return [
        {"role": "system", "content": ""},
        {"role": "user", "content": user_content},
    ]
def process_single_item(client: OpenAI, data: Dict) -> Dict:
    """Run one dataset record through the OCR model and attach its answer.

    Args:
        client: Configured OpenAI-compatible client pointed at the VLLM server.
        data: Record containing at least 'image_path' and 'question' keys.

    Returns:
        The same dictionary, with the model output stored under 'vllm_answer'.
    """
    # Build the multimodal chat payload from the record's image and question.
    messages = create_chat_messages(data['image_path'], data['question'])

    # Deterministic decoding: temperature 0, fixed seed, greedy top_k.
    completion = client.chat.completions.create(
        model="tencent/HunyuanOCR",
        messages=messages,
        temperature=0.0,
        top_p=0.95,
        seed=1234,
        stream=False,
        extra_body={
            "top_k": 1,
            "repetition_penalty": 1.0
        }
    )

    data["vllm_answer"] = completion.choices[0].message.content
    return data
def main():
    """Stream a JSONL benchmark file through the VLLM API, saving answers."""
    # Client for a locally hosted, OpenAI-compatible inference server.
    client = OpenAI(
        api_key="EMPTY",
        base_url="http://localhost:8000/v1",
        timeout=3600
    )

    input_path = 'ominidoc_bench.jsonl'
    output_path = "infer_result_ominidoc_bench.jsonl"

    with open(input_path, "r", encoding="utf-8") as fin, \
         open(output_path, "w", encoding="utf-8") as fout:
        for raw_line in tqdm(fin, desc="Processing documents"):
            # Skip blank lines between records.
            if not raw_line.strip():
                continue
            try:
                record = json.loads(raw_line)
                result = process_single_item(client, record)
                fout.write(json.dumps(result, ensure_ascii=False) + "\n")
            except Exception as e:
                # Best-effort batch run: report the failure and keep going.
                print(f"Error processing line: {str(e)}")
                continue

    print(f"Processing completed. Results saved to: {output_path}")
if __name__ == "__main__":
    main()