BettaFish/regenerate_latest_md.py at main · 666ghj/BettaFish · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
"""
使用最新的章节JSON重新装订并渲染Markdown报告。
"""

import json
import sys
from datetime import datetime
from pathlib import Path
from loguru import logger

# 确保可以找到项目内模块
sys.path.insert(0, str(Path(__file__).parent))

from ReportEngine.core import ChapterStorage, DocumentComposer
from ReportEngine.ir import IRValidator
from ReportEngine.renderers import MarkdownRenderer
from ReportEngine.utils.config import settings


def find_latest_run_dir(chapter_root: Path):
    """
    定位章节根目录下最新一次运行的输出目录。

    扫描 `chapter_root` 下所有子目录，筛选出包含 `manifest.json`
    的候选，按修改时间倒序取最新一条。若目录不存在或没有有效
    manifest，会记录错误并返回 None。

    参数:
        chapter_root: 章节输出的根目录（通常是 settings.CHAPTER_OUTPUT_DIR）

    返回:
        Path | None: 最新的 run 目录路径；若未找到则为 None。
    """
    if not chapter_root.exists():
        logger.error(f"章节目录不存在: {chapter_root}")
        return None

    run_dirs = []
    for candidate in chapter_root.iterdir():
        if not candidate.is_dir():
            continue
        manifest_path = candidate / "manifest.json"
        if manifest_path.exists():
            run_dirs.append((candidate, manifest_path.stat().st_mtime))

    if not run_dirs:
        logger.error("未找到带 manifest.json 的章节目录")
        return None

    latest_dir = sorted(run_dirs, key=lambda item: item[1], reverse=True)[0][0]
    logger.info(f"找到最新run目录: {latest_dir.name}")
    return latest_dir


def load_manifest(run_dir: Path):
    """
    读取单次运行目录内的 manifest.json。

    成功时返回 reportId 以及元数据字典；读取或解析失败会记录错误
    并返回 (None, None)，以便上层提前终止流程。

    参数:
        run_dir: 包含 manifest.json 的章节输出目录

    返回:
        tuple[str | None, dict | None]: (report_id, metadata)
    """
    manifest_path = run_dir / "manifest.json"
    try:
        with manifest_path.open("r", encoding="utf-8") as f:
            manifest = json.load(f)
        report_id = manifest.get("reportId") or run_dir.name
        metadata = manifest.get("metadata") or {}
        logger.info(f"报告ID: {report_id}")
        if manifest.get("createdAt"):
            logger.info(f"创建时间: {manifest['createdAt']}")
        return report_id, metadata
    except Exception as exc:
        logger.error(f"读取manifest失败: {exc}")
        return None, None


def load_chapters(run_dir: Path):
    """
    读取指定 run 目录下的所有章节 JSON。

    会复用 ChapterStorage 的 load_chapters 能力，自动按 order 排序。
    读取后打印章节数量，便于确认完整性。

    参数:
        run_dir: 单次报告的章节目录

    返回:
        list[dict]: 章节 JSON 列表（若目录为空则为空列表）
    """
    storage = ChapterStorage(settings.CHAPTER_OUTPUT_DIR)
    chapters = storage.load_chapters(run_dir)
    logger.info(f"加载章节数: {len(chapters)}")
    return chapters


def validate_chapters(chapters):
    """
    使用 IRValidator 对章节结构做快速校验。

    仅记录未通过的章节及前三条错误，不会中断流程；目的是在
    重装订前发现潜在结构问题。

    参数:
        chapters: 章节 JSON 列表
    """
    validator = IRValidator()
    invalid = []
    for chapter in chapters:
        ok, errors = validator.validate_chapter(chapter)
        if not ok:
            invalid.append((chapter.get("chapterId") or "unknown", errors))

    if invalid:
        logger.warning(f"有 {len(invalid)} 个章节未通过结构校验，将继续装订：")
        for chapter_id, errors in invalid:
            preview = "; ".join(errors[:3])
            logger.warning(f"  - {chapter_id}: {preview}")
    else:
        logger.info("章节结构校验通过")


def stitch_document(report_id, metadata, chapters):
    """
    将各章节与元数据装订为完整的 Document IR。

    使用 DocumentComposer 统一处理章节顺序、全局元数据等，并打印
    装订完成的章节与图表数量。

    参数:
        report_id: 报告 ID（来自 manifest 或目录名）
        metadata: manifest 中的全局元数据
        chapters: 已加载的章节列表

    返回:
        dict: 完整的 Document IR 对象
    """
    composer = DocumentComposer()
    document_ir = composer.build_document(report_id, metadata, chapters)
    logger.info(
        f"装订完成: {len(document_ir.get('chapters', []))} 个章节，"
        f"{count_charts(document_ir)} 个图表"
    )
    return document_ir


def count_charts(document_ir):
    """
    统计整本 Document IR 中的 Chart.js 图表数量。

    会遍历每章的 blocks，递归查找 widget 类型中以 `chart.js`
    开头的组件，便于快速感知图表规模。

    参数:
        document_ir: 完整的 Document IR

    返回:
        int: 图表总数
    """
    chart_count = 0
    for chapter in document_ir.get("chapters", []):
        blocks = chapter.get("blocks", [])
        chart_count += _count_chart_blocks(blocks)
    return chart_count


def _count_chart_blocks(blocks):
    """
    递归统计 block 列表中的 Chart.js 组件数量。

    兼容嵌套的 blocks/list/table 结构，确保所有层级的图表都被计入。

    参数:
        blocks: 任意层级的 block 列表

    返回:
        int: 统计到的 chart.js 图表数量
    """
    count = 0
    for block in blocks:
        if not isinstance(block, dict):
            continue
        if block.get("type") == "widget" and str(block.get("widgetType", "")).startswith("chart.js"):
            count += 1
        nested = block.get("blocks")
        if isinstance(nested, list):
            count += _count_chart_blocks(nested)
        if block.get("type") == "list":
            for item in block.get("items", []):
                if isinstance(item, list):
                    count += _count_chart_blocks(item)
        if block.get("type") == "table":
            for row in block.get("rows", []):
                for cell in row.get("cells", []):
                    if isinstance(cell, dict):
                        cell_blocks = cell.get("blocks", [])
                        if isinstance(cell_blocks, list):
                            count += _count_chart_blocks(cell_blocks)
    return count


def save_document_ir(document_ir, base_name, timestamp):
    """
    将重新装订好的整本 Document IR 落盘。

    按 `report_ir_{slug}_{timestamp}_regen.json` 命名写入
    `settings.DOCUMENT_IR_OUTPUT_DIR`，确保目录存在并返回保存路径。

    参数:
        document_ir: 已装订完成的整本 IR
        base_name: 由主题/标题生成的安全文件名片段
        timestamp: 时间戳字符串，用于区分多次重生成

    返回:
        Path: 保存的 IR 文件路径
    """
    output_dir = Path(settings.DOCUMENT_IR_OUTPUT_DIR)
    output_dir.mkdir(parents=True, exist_ok=True)
    ir_filename = f"report_ir_{base_name}_{timestamp}_regen.json"
    ir_path = output_dir / ir_filename
    ir_path.write_text(json.dumps(document_ir, ensure_ascii=False, indent=2), encoding="utf-8")
    logger.info(f"IR已保存: {ir_path}")
    return ir_path


def render_markdown(document_ir, base_name, timestamp, ir_path=None):
    """
    使用 MarkdownRenderer 将 Document IR 渲染为 Markdown 并保存。

    渲染后落盘到 `final_reports/md`，打印生成文件大小，便于确认
    输出内容。

    参数:
        document_ir: 装订完成的整本 IR
        base_name: 文件名片段（来源于报告主题/标题）
        timestamp: 时间戳字符串
        ir_path: 可选，IR 文件路径，提供时修复后会自动保存

    返回:
        Path: 生成的 Markdown 文件路径
    """
    renderer = MarkdownRenderer()
    # 传入 ir_file_path，修复后自动保存
    markdown_content = renderer.render(document_ir, ir_file_path=str(ir_path) if ir_path else None)

    output_dir = Path(settings.OUTPUT_DIR) / "md"
    output_dir.mkdir(parents=True, exist_ok=True)
    md_filename = f"report_md_{base_name}_{timestamp}.md"
    md_path = output_dir / md_filename
    md_path.write_text(markdown_content, encoding="utf-8")

    file_size_kb = md_path.stat().st_size / 1024
    logger.info(f"Markdown生成成功: {md_path} ({file_size_kb:.1f} KB)")
    return md_path


def build_slug(text):
    """
    将主题/标题转换为文件系统安全的片段。

    仅保留字母/数字/空格/下划线/连字符，空格统一为下划线，并限制
    最长 60 字符，避免过长文件名。

    参数:
        text: 原始主题或标题

    返回:
        str: 清洗后的安全字符串
    """
    text = str(text or "report")
    sanitized = "".join(c for c in text if c.isalnum() or c in (" ", "-", "_")).strip()
    sanitized = sanitized.replace(" ", "_")
    return sanitized[:60] or "report"


def main():
    """
    主入口：读取最新章节、装订 IR 并渲染 Markdown。

    流程：
        1) 找到最新的章节 run 目录并读取 manifest；
        2) 加载章节并执行结构校验（仅警告）；
        3) 装订整本 IR，保存 IR 副本；
        4) 渲染 Markdown 并输出路径。

    返回:
        int: 0 表示成功，其余表示失败。
    """
    logger.info("🚀 使用最新的LLM章节重新装订并渲染Markdown")

    chapter_root = Path(settings.CHAPTER_OUTPUT_DIR)
    latest_run = find_latest_run_dir(chapter_root)
    if not latest_run:
        return 1

    report_id, metadata = load_manifest(latest_run)
    if not report_id or metadata is None:
        return 1

    chapters = load_chapters(latest_run)
    if not chapters:
        logger.error("未找到章节JSON，无法装订")
        return 1

    validate_chapters(chapters)

    document_ir = stitch_document(report_id, metadata, chapters)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    base_name = build_slug(
        metadata.get("query") or metadata.get("title") or metadata.get("reportId") or report_id
    )

    ir_path = save_document_ir(document_ir, base_name, timestamp)
    # 传入 ir_path，修复后的图表会自动保存到 IR 文件
    md_path = render_markdown(document_ir, base_name, timestamp, ir_path=ir_path)

    logger.info("")
    logger.info("🎉 Markdown装订与渲染完成")
    logger.info(f"IR文件: {ir_path.resolve()}")
    logger.info(f"Markdown文件: {md_path.resolve()}")
    return 0


if __name__ == "__main__":
    sys.exit(main())