Skip to content

DECIMER cannot recognize SMILES with length greater than 302 #127

@wangtaiyi

Description

@wangtaiyi

Issue Type

Bug

Source

GitHub (source)

DECIMER Image Transformer Version

2.8

OS Platform and Distribution

Ubuntu 22.04

Python version

3.10

Current Behaviour?

I am a natural products researcher, and some of the molecules I work with are quite large (e.g., MW > 1000). When I tried to OCR molecules from images, I found that more than 5% of them could not be converted into full-length SMILES. Is it possible to increase the max_length of the SMILES output?

Many thanks!!

Which images caused the issue? (This is mandatory for images related issues)

Image

Standalone code to reproduce the issue

#!/usr/bin/env python3
"""
decimer_batch.py
Resumable batch run: build the table first, then back-fill SMILES row by row.
Supports reading the input/output paths from config.yaml.
"""

import os
import glob
import yaml
import pandas as pd
from tqdm import tqdm
from DECIMER import predict_SMILES   # pip install decimer

# Supported image file patterns (glob syntax, matched against file names)
IMG_EXTS = ("*.png", "*.jpg", "*.jpeg", "*.tif", "*.tiff", "*.bmp")

def gather_image_paths(folder):
    """Return a sorted list of absolute paths of the images directly inside *folder*.

    Only files whose names match one of the ``IMG_EXTS`` patterns are returned;
    sub-directories are not searched.

    Args:
        folder: Directory to scan. May be relative; it is resolved against the
            current working directory.

    Returns:
        Sorted list of absolute path strings. Empty when the folder does not
        exist or contains no matching file.
    """
    # Escape glob metacharacters so a directory such as "batch[1]" is matched
    # literally instead of being interpreted as a character class.
    base = glob.escape(os.path.abspath(folder))
    paths = []
    for ext in IMG_EXTS:
        paths.extend(glob.glob(os.path.join(base, ext)))
    return sorted(paths)

def init_csv(folder, csv_file):
    """Create the initial CSV listing every image in *folder* with an empty SMILES column.

    Args:
        folder: Directory searched for images via ``gather_image_paths``.
        csv_file: Destination CSV path. Its parent directory is created when
            the path has one and it does not exist yet.

    Returns:
        The DataFrame that was written, with columns ``row_id`` (1-based),
        ``image_path`` (absolute path) and ``smiles`` (all empty).

    Raises:
        RuntimeError: If no image is found in *folder*.
    """
    paths = gather_image_paths(folder)
    if not paths:
        raise RuntimeError("未找到任何图片!")
    df = pd.DataFrame({
        "row_id": range(1, len(paths) + 1),
        "image_path": [os.path.abspath(p) for p in paths],
        "smiles": [None] * len(paths),
    })
    # A bare filename (e.g. "results.csv") has an empty dirname, and
    # os.makedirs("") raises FileNotFoundError, so only create a real directory.
    parent = os.path.dirname(csv_file)
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(csv_file, index=False)
    print(f"已创建 {csv_file},共 {len(paths)} 条待处理记录。")
    return df

def resume_convert(csv_file):
    """Predict SMILES for every row of *csv_file* whose ``smiles`` cell is empty.

    The CSV is rewritten after each image so an interrupted run can be
    resumed. Rows whose prediction raises or comes back empty are left blank,
    which means they are picked up again on the next run.

    Args:
        csv_file: Path of a CSV containing ``image_path`` and ``smiles`` columns.
    """
    # Force a text column: an all-empty "smiles" column is otherwise parsed as
    # float64, and assigning a string into it is deprecated upcasting in
    # pandas >= 2.1.
    df = pd.read_csv(csv_file, dtype={"smiles": "object"})

    mask_na = df["smiles"].isna()
    todo = df[mask_na]

    if todo.empty:
        print("所有图片均已转换完成。")
        return

    print(f"发现 {len(todo)} 张图片未完成,继续转换……")

    # Checkpoints are written next to the target and swapped in, so an
    # interruption in the middle of a write cannot corrupt the existing CSV.
    tmp_file = f"{csv_file}.tmp"

    for idx in tqdm(todo.index, desc="DECIMER"):
        img_path = df.at[idx, "image_path"]
        print(img_path)
        try:
            smiles = predict_SMILES(img_path).strip()
            if not smiles:
                smiles = None
        except Exception as e:
            # Best effort: one bad image must not abort the whole batch.
            print(f"[WARN] 处理 {img_path} 出错:{e}")
            smiles = None

        # Back-fill the result and persist it immediately
        df.at[idx, "smiles"] = smiles
        df.to_csv(tmp_file, index=False)
        os.replace(tmp_file, csv_file)

    print("全部完成!")

def main():
    """Command-line entry point: load the config, create the CSV if needed, then convert.

    The image folder is ``<output_dir>/imgs``, where ``output_dir`` is read
    from ``config['output']['output_dir']``. The CSV path is the ``--out``
    argument when given, otherwise ``<output_dir>/results.csv``.
    """
    import argparse

    cli = argparse.ArgumentParser(description="可断点续跑的批量图片→SMILES(支持 config.yaml)")
    cli.add_argument("-c", "--config", default="config.yaml", help="配置文件路径 (default: config.yaml)")
    cli.add_argument("-o", "--out", help="输出 CSV 文件名(可选,覆盖配置中的默认值)")
    opts = cli.parse_args()

    # Load the YAML configuration
    with open(opts.config, 'r', encoding='utf-8') as handle:
        settings = yaml.safe_load(handle)

    out_root = settings['output']['output_dir']
    images_dir = os.path.join(out_root, "imgs")

    # An explicit --out wins over the default location under output_dir
    csv_file = opts.out if opts.out else os.path.join(out_root, "results.csv")

    # First run: the table does not exist yet, so build it
    if not os.path.isfile(csv_file):
        init_csv(images_dir, csv_file)

    # Whether freshly created or pre-existing, process the unfinished rows
    resume_convert(csv_file)

# Run the command-line workflow only when executed as a script, not on import.
if __name__ == "__main__":
    main()

Relevant log output

Code of Conduct

  • I agree to follow this project's Code of Conduct

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions