gpt_academic/crazy_functions/ast_fns/comment_remove.py at 045cdb15d86c262a55ede6dd9c84a5ab63dcc797 · binary-husky/gpt_academic · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import token
import tokenize
import copy
import io


def remove_python_comments(input_source: str) -> str:
    source_flag = copy.copy(input_source)
    source = io.StringIO(input_source)
    ls = input_source.split('\n')
    prev_toktype = token.INDENT
    readline = source.readline

    def get_char_index(lineno, col):
        # find the index of the char in the source code
        if lineno == 1:
            return len('\n'.join(ls[:(lineno-1)])) + col
        else:
            return len('\n'.join(ls[:(lineno-1)])) + col + 1

    def replace_char_between(start_lineno, start_col, end_lineno, end_col, source, replace_char, ls):
        # replace char between start_lineno, start_col and end_lineno, end_col with replace_char, but keep '\n' and ' '
        b = get_char_index(start_lineno, start_col)
        e = get_char_index(end_lineno, end_col)
        for i in range(b, e):
            if source[i] == '\n':
                source = source[:i] + '\n' + source[i+1:]
            elif source[i] == ' ':
                source = source[:i] + ' ' + source[i+1:]
            else:
                source = source[:i] + replace_char + source[i+1:]
        return source

    tokgen = tokenize.generate_tokens(readline)
    for toktype, ttext, (slineno, scol), (elineno, ecol), ltext in tokgen:
        if toktype == token.STRING and (prev_toktype == token.INDENT):
            source_flag = replace_char_between(slineno, scol, elineno, ecol, source_flag, ' ', ls)
        elif toktype == token.STRING and (prev_toktype == token.NEWLINE):
            source_flag = replace_char_between(slineno, scol, elineno, ecol, source_flag, ' ', ls)
        elif toktype == tokenize.COMMENT:
            source_flag = replace_char_between(slineno, scol, elineno, ecol, source_flag, ' ', ls)
        prev_toktype = toktype
    return source_flag


# 示例使用
if __name__ == "__main__":
    with open("source.py", "r", encoding="utf-8") as f:
        source_code = f.read()

    cleaned_code = remove_python_comments(source_code)

    with open("cleaned_source.py", "w", encoding="utf-8") as f:
        f.write(cleaned_code)