MyST-NB/myst_nb/core/lexers.py at 2b3251739c8f99526c885ce61b3cee1c2080036d · executablebooks/MyST-NB · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
"""Pygments lexers"""
from __future__ import annotations

import re

# this is not added as an entry point in ipython, so we add it in this package
from IPython.lib.lexers import IPythonTracebackLexer  # noqa: F401
import pygments.lexer
import pygments.token

_ansi_code_to_color = {
    0: "Black",
    1: "Red",
    2: "Green",
    3: "Yellow",
    4: "Blue",
    5: "Magenta",
    6: "Cyan",
    7: "White",
}


def _token_from_lexer_state(bold: bool, faint: bool, fg_color: str | None, bg_color: str | None):
    """Fold the lexer's style state into a single Pygments token.

    Only one token can be emitted per piece of text, even though the state
    has several independent parts. To work around this, the active parts are
    chained into a nested token such as ``Token.Color.Bold.Red``. The nesting
    order is always: ``Bold``, ``Faint``, the foreground color name, then the
    background color name prefixed with ``BG``.

    :param bold: whether bold is currently active
    :param faint: whether faint is currently active
    :param fg_color: foreground color name, or a falsy value for none
    :param bg_color: background color name, or a falsy value for none
    :return: ``pygments.token.Text`` when nothing is active, otherwise the
        nested ``Token.Color`` sub-token described above
    """
    candidates = (
        "Bold" if bold else None,
        "Faint" if faint else None,
        fg_color,
        "BG" + bg_color if bg_color else None,
    )
    active = [name for name in candidates if name]

    # Unstyled text keeps the plain Text token.
    if not active:
        return pygments.token.Text

    token = pygments.token.Token.Color
    for name in active:
        # Attribute access on a token type yields the nested sub-token.
        token = getattr(token, name)
    return token


class AnsiColorLexer(pygments.lexer.RegexLexer):
    """Pygments lexer for text containing ANSI color codes.

    The lexer keeps a small amount of state (bold, faint, foreground and
    background color) that is updated whenever a "Set Graphics Mode" escape
    sequence (``ESC[...m``) is encountered. The text following each escape
    sequence is emitted with a token derived from that state.

    Adapted from https://github.com/chriskuehl/pygments-ansi-color
    """

    name = "ANSI Color"
    aliases = ("myst-ansi",)
    flags = re.DOTALL | re.MULTILINE

    # Splits what follows ``ESC[`` into (parameters, final letter, remaining
    # text). Compiled once here rather than on every escape sequence.
    _escape_body = re.compile(
        r"([0-9;=]*?)?([a-zA-Z])(.*)$",
        re.DOTALL | re.MULTILINE,
    )

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.reset_state()

    def reset_state(self):
        """Clear all styling: no bold, no faint, default fg/bg colors."""
        self.bold = False
        self.faint = False
        self.fg_color = None
        self.bg_color = None

    @property
    def current_token(self):
        """The token type representing the current styling state."""
        return _token_from_lexer_state(
            self.bold,
            self.faint,
            self.fg_color,
            self.bg_color,
        )

    def _apply_graphics_mode(self, params):
        """Update the styling state from the parameters of an ``ESC[...m``.

        :param params: the raw parameter string between ``ESC[`` and ``m``,
            e.g. ``"1;31"``. An empty string is treated as a full reset.

        Handled codes: 0 (reset), 1 (bold), 2 (faint), 22 (neither bold nor
        faint), 30-37 / 40-47 (basic fg / bg colors), 39 / 49 (default fg /
        bg), and 38 / 48 followed by ``5;N`` (256-color palette). Any other
        code is ignored. If any parameter is not an integer, the whole
        sequence is ignored.
        """
        # Special case \x1b[m is a reset code
        if not params:
            self.reset_state()
            return

        try:
            codes = [int(part) for part in params.split(";")]
        except ValueError:
            # Shouldn't ever happen, but could with invalid ANSI.
            return

        # An iterator lets the extended-color branch consume the sub-values
        # that follow 38/48 so the main loop never sees them as codes.
        pending = iter(codes)
        for code in pending:
            fg_color = _ansi_code_to_color.get(code - 30)
            bg_color = _ansi_code_to_color.get(code - 40)
            if fg_color:
                self.fg_color = fg_color
            elif bg_color:
                self.bg_color = bg_color
            elif code == 1:
                self.bold = True
            elif code == 2:
                self.faint = True
            elif code == 22:
                self.bold = False
                self.faint = False
            elif code == 39:
                self.fg_color = None
            elif code == 49:
                self.bg_color = None
            elif code == 0:
                self.reset_state()
            elif code in (38, 48):
                selector = next(pending, None)
                if selector == 2:
                    # 24-bit color (``2;R;G;B``) has no token representation
                    # here. Swallow all three components so they are not
                    # misread as independent codes (e.g. G == 1 -> bold).
                    for _ in range(3):
                        next(pending, None)
                    continue

                # For ``5;N`` this is the palette index; for an unknown
                # selector one following value is still skipped.
                color = next(pending, None)
                if selector != 5 or color is None:
                    continue
                if not 0 <= color <= 255:
                    continue
                if code == 38:
                    self.fg_color = f"C{color}"
                else:
                    self.bg_color = f"C{color}"

    def process(self, match):
        """Produce the next token and bit of text.

        Interprets the ANSI code (which may be a color code or some other
        code), changing the lexer state and producing a new token. If it's not
        a color code, we just strip it out and move on.

        :param match: regex match for the ``root`` escape rule; group 1 holds
            everything after ``ESC[`` up to the next escape character
        :return: generator yielding one ``(index, token, text)`` tuple

        Some useful reference for ANSI codes:
          * http://ascii-table.com/ansi-escape-sequences.php
        """
        # "after_escape" contains everything after the start of the escape
        # sequence, up to the next escape sequence. We still need to separate
        # the content from the end of the escape sequence.
        after_escape = match.group(1)

        # TODO: this doesn't handle the case where the values are non-numeric.
        # This is rare but can happen for keyboard remapping, e.g.
        # '\x1b[0;59;"A"p'
        parsed = self._escape_body.match(after_escape)
        if parsed is None:
            # This shouldn't ever happen if we're given valid text + ANSI, but
            # people can provide us with utter junk, and we should tolerate it.
            text = after_escape
        else:
            params, code, text = parsed.groups()
            if code == "m":  # "m" is "Set Graphics Mode"
                self._apply_graphics_mode(params)

        yield match.start(), self.current_token, text

    tokens = {
        "root": [(r"\x1b\[([^\x1b]*)", process), (r"[^\x1b]+", pygments.token.Text)],
    }