MyST-NB/myst_nb/core/read.py at 79e865721f6361cd956e5c2f34dfafac4d7e2de2 · executablebooks/MyST-NB · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
"""Module for reading notebook formats from a string input."""
from __future__ import annotations

import dataclasses as dc
from functools import partial
import json
from pathlib import Path
from typing import Callable, Iterator

from docutils.parsers.rst import Directive
from markdown_it.renderer import RendererHTML
from myst_parser.config.main import MdParserConfig
from myst_parser.parsers.mdit import create_md_parser
import nbformat as nbf
import yaml

from myst_nb.core.config import NbParserConfig
from myst_nb.core.loggers import DocutilsDocLogger, SphinxDocLogger

NOTEBOOK_VERSION: int = 4
"""The nbformat version of the notebooks that readers should return."""


@dc.dataclass
class NbReader:
    """Bundle of a notebook read function and the settings that go with it."""

    read: Callable[[str], nbf.NotebookNode]
    """Callable that converts a (utf8) string into a notebook."""
    md_config: MdParserConfig
    """Configuration to use when parsing the markdown cells."""
    read_fmt: dict | None = None
    """The type of the reader, if known."""
    support_cell_ids: bool = False
    """Whether the source format supports stable cell IDs."""


def standard_nb_read(text: str) -> nbf.NotebookNode:
    """Parse the text of a standard ``.ipynb`` file into a notebook.

    :param text: the content of the notebook file.
    :returns: the notebook, read with ``as_version=NOTEBOOK_VERSION``.
    """
    notebook = nbf.reads(text, as_version=NOTEBOOK_VERSION)
    return notebook


def create_nb_reader(
    path: str,
    md_config: MdParserConfig,
    nb_config: NbParserConfig,
    content: None | str | Iterator[str],
) -> NbReader | None:
    """Select and build the notebook reader that applies to a source file.

    The notebook itself is not parsed here; only the reader is returned,
    since the jupyter-cache functionality requires the reader.

    :param path: Path to the input source being processed.
    :param md_config: The default configuration for parsing Markdown.
    :param nb_config: The configuration for parsing Notebooks.
    :param content: The input text (optionally used to check for
        text-based notebooks); if None, this check is skipped.

    :returns: the notebook reader (which carries the, potentially modified,
        MdParserConfig), or None if the input cannot be read as a notebook.
    """
    # imported lazily, so that this module can be loaded without sphinx
    from sphinx.util import import_object

    # candidate readers: the custom formats, plus the default ``.ipynb`` reader
    # (``setdefault`` leaves a custom ".ipynb" entry in place, if there is one)
    candidates = nb_config.custom_formats.copy()
    candidates.setdefault(".ipynb", (standard_nb_read, {}, False))  # type: ignore

    # the longest suffixes are tried first, so that the "closest" match wins
    by_suffix_length = sorted(
        candidates.items(), key=lambda item: len(item[0]), reverse=True
    )
    for suffix, (reader, reader_kwargs, commonmark_only) in by_suffix_length:
        if not path.endswith(suffix):
            continue
        if isinstance(reader, str):
            # a string is treated as an object path, to be imported
            reader = import_object(reader)
        if commonmark_only:
            # Markdown cells should be read as CommonMark only
            md_config = dc.replace(md_config, commonmark_only=True)
        read_func = partial(reader, **(reader_kwargs or {}))  # type: ignore
        return NbReader(read_func, md_config, support_cell_ids=suffix == ".ipynb")

    # a Markdown file is a special case: it is only treated as a notebook
    # if it starts with certain "top-matter"
    if content is None or not is_myst_markdown_notebook(content):
        # no reader was found
        return None

    myst_read = partial(
        read_myst_markdown_notebook,
        config=md_config,
        add_source_map=True,
        path=path,
    )
    return NbReader(
        myst_read,
        md_config,
        {"type": "plugin", "name": "myst_nb_md"},
        support_cell_ids=False,
    )


def is_myst_markdown_notebook(text: str | Iterator[str]) -> bool:
    """Check if the input is a MyST Markdown notebook.

    This is identified by the presence of a top-matter section, containing either::

        ---
        file_format: mystnb
        ---

    or::

        ---
        jupytext:
            text_representation:
                format_name: myst
        ---

    Only the top-matter is consumed from the input; if it cannot be loaded
    as a YAML mapping, or has an unexpected structure, the input is simply
    not a notebook (no exception is raised).

    :param text: The input text, as a single string or as its lines
        (an iterator, or any other iterable of strings).
    :returns: True if the input is a markdown notebook.
    """
    if isinstance(text, str):
        if not text.startswith("---"):  # cheap rejection, before splitting lines
            return False
        lines = iter(text.splitlines())
    else:
        # ``iter`` returns an iterator unchanged, and additionally allows
        # plain iterables (e.g. a list of lines) to be passed in
        lines = iter(text)

    # the first line must open the top-matter (an empty input has no first line)
    if not next(lines, "").startswith("---"):
        return False

    # collect the lines up to the closing marker (or the end of the input)
    top_matter = []
    for line in lines:
        if line.startswith(("---", "...")):
            break
        top_matter.append(line.rstrip() + "\n")

    try:
        metadata = yaml.safe_load("".join(top_matter))
    except Exception:
        # deliberately broad: whatever the reason the top-matter fails to load,
        # the answer is "this is not a notebook"
        return False
    if not isinstance(metadata, dict):
        return False

    if metadata.get("file_format") == "mystnb":
        return True

    # walk jupytext -> text_representation -> format_name, bailing out if any
    # level is not a mapping (e.g. ``jupytext: true``)
    node = metadata
    for key in ("jupytext", "text_representation", "format_name"):
        if not isinstance(node, dict):
            return False
        node = node.get(key)
    return node == "myst"

    # TODO move this to reader, since not strictly part of function objective
    # or just allow nbformat/nbclient to handle the failure
    # if "name" not in metadata.get("kernelspec", {}):
    #     raise IOError(
    #         "A myst notebook text-representation requires " "kernelspec/name metadata"
    #     )
    # if "display_name" not in metadata.get("kernelspec", {}):
    #     raise IOError(
    #         "A myst notebook text-representation requires "
    #         "kernelspec/display_name metadata"
    #     )


def myst_nb_reader_plugin(uri: str) -> nbf.NotebookNode:
    """Read a myst notebook from the file at ``uri``.

    Used as plugin for jupyter-cache.

    :param uri: path of the file, which is read as utf8 text.
    :returns: the notebook, with a ``source_map`` added to its metadata.
    """
    source_text = Path(uri).read_text("utf8")
    return read_myst_markdown_notebook(source_text, add_source_map=True, path=uri)


def read_myst_markdown_notebook(
    text,
    config: MdParserConfig | None = None,
    code_directive="{code-cell}",
    raw_directive="{raw-cell}",
    add_source_map=False,
    path: str | Path | None = None,
) -> nbf.NotebookNode:
    """Convert text written in the myst format to a notebook.

    :param text: the file text
    :param config: the Markdown parsing configuration
        (a default ``MdParserConfig`` is used if not given)
    :param code_directive: the name of the directive to search for containing code cells
    :param raw_directive: the name of the directive to search for containing raw cells
    :param add_source_map: add a `source_map` key to the notebook metadata,
        which is a list of the starting source line number for each cell.
    :param path: path to notebook (required for :load:)

    :returns: the notebook; text between cells becomes markdown cells.

    :raises MystMetadataParsingError: if the notebook metadata is not valid
        YAML or not a mapping, or if the metadata of a cell cannot be read
    :raises _LoadFileParsingError: if the file of a ``load`` option cannot be read

    NOTE: we assume here that all of these directives are at the top-level,
    i.e. not nested in other directives.
    """
    config = config or MdParserConfig()
    # parse markdown file up to the block level (i.e. don't worry about inline text)
    inline_config = dc.replace(
        config, disable_syntax=(list(config.disable_syntax) + ["inline"])
    )
    parser = create_md_parser(inline_config, RendererHTML)
    tokens = parser.parse(text + "\n")
    lines = text.splitlines()
    md_start_line = 0

    # get the document metadata
    metadata_nb = {}
    # guard against an empty token list (empty or blank text) before indexing
    if tokens and tokens[0].type == "front_matter":
        metadata = tokens.pop(0)
        md_start_line = metadata.map[1] if metadata.map else 0
        try:
            loaded = yaml.safe_load(metadata.content)
        except (yaml.parser.ParserError, yaml.scanner.ScannerError) as error:
            raise MystMetadataParsingError(f"Notebook metadata: {error}") from error
        # an empty front-matter section loads as None
        metadata_nb = {} if loaded is None else loaded
        if not isinstance(metadata_nb, dict):
            raise MystMetadataParsingError(
                "Notebook metadata: expected a mapping, "
                f"but found {type(metadata_nb).__name__}"
            )

    # add missing display name to the metadata, as required by the nbformat schema:
    # https://github.com/jupyter/nbformat/blob/f712d60f13c5b168313222cbf4bee7face98a081/nbformat/v4/nbformat.v4.5.schema.json#L16
    kernelspec = metadata_nb.get("kernelspec")
    if (
        isinstance(kernelspec, dict)
        and "name" in kernelspec
        and "display_name" not in kernelspec
    ):
        kernelspec["display_name"] = kernelspec["name"]

    # create an empty notebook
    nbf_version = nbf.v4
    notebook = nbf_version.new_notebook(metadata=nbf.from_dict(metadata_nb))
    source_map = []  # this is a list of the starting line number for each cell

    def _flush_markdown(start_line, token, md_metadata):
        """Add the text preceding ``token`` (if any) as a markdown cell.

        The text runs from ``start_line`` up to the first line of ``token``,
        or up to the end of the document when ``token`` is None.
        Nothing is added if the text contains only blank lines.
        """
        endline = token.map[0] if token else len(lines)
        md_source = _strip_blank_lines("\n".join(lines[start_line:endline]))
        meta = nbf.from_dict(md_metadata)
        if md_source:
            source_map.append(start_line)
            notebook.cells.append(
                nbf_version.new_markdown_cell(source=md_source, metadata=meta)
            )

    # iterate through the tokens to identify notebook cells
    nesting_level = 0
    md_metadata: dict = {}

    for token in tokens:
        nesting_level += token.nesting

        if nesting_level != 0:
            # we ignore fenced block that are nested, e.g. as part of lists, etc
            continue

        token_map = token.map or [0, 0]

        if token.type == "fence" and token.info.startswith(code_directive):
            _flush_markdown(md_start_line, token, md_metadata)
            options, body_lines = _read_fenced_cell(token, len(notebook.cells), "Code")
            # Parse :load: or load: tags and populate body with contents of file
            if "load" in options:
                body_lines = _load_code_from_file(
                    path, options["load"], token, body_lines
                )
            meta = nbf.from_dict(options)
            source_map.append(token_map[0] + 1)
            notebook.cells.append(
                nbf_version.new_code_cell(source="\n".join(body_lines), metadata=meta)
            )
            # the pending markdown metadata was consumed by the flush above
            md_metadata = {}
            md_start_line = token_map[1]

        elif token.type == "fence" and token.info.startswith(raw_directive):
            _flush_markdown(md_start_line, token, md_metadata)
            options, body_lines = _read_fenced_cell(token, len(notebook.cells), "Raw")
            meta = nbf.from_dict(options)
            source_map.append(token_map[0] + 1)
            notebook.cells.append(
                nbf_version.new_raw_cell(source="\n".join(body_lines), metadata=meta)
            )
            md_metadata = {}
            md_start_line = token_map[1]

        elif token.type == "myst_block_break":
            # a block break ends the current markdown cell, and may carry
            # the metadata for the markdown cell that follows it
            _flush_markdown(md_start_line, token, md_metadata)
            md_metadata = _read_cell_metadata(token, len(notebook.cells))
            md_start_line = token_map[1]

    # any text remaining after the last cell
    _flush_markdown(md_start_line, None, md_metadata)

    if add_source_map:
        notebook.metadata["source_map"] = source_map
    return notebook


class MystMetadataParsingError(Exception):
    """Raised when metadata in myst formatted text cannot be parsed."""


class _LoadFileParsingError(Exception):
    """Raised when a file to load into a code-block/code-cell cannot be used."""


def _strip_blank_lines(text: str) -> str:
    """Remove trailing whitespace and leading newlines from ``text``.

    Only newline characters are removed from the start, so any indentation
    of the first remaining line is kept.

    :param text: the text to strip.
    :returns: the stripped text (empty if ``text`` is only whitespace).
    """
    # ``lstrip`` removes the whole run of leading newlines in a single pass,
    # rather than re-slicing the string once per newline
    return text.rstrip().lstrip("\n")


class _MockDirective:
    """Minimal stand-in for a directive class.

    It is passed as the ``directive_class`` to ``parse_directive_text``
    in ``_read_fenced_cell``, and only defines class attributes.
    """

    has_content = True
    required_arguments = 0
    optional_arguments = 1
    option_spec = {"options": True}


def _read_fenced_cell(token, cell_index, cell_type):
    """Split the content of a fenced cell into its options and body.

    :param token: the fence token of the cell.
    :param cell_index: index of the cell (only used in the error message).
    :param cell_type: name of the cell type (only used in the error message).
    :returns: the ``(options, body)`` of the parsed directive text.
    :raises MystMetadataParsingError: if parsing produced any warnings;
        the first of the warnings is reported.
    """
    from myst_parser.parsers.directives import parse_directive_text

    parsed = parse_directive_text(
        directive_class=_MockDirective,
        first_line="",
        content=token.content,
        validate_options=False,
    )
    if not parsed.warnings:
        return parsed.options, parsed.body

    # report the line as 1-based
    line_number = token.map[0] + 1
    raise MystMetadataParsingError(
        "{} cell {} at line {} could not be read: {}".format(
            cell_type, cell_index, line_number, parsed.warnings[0]
        )
    )


def _read_cell_metadata(token, cell_index):
    """Read the metadata of a markdown cell from the content of a token.

    :param token: the token, whose ``content`` is either empty or a JSON object.
    :param cell_index: index of the cell (only used in error messages).
    :returns: the metadata dict (empty if the token has no content).
    :raises MystMetadataParsingError: if the content is not valid JSON,
        or is valid JSON but not an object.
    """
    if not token.content:
        return {}

    # 1-based line number for the error messages; a token without a source
    # map must not hide the actual parsing error behind a TypeError
    line_number = (token.map[0] if token.map else 0) + 1

    try:
        metadata = json.loads(token.content.strip())
    except json.JSONDecodeError as err:
        raise MystMetadataParsingError(
            f"Markdown cell {cell_index} at line {line_number} "
            f"could not be read: {err}"
        ) from err
    if not isinstance(metadata, dict):
        raise MystMetadataParsingError(
            f"Markdown cell {cell_index} at line {line_number} is not a dict"
        )

    return metadata


def _load_code_from_file(
    nb_path: None | str | Path, file_name: str, token, body_lines: list[str]
) -> list[str]:
    """Load the source code of a cell from a file.

    :param nb_path: path to the notebook; ``file_name`` is resolved
        relative to the directory containing it.
    :param file_name: the file to load, as given by the ``load`` option.
    :param token: the token of the cell (currently unused).
    :param body_lines: the lines in the body of the cell (currently unused,
        they are replaced by the content of the file).
    :returns: the content of the file, split on newlines.
    :raises _LoadFileParsingError: if ``nb_path`` is None,
        or the file cannot be read.
    """
    if nb_path is None:
        raise _LoadFileParsingError("path to notebook not supplied for :load:")
    file_path = Path(nb_path).parent.joinpath(file_name).resolve()
    # TODO warn when a non-empty ``body_lines`` is overwritten by the file;
    # this is not done here, since it would make the reader dependent on sphinx
    try:
        # read as utf8, like the notebook sources, rather than with the
        # platform dependent default encoding
        return file_path.read_text(encoding="utf8").split("\n")
    except (OSError, UnicodeError) as err:
        raise _LoadFileParsingError(
            f"Can't read file from :load: {file_path}"
        ) from err


class UnexpectedCellDirective(Directive):
    """Fallback for the ``{code-cell}`` and ``{raw-cell}`` directives.

    These directives are special cases, which are picked up by the
    MyST Markdown reader to convert them into notebook cells.

    If any are left in the parsed Markdown, it probably means that they were
    nested inside another directive, which is not allowed.

    Therefore, a warning is logged if this directive is triggered,
    and the directive is discarded.
    """

    has_content = True
    optional_arguments = 1
    final_argument_whitespace = True

    def run(self):
        """Log a warning about the unexpected directive, and return no nodes."""
        document = self.state.document
        # the sphinx logger is used when the settings carry an ``env``
        # (presumably only set when building with sphinx - TODO confirm)
        logger_cls = (
            SphinxDocLogger
            if hasattr(document.settings, "env")
            else DocutilsDocLogger
        )
        logger = logger_cls(document)  # type: ignore
        logger.warning(
            "Found an unexpected `code-cell` or `raw-cell` directive. "
            "Either this file was not converted to a notebook, "
            "because Jupytext header content was missing, "
            "or the `code-cell` was not converted, because it is nested. "
            "See https://myst-nb.readthedocs.io/en/latest/use/markdown.html "
            "for more information.",
            line=self.lineno,
            subtype="nbcell",
        )
        return []