llms-txt/llms_txt/core.py at 27fcf6362f33e448a7108e2f88168b770c001c03 · AnswerDotAI/llms-txt · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""Source code for `llms_txt` Python module, containing helpers to create and use llms.txt files"""

# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/01_core.ipynb.

# %% auto 0
# Names bound by a star-import of this module; the underscore-prefixed helpers below are not listed.
__all__ = ['opt_re', 'named_re', 'search', 'parse_link', 'parse_llms_file', 'get_doc_content', 'mk_ctx', 'get_sizes',
           'create_ctx', 'llms_txt2ctx']

# %% ../nbs/01_core.ipynb
import re

# %% ../nbs/01_core.ipynb
from fastcore.utils import *
from fastcore.xml import *
from fastcore.script import *
import httpx
from urllib.parse import urlparse
from nbdev.config import *

# %% ../nbs/01_core.ipynb
def opt_re(s):
    "Wrap pattern `s` in a non-capturing group that may match zero or one time"
    return '(?:{})?'.format(s)

def named_re(nm, pat):
    "Wrap pattern `pat` in a capture group named `nm`"
    return '(?P<{}>{})'.format(nm, pat)

def search(pat, txt, flags=0):
    "Named groups of the first match of `pat` in `txt` as a dict, or `None` when nothing matches"
    found = re.search(pat, txt, flags=flags)
    if not found: return None
    return found.groupdict()

# %% ../nbs/01_core.ipynb
def parse_link(txt):
    "Extract `title`, `url` and (possibly `None`) `desc` from a `- [title](url): desc` list item in `txt`"
    title_grp = named_re('title', r'[^\]]+')
    url_grp = named_re('url', r'[^\)]+')
    desc_grp = named_re('desc', r'.*')
    # The ": description" tail is optional, so `desc` is None for a bare link
    link_pat = fr'-\s*\[{title_grp}\]\({url_grp}\)' + opt_re(fr":\s*{desc_grp}")
    found = re.search(link_pat, txt)
    return found.groupdict()

# %% ../nbs/01_core.ipynb
def _parse_links(links):
    "Run `parse_link` over each non-blank line of `links` and collect the results in a list"
    parsed = []
    for line in re.split(r'\n+', links.strip()):
        if not line.strip(): continue
        parsed.append(parse_link(line))
    return parsed

# %% ../nbs/01_core.ipynb
def _parse_llms(txt):
    "Split `txt` at `##` heading lines; return the stripped leading text and a dict of heading -> parsed links"
    # The capture group makes `re.split` interleave results: [leading, heading1, body1, heading2, body2, ...]
    parts = re.split(r'^##\s*(.*?$)', txt, flags=re.MULTILINE)
    leading, headings, bodies = parts[0], parts[1::2], parts[2::2]
    # Pair headings with bodies first, so a repeated heading keeps only its last body
    body_by_heading = dict(zip(headings, bodies))
    sects = {}
    for heading, body in body_by_heading.items(): sects[heading] = _parse_links(body)
    return leading.strip(), sects

# %% ../nbs/01_core.ipynb
def parse_llms_file(txt):
    "Parse llms.txt file contents in `txt` to an `AttrDict` with `title`, `summary`, `info` and `sections`"
    # `start` is everything before the first `##` heading; `sects` maps each heading to its parsed links
    start,sects = _parse_llms(txt)
    title = named_re('title', r'.+?$')
    summ = named_re('summary', '.+?$')
    summ_pat = opt_re(fr"^>\s*{summ}$")
    info = named_re('info', '.*')
    # Only the H1 line is mandatory. Both newline runs are `\n*` rather than `\n+` so that a file with
    # just a title, or a title and blockquote but no details, still parses: `summary` is then None
    # and/or `info` is ''. It also stops the lazy title from backtracking across several lines.
    pat = fr'^#\s*{title}\n*{summ_pat}\n*{info}'
    # MULTILINE lets `^`/`$` anchor on each line; DOTALL lets `info` run across the remaining lines
    d = search(pat, start, (re.MULTILINE|re.DOTALL))
    if d is None: raise ValueError("No H1 title line (`# Title`) found before the first H2 section")
    d['sections'] = sects
    return dict2obj(d)

# %% ../nbs/01_core.ipynb
from fastcore.xml import Sections,Project,Doc

# %% ../nbs/01_core.ipynb
def _local_docs_pth(cfg):
    "Path under `cfg.config_path` of the `_proc` sub-directory named like `cfg.doc_path`"
    proc_root = cfg.config_path / '_proc'
    return proc_root / cfg.doc_path.name

def get_doc_content(url):
    "Text for `url`: read from the local nbdev docs directory when a matching file exists, else fetched with `httpx`"
    cfg = get_config()
    served_by_this_repo = is_nbdev() and url.startswith(cfg.doc_host)
    if served_by_this_repo:
        # Map the URL path onto the local docs directory
        rel = urlparse(url).path.lstrip('/')
        candidate = _local_docs_pth(cfg) / rel
        if candidate.exists(): return candidate.read_text()
    # No local copy available, so fall back to the network
    response = httpx.get(url)
    return response.text

# %% ../nbs/01_core.ipynb
def _doc(kw):
    "Create a `Doc` FT object with the text retrieved from `url` as the child, and the other items of `kw` as attrs."
    # Work on a copy: `kw` is the caller's link dict, and popping 'url' from it in place
    # would make any later pass over the same parsed data fail with KeyError.
    attrs = dict(kw)
    url = attrs.pop('url')
    txt = get_doc_content(url)
    re_comment = re.compile('^<!--.*-->$', flags=re.MULTILINE)
    re_base64_img = re.compile(r'<img[^>]*src="data:image/[^"]*"[^>]*>')
    # Drop lines that are entirely an HTML comment, and lines containing an inline base64 image
    kept = [o for o in txt.splitlines() if not (re_comment.search(o) or re_base64_img.search(o))]
    return Doc('\n'.join(kept), **attrs)

# %% ../nbs/01_core.ipynb
def _section(nm, items, n_workers=None):
    "Build an FT element tagged `nm` whose children are the `_doc` result for each of `items`"
    # `threadpool=True` asks `parallel` to use threads for the per-item `_doc` calls
    docs = parallel(_doc, items, n_workers=n_workers, threadpool=True)
    return ft(nm, *docs)

# %% ../nbs/01_core.ipynb
def mk_ctx(d, optional=True, n_workers=None):
    "Build a `Project` from parsed data `d` with one section per entry of `d.sections`; 'Optional' is left out unless `optional`."
    # With `optional` set the excluded name is '', so only a section with an empty name is dropped
    excluded = 'Optional' if not optional else ''
    sections = []
    for name, links in d.sections.items():
        if name == excluded: continue
        sections.append(_section(name, links, n_workers=n_workers))
    project = Project(title=d.title, summary=d.summary)
    return project(d.info, *sections)

# %% ../nbs/01_core.ipynb
def get_sizes(ctx):
    "Map each tagged child of `ctx` to a dict of `title` -> length of the first child, for each of its children"
    sizes = {}
    for sect in ctx.children:
        # Children without a `tag` attribute (e.g. plain text) are not sections
        if not hasattr(sect, 'tag'): continue
        doc_lens = {}
        for doc in sect.children: doc_lens[doc.title] = len(doc.children[0])
        sizes[sect.tag] = doc_lens
    return sizes

# %% ../nbs/01_core.ipynb
def create_ctx(txt, optional=False, n_workers=None):
    "Parse llms.txt contents `txt`, build its `Project` with `mk_ctx`, and render it with `to_xml` (escaping disabled)."
    return to_xml(
        mk_ctx(parse_llms_file(txt), optional=optional, n_workers=n_workers),
        do_escape=False)

# %% ../nbs/01_core.ipynb
@call_parse
def llms_txt2ctx(
    fname:str, # File name to read
    optional:bool_arg=False, # Include 'optional' section?
    n_workers:int=None, # Number of threads to use for parallel downloading
    save_nbdev_fname:str=None #save output to nbdev `{docs_path}` instead of emitting to stdout
):
    "Print a `Project` with a `Section` for each H2 part in file read from `fname`, optionally skipping the 'optional' section."
    llms_src = Path(fname).read_text()
    ctx = create_ctx(llms_src, optional=optional, n_workers=n_workers)
    # Default: emit the rendered context on stdout
    if not save_nbdev_fname:
        print(ctx)
        return
    # Otherwise write it under the local docs directory instead
    out_path = _local_docs_pth(get_config()) / save_nbdev_fname
    out_path.mk_write(ctx)