-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathgenerate_sitemap.py
More file actions
executable file
·159 lines (124 loc) · 4.41 KB
/
generate_sitemap.py
File metadata and controls
executable file
·159 lines (124 loc) · 4.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/env python3
"""
Generate sitemap.xml and robots.txt for the MyST Markdown book.
Parses myst.yml to extract all pages and creates SEO files.
"""
import yaml
import os
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Any
def extract_pages_from_toc(toc: List[Dict[str, Any]], pages: List[str] = None) -> List[str]:
    """
    Recursively extract all page files from the table of contents.

    Args:
        toc: Table of contents structure from myst.yml
        pages: Accumulator list for pages (a fresh list is created when None,
            so the default is never shared between calls)

    Returns:
        List of page paths (without .md extension)
    """
    if pages is None:
        pages = []

    for item in toc:
        # Add the file if it exists
        if 'file' in item:
            page = item['file']
            # Strip only a trailing '.md' suffix. Using str.replace here would
            # remove '.md' anywhere in the path (e.g. 'notes.md.md' or a
            # directory named 'x.md/'), corrupting the generated URL.
            if page.endswith('.md'):
                page = page[:-3]
            pages.append(page)

        # Recursively process children, sharing the same accumulator
        if 'children' in item:
            extract_pages_from_toc(item['children'], pages)

    return pages
def generate_sitemap(base_url: str, output_path: str = '_build/html/sitemap.xml'):
    """
    Generate sitemap.xml file from the pages listed in myst.yml.

    Args:
        base_url: Base URL of the site (e.g., https://username.github.io/repo-name)
        output_path: Path where sitemap.xml will be written

    Raises:
        FileNotFoundError: if myst.yml is not present in the current directory.
    """
    # Local import keeps the module's top-level dependency block unchanged.
    from xml.sax.saxutils import escape

    # Read myst.yml; explicit encoding avoids platform-dependent defaults.
    with open('myst.yml', 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    # Extract pages from TOC
    toc = config.get('project', {}).get('toc', [])
    pages = extract_pages_from_toc(toc)

    # Ensure base_url doesn't end with /
    base_url = base_url.rstrip('/')

    # Get current date for lastmod
    today = datetime.now().strftime('%Y-%m-%d')

    # Generate sitemap XML
    sitemap_lines = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    ]

    # Add each page to sitemap
    for page in pages:
        # Convert page path to URL:
        #   index.md      -> /
        #   chapter_01.md -> /chapter_01.html
        if page == 'index':
            url = f'{base_url}/'
        else:
            url = f'{base_url}/{page}.html'
        # escape() entity-encodes &, < and > so a URL containing e.g. a query
        # string with '&' still yields well-formed XML, as the sitemap
        # protocol requires.
        sitemap_lines.extend([
            ' <url>',
            f' <loc>{escape(url)}</loc>',
            f' <lastmod>{today}</lastmod>',
            ' <changefreq>weekly</changefreq>',
            ' <priority>0.8</priority>',
            ' </url>',
        ])

    sitemap_lines.append('</urlset>')

    # Create output directory if it doesn't exist
    output_dir = Path(output_path).parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # Write sitemap.xml with explicit encoding
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(sitemap_lines))

    print(f'✓ Generated sitemap.xml with {len(pages)} pages')
    print(f'✓ Output: {output_path}')
    print(f'✓ Base URL: {base_url}')
def generate_robots_txt(base_url: str, output_path: str = '_build/html/robots.txt'):
    """
    Generate robots.txt file that allows all crawlers and points at the sitemap.

    Args:
        base_url: Base URL of the site (e.g., https://username.github.io/repo-name)
        output_path: Path where robots.txt will be written
    """
    # Ensure base_url doesn't end with / so the Sitemap URL has no '//'
    base_url = base_url.rstrip('/')

    # Generate robots.txt content
    robots_lines = [
        '# robots.txt for Learn Probability',
        '',
        'User-agent: *',
        'Allow: /',
        '',
        f'Sitemap: {base_url}/sitemap.xml',
    ]

    # Create output directory if it doesn't exist
    output_dir = Path(output_path).parent
    output_dir.mkdir(parents=True, exist_ok=True)

    # Write robots.txt; explicit encoding avoids platform-dependent defaults
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(robots_lines))

    # Plain string here — the original used an f-string with no placeholder
    print('✓ Generated robots.txt')
    print(f'✓ Output: {output_path}')
    print(f'✓ Sitemap URL: {base_url}/sitemap.xml')
def main():
    """Resolve the site base URL from the environment and emit SEO files."""
    # Precedence (lowest to highest):
    #   1. URL built from GITHUB_REPOSITORY's repo name
    #   2. GITHUB_PAGES_URL, when set
    #   3. BASE_URL_FULL override
    repo_name = os.environ.get('GITHUB_REPOSITORY', 'learn_probability_fork').split('/')[-1]
    base_url = f'https://snowch.github.io/{repo_name}'

    pages_url = os.environ.get('GITHUB_PAGES_URL', '')
    if pages_url:
        # A full GitHub Pages URL was provided; normalize the trailing slash.
        base_url = pages_url.rstrip('/')

    # Explicit override always wins.
    base_url = os.environ.get('BASE_URL_FULL', base_url)

    generate_sitemap(base_url)
    generate_robots_txt(base_url)


if __name__ == '__main__':
    main()