# meilisearch-docs-scraper.yml
name: Meilisearch Docs Scraper

# Triggers: the workflow can be started by hand or runs on a daily schedule.
on:
  # Manual trigger
  workflow_dispatch:
  # Scheduled run - every day at 03:00 UTC (GitHub evaluates cron in UTC)
  schedule:
    - cron: '0 3 * * *'
jobs:
  # Crawls https://semgrep.dev/docs/ with meilisearch/docs-scraper, indexes the
  # result into the `semgrep_docs_2` Meilisearch index, then (when an OpenAI key
  # is available) configures an embedder on that index.
  scrape:
    runs-on: ubuntu-latest
    # Nothing in this job writes to the repository, so restrict GITHUB_TOKEN
    # to read-only repository contents.
    permissions:
      contents: read
    steps:
      - name: Check out code
        uses: actions/checkout@v4
        with:
          # Later steps execute third-party code inside the workspace; do not
          # leave the token behind in the local git config.
          persist-credentials: false

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          # Quoted so YAML keeps it a string rather than a float.
          python-version: '3.11'

      - name: Install docs-scraper
        # NOTE(review): this clones the default branch HEAD, so each run may
        # pick up new upstream code - consider pinning to a release tag or SHA.
        run: |
          pip install pipenv
          # History is not needed to run the scraper; a shallow clone is enough.
          git clone --depth 1 https://github.com/meilisearch/docs-scraper.git
          cd docs-scraper
          pipenv install

      - name: Create scraper config
        # The heredoc delimiter is quoted ('EOF') so the shell writes the JSON
        # verbatim, without expanding `$` or backslashes.
        run: |
          cat > docs-scraper/config.json << 'EOF'
          {
            "index_uid": "semgrep_docs_2",
            "start_urls": [
              "https://semgrep.dev/docs/"
            ],
            "sitemap_urls": [
              "https://semgrep.dev/docs/sitemap.xml"
            ],
            "stop_urls": [
              "https://semgrep.dev/docs/tags/.*",
              "https://semgrep.dev/docs/category/.*"
            ],
            "selectors": {
              "default": {
                "lvl0": {
                  "selector": ".breadcrumbs > li:nth-child(2) span.breadcrumbs__link",
                  "global": true,
                  "default_value": "Semgrep documentation"
                },
                "lvl1": "article h1",
                "lvl2": "article h2",
                "lvl3": "article h3",
                "lvl4": "article h4",
                "lvl5": "article h5, article td:first-child",
                "lvl6": "article h6",
                "text": "article p, article li, article td:last-child, article code, article div table td, article div table th"
              }
            },
            "strip_chars": " .,;:#()",
            "scrape_start_urls": true,
            "custom_settings": {
              "synonyms": {
                "autofix": ["autofix", "automatic fixes", "remediation", "code fixes"],
                "ci": ["ci", "continuous integration", "pipeline", "github actions", "gitlab ci", "automation"],
                "config": ["config", "configuration", "settings", "setup"],
                "taint mode": ["taint", "taint mode", "taint tracking"],
                "install": ["install", "setup", "set up", "installation", "configure", "configuration"],
                "setup": ["setup", "set up", "install", "installation", "configure", "configuration"]
              },
              "stopWords": [
                "what", "is", "are", "how", "to", "the", "a", "an", "do", "does", "can", "i", "my"
              ]
            }
          }
          EOF

      - name: Run docs-scraper
        env:
          MEILISEARCH_HOST_URL: ${{ secrets.MEILISEARCH_HOST_URL }}
          MEILISEARCH_API_KEY: ${{ secrets.MEILISEARCH_API_KEY }}
        run: |
          cd docs-scraper
          pipenv run ./docs_scraper config.json

      - name: Apply additional settings (embeddings)
        env:
          MEILISEARCH_HOST_URL: ${{ secrets.MEILISEARCH_HOST_URL }}
          MEILISEARCH_API_KEY: ${{ secrets.MEILISEARCH_API_KEY }}
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        run: |
          echo "Applying embedder settings..."
          # The embedder is optional: without a key the step succeeds as a no-op.
          if [ -z "$OPENAI_API_KEY" ]; then
            echo "WARNING: OPENAI_API_KEY is not set. Skipping embedder configuration."
            exit 0
          fi

          # Read the index UID from the config written earlier so it is defined
          # in exactly one place.
          index_uid=$(jq -r '.index_uid' docs-scraper/config.json)

          # Build the request body with jq so the API key is JSON-escaped
          # instead of being spliced into a hand-quoted shell string.
          payload=$(jq -nc --arg api_key "$OPENAI_API_KEY" '{
            embedders: {
              default: {
                source: "openAi",
                model: "text-embedding-3-small",
                apiKey: $api_key,
                dimensions: 1536,
                documentTemplate: "{% for field in fields %}{% if field.is_searchable and field.value != nil %}{{ field.name }}: {{ field.value }}\n{% endif %}{% endfor %}"
              }
            }
          }')

          # --silent/--show-error keep the progress meter out of the log while
          # still reporting transport errors; --max-time bounds a hung request;
          # -w appends the HTTP status on its own final line. A single trailing
          # slash on the host URL is stripped to avoid a `//indexes` path.
          response=$(curl --silent --show-error --max-time 60 \
            -w "\n%{http_code}" -X PATCH \
            "${MEILISEARCH_HOST_URL%/}/indexes/${index_uid}/settings" \
            -H "Authorization: Bearer ${MEILISEARCH_API_KEY}" \
            -H "Content-Type: application/json" \
            --data "$payload")
          http_code=$(echo "$response" | tail -n1)
          echo "Embedder settings response code: $http_code"
          # Meilisearch applies settings asynchronously and answers 202 once the
          # task is enqueued; anything else is treated as a failure.
          if [ "$http_code" != "202" ]; then
            echo "Failed to apply embedder settings"
            echo "$response"
            exit 1
          fi
          echo "✅ Scraping complete! Documents indexed successfully."