Skip to content

Commit 87cd51b

Browse files
authored
Docs to knowledge example (#2110)
1 parent 4c4f1ad commit 87cd51b

14 files changed

Lines changed: 1053 additions & 67 deletions

File tree

docs/src/content/example-posts/docs-to-knowledge-graph.md

Lines changed: 369 additions & 0 deletions
Large diffs are not rendered by default.

docs/src/data/docs-meta.json

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,10 @@
105105
"sourcePath": "contributing/setup_dev_environment.md",
106106
"reviewedTs": 1780444800
107107
},
108+
"examples/docs-to-knowledge-graph": {
109+
"sourcePath": "example-posts/docs-to-knowledge-graph.md",
110+
"reviewedTs": 1781265600
111+
},
108112
"examples/index-codebase": {
109113
"sourcePath": "example-posts/index-codebase.md",
110114
"reviewedTs": 1781092800

docs/src/data/examples.ts

Lines changed: 116 additions & 57 deletions
Large diffs are not rendered by default.

docs/src/pages/examples/[slug].astro

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -104,7 +104,7 @@ const introItems = [
104104
{ k: 'Target', v: tagOf('tgt'), reviewed: false },
105105
{ k: 'LLM', v: tagOf('llm'), reviewed: false },
106106
{ k: 'Mode', v: tagOf('ops'), reviewed: false },
107-
{ k: 'Category', v: ex.category, reviewed: false },
107+
{ k: 'Category', v: `${titleText(CATEGORY_META[ex.category].label)}${CATEGORY_META[ex.category].em ?? ''}`, reviewed: false },
108108
{ k: 'Last reviewed', v: lastReviewed, reviewed: true },
109109
].filter((m) => m.v);
110110
---

docs/src/pages/examples/index.astro

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,10 @@ const featured = findExample(featuredSlug)!;
3333
// Category order and counts, computed from the data.
3434
const CAT_ORDER: Category[] = ['search', 'ingest', 'llm', 'agents', 'image'];
3535
const catCount = (c: Category) => examples.filter((e) => e.category === c).length;
36+
// Categories with no examples yet (e.g. 'image') stay out of the chips and
37+
// side-nav — their grid sections already render nothing, so a chip or anchor
38+
// would point nowhere.
39+
const activeCats = CAT_ORDER.filter((c) => catCount(c) > 0);
3640
3741
// Phone drawer mirror of the inline `.side-nav` browse aside. The aside
3842
// becomes a horizontal flex pill row at ≤1180px which is fine on tablet,
@@ -157,7 +161,7 @@ const breadcrumbLd = {
157161

158162
<div class="hero-filters" id="filters">
159163
<button class="filter-chip on" data-filter="all">All <span class="cnt">{total}</span></button>
160-
{CAT_ORDER.map((c) => (
164+
{activeCats.map((c) => (
161165
<button class="filter-chip" data-filter={c}>
162166
{CATEGORY_META[c].label}{CATEGORY_META[c].em ?? ''} <span class="cnt">{catCount(c)}</span>
163167
</button>
@@ -177,7 +181,7 @@ const breadcrumbLd = {
177181
</div>
178182
<ol id="sideList">
179183
<li data-target="section.feat"><a href="#feat-scroll" data-cat="featured"><span class="cat-dot"></span>Featured<span class="count">★</span></a></li>
180-
{CAT_ORDER.map((c) => (
184+
{activeCats.map((c) => (
181185
<li data-target={`#cat-${c}`}>
182186
<a href={`#cat-${c}`} data-cat={c}>
183187
<span class="cat-dot"></span>

docs/src/pages/llms.txt.ts

Lines changed: 12 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ import {
1111
SKILL_MD_URL,
1212
} from '../consts';
1313
import { sidebar, type SidebarDoc } from '../data/docs-sidebar';
14-
import { EXAMPLE_CATALOG } from '../data/examples';
14+
import { EXAMPLE_CATALOG, EXAMPLE_CATALOG_GROUPS } from '../data/examples';
1515
const oneLine = (s?: string) => (s ?? '').replace(/\s+/g, ' ').trim();
1616

1717
export const GET: APIRoute = async () => {
@@ -71,13 +71,18 @@ export const GET: APIRoute = async () => {
7171
'`, then `cd cocoindex/examples/<dir>`, copy `.env.example` if present, install with `pip install -e .`, and run the command shown for that example.',
7272
);
7373
out.push('');
74-
for (const ex of EXAMPLE_CATALOG) {
75-
const href = ex.docs
76-
? url(`examples/${ex.docs}`)
77-
: `${GITHUB_REPO}/tree/main/examples/${ex.dir}`;
78-
out.push(`- [${ex.title}](${href}): ${oneLine(ex.description)} (examples/${ex.dir}; run: \`${ex.run ?? 'cocoindex update main'}\`)`);
74+
for (const group of EXAMPLE_CATALOG_GROUPS) {
75+
out.push(`### ${group.title}`);
76+
out.push(`> ${group.blurb}`);
77+
out.push('');
78+
for (const ex of group.entries) {
79+
const href = ex.docs
80+
? url(`examples/${ex.docs}`)
81+
: `${GITHUB_REPO}/tree/main/examples/${ex.dir}`;
82+
out.push(`- [${ex.title}](${href}): ${oneLine(ex.description)} (examples/${ex.dir}; run: \`${ex.run ?? 'cocoindex update main'}\`)`);
83+
}
84+
out.push('');
7985
}
80-
out.push('');
8186

8287
return new Response(out.join('\n'), {
8388
headers: { 'Content-Type': 'text/plain; charset=utf-8' },
Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Example environment variables for this example
2+
# Copy this to .env and fill in your actual values
3+
4+
COCOINDEX_DB=./cocoindex.db
5+
6+
# OpenAI API key (used via LiteLLM). Not needed if LLM_MODEL points at a local
7+
# provider such as Ollama.
8+
#! PLEASE FILL IN
9+
OPENAI_API_KEY=
10+
11+
# Neo4j connection
12+
NEO4J_URI=bolt://localhost:7687
13+
NEO4J_USER=neo4j
14+
NEO4J_PASSWORD=cocoindex
15+
NEO4J_DATABASE=neo4j
16+
17+
# LLM model (LiteLLM-prefixed; e.g. openai/gpt-5.4, anthropic/claude-...,
18+
# gemini/..., or ollama/llama3.2 to run locally with no API key).
19+
LLM_MODEL=openai/gpt-5.4
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
.env
2+
cocoindex.db/
3+
__pycache__/
4+
*.egg-info/
5+
.venv/
Lines changed: 119 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,119 @@
1+
# Build a Knowledge Graph for Docs — Neo4j (CocoIndex v1)
2+
3+
Turn a folder of Markdown documentation into a concept knowledge graph in
4+
[Neo4j](https://neo4j.com/). For each document an LLM (via
5+
[LiteLLM](https://docs.litellm.ai/) + [instructor](https://python.useinstructor.com/))
6+
produces a short summary and a set of `(subject, predicate, object)` triples
7+
about the concepts it covers — "concepts, not code" — and the triples become a
8+
property graph.
9+
10+
This is the CocoIndex **v1** port of the blog post
11+
[Build a Knowledge Graph for Documents](https://cocoindex.io/blogs/knowledge-graph-for-docs/).
12+
13+
Please drop [CocoIndex on GitHub](https://github.com/cocoindex-io/cocoindex) a
14+
star to support us and stay tuned for more updates. Thank you so much 🥥🤗.
15+
[![GitHub](https://img.shields.io/github/stars/cocoindex-io/cocoindex?color=5B5BD6)](https://github.com/cocoindex-io/cocoindex)
16+
17+
## What this builds
18+
19+
- `Document` nodes — one per Markdown file, keyed by filename, with an
20+
LLM-generated `title` and `summary`
21+
- `Entity` nodes — one per distinct concept named in a triple, keyed by `value`
22+
- Relationships:
23+
- `RELATIONSHIP` — `Entity → Entity`, with the `predicate` stored on the edge
24+
- `MENTION` — `Document → Entity`, recording which document named which concept
25+
26+
The flow watches the source folder and keeps the graph up to date
27+
incrementally.
28+
29+
## How it works
30+
31+
The pipeline runs in two phases:
32+
33+
1. **Per-file extraction.** Read each Markdown file, extract a `DocumentSummary`
34+
(title + summary) and a list of relationship triples with LiteLLM +
35+
instructor. The `Document` node is declared in this phase; the triples are
36+
carried forward.
37+
2. **Graph building.** A single pass declares the deduplicated `Entity` nodes
38+
and the `RELATIONSHIP` / `MENTION` edges across all documents. Each distinct
39+
triple is keyed by a stable hash, so re-asserting the same fact in another
40+
doc maps to the same edge.
41+
42+
CocoIndex reconciles changes incrementally — re-running after editing one doc
43+
only re-extracts that doc, and the graph pass only re-runs when the set of
44+
triples changes. To collapse near-identical entity names (e.g. "CocoIndex" vs
45+
"Cocoindex"), add an entity-resolution pass like the one in
46+
[`meeting_notes_graph_neo4j`](../meeting_notes_graph_neo4j).
47+
48+
## Prerequisites
49+
50+
- A running Neo4j 5.18+ instance:
51+
```sh
52+
docker run -d \
53+
-p 7474:7474 -p 7687:7687 \
54+
-e NEO4J_AUTH=neo4j/cocoindex \
55+
--name cocoindex-neo4j \
56+
neo4j:5.26-community
57+
```
58+
The browser UI is at <http://localhost:7474>; log in with `neo4j` /
59+
`cocoindex`.
60+
61+
- An LLM. Defaults to OpenAI (set `OPENAI_API_KEY`); set `LLM_MODEL` to any
62+
[LiteLLM provider](https://docs.litellm.ai/docs/providers) — e.g.
63+
`LLM_MODEL=ollama/llama3.2` to run the extraction locally with no API key.
64+
65+
## Environment
66+
67+
Copy `.env.example` to `.env` and fill in the blanks:
68+
69+
```sh
70+
cp .env.example .env
71+
set -a && source .env && set +a
72+
```
73+
74+
## Run
75+
76+
Install dependencies:
77+
78+
```sh
79+
uv pip install -e .
80+
```
81+
82+
This example ships a small `markdown_files/` folder of sample concept docs so it
83+
runs out of the box. Build/update the graph:
84+
85+
```sh
86+
cocoindex update main
87+
```
88+
89+
To index your own docs, drop `.md` / `.mdx` files into `markdown_files/` (or
90+
point `sourcedir` in `main.py` at another directory — e.g. CocoIndex's own
91+
`docs/`) and re-run.
92+
93+
## Browse the knowledge graph
94+
95+
Open Neo4j Browser at <http://localhost:7474>, log in, and run Cypher queries:
96+
97+
```cypher
98+
// Everything
99+
MATCH p=()-->() RETURN p LIMIT 200
100+
101+
// Concept-to-concept relationships
102+
MATCH (a:Entity)-[r:RELATIONSHIP]->(b:Entity)
103+
RETURN a.value, r.predicate, b.value
104+
105+
// Which documents mention which concepts
106+
MATCH (d:Document)-[:MENTION]->(e:Entity)
107+
RETURN d.filename, d.title, e.value
108+
109+
// Concepts mentioned in the most documents
110+
MATCH (d:Document)-[:MENTION]->(e:Entity)
111+
RETURN e.value, count(DISTINCT d) AS docs
112+
ORDER BY docs DESC LIMIT 10
113+
```
114+
115+
To wipe the graph between runs:
116+
117+
```cypher
118+
MATCH (n) DETACH DELETE n
119+
```

0 commit comments

Comments
 (0)