Skip to content

Commit a98ed4f

Browse files
committed
fix: TXT and MD chunking
1 parent b026180 commit a98ed4f

File tree

9 files changed

+131
-39
lines changed

9 files changed

+131
-39
lines changed

README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -516,7 +516,10 @@ Zero-config by default (uses `auto` mode). Customize in `.opencode/codebase-inde
516516
"autoGc": true, // Auto garbage collection
517517
"gcIntervalDays": 7, // GC interval (days)
518518
"gcOrphanThreshold": 100, // GC trigger threshold
519-
"requireProjectMarker": true // Require .git/package.json to index
519+
"requireProjectMarker": true, // Require .git/package.json to index
520+
"maxDepth": 5, // Max directory depth (-1=unlimited, 0=root only)
521+
"maxFilesPerDirectory": 100, // Max files per directory (smallest first)
522+
"fallbackToTextOnMaxChunks": true // Fall back to text chunking when maxChunksPerFile is exceeded
520523
},
521524

522525
// === Search ===
@@ -590,6 +593,9 @@ String values in `codebase-index.json` can reference environment variables with
590593
| `gcIntervalDays` | `7` | Run GC on initialization if last GC was more than N days ago |
591594
| `gcOrphanThreshold` | `100` | Run GC after indexing if orphan count exceeds this threshold |
592595
| `requireProjectMarker` | `true` | Require a project marker (`.git`, `package.json`, etc.) to enable file watching and auto-indexing. Prevents accidentally indexing large directories like home. Set to `false` to index any directory. |
596+
| `maxDepth` | `5` | Max directory traversal depth. `-1` = unlimited, `0` = only files in root dir, `1` = one level of subdirectories, etc. |
597+
| `maxFilesPerDirectory` | `100` | Max files to index per directory. Always picks the smallest files first. |
598+
| `fallbackToTextOnMaxChunks` | `true` | When a file exceeds `maxChunksPerFile`, fall back to text-based (line-by-line) chunking instead of skipping the rest of the file. |
593599
| **search** | | |
594600
| `maxResults` | `20` | Maximum results to return |
595601
| `minScore` | `0.1` | Minimum similarity score (0-1). Lower = more results |

native/src/lib.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,12 @@ pub fn parse_file(file_path: String, content: String) -> Result<Vec<CodeChunk>>
2525
parser::parse_file_internal(&file_path, &content).map_err(|e| Error::from_reason(e.to_string()))
2626
}
2727

28+
#[napi]
29+
pub fn parse_file_as_text(file_path: String, content: String) -> Result<Vec<CodeChunk>> {
30+
parser::parse_file_as_text_internal(&file_path, &content)
31+
.map_err(|e| Error::from_reason(e.to_string()))
32+
}
33+
2834
#[napi]
2935
pub fn parse_files(files: Vec<FileInput>) -> Result<Vec<ParsedFile>> {
3036
parser::parse_files_parallel(files).map_err(|e| Error::from_reason(e.to_string()))

native/src/parser.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,16 @@ pub fn parse_file_internal(file_path: &str, content: &str) -> Result<Vec<CodeChu
5858
extract_chunks(&tree, content, &language)
5959
}
6060

61+
pub fn parse_file_as_text_internal(file_path: &str, content: &str) -> Result<Vec<CodeChunk>> {
62+
let ext = Path::new(file_path)
63+
.extension()
64+
.and_then(|e| e.to_str())
65+
.unwrap_or("");
66+
67+
let language = Language::from_extension(ext);
68+
Ok(chunk_by_lines(content, &language))
69+
}
70+
6171
pub fn parse_files_parallel(files: Vec<FileInput>) -> Result<Vec<ParsedFile>> {
6272
let results: Vec<ParsedFile> = files
6373
.par_iter()

package-lock.json

Lines changed: 12 additions & 15 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,7 @@
7070
]
7171
},
7272
"dependencies": {
73+
"@opencode-ai/plugin": "~1.3.13",
7374
"chokidar": "^5.0.0",
7475
"ignore": "^7.0.5",
7576
"p-queue": "^9.1.1",
@@ -80,7 +81,6 @@
8081
"@eslint/js": "^9.39.4",
8182
"@modelcontextprotocol/sdk": "^1.29.0",
8283
"@napi-rs/cli": "^3.6.0",
83-
"@opencode-ai/plugin": "^1.3.13",
8484
"@types/node": "^25.5.2",
8585
"@vitest/coverage-v8": "^4.1.2",
8686
"eslint": "^9.39.4",
@@ -106,4 +106,4 @@
106106
"optional": true
107107
}
108108
}
109-
}
109+
}

src/config/schema.ts

Lines changed: 23 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,13 +16,26 @@ export interface IndexingConfig {
1616
autoGc: boolean;
1717
gcIntervalDays: number;
1818
gcOrphanThreshold: number;
19-
/**
20-
* When true (default), requires a project marker (.git, package.json, Cargo.toml, etc.)
19+
/**
20+
* When true (default), requires a project marker (.git, package.json, Cargo.toml, etc.)
2121
* to be present before enabling file watching and auto-indexing.
22-
* This prevents accidentally watching/indexing large non-project directories like home.
23-
* Set to false to allow indexing any directory.
2422
*/
2523
requireProjectMarker: boolean;
24+
/**
25+
* Max directory traversal depth. -1 = unlimited, 0 = only files in the root dir,
26+
* 1 = one level of subdirectories, etc. Default: 5
27+
*/
28+
maxDepth: number;
29+
/**
30+
* Max number of files to index per directory. Always picks the smallest files first.
31+
* Default: 100
32+
*/
33+
maxFilesPerDirectory: number;
34+
/**
35+
* When a file exceeds maxChunksPerFile, fall back to text-based (chunk_by_lines) parsing
36+
* instead of skipping the rest of the file. Default: true
37+
*/
38+
fallbackToTextOnMaxChunks: boolean;
2639
}
2740

2841
export interface SearchConfig {
@@ -128,6 +141,9 @@ function getDefaultIndexingConfig(): IndexingConfig {
128141
gcIntervalDays: 7,
129142
gcOrphanThreshold: 100,
130143
requireProjectMarker: true,
144+
maxDepth: 5,
145+
maxFilesPerDirectory: 100,
146+
fallbackToTextOnMaxChunks: true,
131147
};
132148
}
133149

@@ -237,6 +253,9 @@ export function parseConfig(raw: unknown): ParsedCodebaseIndexConfig {
237253
gcIntervalDays: typeof rawIndexing.gcIntervalDays === "number" ? Math.max(1, rawIndexing.gcIntervalDays) : defaultIndexing.gcIntervalDays,
238254
gcOrphanThreshold: typeof rawIndexing.gcOrphanThreshold === "number" ? Math.max(0, rawIndexing.gcOrphanThreshold) : defaultIndexing.gcOrphanThreshold,
239255
requireProjectMarker: typeof rawIndexing.requireProjectMarker === "boolean" ? rawIndexing.requireProjectMarker : defaultIndexing.requireProjectMarker,
256+
maxDepth: typeof rawIndexing.maxDepth === "number" ? (rawIndexing.maxDepth < -1 ? -1 : rawIndexing.maxDepth) : defaultIndexing.maxDepth,
257+
maxFilesPerDirectory: typeof rawIndexing.maxFilesPerDirectory === "number" ? Math.max(1, rawIndexing.maxFilesPerDirectory) : defaultIndexing.maxFilesPerDirectory,
258+
fallbackToTextOnMaxChunks: typeof rawIndexing.fallbackToTextOnMaxChunks === "boolean" ? rawIndexing.fallbackToTextOnMaxChunks : defaultIndexing.fallbackToTextOnMaxChunks,
240259
};
241260

242261
const rawSearch = (input.search && typeof input.search === "object" ? input.search : {}) as Record<string, unknown>;

src/indexer/index.ts

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ import {
2929
hashFile,
3030
hashContent,
3131
extractCalls,
32+
parseFileAsText,
3233
} from "../native/index.js";
3334
import type { SymbolData, CallEdgeData } from "../native/index.js";
3435
import { getBranchOrDefault, getBaseBranch, isGitRepo } from "../git/index.js";
@@ -1736,7 +1737,8 @@ export class Indexer {
17361737
includePatterns,
17371738
this.config.exclude,
17381739
this.config.indexing.maxFileSize,
1739-
this.config.knowledgeBases
1740+
this.config.knowledgeBases,
1741+
{ maxDepth: this.config.indexing.maxDepth, maxFilesPerDirectory: this.config.indexing.maxFilesPerDirectory }
17401742
);
17411743

17421744
return createCostEstimate(files, configuredProviderInfo);
@@ -1786,7 +1788,8 @@ export class Indexer {
17861788
includePatterns,
17871789
this.config.exclude,
17881790
this.config.indexing.maxFileSize,
1789-
this.config.knowledgeBases
1791+
this.config.knowledgeBases,
1792+
{ maxDepth: this.config.indexing.maxDepth, maxFilesPerDirectory: this.config.indexing.maxFilesPerDirectory }
17901793
);
17911794

17921795
stats.totalFiles = files.length;
@@ -1871,7 +1874,17 @@ export class Indexer {
18711874
}
18721875

18731876
let fileChunkCount = 0;
1874-
for (const chunk of parsed.chunks) {
1877+
let chunksToProcess = parsed.chunks;
1878+
1879+
if (this.config.indexing.fallbackToTextOnMaxChunks && chunksToProcess.length > this.config.indexing.maxChunksPerFile) {
1880+
const changedFile = changedFiles.find(f => f.path === parsed.path);
1881+
if (changedFile) {
1882+
const textChunks = parseFileAsText(parsed.path, changedFile.content);
1883+
chunksToProcess = textChunks;
1884+
}
1885+
}
1886+
1887+
for (const chunk of chunksToProcess) {
18751888
if (fileChunkCount >= this.config.indexing.maxChunksPerFile) {
18761889
break;
18771890
}

src/native/index.ts

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,11 @@ export function parseFile(filePath: string, content: string): CodeChunk[] {
178178
return result.map(mapChunk);
179179
}
180180

181+
export function parseFileAsText(filePath: string, content: string): CodeChunk[] {
182+
const result = native.parseFileAsText(filePath, content);
183+
return result.map(mapChunk);
184+
}
185+
181186
export function parseFiles(files: FileInput[]): ParsedFile[] {
182187
const result = native.parseFiles(files);
183188
return result.map((f: any) => ({

src/utils/files.ts

Lines changed: 50 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -127,17 +127,27 @@ function matchGlob(filePath: string, pattern: string): boolean {
127127
return regex.test(filePath);
128128
}
129129

130+
export interface WalkOptions {
131+
maxDepth: number;
132+
maxFilesPerDirectory: number;
133+
}
134+
130135
export async function* walkDirectory(
131136
dir: string,
132137
projectRoot: string,
133138
includePatterns: string[],
134139
excludePatterns: string[],
135140
ignoreFilter: Ignore,
136141
maxFileSize: number,
137-
skipped: SkippedFile[]
142+
skipped: SkippedFile[],
143+
options: WalkOptions,
144+
currentDepth: number = 0
138145
): AsyncGenerator<{ path: string; size: number }> {
139146
const entries = await fsPromises.readdir(dir, { withFileTypes: true });
140147

148+
const filesInDir: Array<{ path: string; size: number }> = [];
149+
const subdirs: Array<{ fullPath: string; relativePath: string }> = [];
150+
141151
for (const entry of entries) {
142152
const fullPath = path.join(dir, entry.name);
143153
const relativePath = path.relative(projectRoot, fullPath);
@@ -164,15 +174,7 @@ export async function* walkDirectory(
164174
}
165175

166176
if (entry.isDirectory()) {
167-
yield* walkDirectory(
168-
fullPath,
169-
projectRoot,
170-
includePatterns,
171-
excludePatterns,
172-
ignoreFilter,
173-
maxFileSize,
174-
skipped
175-
);
177+
subdirs.push({ fullPath, relativePath });
176178
} else if (entry.isFile()) {
177179
const stat = await fsPromises.stat(fullPath);
178180

@@ -197,19 +199,49 @@ export async function* walkDirectory(
197199
}
198200

199201
if (matched) {
200-
yield { path: fullPath, size: stat.size };
202+
filesInDir.push({ path: fullPath, size: stat.size });
201203
}
202204
}
203205
}
206+
207+
// Sort by size ascending, keep only the smallest maxFilesPerDirectory files
208+
filesInDir.sort((a, b) => a.size - b.size);
209+
const limitedFiles = filesInDir.slice(0, options.maxFilesPerDirectory);
210+
for (const f of limitedFiles) {
211+
yield f;
212+
}
213+
for (let i = options.maxFilesPerDirectory; i < filesInDir.length; i++) {
214+
skipped.push({ path: path.relative(projectRoot, filesInDir[i].path), reason: "excluded" });
215+
}
216+
217+
// Recurse into subdirectories respecting depth limit
218+
const canRecurse = options.maxDepth === -1 || currentDepth < options.maxDepth;
219+
if (canRecurse) {
220+
for (const sub of subdirs) {
221+
yield* walkDirectory(
222+
sub.fullPath,
223+
projectRoot,
224+
includePatterns,
225+
excludePatterns,
226+
ignoreFilter,
227+
maxFileSize,
228+
skipped,
229+
options,
230+
currentDepth + 1
231+
);
232+
}
233+
}
204234
}
205235

206236
export async function collectFiles(
207237
projectRoot: string,
208238
includePatterns: string[],
209239
excludePatterns: string[],
210240
maxFileSize: number,
211-
additionalRoots?: string[]
241+
additionalRoots?: string[],
242+
walkOptions?: WalkOptions
212243
): Promise<CollectFilesResult> {
244+
const opts: WalkOptions = walkOptions ?? { maxDepth: 5, maxFilesPerDirectory: 100 };
213245
const ignoreFilter = createIgnoreFilter(projectRoot);
214246
const files: Array<{ path: string; size: number }> = [];
215247
const skipped: SkippedFile[] = [];
@@ -222,7 +254,9 @@ export async function collectFiles(
222254
excludePatterns,
223255
ignoreFilter,
224256
maxFileSize,
225-
skipped
257+
skipped,
258+
opts,
259+
0
226260
)) {
227261
files.push(file);
228262
}
@@ -253,7 +287,9 @@ export async function collectFiles(
253287
excludePatterns,
254288
kbIgnoreFilter,
255289
maxFileSize,
256-
skipped
290+
skipped,
291+
opts,
292+
0
257293
)) {
258294
files.push(file);
259295
}

0 commit comments

Comments
 (0)