Skip to content

Commit dea2ea8

Browse files
committed
feat: implement auto garbage collection configuration and tests
1 parent 7efbac3 commit dea2ea8

File tree

3 files changed

+287
-1
lines changed

3 files changed

+287
-1
lines changed

README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -219,7 +219,10 @@ Zero-config by default (uses `auto` mode). Customize in `.opencode/codebase-inde
219219
"watchFiles": true,
220220
"maxFileSize": 1048576,
221221
"maxChunksPerFile": 100,
222-
"semanticOnly": false
222+
"semanticOnly": false,
223+
"autoGc": true,
224+
"gcIntervalDays": 7,
225+
"gcOrphanThreshold": 100
223226
},
224227
"search": {
225228
"maxResults": 20,
@@ -244,6 +247,9 @@ Zero-config by default (uses `auto` mode). Customize in `.opencode/codebase-inde
244247
| `semanticOnly` | `false` | When `true`, only index semantic nodes (functions, classes) and skip generic blocks |
245248
| `retries` | `3` | Number of retry attempts for failed embedding API calls |
246249
| `retryDelayMs` | `1000` | Delay between retries in milliseconds |
250+
| `autoGc` | `true` | Automatically run garbage collection to remove orphaned embeddings/chunks |
251+
| `gcIntervalDays` | `7` | Run GC on initialization if the last GC was more than N days ago (values below `1` are raised to `1`) |
252+
| `gcOrphanThreshold` | `100` | Run GC after indexing if the orphan count exceeds this threshold (negative values are treated as `0`) |
247253
| **search** | | |
248254
| `maxResults` | `20` | Maximum results to return |
249255
| `minScore` | `0.1` | Minimum similarity score (0-1). Lower = more results |

tests/auto-gc.test.ts

Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
import { describe, it, expect, beforeEach, afterEach } from "vitest";
2+
import * as fs from "fs";
3+
import * as path from "path";
4+
import * as os from "os";
5+
import { Database } from "../src/native/index.js";
6+
import { parseConfig } from "../src/config/schema.js";
7+
8+
describe("Auto-GC", () => {
9+
// Checks how parseConfig() resolves the three auto-GC indexing options:
// defaults for an empty config, explicit overrides, and the lower bounds
// applied to the two numeric options.
describe("config parsing", () => {
  it("should use default GC values when not specified", () => {
    // An empty user config must resolve to the defaults: true / 7 / 100.
    const { indexing } = parseConfig({});

    expect(indexing.autoGc).toBe(true);
    expect(indexing.gcIntervalDays).toBe(7);
    expect(indexing.gcOrphanThreshold).toBe(100);
  });

  it("should parse custom GC values", () => {
    const userConfig = {
      indexing: { autoGc: false, gcIntervalDays: 14, gcOrphanThreshold: 50 },
    };

    const { indexing } = parseConfig(userConfig);

    // Every explicitly supplied value is passed through unchanged.
    expect(indexing.autoGc).toBe(false);
    expect(indexing.gcIntervalDays).toBe(14);
    expect(indexing.gcOrphanThreshold).toBe(50);
  });

  it("should enforce minimum gcIntervalDays of 1", () => {
    // Zero is below the allowed minimum and is expected to come back as 1.
    const { gcIntervalDays } = parseConfig({ indexing: { gcIntervalDays: 0 } }).indexing;

    expect(gcIntervalDays).toBe(1);
  });

  it("should enforce minimum gcIntervalDays of 1 for negative values", () => {
    const { gcIntervalDays } = parseConfig({ indexing: { gcIntervalDays: -5 } }).indexing;

    expect(gcIntervalDays).toBe(1);
  });

  it("should enforce minimum gcOrphanThreshold of 0", () => {
    // Negative thresholds are expected to come back as 0.
    const { gcOrphanThreshold } = parseConfig({ indexing: { gcOrphanThreshold: -10 } }).indexing;

    expect(gcOrphanThreshold).toBe(0);
  });

  it("should allow gcOrphanThreshold of 0", () => {
    // Zero itself is a legal value and must be preserved as-is.
    const { gcOrphanThreshold } = parseConfig({ indexing: { gcOrphanThreshold: 0 } }).indexing;

    expect(gcOrphanThreshold).toBe(0);
  });
});
72+
73+
// Exercises Database.getMetadata()/setMetadata() with the "lastGcTimestamp" key:
// unset key, round-trip of a value, and overwrite of an existing value.
describe("GC timestamp tracking", () => {
  const GC_KEY = "lastGcTimestamp";

  let scratchDir: string;
  let store: Database;

  beforeEach(() => {
    // Every test works on its own database file inside a fresh temp directory.
    scratchDir = fs.mkdtempSync(path.join(os.tmpdir(), "gc-test-"));
    const dbFile = path.join(scratchDir, "test.db");
    store = new Database(dbFile);
  });

  afterEach(() => {
    // NOTE(review): the Database handle is not explicitly released before the
    // directory is deleted. Confirm whether the native binding needs a
    // close/dispose step here (no such API is visible in this file).
    fs.rmSync(scratchDir, { recursive: true, force: true });
  });

  it("should return null for lastGcTimestamp when never set", () => {
    const stored = store.getMetadata(GC_KEY);

    expect(stored).toBeNull();
  });

  it("should store and retrieve lastGcTimestamp", () => {
    // Metadata values are strings, so the epoch-millisecond value is stringified.
    const stamp = String(Date.now());

    store.setMetadata(GC_KEY, stamp);

    expect(store.getMetadata(GC_KEY)).toBe(stamp);
  });

  it("should update lastGcTimestamp on subsequent sets", () => {
    const older = "1000000000000";
    const newer = "2000000000000";

    // Writing the same key twice must leave only the most recent value.
    for (const value of [older, newer]) {
      store.setMetadata(GC_KEY, value);
    }

    expect(store.getMetadata(GC_KEY)).toBe(newer);
  });
});
107+
108+
// Pins the "is interval-based GC due?" rule.
// NOTE(review): the rule is re-implemented locally instead of being imported,
// so these tests only document the intended decision. Confirm it matches the
// production code path, or import that function here instead.
describe("GC interval logic", () => {
  const MS_PER_DAY = 24 * 60 * 60 * 1000;

  /**
   * Decides whether interval-based GC should run.
   *
   * @param lastGcTimestamp - Stored timestamp string (epoch milliseconds), or
   *   `null` when none is stored. Any falsy value, including `""`, is treated
   *   as "never ran".
   * @param gcIntervalDays - Interval between GC runs, in days.
   * @param now - Current time in epoch milliseconds.
   * @returns `true` when no timestamp is stored, or when the stored timestamp
   *   parses as an integer and is strictly older than the interval; `false`
   *   otherwise, including when the stored value cannot be parsed.
   */
  const isIntervalGcDue = (
    lastGcTimestamp: string | null,
    gcIntervalDays: number,
    now: number,
  ): boolean => {
    if (!lastGcTimestamp) {
      return true;
    }

    const lastGcTime = parseInt(lastGcTimestamp, 10);
    // An unparseable value yields NaN and never triggers GC.
    return !Number.isNaN(lastGcTime) && now - lastGcTime > gcIntervalDays * MS_PER_DAY;
  };

  it("should trigger GC when lastGcTimestamp is null", () => {
    expect(isIntervalGcDue(null, 7, Date.now())).toBe(true);
  });

  it("should trigger GC when interval has elapsed", () => {
    const now = Date.now();
    const eightDaysAgo = now - 8 * MS_PER_DAY;

    expect(isIntervalGcDue(eightDaysAgo.toString(), 7, now)).toBe(true);
  });

  it("should not trigger GC when interval has not elapsed", () => {
    const now = Date.now();
    const threeDaysAgo = now - 3 * MS_PER_DAY;

    expect(isIntervalGcDue(threeDaysAgo.toString(), 7, now)).toBe(false);
  });

  it("should handle invalid timestamp gracefully", () => {
    // A corrupt stored value must not throw; under this rule it does not trigger GC.
    expect(isIntervalGcDue("invalid_timestamp", 7, Date.now())).toBe(false);
  });
});
187+
188+
// Covers the orphan-count threshold rule against a real on-disk Database:
// orphan count derived from getStats(), GC triggered above the threshold,
// and GC skipped at or below it.
describe("orphan threshold logic", () => {
  let tempDir: string;
  let db: Database;

  beforeEach(() => {
    // Each test gets its own database file in a fresh temp directory.
    tempDir = fs.mkdtempSync(path.join(os.tmpdir(), "gc-orphan-test-"));
    db = new Database(path.join(tempDir, "test.db"));
  });

  afterEach(() => {
    // NOTE(review): the Database handle is not explicitly released before the
    // directory is deleted. Confirm whether the native binding needs a
    // close/dispose step here (no such API is visible in this file).
    fs.rmSync(tempDir, { recursive: true, force: true });
  });

  /** Inserts `count` embeddings keyed "hash1".."hashN" with texts "text1".."textN". */
  const seedEmbeddings = (count: number): void => {
    const embedding = Buffer.from(new Float32Array([1.0, 2.0]).buffer);
    for (let i = 1; i <= count; i++) {
      db.upsertEmbedding(`hash${i}`, embedding, `text${i}`, "model");
    }
  };

  /** Registers a single chunk that references "hash1" and attaches it to branch "main". */
  const linkFirstEmbeddingToMain = (): void => {
    db.upsertChunk({
      chunkId: "chunk1",
      contentHash: "hash1",
      filePath: "/file.ts",
      startLine: 1,
      endLine: 5,
      language: "typescript",
    });
    db.addChunksToBranch("main", ["chunk1"]);
  };

  /**
   * Returns `embeddingCount - chunkCount` from the database stats.
   * Throws with a descriptive message when no stats are available, instead of
   * relying on a non-null assertion.
   * NOTE(review): this difference presumably equals the real orphan count only
   * while each chunk references a distinct embedding; verify against the indexer.
   */
  const orphanCount = (): number => {
    const stats = db.getStats();
    if (stats == null) {
      throw new Error("getStats() returned no stats for a populated database");
    }
    return stats.embeddingCount - stats.chunkCount;
  };

  it("should calculate orphan count from stats", () => {
    seedEmbeddings(3);
    linkFirstEmbeddingToMain();

    // Three embeddings, one referenced by a chunk: two are orphaned.
    expect(orphanCount()).toBe(2);
  });

  it("should trigger GC when orphan count exceeds threshold", () => {
    const gcOrphanThreshold = 1;
    seedEmbeddings(3);
    linkFirstEmbeddingToMain();

    const shouldRunGc = orphanCount() > gcOrphanThreshold;
    expect(shouldRunGc).toBe(true);

    // GC must remove exactly the two unreferenced embeddings.
    expect(db.gcOrphanEmbeddings()).toBe(2);
  });

  it("should not trigger GC when orphan count is below threshold", () => {
    const gcOrphanThreshold = 100;
    seedEmbeddings(1);
    linkFirstEmbeddingToMain();

    const shouldRunGc = orphanCount() > gcOrphanThreshold;
    expect(shouldRunGc).toBe(false);
  });
});
277+
});

tests/watcher.test.ts

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,9 @@ const createTestConfig = (overrides: Partial<ParsedCodebaseIndexConfig> = {}): P
1919
semanticOnly: false,
2020
retries: 3,
2121
retryDelayMs: 1000,
22+
autoGc: true,
23+
gcIntervalDays: 7,
24+
gcOrphanThreshold: 100,
2225
},
2326
search: {
2427
maxResults: 20,

0 commit comments

Comments
 (0)