Skip to content

Commit ea9356d

Browse files
committed
fix(token-search): renumber inverted index after doc removal
FuseIndex.removeAll renumbers surviving records contiguously, but the inverted index only deleted the entries for the removed docIdx values — postings and docTerms keys for docs past the removal point kept their stale indices. Later removes or token scoring could then operate on the wrong docs. Adds removeAndShiftInvertedIndex, which removes the given docIdx entries and then renumbers the surviving postings' docIdx values and the docTerms keys in a single pass, using a binary search to compute each shift. Wired into Fuse.remove and Fuse.removeAt; removeFromInvertedIndex is kept for direct use in tests. Tests cover a single remove with shift, a non-contiguous batch remove, an add after a shift, and a full-collection remove.
1 parent e4217f9 commit ea9356d

3 files changed

Lines changed: 124 additions & 6 deletions

File tree

src/core/index.ts

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ import { createAnalyzer } from '../search/token/analyzer'
1212
import {
1313
buildInvertedIndex,
1414
addToInvertedIndex,
15-
removeFromInvertedIndex
15+
removeAndShiftInvertedIndex
1616
} from '../search/token/InvertedIndex'
1717
import type { InvertedIndexData } from '../search/token/InvertedIndex'
1818
import type {
@@ -160,9 +160,7 @@ export default class Fuse<T> {
160160

161161
if (indicesToRemove.length) {
162162
if (this._invertedIndex) {
163-
for (const idx of indicesToRemove) {
164-
removeFromInvertedIndex(this._invertedIndex, idx)
165-
}
163+
removeAndShiftInvertedIndex(this._invertedIndex, indicesToRemove)
166164
}
167165

168166
// Filter docs in a single pass instead of reverse-splicing
@@ -176,7 +174,7 @@ export default class Fuse<T> {
176174

177175
removeAt(idx: number): T {
178176
if (this._invertedIndex) {
179-
removeFromInvertedIndex(this._invertedIndex, idx)
177+
removeAndShiftInvertedIndex(this._invertedIndex, [idx])
180178
}
181179
const doc = this._docs.splice(idx, 1)[0]
182180
this._myIndex.removeAt(idx)

src/search/token/InvertedIndex.ts

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -186,3 +186,48 @@ export function removeFromInvertedIndex(
186186
index.fieldCount -= docFields.size
187187
index.docTerms.delete(docIdx)
188188
}
189+
190+
// Removes the given docIdx entries and renumbers remaining postings/docTerms
191+
// so that they stay in sync with FuseIndex's contiguous renumbering on remove.
//
// index: inverted index to update in place. Surviving postings have their
//   docIdx rewritten, and index.docTerms is replaced by a re-keyed Map.
// removedIndices: document positions as they were BEFORE the removal. May be
//   unsorted and may contain duplicates; the array itself is not modified.
//   An empty array is a no-op.
//
// NOTE(review): removedIndices is not validated here. A negative value would
// make firstRemoved negative, so every posting and docTerms key would be
// shifted — confirm callers only pass valid positions.
192+
export function removeAndShiftInvertedIndex(
193+
index: InvertedIndexData,
194+
removedIndices: number[]
195+
): void {
196+
if (removedIndices.length === 0) return
197+
198+
// De-dup and sort so the shift computation is O(log k) per lookup.
// (k = number of distinct removed indices.) Array.from(new Set(...)) yields a
// fresh array, so sorting it leaves the caller's removedIndices untouched.
199+
const sorted = Array.from(new Set(removedIndices)).sort((a, b) => a - b)
200+
201+
// Per-doc removal is delegated to removeFromInvertedIndex; only the
// renumbering of the surviving entries is handled below.
for (const idx of sorted) {
202+
removeFromInvertedIndex(index, idx)
203+
}
204+
205+
// For any surviving oldIdx, its new idx is oldIdx minus the number of
206+
// removed indices strictly less than oldIdx.
// Implemented as a lower-bound binary search over `sorted`: when the loop
// exits, lo is the count of entries in `sorted` that are < oldIdx.
207+
const shift = (oldIdx: number): number => {
208+
let lo = 0
209+
let hi = sorted.length
210+
while (lo < hi) {
211+
const mid = (lo + hi) >>> 1
212+
if (sorted[mid] < oldIdx) lo = mid + 1
213+
else hi = mid
214+
}
215+
return oldIdx - lo
216+
}
217+
218+
// Smallest removed index. Entries at or below it have no removed index
// beneath them, so their position is unchanged and the search is skipped.
const firstRemoved = sorted[0]
219+
220+
// Renumber postings in place across every term.
for (const postings of index.terms.values()) {
221+
for (const p of postings) {
222+
if (p.docIdx > firstRemoved) {
223+
p.docIdx = shift(p.docIdx)
224+
}
225+
}
226+
}
227+
228+
// docTerms is keyed by docIdx, so it is rebuilt under the new keys. The
// existing Set values are carried over as-is rather than copied.
const shiftedDocTerms = new Map<number, Set<string>>()
229+
for (const [oldKey, terms] of index.docTerms) {
230+
shiftedDocTerms.set(oldKey > firstRemoved ? shift(oldKey) : oldKey, terms)
231+
}
232+
index.docTerms = shiftedDocTerms
233+
}

test/internals.test.ts

Lines changed: 76 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@ import norm from '../src/tools/fieldNorm'
22
import {
33
buildInvertedIndex,
44
addToInvertedIndex,
5-
removeFromInvertedIndex
5+
removeFromInvertedIndex,
6+
removeAndShiftInvertedIndex
67
} from '../src/search/token/InvertedIndex'
78
import { createAnalyzer } from '../src/search/token/analyzer'
89
import type { IndexRecord } from '../src/types'
@@ -135,4 +136,78 @@ describe('InvertedIndex', () => {
135136
removeFromInvertedIndex(index, 0)
136137
expect(index.docTerms.has(0)).toBe(false)
137138
})
139+
140+
test('removeAndShiftInvertedIndex renumbers surviving postings', () => {
141+
const records = makeRecords()
142+
const index = buildInvertedIndex(records, 2, analyzer)
143+
144+
// Remove doc 0 — docs 1, 2 must shift down to 0, 1
145+
removeAndShiftInvertedIndex(index, [0])
146+
147+
expect(index.docTerms.has(0)).toBe(true)
148+
expect(index.docTerms.has(1)).toBe(true)
149+
expect(index.docTerms.has(2)).toBe(false)
150+
expect(index.docTerms.get(0)).toEqual(new Set(['hello', 'there']))
151+
expect(index.docTerms.get(1)).toEqual(new Set(['goodbye', 'cruel', 'world']))
152+
153+
const helloPostings = index.terms.get('hello')!
154+
expect(helloPostings.length).toBe(1)
155+
expect(helloPostings[0].docIdx).toBe(0)
156+
157+
const worldPostings = index.terms.get('world')!
158+
expect(worldPostings.length).toBe(1)
159+
expect(worldPostings[0].docIdx).toBe(1)
160+
})
161+
162+
test('batch remove + shift handles non-contiguous indices', () => {
163+
const records: IndexRecord[] = [
164+
{ i: 0, $: { 0: { v: 'alpha', n: 1 } } },
165+
{ i: 1, $: { 0: { v: 'beta', n: 1 } } },
166+
{ i: 2, $: { 0: { v: 'gamma', n: 1 } } },
167+
{ i: 3, $: { 0: { v: 'delta', n: 1 } } },
168+
{ i: 4, $: { 0: { v: 'epsilon', n: 1 } } }
169+
]
170+
const index = buildInvertedIndex(records, 1, analyzer)
171+
172+
// Remove docs 1 and 3 — remaining (0, 2, 4) become (0, 1, 2)
173+
removeAndShiftInvertedIndex(index, [1, 3])
174+
175+
expect(index.docTerms.get(0)).toEqual(new Set(['alpha']))
176+
expect(index.docTerms.get(1)).toEqual(new Set(['gamma']))
177+
expect(index.docTerms.get(2)).toEqual(new Set(['epsilon']))
178+
expect(index.docTerms.has(3)).toBe(false)
179+
expect(index.docTerms.has(4)).toBe(false)
180+
181+
expect(index.terms.get('gamma')![0].docIdx).toBe(1)
182+
expect(index.terms.get('epsilon')![0].docIdx).toBe(2)
183+
})
184+
185+
test('removing every doc leaves an empty index', () => {
186+
const records = makeRecords()
187+
const index = buildInvertedIndex(records, 2, analyzer)
188+
189+
removeAndShiftInvertedIndex(index, [0, 1, 2])
190+
191+
expect(index.fieldCount).toBe(0)
192+
expect(index.docTerms.size).toBe(0)
193+
expect(index.terms.size).toBe(0)
194+
expect(index.df.size).toBe(0)
195+
})
196+
197+
test('subsequent add after shift uses the new contiguous index', () => {
198+
const records = makeRecords()
199+
const index = buildInvertedIndex(records, 2, analyzer)
200+
201+
removeAndShiftInvertedIndex(index, [0])
202+
203+
// After shift: 2 surviving docs at indices 0, 1. A new add gets index 2.
204+
const newRecord: IndexRecord = {
205+
i: 2,
206+
$: { 0: { v: 'fresh doc', n: 1 } }
207+
}
208+
addToInvertedIndex(index, newRecord, 2, analyzer)
209+
210+
expect(index.docTerms.get(2)).toEqual(new Set(['fresh', 'doc']))
211+
expect(index.terms.get('fresh')![0].docIdx).toBe(2)
212+
})
138213
})

0 commit comments

Comments
 (0)