Skip to content

Commit 54e702c

Browse files
committed
fix: correct fieldCount accounting and add reverse lookup in inverted index
Two related fixes in the inverted index:

1. fieldCount bug: `fieldCount` is incremented once per field during build (each `addField` call = one field), but `removeFromInvertedIndex` decremented it once per posting removed. A field with 5 unique terms would therefore decrement `fieldCount` by 5 instead of 1, skewing IDF calculations after any removal. Fixed by collecting the distinct (keyIdx, subIdx) pairs across all removed postings and decrementing by that count.

2. Reverse doc→terms map: `removeFromInvertedIndex` iterated over every term in the vocabulary to find postings belonging to the removed doc. Added `docTerms: Map<number, Set<string>>`, which is populated during build and add, and deleted during remove. Removal now visits only the posting lists of terms that belong to the document: O(terms_in_doc) posting lists instead of O(vocabulary_size). Each visited posting list is still filtered in full.
1 parent e550ab1 commit 54e702c

13 files changed

Lines changed: 312 additions & 29 deletions

dist/fuse.basic.cjs

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1023,6 +1023,7 @@ function createAnalyzer({
10231023
function buildInvertedIndex(records, keyCount, analyzer) {
10241024
const terms = new Map();
10251025
const df = new Map();
1026+
const docTerms = new Map();
10261027
let fieldCount = 0;
10271028
function addField(text, docIdx, keyIdx, subIdx) {
10281029
const tokens = analyzer.tokenize(text);
@@ -1035,6 +1036,13 @@ function buildInvertedIndex(records, keyCount, analyzer) {
10351036
termFreqs.set(token, (termFreqs.get(token) || 0) + 1);
10361037
}
10371038

1039+
// Track which terms belong to this doc for fast removal
1040+
let docTermSet = docTerms.get(docIdx);
1041+
if (!docTermSet) {
1042+
docTermSet = new Set();
1043+
docTerms.set(docIdx, docTermSet);
1044+
}
1045+
10381046
// Track which terms we've already counted for df in this field
10391047
for (const [term, tf] of termFreqs) {
10401048
const posting = {
@@ -1049,6 +1057,7 @@ function buildInvertedIndex(records, keyCount, analyzer) {
10491057
terms.set(term, postings);
10501058
}
10511059
postings.push(posting);
1060+
docTermSet.add(term);
10521061
df.set(term, (df.get(term) || 0) + 1);
10531062
}
10541063
}
@@ -1083,7 +1092,8 @@ function buildInvertedIndex(records, keyCount, analyzer) {
10831092
return {
10841093
terms,
10851094
fieldCount,
1086-
df
1095+
df,
1096+
docTerms
10871097
};
10881098
}
10891099
function addToInvertedIndex(index, record, keyCount, analyzer) {
@@ -1092,6 +1102,11 @@ function addToInvertedIndex(index, record, keyCount, analyzer) {
10921102
v,
10931103
$: fields
10941104
} = record;
1105+
let docTermSet = index.docTerms.get(docIdx);
1106+
if (!docTermSet) {
1107+
docTermSet = new Set();
1108+
index.docTerms.set(docIdx, docTermSet);
1109+
}
10951110
function addField(text, keyIdx, subIdx) {
10961111
const tokens = analyzer.tokenize(text);
10971112
if (!tokens.length) return;
@@ -1113,6 +1128,7 @@ function addToInvertedIndex(index, record, keyCount, analyzer) {
11131128
index.terms.set(term, postings);
11141129
}
11151130
postings.push(posting);
1131+
docTermSet.add(term);
11161132
index.df.set(term, (index.df.get(term) || 0) + 1);
11171133
}
11181134
}
@@ -1135,11 +1151,21 @@ function addToInvertedIndex(index, record, keyCount, analyzer) {
11351151
}
11361152
}
11371153
function removeFromInvertedIndex(index, docIdx) {
1138-
for (const [term, postings] of index.terms) {
1139-
const filtered = postings.filter(p => p.docIdx !== docIdx);
1154+
const docTermSet = index.docTerms.get(docIdx);
1155+
if (!docTermSet) return;
1156+
1157+
// Count distinct fields this doc contributed (for fieldCount adjustment)
1158+
const docFields = new Set();
1159+
for (const term of docTermSet) {
1160+
const postings = index.terms.get(term);
1161+
if (!postings) continue;
1162+
const filtered = postings.filter(p => {
1163+
if (p.docIdx !== docIdx) return true;
1164+
docFields.add(`${p.keyIdx}:${p.subIdx}`);
1165+
return false;
1166+
});
11401167
const removed = postings.length - filtered.length;
11411168
if (removed > 0) {
1142-
index.fieldCount -= removed;
11431169
index.df.set(term, (index.df.get(term) || 0) - removed);
11441170
if (filtered.length === 0) {
11451171
index.terms.delete(term);
@@ -1149,6 +1175,8 @@ function removeFromInvertedIndex(index, docIdx) {
11491175
}
11501176
}
11511177
}
1178+
index.fieldCount -= docFields.size;
1179+
index.docTerms.delete(docIdx);
11521180
}
11531181

11541182
class Fuse {

dist/fuse.basic.min.cjs

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

dist/fuse.basic.min.mjs

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

dist/fuse.basic.mjs

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,7 @@ function createAnalyzer({
10211021
function buildInvertedIndex(records, keyCount, analyzer) {
10221022
const terms = new Map();
10231023
const df = new Map();
1024+
const docTerms = new Map();
10241025
let fieldCount = 0;
10251026
function addField(text, docIdx, keyIdx, subIdx) {
10261027
const tokens = analyzer.tokenize(text);
@@ -1033,6 +1034,13 @@ function buildInvertedIndex(records, keyCount, analyzer) {
10331034
termFreqs.set(token, (termFreqs.get(token) || 0) + 1);
10341035
}
10351036

1037+
// Track which terms belong to this doc for fast removal
1038+
let docTermSet = docTerms.get(docIdx);
1039+
if (!docTermSet) {
1040+
docTermSet = new Set();
1041+
docTerms.set(docIdx, docTermSet);
1042+
}
1043+
10361044
// Track which terms we've already counted for df in this field
10371045
for (const [term, tf] of termFreqs) {
10381046
const posting = {
@@ -1047,6 +1055,7 @@ function buildInvertedIndex(records, keyCount, analyzer) {
10471055
terms.set(term, postings);
10481056
}
10491057
postings.push(posting);
1058+
docTermSet.add(term);
10501059
df.set(term, (df.get(term) || 0) + 1);
10511060
}
10521061
}
@@ -1081,7 +1090,8 @@ function buildInvertedIndex(records, keyCount, analyzer) {
10811090
return {
10821091
terms,
10831092
fieldCount,
1084-
df
1093+
df,
1094+
docTerms
10851095
};
10861096
}
10871097
function addToInvertedIndex(index, record, keyCount, analyzer) {
@@ -1090,6 +1100,11 @@ function addToInvertedIndex(index, record, keyCount, analyzer) {
10901100
v,
10911101
$: fields
10921102
} = record;
1103+
let docTermSet = index.docTerms.get(docIdx);
1104+
if (!docTermSet) {
1105+
docTermSet = new Set();
1106+
index.docTerms.set(docIdx, docTermSet);
1107+
}
10931108
function addField(text, keyIdx, subIdx) {
10941109
const tokens = analyzer.tokenize(text);
10951110
if (!tokens.length) return;
@@ -1111,6 +1126,7 @@ function addToInvertedIndex(index, record, keyCount, analyzer) {
11111126
index.terms.set(term, postings);
11121127
}
11131128
postings.push(posting);
1129+
docTermSet.add(term);
11141130
index.df.set(term, (index.df.get(term) || 0) + 1);
11151131
}
11161132
}
@@ -1133,11 +1149,21 @@ function addToInvertedIndex(index, record, keyCount, analyzer) {
11331149
}
11341150
}
11351151
function removeFromInvertedIndex(index, docIdx) {
1136-
for (const [term, postings] of index.terms) {
1137-
const filtered = postings.filter(p => p.docIdx !== docIdx);
1152+
const docTermSet = index.docTerms.get(docIdx);
1153+
if (!docTermSet) return;
1154+
1155+
// Count distinct fields this doc contributed (for fieldCount adjustment)
1156+
const docFields = new Set();
1157+
for (const term of docTermSet) {
1158+
const postings = index.terms.get(term);
1159+
if (!postings) continue;
1160+
const filtered = postings.filter(p => {
1161+
if (p.docIdx !== docIdx) return true;
1162+
docFields.add(`${p.keyIdx}:${p.subIdx}`);
1163+
return false;
1164+
});
11381165
const removed = postings.length - filtered.length;
11391166
if (removed > 0) {
1140-
index.fieldCount -= removed;
11411167
index.df.set(term, (index.df.get(term) || 0) - removed);
11421168
if (filtered.length === 0) {
11431169
index.terms.delete(term);
@@ -1147,6 +1173,8 @@ function removeFromInvertedIndex(index, docIdx) {
11471173
}
11481174
}
11491175
}
1176+
index.fieldCount -= docFields.size;
1177+
index.docTerms.delete(docIdx);
11501178
}
11511179

11521180
class Fuse {

dist/fuse.cjs

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1418,6 +1418,7 @@ function createAnalyzer({
14181418
function buildInvertedIndex(records, keyCount, analyzer) {
14191419
const terms = new Map();
14201420
const df = new Map();
1421+
const docTerms = new Map();
14211422
let fieldCount = 0;
14221423
function addField(text, docIdx, keyIdx, subIdx) {
14231424
const tokens = analyzer.tokenize(text);
@@ -1430,6 +1431,13 @@ function buildInvertedIndex(records, keyCount, analyzer) {
14301431
termFreqs.set(token, (termFreqs.get(token) || 0) + 1);
14311432
}
14321433

1434+
// Track which terms belong to this doc for fast removal
1435+
let docTermSet = docTerms.get(docIdx);
1436+
if (!docTermSet) {
1437+
docTermSet = new Set();
1438+
docTerms.set(docIdx, docTermSet);
1439+
}
1440+
14331441
// Track which terms we've already counted for df in this field
14341442
for (const [term, tf] of termFreqs) {
14351443
const posting = {
@@ -1444,6 +1452,7 @@ function buildInvertedIndex(records, keyCount, analyzer) {
14441452
terms.set(term, postings);
14451453
}
14461454
postings.push(posting);
1455+
docTermSet.add(term);
14471456
df.set(term, (df.get(term) || 0) + 1);
14481457
}
14491458
}
@@ -1478,7 +1487,8 @@ function buildInvertedIndex(records, keyCount, analyzer) {
14781487
return {
14791488
terms,
14801489
fieldCount,
1481-
df
1490+
df,
1491+
docTerms
14821492
};
14831493
}
14841494
function addToInvertedIndex(index, record, keyCount, analyzer) {
@@ -1487,6 +1497,11 @@ function addToInvertedIndex(index, record, keyCount, analyzer) {
14871497
v,
14881498
$: fields
14891499
} = record;
1500+
let docTermSet = index.docTerms.get(docIdx);
1501+
if (!docTermSet) {
1502+
docTermSet = new Set();
1503+
index.docTerms.set(docIdx, docTermSet);
1504+
}
14901505
function addField(text, keyIdx, subIdx) {
14911506
const tokens = analyzer.tokenize(text);
14921507
if (!tokens.length) return;
@@ -1508,6 +1523,7 @@ function addToInvertedIndex(index, record, keyCount, analyzer) {
15081523
index.terms.set(term, postings);
15091524
}
15101525
postings.push(posting);
1526+
docTermSet.add(term);
15111527
index.df.set(term, (index.df.get(term) || 0) + 1);
15121528
}
15131529
}
@@ -1530,11 +1546,21 @@ function addToInvertedIndex(index, record, keyCount, analyzer) {
15301546
}
15311547
}
15321548
function removeFromInvertedIndex(index, docIdx) {
1533-
for (const [term, postings] of index.terms) {
1534-
const filtered = postings.filter(p => p.docIdx !== docIdx);
1549+
const docTermSet = index.docTerms.get(docIdx);
1550+
if (!docTermSet) return;
1551+
1552+
// Count distinct fields this doc contributed (for fieldCount adjustment)
1553+
const docFields = new Set();
1554+
for (const term of docTermSet) {
1555+
const postings = index.terms.get(term);
1556+
if (!postings) continue;
1557+
const filtered = postings.filter(p => {
1558+
if (p.docIdx !== docIdx) return true;
1559+
docFields.add(`${p.keyIdx}:${p.subIdx}`);
1560+
return false;
1561+
});
15351562
const removed = postings.length - filtered.length;
15361563
if (removed > 0) {
1537-
index.fieldCount -= removed;
15381564
index.df.set(term, (index.df.get(term) || 0) - removed);
15391565
if (filtered.length === 0) {
15401566
index.terms.delete(term);
@@ -1544,6 +1570,8 @@ function removeFromInvertedIndex(index, docIdx) {
15441570
}
15451571
}
15461572
}
1573+
index.fieldCount -= docFields.size;
1574+
index.docTerms.delete(docIdx);
15471575
}
15481576

15491577
class Fuse {

dist/fuse.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,7 @@ interface InvertedIndexData {
251251
terms: Map<string, Posting[]>;
252252
fieldCount: number;
253253
df: Map<string, number>;
254+
docTerms: Map<number, Set<string>>;
254255
}
255256

256257
interface HeapSearchOptions {

dist/fuse.min.cjs

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

dist/fuse.min.mjs

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

dist/fuse.mjs

Lines changed: 32 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1416,6 +1416,7 @@ function createAnalyzer({
14161416
function buildInvertedIndex(records, keyCount, analyzer) {
14171417
const terms = new Map();
14181418
const df = new Map();
1419+
const docTerms = new Map();
14191420
let fieldCount = 0;
14201421
function addField(text, docIdx, keyIdx, subIdx) {
14211422
const tokens = analyzer.tokenize(text);
@@ -1428,6 +1429,13 @@ function buildInvertedIndex(records, keyCount, analyzer) {
14281429
termFreqs.set(token, (termFreqs.get(token) || 0) + 1);
14291430
}
14301431

1432+
// Track which terms belong to this doc for fast removal
1433+
let docTermSet = docTerms.get(docIdx);
1434+
if (!docTermSet) {
1435+
docTermSet = new Set();
1436+
docTerms.set(docIdx, docTermSet);
1437+
}
1438+
14311439
// Track which terms we've already counted for df in this field
14321440
for (const [term, tf] of termFreqs) {
14331441
const posting = {
@@ -1442,6 +1450,7 @@ function buildInvertedIndex(records, keyCount, analyzer) {
14421450
terms.set(term, postings);
14431451
}
14441452
postings.push(posting);
1453+
docTermSet.add(term);
14451454
df.set(term, (df.get(term) || 0) + 1);
14461455
}
14471456
}
@@ -1476,7 +1485,8 @@ function buildInvertedIndex(records, keyCount, analyzer) {
14761485
return {
14771486
terms,
14781487
fieldCount,
1479-
df
1488+
df,
1489+
docTerms
14801490
};
14811491
}
14821492
function addToInvertedIndex(index, record, keyCount, analyzer) {
@@ -1485,6 +1495,11 @@ function addToInvertedIndex(index, record, keyCount, analyzer) {
14851495
v,
14861496
$: fields
14871497
} = record;
1498+
let docTermSet = index.docTerms.get(docIdx);
1499+
if (!docTermSet) {
1500+
docTermSet = new Set();
1501+
index.docTerms.set(docIdx, docTermSet);
1502+
}
14881503
function addField(text, keyIdx, subIdx) {
14891504
const tokens = analyzer.tokenize(text);
14901505
if (!tokens.length) return;
@@ -1506,6 +1521,7 @@ function addToInvertedIndex(index, record, keyCount, analyzer) {
15061521
index.terms.set(term, postings);
15071522
}
15081523
postings.push(posting);
1524+
docTermSet.add(term);
15091525
index.df.set(term, (index.df.get(term) || 0) + 1);
15101526
}
15111527
}
@@ -1528,11 +1544,21 @@ function addToInvertedIndex(index, record, keyCount, analyzer) {
15281544
}
15291545
}
15301546
function removeFromInvertedIndex(index, docIdx) {
1531-
for (const [term, postings] of index.terms) {
1532-
const filtered = postings.filter(p => p.docIdx !== docIdx);
1547+
const docTermSet = index.docTerms.get(docIdx);
1548+
if (!docTermSet) return;
1549+
1550+
// Count distinct fields this doc contributed (for fieldCount adjustment)
1551+
const docFields = new Set();
1552+
for (const term of docTermSet) {
1553+
const postings = index.terms.get(term);
1554+
if (!postings) continue;
1555+
const filtered = postings.filter(p => {
1556+
if (p.docIdx !== docIdx) return true;
1557+
docFields.add(`${p.keyIdx}:${p.subIdx}`);
1558+
return false;
1559+
});
15331560
const removed = postings.length - filtered.length;
15341561
if (removed > 0) {
1535-
index.fieldCount -= removed;
15361562
index.df.set(term, (index.df.get(term) || 0) - removed);
15371563
if (filtered.length === 0) {
15381564
index.terms.delete(term);
@@ -1542,6 +1568,8 @@ function removeFromInvertedIndex(index, docIdx) {
15421568
}
15431569
}
15441570
}
1571+
index.fieldCount -= docFields.size;
1572+
index.docTerms.delete(docIdx);
15451573
}
15461574

15471575
class Fuse {

dist/fuse.worker.min.mjs

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

0 commit comments

Comments
 (0)