Skip to content

Commit 0f01a75

Browse files
xiangfu0 and claude committed
feat: add Apache Pinot vector search client
Adds a complete Apache Pinot client for VectorDBBench.

- Index types: HNSW (Lucene), IVF_FLAT, IVF_PQ, IVF_ON_DISK
- Metrics: L2, IP, COSINE
- Filters: NumGE, StrEqual
- Optional dep: pip install "vectordb-bench[pinot]"

Parallel loading: thread_safe=True — each worker thread maintains its own row buffer and flushes to Pinot via a fresh HTTP session. Since Pinot's ingestFromFile is synchronous (blocks until HNSW index is built, ~6 min per 100K×768D segment), concurrent flushes across threads reduce load time significantly vs sequential flushing.

Benchmark results:

Small dataset (OpenAI 50K, 768D, L2):
- HNSW: 798 QPS, recall=1.000
- IVF_FLAT: 800 QPS, recall=1.000
- IVF_PQ: 795 QPS, recall=1.000
- IVF_ON_DISK: 691 QPS, recall=1.000

Large dataset (Cohere 1M, 768D, COSINE):
- HNSW m=16: 74 QPS, recall=0.982

Filter benchmark (Cohere 1M, COSINE, HNSW m=32):
- 1% NumGE: 71 QPS, recall=0.977
- 99% NumGE: 97 QPS, recall=0.649

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 77d76ab commit 0f01a75

File tree

8 files changed

+844
-0
lines changed

8 files changed

+844
-0
lines changed

pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ turbopuffer = [ "turbopuffer" ]
8282
zvec = [ "zvec" ]
8383
endee = [ "endee==0.1.10" ]
8484
lindorm = [ "opensearch-py" ]
85+
pinot = [ "requests" ]
8586

8687
[project.urls]
8788
Repository = "https://github.com/zilliztech/VectorDBBench"

vectordb_bench/backend/clients/__init__.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ class DB(Enum):
6161
Lindorm = "Lindorm"
6262
VectorChord = "VectorChord"
6363
PolarDB = "PolarDB"
64+
Pinot = "Pinot"
6465

6566
@property
6667
def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901, PLR0915
@@ -257,6 +258,11 @@ def init_cls(self) -> type[VectorDB]: # noqa: PLR0911, PLR0912, C901, PLR0915
257258

258259
return PolarDB
259260

261+
if self == DB.Pinot:
262+
from .pinot.pinot import Pinot
263+
264+
return Pinot
265+
260266
msg = f"Unknown DB: {self.name}"
261267
raise ValueError(msg)
262268

@@ -455,6 +461,11 @@ def config_cls(self) -> type[DBConfig]: # noqa: PLR0911, PLR0912, C901, PLR0915
455461

456462
return PolarDBConfig
457463

464+
if self == DB.Pinot:
465+
from .pinot.config import PinotConfig
466+
467+
return PinotConfig
468+
458469
msg = f"Unknown DB: {self.name}"
459470
raise ValueError(msg)
460471

@@ -631,6 +642,16 @@ def case_config_cls( # noqa: C901, PLR0911, PLR0912, PLR0915
631642

632643
return _vectorchord_case_config.get(index_type)
633644

645+
if self == DB.Pinot:
646+
from .pinot.config import PinotHNSWConfig, PinotIVFFlatConfig, PinotIVFOnDiskConfig, PinotIVFPQConfig
647+
648+
return {
649+
IndexType.HNSW: PinotHNSWConfig,
650+
IndexType.IVFFlat: PinotIVFFlatConfig,
651+
IndexType.IVFPQ: PinotIVFPQConfig,
652+
IndexType.IVFOnDisk: PinotIVFOnDiskConfig,
653+
}.get(index_type, PinotHNSWConfig)
654+
634655
# DB.Pinecone, DB.Redis
635656
return EmptyDBCaseConfig
636657

vectordb_bench/backend/clients/api.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ class IndexType(StrEnum):
2525
DISKANN = "DISKANN"
2626
STREAMING_DISKANN = "DISKANN"
2727
IVFFlat = "IVF_FLAT"
28+
IVFOnDisk = "IVF_ON_DISK"
2829
IVFPQ = "IVF_PQ"
2930
IVFBQ = "IVF_BQ"
3031
IVFSQ8 = "IVF_SQ8"

vectordb_bench/backend/clients/pinot/__init__.py

Whitespace-only changes.
Lines changed: 250 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,250 @@
1+
from typing import Annotated, TypedDict, Unpack
2+
3+
import click
4+
from pydantic import SecretStr
5+
6+
from ....cli.cli import (
7+
CommonTypedDict,
8+
HNSWFlavor2,
9+
cli,
10+
click_parameter_decorators_from_typed_dict,
11+
run,
12+
)
13+
from .. import DB
14+
15+
16+
class PinotTypedDict(TypedDict):
    """Command-line options shared by every Pinot benchmark command.

    Each field pairs the Python type of the parsed value with the
    ``click.option`` that exposes it on the command line.
    """

    # Controller endpoint.
    controller_host: Annotated[
        str,
        click.option("--controller-host", type=str, default="localhost", help="Pinot Controller host"),
    ]
    controller_port: Annotated[
        int,
        click.option("--controller-port", type=int, default=9000, help="Pinot Controller port"),
    ]
    # Broker endpoint.
    broker_host: Annotated[
        str,
        click.option("--broker-host", type=str, default="localhost", help="Pinot Broker host"),
    ]
    broker_port: Annotated[
        int,
        click.option("--broker-port", type=int, default=8099, help="Pinot Broker port"),
    ]
    # Credentials are optional: both options default to None, so the
    # annotations admit None as well as str.
    username: Annotated[
        str | None,
        click.option("--username", type=str, default=None, help="Pinot username (optional)"),
    ]
    password: Annotated[
        str | None,
        click.option("--password", type=str, default=None, help="Pinot password (optional)"),
    ]
    # Number of rows buffered before one segment is flushed.
    ingest_batch_size: Annotated[
        int,
        click.option(
            "--ingest-batch-size",
            type=int,
            default=100_000,
            show_default=True,
            help=(
                "Rows buffered before flushing one Pinot segment (one ingestFromFile call). "
                "Larger values mean fewer segments and better IVF training / query performance. "
                "Reduce if memory is constrained (100K x 768-dim float32 ~= 300 MB)."
            ),
        ),
    ]
55+
56+
57+
def _pinot_db_config(parameters: dict):
    """Build the ``PinotConfig`` connection settings from parsed CLI parameters.

    The password is wrapped in ``SecretStr`` when one was supplied; a missing
    or empty password is passed through as ``None``.
    """
    from .config import PinotConfig

    raw_password = parameters.get("password")
    secret = SecretStr(raw_password) if raw_password else None

    return PinotConfig(
        db_label=parameters["db_label"],
        controller_host=parameters["controller_host"],
        controller_port=parameters["controller_port"],
        broker_host=parameters["broker_host"],
        broker_port=parameters["broker_port"],
        username=parameters.get("username"),
        password=secret,
        ingest_batch_size=parameters["ingest_batch_size"],
    )
70+
71+
72+
# ---------------------------------------------------------------------------
73+
# HNSW
74+
# ---------------------------------------------------------------------------
75+
76+
77+
class PinotHNSWTypedDict(CommonTypedDict, PinotTypedDict, HNSWFlavor2):
    """Options for the ``PinotHNSW`` command: common, Pinot connection and HNSW options combined."""
78+
79+
80+
# CLI command: benchmark Pinot using an HNSW vector index.
# Kept as a comment rather than a docstring so the command's --help text is unchanged.
@cli.command()
@click_parameter_decorators_from_typed_dict(PinotHNSWTypedDict)
def PinotHNSW(**parameters: Unpack[PinotHNSWTypedDict]):
    from .config import PinotHNSWConfig

    connection = _pinot_db_config(parameters)
    # The CLI exposes the query-time parameter as "ef_runtime"; the config calls it "ef".
    index_settings = PinotHNSWConfig(
        m=parameters["m"],
        ef_construction=parameters["ef_construction"],
        ef=parameters["ef_runtime"],
    )
    run(db=DB.Pinot, db_config=connection, db_case_config=index_settings, **parameters)
95+
96+
97+
# ---------------------------------------------------------------------------
98+
# IVF_FLAT
99+
# ---------------------------------------------------------------------------
100+
101+
102+
class PinotIVFFlatTypedDict(CommonTypedDict, PinotTypedDict):
    """Options for the ``PinotIVFFlat`` command, on top of the common and Pinot connection options."""

    nlist: Annotated[
        int,
        click.option("--nlist", type=int, default=128, help="Number of Voronoi cells (IVF nlist)"),
    ]
    quantizer: Annotated[
        str,
        click.option(
            "--quantizer",
            type=click.Choice(["FLAT", "SQ8", "SQ4"]),
            default="FLAT",
            help="Quantizer type for IVF_FLAT",
        ),
    ]
    nprobe: Annotated[
        int,
        click.option("--nprobe", type=int, default=8, help="Number of cells to probe at query time"),
    ]
    # The option defaults to None (meaning "not set"), so the annotation
    # admits None as well as int.
    train_sample_size: Annotated[
        int | None,
        click.option(
            "--train-sample-size",
            type=int,
            default=None,
            help="Training sample size (defaults to max(nlist*50, 1000) if not set)",
        ),
    ]
129+
130+
131+
# CLI command: benchmark Pinot using an IVF_FLAT vector index.
# Kept as a comment rather than a docstring so the command's --help text is unchanged.
@cli.command()
@click_parameter_decorators_from_typed_dict(PinotIVFFlatTypedDict)
def PinotIVFFlat(**parameters: Unpack[PinotIVFFlatTypedDict]):
    from .config import PinotIVFFlatConfig

    connection = _pinot_db_config(parameters)
    index_settings = PinotIVFFlatConfig(
        nlist=parameters["nlist"],
        quantizer=parameters["quantizer"],
        nprobe=parameters["nprobe"],
        # Optional: passed through as None when the option was not given.
        train_sample_size=parameters.get("train_sample_size"),
    )
    run(db=DB.Pinot, db_config=connection, db_case_config=index_settings, **parameters)
147+
148+
149+
# ---------------------------------------------------------------------------
150+
# IVF_PQ
151+
# ---------------------------------------------------------------------------
152+
153+
154+
class PinotIVFPQTypedDict(CommonTypedDict, PinotTypedDict):
    """Options for the ``PinotIVFPQ`` command, on top of the common and Pinot connection options."""

    nlist: Annotated[
        int,
        click.option("--nlist", type=int, default=128, help="Number of Voronoi cells (IVF nlist)"),
    ]
    pq_m: Annotated[
        int,
        click.option("--pq-m", type=int, default=8, help="Number of PQ sub-quantizers (must divide dimension)"),
    ]
    # click.Choice over string choices hands back the selected string, so the
    # parsed value is a str ("4", "6" or "8"); consumers must convert it to int.
    pq_nbits: Annotated[
        str,
        click.option(
            "--pq-nbits",
            type=click.Choice(["4", "6", "8"]),
            default="8",
            help="Bits per PQ code (4, 6, or 8)",
        ),
    ]
    train_sample_size: Annotated[
        int,
        click.option("--train-sample-size", type=int, default=6400, help="Training sample size (must be >= nlist)"),
    ]
    nprobe: Annotated[
        int,
        click.option("--nprobe", type=int, default=8, help="Number of cells to probe at query time"),
    ]
180+
181+
182+
# CLI command: benchmark Pinot using an IVF_PQ vector index.
# Kept as a comment rather than a docstring so the command's --help text is unchanged.
@cli.command()
@click_parameter_decorators_from_typed_dict(PinotIVFPQTypedDict)
def PinotIVFPQ(**parameters: Unpack[PinotIVFPQTypedDict]):
    from .config import PinotIVFPQConfig

    connection = _pinot_db_config(parameters)
    index_settings = PinotIVFPQConfig(
        nlist=parameters["nlist"],
        pq_m=parameters["pq_m"],
        # --pq-nbits is a string choice on the command line; the config takes an integer.
        pq_nbits=int(parameters["pq_nbits"]),
        train_sample_size=parameters["train_sample_size"],
        nprobe=parameters["nprobe"],
    )
    run(db=DB.Pinot, db_config=connection, db_case_config=index_settings, **parameters)
199+
200+
201+
# ---------------------------------------------------------------------------
202+
# IVF_ON_DISK
203+
# ---------------------------------------------------------------------------
204+
205+
206+
class PinotIVFOnDiskTypedDict(CommonTypedDict, PinotTypedDict):
    """Options for the ``PinotIVFOnDisk`` command, on top of the common and Pinot connection options."""

    nlist: Annotated[
        int,
        click.option("--nlist", type=int, default=128, help="Number of Voronoi cells (IVF nlist)"),
    ]
    quantizer: Annotated[
        str,
        click.option(
            "--quantizer",
            type=click.Choice(["FLAT", "SQ8", "SQ4"]),
            default="FLAT",
            help="Quantizer type for IVF_ON_DISK",
        ),
    ]
    nprobe: Annotated[
        int,
        click.option("--nprobe", type=int, default=8, help="Number of cells to probe at query time"),
    ]
    # The option defaults to None (meaning "not set"), so the annotation
    # admits None as well as int.
    train_sample_size: Annotated[
        int | None,
        click.option(
            "--train-sample-size",
            type=int,
            default=None,
            help="Training sample size (defaults to max(nlist*50, 1000) if not set)",
        ),
    ]
233+
234+
235+
# CLI command: benchmark Pinot using an IVF_ON_DISK vector index.
# Kept as a comment rather than a docstring so the command's --help text is unchanged.
@cli.command()
@click_parameter_decorators_from_typed_dict(PinotIVFOnDiskTypedDict)
def PinotIVFOnDisk(**parameters: Unpack[PinotIVFOnDiskTypedDict]):
    from .config import PinotIVFOnDiskConfig

    connection = _pinot_db_config(parameters)
    index_settings = PinotIVFOnDiskConfig(
        nlist=parameters["nlist"],
        quantizer=parameters["quantizer"],
        nprobe=parameters["nprobe"],
        # Optional: passed through as None when the option was not given.
        train_sample_size=parameters.get("train_sample_size"),
    )
    run(db=DB.Pinot, db_config=connection, db_case_config=index_settings, **parameters)

0 commit comments

Comments
 (0)