Skip to content

Commit a170d12

Browse files
dmeliksetian and claude committed
Keep qargs compose entirely on GPU; add qargs benchmark
Defer cp.asnumpy() in the qargs branch until after cp.repeat and scatter assignment, so the full embedding stays on GPU. Previously x3/z3/phase were transferred back to CPU before np.repeat was called.

Also expand ASV benchmarks with SparsePauliOpGPUComposeQargsBench to measure CPU vs GPU speedup for the qargs path across varying total/sub qubit counts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent d3f52a7 commit a170d12

2 files changed

Lines changed: 67 additions & 10 deletions

File tree

qiskit/quantum_info/operators/symplectic/sparse_pauli_op.py

Lines changed: 10 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -390,18 +390,21 @@ def compose(
390390
x3 = xp.logical_xor(_x1[:, None], _x2).reshape((-1, num_qubits))
391391
z3 = xp.logical_xor(_z1[:, None], _z2).reshape((-1, num_qubits))
392392

393-
if _use_gpu:
394-
x3 = cp.asnumpy(x3)
395-
z3 = cp.asnumpy(z3)
396-
phase = cp.asnumpy(phase)
397-
398393
if qargs is None:
394+
if _use_gpu:
395+
x3 = cp.asnumpy(x3)
396+
z3 = cp.asnumpy(z3)
397+
phase = cp.asnumpy(phase)
399398
pauli_list = PauliList(BasePauli(z3, x3, phase))
400399
else:
401-
x4 = np.repeat(self.paulis.x, other.size, axis=0)
402-
z4 = np.repeat(self.paulis.z, other.size, axis=0)
400+
x4 = xp.repeat(xp.asarray(self.paulis.x), other.size, axis=0)
401+
z4 = xp.repeat(xp.asarray(self.paulis.z), other.size, axis=0)
403402
x4[:, qargs] = x3
404403
z4[:, qargs] = z3
404+
if _use_gpu:
405+
x4 = cp.asnumpy(x4)
406+
z4 = cp.asnumpy(z4)
407+
phase = cp.asnumpy(phase)
405408
pauli_list = PauliList(BasePauli(z4, x4, phase))
406409

407410
# note: the following is a faster code equivalent to

test/benchmarks/quantum_info.py

Lines changed: 57 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -215,8 +215,17 @@ class SparsePauliOpGPUComposeBench:
215215
``_GPU_COMPOSE_THRESHOLD`` (5 000 000), ensuring the CuPy branch is taken.
216216
"""
217217

218-
# (num_qubits, num_terms): product num_terms^2 * num_qubits must be > 5_000_000
219-
params = [[10, 800], [20, 520], [30, 420]]
218+
# "num_qubits,num_terms": product num_terms^2 * num_qubits must be > 5_000_000
219+
# Rows span just-above-threshold (~6M) up to large tensors (~50M) at varying
220+
# qubit counts, so we can see how GPU speedup scales with both tensor size and shape.
221+
params = [
222+
# ~6M elements (just above threshold)
223+
"10,800", "20,520", "30,420",
224+
# ~20M elements
225+
"10,1500", "20,1000", "30,820", "50,640",
226+
# ~50M elements
227+
"10,2300", "20,1600", "50,1000",
228+
]
220229
param_names = ["num_qubits,num_terms"]
221230

222231
def setup(self, num_qubits_num_terms):
@@ -225,7 +234,7 @@ def setup(self, num_qubits_num_terms):
225234
except ImportError as exc:
226235
raise NotImplementedError("CuPy not installed") from exc
227236

228-
num_qubits, num_terms = num_qubits_num_terms
237+
num_qubits, num_terms = map(int, num_qubits_num_terms.split(","))
229238
self.p1 = SparsePauliOp(
230239
random_pauli_list(num_qubits=num_qubits, size=num_terms, phase=True)
231240
)
@@ -245,3 +254,48 @@ def time_compose_cpu(self, _):
245254

246255
with unittest.mock.patch.object(_spo, "_GPU_COMPOSE_THRESHOLD", 10**18):
247256
self.p1.compose(self.p2)
257+
258+
259+
class SparsePauliOpGPUComposeQargsBench:
260+
"""Benchmark SparsePauliOp.compose with qargs on GPU vs CPU.
261+
262+
Composes a smaller operator onto a subset of a larger operator's qubits,
263+
exercising the cp.repeat + scatter path added alongside the qargs=None GPU path.
264+
"""
265+
266+
# "total_qubits,sub_qubits,num_terms": compose a sub_qubits operator onto
267+
# a subset of a total_qubits operator. num_terms^2 * sub_qubits > 5_000_000.
268+
params = [
269+
"20,10,800", "30,10,800", "50,10,800",
270+
"30,20,520", "50,20,520",
271+
"50,30,420",
272+
]
273+
param_names = ["total_qubits,sub_qubits,num_terms"]
274+
275+
def setup(self, params):
276+
try:
277+
import cupy # noqa: F401 pylint: disable=import-outside-toplevel
278+
except ImportError as exc:
279+
raise NotImplementedError("CuPy not installed") from exc
280+
281+
total_qubits, sub_qubits, num_terms = map(int, params.split(","))
282+
self.p1 = SparsePauliOp(
283+
random_pauli_list(num_qubits=total_qubits, size=num_terms, phase=True)
284+
)
285+
self.p2 = SparsePauliOp(
286+
random_pauli_list(num_qubits=sub_qubits, size=num_terms, phase=True)
287+
)
288+
self.qargs = list(range(sub_qubits))
289+
290+
def time_compose_qargs_gpu(self, _):
291+
"""GPU path: compose smaller op onto subset of qubits."""
292+
self.p1.compose(self.p2, qargs=self.qargs)
293+
294+
def time_compose_qargs_cpu(self, _):
295+
"""CPU path on same inputs for direct comparison."""
296+
from qiskit.quantum_info.operators.symplectic import ( # pylint: disable=import-outside-toplevel
297+
sparse_pauli_op as _spo,
298+
)
299+
300+
with unittest.mock.patch.object(_spo, "_GPU_COMPOSE_THRESHOLD", 10**18):
301+
self.p1.compose(self.p2, qargs=self.qargs)

0 commit comments

Comments
 (0)