forked from Qiskit/qiskit
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathroute.rs
More file actions
941 lines (894 loc) · 38.5 KB
/
route.rs
File metadata and controls
941 lines (894 loc) · 38.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
// This code is part of Qiskit.
//
// (C) Copyright IBM 2024
//
// This code is licensed under the Apache License, Version 2.0. You may
// obtain a copy of this license in the LICENSE.txt file in the root directory
// of this source tree or at https://www.apache.org/licenses/LICENSE-2.0.
//
// Any modifications or derivative works of this code must retain this
// copyright notice, and modified files need to carry a notice indicating
// that they have been altered from the originals.
use std::cmp::Ordering;
use std::collections::VecDeque;
use std::convert::Infallible;
use std::num::NonZero;
use numpy::{PyArray2, ToPyArray};
use pyo3::Python;
use pyo3::exceptions::PyValueError;
use pyo3::prelude::*;
use pyo3::types::PyDict;
use hashbrown::HashSet;
use indexmap::IndexMap;
use ndarray::Array2;
use rand::prelude::*;
use rand_pcg::Pcg64Mcg;
use rayon_cond::CondIterator;
use rustworkx_core::dictmap::*;
use rustworkx_core::petgraph::prelude::*;
use rustworkx_core::petgraph::visit::{EdgeCount, EdgeRef};
use rustworkx_core::shortest_path::{dijkstra, distance_matrix};
use rustworkx_core::token_swapper::token_swapper;
use smallvec::{SmallVec, smallvec};
use super::dag::{InteractionKind, SabreDAG};
use super::heuristic::{BasicHeuristic, DecayHeuristic, Heuristic, LookaheadHeuristic, SetScaling};
use super::layer::{ExtendedSet, FrontLayer};
use super::vec_map::VecMap;
use crate::TranspilerError;
use crate::neighbors::Neighbors;
use crate::target::{Target, TargetCouplingError};
use qiskit_circuit::dag_circuit::{DAGCircuit, DAGCircuitBuilder, NodeType, Wire};
use qiskit_circuit::nlayout::NLayout;
use qiskit_circuit::operations::{ControlFlow, StandardGate};
use qiskit_circuit::packed_instruction::PackedInstruction;
use qiskit_circuit::{BlocksMode, PhysicalQubit, Qubit, VirtualQubit, getenv_use_multiple_threads};
/// Number of trials for control flow block swap epilogues.
///
/// Passed to the token swapper when appending the swap circuit that restores a routed
/// control-flow block to its parent's layout (see `State::route_control_flow_block`).
const SWAP_EPILOGUE_TRIALS: usize = 4;
/// The number of control-flow blocks to take off the stack.
///
/// This funky struct is just a trick to get the Rust compiler to use the niche optimisation for
/// `RoutedItemKind`. We actually store the number of blocks as the bitwise negation of the true
/// number, and disallow there being `u32::MAX` blocks.
///
/// At the time of writing (2025-06-11), Qiskit's Python-space model doesn't allow constructing any
/// control-flow operations with zero blocks, so we could use `NonZero<u32>` directly. Technically,
/// though, a `switch` on a zero-bit register _could_ be valid and have zero blocks, so doing this
/// little trick makes us safe against that long-range assumption changing, for zero measurable
/// runtime cost.
///
/// Construct values via `From<u32>` and read them back with [`ControlFlowBlockCount::get`]; both
/// apply the negation, so callers never observe the inverted representation.
#[repr(transparent)]
#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
struct ControlFlowBlockCount(NonZero<u32>);
impl ControlFlowBlockCount {
    /// The true number of blocks (undoing the bitwise negation used for storage).
    pub fn get(self) -> u32 {
        !self.0.get()
    }
}
impl From<u32> for ControlFlowBlockCount {
fn from(val: u32) -> Self {
Self((!val).try_into().expect("cannot store u32::MAX blocks"))
}
}
/// What sort of node a [`RoutedItem`] represents.
enum RoutedItemKind {
    /// A regular [`SabreDAG`] node.
    Simple,
    /// How many blocks out of [`Order::control_flow`] we need to take. This is stored out-of-band
    /// of the item kind because control-flow is expected to be very uncommon, and we don't want to
    /// needlessly increase the size of the enum.
    ControlFlow(ControlFlowBlockCount),
}
/// A node in the routing order.
struct RoutedItem {
    /// The swaps (if any) to insert before the node. It is usually `Some` for two-qubit run nodes,
    /// but can be `None` at the start of a circuit (if the `initial_layout` permits some gates to
    /// be immediately routed) or for `Synchronize` or `ControlFlow` nodes.
    initial_swaps: Option<Box<[[PhysicalQubit; 2]]>>,
    /// The corresponding node in the Sabre graph.
    node: NodeIndex,
    /// Whether the node is a simple gate or a control-flow operation (and, if the latter, how
    /// many routed blocks it owns in [`Order::control_flow`]).
    kind: RoutedItemKind,
}
impl RoutedItem {
    /// The swaps to apply before this item, or an empty slice if there are none.
    #[inline]
    pub fn initial_swaps(&self) -> &[[PhysicalQubit; 2]] {
        match &self.initial_swaps {
            Some(swaps) => swaps,
            None => &[],
        }
    }
}
/// The final analysis of the Sabre routing algorithm.
///
/// This represents a total order of instructions to be applied, including swaps, to produce a
/// circuit that is fully routed. This structure alone is insufficient; it contains references to a
/// [SabreDAG], which in turn contains references to a [DAGCircuit], and you need the initial
/// [NLayout] object to know where the virtual qubits in the input [DAGCircuit] should be mapped to
/// at the start of the circuit. The [RoutingResult] object wraps up this object with the other
/// necessary components.
struct Order<'a> {
    /// The order the Sabre nodes get routed in.
    order: Vec<RoutedItem>,
    /// The swaps to apply after the last item of [`order`]. This generally happens within
    /// control-flow blocks.
    final_swaps: Vec<[PhysicalQubit; 2]>,
    /// Flattened list of control-flow node results. A given [`RoutedItem`] might require some
    /// blocks to produce its complete node. We store them here, rather than in the [`RoutedItem`],
    /// because control flow is rare, and this way we can keep the struct size of items smaller.
    /// Items consume entries from this list in routing order.
    control_flow: Vec<RoutingResult<'a>>,
}
impl<'a> Order<'a> {
    /// Initialize an empty `Order` with suitable capacity for the given problem.
    #[inline]
    pub fn for_problem(problem: RoutingProblem<'a>) -> Self {
        // Every Sabre node produces at most one item, so the node count is the right bound.
        let node_count = problem.sabre.dag.node_count();
        Self {
            order: Vec::with_capacity(node_count),
            final_swaps: Vec::new(),
            control_flow: Vec::new(),
        }
    }
    /// Count the number of swaps inserted at the top level (i.e. without recursing into
    /// control-flow operations).
    #[inline]
    pub fn swap_count(&self) -> usize {
        self.order
            .iter()
            .fold(0, |total, item| total + item.initial_swaps().len())
    }
}
/// A complete result from the Sabre routing algorithm, including the initial problem and layout
/// that it searched from.
///
/// The [`Order`] is the calculated result from the analysis (and the [`final_layout`] field is a
/// derived quantity that we simply get for free at the end of the algorithm, so store here), and
/// this struct wraps it up with the problem description and [`initial_layout`] necessary to fully
/// interpret it.
pub struct RoutingResult<'a> {
    /// The problem (target, DAGs and heuristic) this result was computed for.
    problem: RoutingProblem<'a>,
    /// The computed routing order, including all inserted swaps.
    order: Order<'a>,
    /// The initial layout that the routing algorithm started from.
    pub initial_layout: NLayout,
    /// The layout after the routing algorithm had finished. This can be rederived from [order] and
    /// [initial_layout], but we get it for free anyway.
    pub final_layout: NLayout,
}
impl RoutingResult<'_> {
    /// Count the number of swaps inserted at the top level (i.e. without recursing into
    /// control-flow operations).
    #[inline]
    pub fn swap_count(&self) -> usize {
        self.order.swap_count()
    }
    /// The number of physical qubits tracked by this result's layouts.
    fn num_qubits(&self) -> usize {
        self.initial_layout.num_qubits()
    }
    /// Rebuild the physical circuit from the virtual DAG, using the natural width of the target of
    /// this component.
    ///
    /// This is the correct method to call if the [RoutingTarget] represented the entirety of the
    /// device. If the device was subset (such as for disjoint handling), use [rebuild_onto] with
    /// suitable mappings back to the full-width [PhysicalQubit] instances instead.
    pub fn rebuild(&self) -> PyResult<DAGCircuit> {
        let num_swaps = self.order.swap_count();
        // Pre-size the output: each inserted swap adds one op node and two wire edges.
        let dag = self.problem.dag.physical_empty_like_with_capacity(
            self.num_qubits(),
            self.problem.dag.num_ops() + num_swaps,
            self.problem.dag.dag().edge_count() + 2 * num_swaps,
            BlocksMode::Drop,
        )?;
        self.rebuild_onto(dag, |q| q)
    }
    /// Construct the routed [DAGCircuit] from the result, placing the operations onto an existing
    /// [DAGCircuit]
    ///
    /// `dag` should be a physical circuit that is the full width of the device, and already contain
    /// all the classical data that is referenced by this circuit (clbits, classical registers,
    /// vars, stretches, global phase, metadata, etc). The target [DAGCircuit] might have more
    /// qubits than the layouts in this [RoutingResult], for example if the device was subset for
    /// disjoint handling. Use `map_fn` to map the "physical qubits" in this component to the
    /// actual physical qubits they refer to.
    pub fn rebuild_onto(
        &self,
        dag: DAGCircuit,
        map_fn: impl Fn(PhysicalQubit) -> PhysicalQubit,
    ) -> PyResult<DAGCircuit> {
        // Append a swap gate to the builder and track its effect on the running layout.
        let apply_swap = |swap: &[PhysicalQubit; 2],
                          layout: &mut NLayout,
                          dag: &mut DAGCircuitBuilder|
         -> PyResult<NodeIndex> {
            layout.swap_physical(swap[0], swap[1]);
            let swap = PackedInstruction::from_standard_gate(
                StandardGate::Swap,
                None,
                dag.insert_qargs(&[Qubit(map_fn(swap[1]).0), Qubit(map_fn(swap[0]).0)]),
            );
            dag.push_back(swap)
        };
        // The size here is pretty arbitrary, providing it can fit at least 2q operations in.
        let mut apply_scratch = Vec::with_capacity(4);
        // Append a copy of `inst` with its virtual qargs rewritten through the current layout
        // (and `map_fn`) to physical qubits.
        let mut apply_op = |inst: &PackedInstruction,
                            layout: &NLayout,
                            dag: &mut DAGCircuitBuilder|
         -> PyResult<NodeIndex> {
            apply_scratch.clear();
            for qubit in self.problem.dag.get_qargs(inst.qubits) {
                apply_scratch.push(Qubit(map_fn(VirtualQubit(qubit.0).to_phys(layout)).0));
            }
            let new_inst = PackedInstruction {
                qubits: dag.insert_qargs(&apply_scratch),
                ..inst.clone()
            };
            dag.push_back(new_inst)
        };
        let mut dag = dag.into_builder();
        let mut layout = self.initial_layout.clone();
        // Control-flow results are consumed from this iterator in routing order.
        let mut blocks = self.order.control_flow.iter();
        // First, the nodes that were routable under the initial layout with no swaps at all.
        for node in &self.problem.sabre.initial {
            let NodeType::Operation(inst) = &self.problem.dag[*node] else {
                panic!("Sabre DAG should only contain op nodes");
            };
            apply_op(inst, &layout, &mut dag)?;
        }
        for item in &self.order.order {
            for swap in item.initial_swaps() {
                apply_swap(swap, &mut layout, &mut dag)?;
            }
            // In theory, `indices` will always have at least one entry if you're rebuilding the
            // DAG from a Sabre result, because there wouldn't be a Sabre node without at least one
            // DAG node backing it. That said, we _do_ allow construction of Sabre graphs that have
            // thrown away this information ([SabreDAG::only_interactions]), and there's still a
            // well-defined behaviour to take.
            let split = self.problem.sabre.dag[item.node].indices.split_first();
            let Some((head, rest)) = split else {
                continue;
            };
            let NodeType::Operation(inst) = &self.problem.dag[*head] else {
                panic!("Sabre DAG should only contain op nodes");
            };
            match item.kind {
                RoutedItemKind::Simple => apply_op(inst, &layout, &mut dag)?,
                RoutedItemKind::ControlFlow(num_blocks) => {
                    // Recursively rebuild the routed inner blocks belonging to this node.
                    let mut blocks = blocks
                        .by_ref()
                        .take(num_blocks.get() as usize)
                        .map(|block| block.rebuild())
                        .collect::<Result<Vec<_>, _>>()?;
                    let explicit = self
                        .problem
                        .dag
                        .get_qargs(inst.qubits)
                        .iter()
                        .map(|q| VirtualQubit(q.index() as u32))
                        .collect::<HashSet<_>>();
                    // Collect lists of the qargs that will remain, and the idle qubits that need to
                    // be removed from the DAG, then remove the idle ones.
                    // TODO: this logic of collecting the remaining `qargs` in order is making an
                    // assumption that the later call to `DAGCircuit::remove_qubits` retains
                    // relative ordering of the remaining qubits, but the method doesn't formally
                    // commit to that.
                    let mut qargs = Vec::new();
                    let mut idle = Vec::new();
                    for qubit in 0..self.num_qubits() as u32 {
                        let phys = PhysicalQubit::new(qubit);
                        let virt = phys.to_virt(&layout);
                        let qubit = Qubit(qubit);
                        // Keep a qubit if the outer op names it explicitly, or any inner block
                        // actually uses its wire.
                        if explicit.contains(&virt)
                            || blocks
                                .iter()
                                .any(|dag| !dag.is_wire_idle(Wire::Qubit(qubit)))
                        {
                            qargs.push(Qubit(map_fn(phys).0));
                        } else {
                            idle.push(qubit);
                        }
                    }
                    for dag in &mut blocks {
                        dag.remove_qubits(idle.iter().copied())?;
                    }
                    let mut new_op = inst.op.control_flow().clone();
                    if !matches!(
                        &new_op.control_flow,
                        ControlFlow::BreakLoop | ControlFlow::ContinueLoop
                    ) {
                        new_op.num_qubits = blocks[0].num_qubits() as u32;
                    }
                    let blocks = blocks.into_iter().map(|b| dag.add_block(b)).collect();
                    let new_inst = PackedInstruction::from_control_flow(
                        new_op,
                        blocks,
                        dag.insert_qargs(&qargs),
                        inst.clbits,
                        inst.label.as_deref().cloned(),
                    );
                    dag.push_back(new_inst)?
                }
            };
            // Any further DAG nodes backing the same Sabre node route with the same layout.
            for node in rest {
                let NodeType::Operation(inst) = &self.problem.dag[*node] else {
                    panic!("sabre DAG should only contain op nodes");
                };
                apply_op(inst, &layout, &mut dag)?;
            }
        }
        for swap in &self.order.final_swaps {
            apply_swap(swap, &mut layout, &mut dag)?;
        }
        debug_assert_eq!(layout, self.final_layout);
        Ok(dag.build())
    }
}
/// A description of the QPU that we're routing to.
#[derive(Clone, Debug)]
pub struct RoutingTarget {
    /// Adjacency structure of the physical qubits (the coupling constraints).
    pub neighbors: Neighbors,
    /// All-pairs distance matrix between physical qubits, derived from [`neighbors`].
    pub distance: Array2<f64>,
}
impl RoutingTarget {
    /// Build a routing target from its neighbor table, deriving the distance matrix.
    pub fn from_neighbors(neighbors: Neighbors) -> Self {
        // Compute the distances before `neighbors` is moved into the struct.
        let distance = distance_matrix(&neighbors, usize::MAX, false, f64::NAN);
        Self {
            neighbors,
            distance,
        }
    }
    /// The number of physical qubits in the target.
    #[inline]
    pub fn num_qubits(&self) -> usize {
        self.neighbors.num_qubits()
    }
}
/// Python wrapper for the Rust-space Sabre target object.
///
/// Contains `None` when the target had all-to-all connectivity (in which case the two property
/// methods [coupling_list] and [distance_matrix] also return `None`).
#[pyclass]
#[pyo3(name = "RoutingTarget", module = "qiskit._accelerate.sabre")]
pub struct PyRoutingTarget(pub Option<RoutingTarget>);
#[pymethods]
impl PyRoutingTarget {
    /// Default Python constructor; starts as the all-to-all sentinel and is populated by
    /// `__setstate__` during unpickling.
    #[new]
    fn py_new() -> Self {
        PyRoutingTarget(None)
    }
    /// Pickle support: serialise only the neighbor table and its partition; the distance matrix
    /// is recomputed in `__setstate__`.
    fn __getstate__<'py>(&self, py: Python<'py>) -> PyResult<Bound<'py, PyDict>> {
        // `Option<(A, B)>::unzip` yields `(None, None)` for the all-to-all case.
        let (neighbors, partition) = self
            .0
            .as_ref()
            .map(|tg| tg.neighbors.clone().take())
            .unzip();
        let out_dict = PyDict::new(py);
        out_dict.set_item("neighbors", neighbors)?;
        out_dict.set_item("partition", partition)?;
        Ok(out_dict)
    }
    /// Pickle support: rebuild the target (including the derived distance matrix) from the state
    /// dict produced by `__getstate__`.
    fn __setstate__(&mut self, value: Bound<PyDict>) -> PyResult<()> {
        let neighbors = value
            .get_item("neighbors")?
            .map(|x| x.extract())
            .transpose()?;
        let partition = value
            .get_item("partition")?
            .map(|x| x.extract())
            .transpose()?;
        // Missing components mean the pickled target was all-to-all; leave `self.0` as `None`.
        let (Some(neighbors), Some(partition)) = (neighbors, partition) else {
            return Ok(());
        };
        let neighbors = Neighbors::from_parts(neighbors, partition)
            .map_err(|e| PyValueError::new_err(e.to_string()))?;
        self.0 = Some(RoutingTarget::from_neighbors(neighbors));
        Ok(())
    }
    /// Build the Sabre target from a transpiler [Target], mapping an all-to-all coupling to the
    /// `None` sentinel and multi-qubit coupling errors to `TranspilerError`.
    #[staticmethod]
    pub(crate) fn from_target(target: &Target) -> PyResult<Self> {
        let coupling = match target.coupling_graph() {
            Ok(coupling) => coupling,
            Err(TargetCouplingError::AllToAll) => return Ok(Self(None)),
            Err(e @ TargetCouplingError::MultiQ(_)) => {
                return Err(TranspilerError::new_err(e.to_string()));
            }
        };
        Ok(Self(Some(RoutingTarget::from_neighbors(
            Neighbors::from_coupling(&coupling),
        ))))
    }
    /// The coupling edges as a list of qubit pairs, or `None` for an all-to-all target.
    fn coupling_list(&self) -> Option<Vec<[PhysicalQubit; 2]>> {
        use rustworkx_core::petgraph::visit::IntoEdgeReferences;
        self.0.as_ref().map(|target| {
            target
                .neighbors
                .edge_references()
                .map(|edge| [edge.source(), edge.target()])
                .collect()
        })
    }
    /// The distance matrix as a NumPy array, or `None` for an all-to-all target.
    fn distance_matrix<'py>(&self, py: Python<'py>) -> Option<Bound<'py, PyArray2<f64>>> {
        self.0.as_ref().map(|target| target.distance.to_pyarray(py))
    }
}
/// Helper record struct for a Sabre routing problem.
///
/// This is mostly just encapsulation to make the nested call sites less verbose.
#[derive(Clone, Copy, Debug)]
pub struct RoutingProblem<'a> {
    /// The coupling constraints and distance matrix being routed against.
    pub target: &'a RoutingTarget,
    /// The interaction-graph view of the circuit that the routing algorithm walks.
    pub sabre: &'a SabreDAG,
    /// The circuit the Sabre DAG's node indices refer back into.
    pub dag: &'a DAGCircuit,
    /// The scoring-heuristic configuration.
    pub heuristic: &'a Heuristic,
}
impl<'a> RoutingProblem<'a> {
    /// The same problem, but using a different [SabreDAG] representation.
    pub fn with_sabre(self, sabre: &'a SabreDAG) -> Self {
        Self { sabre, ..self }
    }
}
/// Long-term internal state of the Sabre routing algorithm.
///
/// This includes all the scratch space and tracking that we use over the course of many swap
/// insertions, but doesn't include ephemeral state that never needs to leave the main loop. This
/// is mostly just a convenience, so we don't have to pass everything from function to function.
struct State {
    /// The current virtual-to-physical mapping; updated as swaps are applied.
    layout: NLayout,
    /// Two-qubit nodes whose predecessors are satisfied but which aren't routable yet.
    front_layer: FrontLayer,
    /// Lookahead window of upcoming two-qubit interactions, used by the lookahead heuristic.
    extended_set: ExtendedSet,
    /// How many predecessors still need to be satisfied for each node index before it is at the
    /// front of the topological iteration through the nodes as they're routed.
    required_predecessors: VecMap<NodeIndex, u32>,
    /// Per-qubit multipliers for the decay heuristic; incremented on recently swapped qubits to
    /// penalise reusing them.
    decay: VecMap<PhysicalQubit, f64>,
    /// Reusable allocated storage space for accumulating and scoring swaps. This is owned as part
    /// of the general state to avoid reallocation costs.
    swap_scores: Vec<([PhysicalQubit; 2], f64)>,
    /// Reusable allocated storage space for tracking the current best swaps. This is owned as
    /// part of the general state to avoid reallocation costs.
    best_swaps: Vec<[PhysicalQubit; 2]>,
    /// Seeded RNG used to break ties between equally scored swaps.
    rng: Pcg64Mcg,
    /// The seed this trial was created with; reused when routing control-flow blocks.
    seed: u64,
}
impl State {
    /// Apply a swap to the program-state structures (front layer, extended set and current
    /// layout).
    #[inline]
    fn apply_swap(&mut self, swap: [PhysicalQubit; 2]) {
        self.front_layer.apply_swap(swap);
        self.extended_set.apply_swap(swap);
        self.layout.swap_physical(swap[0], swap[1]);
    }
    /// Return the node, if any, that is on this qubit and is routable with the current layout.
    #[inline]
    fn routable_node_on_qubit(
        &self,
        problem: RoutingProblem,
        qubit: PhysicalQubit,
    ) -> Option<NodeIndex> {
        self.front_layer.qubits()[qubit].and_then(|(node, other)| {
            problem
                .target
                .neighbors
                .contains_edge(qubit, other)
                .then_some(node)
        })
    }
    /// Search forwards from [nodes], adding any that are reachable, routable and have no
    /// unsatisfied dependencies to the result.
    ///
    /// All nodes in [nodes] must:
    ///
    /// * have no unsatisfied predecessors
    /// * not be in the [FrontLayer]
    ///
    /// # Panics
    ///
    /// If [initial_swaps] is given, but no nodes can be routed.
    fn update_route<'a>(
        &mut self,
        problem: RoutingProblem<'a>,
        order: &mut Order<'a>,
        nodes: &[NodeIndex],
        mut initial_swaps: Option<Vec<[PhysicalQubit; 2]>>,
    ) {
        let mut to_visit = nodes.iter().copied().collect::<VecDeque<_>>();
        while let Some(node_id) = to_visit.pop_front() {
            let node = &problem.sabre.dag[node_id];
            let kind = match &node.kind {
                InteractionKind::Synchronize => RoutedItemKind::Simple,
                InteractionKind::TwoQ([a, b]) => {
                    let a = a.to_phys(&self.layout);
                    let b = b.to_phys(&self.layout);
                    if problem.target.neighbors.contains_edge(a, b) {
                        RoutedItemKind::Simple
                    } else {
                        // Not routable under the current layout; park it in the front layer for
                        // the swap-search loop and keep scanning for other routable nodes.
                        self.front_layer.insert(node_id, [a, b]);
                        continue;
                    }
                }
                InteractionKind::ControlFlow(blocks) => {
                    let dag_node_id = *node.indices.first().expect(
                        "if control-flow interactions are included, so are original DAG indices",
                    );
                    let NodeType::Operation(inst) = &problem.dag[dag_node_id] else {
                        panic!("Sabre DAG should only contain op nodes");
                    };
                    // The control-flow blocks aren't full width, so their "virtual" qubits aren't
                    // numbered the same as the full circuit's. We still need it to route _as if_
                    // it's fully expanded with ancillas, though.
                    let mut layout =
                        NLayout::generate_trivial_layout(problem.target.num_qubits() as u32);
                    for (inner, outer) in problem.dag.get_qargs(inst.qubits).iter().enumerate() {
                        // The virtual qubit _inside_ the DAG block is mapped to some meaningless
                        // physical qubit in our current layout...
                        let dummy = VirtualQubit::new(inner as u32).to_phys(&layout);
                        // ... and we want it to be mapped to the current physical qubit of the
                        // corresponding outer virtual qubit. We don't care where the dummy goes.
                        let actual = VirtualQubit::new(outer.index() as u32).to_phys(&self.layout);
                        layout.swap_physical(dummy, actual);
                    }
                    order.control_flow.extend(blocks.iter().map(|(sabre, dag)| {
                        let problem = RoutingProblem {
                            sabre,
                            dag,
                            ..problem
                        };
                        self.route_control_flow_block(problem, &layout)
                    }));
                    RoutedItemKind::ControlFlow((blocks.len() as u32).into())
                }
            };
            // The pending swaps (if any) attach to the first node we successfully route.
            order.order.push(RoutedItem {
                initial_swaps: initial_swaps.take().map(Vec::into_boxed_slice),
                node: node_id,
                kind,
            });
            // Release any successors whose dependencies are now fully satisfied.
            for edge in problem
                .sabre
                .dag
                .edges_directed(node_id, Direction::Outgoing)
            {
                let successor = edge.target();
                self.required_predecessors[successor] -= 1;
                if self.required_predecessors[successor] == 0 {
                    to_visit.push_back(successor);
                }
            }
        }
        assert!(
            initial_swaps.is_none(),
            "if initial swaps are given, at least one node must be known to be routable"
        );
    }
    /// Inner worker to route a control-flow block. Since control-flow blocks are routed to
    /// restore the layout at the end of themselves, and the recursive calls spawn their own
    /// tracking states, this does not affect our own state.
    fn route_control_flow_block<'a>(
        &self,
        problem: RoutingProblem<'a>,
        layout: &NLayout,
    ) -> RoutingResult<'a> {
        let mut result = swap_map_trial(problem, layout, self.seed);
        // For now, we always append a swap circuit that gets the inner block back to the
        // parent's layout.
        result.order.final_swaps = token_swapper(
            &problem.target.neighbors,
            // Map physical location in the final layout from the inner routing to the current
            // location in the outer routing.
            result
                .final_layout
                .iter_physical()
                .map(|(p, v)| (p, v.to_phys(layout)))
                .collect(),
            Some(SWAP_EPILOGUE_TRIALS),
            Some(self.seed),
            None,
        )
        .unwrap()
        .into_iter()
        .map(|(l, r)| {
            [
                PhysicalQubit::new(l.index() as u32),
                PhysicalQubit::new(r.index() as u32),
            ]
        })
        .collect();
        result.final_layout = layout.clone();
        result
    }
    /// Fill the given `extended_set` with the next nodes that would be reachable after the front
    /// layer (and themselves). This uses `required_predecessors` as scratch space for efficiency,
    /// but returns it to the same state as the input on return.
    fn populate_extended_set(&mut self, problem: RoutingProblem) {
        let extended_set_size =
            if let Some(LookaheadHeuristic { size, .. }) = problem.heuristic.lookahead {
                size
            } else {
                return;
            };
        let mut to_visit = self.front_layer.iter_nodes().copied().collect::<Vec<_>>();
        let mut decremented: IndexMap<NodeIndex, u32, ahash::RandomState> =
            IndexMap::with_hasher(ahash::RandomState::default());
        let mut i = 0;
        while i < to_visit.len() && self.extended_set.len() < extended_set_size {
            let node = to_visit[i];
            for edge in problem.sabre.dag.edges_directed(node, Direction::Outgoing) {
                let successor = edge.target();
                *decremented.entry(successor).or_insert(0) += 1;
                self.required_predecessors[successor] -= 1;
                if self.required_predecessors[successor] == 0 {
                    // TODO: this looks "through" control-flow ops without seeing them, but we
                    // actually eagerly route control-flow blocks as soon as they're eligible, so
                    // they should be reflected in the extended set.
                    if let InteractionKind::TwoQ([a, b]) = &problem.sabre.dag[successor].kind {
                        self.extended_set
                            .push([a.to_phys(&self.layout), b.to_phys(&self.layout)]);
                    }
                    to_visit.push(successor);
                }
            }
            i += 1;
        }
        // Undo the decrements above, restoring `required_predecessors` to its input state.
        for (node, amount) in decremented.iter() {
            self.required_predecessors[*node] += *amount;
        }
    }
    /// Add swaps to the current set that greedily bring the nearest node together. This is a
    /// "release valve" mechanism; it ignores all the Sabre heuristics and forces progress, so we
    /// can't get permanently stuck.
    fn force_enable_closest_node(
        &mut self,
        problem: RoutingProblem,
        current_swaps: &mut Vec<[PhysicalQubit; 2]>,
    ) -> SmallVec<[NodeIndex; 2]> {
        let (&closest_node, &qubits) = {
            let dist = &problem.target.distance;
            self.front_layer
                .iter()
                .min_by(|(_, qubits_a), (_, qubits_b)| {
                    dist[[qubits_a[0].index(), qubits_a[1].index()]]
                        .partial_cmp(&dist[[qubits_b[0].index(), qubits_b[1].index()]])
                        .unwrap_or(Ordering::Equal)
                })
                .expect("front layer is never empty, except when routing is complete")
        };
        let shortest_path = {
            let mut shortest_paths: DictMap<PhysicalQubit, Vec<PhysicalQubit>> = DictMap::new();
            (dijkstra(
                &problem.target.neighbors,
                qubits[0],
                Some(qubits[1]),
                |_| Ok(1.),
                Some(&mut shortest_paths),
            ) as Result<Vec<_>, Infallible>)
                .expect("error is infallible");
            shortest_paths
                .swap_remove(&qubits[1])
                .expect("target is required to be connected")
        };
        // Insert greedy swaps along that shortest path, splitting them between moving the left side
        // and moving the right side to minimise the depth. One side needs to move up to the split
        // point and the other can stop one short because the gate will be routable then. A path of
        // `n` vertices (`n >= 3` here, because the gate wasn't directly routable) needs exactly
        // `n - 2` swaps.
        let split: usize = shortest_path.len() / 2;
        current_swaps.reserve(shortest_path.len() - 2);
        for i in 0..split {
            current_swaps.push([shortest_path[i], shortest_path[i + 1]]);
        }
        // The right-hand qubit only needs to reach the vertex adjacent to the split point.
        // Counting the remaining distance explicitly (rather than assuming `split - 1`) avoids
        // pushing a redundant extra swap — which would just exchange the two gate qubits with each
        // other — when the path has an even number of vertices, keeping the total at the `n - 2`
        // reserved above.
        for i in 0..(shortest_path.len() - split).saturating_sub(2) {
            let end = shortest_path.len() - 1 - i;
            current_swaps.push([shortest_path[end], shortest_path[end - 1]]);
        }
        current_swaps.iter().for_each(|&swap| self.apply_swap(swap));
        // If we apply a single swap it could be that we route 2 nodes; that is a setup like
        // A - B - A - B
        // and we swap the middle two qubits. This cannot happen if we apply 2 or more swaps.
        match current_swaps.as_slice() {
            [swap] => swap
                .iter()
                .filter_map(|q| self.routable_node_on_qubit(problem, *q))
                .collect(),
            _ => smallvec![closest_node],
        }
    }
    /// Return the swap of two virtual qubits that produces the best score of all possible swaps.
    fn choose_best_swap(&mut self, problem: RoutingProblem) -> [PhysicalQubit; 2] {
        // Obtain all candidate swaps from the front layer. A coupling-map edge is a candidate
        // swap if it involves at least one active qubit (i.e. it must affect the "basic"
        // heuristic), and if it involves two active qubits, we choose the `swap[0] < swap[1]` form
        // to make a canonical choice.
        self.swap_scores.clear();
        for &phys in self.front_layer.iter_active() {
            for &neighbor in problem.target.neighbors[phys].iter() {
                if neighbor > phys || !self.front_layer.is_active(neighbor) {
                    self.swap_scores.push(([phys, neighbor], 0.0));
                }
            }
        }
        let dist = &problem.target.distance.view();
        let mut absolute_score = 0.0;
        if let Some(BasicHeuristic { weight, scale }) = problem.heuristic.basic {
            let weight = match scale {
                SetScaling::Constant => weight,
                SetScaling::Size => {
                    if self.front_layer.is_empty() {
                        0.0
                    } else {
                        weight / (self.front_layer.len() as f64)
                    }
                }
            };
            absolute_score += weight * self.front_layer.total_score(dist);
            for (swap, score) in self.swap_scores.iter_mut() {
                *score += weight * self.front_layer.score(*swap, dist);
            }
        }
        if let Some(LookaheadHeuristic { weight, scale, .. }) = problem.heuristic.lookahead {
            let weight = match scale {
                SetScaling::Constant => weight,
                SetScaling::Size => {
                    if self.extended_set.is_empty() {
                        0.0
                    } else {
                        weight / (self.extended_set.len() as f64)
                    }
                }
            };
            absolute_score += weight * self.extended_set.total_score(dist);
            for (swap, score) in self.swap_scores.iter_mut() {
                *score += weight * self.extended_set.score(*swap, dist);
            }
        }
        if let Some(DecayHeuristic { .. }) = problem.heuristic.decay {
            // The decay heuristic multiplies the whole score (base plus swap delta) by the worse
            // of the two qubits' decay factors.
            for (swap, score) in self.swap_scores.iter_mut() {
                *score = (absolute_score + *score) * self.decay[swap[0]].max(self.decay[swap[1]]);
            }
        }
        // Gather every swap within `best_epsilon` of the minimum score, then pick one at random.
        let mut min_score = f64::INFINITY;
        let epsilon = problem.heuristic.best_epsilon;
        for &(swap, score) in self.swap_scores.iter() {
            if score - min_score < -epsilon {
                min_score = score;
                self.best_swaps.clear();
                self.best_swaps.push(swap);
            } else if (score - min_score).abs() <= epsilon {
                self.best_swaps.push(swap);
            }
        }
        *self.best_swaps.choose(&mut self.rng).unwrap()
    }
}
/// Run Sabre swap on a circuit
///
/// Returns:
///     A two-tuple of the newly routed :class:`.DAGCircuit`, and the layout that maps virtual
///     qubits to their assigned physical qubits at the *end* of the circuit execution.
#[pyfunction]
#[pyo3(signature=(dag, target, heuristic, initial_layout, num_trials, seed=None, run_in_parallel=None))]
pub fn sabre_routing(
    dag: &DAGCircuit,
    target: &PyRoutingTarget,
    heuristic: &Heuristic,
    initial_layout: &NLayout,
    num_trials: usize,
    seed: Option<u64>,
    run_in_parallel: Option<bool>,
) -> PyResult<(DAGCircuit, NLayout)> {
    let Some(target) = target.0.as_ref() else {
        // All-to-all coupling: nothing to route, so hand back the input unchanged.
        return Ok((dag.clone(), initial_layout.clone()));
    };
    let sabre = SabreDAG::from_dag(dag)?;
    let result = swap_map(
        RoutingProblem {
            target,
            sabre: &sabre,
            dag,
            heuristic,
        },
        initial_layout,
        seed,
        num_trials,
        run_in_parallel,
    );
    result.rebuild().map(|dag| (dag, result.final_layout))
}
/// Run (potentially in parallel) several trials of the Sabre routing algorithm on the given
/// problem and return the one with fewest swaps.
pub fn swap_map<'a>(
problem: RoutingProblem<'a>,
initial_layout: &'_ NLayout,
seed: Option<u64>,
num_trials: usize,
run_in_parallel: Option<bool>,
) -> RoutingResult<'a> {
let seeds = match seed {
Some(seed) => Pcg64Mcg::seed_from_u64(seed),
None => Pcg64Mcg::from_os_rng(),
}
.sample_iter(&rand::distr::StandardUniform)
.take(num_trials)
.collect::<Vec<_>>();
CondIterator::new(
seeds,
num_trials > 1
&& run_in_parallel.unwrap_or_else(|| getenv_use_multiple_threads() && num_trials > 1),
)
.map(|seed| swap_map_trial(problem, initial_layout, seed))
.enumerate()
.min_by_key(|(index, result)| (result.order.swap_count(), *index))
.map(|(_, result)| result)
.expect("must have at least one trial")
}
/// Run a single trial of the Sabre routing algorithm.
pub fn swap_map_trial<'a>(
    problem: RoutingProblem<'a>,
    initial_layout: &NLayout,
    seed: u64,
) -> RoutingResult<'a> {
    let mut order = Order::for_problem(problem);
    let num_qubits: u32 = problem.target.num_qubits().try_into().unwrap();
    // Count the incoming edges of each Sabre node; a node becomes eligible for routing once its
    // counter reaches zero.
    let mut required_predecessors = VecMap::from(vec![0; problem.sabre.dag.node_count()]);
    for edge in problem.sabre.dag.edge_references() {
        required_predecessors[edge.target()] += 1;
    }
    let mut state = State {
        front_layer: FrontLayer::new(num_qubits),
        extended_set: ExtendedSet::new(num_qubits),
        decay: vec![1.; num_qubits as usize].into(),
        required_predecessors,
        layout: initial_layout.clone(),
        swap_scores: Vec::with_capacity(problem.target.neighbors.edge_count() / 2),
        best_swaps: Vec::new(),
        rng: Pcg64Mcg::seed_from_u64(seed),
        seed,
    };
    // Route everything that is already routable under the initial layout, with no swaps.
    state.update_route(problem, &mut order, &problem.sabre.first_layer, None);
    state.populate_extended_set(problem);
    // Main logic loop; the front layer only becomes empty when all nodes have been routed. At
    // each iteration of this loop, we route either one or two gates.
    let mut routable_nodes = Vec::<NodeIndex>::with_capacity(2);
    let mut num_search_steps = 0;
    while !state.front_layer.is_empty() {
        let mut current_swaps: Vec<[PhysicalQubit; 2]> = Vec::new();
        // Swap-mapping loop. This is the main part of the algorithm, which we repeat until we
        // either successfully route a node, or exceed the maximum number of attempts.
        while routable_nodes.is_empty() && current_swaps.len() <= problem.heuristic.attempt_limit {
            let best_swap = state.choose_best_swap(problem);
            state.apply_swap(best_swap);
            current_swaps.push(best_swap);
            // A swap can make a gate routable via either of its qubits; check both.
            if let Some(node) = state.routable_node_on_qubit(problem, best_swap[1]) {
                routable_nodes.push(node);
            }
            if let Some(node) = state.routable_node_on_qubit(problem, best_swap[0]) {
                routable_nodes.push(node);
            }
            if let Some(DecayHeuristic { increment, reset }) = problem.heuristic.decay {
                num_search_steps += 1;
                if num_search_steps >= reset {
                    state.decay.fill(1.);
                    num_search_steps = 0;
                } else {
                    state.decay[best_swap[0]] += increment;
                    state.decay[best_swap[1]] += increment;
                }
            }
        }
        if routable_nodes.is_empty() {
            // If we exceeded the max number of heuristic-chosen swaps without making progress,
            // unwind to the last progress point and greedily swap to bring a node together.
            // Efficiency doesn't matter much; this path never gets taken unless we're unlucky.
            // Re-applying the swaps in reverse undoes them, since a swap is its own inverse.
            current_swaps
                .drain(..)
                .rev()
                .for_each(|swap| state.apply_swap(swap));
            let force_routed = state.force_enable_closest_node(problem, &mut current_swaps);
            routable_nodes.extend(force_routed);
        }
        for node in &routable_nodes {
            state.front_layer.remove(node);
        }
        state.update_route(problem, &mut order, &routable_nodes, Some(current_swaps));
        // Ideally we'd know how to mutate the extended set directly, but since its size is
        // limited, emptying it and rebuilding it from scratch is cheap enough.
        state.extended_set.clear();
        state.populate_extended_set(problem);
        if problem.heuristic.decay.is_some() {
            state.decay.fill(1.);
        }
        routable_nodes.clear();
    }
    RoutingResult {
        problem,
        order,
        initial_layout: initial_layout.clone(),
        final_layout: state.layout,
    }
}