Skip to content

Commit 09d6f0e

Browse files
committed
Add Tomasulo correctness test and IPC performance benchmark apps
Two new sw/apps for validating and measuring Tomasulo out-of-order execution: tomasulo_test (10 tests covering RAW/WAR/WAW hazards, OOO execution, latency bypass, RS saturation, memory disambiguation, complex dependency chains, branches, and CDB contention) and tomasulo_perf (7 IPC benchmarks comparing dependent vs independent instruction chains using hardware cycle/instret counters). Both use C with inline asm volatile blocks to emit exact hazard-creating instruction sequences, and integrate into the existing cocotb test_real_program framework via TEST_REGISTRY entries.
1 parent 4c8b6b8 commit 09d6f0e

File tree

5 files changed

+859
-0
lines changed

5 files changed

+859
-0
lines changed

sw/apps/tomasulo_perf/Makefile

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Copyright 2026 Two Sigma Open Source, LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Makefile for Tomasulo Performance Measurement
16+
# Measures IPC across workloads to quantify OOO execution benefit
17+
# C sources for this app: uart.c from the ../../lib tree plus the benchmark source itself.
SRC_C := ../../lib/src/uart.c tomasulo_perf.c
18+
# Shared build fragment; presumably supplies the toolchain settings and rules that consume SRC_C - TODO confirm.
include ../../common/common.mk
sw/apps/tomasulo_perf/tomasulo_perf.c

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
/*
2+
* Copyright 2026 Two Sigma Open Source, LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
/**
18+
* Tomasulo Performance Measurement
19+
*
20+
* Measures Instructions Per Cycle (IPC) across different workloads to
21+
* quantify the benefit of out-of-order execution via Tomasulo's algorithm.
22+
*
23+
* Key comparison: dependent vs independent instruction chains.
24+
* - Dependent chains serialize on data hazards (IPC limited to ~1.0)
25+
* - Independent chains can exploit ILP (IPC scales with issue width)
26+
* - The ratio between them shows the OOO execution benefit
27+
*
28+
* Uses hardware cycle and instret counters (Zicntr CSRs) for measurement.
29+
* IPC is reported as IPC*100 (integer, so 150 means IPC = 1.50).
30+
*
31+
* Benchmarks:
32+
* 1. Dependent ADD chain (worst-case ILP: serialized)
33+
* 2. Independent ADD chains (best-case ILP: fully parallel)
34+
* 3. Dependent MUL chain (long-latency serialized)
35+
* 4. Independent MUL chains (long-latency parallel)
36+
* 5. Mixed MUL + ADD (latency hiding)
37+
* 6. Load-store throughput (memory subsystem)
38+
* 7. Branch-heavy loop (branch prediction + OOO)
39+
*/
40+
41+
#include "csr.h"
42+
#include "uart.h"
43+
#include <stdint.h>
44+
45+
/**
 * Print one benchmark's counter deltas together with the derived IPC.
 *
 * IPC is emitted as a fixed-point integer scaled by 100 (150 means 1.50).
 * When cycles is zero the IPC is printed as 0 rather than dividing by zero.
 * The scaling multiply is carried out in 32 bits, so the reported IPC wraps
 * once instrs exceeds UINT32_MAX / 100.
 *
 * @param cycles  elapsed cycle count for the measured region
 * @param instrs  retired instruction count for the measured region
 */
static void print_result(uint32_t cycles, uint32_t instrs)
{
    uint32_t scaled_ipc = 0;

    if (cycles != 0) {
        scaled_ipc = (instrs * 100) / cycles;
    }

    uart_printf(" Cycles: %lu Instrs: %lu IPC*100: %lu\n",
                (unsigned long) cycles,
                (unsigned long) instrs,
                (unsigned long) scaled_ipc);
}
53+
54+
/**
 * Benchmark driver: runs seven inline-assembly micro-benchmarks in order and
 * prints one result line for each through print_result().
 *
 * Every benchmark follows the same pattern: read rdcycle() then rdinstret(),
 * execute one asm volatile block, read both counters again in the same order,
 * and report the differences. The register-setup addi instructions sit inside
 * the measured asm block, so the reported instruction count is slightly above
 * the nominal count named in each banner.
 *
 * Always returns 0.
 */
int main(void)
55+
{
56+
/* Start (0) and end (1) snapshots: c* come from rdcycle(), i* from rdinstret(). */
uint32_t c0, c1, i0, i1;
57+
58+
uart_printf("\n");
59+
uart_printf("============================================================\n");
60+
uart_printf(" TOMASULO PERFORMANCE MEASUREMENT\n");
61+
uart_printf("============================================================\n");
62+
uart_printf(" IPC*100: 100 = 1.0 IPC, 150 = 1.5 IPC, etc.\n\n");
63+
64+
/* ===================================================================== */
65+
/* Benchmark 1: Dependent ADD chain (100 instructions) */
66+
/* Each ADD reads the result of the previous one - no ILP possible. */
67+
/* Baseline for comparison: OOO cannot help here. */
68+
/* ===================================================================== */
69+
uart_printf("Bench 1: Dependent ADD chain (100 instrs)\n");
70+
c0 = rdcycle();
71+
i0 = rdinstret();
72+
/* .rept/.endr make the assembler emit the enclosed add 100 times. */
/* t0 is listed as a clobber because the asm overwrites it. */
__asm__ volatile("addi t0, zero, 1\n"
73+
".rept 100\n"
74+
"add t0, t0, t0\n"
75+
".endr\n"
76+
:
77+
:
78+
: "t0");
79+
c1 = rdcycle();
80+
i1 = rdinstret();
81+
print_result(c1 - c0, i1 - i0);
82+
83+
/* ===================================================================== */
84+
/* Benchmark 2: Independent ADD chains (4 x 25 = 100 instructions) */
85+
/* 4 chains with no cross-dependencies - ideal for OOO execution. */
86+
/* IPC should be higher than Bench 1 if OOO is working. */
87+
/* ===================================================================== */
88+
uart_printf("Bench 2: Independent ADD chains (4x25 = 100 instrs)\n");
89+
c0 = rdcycle();
90+
i0 = rdinstret();
91+
/* Each add reads and writes only its own register, so the four chains */
/* share no operands. */
__asm__ volatile("addi t0, zero, 1\n"
92+
"addi t1, zero, 2\n"
93+
"addi t2, zero, 3\n"
94+
"addi t3, zero, 4\n"
95+
".rept 25\n"
96+
"add t0, t0, t0\n"
97+
"add t1, t1, t1\n"
98+
"add t2, t2, t2\n"
99+
"add t3, t3, t3\n"
100+
".endr\n"
101+
:
102+
:
103+
: "t0", "t1", "t2", "t3");
104+
c1 = rdcycle();
105+
i1 = rdinstret();
106+
print_result(c1 - c0, i1 - i0);
107+
108+
/* ===================================================================== */
109+
/* Benchmark 3: Dependent MUL chain (50 instructions) */
110+
/* MUL has multi-cycle latency, so a dependent chain is very slow. */
111+
/* Multiply by 1 to keep the value stable (avoids overflow). */
112+
/* ===================================================================== */
113+
uart_printf("Bench 3: Dependent MUL chain (50 instrs)\n");
114+
c0 = rdcycle();
115+
i0 = rdinstret();
116+
/* t1 holds the constant 1, so each mul leaves t0 at 3 while still */
/* consuming the previous mul's result. */
__asm__ volatile("addi t0, zero, 3\n"
117+
"addi t1, zero, 1\n"
118+
".rept 50\n"
119+
"mul t0, t0, t1\n"
120+
".endr\n"
121+
:
122+
:
123+
: "t0", "t1");
124+
c1 = rdcycle();
125+
i1 = rdinstret();
126+
print_result(c1 - c0, i1 - i0);
127+
128+
/* ===================================================================== */
129+
/* Benchmark 4: Independent MUL chains (4 x 12 = 48 instructions) */
130+
/* 4 independent MUL chains. If the MUL unit is pipelined or there are */
131+
/* multiple MUL reservation stations, these can overlap. */
132+
/* ===================================================================== */
133+
uart_printf("Bench 4: Independent MUL chains (4x12 = 48 instrs)\n");
134+
c0 = rdcycle();
135+
i0 = rdinstret();
136+
/* t4 is the shared multiplier (1); it is only read inside the repeated */
/* body, so it does not link the four chains together. */
__asm__ volatile("addi t0, zero, 2\n"
137+
"addi t1, zero, 3\n"
138+
"addi t2, zero, 5\n"
139+
"addi t3, zero, 7\n"
140+
"addi t4, zero, 1\n"
141+
".rept 12\n"
142+
"mul t0, t0, t4\n"
143+
"mul t1, t1, t4\n"
144+
"mul t2, t2, t4\n"
145+
"mul t3, t3, t4\n"
146+
".endr\n"
147+
:
148+
:
149+
: "t0", "t1", "t2", "t3", "t4");
150+
c1 = rdcycle();
151+
i1 = rdinstret();
152+
print_result(c1 - c0, i1 - i0);
153+
154+
/* ===================================================================== */
155+
/* Benchmark 5: Mixed MUL + independent ADD (50 pairs = 100 instructions) */
156+
/* Tests whether short-latency ADDs can execute while MUL is in flight. */
157+
/* An OOO machine should overlap the ADD with the MUL stall. */
158+
/* ===================================================================== */
159+
uart_printf("Bench 5: Mixed MUL+ADD (50 pairs = 100 instrs)\n");
160+
c0 = rdcycle();
161+
i0 = rdinstret();
162+
__asm__ volatile("addi t0, zero, 1\n"
163+
"addi t1, zero, 1\n"
164+
"addi t2, zero, 0\n"
165+
"addi t3, zero, 1\n"
166+
".rept 50\n"
167+
"mul t0, t0, t1\n" /* Long latency (dependent chain) */
168+
"add t2, t2, t3\n" /* Short latency (independent of MUL) */
169+
".endr\n"
170+
:
171+
:
172+
: "t0", "t1", "t2", "t3");
173+
c1 = rdcycle();
174+
i1 = rdinstret();
175+
print_result(c1 - c0, i1 - i0);
176+
177+
/* ===================================================================== */
178+
/* Benchmark 6: Load-store throughput (50 store-load pairs) */
179+
/* Alternating store and load to the same address. */
180+
/* Tests store-load forwarding and memory subsystem throughput. */
181+
/* ===================================================================== */
182+
uart_printf("Bench 6: Load-store pairs (50 pairs = 100 instrs)\n");
183+
{
184+
/* Scratch buffer; only word 0 is touched (offset 0 in both sw and lw). */
volatile uint32_t mem_area[4];
185+
c0 = rdcycle();
186+
i0 = rdinstret();
187+
/* Each lw reloads t0 from the word the preceding sw just wrote, and */
/* the next sw stores that reloaded t0. The "memory" clobber tells the */
/* compiler that the asm writes memory. */
__asm__ volatile("addi t0, zero, 1\n"
188+
".rept 50\n"
189+
"sw t0, 0(%[addr])\n"
190+
"lw t0, 0(%[addr])\n"
191+
".endr\n"
192+
:
193+
: [addr] "r"(mem_area)
194+
: "t0", "memory");
195+
c1 = rdcycle();
196+
i1 = rdinstret();
197+
print_result(c1 - c0, i1 - i0);
198+
}
199+
200+
/* ===================================================================== */
201+
/* Benchmark 7: Branch-heavy loop (200 iterations, 3 instrs/iter) */
202+
/* Tests branch prediction integration with OOO pipeline. */
203+
/* Good prediction allows the loop body to overlap across iterations. */
204+
/* ===================================================================== */
205+
uart_printf("Bench 7: Branch loop (200 iters, 3 instrs/iter)\n");
206+
c0 = rdcycle();
207+
i0 = rdinstret();
208+
/* t0 counts down from 200 to 0 while t1 counts up from 0; "1b" branches */
/* back to the nearest preceding local label "1:". */
__asm__ volatile("addi t0, zero, 200\n"
209+
"addi t1, zero, 0\n"
210+
"1:\n"
211+
"addi t1, t1, 1\n"
212+
"addi t0, t0, -1\n"
213+
"bne t0, zero, 1b\n"
214+
:
215+
:
216+
: "t0", "t1");
217+
c1 = rdcycle();
218+
i1 = rdinstret();
219+
print_result(c1 - c0, i1 - i0);
220+
221+
/* ===================================================================== */
222+
/* Summary */
223+
/* ===================================================================== */
224+
uart_printf("\n============================================================\n");
225+
uart_printf(" Performance measurement complete.\n");
226+
uart_printf(" Compare Bench 1 vs 2 (ADD) and Bench 3 vs 4 (MUL)\n");
227+
uart_printf(" to see the IPC benefit of out-of-order execution.\n");
228+
uart_printf("============================================================\n\n");
229+
230+
/* NOTE(review): presumably the marker the test harness scans the UART output for - confirm. */
uart_printf("<<PASS>>\n");
231+
232+
return 0;
233+
}

sw/apps/tomasulo_test/Makefile

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# Copyright 2026 Two Sigma Open Source, LLC
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Makefile for Tomasulo Correctness Test
16+
# Tests hazard handling, OOO execution, and register renaming
17+
# C sources for this app: uart.c from the ../../lib tree plus the correctness-test source itself.
SRC_C := ../../lib/src/uart.c tomasulo_test.c
18+
# Shared build fragment; presumably supplies the toolchain settings and rules that consume SRC_C - TODO confirm.
include ../../common/common.mk

0 commit comments

Comments
 (0)