Skip to content

Commit ed698e7

Browse files
authored
Merge pull request #609 from OpenXiangShan/add-kmhv3
Add kmhv3
2 parents 06691bb + 9750a5a commit ed698e7

File tree

11 files changed

+203
-39
lines changed

11 files changed

+203
-39
lines changed
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
name: gem5 Ideal BTB Performance Test
2+
3+
on:
4+
push:
5+
branches: [ xs-dev, '*-align' ] # xs-dev for normal CI, *-align for BTB-only performance testing (align to RTL)
6+
pull_request:
7+
branches: [ xs-dev ]
8+
workflow_dispatch:
9+
inputs:
10+
branch_name:
11+
description: 'Branch to test (leave empty for current branch)'
12+
required: false
13+
type: string
14+
15+
jobs:
16+
perf_test:
17+
uses: ./.github/workflows/gem5-perf-template.yml
18+
with:
19+
script_path: ../kmh_v3_btb.sh
20+
benchmark_type: "spec06-0.3c"

.github/workflows/gem5-ideal-btb-perf-weekly.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,19 +11,19 @@ jobs:
1111
perf_test_spec06:
1212
uses: ./.github/workflows/gem5-perf-template.yml
1313
with:
14-
script_path: ../kmh_v3_btb.sh
14+
script_path: ../kmh_v3_ideal.sh
1515
benchmark_type: "spec06-1.0c"
1616

1717
perf_test_spec17:
1818
uses: ./.github/workflows/gem5-perf-template.yml
1919
with:
20-
script_path: ../kmh_v3_btb.sh
20+
script_path: ../kmh_v3_ideal.sh
2121
benchmark_type: "spec17-1.0c"
2222

2323
perf_test_spec06_vector:
2424
uses: ./.github/workflows/gem5-perf-template.yml
2525
with:
26-
script_path: ../kmh_v3_btb.sh
26+
script_path: ../kmh_v3_ideal.sh
2727
benchmark_type: "spec06-rvv-1.0c"
2828
vector_type: "simple"
2929
check_result: false

.github/workflows/gem5-ideal-btb-perf.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,5 +18,5 @@ jobs:
1818
perf_test:
1919
uses: ./.github/workflows/gem5-perf-template.yml
2020
with:
21-
script_path: ../kmh_v3_btb.sh
21+
script_path: ../kmh_v3_ideal.sh
2222
benchmark_type: "spec06-0.8c"

.github/workflows/gem5-ideal-rvv-simple-perf.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ jobs:
77
perf_test:
88
uses: ./.github/workflows/gem5-perf-template.yml
99
with:
10-
script_path: ../kmh_v3_btb.sh
10+
script_path: ../kmh_v3_ideal.sh
1111
benchmark_type: "spec06int-rvv-0.8c"
1212
vector_type: "simple"
1313
check_result: false # Warning: rvv test will not show the difftest failure

.github/workflows/gem5-perf-template.yml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ on:
99
benchmark_type:
1010
required: true
1111
type: string
12-
description: "Benchmark type: spec06-0.8c, spec06-1.0c, spec17-1.0c, spec06-rvv-1.0c or spec06int-rvv-0.8c"
12+
description: "Benchmark type: spec06-0.3c, spec06-0.8c, spec06-1.0c, spec17-1.0c, spec06-rvv-1.0c or spec06int-rvv-0.8c"
1313
vector_type:
1414
required: false
1515
type: string
@@ -37,6 +37,14 @@ jobs:
3737
id: config
3838
run: |
3939
case "${{ inputs.benchmark_type }}" in
40+
"spec06-0.3c")
41+
echo "checkpoint_list=/nfs/home/share/gem5_ci/spec06_cpts/spec06_0.3c_int.lst" >> $GITHUB_OUTPUT
42+
echo "checkpoint_root_node=/nfs/home/share/jiaxiaoyu/simpoint_checkpoint_zstd_format/spec06_rv64gcb_O3_20m_gcc12.2.0-intFpcOff-jeMalloc" >> $GITHUB_OUTPUT
43+
echo "score_script=gem5-score-ci.sh" >> $GITHUB_OUTPUT
44+
echo "cluster_config=/nfs/home/share/gem5_ci/spec06_cpts/cluster-0-0.json" >> $GITHUB_OUTPUT
45+
echo "artifact_name=performance-score-spec06-0.3c" >> $GITHUB_OUTPUT
46+
echo "comment=run 30% coverage spec06 checkpoints, 148 checkpoints" >> $GITHUB_OUTPUT
47+
;;
4048
"spec06-0.8c")
4149
echo "checkpoint_list=/nfs/home/share/gem5_ci/spec06_cpts/spec_0.8c_int.lst" >> $GITHUB_OUTPUT
4250
echo "checkpoint_root_node=/nfs/home/share/jiaxiaoyu/simpoint_checkpoint_zstd_format/spec06_rv64gcb_O3_20m_gcc12.2.0-intFpcOff-jeMalloc" >> $GITHUB_OUTPUT

.github/workflows/on-demand-spec.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ jobs:
8383
needs: trigger
8484
uses: ./.github/workflows/gem5-perf-template.yml
8585
with:
86-
script_path: ../kmh_v3_btb.sh
86+
script_path: ../kmh_v3_ideal.sh
8787
benchmark_type: ${{ needs.trigger.outputs.benchmark_type }}
8888
vector_type: ${{ needs.trigger.outputs.vector_type }}
8989
pr_ref: ${{ needs.trigger.outputs.pr_ref }}

.github/workflows/pr-quick-check.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ jobs:
3535
export GCBV_REF_SO="/nfs/home/share/gem5_ci/ref/normal/riscv64-nemu-notama-tvalref-so"
3636
3737
exit_code=0
38-
./build/RISCV/gem5.opt ./configs/example/idealkmhv3.py --raw-cpt --generic-rv-cpt=/nfs/home/share/gem5_ci/checkpoints/coremark-riscv64-xs.bin || exit_code=$?
38+
./build/RISCV/gem5.opt ./configs/example/kmhv3.py --raw-cpt --generic-rv-cpt=/nfs/home/share/gem5_ci/checkpoints/coremark-riscv64-xs.bin || exit_code=$?
3939
4040
# Verify that difftest ran successfully
4141
if [ "${exit_code}" -ne 0 ]; then
@@ -67,7 +67,7 @@ jobs:
6767
6868
# Run the test
6969
exit_code=0
70-
./build/RISCV/gem5.opt ./configs/example/idealkmhv3.py --raw-cpt --generic-rv-cpt=/nfs/home/share/gem5_ci/checkpoints/coremark-riscv64-xs.bin || exit_code=$?
70+
./build/RISCV/gem5.opt ./configs/example/kmhv3.py --raw-cpt --generic-rv-cpt=/nfs/home/share/gem5_ci/checkpoints/coremark-riscv64-xs.bin || exit_code=$?
7171
7272
if [ "${exit_code}" -ne 0 ]; then
7373
echo "❌ Base branch difftest failed with exit code ${exit_code}!"

configs/example/idealkmhv3.py

Lines changed: 2 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -75,31 +75,10 @@ def setKmhV3IdealParams(args, system):
7575
cpu.SbufferEvictThreshold = 16
7676

7777
# branch predictor
78-
if args.bp_type == 'DecoupledBPUWithFTB' or args.bp_type == 'DecoupledBPUWithBTB':
79-
if args.bp_type == 'DecoupledBPUWithFTB':
80-
cpu.branchPred.enableTwoTaken = False
81-
cpu.branchPred.numBr = 8 # numBr must be a power of 2, see getShuffledBrIndex()
82-
cpu.branchPred.predictWidth = 64
83-
cpu.branchPred.uftb.numEntries = 1024
84-
cpu.branchPred.ftb.numEntries = 16384
85-
cpu.branchPred.tage.baseTableSize = 16384
86-
cpu.branchPred.tage.tableSizes = [2048] * 8
87-
else:
88-
cpu.branchPred.predictWidth = 64 # max width of a fetch block
89-
cpu.branchPred.mbtb.numEntries = 8192
90-
# TODO: BTB TAGE does not have a base table and does not support SC
91-
cpu.branchPred.tage.tableSizes = [2048] * 8 # 2 way, 2048 sets
92-
cpu.branchPred.tage.numWays = 2
93-
cpu.branchPred.microtage.tableSizes = [512] # 2 way, 512 sets
94-
cpu.branchPred.microtage.numWays = 2
95-
cpu.branchPred.mgsc.enableMGSC = not args.disable_mgsc
96-
cpu.branchPred.tage.enableSC = False # TODO(bug): When numBr changes, enabling SC will trigger an assert
78+
if args.bp_type == 'DecoupledBPUWithBTB':
79+
cpu.branchPred.mgsc.enableMGSC = not args.disable_mgsc
9780
cpu.branchPred.ftq_size = 256
9881
cpu.branchPred.fsq_size = 256
99-
cpu.branchPred.tage.numPredictors = 8
100-
cpu.branchPred.tage.TTagBitSizes = [11] * 8
101-
cpu.branchPred.tage.TTagPcShifts = [1] * 8
102-
cpu.branchPred.tage.histLengths = [4, 9, 17, 29, 56, 109, 211, 397]
10382

10483
# l1 cache per core
10584
if args.caches:

configs/example/kmhv3.py

Lines changed: 157 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,157 @@
1+
import argparse
2+
import sys
3+
4+
import m5
5+
from m5.defines import buildEnv
6+
from m5.objects import *
7+
from m5.util import addToPath, fatal, warn
8+
from m5.util.fdthelper import *
9+
10+
addToPath('../')
11+
12+
from ruby import Ruby
13+
14+
from common.FSConfig import *
15+
from common.SysPaths import *
16+
from common.Benchmarks import *
17+
from common import Simulation
18+
from common.Caches import *
19+
from common.xiangshan import *
20+
21+
22+
def setKmhV3Params(args, system):
23+
for cpu in system.cpu:
24+
25+
# fetch (idealfetch not care)
26+
cpu.mmu.itb.size = 96
27+
cpu.fetchWidth = 32
28+
cpu.iewToFetchDelay = 2 # for resolved update, should train branch after squash
29+
cpu.commitToFetchDelay = 2
30+
cpu.fetchQueueSize = 64
31+
cpu.fetchToDecodeDelay = 2
32+
33+
# decode
34+
cpu.decodeWidth = 8
35+
cpu.enable_loadFusion = False
36+
cpu.enableConstantFolding = False
37+
38+
# rename
39+
cpu.renameWidth = 8
40+
cpu.numPhysIntRegs = 224
41+
cpu.numPhysFloatRegs = 256
42+
cpu.enable_storeSet_train = False
43+
44+
# dispatch
45+
cpu.enableDispatchStage = False
46+
cpu.numDQEntries = [8, 8, 8]
47+
cpu.dispWidth = [8, 8, 8]
48+
49+
# scheduler
50+
cpu.scheduler = KMHV3Scheduler()
51+
cpu.scheduler.disableAllRegArb()
52+
cpu.scheduler.enableMainRdpOpt = False
53+
cpu.scheduler.intRegfileBanks = 1
54+
# intiq0
55+
cpu.scheduler.IQs[0].oports[0].rp = [IntRD(0, 0), IntRD(1, 0)]
56+
cpu.scheduler.IQs[0].oports[1].rp = [IntRD(0, 1), IntRD(1, 1)]
57+
58+
# intiq1
59+
cpu.scheduler.IQs[1].oports[0].rp = [IntRD(2, 0), IntRD(3, 0)]
60+
cpu.scheduler.IQs[1].oports[1].rp = [IntRD(2, 1), IntRD(3, 1)]
61+
62+
# intiq2
63+
cpu.scheduler.IQs[2].oports[0].rp = [IntRD(4, 0), IntRD(5, 0)]
64+
cpu.scheduler.IQs[2].oports[1].rp = [IntRD(4, 1), IntRD(5, 1)]
65+
66+
# rob
67+
cpu.commitWidth = 8
68+
cpu.squashWidth = 8
69+
cpu.RobCompressPolicy = 'none'
70+
cpu.numROBEntries = 352
71+
cpu.CROB_instPerGroup = 2 # 1 if not using ROB compression
72+
73+
# lsu
74+
cpu.StoreWbStage = 4
75+
cpu.EnableLdMissReplay = True
76+
cpu.EnablePipeNukeCheck = True
77+
cpu.BankConflictCheck = True
78+
cpu.sbufferBankWriteAccurately = False
79+
80+
# lsq
81+
cpu.LQEntries = 120
82+
cpu.SQEntries = 64
83+
cpu.RARQEntries = 96
84+
cpu.RAWQEntries = 56
85+
cpu.LoadCompletionWidth = 8
86+
cpu.StoreCompletionWidth = 4
87+
cpu.RARDequeuePerCycle = 4
88+
cpu.RAWDequeuePerCycle = 4
89+
cpu.SbufferEntries = 16
90+
cpu.SbufferEvictThreshold = 7
91+
cpu.store_prefetch_train = False
92+
93+
# branch predictor
94+
if args.bp_type == 'DecoupledBPUWithBTB':
95+
cpu.branchPred.mgsc.enableMGSC = not args.disable_mgsc
96+
cpu.branchPred.ftq_size = 256
97+
cpu.branchPred.fsq_size = 256
98+
99+
# l1 cache per core
100+
if args.caches:
101+
cpu.icache.size = '64kB'
102+
cpu.dcache.size = '64kB'
103+
cpu.dcache.tag_load_read_ports = 3
104+
cpu.dcache.mshrs = 16
105+
106+
# l2 caches
107+
if args.l2cache:
108+
for i in range(args.num_cpus):
109+
if args.classic_l2:
110+
system.l2_caches[i].slice_num = 4
111+
system.l2_caches[i].wpu = NULL
112+
else:
113+
l2_wrapper = system.l2_wrappers[i]
114+
l2_wrapper.data_sram_banks = 1
115+
l2_wrapper.dir_sram_banks = 1
116+
l2_wrapper.pipe_dir_write_stage = 3
117+
l2_wrapper.dir_read_bypass = False
118+
for j in range(args.l2_slices):
119+
l2_wrapper.slices[j].inner_cache.wpu = NULL
120+
system.tol2bus_list[i].forward_latency = 3 # 3->0
121+
system.tol2bus_list[i].response_latency = 3 # 3->0
122+
system.tol2bus_list[i].hint_wakeup_ahead_cycles = 2 # 2->0
123+
124+
# Enable dual-port for DCache → L2 communication
125+
# ReqLayer[0]: ICache+DCache+ITB+DTB → L2, allow 2 requests per cycle
126+
# RespLayer[1]: L2 → DCache, allow 2 responses per cycle
127+
# system.tol2bus_list[i].layer_bandwidth_configs = [
128+
# LayerBandwidthConfig(direction="req", port_index=0, max_per_cycle=2),
129+
# LayerBandwidthConfig(direction="resp", port_index=1, max_per_cycle=2),
130+
# ]
131+
132+
# l3 cache
133+
if args.l3cache:
134+
system.l3.mshrs = 64
135+
136+
if __name__ == '__m5_main__':
137+
FutureClass = None
138+
139+
args = xiangshan_system_init()
140+
141+
assert not args.external_memory_system
142+
143+
# KMHv3 always uses the BTB-based decoupled branch predictor and a 2MB L2;
144+
# these assignments unconditionally overwrite the values returned by xiangshan_system_init().
145+
args.bp_type = 'DecoupledBPUWithBTB'
146+
args.l2_size = '2MB'
147+
148+
# Match the memories with the CPUs, based on the options for the test system
149+
TestMemClass = Simulation.setMemClass(args)
150+
151+
test_sys = build_xiangshan_system(args)
152+
# Apply the KMHv3 parameters here, after the system is built, so they take priority over command-line arguments
153+
setKmhV3Params(args, test_sys)
154+
155+
root = Root(full_system=True, system=test_sys)
156+
157+
Simulation.run_vanilla(args, root, test_sys, FutureClass)

src/cpu/pred/BranchPredictor.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -976,7 +976,7 @@ class MBTB(TimedBaseBTBPredictor):
976976
cxx_class = 'gem5::branch_prediction::btb_pred::MBTB'
977977
cxx_header = 'cpu/pred/btb/mbtb.hh'
978978

979-
numEntries = Param.Unsigned(2048, "Number of entries in the MBTB")
979+
numEntries = Param.Unsigned(8192, "Number of entries in the MBTB")
980980
tagBits = Param.Unsigned(20, "Number of bits in the tag")
981981
instShiftAmt = Param.Unsigned(1, "Amount to shift PC to get inst bits")
982982
numThreads = Param.Unsigned(1, "Number of threads")
@@ -1042,13 +1042,13 @@ class BTBTAGE(TimedBaseBTBPredictor):
10421042
needMoreHistories = Param.Bool(True, "BTBTAGE needs more histories")
10431043
enableSC = Param.Bool(False, "Enable SC or not") # TODO: BTBTAGE doesn't support SC
10441044
updateOnRead = Param.Bool(True, "Enable update on read, no need to save tage meta in FTQ")
1045-
numPredictors = Param.Unsigned(4, "Number of TAGE predictors")
1046-
tableSizes = VectorParam.Unsigned([2048]*4, "the TAGE T0~Tn length")
1047-
TTagBitSizes = VectorParam.Unsigned([8]*4, "the T0~Tn entry's tag bit size")
1048-
TTagPcShifts = VectorParam.Unsigned([1] * 4, "when the T0~Tn entry's tag generating, PC right shift")
1045+
numPredictors = Param.Unsigned(8, "Number of TAGE predictors")
1046+
tableSizes = VectorParam.Unsigned([2048]*8, "the TAGE T0~Tn length")
1047+
TTagBitSizes = VectorParam.Unsigned([11]*8, "the T0~Tn entry's tag bit size")
1048+
TTagPcShifts = VectorParam.Unsigned([1] * 8, "when the T0~Tn entry's tag generating, PC right shift")
10491049
blockSize = 32 # tage index function uses 32B aligned block address
10501050

1051-
histLengths = VectorParam.Unsigned([8, 13, 32, 119], "the BTB TAGE T0~Tn history length")
1051+
histLengths = VectorParam.Unsigned([4, 9, 17, 29, 56, 109, 211, 397], "the BTB TAGE T0~Tn history length")
10521052
maxHistLen = Param.Unsigned(970, "The length of history passed from DBP")
10531053
numTablesToAlloc = Param.Unsigned(1,"The number of table to allocated each time")
10541054
numWays = Param.Unsigned(2, "Number of ways per set")

0 commit comments

Comments
 (0)