Merge pull request #609 from OpenXiangShan/add-kmhv3

jensen-yan · web-flow · commit ed698e7e5d89 · 2025-11-24T12:16:52.000+08:00
Add kmhv3
diff --git a/.github/workflows/gem5-ideal-btb-0.3c.yml b/.github/workflows/gem5-ideal-btb-0.3c.yml
@@ -0,0 +1,20 @@
+name: gem5 Ideal BTB Performance Test
+
+on:
+  push:
+    branches: [ xs-dev, '*-align' ]  # xs-dev: normal CI; *-align: BTB-only performance testing (aligned to RTL)
+  pull_request:
+    branches: [ xs-dev ]
+  workflow_dispatch:
+    inputs:
+      branch_name:  # NOTE(review): not forwarded in `with:` below; confirm the template picks it up some other way
+        description: 'Branch to test (leave empty for current branch)'
+        required: false
+        type: string
+
+jobs:
+  perf_test:
+    uses: ./.github/workflows/gem5-perf-template.yml
+    with:
+      script_path: ../kmh_v3_btb.sh  # presumably util/xs_scripts/kmh_v3_btb.sh, which launches configs/example/kmhv3.py; confirm path
+      benchmark_type: "spec06-0.3c"
diff --git a/.github/workflows/gem5-ideal-btb-perf-weekly.yml b/.github/workflows/gem5-ideal-btb-perf-weekly.yml
@@ -11,19 +11,19 @@ jobs:
   perf_test_spec06:
     uses: ./.github/workflows/gem5-perf-template.yml
     with:
-      script_path: ../kmh_v3_btb.sh
+      script_path: ../kmh_v3_ideal.sh
       benchmark_type: "spec06-1.0c"
   
   perf_test_spec17:
     uses: ./.github/workflows/gem5-perf-template.yml
     with:
-      script_path: ../kmh_v3_btb.sh
+      script_path: ../kmh_v3_ideal.sh
       benchmark_type: "spec17-1.0c" 
   
   perf_test_spec06_vector:
     uses: ./.github/workflows/gem5-perf-template.yml
     with:
-      script_path: ../kmh_v3_btb.sh
+      script_path: ../kmh_v3_ideal.sh
       benchmark_type: "spec06-rvv-1.0c"
       vector_type: "simple"
       check_result: false
diff --git a/.github/workflows/gem5-ideal-btb-perf.yml b/.github/workflows/gem5-ideal-btb-perf.yml
@@ -18,5 +18,5 @@ jobs:
   perf_test:
     uses: ./.github/workflows/gem5-perf-template.yml
     with:
-      script_path: ../kmh_v3_btb.sh
+      script_path: ../kmh_v3_ideal.sh
       benchmark_type: "spec06-0.8c"
diff --git a/.github/workflows/gem5-ideal-rvv-simple-perf.yml b/.github/workflows/gem5-ideal-rvv-simple-perf.yml
@@ -7,7 +7,7 @@ jobs:
   perf_test:
     uses: ./.github/workflows/gem5-perf-template.yml
     with:
-      script_path: ../kmh_v3_btb.sh
+      script_path: ../kmh_v3_ideal.sh
       benchmark_type: "spec06int-rvv-0.8c"
       vector_type: "simple"
       check_result: false # Warning: rvv test will not show the difftest failure
diff --git a/.github/workflows/gem5-perf-template.yml b/.github/workflows/gem5-perf-template.yml
@@ -9,7 +9,7 @@ on:
       benchmark_type:
         required: true
         type: string
-        description: "Benchmark type: spec06-0.8c, spec06-1.0c, spec17-1.0c, spec06-rvv-1.0c or spec06int-rvv-0.8c"
+        description: "Benchmark type: spec06-0.3c, spec06-0.8c, spec06-1.0c, spec17-1.0c, spec06-rvv-1.0c or spec06int-rvv-0.8c"
       vector_type:
         required: false
         type: string
@@ -37,6 +37,14 @@ jobs:
         id: config
         run: |
           case "${{ inputs.benchmark_type }}" in
+            "spec06-0.3c")
+              echo "checkpoint_list=/nfs/home/share/gem5_ci/spec06_cpts/spec06_0.3c_int.lst" >> $GITHUB_OUTPUT
+              echo "checkpoint_root_node=/nfs/home/share/jiaxiaoyu/simpoint_checkpoint_zstd_format/spec06_rv64gcb_O3_20m_gcc12.2.0-intFpcOff-jeMalloc" >> $GITHUB_OUTPUT
+              echo "score_script=gem5-score-ci.sh" >> $GITHUB_OUTPUT
+              echo "cluster_config=/nfs/home/share/gem5_ci/spec06_cpts/cluster-0-0.json" >> $GITHUB_OUTPUT
+              echo "artifact_name=performance-score-spec06-0.3c" >> $GITHUB_OUTPUT
+              echo "comment=run 30% coverage spec06 checkpoints, 148 checkpoints" >> $GITHUB_OUTPUT
+              ;;
             "spec06-0.8c")
               echo "checkpoint_list=/nfs/home/share/gem5_ci/spec06_cpts/spec_0.8c_int.lst" >> $GITHUB_OUTPUT
               echo "checkpoint_root_node=/nfs/home/share/jiaxiaoyu/simpoint_checkpoint_zstd_format/spec06_rv64gcb_O3_20m_gcc12.2.0-intFpcOff-jeMalloc" >> $GITHUB_OUTPUT
diff --git a/.github/workflows/on-demand-spec.yml b/.github/workflows/on-demand-spec.yml
@@ -83,7 +83,7 @@ jobs:
     needs: trigger
     uses: ./.github/workflows/gem5-perf-template.yml
     with:
-      script_path: ../kmh_v3_btb.sh
+      script_path: ../kmh_v3_ideal.sh
       benchmark_type: ${{ needs.trigger.outputs.benchmark_type }}
       vector_type: ${{ needs.trigger.outputs.vector_type }}
       pr_ref: ${{ needs.trigger.outputs.pr_ref }}
diff --git a/.github/workflows/pr-quick-check.yml b/.github/workflows/pr-quick-check.yml
@@ -35,7 +35,7 @@ jobs:
           export GCBV_REF_SO="/nfs/home/share/gem5_ci/ref/normal/riscv64-nemu-notama-tvalref-so"
           
           exit_code=0
-          ./build/RISCV/gem5.opt ./configs/example/idealkmhv3.py --raw-cpt --generic-rv-cpt=/nfs/home/share/gem5_ci/checkpoints/coremark-riscv64-xs.bin || exit_code=$?
+          ./build/RISCV/gem5.opt ./configs/example/kmhv3.py --raw-cpt --generic-rv-cpt=/nfs/home/share/gem5_ci/checkpoints/coremark-riscv64-xs.bin || exit_code=$?
           
           # 验证 difftest 正常运行
           if [ "${exit_code}" -ne 0 ]; then 
@@ -67,7 +67,7 @@ jobs:
           
           # 运行测试
           exit_code=0
-          ./build/RISCV/gem5.opt ./configs/example/idealkmhv3.py --raw-cpt --generic-rv-cpt=/nfs/home/share/gem5_ci/checkpoints/coremark-riscv64-xs.bin || exit_code=$?
+          ./build/RISCV/gem5.opt ./configs/example/kmhv3.py --raw-cpt --generic-rv-cpt=/nfs/home/share/gem5_ci/checkpoints/coremark-riscv64-xs.bin || exit_code=$?
           
           if [ "${exit_code}" -ne 0 ]; then 
             echo "❌ Base branch difftest failed with exit code ${exit_code}!"
diff --git a/configs/example/idealkmhv3.py b/configs/example/idealkmhv3.py
@@ -75,31 +75,10 @@ def setKmhV3IdealParams(args, system):
         cpu.SbufferEvictThreshold = 16
 
         # branch predictor
-        if args.bp_type == 'DecoupledBPUWithFTB' or args.bp_type == 'DecoupledBPUWithBTB':
-            if args.bp_type == 'DecoupledBPUWithFTB':
-                cpu.branchPred.enableTwoTaken = False
-                cpu.branchPred.numBr = 8    # numBr must be a power of 2, see getShuffledBrIndex()
-                cpu.branchPred.predictWidth = 64
-                cpu.branchPred.uftb.numEntries = 1024
-                cpu.branchPred.ftb.numEntries = 16384
-                cpu.branchPred.tage.baseTableSize = 16384
-                cpu.branchPred.tage.tableSizes = [2048] * 8
-            else:
-                cpu.branchPred.predictWidth = 64              # max width of a fetch block
-                cpu.branchPred.mbtb.numEntries = 8192
-                # TODO: BTB TAGE do not bave base table, do not support SC
-                cpu.branchPred.tage.tableSizes = [2048] * 8  # 2 way, 2048 sets
-                cpu.branchPred.tage.numWays = 2
-                cpu.branchPred.microtage.tableSizes = [512]   # 2 way, 512 sets
-                cpu.branchPred.microtage.numWays = 2
-                cpu.branchPred.mgsc.enableMGSC = not args.disable_mgsc
-            cpu.branchPred.tage.enableSC = False # TODO(bug): When numBr changes, enabling SC will trigger an assert
+        if args.bp_type == 'DecoupledBPUWithBTB':
+            cpu.branchPred.mgsc.enableMGSC = not args.disable_mgsc
             cpu.branchPred.ftq_size = 256
             cpu.branchPred.fsq_size = 256
-            cpu.branchPred.tage.numPredictors = 8
-            cpu.branchPred.tage.TTagBitSizes = [11] * 8
-            cpu.branchPred.tage.TTagPcShifts = [1] * 8
-            cpu.branchPred.tage.histLengths = [4, 9, 17, 29, 56, 109, 211, 397]
 
         # l1 cache per core
         if args.caches:
diff --git a/configs/example/kmhv3.py b/configs/example/kmhv3.py
@@ -0,0 +1,157 @@
+import argparse
+import sys
+
+import m5
+from m5.defines import buildEnv
+from m5.objects import *
+from m5.util import addToPath, fatal, warn
+from m5.util.fdthelper import *
+
+addToPath('../')
+
+from ruby import Ruby
+
+from common.FSConfig import *
+from common.SysPaths import *
+from common.Benchmarks import *
+from common import Simulation
+from common.Caches import *
+from common.xiangshan import *
+
+
+def setKmhV3Params(args, system):
+    for cpu in system.cpu:
+
+        # fetch (presumably ignored when ideal fetch is enabled; confirm)
+        cpu.mmu.itb.size = 96
+        cpu.fetchWidth = 32
+        cpu.iewToFetchDelay = 2 # for resolved update, should train branch after squash
+        cpu.commitToFetchDelay = 2
+        cpu.fetchQueueSize = 64
+        cpu.fetchToDecodeDelay = 2
+
+        # decode
+        cpu.decodeWidth = 8
+        cpu.enable_loadFusion = False
+        cpu.enableConstantFolding = False
+
+        # rename
+        cpu.renameWidth = 8
+        cpu.numPhysIntRegs = 224
+        cpu.numPhysFloatRegs = 256
+        cpu.enable_storeSet_train = False
+
+        # dispatch
+        cpu.enableDispatchStage = False
+        cpu.numDQEntries = [8, 8, 8]
+        cpu.dispWidth = [8, 8, 8]
+
+        # scheduler
+        cpu.scheduler = KMHV3Scheduler()
+        cpu.scheduler.disableAllRegArb()
+        cpu.scheduler.enableMainRdpOpt = False
+        cpu.scheduler.intRegfileBanks = 1
+        # intiq0: read ports (rp) for output ports 0 and 1
+        cpu.scheduler.IQs[0].oports[0].rp = [IntRD(0, 0), IntRD(1, 0)]
+        cpu.scheduler.IQs[0].oports[1].rp = [IntRD(0, 1), IntRD(1, 1)]
+
+        # intiq1: read ports (rp) for output ports 0 and 1
+        cpu.scheduler.IQs[1].oports[0].rp = [IntRD(2, 0), IntRD(3, 0)]
+        cpu.scheduler.IQs[1].oports[1].rp = [IntRD(2, 1), IntRD(3, 1)]
+
+        # intiq2: read ports (rp) for output ports 0 and 1
+        cpu.scheduler.IQs[2].oports[0].rp = [IntRD(4, 0), IntRD(5, 0)]
+        cpu.scheduler.IQs[2].oports[1].rp = [IntRD(4, 1), IntRD(5, 1)]
+
+        # rob
+        cpu.commitWidth = 8
+        cpu.squashWidth = 8
+        cpu.RobCompressPolicy = 'none'
+        cpu.numROBEntries = 352
+        cpu.CROB_instPerGroup = 2 # 1 if not using ROB compression; NOTE(review): policy above is 'none' yet this is 2 - confirm
+
+        # lsu
+        cpu.StoreWbStage = 4
+        cpu.EnableLdMissReplay = True
+        cpu.EnablePipeNukeCheck = True
+        cpu.BankConflictCheck = True
+        cpu.sbufferBankWriteAccurately = False
+
+        # lsq
+        cpu.LQEntries = 120
+        cpu.SQEntries = 64
+        cpu.RARQEntries = 96
+        cpu.RAWQEntries = 56
+        cpu.LoadCompletionWidth = 8
+        cpu.StoreCompletionWidth = 4
+        cpu.RARDequeuePerCycle = 4
+        cpu.RAWDequeuePerCycle = 4
+        cpu.SbufferEntries = 16
+        cpu.SbufferEvictThreshold = 7
+        cpu.store_prefetch_train = False
+
+        # branch predictor (only tuned for the decoupled BTB-based predictor)
+        if args.bp_type == 'DecoupledBPUWithBTB':
+            cpu.branchPred.mgsc.enableMGSC = not args.disable_mgsc
+            cpu.branchPred.ftq_size = 256
+            cpu.branchPred.fsq_size = 256
+
+        # l1 cache per core
+        if args.caches:
+            cpu.icache.size = '64kB'
+            cpu.dcache.size = '64kB'
+            cpu.dcache.tag_load_read_ports = 3
+            cpu.dcache.mshrs = 16
+
+    # l2 caches (one per CPU; classic L2 or wrapper-based L2 depending on args.classic_l2)
+    if args.l2cache:
+        for i in range(args.num_cpus):
+            if args.classic_l2:
+                system.l2_caches[i].slice_num = 4
+                system.l2_caches[i].wpu = NULL
+            else:
+                l2_wrapper = system.l2_wrappers[i]
+                l2_wrapper.data_sram_banks = 1
+                l2_wrapper.dir_sram_banks = 1
+                l2_wrapper.pipe_dir_write_stage = 3
+                l2_wrapper.dir_read_bypass = False
+                for j in range(args.l2_slices):
+                    l2_wrapper.slices[j].inner_cache.wpu = NULL
+            system.tol2bus_list[i].forward_latency = 3  # NOTE(review): old "3->0" note looks stale; value set here is 3
+            system.tol2bus_list[i].response_latency = 3  # NOTE(review): old "3->0" note looks stale; value set here is 3
+            system.tol2bus_list[i].hint_wakeup_ahead_cycles = 2  # NOTE(review): old "2->0" note looks stale; value set here is 2
+
+            # Enable dual-port for DCache → L2 communication
+            # ReqLayer[0]: ICache+DCache+ITB+DTB → L2, allow 2 requests per cycle
+            # RespLayer[1]: L2 → DCache, allow 2 responses per cycle
+            # system.tol2bus_list[i].layer_bandwidth_configs = [
+            #     LayerBandwidthConfig(direction="req", port_index=0, max_per_cycle=2),
+            #     LayerBandwidthConfig(direction="resp", port_index=1, max_per_cycle=2),
+            # ]
+
+    # l3 cache
+    if args.l3cache:
+        system.l3.mshrs = 64
+
+if __name__ == '__m5_main__':
+    FutureClass = None
+
+    args = xiangshan_system_init()
+
+    assert not args.external_memory_system
+
+    # Force the branch predictor type and L2 size for this config: both are
+    # assigned unconditionally, overwriting whatever values args already held.
+    args.bp_type = 'DecoupledBPUWithBTB'
+    args.l2_size = '2MB'
+
+    # Match the memories with the CPUs, based on the options for the test system
+    TestMemClass = Simulation.setMemClass(args)
+
+    test_sys = build_xiangshan_system(args)
+    # Set KMH V3 parameters here with the highest priority, over command-line arguments
+    setKmhV3Params(args, test_sys)
+
+    root = Root(full_system=True, system=test_sys)
+
+    Simulation.run_vanilla(args, root, test_sys, FutureClass)
diff --git a/src/cpu/pred/BranchPredictor.py b/src/cpu/pred/BranchPredictor.py
@@ -976,7 +976,7 @@ class MBTB(TimedBaseBTBPredictor):
     cxx_class = 'gem5::branch_prediction::btb_pred::MBTB'
     cxx_header = 'cpu/pred/btb/mbtb.hh'
 
-    numEntries = Param.Unsigned(2048, "Number of entries in the MBTB")
+    numEntries = Param.Unsigned(8192, "Number of entries in the MBTB")
     tagBits = Param.Unsigned(20, "Number of bits in the tag")
     instShiftAmt = Param.Unsigned(1, "Amount to shift PC to get inst bits")
     numThreads = Param.Unsigned(1, "Number of threads")
@@ -1042,13 +1042,13 @@ class BTBTAGE(TimedBaseBTBPredictor):
     needMoreHistories = Param.Bool(True, "BTBTAGE needs more histories")
     enableSC = Param.Bool(False, "Enable SC or not")    # TODO: BTBTAGE doesn't support SC
     updateOnRead = Param.Bool(True, "Enable update on read, no need to save tage meta in FTQ")
-    numPredictors = Param.Unsigned(4, "Number of TAGE predictors")
-    tableSizes = VectorParam.Unsigned([2048]*4, "the TAGE T0~Tn length")
-    TTagBitSizes = VectorParam.Unsigned([8]*4, "the T0~Tn entry's tag bit size")
-    TTagPcShifts = VectorParam.Unsigned([1] * 4, "when the T0~Tn entry's tag generating, PC right shift")
+    numPredictors = Param.Unsigned(8, "Number of TAGE predictors")
+    tableSizes = VectorParam.Unsigned([2048]*8, "the TAGE T0~Tn length")
+    TTagBitSizes = VectorParam.Unsigned([11]*8, "the T0~Tn entry's tag bit size")
+    TTagPcShifts = VectorParam.Unsigned([1] * 8, "when the T0~Tn entry's tag generating, PC right shift")
     blockSize = 32 # tage index function uses 32B aligned block address
 
-    histLengths = VectorParam.Unsigned([8, 13, 32, 119], "the BTB TAGE T0~Tn history length")
+    histLengths = VectorParam.Unsigned([4, 9, 17, 29, 56, 109, 211, 397], "the BTB TAGE T0~Tn history length")
     maxHistLen = Param.Unsigned(970, "The length of history passed from DBP")
     numTablesToAlloc = Param.Unsigned(1,"The number of table to allocated each time")
     numWays = Param.Unsigned(2, "Number of ways per set")
diff --git a/util/xs_scripts/kmh_v3_btb.sh b/util/xs_scripts/kmh_v3_btb.sh
@@ -7,4 +7,4 @@ for var in GCBV_REF_SO GCB_RESTORER gem5_home; do
     checkForVariable $var
 done
 
-$gem5 $gem5_home/configs/example/idealkmhv3.py --generic-rv-cpt=$1
+$gem5 $gem5_home/configs/example/kmhv3.py --generic-rv-cpt=$1